MADNESS  version 0.9
mxm.h
Go to the documentation of this file.
1 /*
2  This file is part of MADNESS.
3 
4  Copyright (C) 2007,2010 Oak Ridge National Laboratory
5 
6  This program is free software; you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation; either version 2 of the License, or
9  (at your option) any later version.
10 
11  This program is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with this program; if not, write to the Free Software
18  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 
20  For more information please contact:
21 
22  Robert J. Harrison
23  Oak Ridge National Laboratory
24  One Bethel Valley Road
25  P.O. Box 2008, MS-6367
26 
27  email: harrisonrj@ornl.gov
28  tel: 865-241-3937
29  fax: 865-572-0680
30 
31 
32  $Id$
33 */
34 
35 #ifndef MADNESS_TENSOR_MXM_H__INCLUDED
36 #define MADNESS_TENSOR_MXM_H__INCLUDED
37 
40 
41 // This file is ONLY included into tensor.cc ... separated here just
42 // to shrink file size. Don't try to include anywhere else
43 
44 // Due to both flakey compilers and performance concerns,
45 // we use a simple reference implementation of the mxm
46 // routines for all except T=double.
47 
48 
49 // Here undef _CRAY since no longer care about Cray X1
50 // and want to ignore it for the XT3/4
51 #ifdef _CRAY
52 #undef _CRAY
53 #endif
54 
55 
57 template <typename T, typename Q, typename S>
58 static inline void mxm(long dimi, long dimj, long dimk,
59  T* restrict c, const Q* restrict a,
60  const S* restrict b) {
61  /*
62  c(i,j) = c(i,j) + sum(k) a(i,k)*b(k,j)
63 
64  where it is assumed that the last index in each array is has unit
65  stride and the dimensions are as provided.
66  */
67 
68  for (long i=0; i<dimi; ++i) {
69  for (long k=0; k<dimk; ++k) {
70 #ifdef _CRAY
71 #pragma _CRI prefervector
72 #endif
73  for (long j=0; j<dimj; ++j) {
74  c[i*dimj+j] += a[i*dimk+k]*b[k*dimj+j];
75  }
76  }
77  }
78 }
79 
80 
82 template <typename T, typename Q, typename S>
83 static inline
84 void mTxm(long dimi, long dimj, long dimk,
85  T* restrict c, const Q* restrict a,
86  const S* restrict b) {
87  /*
88  c(i,j) = c(i,j) + sum(k) a(k,i)*b(k,j)
89 
90  where it is assumed that the last index in each array is has unit
91  stride and the dimensions are as provided.
92 
93  i loop might be long in anticipated application
94  */
95 
96  for (long k=0; k<dimk; ++k) {
97  for (long j=0; j<dimj; ++j) {
98 #ifdef _CRAY
99 #pragma _CRI prefervector
100 #endif
101  for (long i=0; i<dimi; ++i) {
102  c[i*dimj+j] += a[k*dimi+i]*b[k*dimj+j];
103  }
104  }
105  }
106 }
107 
109 template <typename T, typename Q, typename S>
110 static inline void mxmT(long dimi, long dimj, long dimk,
111  T* restrict c, const Q* restrict a,
112  const S* restrict b) {
113  /*
114  c(i,j) = c(i,j) + sum(k) a(i,k)*b(j,k)
115 
116  where it is assumed that the last index in each array is has unit
117  stride and the dimensions are as provided.
118 
119  i loop might be long in anticipated application
120  */
121 
122  for (long i=0; i<dimi; ++i) {
123  for (long j=0; j<dimj; ++j) {
124  T sum = 0;
125  for (long k=0; k<dimk; ++k) {
126  sum += a[i*dimk+k]*b[j*dimk+k];
127  }
128  c[i*dimj+j] += sum;
129  }
130  }
131 }
132 
134 template <typename T, typename Q, typename S>
135 static inline void mTxmT(long dimi, long dimj, long dimk,
136  T* restrict c, const Q* restrict a,
137  const S* restrict b) {
138  /*
139  c(i,j) = c(i,j) + sum(k) a(k,i)*b(j,k)
140 
141  where it is assumed that the last index in each array is has unit
142  stride and the dimensions are as provided.
143  */
144 
145  for (long i=0; i<dimi; ++i) {
146 #ifdef _CRAY
147 #pragma _CRI prefervector
148 #endif
149  for (long j=0; j<dimj; ++j) {
150  for (long k=0; k<dimk; ++k) {
151  c[i*dimj+j] += a[k*dimi+i]*b[j*dimk+k];
152  }
153  }
154  }
155 }
156 
157 #ifdef _CRAY
158 // Simple loop structure best on the Cray X1
159 template
160 void mTxm(long dimi, long dimj, long dimk, double* restrict c,
161  const double* restrict a, const double* restrict b);
162 #else
163 // The following are restricted to double only
164 
166 
167 template <>
168 inline void mTxm(long dimi, long dimj, long dimk,
169  double* restrict c, const double* restrict a,
170  const double* restrict b) {
171  /*
172  c(i,j) = c(i,j) + sum(k) a(k,i)*b(k,j) <--- NOTE ACCUMULATION INTO C
173 
174  where it is assumed that the last index in each array is has unit
175  stride and the dimensions are as provided.
176 
177  i loop might be long in anticipated application
178 
179  4-way unrolled k loop ... empirically fastest on PIII
180  compared to 2/3 way unrolling (though not by much).
181  */
182 
183  long dimk4 = (dimk/4)*4;
184  for (long i=0; i<dimi; ++i,c+=dimj) {
185  const double* ai = a+i;
186  const double* p = b;
187  for (long k=0; k<dimk4; k+=4,ai+=4*dimi,p+=4*dimj) {
188  double ak0i = ai[0 ];
189  double ak1i = ai[dimi];
190  double ak2i = ai[dimi+dimi];
191  double ak3i = ai[dimi+dimi+dimi];
192  const double* bk0 = p;
193  const double* bk1 = p+dimj;
194  const double* bk2 = p+dimj+dimj;
195  const double* bk3 = p+dimj+dimj+dimj;
196  for (long j=0; j<dimj; ++j) {
197  c[j] += ak0i*bk0[j] + ak1i*bk1[j] + ak2i*bk2[j] + ak3i*bk3[j];
198  }
199  }
200  for (long k=dimk4; k<dimk; ++k) {
201  double aki = a[k*dimi+i];
202  const double* bk = b+k*dimj;
203  for (long j=0; j<dimj; ++j) {
204  c[j] += aki*bk[j];
205  }
206  }
207  }
208 }
209 
210 
211 
213 
214 template <>
215 inline void mxmT(long dimi, long dimj, long dimk,
216  double* restrict c,
217  const double* restrict a, const double* restrict b) {
218  /*
219  c(i,j) = c(i,j) + sum(k) a(i,k)*b(j,k)
220 
221  where it is assumed that the last index in each array is has unit
222  stride and the dimensions are as provided.
223 
224  j loop might be long in anticipated application
225 
226  Unrolled i loop. Empirically fastest on PIII compared
227  to unrolling j, or both i&j.
228  */
229 
230  long dimi2 = (dimi/2)*2;
231  for (long i=0; i<dimi2; i+=2) {
232  const double* ai0 = a+i*dimk;
233  const double* ai1 = a+i*dimk+dimk;
234  double* restrict ci0 = c+i*dimj;
235  double* restrict ci1 = c+i*dimj+dimj;
236  for (long j=0; j<dimj; ++j) {
237  double sum0 = 0;
238  double sum1 = 0;
239  const double* bj = b + j*dimk;
240  for (long k=0; k<dimk; ++k) {
241  sum0 += ai0[k]*bj[k];
242  sum1 += ai1[k]*bj[k];
243  }
244  ci0[j] += sum0;
245  ci1[j] += sum1;
246  }
247  }
248  for (long i=dimi2; i<dimi; ++i) {
249  const double* ai = a+i*dimk;
250  double* restrict ci = c+i*dimj;
251  for (long j=0; j<dimj; ++j) {
252  double sum = 0;
253  const double* bj = b+j*dimk;
254  for (long k=0; k<dimk; ++k) {
255  sum += ai[k]*bj[k];
256  }
257  ci[j] += sum;
258  }
259  }
260 }
261 
263 template <>
264 inline void mxm(long dimi, long dimj, long dimk,
265  double* restrict c, const double* restrict a, const double* restrict b) {
266  /*
267  c(i,j) = c(i,j) + sum(k) a(i,k)*b(k,j)
268 
269  where it is assumed that the last index in each array is has unit
270  stride and the dimensions are as provided.
271 
272  4-way unrolled k loop ... empirically fastest on PIII
273  compared to 2/3 way unrolling (though not by much).
274  */
275 
276  long dimk4 = (dimk/4)*4;
277  for (long i=0; i<dimi; ++i, c+=dimj,a+=dimk) {
278  const double* p = b;
279  for (long k=0; k<dimk4; k+=4,p+=4*dimj) {
280  double aik0 = a[k ];
281  double aik1 = a[k+1];
282  double aik2 = a[k+2];
283  double aik3 = a[k+3];
284  const double* bk0 = p;
285  const double* bk1 = bk0+dimj;
286  const double* bk2 = bk1+dimj;
287  const double* bk3 = bk2+dimj;
288  for (long j=0; j<dimj; ++j) {
289  c[j] += aik0*bk0[j] + aik1*bk1[j] + aik2*bk2[j] + aik3*bk3[j];
290  }
291  }
292  for (long k=dimk4; k<dimk; ++k) {
293  double aik = a[k];
294  for (long j=0; j<dimj; ++j) {
295  c[j] += aik*b[k*dimj+j];
296  }
297  }
298  }
299 }
300 
302 template <>
303 inline void mTxmT(long dimi, long dimj, long dimk,
304  double* restrict csave, const double* restrict asave, const double* restrict b) {
305  /*
306  c(i,j) = c(i,j) + sum(k) a(k,i)*b(j,k)
307 
308  where it is assumed that the last index in each array is has unit
309  stride and the dimensions are as provided.
310 
311  Tiled k, copy row of a into temporary, and unroll j once.
312  */
313 
314  const int ktile=32;
315  double ai[ktile];
316  long dimj2 = (dimj/2)*2;
317 
318  for (long klo=0; klo<dimk; klo+=ktile, asave+=ktile*dimi, b+=ktile) {
319  long khi = klo+ktile;
320  if (khi > dimk) khi = dimk;
321  long nk = khi-klo;
322 
323  const double *restrict a = asave;
324  double *restrict c = csave;
325  for (long i=0; i<dimi; ++i,c+=dimj,++a) {
326  const double* q = a;
327  for (long k=0; k<nk; ++k,q+=dimi) ai[k] = *q;
328 
329  const double* bj0 = b;
330  for (long j=0; j<dimj2; j+=2,bj0+=2*dimk) {
331  const double* bj1 = bj0+dimk;
332  double sum0 = 0;
333  double sum1 = 0;
334  for (long k=0; k<nk; ++k) {
335  sum0 += ai[k]*bj0[k];
336  sum1 += ai[k]*bj1[k];
337  }
338  c[j ] += sum0;
339  c[j+1] += sum1;
340  }
341 
342  for (long j=dimj2; j<dimj; ++j,bj0+=dimk) {
343  double sum = 0;
344  for (long k=0; k<nk; ++k) {
345  sum += ai[k]*bj0[k];
346  }
347  c[j] += sum;
348  }
349  }
350  }
351 }
352 #endif
353 #endif // MADNESS_TENSOR_MXM_H__INCLUDED
const T1 &f1 return GTEST_2_TUPLE_() T(f0, f1)
const int k
Definition: dielectric.cc:184
FLOAT a(int j, FLOAT z)
Definition: y1.cc:86
void mxmT(long dimi, long dimj, long dimk, double *restrict c, const double *restrict a, const double *restrict b)
Matrix * matrix transpose (hand unrolled version)
Definition: mxm.h:215
void mxm(long dimi, long dimj, long dimk, double *restrict c, const double *restrict a, const double *restrict b)
Matrix * matrix (hand unrolled version)
Definition: mxm.h:264
void mTxm(long dimi, long dimj, long dimk, double *restrict c, const double *restrict a, const double *restrict b)
Matrix transpose * matrix (hand unrolled version)
Definition: mxm.h:168
const mpreal sum(const mpreal tab[], unsigned long int n, mp_rnd_t rnd_mode)
Definition: mpreal.cc:241
#define restrict
Definition: config.h:403
const double c
Definition: gfit.cc:200
FLOAT b(int j, FLOAT z)
Definition: y1.cc:79
void mTxmT(long dimi, long dimj, long dimk, double *restrict csave, const double *restrict asave, const double *restrict b)
Matrix transpose * matrix transpose (hand tiled and unrolled)
Definition: mxm.h:303