MADNESS  version 0.9
aligned.h
Go to the documentation of this file.
1 /*
2  This file is part of MADNESS.
3 
4  Copyright (C) 2007,2010 Oak Ridge National Laboratory
5 
6  This program is free software; you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation; either version 2 of the License, or
9  (at your option) any later version.
10 
11  This program is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with this program; if not, write to the Free Software
18  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 
20  For more information please contact:
21 
22  Robert J. Harrison
23  Oak Ridge National Laboratory
24  One Bethel Valley Road
25  P.O. Box 2008, MS-6367
26 
27  email: harrisonrj@ornl.gov
28  tel: 865-241-3937
29  fax: 865-572-0680
30 
31  $Id$
32 */
33 #ifndef MADNESS_TENSOR_ALIGNED_H__INCLUDED
34 #define MADNESS_TENSOR_ALIGNED_H__INCLUDED
35 
44 #include <madness/madness_config.h>
45 #include <madness/tensor/tensor.h>
46 #include <cstring>
47 
48 namespace madness {
49 
/// Sets the first \c n elements of \c a to zero.

/// With \c HAVE_MEMSET defined the storage is cleared byte-wise with
/// \c std::memset, so \c T must be a type whose zero value is all-zero
/// bytes.  Otherwise each element is assigned the literal \c 0 in a loop
/// unrolled four elements at a time.
///
/// Despite the name, nothing in this routine checks or requires any
/// particular alignment of \c a.
///
/// \tparam T Element type (the fallback path needs it to be assignable from \c 0).
/// \param[in] n Number of elements to clear; \c n<=0 is a no-op.
/// \param[in,out] a Pointer to the first element to clear.
template <typename T>
static
inline
void aligned_zero(long n, T* a) {
    // Without this guard a negative n that is not a multiple of four gave
    // a positive remainder (arithmetic shift rounds toward -infinity), so
    // up to three elements were written; on the memset path a negative n
    // became an enormous byte count.
    if (n <= 0) return;
#ifdef HAVE_MEMSET
    // A hand coded SSE2 loop is faster only for data in the L1 cache
    std::memset(static_cast<void*>(a), 0, static_cast<std::size_t>(n)*sizeof(T));
#else
    // Largest multiple of four that does not exceed n (n is positive here).
    const long n4 = n & ~3L;
    for (long i=0; i<n4; i+=4) {
        a[i  ] = 0;
        a[i+1] = 0;
        a[i+2] = 0;
        a[i+3] = 0;
    }
    // Remaining 0..3 elements.
    for (long i=n4; i<n; ++i) a[i] = 0;
#endif
}
69 
/// Scaled accumulate: <tt>a[i] += s*b[i]</tt> for \c i in <tt>[0,n)</tt>.

/// The main loop is unrolled four elements at a time and a short tail
/// loop handles the remaining 0..3 elements.
///
/// Both pointers carry the project's \c restrict qualifier macro (defined
/// outside this header); where that expands to a no-alias qualifier the
/// ranges behind \c a and \c b must not overlap.  Despite the name,
/// nothing in this routine checks or requires any particular alignment.
///
/// \tparam T Element type of both arrays.
/// \tparam Q Type of the scale factor; \c s*b[i] must be addable into \c T.
/// \param[in] n Number of elements to process; \c n<=0 is a no-op.
/// \param[in,out] a Destination array, updated in place.
/// \param[in] b Source array.
/// \param[in] s Scale factor applied to every element of \c b.
template <typename T, typename Q>
static
inline
void aligned_axpy(long n, T* restrict a, const T* restrict b, Q s) {
    // Without this guard a negative n that is not a multiple of four gave
    // a positive remainder (arithmetic shift rounds toward -infinity), so
    // up to three elements were updated.
    if (n <= 0) return;
    // Largest multiple of four that does not exceed n (n is positive here).
    const long n4 = n & ~3L;
    for (long i=0; i<n4; i+=4) {
        a[i  ] += s*b[i  ];
        a[i+1] += s*b[i+1];
        a[i+2] += s*b[i+2];
        a[i+3] += s*b[i+3];
    }
    // Remaining 0..3 elements.
    for (long i=n4; i<n; ++i) a[i] += s*b[i];
}
84 
/// Element-wise in-place addition: <tt>a[i] += b[i]</tt> for \c i in <tt>[0,n)</tt>.

/// The main loop is unrolled four elements at a time and a short tail
/// loop handles the remaining 0..3 elements.
///
/// Both pointers carry the project's \c restrict qualifier macro (defined
/// outside this header); where that expands to a no-alias qualifier the
/// ranges behind \c a and \c b must not overlap.  Despite the name,
/// nothing in this routine checks or requires any particular alignment.
///
/// \tparam T Element type of the destination array.
/// \tparam Q Element type of the source array; must be addable into \c T.
/// \param[in] n Number of elements to process; \c n<=0 is a no-op.
/// \param[in,out] a Destination array, updated in place.
/// \param[in] b Source array.
template <typename T, typename Q>
static
inline
void aligned_add(long n, T* restrict a, const Q* restrict b) {
    // Without this guard a negative n that is not a multiple of four gave
    // a positive remainder (arithmetic shift rounds toward -infinity), so
    // up to three elements were updated.
    if (n <= 0) return;
    // Largest multiple of four that does not exceed n (n is positive here).
    const long n4 = n & ~3L;
    for (long i=0; i<n4; i+=4) {
        a[i  ] += b[i  ];
        a[i+1] += b[i+1];
        a[i+2] += b[i+2];
        a[i+3] += b[i+3];
    }
    // Remaining 0..3 elements.
    for (long i=n4; i<n; ++i) a[i] += b[i];
}
99 
/// Element-wise in-place subtraction: <tt>a[i] -= b[i]</tt> for \c i in <tt>[0,n)</tt>.

/// The main loop is unrolled four elements at a time and a short tail
/// loop handles the remaining 0..3 elements.
///
/// Both pointers carry the project's \c restrict qualifier macro (defined
/// outside this header); where that expands to a no-alias qualifier the
/// ranges behind \c a and \c b must not overlap.  Despite the name,
/// nothing in this routine checks or requires any particular alignment.
///
/// \tparam T Element type of the destination array.
/// \tparam Q Element type of the source array; must be subtractable from \c T.
/// \param[in] n Number of elements to process; \c n<=0 is a no-op.
/// \param[in,out] a Destination array, updated in place.
/// \param[in] b Source array.
template <typename T, typename Q>
static
inline
void aligned_sub(long n, T* restrict a, const Q* restrict b) {
    // Without this guard a negative n that is not a multiple of four gave
    // a positive remainder (arithmetic shift rounds toward -infinity), so
    // up to three elements were updated.
    if (n <= 0) return;
    // Largest multiple of four that does not exceed n (n is positive here).
    const long n4 = n & ~3L;
    for (long i=0; i<n4; i+=4) {
        a[i  ] -= b[i  ];
        a[i+1] -= b[i+1];
        a[i+2] -= b[i+2];
        a[i+3] -= b[i+3];
    }
    // Remaining 0..3 elements.
    for (long i=n4; i<n; ++i) a[i] -= b[i];
}
114 }
115 
116 #endif // MADNESS_TENSOR_ALIGNED_H__INCLUDED
void aligned_add(long n, double *restrict a, const double *restrict b)
Defines and implements most of Tensor.
const T1 &f1 return GTEST_2_TUPLE_() T(f0, f1)
FLOAT a(int j, FLOAT z)
Definition: y1.cc:86
void aligned_sub(long n, double *restrict a, const double *restrict b)
#define restrict
Definition: config.h:403
Holds machinery to set up Functions/FuncImpls using various Factories and Interfaces.
Definition: chem/atomutil.cc:45
FLOAT b(int j, FLOAT z)
Definition: y1.cc:79