MADNESS  version 0.9
apps/ii/systolic.h
/*
  This file is part of MADNESS.

  Copyright (C) 2007,2010 Oak Ridge National Laboratory

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

  For more information please contact:

  Robert J. Harrison
  Oak Ridge National Laboratory
  One Bethel Valley Road
  P.O. Box 2008, MS-6367

  email: harrisonrj@ornl.gov
  tel:   865-241-3937
  fax:   865-572-0680

  $Id$
*/
#ifndef MADNESS_SYSTOLIC_H
#define MADNESS_SYSTOLIC_H

#include <madness/world/world.h>
#include <madness/tensor/tensor.h>

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <utility>
#include <vector>

namespace madness {

    /// Manages the data associated with a row/column/block distributed array
    template <typename T>
    class DistributedMatrix {
    public:
        DistributedMatrix(World& world, int64_t n, int64_t m, int64_t coltile, int64_t rowtile)
            : _world(world)
            , P(world.size())
            , rank(world.rank())
            , n(n)
            , m(m)
            , tilen(coltile)
            , tilem(rowtile)
            , Pcoldim((n-1)/tilen+1)
            , Prowdim((m-1)/tilem+1)
            , Pcol(rank/Prowdim)
            , Prow(rank - Pcol*Prowdim)
            , ilo(Pcol*tilen)
            , ihi(std::min(ilo+tilen-1,n-1))
            , jlo(Prow*tilem)
            , jhi(std::min(jlo+tilem-1,m-1))
            , idim(std::max(ihi-ilo+1,int64_t(0)))
            , jdim(std::max(jhi-jlo+1,int64_t(0)))
        {
            //print("DM: dims", n, m, "tiles", tilen, tilem, "ilo ihi jlo jhi", ilo, ihi, jlo, jhi, "idim jdim", idim, jdim);

            if (idim>0 && jdim>0) t = Tensor<T>(idim,jdim);
        }

        virtual ~DistributedMatrix() {}

        /// Returns the column dimension of the matrix ... i.e., n for A(n,m)
        int64_t coldim() const { return n; }

        /// Returns the row dimension of the matrix ... i.e., m for A(n,m)
        int64_t rowdim() const { return m; }

        /// Returns the column tile size
        int64_t coltile() const { return tilen; }

        /// Returns the row tile size
        int64_t rowtile() const { return tilem; }

        /// Returns the no. of processors in the column dimension
        int64_t process_coldim() const {return Pcoldim;}

        /// Returns the no. of processors in the row dimension
        int64_t process_rowdim() const {return Prowdim;}

        /// Returns the total no. of elements stored on this processor
        int64_t local_size() const {return idim*jdim;}

        /// Returns the no. of column elements stored on this processor
        int64_t local_coldim() const {return idim;}

        /// Returns the no. of row elements stored on this processor
        int64_t local_rowdim() const {return jdim;}

        /// Returns the inclusive range of column indices on this processor
        void local_colrange(int64_t& ilow, int64_t& ihigh) const {
            if (ilo <= ihi) {
                ilow = ilo;
                ihigh = ihi;
            }
            else {
                ilow = 0;
                ihigh = -1;
            }
        }

        /// Returns the inclusive range of row indices on this processor
        void local_rowrange(int64_t& jlow, int64_t& jhigh) const {
            if (jlo <= jhi) {
                jlow = jlo;
                jhigh = jhi;
            }
            else {
                jlow = 0;
                jhigh = -1;
            }
        }

        /// Returns the inclusive range of column indices on processor p
        void get_colrange(int p, int64_t& ilow, int64_t& ihigh) const {
            int pi = p/Prowdim;
            int pj = p - pi*Prowdim;
            if (pi >= process_coldim() || pj >= process_rowdim()) {
                ilow = 0;
                ihigh = -1;
            }
            else {
                ilow = pi*tilen;
                ihigh= std::min(ilow+tilen-1,n-1);
            }
            return;
        }

        /// Returns the associated world
        World& get_world() const {return _world;}

        /// Returns a reference to the data
        Tensor<T>& data() {return t;}

        /// Returns a const reference to the data
        const Tensor<T>& data() const {return t;}

        /// Returns true if the matrix is column distributed (i.e., row dimension not distributed)
        bool is_column_distributed() const {return process_rowdim()==1;}

        /// Returns true if the matrix is row distributed (i.e., column dimension not distributed)
        bool is_row_distributed() const {return process_coldim()==1;}

        /// Given the full matrix s(n,m), copy in the data range local to this processor
        void copyin(const Tensor<T>& s) {
            if (local_size() > 0) t(___) = s(Slice(ilo,ihi),Slice(jlo,jhi));
        }

        /// Given the full matrix s(n,m), copy out the data range local to this processor
        void copyout(Tensor<T>& s) const {
            if (local_size() > 0) s(Slice(ilo,ihi),Slice(jlo,jhi)) = t(___);
        }

    private:
        World& _world;
        const int64_t P;         //< No. of processors
        const ProcessID rank;    //< My processor rank
        const int64_t n;         //< Column dimension of A(n,m)
        const int64_t m;         //< Row dimension of A(n,m)
        const int64_t tilen;     //< Tile size for column
        const int64_t tilem;     //< Tile size for row
        const int64_t Pcoldim;   //< Column dimension of processor grid
        const int64_t Prowdim;   //< Row dimension of processor grid
        const int64_t Pcol;      //< Column of processor grid for this processor
        const int64_t Prow;      //< Row of processor grid for this processor
        const int64_t ilo,ihi;   //< Range of column indices on this processor
        const int64_t jlo,jhi;   //< Range of row indices on this processor
        const int64_t idim,jdim; //< Dimension of data on this processor

        Tensor<T> t;             //< The data
    };
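
    // Usage sketch (illustrative, not part of the original header): a 10x6 matrix distributed
    // by columns over the processes of world, with at most 4 rows stored per process.
    //
    //   DistributedMatrix<double> A(world, 10, 6, 4, 6);  // coltile=4, rowtile=6 (row dim not distributed)
    //   int64_t ilo, ihi;
    //   A.local_colrange(ilo, ihi);                       // e.g. rank 1 of 3 owns rows 4..7
    //   Tensor<double> full(10, 6);
    //   A.copyin(full);                                   // each rank copies in only rows ilo..ihi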

    /// Generates an (n,m) matrix distributed by columns (row dimension is not distributed)
    template <typename T>
    DistributedMatrix<T> column_distributed_matrix(World& world, int64_t n, int64_t m, int64_t coltile=0) {
        /*
          If the given coltile is not large enough to hold the entire matrix, or the
          default coltile = 0 was used, recalculate it.
        */
        if (world.size()*coltile < n) coltile = (n-1)/world.size() + 1;
        if ((coltile&0x1)) coltile++;
        coltile = std::min(coltile,n);

        return DistributedMatrix<T>(world, n, m, coltile, m);
    }
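
    // Worked example (illustrative): for n = 10 rows on world.size() = 3 processes with the
    // default coltile = 0, the recomputed tile is (10-1)/3+1 = 4 (already even), so ranks 0
    // and 1 each hold 4 rows and rank 2 holds the remaining 2 -- the layout assumed by the
    // systolic loop documented in cycle() below.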

    /// Generates an (n,m) matrix distributed by rows (column dimension is not distributed)
    template <typename T>
    DistributedMatrix<T> row_distributed_matrix(World& world, int64_t n, int64_t m, int64_t rowtile=0) {
        if (world.size()*rowtile < m) rowtile = (m-1)/world.size() + 1;
        rowtile = std::min(rowtile,m);

        return DistributedMatrix<T>(world, n, m, n, rowtile);
    }

    /// Generates a distributed matrix with rows of a and b interleaved
    template <typename T>
    DistributedMatrix<T> interleave_rows(const DistributedMatrix<T>& a, const DistributedMatrix<T>& b) {
        MADNESS_ASSERT(a.rowdim()==b.rowdim() && a.coldim()==b.coldim() && a.coltile()==b.coltile() && a.rowtile()==b.rowtile());

        DistributedMatrix<T> c(a.get_world(), a.coldim()*2, a.rowdim(), a.coltile()*2, a.rowtile());
        c.data()(Slice(0,-1,2),_) = a.data()(___);
        c.data()(Slice(1,-1,2),_) = b.data()(___);
        return c;
    }

    /// Generates a distributed matrix with rows of a and b concatenated
    template <typename T>
    DistributedMatrix<T> concatenate_rows(const DistributedMatrix<T>& a, const DistributedMatrix<T>& b) {
        MADNESS_ASSERT(a.coldim()==b.coldim() && a.coltile()==b.coltile() && a.is_column_distributed() && b.is_column_distributed());

        int64_t ma = a.rowdim();
        int64_t mb = b.rowdim();

        DistributedMatrix<T> c(a.get_world(), a.coldim(), ma+mb, a.coltile(), ma+mb);

        if(a.local_size() > 0) c.data()( _ , Slice(0,ma-1) ) = a.data()(___);
        if(b.local_size() > 0) c.data()( _ , Slice(ma, -1) ) = b.data()(___);

        return c;
    }

    /// Generates a distributed matrix with rows of a, b, c, and d concatenated in order
    template <typename T>
    DistributedMatrix<T> concatenate_rows(const DistributedMatrix<T>& a, const DistributedMatrix<T>& b,
                                          const DistributedMatrix<T>& c, const DistributedMatrix<T>& d) {
        MADNESS_ASSERT(a.coldim()==b.coldim() && b.coldim()==c.coldim() && c.coldim()==d.coldim());
        MADNESS_ASSERT(a.coltile()==b.coltile() && b.coltile()==c.coltile() && c.coltile()==d.coltile());

        int64_t ma = a.rowdim();
        int64_t mb = b.rowdim();
        int64_t mc = c.rowdim();
        int64_t md = d.rowdim();

        DistributedMatrix<T> result(a.get_world(), a.coldim(), ma+mb+mc+md, a.coltile(), ma+mb+mc+md);

        if(a.local_size() > 0) result.data()( _ , Slice(0,ma-1) ) = a.data()(___);
        if(b.local_size() > 0) result.data()( _ , Slice(ma, ma+mb-1) ) = b.data()(___);
        if(c.local_size() > 0) result.data()( _ , Slice(ma+mb, ma+mb+mc-1) ) = c.data()(___);
        if(d.local_size() > 0) result.data()( _ , Slice(ma+mb+mc, -1) ) = d.data()(___);

        return result;
    }

    /// Generates a row-distributed matrix with the rows of a and b concatenated
    template <typename T>
    DistributedMatrix<T> concatenate_columns(const DistributedMatrix<T>& a, const DistributedMatrix<T>& b) {
        MADNESS_ASSERT(a.rowdim()==b.rowdim() && a.rowtile()==b.rowtile() && a.is_row_distributed() && b.is_row_distributed());

        int64_t ma = a.coldim();
        int64_t mt = ma + b.coldim();

        DistributedMatrix<T> c(a.get_world(), mt, a.rowdim(), mt, b.rowtile());

        if(a.local_size() > 0) c.data()( Slice(0,ma-1), _ ) = a.data()(___);
        if(b.local_size() > 0) c.data()( Slice(ma,-1), _ ) = b.data()(___);

        return c;
    }

    /// Makes an identity matrix with the same properties as A
    template <typename T>
    DistributedMatrix<T> idMatrix(const DistributedMatrix<T>& A) {
        int64_t n = A.coldim();
        int64_t m = A.rowdim();
        MADNESS_ASSERT( n == m );

        DistributedMatrix<T> result( A.get_world(), n, m, A.coltile(), A.rowtile() );

        int64_t ilo, ihi;
        result.local_colrange(ilo,ihi);
        for(int64_t i=ilo; i<=ihi; ++i) {
            result.data()(i-ilo, i) = 1;
        }

        return result;
    }
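
    // Usage sketch (illustrative, not part of the original header).  Note that the indexing
    // result.data()(i-ilo, i) assumes the full row dimension is stored locally, i.e. A should
    // be column distributed.
    //
    //   DistributedMatrix<double> A = column_distributed_matrix<double>(world, 8, 8);
    //   DistributedMatrix<double> I = idMatrix(A);   // I(i,j) = (i==j) ? 1 : 0, stored tile by tile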

    /// Base class for parallel algorithms that employ a systolic loop to generate all row pairs in parallel
    template <typename T>
    class SystolicMatrixAlgorithm : public TaskInterface {
    private:
        DistributedMatrix<T>& A;    //< internal data
        int64_t nproc;              //< No. of processes with rows of the matrix (not size of world)
        int64_t coldim;             //< A(coldim,rowdim)
        int64_t rowdim;             //< A(coldim,rowdim)
        int64_t nlocal;             //< No. of local pairs
        const ProcessID rank;       //< Rank of current process
        const int tag;              //< MPI tag to be used for messages
        std::vector<T*> iptr, jptr; //< Indirection for implementing cyclic buffer !! SHOULD BE VOLATILE ?????
        std::vector<int64_t> map;   //< Used to keep track of actual row indices

        void iteration(const TaskThreadEnv& env) {
            env.barrier();
            start_iteration_hook(env);
            env.barrier();

            int64_t ilo, ihi;
            A.local_colrange(ilo, ihi);

            int neven = coldim + (coldim&0x1);

            int pairlo = rank*A.coltile()/2;

            int threadid = env.id();
            int nthread = env.nthread();

            for (int loop=0; loop<(neven-1); loop++) {

                // This loop is parallelized over threads
                for (int pair=env.id(); pair<nlocal; pair+=nthread) {

                    int rp = neven/2-1-(pair+pairlo);
                    int iii = (rp+loop)%(neven-1);
                    int jjj = (2*neven-2-rp+loop)%(neven-1);
                    if (rp == 0) jjj = neven-1;

                    iii = map[iii];
                    jjj = map[jjj];

                    if (jptr[pair]) {
                        kernel(iii, jjj, iptr[pair], jptr[pair]);
                    }
                }
                env.barrier();

                if (threadid == 0) cycle();

                env.barrier();
            }

            end_iteration_hook(env);
            env.barrier();
        }

        /// Call this after the final iteration to restore the original order of rows in the matrix
        void unshuffle() {
            if (nlocal <= 0) return;
            Tensor<T>& t = A.data();
            //Tensor<T> tmp(2L, t.ndim(), false);
            Tensor<T> tmp(2L, t.dims(), false);
            T* tp = tmp.ptr();
            for (int64_t i=0; i<nlocal; i++) {
                memcpy(tp+i*rowdim, iptr[i], rowdim*sizeof(T));
                if (jptr[i]) {
                    memcpy(tp+(i+nlocal)*rowdim, jptr[i], rowdim*sizeof(T));
                }
                iptr[i] = &t(i,0);
                jptr[i] = &t(i+nlocal,0);
            }
            memcpy(t.ptr(), tmp.ptr(), t.size()*sizeof(T));

            if (rank==(nproc-1) && (coldim&0x1)) jptr[nlocal-1] = 0;
        }

        /// Cycles the rows around the ring of processes to generate the next set of pairs (called by thread 0 only)
        void cycle() {
            if (coldim <= 2) return; // No cycling necessary
            if (nlocal <= 0) {       // Nothing local
                MADNESS_ASSERT(rank >= nproc);
                return;
            }

            // Check assumption that tiling puts the incomplete tile at the end
            MADNESS_ASSERT(A.local_coldim() == A.coltile() || rank == (nproc-1));

            const ProcessID left = rank-1;  // Invalid values are not used
            const ProcessID right = rank+1;

            /*
              Consider a matrix (10,*) distributed with coltile=4 over
              three processors.

              . 0 1 2 3 4 5 6 7 8 9

              This is divided up as follows into this initial
              configuration for the loop

              .      P=0          P=1         P=2
              .     msg          msg
              .  i  -->0-->1 --> 4-->5 --> 8 -->
              .      ^  |                   msg
              .      |  <---------
              .  j  <--2<--3 <-- 6<--7 <--| 9
              .     msg          msg

              The first and last processes in the loop have to wrap ... others
              just pass left and right.  Note that 9 stays put.

              Note that the algorithm assumes the distribution puts an equal
              amount of data on all nodes except the last.

              The i data is considered as flowing to the right.
              The j data is considered as flowing to the left.


              Hence, we should explore the pairs in this order
              (n-1 sets of n/2 pairs)

              .  P=0    P=1    P=2
              .  0 1    4 5    8
              .  2 3    6 7    9

              .  2 0    1 4    5
              .  3 6    7 8    9

              .  3 2    0 1    4
              .  6 7    8 5    9

              .  6 3    2 0    1
              .  7 8    5 4    9

              .  7 6    3 2    0
              .  8 5    4 1    9

              .  8 7    6 3    2
              .  5 4    1 0    9

              .  5 8    7 6    3
              .  4 1    0 2    9

              .  4 5    8 7    6
              .  1 0    2 3    9

              .  1 4    5 8    7
              .  0 2    3 6    9
            */

            // Copy end elements before they are overwritten
            T* ilast  = iptr[nlocal-1];
            T* jfirst = jptr[0];

            // Cycle local pointers
            for (int64_t i=0; i<nlocal-1; i++) {
                iptr[nlocal-i-1] = iptr[nlocal-i-2];
                jptr[i] = jptr[i+1];
            }

            World& world = A.get_world();

            // Lazily programmed, unsafely assuming enough MPI buffering is available
            if (nproc == 1) {
                iptr[0] = jfirst;
                jptr[nlocal-2] = ilast;
            }
            else if (rank == 0) {
                iptr[0] = jfirst;
                world.mpi.Send(ilast, rowdim, right, tag);
                jptr[nlocal-1] = ilast;
                world.mpi.Recv(ilast, rowdim, right, tag);
            }
            else if (rank == (nproc-1)) {
                if (nlocal > 1) {
                    iptr[0] = jfirst;
                    jptr[nlocal-2] = ilast;
                }
                world.mpi.Send(iptr[0], rowdim, left, tag);
                world.mpi.Recv(iptr[0], rowdim, left, tag);
            }
            else {
                world.mpi.Send( ilast, rowdim, right, tag);
                world.mpi.Send(jfirst, rowdim, left,  tag);
                world.mpi.Recv( ilast, rowdim, right, tag);
                world.mpi.Recv(jfirst, rowdim, left,  tag);
                iptr[0] = jfirst;
                jptr[nlocal-1] = ilast;
            }
        }

        virtual void get_id(std::pair<void*,unsigned short>& id) const {
            PoolTaskInterface::make_id(id, *this);
        }

    public:
        /// A must be a column distributed matrix with an even column tile >= 2
        SystolicMatrixAlgorithm(DistributedMatrix<T>& A, int tag, int nthread=ThreadPool::size()+1)
            : A(A)
            , nproc(A.process_coldim()*A.process_rowdim())
            , coldim(A.coldim())
            , rowdim(A.rowdim())
            , nlocal((A.local_coldim()+1)/2)
            , rank(A.get_world().rank())
            , tag(tag)
            , iptr(nlocal)
            , jptr(nlocal)
            , map(coldim+(coldim&0x1))
        {
            TaskInterface::set_nthread(nthread);

            MADNESS_ASSERT(A.is_column_distributed() && (nproc==1 || (A.coltile()&0x1)==0));

            // Initialize vectors of pointers to matrix rows
            Tensor<T>& t = A.data();

            //madness::print(nproc, coldim, rowdim, nlocal, rank, tag);

            for (int64_t i=0; i<nlocal; i++) {
                iptr[i] = &t(i,0);
                jptr[i] = &t(i+nlocal,0);
            }

            // If no. of rows is odd, the last process should have an empty last row
            if (rank==(nproc-1) && (coldim&0x1)) jptr[nlocal-1] = 0;

            // Initialize map from logical index order to actual index order

            int neven = (coldim+1)/2;
            int ii=0;
            for (ProcessID p=0; p<nproc; p++) {
                int64_t lo, hi;
                A.get_colrange(p, lo, hi);
                int p_nlocal = (hi - lo + 2)/2;
                //print("I think process",p,"has",lo,hi,p_nlocal);
                for (int i=0; i<p_nlocal; i++) {
                    map[ii+i] = lo+i;
                    //map[coldim-ii-nlocal+i] = lo+i+nlocal;
                    map[ii+i+neven] = lo+i+p_nlocal;
                }
                ii += p_nlocal;
            }

            std::reverse(map.begin(),map.begin()+neven);

            //print("MAP", map);
        }

        virtual ~SystolicMatrixAlgorithm() {}

        /// Threadsafe routine to apply the operation to rows i and j of the matrix
        virtual void kernel(int i, int j, T* rowi, T* rowj) = 0;


        /// Invoked simultaneously by all threads after each sweep to test for convergence
        virtual bool converged(const TaskThreadEnv& env) const = 0;


        /// Invoked by all threads at the start of each iteration
        virtual void start_iteration_hook(const TaskThreadEnv& env) {}


        /// Invoked by all threads at the end of each iteration
        virtual void end_iteration_hook(const TaskThreadEnv& env) {}


        /// Invoked by the task queue to run the algorithm with multiple threads
        void run(World& world, const TaskThreadEnv& env) {
            //if (nlocal <= 0) return; // Nothing to do

            do {
                env.barrier();
                iteration(env);
                env.barrier();
            } while (!converged(env));

            if (env.id() == 0) unshuffle();
            env.barrier();
        }

        /// Invoked by the user to run the algorithm with one thread
        void solve() { run(A.get_world(), TaskThreadEnv(1,0,0)); }


        /// Returns length of row
        int get_rowdim() const {return rowdim;}

        /// Returns length of column
        int get_coldim() const {return coldim;}

        /// Returns a reference to the world
        World& get_world() const { return A.get_world(); }

        /// Returns rank of this process in the world
        ProcessID get_rank() const { return rank; }

    };
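
    // Usage sketch (illustrative, not part of the original header): a minimal subclass that
    // applies a user-defined operation to every row pair exactly once.  The names
    // RowPairPrinter and niter are hypothetical.
    //
    //   template <typename T>
    //   class RowPairPrinter : public SystolicMatrixAlgorithm<T> {
    //       int niter;
    //   public:
    //       RowPairPrinter(DistributedMatrix<T>& A, int tag)
    //           : SystolicMatrixAlgorithm<T>(A, tag), niter(0) {}
    //
    //       void kernel(int i, int j, T* rowi, T* rowj) {
    //           print("visiting pair", i, j);   // rowi/rowj point at the locally held rows i and j
    //       }
    //
    //       void end_iteration_hook(const TaskThreadEnv& env) {
    //           if (env.id() == 0) ++niter;
    //       }
    //
    //       bool converged(const TaskThreadEnv& env) const {
    //           return niter >= 1;              // stop after one sweep over all pairs
    //       }
    //   };
    //
    //   // Run single-threaded on a column-distributed matrix with an even column tile:
    //   //   RowPairPrinter<double> task(A, 3333);
    //   //   task.solve();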

    /// Boys localization of orbitals implemented as a systolic matrix algorithm
    template <typename T>
    class LocalizeBoys : public SystolicMatrixAlgorithm<T>
    {
    public:
        LocalizeBoys<T>( DistributedMatrix<T>& M, const std::vector<int>& set, long nmo, int tag,
                const double thresh = 1e-6, const double thetamax = 0.5, const bool randomized = true, const bool doprint = false):
            SystolicMatrixAlgorithm<T>(M, tag),
            M(M),                 // distributed matrix of ( U, X, Y, Z )
            world(M.get_world()),
            set(set),             // set of each orbital; only orbitals in the same set are mixed
            nmo(nmo),             // number of molecular orbitals
            niter(0),             // number of iterations
            ndone(0),             // number of rotations
            nrot(0),
            thetamax(thetamax),
            thresh(thresh),
            tol(thetamax),
            randomized(false),
            doprint(doprint)
        {
            if (doprint) madness::print("Start Boys localization\n");
        }

        virtual ~LocalizeBoys() {}

        void start_iteration_hook(const TaskThreadEnv& env);
        void kernel(int i, int j, T* rowi, T* rowj);
        void end_iteration_hook(const TaskThreadEnv& env);
        bool converged(const TaskThreadEnv& env) const;

    private:
        DistributedMatrix<T>& M;  //< distributed matrix of ( U, X, Y, Z )
        World& world;
        std::vector<int> set;
        long nmo, niter, ndone, ndone_iter; //< no. of molecular orbitals, iterations, total rotations, rotations in the current iteration
        volatile int64_t nrot;    //< number of rotations in the current iteration
        const double thetamax, thresh;
        double tol;               //< current tolerance (error)
        volatile double maxtheta; //< maximum rotation angle
        const bool randomized, doprint;

        void drot(T restrict a[], T restrict b[], double sin, double cos);
        inline T DIP(const T ij[], const T kl[]) const;
        inline T inner(const T a[], const T b[] ) const;
    };

    template <typename T>
    void LocalizeBoys<T>::start_iteration_hook(const TaskThreadEnv& env)
    {
        int id = env.id();
        if(doprint) {
            T sum = 0.0;
            int64_t ilo, ihi;
            M.local_colrange(ilo, ihi);
            for(int64_t i=0; i <=(ihi-ilo); i++){
                T ii[] = { M.data()(i,i+ilo+nmo), M.data()(i,i+ilo+2*nmo), M.data()(i,i+ilo+3*nmo) };
                sum += DIP(ii, ii);
            }
            env.barrier();
            if (env.id() == 0) world.gop.sum(sum);
            env.barrier();

            // print the result of the previous iteration
            printf("\titeration %ld sum=%.4f ndone=%ld tol=%.2e, maxtheta=%.2e\n", niter, sum, ndone, tol, maxtheta);
        }

        if( id == 0 ){
            this->nrot = 0;
            this->maxtheta = 0.0;
        }
        env.barrier();
    }

    template <typename T>
    void LocalizeBoys<T>::kernel(int i, int j, T rowi[], T rowj[])
    {
        if(set[i] != set[j]) return;

        // make rowi and rowj since we're using one-sided Jacobi
        T *ui = rowi;
        T *uj = rowj;
        T *ri[] = { rowi + nmo, rowi + 2*nmo, rowi + 3*nmo };
        T *rj[] = { rowj + nmo, rowj + 2*nmo, rowj + 3*nmo };
        T ii[] = { inner(ui, ri[0]), inner(ui, ri[1]), inner(ui, ri[2]) };
        T ij[] = { inner(ui, rj[0]), inner(ui, rj[1]), inner(ui, rj[2]) };
        T jj[] = { inner(uj, rj[0]), inner(uj, rj[1]), inner(uj, rj[2]) };
        bool doit = false;

        double g = DIP(ij, jj) - DIP(ij, ii);
        double h = 4.0*DIP(ij, ij) + 2.0*DIP(ii, jj) - DIP(ii, ii) - DIP(jj, jj);

        if (h >= 0.0) {
            if (doprint) print("\t\tforcing negative h", i, j, h);
            doit = true;
            h = -1.0;
        }
        double theta = -g / h;

        this->maxtheta = std::max<double>(fabs(theta), (double)maxtheta); // this does not appear to be thread safe

        // restrict the rotation angle
        if (fabs(theta) > thetamax){
            if (doprint) print("\t\trestricting", i,j);

            if (g < 0) theta = -thetamax;
            else theta = thetamax * 0.8;
            doit = true;
        }

        // randomization will be implemented here
        // double sij = DIP(ij, ij); // this line will be used by randomize

        // rotation
        if (fabs(theta) >= tol || randomized || doit){
            if (doprint) print("\t\trotating", i,j, theta);
            this->nrot++;

            double c = cos(theta);
            double s = sin(theta);
            drot(ri[0], rj[0], s, c);
            drot(ri[1], rj[1], s, c);
            drot(ri[2], rj[2], s, c);
            drot(ui, uj, s, c);
        }
    }

    template <typename T>
    void LocalizeBoys<T>::end_iteration_hook(const TaskThreadEnv& env)
    {
        if (env.id() == 0) {
            this->niter++;
            this->tol = std::max(0.1 * maxtheta, thresh);
            this->ndone_iter = nrot;
            world.gop.max(tol);

            world.gop.sum(ndone_iter); // get the total number of rotations over all processes
            this->ndone += ndone_iter;
        }
        env.barrier();
    }

    template <typename T>
    bool LocalizeBoys<T>::converged(const TaskThreadEnv& env) const
    {
        if( ndone_iter == 0 && tol <= thresh){
            if (doprint) madness::print("\tBoys localization converged in", ndone, "steps.");
            return true;
        }
        else if(niter >= 300){
            if(doprint) madness::print("\tWARNING!! Boys localization did not fully converge in", niter, "iterations!\n");
            return true;
        }
        else return false;
    }

    template <typename T>
    void LocalizeBoys<T>::drot(T restrict a[], T restrict b[], double sin, double cos)
    {
        for ( long i=0; i<this->nmo; i++ ) {
            T aa = a[i]*cos - b[i]*sin;
            T bb = a[i]*sin + b[i]*cos;
            a[i] = aa;
            b[i] = bb;
        }
    }

    template <typename T>
    inline T LocalizeBoys<T>::DIP(const T ij[], const T kl[]) const
    {
        return ij[0] * kl[0] + ij[1] * kl[1] + ij[2] * kl[2];
    }

    template <typename T>
    inline T LocalizeBoys<T>::inner(const T a[], const T b[] ) const
    {
        T s=0;
        for(int64_t i=0; i<nmo; i++){
            s += a[i] * b[i];
        }
        return s;
    }
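
    // Usage sketch (illustrative, not part of the original header): LocalizeBoys expects a
    // column-distributed matrix whose rows are laid out as [ U | X | Y | Z ], i.e. rowdim =
    // 4*nmo; typically U starts as the identity (see idMatrix above) and X, Y, Z hold the
    // x, y, z (dipole-like) matrix blocks used by DIP.  The tag value and the names concat
    // and nmo are hypothetical.
    //
    //   DistributedMatrix<double> M = column_distributed_matrix<double>(world, nmo, 4*nmo);
    //   M.copyin(concat);                 // concat(nmo, 4*nmo) holds U, X, Y, Z side by side
    //   std::vector<int> set(nmo, 0);     // all orbitals in one set
    //   LocalizeBoys<double> boys(M, set, nmo, 3333);
    //   boys.solve();                     // systolic sweeps until converged()
    //   M.copyout(concat);                // the first nmo columns now hold the localizing rotation U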
}
#endif