MADNESS version 0.9
madness/tensor/systolic.h
#ifndef MADNESS_SYSTOLIC_H
#define MADNESS_SYSTOLIC_H

/*
  This file is part of MADNESS.

  Copyright (C) 2007,2010 Oak Ridge National Laboratory

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

  For more information please contact:

  Robert J. Harrison
  Oak Ridge National Laboratory
  One Bethel Valley Road
  P.O. Box 2008, MS-6367

  email: harrisonrj@ornl.gov
  tel:   865-241-3937
  fax:   865-572-0680

  $Id$
*/

#include <madness/world/world.h>
#include <utility>
#include <madness/tensor/tensor.h>
#include <madness/tensor/distributed_matrix.h>

namespace madness {

    /// Base class for parallel algorithms that employ a systolic loop to generate all row pairs in parallel
    template <typename T>
    class SystolicMatrixAlgorithm : public TaskInterface {
    private:
        DistributedMatrix<T>& A;
        const int64_t nproc;        //< No. of processes with rows of the matrix (not size of world)
        const int64_t coldim;       //< A(coldim,rowdim)
        const int64_t rowdim;       //< A(coldim,rowdim)
        const int64_t nlocal;       //< No. of local pairs
        const ProcessID rank;       //< Rank of current process
        const int tag;              //< MPI tag to be used for messages
        std::vector<T*> iptr, jptr; //< Indirection for implementing cyclic buffer !! SHOULD BE VOLATILE ?????
        std::vector<int64_t> map;   //< Used to keep track of actual row indices

        void iteration(const TaskThreadEnv& env) {

            env.barrier();
            start_iteration_hook(env);
            env.barrier();

            if (nlocal > 0) {
                int64_t ilo, ihi;
                A.local_colrange(ilo, ihi);

                int neven = coldim + (coldim&0x1); // pad to an even number of rows for pairing

                int pairlo = rank*A.coltile()/2;

                int threadid = env.id();
                int nthread = env.nthread();

                for (int loop=0; loop<(neven-1); ++loop) {

                    // This loop is parallelized over threads
                    for (int pair=env.id(); pair<nlocal; pair+=nthread) {

                        // Logical positions of the two rows in this pair (position neven-1 stays fixed)
                        int rp = neven/2-1-(pair+pairlo);
                        int iii = (rp+loop)%(neven-1);
                        int jjj = (2*neven-2-rp+loop)%(neven-1);
                        if (rp == 0) jjj = neven-1;

                        iii = map[iii];
                        jjj = map[jjj];

                        if (jptr[pair]) {
                            kernel(iii, jjj, iptr[pair], jptr[pair]);
                        }
                    }
                    env.barrier();

                    if (threadid == 0) cycle();

                    env.barrier();
                }
            }

            end_iteration_hook(env);

            env.barrier();
        }

        /// Call this after iterating to restore the matrix to its original (unshuffled) row order
        void unshuffle() {
            if (nlocal <= 0) return;
            Tensor<T>& t = A.data();
            Tensor<T> tmp(2L, t.dims(), false);
            T* tp = tmp.ptr();
            for (int64_t i=0; i<nlocal; ++i) {
                memcpy(tp+i*rowdim, iptr[i], rowdim*sizeof(T));
                if (jptr[i]) {
                    memcpy(tp+(i+nlocal)*rowdim, jptr[i], rowdim*sizeof(T));
                }
                iptr[i] = &t(i,0);
                jptr[i] = &t(i+nlocal,0);
            }
            memcpy(t.ptr(), tmp.ptr(), t.size()*sizeof(T));

            if (rank==(nproc-1) && (coldim&0x1)) jptr[nlocal-1] = 0;
        }

        /// Cycles the row data around the loop so that the next set of pairs is formed (invoked by one thread only)
        void cycle() {
            if (coldim <= 2) return; // No cycling necessary
            if (nlocal <= 0) {       // Nothing local
                MADNESS_ASSERT(rank >= nproc);
                return;
            }

            // Check assumption that tiling put incomplete tile at the end
            MADNESS_ASSERT(A.local_coldim() == A.coltile() || rank == (nproc-1));

            const ProcessID left = rank-1; // Invalid values are not used
            const ProcessID right = rank+1;

            /*
              Consider matrix (10,*) distributed with coltile=4 over
              three processors.

              .   0 1 2 3    4 5 6 7    8 9

              This is divided up as follows into this initial
              configuration for the loop

              .          P=0          P=1         P=2
              .                   msg          msg
              .   i   -->0-->1  -->   4-->5  -->    8  -->
              .       ^                                 |   msg
              .       |  <---------
              .   j   <--2<--3  <--   6<--7  <--|   9
              .                   msg          msg

              The first and last processes in the loop have to wrap ... others
              just pass left and right.  Note that 9 stays put.

              Note that the algorithm is assuming distribution puts equal
              amount of data on all nodes except the last.

              The i data is considered as flowing to the right.
              The j data is considered as flowing to the left.


              Hence, we should explore the pairs in this order
              (n-1 sets of n/2 pairs)

              .          P=0        P=1       P=2
              .          0  1       4  5      8
              .          2  3       6  7      9

              .          2  0       1  4      5
              .          3  6       7  8      9

              .          3  2       0  1      4
              .          6  7       8  5      9

              .          6  3       2  0      1
              .          7  8       5  4      9

              .          7  6       3  2      0
              .          8  5       4  1      9

              .          8  7       6  3      2
              .          5  4       1  0      9

              .          5  8       7  6      3
              .          4  1       0  2      9

              .          4  5       8  7      6
              .          1  0       2  3      9

              .          1  4       5  8      7
              .          0  2       3  6      9
            */

            // Copy end elements before they are overwritten
            T* ilast  = iptr[nlocal-1];
            T* jfirst = jptr[0];

            // Cycle local pointers
            for (int64_t i=0; i<nlocal-1; ++i) {
                iptr[nlocal-i-1] = iptr[nlocal-i-2];
                jptr[i] = jptr[i+1];
            }

            World& world = A.get_world();

            if (nproc == 1) {
                iptr[0] = jfirst;
                jptr[nlocal-2] = ilast;
            }
            else if (rank == 0) {
                iptr[0] = jfirst;
                world.mpi.Send(ilast, rowdim, right, tag);
                jptr[nlocal-1] = ilast;
                world.mpi.Recv(ilast, rowdim, right, tag);
            }
            else if (rank == (nproc-1)) {
                if (nlocal > 1) {
                    iptr[0] = jfirst;
                    jptr[nlocal-2] = ilast;
                }
                std::vector<T> buf(rowdim);
                SafeMPI::Request req = world.mpi.Irecv(&buf[0], rowdim, left, tag);
                world.mpi.Send(iptr[0], rowdim, left, tag);
                world.await(req,false);
                std::memcpy(iptr[0], &buf[0], rowdim*sizeof(T));
            }
            else {
                std::vector<T> buf1(rowdim);
                std::vector<T> buf2(rowdim);
                SafeMPI::Request req1 = world.mpi.Irecv(&buf1[0], rowdim, left, tag);
                SafeMPI::Request req2 = world.mpi.Irecv(&buf2[0], rowdim, right, tag);
                world.mpi.Send( ilast, rowdim, right, tag);
                world.mpi.Send(jfirst, rowdim,  left, tag);
                world.await(req1,false);
                world.await(req2,false);
                std::memcpy( ilast, &buf2[0], rowdim*sizeof(T)); //world.mpi.Recv( ilast, rowdim, right, tag);
                std::memcpy(jfirst, &buf1[0], rowdim*sizeof(T)); //world.mpi.Recv(jfirst, rowdim,  left, tag);

                iptr[0] = jfirst;
                jptr[nlocal-1] = ilast;
            }
        }

        /// Used by the task queue to identify this task
        virtual void get_id(std::pair<void*,unsigned short>& id) const {
            PoolTaskInterface::make_id(id, *this);
        }

    public:
        /// A must be a column distributed matrix with an even column tile >= 2
        SystolicMatrixAlgorithm(DistributedMatrix<T>& A, int tag, int nthread=ThreadPool::size()+1)
            : A(A)
            , nproc(A.process_coldim()*A.process_rowdim())
            , coldim(A.coldim())
            , rowdim(A.rowdim())
            , nlocal((A.local_coldim()+1)/2)
            , rank(A.get_world().rank())
            , tag(tag)
            , iptr(nlocal)
            , jptr(nlocal)
            , map(coldim+(coldim&0x1))
        {
            TaskInterface::set_nthread(nthread);

            MADNESS_ASSERT(A.is_column_distributed() && (nproc==1 || (A.coltile()&0x1)==0));

            // Initialize vectors of pointers to matrix rows
            Tensor<T>& t = A.data();

            //madness::print(nproc, coldim, rowdim, nlocal, rank, tag);

            for (int64_t i=0; i<nlocal; ++i) {
                iptr[i] = &t(i,0);
                jptr[i] = &t(i+nlocal,0);
            }

            // If no. of rows is odd, last process should have an empty last row
            if (rank==(nproc-1) && (coldim&0x1)) jptr[nlocal-1] = 0;

            // Initialize map from logical index order to actual index order

            int neven = (coldim+1)/2;
            int ii=0;
            for (ProcessID p=0; p<nproc; ++p) {
                int64_t lo, hi;
                A.get_colrange(p, lo, hi);
                int p_nlocal = (hi - lo + 2)/2;
                //print("I think process",p,"has",lo,hi,p_nlocal);
                for (int i=0; i<p_nlocal; ++i) {
                    map[ii+i] = lo+i;
                    //map[coldim-ii-nlocal+i] = lo+i+nlocal;
                    map[ii+i+neven] = lo+i+p_nlocal;
                }
                ii += p_nlocal;
            }

            std::reverse(map.begin(),map.begin()+neven);

            //print("MAP", map);
        }

        virtual ~SystolicMatrixAlgorithm() {}

        /// Threadsafe routine to apply the operation to rows i and j of the matrix
        virtual void kernel(int i, int j, T* rowi, T* rowj) = 0;

        /// Invoked simultaneously by all threads after each sweep to test for convergence
        virtual bool converged(const TaskThreadEnv& env) const = 0;

        /// Invoked by all threads at the start of each iteration
        virtual void start_iteration_hook(const TaskThreadEnv& env) {}

        /// Invoked by all threads at the end of each iteration
        virtual void end_iteration_hook(const TaskThreadEnv& env) {}

        /// Invoked by the task queue to run the algorithm with multiple threads
        void run(World& world, const TaskThreadEnv& env) {
            do {
                iteration(env);
            } while (!converged(env));

            if (env.id() == 0) unshuffle();

            env.barrier();
        }

        /// Invoked by the user to run the algorithm with one thread, mostly for debugging
        void solve_sequential() {
            run(A.get_world(), TaskThreadEnv(1,0,0));
        }

        /// Returns the length of a row
        int64_t get_rowdim() const {return rowdim;}

        /// Returns the length of a column
        int64_t get_coldim() const {return coldim;}

        /// Returns a reference to the world
        World& get_world() const {
            return A.get_world();
        }

        /// Returns the rank of this process in the world
        ProcessID get_rank() const {
            return rank;
        }
    };
}

#endif
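
Example usage (not part of the header above): a minimal sketch of a concrete subclass, assuming that only kernel() and converged() must be supplied and that the iteration hooks may be overridden as needed. The class name PairwiseDots, the tag value 3333, and the column_distributed_matrix factory in the commented-out driver are illustrative assumptions rather than APIs defined in this file.

#include <madness/tensor/systolic.h>

using namespace madness;

// Hypothetical subclass: for every distinct pair of rows generated during one
// systolic sweep it prints the dot product of the two rows, then declares
// convergence so that run() performs exactly one sweep.
template <typename T>
class PairwiseDots : public SystolicMatrixAlgorithm<T> {
    int nsweep; // updated only by thread 0 in end_iteration_hook()

public:
    PairwiseDots(DistributedMatrix<T>& A, int tag)
        : SystolicMatrixAlgorithm<T>(A, tag), nsweep(0) {}

    // Called concurrently by several threads, each on a different pair, so it
    // must not touch shared state without synchronization.
    void kernel(int i, int j, T* rowi, T* rowj) {
        T sum = 0;
        for (int64_t k=0; k<this->get_rowdim(); ++k) sum += rowi[k]*rowj[k];
        madness::print("rows", i, j, "dot", sum); // debug output, as in the commented-out print calls above
    }

    // Invoked simultaneously by all threads; they all read the same counter.
    bool converged(const TaskThreadEnv& env) const { return nsweep > 0; }

    void end_iteration_hook(const TaskThreadEnv& env) {
        if (env.id() == 0) ++nsweep; // a barrier inside iteration() follows this call
    }
};

// Sketch of a driver (the matrix factory below is an assumption about the
// DistributedMatrix API, not something defined in this header):
//
//   DistributedMatrix<double> A = column_distributed_matrix<double>(world, n, m);
//   PairwiseDots<double> alg(A, 3333);
//   alg.solve_sequential();   // single-threaded run, mostly for debugging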
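
The pair ordering worked through in the long comment inside cycle() can be replayed directly from the index arithmetic in iteration(). The standalone sketch below (plain C++, not part of MADNESS) prints, for a 10-row matrix on a single process (so pairlo = 0), the logical pairs visited in each of the neven-1 rounds of one sweep; position neven-1 stays put, matching the "9 stays put" remark in the diagram. In the header these logical positions are additionally translated through map[] into actual row indices.

#include <cstdio>

int main() {
    const int coldim = 10;                       // same size as the worked example in cycle()
    const int neven  = coldim + (coldim & 0x1);  // pad to an even number of rows
    const int npairs = neven / 2;                // pairs processed per round

    for (int loop = 0; loop < neven - 1; ++loop) {    // neven-1 rounds make one sweep
        for (int pair = 0; pair < npairs; ++pair) {   // single process, so pairlo = 0
            int rp  = neven/2 - 1 - pair;
            int iii = (rp + loop) % (neven - 1);
            int jjj = (2*neven - 2 - rp + loop) % (neven - 1);
            if (rp == 0) jjj = neven - 1;             // this position never moves
            std::printf("(%d,%d) ", iii, jjj);
        }
        std::printf("\n");
    }
    return 0;
}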