#include "parallelJacobi.h"

// Timing hooks. When CMK_BLUEGENE_CHARM is set the names BgElapse and
// BgGetTime are left as they are (each macro expands to itself); otherwise
// BgElapse(x) expands to nothing and BgGetTime becomes CmiWallTimer.
#if CMK_BLUEGENE_CHARM
#define BgElapse BgElapse
#define BgGetTime   BgGetTime
#else
#define BgElapse(x)
#define BgGetTime   CmiWallTimer
#endif

// Set to a non-zero value to compile in the extra print() calls in
// Chunk::startWork.
#define DEBUG   0

// Flattens a (row i, column j) pair into an offset in a row-major array whose
// rows hold ydim entries.
#define indexof(i,j,ydim) ( ((i)*(ydim)) + (j))

// Proxy to the Chunk array; assigned in Main::Main and used by workover and
// by the chunks themselves to address their neighbours.
CProxy_Chunk chunk_arr;

// Value of BgGetTime() recorded in Main::Main; workover subtracts it from the
// current time to report the elapsed time.
double startTime;

/**
 * Reduction client registered on chunk_arr in Main::Main.
 *
 * Receives the result of the max_double reduction contributed by
 * Chunk::doWork, i.e. the largest per-cell change of the step that just
 * finished. Prints it, and either terminates the program once the change
 * drops below 0.01 or broadcasts singleStep() to start the next iteration.
 *
 * @param param  client parameter given at registration (unused).
 * @param size   size of the reduction payload in bytes (unused).
 * @param data   pointer to the reduced value; read as a single double.
 */
void workover(void * param, int size, void *data){

  const double maxChange = *((double*)(data));
  static int loopCount=0;   // number of completed iterations so far

  loopCount++;
  CkPrintf("workover: iter=%d max = %f\n\n\n",loopCount, maxChange);

  if(maxChange < 0.01)  {
    double elapt = BgGetTime()-startTime;
    CkPrintf("Finished in %fs %fs/step\n", elapt, elapt/loopCount );
    CkExit();
    // Converged: stop here so that no further iteration is broadcast while
    // the program is shutting down.
    return;
  }
  BgElapse(2e-5);

  chunk_arr.singleStep();
}

/**
 * Builds one strip of the grid.
 *
 * @param t  total number of chunks in the array.
 * @param x  global number of rows, divided among the chunks.
 * @param y  number of columns; every chunk holds all of them.
 */
Chunk::Chunk(int t, int x, int y){

  __sdag_init();

  total = t;
  counter = 0;
  iterations = 0;

  // Rows are split evenly; the last chunk takes whatever is left over.
  myxdim = int(x/total);
  if (thisIndex == total-1)
    myxdim = x - myxdim*(total-1);

  myydim = y;

  // Chunks that are neither first nor last get two extra rows, the first and
  // last chunk get one extra row.
  const bool hasTwoNeighbours = (thisIndex != 0) && (thisIndex != total-1);
  const int rows = hasTwoNeighbours ? (myxdim+2) : (myxdim+1);
  const int cells = rows*myydim;

  A = new double[cells];
  B = new double[cells];

  // Start from an all-zero grid in both buffers.
  for (int cell = 0; cell < cells; cell++)
    A[cell] = B[cell] = 0.0;

  usesAtSync = false;
  //LBDatabase *lbdb = getLBDB();
  //lbdb->SetLBPeriod(50);

  resetBoundary();

  BgElapse(.5e-4);
}


// Constructor taking a CkMigrateMessage. It performs no work: none of the
// members set by the (t, x, y) constructor are touched here.
Chunk::Chunk(CkMigrateMessage *m)
{
}


// Writes the fixed boundary value 1.0 into both buffers.
//  - Chunk 0: column 0 of rows 0..myxdim-1, plus the first half of row 0.
//  - Any other chunk in the lower half of the array (index < total/2):
//    column 0 of rows 1..myxdim.
//  - All remaining chunks are left untouched.
void Chunk::resetBoundary() {
  if (thisIndex == 0) {
    for (int row = 0; row < myxdim; row++)
      A[indexof(row,0,myydim)] = B[indexof(row,0,myydim)] = 1.0;

    for (int col = 0; 2*col < myydim; col++)
      A[indexof(0,col,myydim)] = B[indexof(0,col,myydim)] = 1.0;
  }
  else if (thisIndex < (int)(total/2)) {
    for (int row = 1; row < myxdim+1; row++)
      A[indexof(row,0,myydim)] = B[indexof(row,0,myydim)] = 1.0;
  }
}


// Dumps this chunk's rows of A, one output line per column index.
// Nothing is printed when either dimension exceeds 100.
void Chunk::print() {

  if ((myxdim>100)||(myydim>100)) return;

  CkPrintf("thisIndex = %d,myxdim=%d,myydim=%d\n",thisIndex,myxdim,myydim);

  // Chunk 0 prints rows 0..myxdim-1; every other chunk prints rows
  // 1..myxdim.
  const int firstRow = (thisIndex != 0) ? 1 : 0;
  const int endRow = firstRow + myxdim;

  for (int col = 0; col < myydim; col++) {
    for (int row = firstRow; row < endRow; row++)
      CkPrintf("%lf ", A[indexof(row,col,myydim)]) ;
    CkPrintf("\n");
  }
}


// Sends this chunk's edge rows to its neighbours.
//
// Every chunk sends exactly one getStripfromright and one getStripfromleft
// message. A chunk with a left neighbour sends it row 1; a chunk with a right
// neighbour sends it its last owned row. Chunk 0 and chunk total-1 have no
// neighbour on one side and send a dummy strip (to chunk total-1 and chunk 0
// respectively) so that every chunk still receives both messages.
void Chunk::startWork(){

  CkPrintf("[%d] in startWork %f\n", CkMyPe(), BgGetTime());

#if DEBUG
  print();
  CkPrintf("\n\n\n");
#endif

  // Value-initialised (all zeros) so that a dummy strip sent before the
  // buffer has been filled never carries indeterminate values.
  double* temp = new double[myydim]();

  if(thisIndex >0){
    for(int i=0;i<myydim;i++)
      temp[i] = A[indexof(1,i,myydim)];
    chunk_arr[thisIndex-1].getStripfromright(myydim,temp);
  }
  else{
    //Send dummy if thisIndex==0 for dagger to work
    chunk_arr[total-1].getStripfromright(myydim,temp);
  }

  BgElapse(0.25e-4); //Time for sending to getStripfromright

  if(thisIndex < total-1){
    // Chunk 0 stores its rows starting at 0, all other chunks at 1.
    int startI=1;
    if (thisIndex==0) startI=0;
    for(int i=0;i<myydim;i++)
      temp[i] = A[indexof(startI+myxdim-1,i,myydim)];
    chunk_arr[thisIndex+1].getStripfromleft(myydim,temp);
  }
  else{
    //Send dummy if thisIndex==total-1:For dagger to work
    chunk_arr[0].getStripfromleft(myydim,temp);
  }

  BgElapse(0.25e-4); //Time for sending to getStripfromleft

#if DEBUG
  CkPrintf("[%d] in end of startWork is \n", thisIndex);
  print();
#endif
  //chunk_arr[thisIndex].singleStep(new VoidMsg());
  delete [] temp;
}


// Performs one relaxation sweep: each updated cell of B becomes 0.2 times the
// sum of the cell and its four neighbours in A. The buffers are then swapped
// and the largest absolute change is contributed to a max_double reduction.
void Chunk::doWork(){
  // Rows 1..lastRow are updated; the first and the last chunk each update one
  // row fewer than the other chunks.
  int lastRow = myxdim;
  if (thisIndex==0) lastRow -= 1;
  if (thisIndex==total-1) lastRow -= 1;

  double largestDelta = 0.0;

  for (int row = 1; row <= lastRow; row++) {
    // Columns 0 and myydim-1 are never written.
    for (int col = 1; col < myydim-1; col++) {
      const int here = indexof(row,col,myydim);

      B[here] =
        (0.2)*( A[here]                       +
                A[indexof(row,  col+1,myydim)] +
                A[indexof(row,  col-1,myydim)] +
                A[indexof(row+1,col,  myydim)] +
                A[indexof(row-1,col,  myydim)]);

      double delta = B[here] - A[here];
      if (delta < 0) delta = -delta;
      if (delta > largestDelta) largestDelta = delta;
    }
  }

  // The freshly computed values become the current grid.
  double *previous = A;
  A = B;
  B = previous;

  BgElapse(20e-4);
  contribute(sizeof(double),(void*)&largestDelta,CkReduction::max_double);
}


// Stores a strip received from the left neighbour into row 0 of A.
// Chunk 0 has no left neighbour; the strip it receives is a dummy and is
// ignored. The first myydim entries of a are copied; n is not consulted.
void Chunk::processStripfromleft(int n,double *a){

  BgElapse(2.5e-4);

  if (thisIndex == 0)
    return;

  for (int col = 0; col < myydim; col++)
    A[indexof(0,col,myydim)] = a[col];
}

// Stores a strip received from the right neighbour into the row just past
// this chunk's own rows. Chunk total-1 has no right neighbour; the strip it
// receives is a dummy and is ignored. The first myydim entries of a are
// copied; n is not consulted.
void Chunk::processStripfromright(int n,double *a){

  BgElapse(2.5e-4);

  if (thisIndex == total -1)
    return;

  // Chunk 0 keeps its rows at 0..myxdim-1, every other chunk at 1..myxdim,
  // so the receiving row differs by one.
  const int ghostRow = (thisIndex != 0) ? (myxdim+1) : myxdim;

  for (int col = 0; col < myydim; col++)
    A[indexof(ghostRow,col,myydim)] = a[col];
}


// Program entry point: parses "<x size> <y size> <x strips>", creates the
// Chunk array with one element per strip, registers workover as the reduction
// client and broadcasts the first singleStep().
//
// Aborts when the argument count is wrong, when the strip count or y size is
// not positive, or when there are more strips than rows.
Main::Main(CkArgMsg *m)
{
  if(m->argc != 4) CkAbort("Usage: jacobi <x size> <y size> <x strips>\n");

  const int x = atoi(m->argv[1]);
  const int y = atoi(m->argv[2]);
  const int k = atoi(m->argv[3]);

  // All arguments have been read; release the startup message.
  delete m;

  // With no strips the array would be empty, nothing would ever contribute to
  // the reduction and workover would never run.
  if(k < 1) CkAbort("Number of x strips must be at least 1");
  if(y < 1) CkAbort("Ydim must be at least 1");
  if(x < k) CkAbort("Xdim must be greater than k");

  chunk_arr = CProxy_Chunk::ckNew(k,x,y, k);
  chunk_arr.setReductionClient(workover, (void*)NULL);

  BgElapse(.25e-5);
  /*
  for(int i=0;i<k;i++)
    chunk_arr[i].startWork();*/

  chunk_arr.singleStep();

  startTime = BgGetTime();
}


#include "parallelJacobi.def.h"
