// Cuda runtime library
#include <cuda_runtime.h>
#include "utility.h"

#ifdef LINUX
#include <inttypes.h>
#define __int64 int64_t
#endif

// Global texture references
// In the code visible here, cuda_GetInteractionPairs binds genocase_Texture and
// genoctrl_Texture to the device copies of the genotype tables and wordbits_Texture
// to the device copy of the bit-count table, and unbinds all three before returning.
// Every tex1Dfetch on these references in the visible kernels is commented out, so
// the kernels currently read the same data through plain global-memory pointers.
// *Note* : Remember to unbind texture after use
texture<uint2, 1, cudaReadModeElementType> genocase_Texture;
texture<uint2, 1, cudaReadModeElementType> genoctrl_Texture;
texture<unsigned char, 1, cudaReadModeElementType> wordbits_Texture;

//// general helper functions
// Divides a by b and adds one to the truncated quotient whenever the remainder
// is non-zero (i.e. ceiling division for positive operands).
// b must be non-zero.
long long iDivUp(long long a, long long b) {
	long long quotient = a / b;
	if (a % b != 0) {
		++quotient;
	}
	return quotient;
}

// Terminates the process if the CUDA runtime reports a pending error.
//   msg : caller-supplied context printed in front of the CUDA error text.
// cudaGetLastError() also resets the stored error, so a successful return
// leaves the runtime's error state cleared.
void checkCUDAError(const char *msg) {
  const cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess) {
    return;
  }
  fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
  exit(EXIT_FAILURE);
}

//__constant__ unsigned char dev_wordbits[65536];// { bitcounts of ints between 0 and 65535 };
//extern "C" void cuda_SetWordBits(const unsigned char* wordBits, int count) {
//	cudaMemcpyToSymbol(dev_wordbits, wordBits, sizeof(unsigned char)*count, 0, cudaMemcpyHostToDevice);
//}

// Hamming weight: returns the number of set bits (0..64) in the 64-bit word i.
// Implemented with the __popcll device intrinsic instead of the previous
// shift/mask/multiply sequence, which operated on a signed value and relied on
// signed multiplication overflow and arithmetic right shifts. The result is the
// same for every input.
inline __device__ int dev_count_bit(__int64 i) {
	return __popcll((unsigned long long)i);
}

// Hamming weight of x computed with shifts, masks and additions only
// (no multiplication). Returns the number of set bits in x.
inline __device__ int dev_count_bit_slow_mult(__int64 x) {
	const __int64 kPairMask   = 0x5555555555555555;
	const __int64 kNibbleMask = 0x3333333333333333;
	const __int64 kByteMask   = 0x0f0f0f0f0f0f0f0f;

	// Each 2-bit field receives the count of its two bits.
	__int64 acc = x - ((x >> 1) & kPairMask);
	// Each 4-bit field receives the count of its four bits.
	acc = (acc & kNibbleMask) + ((acc >> 2) & kNibbleMask);
	// Each byte receives the count of its eight bits.
	acc = (acc + (acc >> 4)) & kByteMask;
	// Fold the per-byte counts down into the lowest byte.
	acc = acc + (acc >> 8);
	acc = acc + (acc >> 16);
	acc = acc + (acc >> 32);
	// The total is at most 64, so seven bits are enough.
	return (int)(acc & 0x7f);
}

// KSA screening kernel: each thread scores one candidate SNP pair.
//
// Launch layout: 1-D grid of 1-D blocks. Thread t = blockIdx.x*blockDim.x + threadIdx.x
// reads pair t from interactionInputOffsetJ1/J2 and writes slot t of
// interactionPairOffsetJ1/J2, so each of those four arrays needs at least
// gridDim.x*blockDim.x entries. No shared memory is used.
//
// Parameters
//   genocase, genoctrl          64-bit genotype words addressed as [(k*3 + g)*p + snp]
//                               (k = word index, g = genotype); only g = 0,1 are read,
//                               the g = 2 cells are derived from the marginals.
//   p                           number of SNP columns in every table.
//   nLongIntcase, nLongIntctrl  number of words per (genotype, SNP) in genocase / genoctrl.
//   interactionInputOffsetJ1/J2 SNP indices (j1, j2) of the pair handled by each thread.
//   interactionPairOffsetJ1/J2  output: (j1, j2) when the measure exceeds thresholdRecord,
//                               (0, 0) otherwise; the slot is left untouched for rejected pairs.
//   pMarginalDistrSNP           counts addressed as [g*p + snp].
//   pMarginalDistrSNP_Y         counts addressed as [(g*MarginalDistrSNP_Y_DimensionX + y)*p + snp],
//                               y = 0 for cases and y = 1 for controls.
//   n, ncase, nctrl             totals used by the KSA approximation (presumably the overall,
//                               case and control sample counts - confirm with the caller).
//   thresholdRecord             pairs scoring above this value are reported.
//   wordbits                    unused; kept for interface compatibility with the
//                               look-up-table bit counting that was tried earlier.
__global__ void pairwiseKSA_Kernel(uint64* genocase, uint64* genoctrl, int p, int nLongIntcase, int nLongIntctrl, int * interactionInputOffsetJ1, int* interactionInputOffsetJ2, int *interactionPairOffsetJ1, int *interactionPairOffsetJ2, int* pMarginalDistrSNP, int* pMarginalDistrSNP_Y, int n, float thresholdRecord, int ncase, int nctrl, unsigned char* wordbits)
{
	const int outIndex = blockIdx.x * blockDim.x + threadIdx.x;
	const int j1 = interactionInputOffsetJ1[outIndex];
	const int j2 = interactionInputOffsetJ2[outIndex];

	int i, j, k, y;
	int count;
	int localGenoDistr[18];
	float tao = 0;
	float InteractionMeasure = 0;
	float ptmp1, ptmp2;
	__int64 andResult = 0;

	// Skip invalid and out-of-range pairs. The j1 < 0 test matters: without it a
	// negative j1 (e.g. from an unfilled input slot) passes the remaining tests
	// and is then used to index the genotype tables.
	if ( j1 < 0 || j2 <= j1 || j1 >= p-1 || j2 >= p ) {
		return;
	}

	// Genotype table offsets are formed in size_t so that k*3*p cannot overflow
	// a 32-bit int for large tables.
	const size_t wordStride = (size_t)3 * p;

	// Joint counts for genotypes {0,1} x {0,1}; y = 0 -> cases, y = 1 -> controls.
	for (y = 0; y < 2; y++)
	{
		const uint64* geno = (y == 0) ? genocase : genoctrl;
		const int nLongInt = (y == 0) ? nLongIntcase : nLongIntctrl;

		for (i = 0; i < 2; i++)
		{
			for (j = 0; j < 2; j++)
			{
				count = 0;
				for (k = 0; k < nLongInt; k++)
				{
					const size_t wordBase = k * wordStride;
					andResult = geno[wordBase + (size_t)i*p + j1] & geno[wordBase + (size_t)j*p + j2];
					count += dev_count_bit(andResult);
				}
				localGenoDistr[9*y + i*3 + j] = count;
			}
		}
	}

	// Remaining five cells of each 3x3 table follow from the per-SNP marginals.
	for (y = 0; y < 2; y++)
	{
		int* cell = localGenoDistr + 9*y;

		cell[2] = pMarginalDistrSNP_Y[(0*MarginalDistrSNP_Y_DimensionX+y)*p+j1] - cell[0] - cell[1];
		cell[5] = pMarginalDistrSNP_Y[(1*MarginalDistrSNP_Y_DimensionX+y)*p+j1] - cell[3] - cell[4];

		cell[6] = pMarginalDistrSNP_Y[(0*MarginalDistrSNP_Y_DimensionX+y)*p+j2] - cell[0] - cell[3];
		cell[7] = pMarginalDistrSNP_Y[(1*MarginalDistrSNP_Y_DimensionX+y)*p+j2] - cell[1] - cell[4];

		cell[8] = pMarginalDistrSNP_Y[(2*MarginalDistrSNP_Y_DimensionX+y)*p+j2] - cell[2] - cell[5];
	}

	/*
		GenoJointDistr: the index is as follows:
						AABB
				Case	0	1	2	3	4	5	6	7	8
				Ctrl	9	10	11	12	13	14	15	16	17
	*/
	//Papprx = Pab*Pbc*Pca
	// tao = sum(Papprx)
	// sum(p.* log(p) - p.*log(Pappr) + p.* log(tao))
	for (i = 0; i < 3; i++)			// index for A
	{
		for (j = 0; j < 3; j++)		// index for B
		{
			const int pooled = localGenoDistr[i*3+j] + localGenoDistr[9+i*3+j];

			for (y = 0; y < 2; y++)	// index for C: 0 = case, 1 = ctrl
			{
				ptmp1 = (float)localGenoDistr[9*y+i*3+j];

				if (ptmp1 > 0)
				{
					InteractionMeasure += ptmp1*log(ptmp1);
				}

				//ptmp2 KSA  n_{ij}*n_{jk}*n_{ik}/ (n_i * n_j * n_k)
				ptmp2 = (float)n*
					pooled*
					pMarginalDistrSNP_Y[(i*MarginalDistrSNP_Y_DimensionX+y)*p+j1]*
					pMarginalDistrSNP_Y[(j*MarginalDistrSNP_Y_DimensionX+y)*p+j2]/
					pMarginalDistrSNP[i*p+j1]/
					pMarginalDistrSNP[j*p+j2]/
					((y == 0) ? ncase : nctrl);

				// A zero marginal makes ptmp2 NaN (0/0); the comparison is then
				// false and the term is skipped.
				if (ptmp2 > 0)
				{
					InteractionMeasure += -ptmp1*log(ptmp2);
					tao += ptmp2;
				}
			}
		}
	}

	InteractionMeasure = (InteractionMeasure+n*log(tao/n))*2;

	// (0, 0) marks "not significant"; the host filters on that sentinel.
	const bool significant = InteractionMeasure > thresholdRecord;
	interactionPairOffsetJ1[outIndex] = significant ? j1 : 0;
	interactionPairOffsetJ2[outIndex] = significant ? j2 : 0;
}

// Chi-square screening kernel: each thread scores one candidate SNP pair.
//
// The nine joint-genotype cells are ordered by descending case/control ratio;
// every split of that ordering into a "high" prefix and a "low" suffix gives a
// 2x2 table (high/low x case/ctrl) whose chi-square value is computed, and the
// largest value over the eight splits is compared with thresholdRecord.
//
// Launch layout: 1-D grid of 1-D blocks. Thread t = blockIdx.x*blockDim.x + threadIdx.x
// reads pair t from interactionInputOffsetJ1/J2 and writes slot t of
// interactionPairOffsetJ1/J2, so each of those four arrays needs at least
// gridDim.x*blockDim.x entries. No shared memory is used.
//
// Parameters
//   genocase, genoctrl          64-bit genotype words addressed as [(k*3 + g)*p + snp]
//                               (k = word index, g = genotype); only g = 0,1 are read.
//   p                           number of SNP columns in every table.
//   nLongIntcase, nLongIntctrl  number of words per (genotype, SNP) in genocase / genoctrl.
//   interactionInputOffsetJ1/J2 SNP indices (j1, j2) of the pair handled by each thread.
//   interactionPairOffsetJ1/J2  output: (j1, j2) when the maximum chi-square exceeds
//                               thresholdRecord, (0, 0) otherwise; untouched for rejected pairs.
//   pMarginalDistrSNP_Y         counts addressed as [(g*MarginalDistrSNP_Y_DimensionX + y)*p + snp],
//                               y = 0 for cases and y = 1 for controls.
//   pMarginalDistrSNP, n, ncase, nctrl, wordbits, floatArray
//                               not used by this kernel; kept for interface compatibility.
__global__ void pairwiseChiSquare_Kernel(uint64* genocase, uint64* genoctrl, int p, int nLongIntcase, int nLongIntctrl, int * interactionInputOffsetJ1, int* interactionInputOffsetJ2, int *interactionPairOffsetJ1, int *interactionPairOffsetJ2, int* pMarginalDistrSNP, int* pMarginalDistrSNP_Y, int n, float thresholdRecord, int ncase, int nctrl, unsigned char* wordbits, float* floatArray)
{
	const int outIndex = blockIdx.x * blockDim.x + threadIdx.x;
	const int j1 = interactionInputOffsetJ1[outIndex];
	const int j2 = interactionInputOffsetJ2[outIndex];

	int i, j, k, y;
	int count;
	int localGenoDistr[18];
	float ratio[9];
	int index[9];
	__int64 andResult = 0;

	float maxChiSquareValue = -1.0f;
	float chiSquareValue = 0.0f;
	float numerator, denominator;

	// Skip invalid and out-of-range pairs. The j1 < 0 test matters: without it a
	// negative j1 (e.g. from an unfilled input slot) passes the remaining tests
	// and is then used to index the genotype tables.
	if ( j1 < 0 || j2 <= j1 || j1 >= p-1 || j2 >= p ) {
		return;
	}

	// Genotype table offsets are formed in size_t so that k*3*p cannot overflow
	// a 32-bit int for large tables.
	const size_t wordStride = (size_t)3 * p;

	// Joint counts for genotypes {0,1} x {0,1}; y = 0 -> cases, y = 1 -> controls.
	for (y = 0; y < 2; y++)
	{
		const uint64* geno = (y == 0) ? genocase : genoctrl;
		const int nLongInt = (y == 0) ? nLongIntcase : nLongIntctrl;

		for (i = 0; i < 2; i++)
		{
			for (j = 0; j < 2; j++)
			{
				count = 0;
				for (k = 0; k < nLongInt; k++)
				{
					const size_t wordBase = k * wordStride;
					andResult = geno[wordBase + (size_t)i*p + j1] & geno[wordBase + (size_t)j*p + j2];
					count += dev_count_bit(andResult);
				}
				localGenoDistr[9*y + i*3 + j] = count;
			}
		}
	}

	// Remaining five cells of each 3x3 table follow from the per-SNP marginals.
	for (y = 0; y < 2; y++)
	{
		int* cell = localGenoDistr + 9*y;

		cell[2] = pMarginalDistrSNP_Y[(0*MarginalDistrSNP_Y_DimensionX+y)*p+j1] - cell[0] - cell[1];
		cell[5] = pMarginalDistrSNP_Y[(1*MarginalDistrSNP_Y_DimensionX+y)*p+j1] - cell[3] - cell[4];

		cell[6] = pMarginalDistrSNP_Y[(0*MarginalDistrSNP_Y_DimensionX+y)*p+j2] - cell[0] - cell[3];
		cell[7] = pMarginalDistrSNP_Y[(1*MarginalDistrSNP_Y_DimensionX+y)*p+j2] - cell[1] - cell[4];

		cell[8] = pMarginalDistrSNP_Y[(2*MarginalDistrSNP_Y_DimensionX+y)*p+j2] - cell[2] - cell[5];
	}

	// Case/control ratio per cell; a cell without controls uses the raw case count.
	int caseTotal = 0;
	int ctrlTotal = 0;
	for ( i = 0; i < 9; i++ ) {
		if ( localGenoDistr[9+i] == 0 ) {
			ratio[i] = (float)localGenoDistr[i];
		}
		else {
			ratio[i] = localGenoDistr[i] / ((float)localGenoDistr[9+i]);
		}
		index[i] = i;
		caseTotal += localGenoDistr[i];
		ctrlTotal += localGenoDistr[9+i];
	}

	// Exchange sort into descending ratio order. Elements are swapped through a
	// temporary: swapping floats by add/subtract rounds and corrupts the values.
	for ( i = 0; i < 9; i++ ) {
		for ( j = i+1; j < 9; j++ ) {
			if ( ratio[i] < ratio[j] ) {
				const float ratioTmp = ratio[i];
				ratio[i] = ratio[j];
				ratio[j] = ratioTmp;

				const int indexTmp = index[i];
				index[i] = index[j];
				index[j] = indexTmp;
			}
		}
	}

	// Try every split point: cells index[0..i-1] form the "high" group, the
	// rest the "low" group. Group sums are kept as running totals.
	int caseHigh = 0;
	int ctrlHigh = 0;
	for ( i = 1; i < 9; i++ ) {
		caseHigh += localGenoDistr[index[i-1]];
		ctrlHigh += localGenoDistr[9+index[i-1]];

		const int caseLow = caseTotal - caseHigh;
		const int ctrlLow = ctrlTotal - ctrlHigh;

		// The cross product is formed in 64 bits: count*count can exceed INT_MAX.
		const long long cross = (long long)caseHigh*ctrlLow - (long long)caseLow*ctrlHigh;
		const float crossF = (float)cross;

		numerator = (crossF*crossF)*(caseHigh+caseLow+ctrlHigh+ctrlLow);
		denominator = ((float)(caseHigh+caseLow))*(ctrlHigh+ctrlLow)*(caseLow+ctrlLow)*(caseHigh+ctrlHigh);

		// An empty row or column makes the denominator zero; score it as 0.
		if ( fabsf(denominator) < 0.000001f ) {
			chiSquareValue = 0.0f;
		}
		else {
			chiSquareValue = numerator/denominator;
		}

		if ( chiSquareValue > maxChiSquareValue )
			maxChiSquareValue = chiSquareValue;
	}

	// (0, 0) marks "not significant"; the host filters on that sentinel.
	const bool significant = maxChiSquareValue > thresholdRecord;
	interactionPairOffsetJ1[outIndex] = significant ? j1 : 0;
	interactionPairOffsetJ2[outIndex] = significant ? j2 : 0;
}

// Screens every pair drawn from indexVector on the GPU and appends the pairs whose
// score exceeds thresholdRecord to offsetListJ1 / offsetListJ2.
//
// Pairs (indexVector[a], indexVector[b]) with a < b are enumerated in order and
// processed in batches of THREAD_NUM*BLOCK_NUM, one pair per GPU thread. The kernels
// reject pairs whose second index is not greater than the first, so indexVector is
// presumably sorted ascending - confirm with the caller.
//
// Parameters
//   indexVector                 SNP indices to be paired.
//   genocase, genoctrl          host genotype tables of nLongIntcase*3*p and
//                               nLongIntctrl*3*p 64-bit words.
//   p                           number of SNP columns.
//   n, ncase, nctrl             totals forwarded to the kernels.
//   thresholdRecord             score threshold for reporting a pair.
//   pMarginalDistrSNP           host table of MarginalDistrSNP_Y_DimensionY*p ints.
//   pMarginalDistrSNP_Y         host table of
//                               MarginalDistrSNP_Y_DimensionY*MarginalDistrSNP_Y_DimensionX*p ints.
//   wordBits, wordBitCount      bit-count look-up table; uploaded and bound to
//                               wordbits_Texture, not read by the current kernels.
//   offsetListJ1, offsetListJ2  output lists; element i of both lists forms one pair.
//   screenMode                  SCREENING_KSA selects pairwiseKSA_Kernel, anything
//                               else pairwiseChiSquare_Kernel.
//
// Any CUDA error or host allocation failure terminates the process.
extern "C" void cuda_GetInteractionPairs(std::vector<int> indexVector,
	unsigned long long* genocase, unsigned long long* genoctrl, 
	int p, int n, int nLongIntcase, int nLongIntctrl, int ncase, int nctrl, float thresholdRecord, 
	int* pMarginalDistrSNP, int* pMarginalDistrSNP_Y, 
	const unsigned char* wordBits, int wordBitCount, 
	std::list<int> &offsetListJ1, std::list<int> &offsetListJ2, int screenMode)
{	
	printf("\nStart screening ... \n");
	float timeInMs;
	cudaEvent_t evStart, evStop;
	cudaEventCreate(&evStart);
	cudaEventCreate(&evStop);

	cudaEventRecord(evStart, 0);

	// Launch geometry: one batch is one full grid, one pair per thread.
	const int threadNum = THREAD_NUM;
	const int blockNum = BLOCK_NUM;
	const int batchSize = blockNum*threadNum;
	const size_t batchBytes = sizeof(int)*batchSize;
	dim3 threads(threadNum, 1, 1);
	dim3 grid(blockNum, 1, 1);

	const long inputSize = indexVector.size();
	const long long totalTask = ((long long)(inputSize-1))*inputSize/2;
	const int totalNumberOfGridBlock = (int)iDivUp(totalTask, (long long)batchSize);

	const size_t genoCaseBytes = sizeof(uint64)*nLongIntcase*3*p;
	const size_t genoCtrlBytes = sizeof(uint64)*nLongIntctrl*3*p;
	const size_t marginalBytes = sizeof(int)*MarginalDistrSNP_Y_DimensionY*p;
	const size_t marginalYBytes = sizeof(int)*MarginalDistrSNP_Y_DimensionY*MarginalDistrSNP_Y_DimensionX*p;

	uint64* gpu_genocase = 0;
	uint64* gpu_genoctrl = 0;
	int* gpu_pMarginalDistrSNP = 0;
	int* gpu_pMarginalDistrSNP_Y = 0;
	int* gpu_inputOffsetJ1 = 0;
	int* gpu_inputOffsetJ2 = 0;
	int* gpu_InteractionPairOffsetJ1 = 0;
	int* gpu_InteractionPairOffsetJ2 = 0;
	unsigned char* gpu_wordBits = 0;
	// Test-only argument of the chi-square kernel; never allocated here and
	// never dereferenced by the kernel, but it must not be passed uninitialized.
	float* gpu_floatArray = 0;
	int* interactionInputOffsetJ1 = 0;
	int* interactionInputOffsetJ2 = 0;

	// Bit-count look-up table.
	cudaMalloc((void**)&gpu_wordBits, sizeof(char)*wordBitCount);
	cudaMemcpy(gpu_wordBits, wordBits, sizeof(char)*wordBitCount, cudaMemcpyHostToDevice);
	cudaBindTexture(0, wordbits_Texture, gpu_wordBits);

	// Host staging buffers for the results of one batch.
	int* interactionPairOffsetJ1 = (int *)calloc(batchSize, sizeof(int));
	int* interactionPairOffsetJ2 = (int *)calloc(batchSize, sizeof(int));
	if ( interactionPairOffsetJ1 == 0 || interactionPairOffsetJ2 == 0 ) {
		fprintf(stderr, "Host error: cannot allocate result buffers.\n");
		exit(EXIT_FAILURE);
	}

	// Page-locked, device-mapped host memory for the pair indices: the host
	// writes them and the kernels read them through the mapped device pointers.
	// NOTE(review): mapped allocations may require cudaSetDeviceFlags(cudaDeviceMapHost)
	// on some configurations; that call was commented out in the original.
	cudaHostAlloc((void**)&interactionInputOffsetJ1, batchBytes, cudaHostAllocMapped);
	cudaHostAlloc((void**)&interactionInputOffsetJ2, batchBytes, cudaHostAllocMapped);
	cudaHostGetDevicePointer((void**)&gpu_inputOffsetJ1, (void*)interactionInputOffsetJ1, 0);
	cudaHostGetDevicePointer((void**)&gpu_inputOffsetJ2, (void*)interactionInputOffsetJ2, 0);

	// Device copies of the genotype and marginal tables, plus the result buffers.
	cudaMalloc((void**)&gpu_genocase, genoCaseBytes);
	cudaMalloc((void**)&gpu_genoctrl, genoCtrlBytes);
	cudaMalloc((void**)&gpu_pMarginalDistrSNP, marginalBytes);
	cudaMalloc((void**)&gpu_pMarginalDistrSNP_Y, marginalYBytes);
	cudaMalloc((void**)&gpu_InteractionPairOffsetJ1, batchBytes);
	cudaMalloc((void**)&gpu_InteractionPairOffsetJ2, batchBytes);

	cudaMemcpy(gpu_genocase, genocase, genoCaseBytes, cudaMemcpyHostToDevice);
	cudaMemcpy(gpu_genoctrl, genoctrl, genoCtrlBytes, cudaMemcpyHostToDevice);
	cudaMemcpy(gpu_pMarginalDistrSNP, pMarginalDistrSNP, marginalBytes, cudaMemcpyHostToDevice);
	cudaMemcpy(gpu_pMarginalDistrSNP_Y, pMarginalDistrSNP_Y, marginalYBytes, cudaMemcpyHostToDevice);

	cudaBindTexture(0, genocase_Texture, gpu_genocase );
	cudaBindTexture(0, genoctrl_Texture, gpu_genoctrl );

	// Stop here if any allocation, copy or bind failed, before the pinned
	// buffers are written or a kernel is launched on bad pointers.
	checkCUDAError("Device setup Error");

	// flush the STD_OUT for external process to read
	FLUSH_STDOUT();

	// (j1, j2) is a cursor over positions in indexVector; it is carried from one
	// batch to the next so that every pair with j1 < j2 is visited exactly once.
	int j1 = 0, j2 = 1;

	for ( int i = 1; i <= totalNumberOfGridBlock; i++ ) {
		if ( i % 100 == 0 ) {
			printf("\rProgress:%d%%", (int)floor(((float)i/totalNumberOfGridBlock)*100));
			
			// flush the STD_OUT for external process to read
			FLUSH_STDOUT();
		}

		// Fill the input buffers with the next batchSize pairs (fewer in the last batch).
		int filled = 0;
		while ( filled < batchSize && j1 < inputSize-1 ) {
			if ( j2 >= inputSize ) {
				// Row j1 is exhausted; move on to the next one.
				j1++;
				j2 = j1+1;
				continue;
			}
			interactionInputOffsetJ1[filled] = indexVector[j1];
			interactionInputOffsetJ2[filled] = indexVector[j2];
			filled++;
			j2++;
		}

		// Unfilled slots would otherwise hold uninitialized or stale pairs;
		// (0, 0) is rejected by the kernels right away.
		for ( int s = filled; s < batchSize; s++ ) {
			interactionInputOffsetJ1[s] = 0;
			interactionInputOffsetJ2[s] = 0;
		}

		// Reset memory for next iteration
		cudaMemset(gpu_InteractionPairOffsetJ1, 0, batchBytes);
		cudaMemset(gpu_InteractionPairOffsetJ2, 0, batchBytes);

		// execute kernel
		if ( screenMode == SCREENING_KSA ) 
			pairwiseKSA_Kernel<<<grid, threads>>>(gpu_genocase, gpu_genoctrl, p, nLongIntcase, nLongIntctrl, gpu_inputOffsetJ1, gpu_inputOffsetJ2, gpu_InteractionPairOffsetJ1, gpu_InteractionPairOffsetJ2, gpu_pMarginalDistrSNP, gpu_pMarginalDistrSNP_Y, n, thresholdRecord, ncase, nctrl, gpu_wordBits);
		else 
			pairwiseChiSquare_Kernel<<<grid, threads>>>(gpu_genocase, gpu_genoctrl, p, nLongIntcase, nLongIntctrl, gpu_inputOffsetJ1, gpu_inputOffsetJ2, gpu_InteractionPairOffsetJ1, gpu_InteractionPairOffsetJ2, gpu_pMarginalDistrSNP, gpu_pMarginalDistrSNP_Y, n, thresholdRecord, ncase, nctrl, gpu_wordBits, gpu_floatArray);
		// The kernel reads the mapped host buffers; wait for it before they are
		// overwritten with the next batch.
		cudaThreadSynchronize();

		// Read back exactly the slots filled for this batch. Using the fill count
		// (rather than totalTask % batchSize) keeps the final batch intact when
		// totalTask is an exact multiple of batchSize.
		cudaMemcpy(interactionPairOffsetJ1, gpu_InteractionPairOffsetJ1, sizeof(int)*filled, cudaMemcpyDeviceToHost);
		cudaMemcpy(interactionPairOffsetJ2, gpu_InteractionPairOffsetJ2, sizeof(int)*filled, cudaMemcpyDeviceToHost);
		checkCUDAError("Kernel Error"); 

		// A slot is a hit unless it is the (0, 0) sentinel. Requiring both entries
		// to be non-zero would drop every hit that involves SNP index 0.
		for ( int j = 0; j < filled; j++ ) {
			if ( interactionPairOffsetJ1[j] != 0 || interactionPairOffsetJ2[j] != 0 ) {
				offsetListJ1.push_back(interactionPairOffsetJ1[j]);
				offsetListJ2.push_back(interactionPairOffsetJ2[j]);
			}
		}
	}
	printf("\rProgress:%d%%\n", 100);	

	// Unbind used texture
	cudaUnbindTexture(genocase_Texture);
	cudaUnbindTexture(genoctrl_Texture);
	cudaUnbindTexture(wordbits_Texture);

	cudaFree(gpu_wordBits);
	cudaFree(gpu_genocase);
	cudaFree(gpu_genoctrl);
	cudaFree(gpu_pMarginalDistrSNP);
	cudaFree(gpu_pMarginalDistrSNP_Y);
	cudaFree(gpu_InteractionPairOffsetJ1);
	cudaFree(gpu_InteractionPairOffsetJ2);

	// free page-locked host memory (this also releases the mapped device pointers)
	cudaFreeHost(interactionInputOffsetJ1);
	cudaFreeHost(interactionInputOffsetJ2);

	cudaEventRecord(evStop, 0);
	cudaEventSynchronize(evStop);

	cudaEventElapsedTime( &timeInMs, evStart, evStop );
	printf("GPU Time = %fms\n", timeInMs);

	cudaEventDestroy(evStart);
	cudaEventDestroy(evStop);

	// free normal host allocation memory
	free(interactionPairOffsetJ1);
	free(interactionPairOffsetJ2);
}

// Tiled KSA screening kernel: each thread owns one SNP j1 = xOffset + thread index
// and scores it against every SNP j2 in [yOffset, yOffset + height) with j2 < j1.
//
// Launch layout: 1-D grid of 1-D blocks with gridDim.x*blockDim.x <= width.
// No shared memory is used.
//
// Parameters
//   genocase, genoctrl          64-bit genotype words addressed as [(k*3 + g)*p + snp]
//                               (k = word index, g = genotype); only g = 0,1 are read.
//   p                           number of SNP columns in every table.
//   nLongIntcase, nLongIntctrl  number of words per (genotype, SNP) in genocase / genoctrl.
//   interactionPairOffsetJ1/J2  output tiles of width*height ints; slot
//                               (j2 - yOffset)*width + thread index receives (j1, j2) when
//                               the measure exceeds thresholdRecord. Other slots are not
//                               written, so the caller must zero the tiles beforehand.
//   pMarginalDistrSNP           counts addressed as [g*p + snp].
//   pMarginalDistrSNP_Y         counts addressed as [(g*MarginalDistrSNP_Y_DimensionX + y)*p + snp],
//                               y = 0 for cases and y = 1 for controls.
//   n, ncase, nctrl             totals used by the KSA approximation.
//   xOffset, yOffset            origin of the tile in (j1, j2) space.
//   width, height               tile extent in j1 and j2.
//
// Only pairs with j2 < j1 are scored. The previous bounds (j1 >= p-1 rejected,
// j2 == j1 accepted) scored every SNP against itself and never scored any pair
// that contains SNP p-1.
__global__ void columnResultKernel(uint64* genocase, uint64* genoctrl, int p, int nLongIntcase, int nLongIntctrl, int *interactionPairOffsetJ1, int *interactionPairOffsetJ2, int* pMarginalDistrSNP, int* pMarginalDistrSNP_Y, int n, float thresholdRecord, int ncase, int nctrl, int xOffset, int yOffset, int width, int height)
{
	const int outIndex = blockIdx.x * blockDim.x + threadIdx.x;
	const int j1 = xOffset + outIndex;
	int j2;
	int i, j, k, y;
	int count;
	int localGenoDistr[18];
	float tao = 0;
	float InteractionMeasure = 0;
	float ptmp1, ptmp2;
	__int64 andResult = 0;

	// Threads past the last SNP have nothing to do.
	if ( j1 >= p ) {
		return;
	}

	// Genotype table offsets are formed in size_t so that k*3*p cannot overflow
	// a 32-bit int for large tables.
	const size_t wordStride = (size_t)3 * p;

	for ( j2 = yOffset; j2 < yOffset+height; j2++ ) {

		// terminate when reach the end
		if ( j2 >= p ) {
			return;
		}

		// Only the strict lower triangle is evaluated; j2 only grows from here.
		if ( j2 >= j1 ) {
			return;
		}

		// Joint counts for genotypes {0,1} x {0,1}; y = 0 -> cases, y = 1 -> controls.
		for (y = 0; y < 2; y++)
		{
			const uint64* geno = (y == 0) ? genocase : genoctrl;
			const int nLongInt = (y == 0) ? nLongIntcase : nLongIntctrl;

			for (i = 0; i < 2; i++)
			{
				for (j = 0; j < 2; j++)
				{
					count = 0;
					for (k = 0; k < nLongInt; k++)
					{
						const size_t wordBase = k * wordStride;
						andResult = geno[wordBase + (size_t)i*p + j1] & geno[wordBase + (size_t)j*p + j2];
						count += dev_count_bit(andResult);
					}
					localGenoDistr[9*y + i*3 + j] = count;
				}
			}
		}

		// Remaining five cells of each 3x3 table follow from the per-SNP marginals.
		for (y = 0; y < 2; y++)
		{
			int* cell = localGenoDistr + 9*y;

			cell[2] = pMarginalDistrSNP_Y[(0*MarginalDistrSNP_Y_DimensionX+y)*p+j1] - cell[0] - cell[1];
			cell[5] = pMarginalDistrSNP_Y[(1*MarginalDistrSNP_Y_DimensionX+y)*p+j1] - cell[3] - cell[4];

			cell[6] = pMarginalDistrSNP_Y[(0*MarginalDistrSNP_Y_DimensionX+y)*p+j2] - cell[0] - cell[3];
			cell[7] = pMarginalDistrSNP_Y[(1*MarginalDistrSNP_Y_DimensionX+y)*p+j2] - cell[1] - cell[4];

			cell[8] = pMarginalDistrSNP_Y[(2*MarginalDistrSNP_Y_DimensionX+y)*p+j2] - cell[2] - cell[5];
		}

		//	GenoJointDistr: the index is as follows:
		//					AABB
		//			Case	0	1	2	3	4	5	6	7	8
		//			Ctrl	9	10	11	12	13	14	15	16	17
		//Papprx = Pab*Pbc*Pca
		// tao = sum(Papprx)
		// sum(p.* log(p) - p.*log(Pappr) + p.* log(tao))
		tao = 0;
		InteractionMeasure = 0;

		for (i = 0; i < 3; i++)			// index for A
		{
			for (j = 0; j < 3; j++)		// index for B
			{
				const int pooled = localGenoDistr[i*3+j] + localGenoDistr[9+i*3+j];

				for (y = 0; y < 2; y++)	// index for C: 0 = case, 1 = ctrl
				{
					ptmp1 = (float)localGenoDistr[9*y+i*3+j];

					if (ptmp1 > 0)
					{
						InteractionMeasure += ptmp1*log(ptmp1);
					}

					//ptmp2 KSA  n_{ij}*n_{jk}*n_{ik}/ (n_i * n_j * n_k)
					ptmp2 = (float)n*
						pooled*
						pMarginalDistrSNP_Y[(i*MarginalDistrSNP_Y_DimensionX+y)*p+j1]*
						pMarginalDistrSNP_Y[(j*MarginalDistrSNP_Y_DimensionX+y)*p+j2]/
						pMarginalDistrSNP[i*p+j1]/
						pMarginalDistrSNP[j*p+j2]/
						((y == 0) ? ncase : nctrl);

					// A zero marginal makes ptmp2 NaN (0/0); the comparison is
					// then false and the term is skipped.
					if (ptmp2 > 0)
					{
						InteractionMeasure += -ptmp1*log(ptmp2);
						tao += ptmp2;
					}
				}
			}
		}

		InteractionMeasure = (InteractionMeasure+n*log(tao/n))*2;

		// Only hits are written; non-hits keep the zero the host put there.
		if ( InteractionMeasure > thresholdRecord )
		{
			interactionPairOffsetJ1[(j2-yOffset)*width+outIndex] = j1;
			interactionPairOffsetJ2[(j2-yOffset)*width+outIndex] = j2;
		}
	}
}

// Screens all SNP pairs on the GPU without a pre-built index list: the (j1, j2)
// plane is cut into tiles of width x height and columnResultKernel is launched
// once per tile. Pairs scoring above thresholdRecord are appended to
// offsetListJ1 / offsetListJ2.
//
// Parameters
//   genocase, genoctrl          host genotype tables of nLongIntcase*3*p and
//                               nLongIntctrl*3*p 64-bit words.
//   p                           number of SNP columns.
//   pMarginalDistrSNP           host table of MarginalDistrSNP_Y_DimensionY*p ints.
//   pMarginalDistrSNP_Y         host table of
//                               MarginalDistrSNP_Y_DimensionY*MarginalDistrSNP_Y_DimensionX*p ints.
//   n, ncase, nctrl             totals forwarded to the kernel.
//   thresholdRecord             score threshold for reporting a pair.
//   offsetListJ1, offsetListJ2  output lists; element i of both lists forms one pair.
//   interactionInputOffsetJ1/J2, interactionPairOffsetJ1/J2, interactionMeasureScore,
//   wordBits, wordBitCount      not used; the function works on its own buffers.
//
// Any CUDA error or host allocation failure terminates the process.
extern "C" void cuda_GetInteractionPairsDirect(unsigned long long* genocase, unsigned long long* genoctrl, int p, int nLongIntcase, int nLongIntctrl, int* pMarginalDistrSNP, int* pMarginalDistrSNP_Y, int n, float thresholdRecord, int ncase, int nctrl, int* interactionInputOffsetJ1, int* interactionInputOffsetJ2, int* interactionPairOffsetJ1, int* interactionPairOffsetJ2, float* interactionMeasureScore, const unsigned char* wordBits, int wordBitCount, std::list<int> &offsetListJ1, std::list<int> &offsetListJ2)
{	
	float timeInMs;
	cudaEvent_t evStart, evStop;
	cudaEventCreate(&evStart);
	cudaEventCreate(&evStop);

	uint64* gpu_genocase = 0;
	uint64* gpu_genoctrl = 0;
	int* gpu_pMarginalDistrSNP = 0;
	int* gpu_pMarginalDistrSNP_Y = 0;
	int* gpu_InteractionPairOffsetJ1 = 0;
	int* gpu_InteractionPairOffsetJ2 = 0;

	// Tile geometry: one thread per j1 column, `height` j2 rows per launch.
	const int threadNum = 128;
	const int blockNum = 100;
	const int width = threadNum*blockNum;
	const int height = 1000;
	const size_t tileBytes = sizeof(int)*width*height;
	const int totalNumberOfGridBlockX = iDivUp(p, width);
	const int totalNumberOfGridBlockY = iDivUp(p, height);
	int xOffset = 0, yOffset = 0;

	dim3 threads(threadNum, 1, 1);
	dim3 grid(blockNum, 1, 1);	

	const size_t genoCaseBytes = sizeof(uint64)*nLongIntcase*3*p;
	const size_t genoCtrlBytes = sizeof(uint64)*nLongIntctrl*3*p;
	const size_t marginalBytes = sizeof(int)*MarginalDistrSNP_Y_DimensionY*p;
	const size_t marginalYBytes = sizeof(int)*MarginalDistrSNP_Y_DimensionY*MarginalDistrSNP_Y_DimensionX*p;

	//this flag must be set in order to allocate pinned
	//host memory that is accessible to the device
	// NOTE(review): this function allocates no mapped host memory itself; the call
	// is kept for its effect on later allocations. Confirm whether it can fail
	// when the device is already in use by this process.
	cudaSetDeviceFlags(cudaDeviceMapHost);

	// Host staging buffers for one result tile. Local pointers are used so the
	// caller's interactionPairOffsetJ1/J2 arguments are left alone.
	int* hostPairJ1 = (int*) malloc(tileBytes);
	int* hostPairJ2 = (int*) malloc(tileBytes);
	if ( hostPairJ1 == 0 || hostPairJ2 == 0 ) {
		fprintf(stderr, "Host error: cannot allocate result buffers.\n");
		exit(EXIT_FAILURE);
	}

	// allocate gpu memory
	cudaMalloc((void**)&gpu_genocase, genoCaseBytes);
	cudaMalloc((void**)&gpu_genoctrl, genoCtrlBytes);
	cudaMalloc((void**)&gpu_pMarginalDistrSNP, marginalBytes);
	cudaMalloc((void**)&gpu_pMarginalDistrSNP_Y, marginalYBytes);
	cudaMalloc((void**)&gpu_InteractionPairOffsetJ1, tileBytes);
	cudaMalloc((void**)&gpu_InteractionPairOffsetJ2, tileBytes);

	// copy the genotype and marginal tables to the device
	cudaMemcpy(gpu_genocase, genocase, genoCaseBytes, cudaMemcpyHostToDevice);
	cudaMemcpy(gpu_genoctrl, genoctrl, genoCtrlBytes, cudaMemcpyHostToDevice);
	cudaMemcpy(gpu_pMarginalDistrSNP, pMarginalDistrSNP, marginalBytes, cudaMemcpyHostToDevice);
	cudaMemcpy(gpu_pMarginalDistrSNP_Y, pMarginalDistrSNP_Y, marginalYBytes, cudaMemcpyHostToDevice);

	// Stop here if any allocation or copy failed, before a kernel is launched
	// on bad pointers.
	checkCUDAError("Device setup Error");

	// flush the STD_OUT for external process to read
	FLUSH_STDOUT();

	printf("totalNumberOfGridBlockX : %d, totalNumberOfGridBlockY : %d\n", totalNumberOfGridBlockX, totalNumberOfGridBlockY);
	int xGridIndex, yGridIndex;

	cudaEventRecord(evStart, 0);
	for ( xGridIndex = 1, xOffset = 0; xGridIndex <= totalNumberOfGridBlockX; xGridIndex++, xOffset = xOffset+width ) {
		for ( yGridIndex = 1, yOffset = 0; yGridIndex <= totalNumberOfGridBlockY; yGridIndex++, yOffset = yOffset+height ) {

			// columnResultKernel never evaluates a pair with j2 > j1. Once the
			// first row of the tile lies beyond its last column, this tile and
			// every later tile in the column would come back all zero, so the
			// memsets, launch, copies and scan are skipped.
			if ( yOffset >= xOffset + width ) {
				break;
			}

			// Reset memory for next iteration
			cudaMemset(gpu_InteractionPairOffsetJ1, 0, tileBytes);
			cudaMemset(gpu_InteractionPairOffsetJ2, 0, tileBytes);

			// Execute the kernel
			columnResultKernel<<<grid, threads>>>(gpu_genocase, gpu_genoctrl, p, nLongIntcase, nLongIntctrl, gpu_InteractionPairOffsetJ1, gpu_InteractionPairOffsetJ2, gpu_pMarginalDistrSNP, gpu_pMarginalDistrSNP_Y, n, thresholdRecord, ncase, nctrl, xOffset, yOffset, width, height);

			// Copy back the results (the blocking copies also wait for the kernel)
			cudaMemcpy(hostPairJ1, gpu_InteractionPairOffsetJ1, tileBytes, cudaMemcpyDeviceToHost);
			cudaMemcpy(hostPairJ2, gpu_InteractionPairOffsetJ2, tileBytes, cudaMemcpyDeviceToHost);

			checkCUDAError("Kernel Error"); 

			// Add corresponding results to the list; (0, 0) marks an empty slot
			for ( int j = 0; j < width*height; j++ ) {
				if ( hostPairJ1[j] != 0 || hostPairJ2[j] != 0 ) {
					offsetListJ1.push_back(hostPairJ1[j]);
					offsetListJ2.push_back(hostPairJ2[j]);
				}
			}
		}
	}

	cudaEventRecord(evStop, 0);
	cudaEventSynchronize(evStop);

	cudaEventElapsedTime( &timeInMs, evStart, evStop );
	printf("GPU Time = %fms\n", timeInMs);

	cudaEventDestroy(evStart);
	cudaEventDestroy(evStop);

	// Free used cpu memory
	free(hostPairJ1);
	free(hostPairJ2);

	// Free used gpu memory
	cudaFree(gpu_genocase);
	cudaFree(gpu_genoctrl);
	cudaFree(gpu_pMarginalDistrSNP);
	cudaFree(gpu_pMarginalDistrSNP_Y);
	cudaFree(gpu_InteractionPairOffsetJ1);
	cudaFree(gpu_InteractionPairOffsetJ2);
}
