#include "utility.h"

// CUDA runtime library
#include <cuda_runtime.h>

// General utility functions for BOOST

// Population count: number of 1 bits in a 64-bit word (alternative implementation).
// Branch-free reduction: adjacent 1-bit fields are summed into 2-bit fields,
// then 4-bit fields, then bytes; the byte sums are finally folded together
// so the total ends up in the low 7 bits.
int bitCount(uint64 i)
{
	uint64 v = i;

	// each 2-bit field now holds the count of its two original bits
	v -= (v >> 1) & 0x5555555555555555;
	// each 4-bit field now holds the count of its four original bits
	v = ((v >> 2) & 0x3333333333333333) + (v & 0x3333333333333333);
	// each byte now holds the count of its eight original bits
	v = (v + (v >> 4)) & 0x0f0f0f0f0f0f0f0f;
	// fold the eight byte counts into the lowest byte
	v += v >> 8;
	v += v >> 16;
	v += v >> 32;

	// the result is at most 64, so 7 bits are enough
	return (int)v & 0x7f;
}

// Absolute value of a double.
// Values that are not strictly negative are returned unchanged.
double Abs(double a)
{
	if (a < 0)
	{
		return -a;
	}
	return a;
}

// Convert the first strLen characters of inputString to upper case, in place.
//   inputString : buffer to modify; a NULL pointer is ignored
//   strLen      : number of characters to convert; values <= 0 convert nothing
// Each character is handed to toupper() as an unsigned char: passing a
// negative char value (any byte >= 0x80 where char is signed) is undefined
// behaviour.
void toUpperCaseString(char* inputString, int strLen) {
	if ( !inputString ) {
		return;
	}
	for ( int i = 0; i < strLen; i++ ) {
		inputString[i] = (char)toupper((unsigned char)inputString[i]);
	}
}

// Get the dimensions of every data file named in a BOOST input list file.
//
//   filename : path of the list file; it holds whitespace-separated paths of
//              data files (each path at most 99 characters)
//   DataSize : out; receives a calloc'ed array of 2*ndataset ints which the
//              caller must free():
//                (*DataSize)[k]            = number of samples, i.e. the number
//                                            of '\n' characters in the FIRST
//                                            data file (same value for every k)
//                (*DataSize)[ndataset + k] = number of whitespace-separated
//                                            fields on the first line of data
//                                            file k, minus one
//
// Returns the number of data files listed (always >= 1).
// Any failure (unreadable file, empty list, out of memory) prints a message
// and terminates the program with exit(1).
int GetDataSize(char *filename, int **DataSize)
{
	FILE *fp, *fp_i;
	int c, ndataset;
	time_t st, ed;
	int n, p, flag, ii;
	char filename_i[100];

	fp = fopen(filename,"r");
	if(fp == NULL)
	{
		printf("can't open input file %s\n",filename);
		exit(1);
	}

	time(&st);

	// first pass: count the listed files; the field width keeps an over-long
	// entry from overflowing filename_i
	ndataset = 0;
	while(fscanf(fp, "%99s", filename_i) == 1) {
		ndataset++;
	}
	if(ndataset == 0)
	{
		printf("no data file listed in input file %s\n",filename);
		fclose(fp);
		exit(1);
	}

	*DataSize = (int *)calloc( (size_t)ndataset*2, sizeof(int));
	if(*DataSize == NULL)
	{
		printf("out of memory while sizing %d data files\n", ndataset);
		fclose(fp);
		exit(1);
	}

	// second pass: open each data file and measure it
	n = 0; // number of samples, taken from the first file only
	ii = 0;
	rewind(fp);
	while(ii < ndataset && fscanf(fp, "%99s", filename_i) == 1) {
		ii++;

		fp_i = fopen(filename_i, "r");
		if(fp_i == NULL)
		{
			printf("can't open input file %s\n",filename_i);
			exit(1);
		}
		printf("start getting data size of file %d: %s\n", ii, filename_i);

		if (ii == 1)
		{
			// find the number of samples: one per end-of-line character
			while((c = fgetc(fp_i)) != EOF)
			{
				if(c == '\n')
				{
					n++;
				}
			}
			rewind(fp_i); // back to the start for the column count
		}

		// find the number of variables: count the fields on the first line.
		// Stopping at EOF as well as '\n' keeps a file whose first line is
		// unterminated from looping forever.
		p = 0;
		flag = 1; // 1 while the previous character was whitespace (or line start)
		while((c = getc(fp_i)) != EOF && c != '\n')
		{
			if(isspace(c))
			{
				flag = 1;
			}
			else if(flag)
			{
				p++; // first character of a new field
				flag = 0;
			}
		}
		fclose(fp_i);

		(*DataSize)[ndataset * 0 + ii - 1] = n;
		(*DataSize)[ndataset * 1 + ii - 1] = p-1;
	}

	fclose(fp);
	time(&ed);

	// elapsed wall-clock time for the whole scan
	printf("cputime for getting data size: %d seconds.\n", (int)(ed - st));
	return ndataset;
}

// Calculate the marginal entropy of every SNP.
//
//   genocase, genoctrl : bit-packed genotype words for cases / controls,
//                        read at index [word*3*nsnp + genotype*nsnp + snp]
//   nsnp               : number of SNPs
//   n                  : divisor used to turn counts into probabilities
//   nlongintcase/ctrl  : number of 64-bit words read per (genotype, snp)
//   MarginalEntropySNP   : out; for each SNP, -sum over genotypes of
//                          p*log(p), with p = (case count + ctrl count)/n
//   MarginalEntropySNP_Y : out; for each SNP, -sum over (genotype, class)
//                          cells of p*log(p), with p = cell count/n
//
// Results are ADDED to the existing contents of both output arrays, and
// empty cells contribute nothing.
void CalculateMarginalEntropy(uint64* genocase, uint64* genoctrl, int nsnp, int n, int nlongintcase, int nlongintctrl, double *MarginalEntropySNP, double *MarginalEntropySNP_Y)
{
	int cellCount[3][2]; // [genotype][0 = case, 1 = ctrl]

	for (int snp = 0; snp < nsnp; ++snp)
	{
		// tally the set bits of each genotype, separately for cases and controls
		for (int g = 0; g < 3; ++g)
		{
			const int column = g*nsnp + snp;

			int caseBits = 0;
			for (int w = 0; w < nlongintcase; ++w)
			{
				caseBits += bitCount(genocase[w*3*nsnp + column]);
			}
			cellCount[g][0] = caseBits;

			int ctrlBits = 0;
			for (int w = 0; w < nlongintctrl; ++w)
			{
				ctrlBits += bitCount(genoctrl[w*3*nsnp + column]);
			}
			cellCount[g][1] = ctrlBits;
		}

		// turn the counts into entropy contributions
		for (int g = 0; g < 3; ++g)
		{
			const int inCase = cellCount[g][0];
			const int inCtrl = cellCount[g][1];
			double prob;

			const double both = (double) inCase + inCtrl;
			if (both > 0)
			{
				prob = both/n;
				MarginalEntropySNP[snp] += -(prob)*log(prob);
			}

			if (inCase > 0)
			{
				prob = (double) inCase/n;
				MarginalEntropySNP_Y[snp] += -prob*log(prob);
			}

			if (inCtrl > 0)
			{
				prob = (double) inCtrl/n;
				MarginalEntropySNP_Y[snp] += -prob*log(prob);
			}
		}
	}
}

// Calculate the marginal genotype counts of every SNP.
//
//   genocase, genoctrl  : bit-packed genotype words for cases / controls,
//                         read at index [word*3*nsnp + genotype*nsnp + snp]
//   nsnp                : number of SNPs
//   n                   : not used by this function
//   nlongintcase/ctrl   : number of 64-bit words read per (genotype, snp)
//   pMarginalDistrSNP_Y : out; case count written at
//                         [(genotype*MarginalDistrSNP_Y_DimensionX+0)*nsnp+snp],
//                         control count at the same index with +1
//   pMarginalDistrSNP   : out; case + control count at [genotype*nsnp+snp]
void CalculateMarginalDistr(uint64* genocase, uint64* genoctrl, int nsnp, int n, int nlongintcase, int nlongintctrl, int* pMarginalDistrSNP, int* pMarginalDistrSNP_Y)
{
	for (int snp = 0; snp < nsnp; ++snp)
	{
		for (int g = 0; g < 3; ++g)
		{
			const int column = g*nsnp + snp;
			const int caseSlot = (g*MarginalDistrSNP_Y_DimensionX+0)*nsnp+snp;
			const int ctrlSlot = (g*MarginalDistrSNP_Y_DimensionX+1)*nsnp+snp;
			int bits;

			// cases carrying genotype g at this SNP
			bits = 0;
			for (int w = 0; w < nlongintcase; ++w)
			{
				bits += bitCount(genocase[w*3*nsnp + column]);
			}
			pMarginalDistrSNP_Y[caseSlot] = bits;

			// controls carrying genotype g at this SNP
			bits = 0;
			for (int w = 0; w < nlongintctrl; ++w)
			{
				bits += bitCount(genoctrl[w*3*nsnp + column]);
			}
			pMarginalDistrSNP_Y[ctrlSlot] = bits;

			// class-independent total
			pMarginalDistrSNP[column] = pMarginalDistrSNP_Y[caseSlot] + pMarginalDistrSNP_Y[ctrlSlot];
		}
	}
}

// Calculate the joint genotype distribution of the SNP pair (j1, j2).
//
//   genocase, genoctrl  : bit-packed genotype words for cases / controls,
//                         read at index [word*3*nsnp + genotype*nsnp + snp]
//   nsnp                : number of SNPs
//   nLongIntcase/ctrl   : number of 64-bit words read per (genotype, snp)
//   GenoDistr           : out, 18 ints; cell [g1*3 + g2] is the case count with
//                         genotype g1 at j1 and g2 at j2, cell [9 + g1*3 + g2]
//                         the matching control count
//   j1, j2              : indices of the two SNPs
//   pMarginalDistrSNP_Y : per-SNP, per-class genotype counts, read at
//                         [(genotype*MarginalDistrSNP_Y_DimensionX+class)*nsnp+snp]
//
// Only the four cells with g1, g2 in {0, 1} are counted from the bit words;
// the remaining five cells of each 3x3 table are obtained by subtracting the
// counted cells from the marginal counts.
void CalculateGenoJointDistr(uint64* genocase, uint64* genoctrl, int nsnp, int nLongIntcase, int nLongIntctrl, int *GenoDistr, int j1, int j2, int* pMarginalDistrSNP_Y)
{
	int g1, g2, w, cls;
	int tally;

	// direct counts for the upper-left 2x2 corner of both tables
	for (g1 = 0; g1 < 2; g1++)
	{
		for (g2 = 0; g2 < 2; g2++)
		{
			const int cell = g1*3 + g2;

			tally = 0;
			for (w = 0; w < nLongIntcase; w++)
			{
				tally += popcount(genocase[w*3*nsnp+g1*nsnp+j1] & genocase[w*3*nsnp+g2*nsnp+j2]);
			}
			GenoDistr[cell] = tally;

			tally = 0;
			for (w = 0; w < nLongIntctrl; w++)
			{
				tally += popcount(genoctrl[w*3*nsnp+g1*nsnp+j1] & genoctrl[w*3*nsnp+g2*nsnp+j2]);
			}
			GenoDistr[9 + cell] = tally;
		}
	}

	// remaining cells from the marginals: cls 0 = case table, cls 1 = ctrl table
	for (cls = 0; cls < 2; cls++)
	{
		int* table = GenoDistr + 9*cls;

		// last column of rows 0 and 1 (marginals of j1)
		table[2] = pMarginalDistrSNP_Y[(0*MarginalDistrSNP_Y_DimensionX+cls)*nsnp+j1] - table[0] - table[1];
		table[5] = pMarginalDistrSNP_Y[(1*MarginalDistrSNP_Y_DimensionX+cls)*nsnp+j1] - table[3] - table[4];

		// last row of columns 0 and 1 (marginals of j2)
		table[6] = pMarginalDistrSNP_Y[(0*MarginalDistrSNP_Y_DimensionX+cls)*nsnp+j2] - table[0] - table[3];
		table[7] = pMarginalDistrSNP_Y[(1*MarginalDistrSNP_Y_DimensionX+cls)*nsnp+j2] - table[1] - table[4];

		// bottom-right corner (marginal of j2, genotype 2)
		table[8] = pMarginalDistrSNP_Y[(2*MarginalDistrSNP_Y_DimensionX+cls)*nsnp+j2] - table[2] - table[5];
	}
}

// Fast log implementation: build the log lookup table.
//
//   n            : number of leading mantissa bits used as the table index;
//                  must be in 0..23 (a float has 23 mantissa bits)
//   lookup_table : out; receives 2^n floats, entry i = log2(1 + i / 2^n)
//
// The table is produced by stepping the bit pattern of 1.0f (0x3F800000)
// through the mantissa in increments of 2^(23-n). The pattern is copied into
// the float with memcpy: reading an int through a float (or the reverse) via
// a cast pointer breaks the strict-aliasing rule.
// A NULL table or an n outside 0..23 leaves the table untouched.
// Assumes int and float are both 32 bits wide, as the original code did.
void fill_icsi_log_table(const int n, float *lookup_table) {
	if ( !lookup_table || n < 0 || n > 23 ) {
		return;
	}

	float numlog;
	int x = 0x3F800000;            // bit pattern of 1.0f
	const int incr = 1 << (23-n);  // mantissa step between table entries
	const int p = 1 << n;          // number of table entries

	for ( int i = 0; i < p; ++i ) {
		memcpy(&numlog, &x, sizeof(numlog));
		lookup_table[i] = log(numlog)/log(2.0);
		x += incr;
	}
}

// Calculate the chi-square value of a model from its 2x2 count table.
//
//   input : four counts {a, b, c, d}
//
// Returns (a*d - b*c)^2 * (a+b+c+d) / ((a+b)*(c+d)*(b+d)*(a+c)), or 0 when
// the denominator is (numerically) zero.
//
// The counts are widened to double before any arithmetic: the product a*d
// evaluated in int overflows once the counts reach a few tens of thousands.
// fabs() is used for the zero test so the result does not depend on which
// overload of abs() happens to be visible.
float CalculateChiSquareOfModel(int* input) {
	const double a = input[0];
	const double b = input[1];
	const double c = input[2];
	const double d = input[3];

	const double cross = a*d - b*c;
	const double numerator = cross*cross*(a+b+c+d);
	const double denominator = (a+b)*(c+d)*(b+d)*(a+c);

	if ( fabs(denominator) < 0.000001 ) {
		return 0.0f;
	}
	return (float)(numerator/denominator);
}

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///// CUDA Related Functions //////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// Compile-time check of the CUDA runtime version this file was built against.
// Returns 0 when CUDART_VERSION is at least 2030, and -1 otherwise.
int meetCUDARequirement() {
	#if CUDART_VERSION >= 2030
		return 0;
	#else
		return -1;
	#endif
}

int initCUDA()
{
	int count, i;
	struct cudaDeviceProp prop;

	cudaGetDeviceCount(&count);
	if(count == 0) {
		fprintf(stderr, "There is no device.\n");
		return -1;
	}

	for(i = 0; i < count; i++) {
		if(cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
			if(prop.major >= 1) {
				break;
			}
		}
	}

	if(i == count) {
		fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
		return -1;
	}
	cudaSetDevice(i);

	return 0;
}

// Query the CUDA runtime once and cache the properties of every device.
//
// After construction devCount holds the number of cached entries and
// devPropArray either points to devCount entries or is NULL. Both members
// are set on every path, so the destructor never frees an uninitialised
// pointer (which the no-device path previously allowed).
// If the device count cannot be queried or memory cannot be allocated, the
// object is left empty (devCount == 0).
DeviceProperties::DeviceProperties() {
	devCount = 0;
	devPropArray = NULL;

	// Find the number of devices
	if ( cudaGetDeviceCount(&devCount) != cudaSuccess || devCount <= 0 ) {
		devCount = 0;
		return;
	}

	devPropArray = (cudaDeviceProp*)malloc(sizeof(struct cudaDeviceProp)*devCount);
	if ( devPropArray == NULL ) {
		devCount = 0;
		return;
	}

	for (int i = 0; i < devCount; ++i)
	{
		// keep a zeroed entry rather than uninitialised memory when a
		// device cannot be queried
		if ( cudaGetDeviceProperties(&devPropArray[i], i) != cudaSuccess ) {
			memset(&devPropArray[i], 0, sizeof(struct cudaDeviceProp));
		}
	}
}

// Release the property array allocated with malloc() by the constructor.
DeviceProperties::~DeviceProperties() {
	cudaDeviceProp* cached = devPropArray;
	free(cached);
}

int DeviceProperties::getDeviceCount() {
	return devCount;
}

// Return a copy of the cached properties of device i.
//   i : device index, expected in 0 .. getDeviceCount()-1
// An index outside that range returns an all-zero cudaDeviceProp instead of
// reading past the end of the cached array.
cudaDeviceProp DeviceProperties::getDeviceProp(int i) {
	if ( i < 0 || i >= devCount ) {
		cudaDeviceProp none;
		memset(&none, 0, sizeof(none));
		return none;
	}
	return devPropArray[i];
}

void DeviceProperties::printDevProp(int i)
{
	cudaDeviceProp devProp = devPropArray[i];
	printf("Major revision number:         %d\n",  devProp.major);
	printf("Minor revision number:         %d\n",  devProp.minor);
	printf("Name:                          %s\n",  devProp.name);
	printf("Total global memory:           %u\n",  devProp.totalGlobalMem);
	printf("Total shared memory per block: %u\n",  devProp.sharedMemPerBlock);
	printf("Total registers per block:     %d\n",  devProp.regsPerBlock);
	printf("Warp size:                     %d\n",  devProp.warpSize);
	printf("Maximum memory pitch:          %u\n",  devProp.memPitch);
	printf("Maximum threads per block:     %d\n",  devProp.maxThreadsPerBlock);
	for (int i = 0; i < 3; ++i)
		printf("Maximum dimension %d of block:  %d\n", i, devProp.maxThreadsDim[i]);
	for (int i = 0; i < 3; ++i)
		printf("Maximum dimension %d of grid:   %d\n", i, devProp.maxGridSize[i]);
	printf("Clock rate:                    %d\n",  devProp.clockRate);
	printf("Total constant memory:         %u\n",  devProp.totalConstMem);
	printf("Texture alignment:             %u\n",  devProp.textureAlignment);
	printf("Concurrent copy and execution: %s\n",  (devProp.deviceOverlap ? "Yes" : "No"));
	printf("Number of multiprocessors:     %d\n",  devProp.multiProcessorCount);
	printf("Kernel execution timeout:      %s\n",  (devProp.kernelExecTimeoutEnabled ? "Yes" : "No"));
	return;
}

// C++ calling function for GBOOST.
//
// Runs the whole pipeline on the data files listed in inputFilename:
//   1. initialise CUDA and obtain the data dimensions with GetDataSize();
//   2. load every data file into bit-packed genotype arrays (one array for
//      cases, one for controls);
//   3. compute the per-SNP marginal entropies, write the marginal association
//      file and build the marginal count tables;
//   4. optionally pre-filter the SNPs, then let cuda_GetInteractionPairs()
//      fill the lists of candidate SNP pairs;
//   5. test every candidate pair on the host, either by fitting a loglinear
//      model (testMode == EXACT_GUESS) or by chi-square tests over two-group
//      collapses of the 3x3 genotype table (any other testMode);
//   6. write the result files and free the host buffers.
//
// Parameters:
//   inputFilename         : list file naming the data files
//   outputFilePrefix      : prepended to the output file names
//                           "MarginalAssociation.txt", "DistrCollection.txt"
//                           and "InteractionRecords.txt" (EXACT_GUESS) or
//                           "ModelSelectionRecords.txt" (otherwise)
//   preFilteringThreshold : when abs(preFilteringThreshold - DEL) differs from
//                           DEFAULT_PREFILTERING_VALUE, only SNPs whose
//                           marginal association is BELOW this value are
//                           passed on; otherwise every SNP is passed on
//   screenMode,
//   screenThreshold       : forwarded unchanged to cuda_GetInteractionPairs()
//   testMode              : EXACT_GUESS selects the loglinear test
//   testThreshold         : EXACT_GUESS: a pair is reported when its
//                           interaction measure exceeds this value; in the
//                           other mode it is overwritten with
//                           CHI_SQUARE_THRESHOLD_DF1
//
// Returns 0 on success and -1 when CUDA cannot be initialised or the list
// file cannot be opened. Some other failures terminate through exit(1).
int BOOST_GPU(char* inputFilename, char* outputFilePrefix, float preFilteringThreshold, int screenMode, float screenThreshold, int testMode, float testThreshold)
{
	time_t st, ed;
	int* pMarginalDistrSNP;
	int* pMarginalDistrSNP_Y;

	// Prepare for out filename
	// Only the third file name depends on testMode.
	// NOTE(review): sprintf is unbounded; a long outputFilePrefix overflows the 255-byte buffers.
	char marginalAssociationFilename[255], distrCollectionFilename[255], interactionFilename[255];
	if ( testMode == EXACT_GUESS ) {
		sprintf(marginalAssociationFilename, "%sMarginalAssociation.txt", outputFilePrefix);
		sprintf(distrCollectionFilename, "%sDistrCollection.txt", outputFilePrefix);
		sprintf(interactionFilename, "%sInteractionRecords.txt", outputFilePrefix);
	}
	else {
		sprintf(marginalAssociationFilename, "%sMarginalAssociation.txt", outputFilePrefix);
		sprintf(distrCollectionFilename, "%sDistrCollection.txt", outputFilePrefix);
		sprintf(interactionFilename, "%sModelSelectionRecords.txt", outputFilePrefix);
	}

	/* Declare variable */
	// NOTE(review): several of the locals below are never read in this function
	// (e.g. offset, interactionPairCount, indexShift, count, x, pvalPLINK, tau1, tau2,
	// maxInteraction, minInteraction, the JointEntropy*/AssociationMeasure* doubles, c).
	int *DataSize;
	int ndataset;

	int n, p;  //n--number of samples; p number of variables
	int ncase, nctrl, nlongintcase, nlongintctrl;
	int icase, ictrl;
	int offset;
	uint64 *genocase, *genoctrl;
	int interactionPairCount = 0;	
	long long indexShift = 0;
	int count = 0;
	int x = 0;

	int *GenoJointDistr, *AlleleJointDistr;
	double or_aff, v_aff, or_unf, v_unf, pvalPLINK;//PLINK
	double *zval; //PLINK

	double *MarginalEntropySNP, *MarginalEntropySNP_Y, *MarginalAssociation;
	double tau1, tau2; // thresholds of test association allowing for interactions
	register double JointEntropyTwoSNP, JointEntropyTwoSNP_Y, MarginalEntropyY, ptmp1, ptmp2, ptmp3;
	register double InteractionMeasure, AssociationMeasure, AssociationMeasure1, AssociationMeasure2;
	double *InteractionMeasureSNPpair;
	//double *AssociationMeasureSNPpair;

	double maxInteraction, minInteraction;
	double *Pab, *Pbc, *Pca; // conditional probability P(a|b), P(b|c), P(c|a)
	double tao; // normalization term;

	int *DistrCollection;
	int *InteractionSNPpairs;
	int InteractionCount;

	//int *AssociationSNPpairs;
	//int AssociationCount;

	int flag, i, ii, j, k, j1,j2;
	int c, tmp;
	// used for post-correction (post-correction is exact solution of loglinear model)
	// mu / mu0 are the fixed starting values; mutmp / mu0tmp are the working copies
	// that are reset from them for every SNP pair.
	static double mu[3][3][2] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
	static double mu0[3][3][2] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
	static double mutmp[3][3][2];
	static double mu0tmp[3][3][2];

	static double mu_ij[3][3];
	static double mu_ik[3][2];
	static double mu_jk[3][2];
	double muError;

	// samples are packed 64 per word; mask1 is shifted to the sample's bit position
	int LengthLongType=64;
	uint64 mask1 = 0x0000000000000001;

	int buffersize = 50000;
	int buffersizeAssociation = 50000;

	// load data
	FILE *fp, *fp_i;
	char filename_i[100];

	// Create a context to run CUDA program on our CUDA-enabled NVIDIA GPU
	if( initCUDA() == -1) {
		printf("Unable to initialize CUDA\n");
		return -1;
	}
	printf("CUDA initialized.\n\n");

	fp = fopen(inputFilename,"r");
	if(fp == NULL)
	{
		fprintf(stderr,"can't open input file %s\n",inputFilename);
		return -1;
	}

	// precompute the wordbits (a global variable)
	// wordbits[v] = number of set bits in the 16-bit value v
	for (i = 0; i<65536; i++)
	{
		wordbits[i] = bitCount(i);
		//printf("%d\n",wordbits[i]);
	}
	//cuda_SetWordBits(wordbits, 65536);

	printf("start loading ...\n");
	
	// flush the STD_OUT for external process to read
	FLUSH_STDOUT();

	// DataSize[0 .. ndataset-1] holds the sample count, DataSize[ndataset + k] the
	// number of SNP columns of data file k
	ndataset = GetDataSize(inputFilename, &DataSize);

	n = DataSize[0];
	p = 0;
	printf("n = %d\n", n);
	// p = total number of SNPs over all data files
	for (i = 0; i<ndataset ; i++)
	{
		p += DataSize[ndataset*1 + i];
		printf("DataSize %d-th file: p[%d] = %d \n", i+1, i+1, DataSize[ndataset*1 + i]);
	}
	printf("p = %d\n",p);

	// get ncase and nctrl
	i = 0;
	j = 0;

	ncase = 0;
	nctrl = 0;

	rewind(fp);

	// only use the first file to get ncase and nctrl
	// NOTE(review): "%s" has no field width (filename_i holds 100 bytes), the result of
	// fopen is not checked, and this fp_i is never closed before it is reassigned below.
	fscanf(fp, "%s\n", &filename_i);
	printf("%s\n", filename_i);
	fp_i = fopen(filename_i, "r");

	// Walk the first file value by value: column j == 0 of each row is the class
	// label, the remaining DataSize[ndataset] columns are skipped.
	while(!feof(fp_i)) {

		//if (n*j + i == 400)
		//{
		//	printf("%d,%d\n",i,j);
		//}

		/* loop through and store the numbers into the array */
		if(j==0)
		{
			//j = 0 means read ind class label y
			fscanf(fp_i, "%d", &tmp);

			if (tmp)
			{
				// tmp=1 means case
				ncase++;

			}
			else
			{
				nctrl ++;

			}
			j++;
		}
		else
		{
			fscanf(fp_i, "%d", &tmp);
			j++; //column index
			if (j==(DataSize[ndataset]+1)) // DataSize[ndataset] is the nsnp in the first dataset
			{
				j=0;
				i++; // row index
			}

		}

		// stop after n rows
		if (i>=n)
		{
			break;
		}
	}

	printf("total sample: %d (ncase = %d; nctrl = %d).\n", n, (int)ncase, (int)nctrl);

	// number of 64-bit words needed to hold one bit per case / per control
	nlongintcase = ceil( ((double) ncase)/LengthLongType);
	nlongintctrl = ceil( ((double) nctrl)/LengthLongType);
	printf("nLongIntcase = %d; nLongIntctrl = %d.\n", nlongintcase, nlongintctrl);

	// flush the STD_OUT for external process to read
	FLUSH_STDOUT();

	//calloc memory for bit representation
	// three genotype planes per SNP per word
	// NOTE(review): the calloc results are not checked.
	genocase = (uint64*)calloc(3*p*nlongintcase, sizeof(uint64));
	genoctrl = (uint64*)calloc(3*p*nlongintctrl, sizeof(uint64));

	// using CUDA
	// NOTE(review): device 0 is queried here, while initCUDA() may have selected a different device.
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);
	if (!prop.canMapHostMemory) {
		printf("No map memory support");
		exit(1);
	}

	cudaSetDeviceFlags(cudaDeviceMapHost);
	//cudaHostAlloc((void **)&genocase, 3*p*nlongintcase*sizeof(uint64), cudaHostAllocWriteCombined);
	//cudaHostAlloc((void **)&genoctrl, 3*p*nlongintctrl*sizeof(uint64), cudaHostAllocWriteCombined);
	
	// redundant after calloc; matters only if the cudaHostAlloc variant above is re-enabled
	memset(genocase, 0, 3*p*nlongintcase*sizeof(uint64));
	memset(genoctrl, 0, 3*p*nlongintctrl*sizeof(uint64));

	//load data to bit representation
	rewind(fp);

	time(&st);
	j = 0; // column index
	ii = 0; // file index
	k = 0; // number of SNP columns contributed by the files already loaded
	while(!feof(fp)) { 
		ii++;
		fscanf(fp, "%s\n", &filename_i);

		fp_i = fopen(filename_i, "r");
		if(fp_i == NULL)
		{
			fprintf(stderr,"can't open input file %s\n", filename_i);
			exit(1);
		}

		i = 0; //row index
		// running index of the current sample within its class; -1 = none seen yet
		icase = -1;
		ictrl = -1;

		printf("Loading data in file %d: %s\n", ii, filename_i);
		
		// flush the STD_OUT for external process to read
		FLUSH_STDOUT();
		while(!feof(fp_i)) { 
			/* loop through and store the numbers into the array */

			if(j==0)
			{
				//j = 0 means read class label y
				fscanf(fp_i, "%d", &tmp);

				if (tmp)
				{
					// tmp=1 means case
					icase ++;
					flag = 1;
				}
				else
				{
					ictrl ++;
					flag = 0;
				}
				j++;
			}
			else
			{
				fscanf(fp_i, "%d", &tmp);

				// Set the sample's bit in the plane of genotype tmp for SNP (j+k-1):
				// word = sample/64, bit = sample%64,
				// index = (word*3 + genotype)*p + snp.
				// NOTE(review): tmp is used as an index without a range check; a value
				// outside 0..2 writes outside the intended plane.
				if (flag)
				{
					genocase[((icase/LengthLongType)*3+tmp)*p+(j+k-1)] |= (mask1 << (icase%LengthLongType));
				}
				else
				{
					genoctrl[((ictrl/LengthLongType)*3+tmp)*p+(j+k-1)] |= (mask1 << (ictrl%LengthLongType));
				}

				j++; //column index
				if (j==(DataSize[ndataset + ii-1]+1))
				{
					j=0;
					i++; // row index
				}

			}

			if (i>=n)
			{
				break;
			}
		}

		fclose(fp_i);
		// the next file's SNP columns start after this file's
		k += DataSize[ndataset + ii-1];
	}

	fclose(fp);
	//printf("Number of numbers read: %d\n\n", n*p);
	time(&ed);
	// NOTE(review): "(int)ed -st" casts only ed; the difference is a time_t printed with %d.
	printf("cputime for loading data: %d seconds\n", (int)ed -st);

	// flush the STD_OUT for external process to read
	FLUSH_STDOUT();
	free(DataSize);

	// calculate marginal distribution
	// entropy of the class label, with p(case) = ncase/n
	ptmp1 = (double) ncase/n;
	MarginalEntropyY =  -ptmp1 *log(ptmp1) - (1-ptmp1) *log(1-ptmp1);

	// zero-initialised because CalculateMarginalEntropy() accumulates into them
	MarginalEntropySNP = (double *)calloc(p, sizeof(double));
	MarginalEntropySNP_Y = (double *)calloc(p, sizeof(double));
	MarginalAssociation = (double *)calloc(p, sizeof(double));

	CalculateMarginalEntropy(genocase, genoctrl, p, n, nlongintcase, nlongintctrl, MarginalEntropySNP, MarginalEntropySNP_Y);
	//CalculateMarginalEntropy(pgeno, p, n, nlongintcase, nlongintctrl, MarginalEntropySNP, MarginalEntropySNP_Y);

	//	for (i = 0; i < 10; i++)
	//	{
	//		printf("MarginalEntropySNP[%2d]: %6f\t MarginalEntropySNP_Y[%2d]: %6f\n", i, MarginalEntropySNP[i], i, MarginalEntropySNP_Y[i]);
	//	}


	// One line per SNP: index and 2n * (H(SNP) + H(Y) - H(SNP,Y)).
	// NOTE(review): the fopen result is not checked.
	fp = fopen(marginalAssociationFilename,"w");
	for (i = 0; i<p; i++)
	{
		MarginalAssociation[i] = (-MarginalEntropySNP_Y[i] + MarginalEntropySNP[i] + MarginalEntropyY)*n*2;
		fprintf(fp, "%d %f\n", i, MarginalAssociation[i]);
	}
	fclose(fp);

	// per-SNP genotype counts, overall and split by class
	pMarginalDistrSNP = (int *)malloc(MarginalDistrSNP_Y_DimensionY*p*sizeof(int));
	pMarginalDistrSNP_Y = (int *)malloc(MarginalDistrSNP_Y_DimensionY*MarginalDistrSNP_Y_DimensionX*p*sizeof(int));
	CalculateMarginalDistr(genocase, genoctrl, p, n, nlongintcase, nlongintctrl, pMarginalDistrSNP, pMarginalDistrSNP_Y);

	time(&st);
	// calculate joint distribution
	// GenoJointDistr: two 3x3 genotype tables (case at 0..8, control at 9..17)
	GenoJointDistr = (int *)calloc(2*9, sizeof(int));
	AlleleJointDistr = (int *)calloc(2*4, sizeof(int) ); // for PLINK

	//printf("Required MB : %f\n", (p-1)*p/2*5*4/1024.0f/1024.0f);
	//system("pause");

	// NOTE(review): Pab, Pbc, Pca and InteractionSNPpairs are allocated and freed but not otherwise used here.
	Pab		= (double *)calloc(9, sizeof(double));
	Pbc		= (double *)calloc(6, sizeof(double));
	Pca		= (double *)calloc(6, sizeof(double));

	DistrCollection = (int *)calloc(1001,sizeof(int));
	InteractionSNPpairs = (int *) calloc(buffersize*2, sizeof(int));
	InteractionMeasureSNPpair = (double *) calloc(buffersize,sizeof(double));

	//AssociationSNPpairs = (int *) calloc(buffersizeAssociation*2, sizeof(int));// association
	//AssociationMeasureSNPpair = (double *) calloc(buffersizeAssociation*2,sizeof(double));// association measure

	//printf("Required MB for buffer : %f\n", (buffersize*4*4+buffersize*3*8)/1024.0f/1024.0f);
	//system("pause");

	InteractionCount = 0;
	//AssociationCount = 0;

	maxInteraction = - 9999999;
	minInteraction = 9999999;

	tau1 = 30.0;
	tau2 = 50.0;

	// Perform pre-filtering
	// indexVector collects the indices of the SNPs handed to the GPU screening stage.
	std::vector<int> indexVector;

	if ( abs(preFilteringThreshold - DEL) != DEFAULT_PREFILTERING_VALUE ) {
		// filtering requested: keep the SNPs whose marginal association is below the threshold
		for ( int i = 0; i < p; i++ ) {
			if ( MarginalAssociation[i] < preFilteringThreshold ) {
				indexVector.push_back(i);
			}
		}
	}
	else {
		// no filtering: keep every SNP
		for ( int i = 0; i < p; i++ ) {
			indexVector.push_back(i);
		}
	}

	// NOTE(review): size() returns a size_t but is printed with %d.
	printf("Number of SNP after pre-filtering : %d\n", indexVector.size());

	// Start of GPU section
	// offsetListJ1 / offsetListJ2 are passed to cuda_GetInteractionPairs() and afterwards
	// read in parallel: element k of each list gives the two SNP indices of candidate pair k.
	std::list<int> offsetListJ1;
	std::list<int> offsetListJ2;
	std::list<int>::iterator iterJ1;
	std::list<int>::iterator iterJ2;
	cuda_GetInteractionPairs(indexVector, genocase, genoctrl, 
		p, n, nlongintcase, nlongintctrl, ncase, nctrl, screenThreshold,
		pMarginalDistrSNP, pMarginalDistrSNP_Y, wordbits, 65536, offsetListJ1, offsetListJ2, screenMode);

	//printf("interaction Count : %d\n", offsetListJ1.size());
	iterJ1 = offsetListJ1.begin();
	iterJ2 = offsetListJ2.begin();
	InteractionCount = offsetListJ1.size();

	// one z value and one interaction measure per candidate pair
	zval = (double *)calloc(InteractionCount, sizeof(double));
	InteractionMeasureSNPpair = (double *) realloc(InteractionMeasureSNPpair, InteractionCount*sizeof(double));

	// results of the model-selection branch, kept as four parallel lists
	std::list<int> selectedJ1;
	std::list<int> selectedJ2;
	std::list<int> selectedModel;
	std::list<float> selectedModelChiSquareValue;
	
	printf("Number of SNP passed screening stage : %d\n", InteractionCount);

	if ( testMode == EXACT_GUESS ) {
		int passedCount = 0;
		//post-correction
		printf("\nStart post-correction: Exact Gtest...\n");

		// flush the STD_OUT for external process to read
		FLUSH_STDOUT();

		for (ii = 0; ii < InteractionCount ; ii++, iterJ1++, iterJ2++)
		{
			//j1 = InteractionSNPpairs[2*ii];
			//j2 = InteractionSNPpairs[2*ii + 1];
			j1 = *iterJ1;
			j2 = *iterJ2;
			CalculateGenoJointDistr(genocase, genoctrl, p, nlongintcase, nlongintctrl, GenoJointDistr, j1, j2, pMarginalDistrSNP_Y);
			//CalculateGenoJointDistr(pgeno, p-1, nlongintcase, nlongintctrl, GenoJointDistr, j1, j2, pMarginalDistr);

			//printf("j1 : %d, j2 : %d\n", j1, j2);

			// restart the fit from the fixed starting values (all ones vs. all zeros)
			memcpy(mutmp, mu, 18*sizeof(double));
			memcpy(mu0tmp, mu0, 18*sizeof(double));

			// total absolute difference between the current and the previous fit
			muError = 0.0;
			for (i = 0; i<3; i++)
			{
				for(j = 0;j <3; j++)
				{
					for (k = 0; k <2; k++)
					{
						muError += Abs(mutmp[i][j][k]-mu0tmp[i][j][k]);
					}
				}
			}

			// Fit the expected counts mutmp[i][j][k] (i, j = genotypes of the two SNPs,
			// k = class) by rescaling them in turn so that their ij, ik and jk margins
			// match the observed ones, until one full sweep changes the fit by at most 0.001.
			while (muError > 0.001)
			{
				memcpy(mu0tmp, mutmp, 18*sizeof(double)); //mu0tmp = mutmp;

				// mu_ij
				for (i = 0; i<3; i++)
				{
					for (j = 0; j<3; j++)
					{
						mu_ij[i][j] = mutmp[i][j][0] + mutmp[i][j][1];
					}

				}
				//mu_ijk = mu_ijk*n_ij/mu_ij
				for (i = 0; i<3; i++)
				{
					for(j = 0;j <3; j++)
					{
						for (k = 0; k <2; k++)
						{
							if (mu_ij[i][j]>0)
							{
								mutmp[i][j][k] = mutmp[i][j][k] * (GenoJointDistr[3*i + j]+GenoJointDistr[9 + 3*i + j])/mu_ij[i][j];
							}
							else
								mutmp[i][j][k] = 0;

						}
					}
				}

				// mu_ik
				for (i = 0; i<3; i++)
				{
					for (k = 0; k<2; k++)
					{
						mu_ik[i][k] = mutmp[i][0][k] + mutmp[i][1][k] + mutmp[i][2][k];
					}

				}
				//mu_ijk = mu_ijk*n_ik/mu_ik
				for (i = 0; i<3; i++)
				{
					for(j = 0;j <3; j++)
					{
						for (k = 0; k <2; k++)
						{
							// mu(i,j,k) = mu(i,j,k) * n_ik(i,k)/mu_ik(i,1,k);
							if (mu_ik[i][k] > 0)
							{
								mutmp[i][j][k] = mutmp[i][j][k] * (GenoJointDistr[9*k + 3*i ]+GenoJointDistr[9*k + 3*i + 1] +GenoJointDistr[9*k + 3*i + 2])/mu_ik[i][k];
							}
							else
								mutmp[i][j][k] = 0;
						}
					}
				}

				// mu_jk
				for (j = 0; j<3; j++)
				{
					for (k = 0; k<2; k++)
					{
						mu_jk[j][k] = mutmp[0][j][k] + mutmp[1][j][k] + mutmp[2][j][k];
					}

				}

				//mu_ijk = mu_ijk*n_jk/mu_jk
				for (i = 0; i<3; i++)
				{
					for(j = 0;j <3; j++)
					{
						for (k = 0; k <2; k++)
						{
							// mu(i,j,k) = mu(i,j,k) * n_jk(k,j)/mu_jk(1,j,k);
							if (mu_jk[j][k] > 0)
							{
								mutmp[i][j][k] = mutmp[i][j][k] * (GenoJointDistr[9*k + j]+GenoJointDistr[9*k + j + 3] +GenoJointDistr[9*k + j + 6])/mu_jk[j][k];
							}
							else
								mutmp[i][j][k] = 0;

						}
					}
				}
				//calculate Error
				muError = 0.0;
				for (i = 0; i<3; i++)
				{
					for(j = 0;j <3; j++)
					{
						for (k = 0; k <2; k++)
						{
							muError += Abs(mutmp[i][j][k]-mu0tmp[i][j][k]);
						}
					}
				}

			}// end for while

			// Interaction measure: 2n * ( sum p*log(p) - sum p*log(q) + log(sum q) ),
			// with p the observed cell probability and q the fitted one (mutmp/n).
			tao = 0.0;
			InteractionMeasure=0.0;
			for (i = 0; i<3; i++)// index for A
			{
				for (j = 0; j<3; j++) //index for B
				{
					for (k = 0; k<2; k++)	//index for C
					{
						ptmp1 = (double)GenoJointDistr[k*9 + i*3 +j]/n;
						if (ptmp1>0)
						{
							InteractionMeasure += ptmp1 * log(ptmp1);
						}
						ptmp2 = mutmp[i][j][k]/n;
						if (ptmp2>0)
						{
							InteractionMeasure += -ptmp1*log(ptmp2);
							tao += ptmp2;
						}
					}
				}
			}

			InteractionMeasure = (InteractionMeasure+log(tao))*n*2;
			InteractionMeasureSNPpair[ii] = InteractionMeasure; // update the interactionMeasure;
			//printf("InteractionMeasure : %f\n", InteractionMeasure);

			//PLINK
			//     BB Bb  bb
			//     AA  a  b  c
			//     Aa  d  e  f
			//     aa  g  h  i
			//          B            b
			//     A  4a+2b+2d+e   4c+2b+2f+e
			//     a  4g+2h+2d+e   4i+2h+2f+e

			//  case
			AlleleJointDistr[0] = 4*GenoJointDistr[0] + 2*GenoJointDistr[1] + 2*GenoJointDistr[3] + GenoJointDistr[4];
			AlleleJointDistr[1] = 4*GenoJointDistr[2] + 2*GenoJointDistr[1] + 2*GenoJointDistr[5] + GenoJointDistr[4];
			AlleleJointDistr[2] = 4*GenoJointDistr[6] + 2*GenoJointDistr[7] + 2*GenoJointDistr[3] + GenoJointDistr[4];
			AlleleJointDistr[3] = 4*GenoJointDistr[8] + 2*GenoJointDistr[7] + 2*GenoJointDistr[5] + GenoJointDistr[4];
			// control
			AlleleJointDistr[4] = 4*GenoJointDistr[9] + 2*GenoJointDistr[10] + 2*GenoJointDistr[12] + GenoJointDistr[13];
			AlleleJointDistr[5] = 4*GenoJointDistr[11] + 2*GenoJointDistr[10] + 2*GenoJointDistr[14] + GenoJointDistr[13];
			AlleleJointDistr[6] = 4*GenoJointDistr[15] + 2*GenoJointDistr[16] + 2*GenoJointDistr[12] + GenoJointDistr[13];
			AlleleJointDistr[7] = 4*GenoJointDistr[17] + 2*GenoJointDistr[16] + 2*GenoJointDistr[14] + GenoJointDistr[13];
			//
			// log odds ratio and its variance term for cases (aff) and controls (unf)
			// NOTE(review): the products are formed in int before the cast to double, and a
			// zero allele count leads to a division by zero.
			or_aff = log( (double)(AlleleJointDistr[0]*AlleleJointDistr[3])/ (double)(AlleleJointDistr[1]*AlleleJointDistr[2]) );
			v_aff = 1/(double)AlleleJointDistr[0] + 1/(double)AlleleJointDistr[1] + 1/(double)AlleleJointDistr[2] + 1/(double)AlleleJointDistr[3];

			or_unf = log( (double)(AlleleJointDistr[4]*AlleleJointDistr[7])/ (double)(AlleleJointDistr[5]*AlleleJointDistr[6]) );
			v_unf = 1/(double)AlleleJointDistr[4] + 1/(double)AlleleJointDistr[5] + 1/(double)AlleleJointDistr[6] + 1/(double)AlleleJointDistr[7];

			//zval[ii] = Abs( (or_aff - or_unf) / sqrt ( v_aff + v_unf ) );
			zval[ii] = ( (or_aff - or_unf) / sqrt ( v_aff + v_unf ) );

			//printf("InteractionMeasureSNPpair[%d] : %f, zval[%d] : %f\n", ii, InteractionMeasureSNPpair[ii], ii, zval[ii]);

			if (InteractionMeasureSNPpair[ii] > testThreshold) {
				passedCount++;
			}

			// progress report every 10000 pairs
			if ((ii+1)%10000==0) {
				printf("iteration %d.\n", ii+1);

				// flush the STD_OUT for external process to read
				FLUSH_STDOUT();
			}

		}
		
		printf("Number of SNP passed testing stage : %d\n", passedCount);
	}
	else {
		// the caller's testThreshold is replaced in this mode
		testThreshold = CHI_SQUARE_THRESHOLD_DF1;

		int jj = 0;
		int model = 0;
		int modelBits[9];
		int countTable[4];
		float chiSquareValue = 0.0;

		// Model selection : Chi-square test
		printf("\nStart model selection...\n");

		for (ii = 0; ii < InteractionCount ; ii++, iterJ1++, iterJ2++)
		{
			j1 = *iterJ1;
			j2 = *iterJ2;
			CalculateGenoJointDistr(genocase, genoctrl, p, nlongintcase, nlongintctrl, GenoJointDistr, j1, j2, pMarginalDistrSNP_Y);

			// Each model assigns every one of the nine genotype cells to group 0 or 1
			// according to the matching bit of the model number. Because model < 256,
			// modelBits[8] is always 0.
			for ( model = 0; model < 256; model++ ) {
				// decompose corresponding bits
				for ( jj = 0; jj < 9; jj++) {
					modelBits[jj] = ((0x0000001 << jj) & model) >> jj;
				}

				for ( jj = 0; jj < 4; jj++)
					countTable[jj] = 0;

				// countTable = { case group 0, case group 1, ctrl group 0, ctrl group 1 }
				for ( jj = 0; jj < 9; jj++ ) {
					if ( !modelBits[jj] ) {
						countTable[0] += GenoJointDistr[jj];
						countTable[2] += GenoJointDistr[9+jj];
					}
					else {
						countTable[1] += GenoJointDistr[jj];
						countTable[3] += GenoJointDistr[9+jj];
					}
				}

				chiSquareValue = CalculateChiSquareOfModel(countTable);

				// record every (pair, model) combination that reaches the threshold
				if ( chiSquareValue >= testThreshold ) {
					selectedJ1.push_back(j1);
					selectedJ2.push_back(j2);
					selectedModel.push_back(model);
					selectedModelChiSquareValue.push_back(chiSquareValue);
				}
			}
		}
		printf("Number of SNP passed testing stage : %d\n", selectedJ1.size());
	}

	time(&ed);
	//printf("maxInteraction : %f \t minInteraction: %f \n", maxInteraction, minInteraction);
	

	// NOTE(review): DistrCollection is never written after calloc, so this file holds 1001 zeros.
	fp = fopen(distrCollectionFilename,"w");
	for (i = 0; i<1001; i++)
	{
		fprintf(fp, "%d\n", DistrCollection[i]);
	}

	fclose(fp);

	if ( testMode == EXACT_GUESS ) {
		// one line per pair above the threshold: pair index, both SNP indices, their
		// marginal associations, the interaction measure and the z value
		iterJ1 = offsetListJ1.begin();
		iterJ2 = offsetListJ2.begin();
		fp = fopen(interactionFilename,"w");
		for (i = 0; i < InteractionCount; i++, iterJ1++, iterJ2++)
		{
			if (InteractionMeasureSNPpair[i] > testThreshold) {
				fprintf(fp,"%7d\t%7d\t%7d\t%f\t%f\t%f\t%f\n", i, *iterJ1, *iterJ2, MarginalAssociation[*iterJ1], MarginalAssociation[*iterJ2], InteractionMeasureSNPpair[i], zval[i]);
				//fprintf(fp,"%7d\t%7d\t%7d\t%f\t%f\n", i,InteractionSNPpairs[2*i],InteractionSNPpairs[2*i+1], InteractionMeasureSNPpair[i], zval[i]);
			}
		}
		fclose(fp);

		//fp = fopen("AssociationRecords.txt","w");
		//for (i = 0; i < AssociationCount; i++)
		//{
		//	//if (AssociationMeasureSNPpair[i] > tau2)
		//	fprintf(fp,"%7d\t%7d\t%7d\t%f\t%f\n", i,AssociationSNPpairs[2*i],AssociationSNPpairs[2*i+1], AssociationMeasureSNPpair[2*i], AssociationMeasureSNPpair[2*i+1]);
		//}
		//fclose(fp);
	}
	else {
		// one line per selected (pair, model): running index, both SNP indices,
		// model number and chi-square value
		std::list<int>::iterator iterModel;
		std::list<float>::iterator iterChiSquareValue;
		iterJ1 = selectedJ1.begin();
		iterJ2 = selectedJ2.begin();
		iterModel = selectedModel.begin();
		iterChiSquareValue = selectedModelChiSquareValue.begin();

		fp = fopen(interactionFilename,"w");
		for (i = 0; i < selectedJ1.size(); i++, iterJ1++, iterJ2++, iterModel++, iterChiSquareValue++) {
			fprintf(fp,"%7d\t%7d\t%7d\t%7d\t%f\n", i, *iterJ1, *iterJ2, *iterModel, *iterChiSquareValue);
		}
		fclose(fp);
	}
	
	// elapsed time of the screening and testing stages (st was reset before them)
	printf("cputime: %d\n", (int)ed - st);
	printf("Progress:%d%%\n", 100);

	// flush the STD_OUT for external process to read
	FLUSH_STDOUT();

	free(Pab);
	free(Pbc);
	free(Pca);

	free(DistrCollection);
	free(InteractionSNPpairs);
	free(InteractionMeasureSNPpair);

	//free(AssociationSNPpairs);
	//free(AssociationMeasureSNPpair);

	free(MarginalEntropySNP);
	free(MarginalEntropySNP_Y);
	free(MarginalAssociation);
	free(GenoJointDistr);
	free(pMarginalDistrSNP);
	free(pMarginalDistrSNP_Y);

	free(AlleleJointDistr);//PLINK
	free(zval);//PLINK

	// The genotype arrays were allocated with calloc above, so plain free() is the
	// matching release; the cudaFreeHost calls belong to the disabled cudaHostAlloc variant.
	free(genocase);
	free(genoctrl);
	//cudaFreeHost(genocase);
	//cudaFreeHost(genoctrl);

	return 0;
}

int BOOST_CPU(char* inputFilename, char* outputFilePrefix, float preFilteringThreshold, int screenMode, float screenThreshold, int testMode, float testThreshold)
{
	time_t st, ed;
	int* pMarginalDistrSNP;
	int* pMarginalDistrSNP_Y;

	// Prepare for out filename
	char marginalAssociationFilename[255], distrCollectionFilename[255], interactionFilename[255];
	if ( testMode == EXACT_GUESS ) {
		sprintf(marginalAssociationFilename, "%sMarginalAssociation.txt", outputFilePrefix);
		sprintf(distrCollectionFilename, "%sDistrCollection.txt", outputFilePrefix);
		sprintf(interactionFilename, "%sInteractionRecords.txt", outputFilePrefix);
	}
	else {
		sprintf(marginalAssociationFilename, "%sMarginalAssociation.txt", outputFilePrefix);
		sprintf(distrCollectionFilename, "%sDistrCollection.txt", outputFilePrefix);
		sprintf(interactionFilename, "%sModelSelectionRecords.txt", outputFilePrefix);
	}

	/* Declare variable */
	int *DataSize;
	int ndataset;

	int n, p;  //n--number of samples; p number of varibles
	int ncase, nctrl, nlongintcase, nlongintctrl;
	int icase, ictrl;
	uint64 *genocase, *genoctrl;
	int interactionPairCount = 0;
	long long indexShift = 0;
	int count = 0;
	int x = 0;

	int *GenoJointDistr, *AlleleJointDistr;
	double or_aff, v_aff, or_unf, v_unf, pvalPLINK;//PLINK
	double *zval; //PLINK

	double *MarginalEntropySNP, *MarginalEntropySNP_Y, *MarginalAssociation;
	double tau1, tau2; // thresholds of test association allowing for interactions
	register double JointEntropyTwoSNP, JointEntropyTwoSNP_Y, MarginalEntropyY, ptmp1, ptmp2;
	register double InteractionMeasure;
	double *InteractionMeasureSNPpair, *AssociationMeasureSNPpair;

	double maxInteraction, minInteraction;
	double *Pab, *Pbc, *Pca; // conditional probability P(a|b), P(b|c), P(c|a)
	double tao; // normalization term;

	int *DistrCollection;
	int *InteractionSNPpairs;
	int InteractionCount;

	int *AssociationSNPpairs;
	int AssociationCount;

	int flag, i, ii, j, k, j1,j2;
	int tmp;
	// used for post-correction (post-correction is exact solution of loglinear model)
	static double mu[3][3][2] = {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
	static double mu0[3][3][2] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
	static double mutmp[3][3][2];
	static double mu0tmp[3][3][2];

	static double mu_ij[3][3];
	static double mu_ik[3][2];
	static double mu_jk[3][2];
	double muError;

	int LengthLongType=64;
	uint64 mask1 = 0x0000000000000001;

	int buffersize = 50000;
	int buffersizeAssociation = 50000;

	// load data
	FILE *fp, *fp_i;
	char filename_i[100];

	fp = fopen(inputFilename,"r");
	if(fp == NULL)
	{
		fprintf(stderr,"can't open input file %s\n",inputFilename);
		return -1;
	}

	// precompute the wordbits (a global variable)
	for (i = 0; i<65536; i++)
	{
		wordbits[i] = bitCount(i);
		//printf("%d\n",wordbits[i]);
	}
	//cuda_SetWordBits(wordbits, 65536);
	printf("start loading ...\n");
	
	// flush the STD_OUT for external process to read
	FLUSH_STDOUT();

	ndataset = GetDataSize(inputFilename, &DataSize);

	n = DataSize[0];
	p = 0;
	printf("n = %d\n", n);
	for (i = 0; i<ndataset ; i++)
	{
		p += DataSize[ndataset*1 + i];
		printf("DataSize %d-th file: p[%d] = %d \n", i+1, i+1, DataSize[ndataset*1 + i]);
	}
	printf("p = %d\n",p);

	// get ncase and nctrl
	i = 0;
	j = 0;

	ncase = 0;
	nctrl = 0;

	rewind(fp);

	// only use the first file to get ncase and nctrl
	fscanf(fp, "%s\n", &filename_i);
	printf("%s\n", filename_i);
	fp_i = fopen(filename_i, "r");

	while(!feof(fp_i)) {

		//if (n*j + i == 400)
		//{
		//	printf("%d,%d\n",i,j);
		//}

		/* loop through and store the numbers into the array */
		if(j==0)
		{
			//j = 0 means read ind class label y
			fscanf(fp_i, "%d", &tmp);

			if (tmp)
			{
				// tmp=1 means case
				ncase++;

			}
			else
			{
				nctrl ++;

			}
			j++;
		}
		else
		{
			fscanf(fp_i, "%d", &tmp);
			j++; //column index
			if (j==(DataSize[ndataset]+1)) // DataSize[ndataset] is the nsnp in the first dataset
			{
				j=0;
				i++; // row index
			}

		}

		if (i>=n)
		{
			break;
		}
	}

	printf("total sample: %d (ncase = %d; nctrl = %d).\n", n, (int)ncase, (int)nctrl);

	nlongintcase = ceil( ((double) ncase)/LengthLongType);
	nlongintctrl = ceil( ((double) nctrl)/LengthLongType);
	printf("nLongIntcase = %d; nLongIntctrl = %d.\n", nlongintcase, nlongintctrl);

	// flush the STD_OUT for external process to read
	FLUSH_STDOUT();

	//calloc memory for bit representation
	genocase = (uint64*)calloc(3*p*nlongintcase, sizeof(uint64));
	genoctrl = (uint64*)calloc(3*p*nlongintctrl, sizeof(uint64));
	memset(genocase, 0, 3*p*nlongintcase*sizeof(uint64));
	memset(genoctrl, 0, 3*p*nlongintctrl*sizeof(uint64));

	//load data to bit representation
	rewind(fp);

	time(&st);
	j = 0; // column index
	ii = 0; // file index
	k = 0;
	while(!feof(fp)) { 
		ii++;
		fscanf(fp, "%s\n", &filename_i);

		fp_i = fopen(filename_i, "r");
		if(fp_i == NULL)
		{
			fprintf(stderr,"can't open input file %s\n", filename_i);
			exit(1);
		}

		i = 0; //row index
		icase = -1;
		ictrl = -1;

		printf("Loading data in file %d: %s\n", ii, filename_i);
		
		// flush the STD_OUT for external process to read
		FLUSH_STDOUT();

		while(!feof(fp_i)) { 
			/* loop through and store the numbers into the array */

			if(j==0)
			{
				//j = 0 means read class label y
				fscanf(fp_i, "%d", &tmp);

				if (tmp)
				{
					// tmp=1 means case
					icase ++;
					flag = 1;
				}
				else
				{
					ictrl ++;
					flag = 0;
				}
				j++;
			}
			else
			{
				fscanf(fp_i, "%d", &tmp);

				if (flag)
				{
					genocase[((icase/LengthLongType)*3+tmp)*p+(j+k-1)] |= (mask1 << (icase%LengthLongType));
				}
				else
				{
					genoctrl[((ictrl/LengthLongType)*3+tmp)*p+(j+k-1)] |= (mask1 << (ictrl%LengthLongType));
				}

				j++; //column index
				if (j==(DataSize[ndataset + ii-1]+1))
				{
					j=0;
					i++; // row index
				}

			}

			if (i>=n)
			{
				break;
			}
		}

		fclose(fp_i);
		k += DataSize[ndataset + ii-1];
	}

	fclose(fp);
	//printf("Number of numbers read: %d\n\n", n*p);
	time(&ed);
	printf("cputime for loading data: %d seconds\n", (int)ed -st);
	free(DataSize);

	// flush the STD_OUT for external process to read
	FLUSH_STDOUT();

	// calculate marginal distribution
	ptmp1 = (double) ncase/n;
	MarginalEntropyY =  -ptmp1 *log(ptmp1) - (1-ptmp1) *log(1-ptmp1);

	MarginalEntropySNP = (double *)calloc(p, sizeof(double));
	MarginalEntropySNP_Y = (double *)calloc(p, sizeof(double));
	MarginalAssociation = (double *)calloc(p, sizeof(double));

	CalculateMarginalEntropy(genocase, genoctrl, p, n, nlongintcase, nlongintctrl, MarginalEntropySNP, MarginalEntropySNP_Y);

	fp = fopen(marginalAssociationFilename,"w");
	for (i = 0; i<p; i++)
	{
		MarginalAssociation[i] = (-MarginalEntropySNP_Y[i] + MarginalEntropySNP[i] + MarginalEntropyY)*n*2;
		fprintf(fp, "%d %f\n", i, MarginalAssociation[i]);
	}
	fclose(fp);

	pMarginalDistrSNP = (int *)malloc(MarginalDistrSNP_Y_DimensionY*p*sizeof(int));
	pMarginalDistrSNP_Y = (int *)malloc(MarginalDistrSNP_Y_DimensionY*MarginalDistrSNP_Y_DimensionX*p*sizeof(int));
	CalculateMarginalDistr(genocase, genoctrl, p, n, nlongintcase, nlongintctrl, pMarginalDistrSNP, pMarginalDistrSNP_Y);

	time(&st);
	
	// calculate joint distribution
	GenoJointDistr = (int *)calloc(2*9, sizeof(int));
	AlleleJointDistr = (int *)calloc(2*4, sizeof(int) ); // for PLINK

	//printf("Required MB : %f\n", (p-1)*p/2*5*4/1024.0f/1024.0f);
	//system("pause");

	Pab		= (double *)calloc(9, sizeof(double));
	Pbc		= (double *)calloc(6, sizeof(double));
	Pca		= (double *)calloc(6, sizeof(double));

	DistrCollection = (int *)calloc(1001,sizeof(int));
	InteractionSNPpairs = (int *) calloc(buffersize*2, sizeof(int));
	InteractionMeasureSNPpair = (double *) calloc(buffersize,sizeof(double));

	AssociationSNPpairs = (int *) calloc(buffersizeAssociation*2, sizeof(int));// association
	AssociationMeasureSNPpair = (double *) calloc(buffersizeAssociation*2,sizeof(double));// association measure

	//printf("Required MB for buffer : %f\n", (buffersize*4*4+buffersize*3*8)/1024.0f/1024.0f);
	//system("pause");

	InteractionCount = 0;
	AssociationCount = 0;

	maxInteraction = - 9999999;
	minInteraction = 9999999;

	tau1 = 30.0;
	tau2 = 50.0;

	int oldProgress = 0, newProgress = 0;

	// Perform pre-filtering
	std::vector<int> indexVector;

	if ( abs(preFilteringThreshold - DEL) != DEFAULT_PREFILTERING_VALUE ) {
		for ( int i = 0; i < p; i++ ) {
			if ( MarginalAssociation[i] < preFilteringThreshold ) {
				indexVector.push_back(i);
			}
		}
	}
	else {
		for ( int i = 0; i < p; i++ ) {
			indexVector.push_back(i);
		}
	}

	printf("Number of SNP after pre-filtering : %d\n", indexVector.size());

	double totalTask = indexVector.size()-1;
	int inputSize = indexVector.size();

	printf("\nStart screening ... \n");
	if ( screenMode == SCREENING_KSA ) {
		for (int indexJ1 = 0; indexJ1<inputSize-1; indexJ1++)
		{
			for (int indexJ2 = indexJ1+1; indexJ2<inputSize; indexJ2++)
			{
				j1 = indexVector[indexJ1];
				j2 = indexVector[indexJ2];
				CalculateGenoJointDistr(genocase, genoctrl, p, nlongintcase, nlongintctrl, GenoJointDistr, j1, j2, pMarginalDistrSNP_Y);

				////Test association allowing for interactions
				//if ((MarginalAssociation[j1]< tau1) && (MarginalAssociation[j2]< tau1))
				//{

				//	//tao = 0.0;
				//	AssociationMeasure=0.0;
				//	AssociationMeasure1=0.0;
				//	AssociationMeasure2=0.0;

				//	for (i = 0; i<3; i++)// index for A
				//	{
				//		for (j = 0; j<3; j++) //index for B
				//		{
				//			for (k = 0; k<2; k++)	//index for C
				//			{
				//				ptmp1 = (double)GenoJointDistr[offset*2*9+k*9 + i*3 +j]/n;
				//				if (ptmp1>0)
				//				{
				//					AssociationMeasure += ptmp1 * log(ptmp1);
				//				}

				//				//AssociationMeasure1=AssociationMeasure;
				//				//AssociationMeasure2=AssociationMeasure;

				//				ptmp2 = (double)(GenoJointDistr[offset*2*9+i*3 +j]+GenoJointDistr[offset*2*9+9+i*3 +j])*pMarginalDistr[j2].MarginalDistrSNP_Y[j*MarginalDistrSNP_Y_DimensionX+k]/pMarginalDistr[j2].MarginalDistrSNP[j]/n;

				//				// n_{ij}*n_{jk}/n_{j}/n
				//				if (ptmp2>0)
				//				{
				//					AssociationMeasure1 += -ptmp1*log(ptmp2);
				//					//tao += ptmp2;
				//				}

				//				ptmp3 = (double)(GenoJointDistr[offset*2*9+i*3 +j]+GenoJointDistr[offset*2*9+9+i*3 +j])*pMarginalDistr[j1].MarginalDistrSNP_Y[i*MarginalDistrSNP_Y_DimensionX+k]/pMarginalDistr[j1].MarginalDistrSNP[i]/n;

				//				// n_{ij}*n_{jk}/n_{j}/n
				//				if (ptmp3>0)
				//				{
				//					AssociationMeasure2 += -ptmp1*log(ptmp3);
				//					//tao += ptmp2;
				//				}
				//			}
				//		}
				//	}

				//	AssociationMeasure1 = (AssociationMeasure + AssociationMeasure1)*n*2 ;
				//	AssociationMeasure2 = (AssociationMeasure + AssociationMeasure2)*n*2 ;

				//	if ((AssociationMeasure1 > tau2) || (AssociationMeasure2 > tau2)) {

				//		if (AssociationCount >= buffersizeAssociation)// buffersize is not enough, calloc new memory
				//		{
				//			buffersizeAssociation = buffersizeAssociation * 2;
				//			AssociationSNPpairs = (int *)realloc(AssociationSNPpairs, buffersizeAssociation*2*sizeof(int));
				//			AssociationMeasureSNPpair = (double *)realloc(AssociationMeasureSNPpair, buffersizeAssociation*2*sizeof(double));

				//			AssociationSNPpairs[2*AssociationCount] = j1;
				//			AssociationSNPpairs[2*AssociationCount + 1] = j2;

				//			AssociationMeasureSNPpair[2*AssociationCount] = AssociationMeasure1;
				//			AssociationMeasureSNPpair[2*AssociationCount+1] = AssociationMeasure2;
				//			AssociationCount ++;
				//			//printf("InteractionCount: %6d\t(%6d,%6d)\t %f\n", InteractionCount,j1,j2, InteractionMeasure);
				//		}
				//		else
				//		{
				//			AssociationSNPpairs[2*AssociationCount] = j1;
				//			AssociationSNPpairs[2*AssociationCount + 1] = j2;

				//			AssociationMeasureSNPpair[2*AssociationCount] = AssociationMeasure1;
				//			AssociationMeasureSNPpair[2*AssociationCount+1] = AssociationMeasure2;
				//			AssociationCount ++;
				//			//printf("InteractionCount: %6d\t(%6d,%6d)\t %f\n", InteractionCount,j1,j2, InteractionMeasure);
				//		}

				//	}


				//}// end for test association

				/*
				GenoJointDistr: the index is as follows:
				AABB
				Case	0	1	2	3	4	5	6	7	8
				Ctrl	9	10	11	12	13	14	15	16	17
				*/
				tao = 0.0;
				InteractionMeasure=0.0;
				for (i = 0; i<3; i++)// index for A
				{
					for (j = 0; j<3; j++) //index for B
					{
						for (k = 0; k<2; k++)	//index for C
						{
							//ptmp1 = (double)GenoJointDistr[offset*2*9+k*9 + i*3 +j];
							ptmp1 = (double)GenoJointDistr[k*9 + i*3 +j];
							if (ptmp1>0)
							{
								InteractionMeasure += ptmp1 * log(ptmp1);
							}

							if ( k == 0 ) {
								//ptmp2 KSA  n_{ij}*n_{jk}*n_{ik}/ (n_i * n_j * n_k)
								//ptmp2 = (double)n*(GenoJointDistr[offset*2*9+i*3+j]+GenoJointDistr[offset*2*9+9+i*3+j])*pMarginalDistr[j1].MarginalDistrSNP_Y[i*MarginalDistrSNP_Y_DimensionX+k]*pMarginalDistr[j2].MarginalDistrSNP_Y[j*MarginalDistrSNP_Y_DimensionX+k]/pMarginalDistr[j1].MarginalDistrSNP[i]/pMarginalDistr[j2].MarginalDistrSNP[j]/ncase;
								ptmp2 = (double)n*(GenoJointDistr[i*3+j]+GenoJointDistr[9+i*3+j])*
									pMarginalDistrSNP_Y[(i*MarginalDistrSNP_Y_DimensionX+k)*p+j1]*
									pMarginalDistrSNP_Y[(j*MarginalDistrSNP_Y_DimensionX+k)*p+j2]/
									pMarginalDistrSNP[i*p+j1]/
									pMarginalDistrSNP[j*p+j2]/
									ncase;
							}
							else if ( k == 1 ) {
								//ptmp2 = (double)n*(GenoJointDistr[offset*2*9+i*3+j]+GenoJointDistr[offset*2*9+9+i*3+j])*pMarginalDistr[j1].MarginalDistrSNP_Y[i*MarginalDistrSNP_Y_DimensionX+k]*pMarginalDistr[j2].MarginalDistrSNP_Y[j*MarginalDistrSNP_Y_DimensionX+k]/pMarginalDistr[j1].MarginalDistrSNP[i]/pMarginalDistr[j2].MarginalDistrSNP[j]/nctrl;
								ptmp2 = (double)n*(GenoJointDistr[i*3+j]+GenoJointDistr[9+i*3+j])*
									pMarginalDistrSNP_Y[(i*MarginalDistrSNP_Y_DimensionX+k)*p+j1]*
									pMarginalDistrSNP_Y[(j*MarginalDistrSNP_Y_DimensionX+k)*p+j2]/
									pMarginalDistrSNP[i*p+j1]/
									pMarginalDistrSNP[j*p+j2]/
									nctrl;
							}

							if (ptmp2>0)
							{
								InteractionMeasure += -ptmp1*log(ptmp2);
								tao += ptmp2;
							}

						}
					}
				}

				InteractionMeasure = (InteractionMeasure+n*log(tao/n))*2;

				if (InteractionMeasure > maxInteraction)
				{
					maxInteraction = InteractionMeasure;
				}

				if (InteractionMeasure < minInteraction)
				{
					minInteraction = InteractionMeasure;
				}

				if (InteractionMeasure > screenThreshold)
				{

					if (InteractionCount >= buffersize)// buffersize is not enough, calloc new memory
					{
						buffersize = buffersize * 2;
						InteractionSNPpairs = (int *)realloc(InteractionSNPpairs, buffersize*2*sizeof(int));
						InteractionMeasureSNPpair = (double *)realloc(InteractionMeasureSNPpair, buffersize*sizeof(double));

						InteractionSNPpairs[2*InteractionCount] = j1;
						InteractionSNPpairs[2*InteractionCount + 1] = j2;

						InteractionMeasureSNPpair[InteractionCount] = InteractionMeasure;
						InteractionCount ++;
						//printf("InteractionCount: %6d\t(%6d,%6d)\t %f\n", InteractionCount,j1,j2, InteractionMeasure);
					}
					else
					{
						InteractionSNPpairs[2*InteractionCount] = j1;
						InteractionSNPpairs[2*InteractionCount + 1] = j2;

						InteractionMeasureSNPpair[InteractionCount] = InteractionMeasure;
						InteractionCount ++;
						//printf("InteractionCount: %6d\t(%6d,%6d)\t %f\n", InteractionCount,j1,j2, InteractionMeasure);
					}
				}

				if (InteractionMeasure > 100)
				{
					DistrCollection[1000] ++;
					//printf("pair:%d,%d\t %f\n", j1,j2, InteractionMeasure);
				}
				else if(InteractionMeasure >0)
				{
					DistrCollection[(int)(InteractionMeasure/0.1)] ++;
				}

				//if ((j1 == 49) && (j2 == 149))
				//    printf("(%d, %d) interactionMeasure:%f\n", j1, j2, InteractionMeasure);

			}
			if ((j1+1)%100==0) {
				newProgress = floor((indexJ1+1) / totalTask * 100);
				printf("\riteration %d. Progress:%d%%", indexJ1+1, newProgress);

				if ( newProgress - oldProgress > 1 ) {
					oldProgress = newProgress;

					// flush the STD_OUT for external process to read
					FLUSH_STDOUT();
				}
			}
		}
		printf("\n");
	}
	else {
		float ratio[9];
		int index[9];
		int input[4];
		float numerator, denominator, chiSquareValue, maxChiSquareValue = 0.0f;

		for (int indexJ1 = 0; indexJ1<inputSize-1; indexJ1++)
		{
			for (int indexJ2 = indexJ1+1; indexJ2<inputSize; indexJ2++)
			{
				j1 = indexVector[indexJ1];
				j2 = indexVector[indexJ2];
				CalculateGenoJointDistr(genocase, genoctrl, p, nlongintcase, nlongintctrl, GenoJointDistr, j1, j2, pMarginalDistrSNP_Y);

				for ( i = 0; i < 9; i++ ) {
					if ( GenoJointDistr[9+i] == 0 ) {
						ratio[i] = GenoJointDistr[i];
					}
					else {
						ratio[i] = GenoJointDistr[i] / ((float)GenoJointDistr[9+i]);
					}
					index[i] = i;
				}

				// insertion sort
				for ( i = 0; i < 9; i++ ) {
					for ( j = i+1; j < 9; j++ ) {
						if ( ratio[i] < ratio[j] ) {
							ratio[i] = ratio[i] + ratio[j];
							ratio[j] = ratio[i] - ratio[j];
							ratio[i] = ratio[i] - ratio[j];

							index[i] = index[i] + index[j];
							index[j] = index[i] - index[j];
							index[i] = index[i] - index[j];
						}
					}
				}

				maxChiSquareValue = 0.0f;
				for ( i = 1; i < 9; i++ ) {					
					input[0] = 0;
					input[1] = 0;
					input[2] = 0;
					input[3] = 0;
					for ( j = 0; j < i; j++ ) {
						input[0] += GenoJointDistr[index[j]];
						input[2] += GenoJointDistr[9+index[j]];
					}
					for ( j = i; j < 9; j++ ) {
						input[1] += GenoJointDistr[index[j]];
						input[3] += GenoJointDistr[9+index[j]];
					}

					numerator = ((float)(input[0]*input[3] - input[1]*input[2])*(input[0]*input[3] - input[1]*input[2]))*(input[0]+input[1]+input[2]+input[3]);
					denominator = ((float)(input[0]+input[1]))*(input[2]+input[3])*(input[1]+input[3])*(input[0]+input[2]);

					if ( abs(denominator) < 0.000001 ) {
						chiSquareValue = 0.0f;
					}
					else {
						chiSquareValue = numerator/denominator;
					}

					if ( chiSquareValue > maxChiSquareValue ) 
						maxChiSquareValue = chiSquareValue;
				}

				if ( maxChiSquareValue > screenThreshold )
				{

					if ( InteractionCount >= buffersize )// buffersize is not enough, calloc new memory
					{
						buffersize = buffersize * 2;
						InteractionSNPpairs = (int *)realloc(InteractionSNPpairs, buffersize*2*sizeof(int));
						InteractionMeasureSNPpair = (double *)realloc(InteractionMeasureSNPpair, buffersize*sizeof(double));

						InteractionSNPpairs[2*InteractionCount] = j1;
						InteractionSNPpairs[2*InteractionCount + 1] = j2;

						InteractionMeasureSNPpair[InteractionCount] = InteractionMeasure;
						InteractionCount ++;
						//printf("InteractionCount: %6d\t(%6d,%6d)\t %f\n", InteractionCount,j1,j2, InteractionMeasure);
					}
					else
					{
						InteractionSNPpairs[2*InteractionCount] = j1;
						InteractionSNPpairs[2*InteractionCount + 1] = j2;

						InteractionMeasureSNPpair[InteractionCount] = InteractionMeasure;
						InteractionCount ++;
						//printf("InteractionCount: %6d\t(%6d,%6d)\t %f\n", InteractionCount,j1,j2, InteractionMeasure);
					}
				}
			}

			if ((j1+1)%100==0) {
				newProgress = floor((indexJ1+1) / totalTask * 100);
				printf("\riteration %d. Progress:%d%%", indexJ1+1, newProgress);

				if ( newProgress - oldProgress > 1 ) {
					oldProgress = newProgress;

					// flush the STD_OUT for external process to read
					FLUSH_STDOUT();
				}
			}
		}
		printf("\n");
	}

	printf("Number of SNP passed screening stage : %d\n", InteractionCount);

	zval = (double *)calloc(InteractionCount, sizeof(double));
	InteractionMeasureSNPpair = (double *) realloc(InteractionMeasureSNPpair, InteractionCount*sizeof(double));

	std::list<int> selectedJ1;
	std::list<int> selectedJ2;
	std::list<int> selectedModel;
	std::list<float> selectedModelChiSquareValue;
	float chisquareThreshold = CHI_SQUARE_THRESHOLD_DF1;

	if ( testMode == EXACT_GUESS ) {
		int passedCount = 0;
		//post-correction
		printf("\nStart post-correction: Exact Gtest...\n");

		// flush the STD_OUT for external process to read
		FLUSH_STDOUT();

		zval = (double *)calloc(InteractionCount, sizeof(double));
		InteractionMeasureSNPpair = (double *) realloc(InteractionMeasureSNPpair, InteractionCount*sizeof(double));

		for (ii = 0; ii < InteractionCount ; ii++)
		{
			j1 = InteractionSNPpairs[2*ii];
			j2 = InteractionSNPpairs[2*ii + 1];
			CalculateGenoJointDistr(genocase, genoctrl, p, nlongintcase, nlongintctrl, GenoJointDistr, j1, j2, pMarginalDistrSNP_Y);

			//printf("j1 : %d, j2 : %d\n", j1, j2);

			memcpy(mutmp, mu, 18*sizeof(double));
			memcpy(mu0tmp, mu0, 18*sizeof(double));

			muError = 0.0;
			for (i = 0; i<3; i++)
			{
				for(j = 0;j <3; j++)
				{
					for (k = 0; k <2; k++)
					{
						muError += Abs(mutmp[i][j][k]-mu0tmp[i][j][k]);
					}
				}
			}

			while (muError > 0.001)
			{
				memcpy(mu0tmp, mutmp, 18*sizeof(double)); //mu0tmp = mutmp;

				// mu_ij
				for (i = 0; i<3; i++)
				{
					for (j = 0; j<3; j++)
					{
						mu_ij[i][j] = mutmp[i][j][0] + mutmp[i][j][1];
					}

				}
				//mu_ijk = mu_ijk*n_ij/mu_ij
				for (i = 0; i<3; i++)
				{
					for(j = 0;j <3; j++)
					{
						for (k = 0; k <2; k++)
						{
							if (mu_ij[i][j]>0)
							{
								mutmp[i][j][k] = mutmp[i][j][k] * (GenoJointDistr[3*i + j]+GenoJointDistr[9 + 3*i + j])/mu_ij[i][j];
							}
							else
								mutmp[i][j][k] = 0;

						}
					}
				}

				// mu_ik
				for (i = 0; i<3; i++)
				{
					for (k = 0; k<2; k++)
					{
						mu_ik[i][k] = mutmp[i][0][k] + mutmp[i][1][k] + mutmp[i][2][k];
					}

				}
				//mu_ijk = mu_ijk*n_ik/mu_ik
				for (i = 0; i<3; i++)
				{
					for(j = 0;j <3; j++)
					{
						for (k = 0; k <2; k++)
						{
							// mu(i,j,k) = mu(i,j,k) * n_ik(i,k)/mu_ik(i,1,k);
							if (mu_ik[i][k] > 0)
							{
								mutmp[i][j][k] = mutmp[i][j][k] * (GenoJointDistr[9*k + 3*i ]+GenoJointDistr[9*k + 3*i + 1] +GenoJointDistr[9*k + 3*i + 2])/mu_ik[i][k];
							}
							else
								mutmp[i][j][k] = 0;
						}
					}
				}

				// mu_jk
				for (j = 0; j<3; j++)
				{
					for (k = 0; k<2; k++)
					{
						mu_jk[j][k] = mutmp[0][j][k] + mutmp[1][j][k] + mutmp[2][j][k];
					}

				}

				//mu_ijk = mu_ijk*n_jk/mu_jk
				for (i = 0; i<3; i++)
				{
					for(j = 0;j <3; j++)
					{
						for (k = 0; k <2; k++)
						{
							// mu(i,j,k) = mu(i,j,k) * n_jk(k,j)/mu_jk(1,j,k);
							if (mu_jk[j][k] > 0)
							{
								mutmp[i][j][k] = mutmp[i][j][k] * (GenoJointDistr[9*k + j]+GenoJointDistr[9*k + j + 3] +GenoJointDistr[9*k + j + 6])/mu_jk[j][k];
							}
							else
								mutmp[i][j][k] = 0;

						}
					}
				}
				//calculate Error
				muError = 0.0;
				for (i = 0; i<3; i++)
				{
					for(j = 0;j <3; j++)
					{
						for (k = 0; k <2; k++)
						{
							muError += Abs(mutmp[i][j][k]-mu0tmp[i][j][k]);
						}
					}
				}

			}// end for while

			tao = 0.0;
			InteractionMeasure=0.0;
			for (i = 0; i<3; i++)// index for A
			{
				for (j = 0; j<3; j++) //index for B
				{
					for (k = 0; k<2; k++)	//index for C
					{
						ptmp1 = (double)GenoJointDistr[k*9 + i*3 +j]/n;
						if (ptmp1>0)
						{
							InteractionMeasure += ptmp1 * log(ptmp1);
						}
						ptmp2 = mutmp[i][j][k]/n;
						if (ptmp2>0)
						{
							InteractionMeasure += -ptmp1*log(ptmp2);
							tao += ptmp2;
						}
					}
				}
			}

			InteractionMeasure = (InteractionMeasure+log(tao))*n*2;
			InteractionMeasureSNPpair[ii] = InteractionMeasure; // update the interactionMeasure;
			//printf("InteractionMeasure : %f\n", InteractionMeasure);

			//PLINK
			//     BB Bb  bb
			//     AA  a  b  c
			//     Aa  d  e  f
			//     aa  g  h  i
			//          B            b
			//     A  4a+2b+2d+e   4c+2b+2f+e
			//     a  4g+2h+2d+e   4i+2h+2f+e

			//  case
			AlleleJointDistr[0] = 4*GenoJointDistr[0] + 2*GenoJointDistr[1] + 2*GenoJointDistr[3] + GenoJointDistr[4];
			AlleleJointDistr[1] = 4*GenoJointDistr[2] + 2*GenoJointDistr[1] + 2*GenoJointDistr[5] + GenoJointDistr[4];
			AlleleJointDistr[2] = 4*GenoJointDistr[6] + 2*GenoJointDistr[7] + 2*GenoJointDistr[3] + GenoJointDistr[4];
			AlleleJointDistr[3] = 4*GenoJointDistr[8] + 2*GenoJointDistr[7] + 2*GenoJointDistr[5] + GenoJointDistr[4];
			// control
			AlleleJointDistr[4] = 4*GenoJointDistr[9] + 2*GenoJointDistr[10] + 2*GenoJointDistr[12] + GenoJointDistr[13];
			AlleleJointDistr[5] = 4*GenoJointDistr[11] + 2*GenoJointDistr[10] + 2*GenoJointDistr[14] + GenoJointDistr[13];
			AlleleJointDistr[6] = 4*GenoJointDistr[15] + 2*GenoJointDistr[16] + 2*GenoJointDistr[12] + GenoJointDistr[13];
			AlleleJointDistr[7] = 4*GenoJointDistr[17] + 2*GenoJointDistr[16] + 2*GenoJointDistr[14] + GenoJointDistr[13];
			//
			or_aff = log( (double)(AlleleJointDistr[0]*AlleleJointDistr[3])/ (double)(AlleleJointDistr[1]*AlleleJointDistr[2]) );
			v_aff = 1/(double)AlleleJointDistr[0] + 1/(double)AlleleJointDistr[1] + 1/(double)AlleleJointDistr[2] + 1/(double)AlleleJointDistr[3];

			or_unf = log( (double)(AlleleJointDistr[4]*AlleleJointDistr[7])/ (double)(AlleleJointDistr[5]*AlleleJointDistr[6]) );
			v_unf = 1/(double)AlleleJointDistr[4] + 1/(double)AlleleJointDistr[5] + 1/(double)AlleleJointDistr[6] + 1/(double)AlleleJointDistr[7];

			//zval[ii] = Abs( (or_aff - or_unf) / sqrt ( v_aff + v_unf ) );
			zval[ii] = ( (or_aff - or_unf) / sqrt ( v_aff + v_unf ) );

			//printf("InteractionMeasureSNPpair[%d] : %f, zval[%d] : %f\n", ii, InteractionMeasureSNPpair[ii], ii, zval[ii]);

			if ( InteractionMeasureSNPpair[ii] > testThreshold ) {
				passedCount++;
			}

			if ((ii+1)%10000==0) {
				printf("iteration %d.\n", ii+1);

				// flush the STD_OUT for external process to read
				FLUSH_STDOUT();
			}
		}
		printf("Number of SNP passed testing stage : %d\n", passedCount);
	}
	else {
		testThreshold = CHI_SQUARE_THRESHOLD_DF1;
		int jj = 0;
		int model = 0;
		int modelBits[9];
		int countTable[4];
		float chiSquareValue = 0.0;

		// Model selection : Chi-square test
		printf("\nStart model selection...\n");

		for (ii = 0; ii < InteractionCount ; ii++)
		{
			j1 = InteractionSNPpairs[2*ii];
			j2 = InteractionSNPpairs[2*ii + 1];
			CalculateGenoJointDistr(genocase, genoctrl, p, nlongintcase, nlongintctrl, GenoJointDistr, j1, j2, pMarginalDistrSNP_Y);

			for ( model = 0; model < 256; model++ ) {
				// decompose corresponding bits
				for ( jj = 0; jj < 9; jj++) {
					modelBits[jj] = ((0x0000001 << jj) & model) >> jj;
				}

				for ( jj = 0; jj < 4; jj++)
					countTable[jj] = 0;

				for ( jj = 0; jj < 9; jj++ ) {
					if ( !modelBits[jj] ) {
						countTable[0] += GenoJointDistr[jj];
						countTable[2] += GenoJointDistr[9+jj];
					}
					else {
						countTable[1] += GenoJointDistr[jj];
						countTable[3] += GenoJointDistr[9+jj];
					}
				}

				chiSquareValue = CalculateChiSquareOfModel(countTable);

				if ( chiSquareValue >= testThreshold ) {
					selectedJ1.push_back(j1);
					selectedJ2.push_back(j2);
					selectedModel.push_back(model);
					selectedModelChiSquareValue.push_back(chiSquareValue);
				}
			}
		}

		printf("Number of SNP passed testing stage : %d\n", selectedJ1.size());
	}

	time(&ed);

	//printf("maxInteraction : %f \t minInteraction: %f \n", maxInteraction, minInteraction);

	fp = fopen(distrCollectionFilename,"w");
	for (i = 0; i<1001; i++)
	{
		fprintf(fp, "%d\n", DistrCollection[i]);
	}

	fclose(fp);

	if ( testMode == EXACT_GUESS ) {
		fp = fopen(interactionFilename,"w");
		for (i = 0; i < InteractionCount; i++)
		{
			if (InteractionMeasureSNPpair[i] > testThreshold) {
				fprintf(fp,"%7d\t%7d\t%7d\t%f\t%f\t%f\t%f\n", i, InteractionSNPpairs[2*i],InteractionSNPpairs[2*i+1], MarginalAssociation[InteractionSNPpairs[2*i]], MarginalAssociation[InteractionSNPpairs[2*i+1]], InteractionMeasureSNPpair[i], zval[i]);
			}
		}
		fclose(fp);

		//fp = fopen("AssociationRecords.txt","w");
		//for (i = 0; i < AssociationCount; i++)
		//{
		//	//if (AssociationMeasureSNPpair[i] > tau2)
		//	fprintf(fp,"%7d\t%7d\t%7d\t%f\t%f\n", i,AssociationSNPpairs[2*i],AssociationSNPpairs[2*i+1], AssociationMeasureSNPpair[2*i], AssociationMeasureSNPpair[2*i+1]);
		//}
		//fclose(fp);
	}
	else {
		std::list<int>::iterator iterModel;
		std::list<float>::iterator iterChiSquareValue;
		std::list<int>::iterator iterJ1 = selectedJ1.begin();
		std::list<int>::iterator iterJ2 = selectedJ2.begin();
		iterModel = selectedModel.begin();
		iterChiSquareValue = selectedModelChiSquareValue.begin();

		fp = fopen(interactionFilename,"w");
		for (i = 0; i < selectedJ1.size(); i++, iterJ1++, iterJ2++, iterModel++, iterChiSquareValue++) {
			fprintf(fp,"%7d\t%7d\t%7d\t%7d\t%f\n", i, *iterJ1, *iterJ2, *iterModel, *iterChiSquareValue);
		}
		fclose(fp);
	}

	printf("cputime: %d\n", (int)ed - st);
	printf("Progress:%d%%\n", 100);
	// flush the STD_OUT for external process to read
	FLUSH_STDOUT();

	free(Pab);
	free(Pbc);
	free(Pca);

	free(DistrCollection);
	free(InteractionSNPpairs);
	free(InteractionMeasureSNPpair);

	free(AssociationSNPpairs);
	free(AssociationMeasureSNPpair);

	free(MarginalEntropySNP);
	free(MarginalEntropySNP_Y);
	free(MarginalAssociation);
	free(GenoJointDistr);
	free(pMarginalDistrSNP);
	free(pMarginalDistrSNP_Y);

	free(AlleleJointDistr);//PLINK
	free(zval);//PLINK

	// Free CudaHostAlloc memory
	free(genocase);
	free(genoctrl);

	return 0;
}