LBM Code

main.c

#ifndef _MAIN_C_
#define _MAIN_C_

#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>

#include "lbm_model.h"
#include "initialization.h"
#include "initialization_gpu.h"
#include "streaming.h"
#include "collision.h"
#include "boundary.h"
#include "lbm_solver_gpu.h"
#include "visualization.h"
#include "utils.h"

int main(int argc, char *argv[]) {
	float *collide_field=NULL, *stream_field=NULL, *collide_field_d=NULL, *stream_field_d=NULL,
			*swap=NULL, tau, wall_velocity[D_LBM], num_cells, mlups_sum;
	int *flag_field=NULL, *flag_field_d=NULL, xlength, t, timesteps, timesteps_per_plotting,
			gpu_enabled;
	clock_t mlups_time;
	size_t field_size;

	/* process parameters */
	ReadParameters(&xlength, &tau, wall_velocity, &timesteps, &timesteps_per_plotting, argc, argv,
			&gpu_enabled);

	/* check if provided parameters are legitimate */
	ValidateModel(wall_velocity, xlength, tau);

	/* initializing fields */
	num_cells = pow(xlength + 2, D_LBM);
	field_size = Q_LBM*num_cells*sizeof(float);
	collide_field = (float*) malloc(field_size);
	stream_field = (float*) malloc(field_size);
	flag_field = (int*) malloc(num_cells*sizeof(int));
	InitialiseFields(collide_field, stream_field, flag_field, xlength, gpu_enabled);
	InitialiseDeviceFields(collide_field, stream_field, flag_field, xlength, &collide_field_d, &stream_field_d, &flag_field_d);

	for (t = 0; t < timesteps; t++) {
		printf("Time step: #%d\n", t);
		if (gpu_enabled){
			DoIteration(collide_field, stream_field, flag_field, tau, wall_velocity, xlength,
					&collide_field_d, &stream_field_d, &flag_field_d, &mlups_sum);
			/* Copy data from devcice in memory only when we need VTK output */
			if (!(t%timesteps_per_plotting))
				CopyFieldsFromDevice(collide_field, stream_field, xlength,
						&collide_field_d, &stream_field_d);
		} else {
			mlups_time = clock();
			/* Copy pdfs from neighbouring cells into collide field */
			DoStreaming(collide_field, stream_field, flag_field, xlength);
			/* Perform the swapping of collide and stream fields */
			swap = collide_field; collide_field = stream_field; stream_field = swap;
			/* Compute post collision distributions */
			DoCollision(collide_field, flag_field, tau, xlength);
			/* Treat boundaries */
			TreatBoundary(collide_field, flag_field, wall_velocity, xlength);
			mlups_time = clock()-mlups_time;
			/* Print out the MLUPS value */
			mlups_sum += num_cells/(MLUPS_EXPONENT*(float)mlups_time/CLOCKS_PER_SEC);
			if(VERBOSE)
				printf("MLUPS: %f\n", num_cells/(MLUPS_EXPONENT*(float)mlups_time/CLOCKS_PER_SEC));
		}
		/* Print out vtk output if needed */
		if (!(t%timesteps_per_plotting))
			WriteVtkOutput(collide_field, flag_field, "img/lbm-img", t, xlength);
	}

	printf("Average MLUPS: %f\n", mlups_sum/(t+1));

	if (VERBOSE) {
		if(gpu_enabled)
	  	      CopyFieldsFromDevice(collide_field, stream_field, xlength, &collide_field_d, &stream_field_d);
		WriteField(collide_field, "img/collide-field", 0, xlength, gpu_enabled);
		writeFlagField(flag_field, "img/flag-field", xlength, gpu_enabled);
	}

	/* Free memory */
	free(collide_field);
	free(stream_field);
	free(flag_field);

	FreeDeviceFields(&collide_field_d, &stream_field_d, &flag_field_d);

	printf("Simulation complete.\n");
	return 0;
}

#endif

lbm_model.h

#ifndef _LBM_MODEL_H_
#define _LBM_MODEL_H_

#define VERBOSE 0
#define MLUPS_MIN_CELLS 300
#define MLUPS_EXPONENT 1000000
/* simulation parameters */
#define D_LBM 3
#define Q_LBM 19
/* arbitrary cell identifiers */
#define FLUID 0
#define NO_SLIP 1
#define MOVING_WALL 2
/* computation enhancing values */
#define EPS 0.05
#define C_S_POW2_INV 3.0
#define C_S_POW4_INV 9.0
/* reference values, not used in actual computation */
#define SQRT3 1.73205080756887729
static const float C_S = 1.0/SQRT3;
/* predefined simulation parameters */
static const int LATTICE_VELOCITIES[19][3] = {
    {0,-1,-1},{-1,0,-1},{0,0,-1},{1,0,-1},{0,1,-1},{-1,-1,0},{0,-1,0},{1,-1,0},
    {-1,0,0}, {0,0,0},  {1,0,0}, {-1,1,0},{0,1,0}, {1,1,0},  {0,-1,1},{-1,0,1},
    {0,0,1},  {1,0,1},  {0,1,1}
};
static const float LATTICE_WEIGHTS[19] = {
    1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 
    2.0/36.0, 12.0/36.0,2.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0, 1.0/36.0, 
    2.0/36.0, 1.0/36.0, 1.0/36.0
};

/* additional cell definitions */
#define LEFT_BOUNDARY 5100
#define RIGHT_BOUNDARY 5900
#define BOTTOM_BOUNDARY 5010
#define TOP_BOUNDARY 5090
#define BACK_BOUNDARY 5001
#define FRONT_BOUNDARY 5009

#define LEFT_BOTTOM_EDGE 5110
#define LEFT_UPPER_EDGE 5190
#define RIGHT_BOTTOM_EDGE 5910
#define RIGHT_UPPER_EDGE 5990

#define BACK_BOTTOM_EDGE 5011
#define BACK_UPPER_EDGE 5091
#define FRONT_BOTTOM_EDGE 5019
#define FRONT_UPPER_EDGE 5099

#define LEFT_BACK_EDGE 5101
#define LEFT_FRONT_EDGE 5109
#define RIGHT_BACK_EDGE 5901
#define RIGHT_FRONT_EDGE 5909

#define LEFT_BOTTOM_BACK_CORNER 5111
#define LEFT_BOTTOM_FRONT_CORNER 5119
#define LEFT_UPPER_BACK_CORNER 5191
#define LEFT_UPPER_FRONT_CORNER 5199
#define RIGHT_BOTTOM_BACK_CORNER 5911
#define RIGHT_BOTTOM_FRONT_CORNER 5919
#define RIGHT_UPPER_BACK_CORNER 5991
#define RIGHT_UPPER_FRONT_CORNER 5999

/**
 * Validates the configured physical model by calculating characteristic numbers
 */
void ValidateModel(float wall_velocity[D_LBM], int domain_size, float tau);

#endif

#include <math.h>
#include <stdio.h>

#include "lbm_model.h"
#include "utils.h"


void ValidateModel(float wall_velocity[D_LBM], int domain_size, float tau){
	float u_wall_length, mach_number, reynolds_number;
	/* Compute Mach number and Reynolds number */
	u_wall_length = sqrt(wall_velocity[0]*wall_velocity[0]+
			wall_velocity[1]*wall_velocity[1]+
			wall_velocity[2]*wall_velocity[2]);
	mach_number = u_wall_length*SQRT3;
	reynolds_number = u_wall_length*domain_size*C_S_POW2_INV/(tau-0.5);
	printf("Computed Mach number: %f\n", mach_number);
	printf("Computed Reynolds number: %f\n", reynolds_number);

	/* Check if characteristic numbers are correct */
	if (mach_number >= 1)
		ERROR("Computed Mach number is too large.");
	if (reynolds_number > 500)
		ERROR("Computed Reynolds number is too large for simulation to be run on a laptop/pc.");
}

GPU:

initialization_gpu.h

#ifndef _INITIALIZATION_GPU_H_
#define _INITIALIZATION_GPU_H_

/**
 * Copy everything from fields on DRAM in GPU
 */
void InitialiseDeviceFields(float *collide_field, float *stream_field,int *flag_field, int xlength,
		float **collide_field_d, float **stream_field_d,int **flag_field_d);

/**
 * Free GPU memory
 */
void FreeDeviceFields(float **collide_field_d, float **stream_field_d,int **flag_field_d);

/**
 * Copy data from device in global memory.
 */
void CopyFieldsFromDevice(float *collide_field, float *stream_field, int xlength,
		float **collide_field_dd, float **stream_field_dd);

#endif

initialization_gpu.cu

#include "initialization_gpu.h"
#include "lbm_model.h"
#include "utils_gpu.h"


void InitialiseDeviceFields(float *collide_field, float *stream_field,int *flag_field, int xlength,
		float **collide_field_d, float **stream_field_d,int **flag_field_d){
	int num_cells = pow(xlength+2, D_LBM);
	size_t computational_field_size = Q_LBM*num_cells*sizeof(float);
	size_t flag_field_size = num_cells*sizeof(int);

	cudaErrorCheck(cudaMalloc(collide_field_d, computational_field_size));
	cudaErrorCheck(cudaMemcpy(*collide_field_d, collide_field, computational_field_size, cudaMemcpyHostToDevice));
	cudaErrorCheck(cudaMalloc(stream_field_d, computational_field_size));
	cudaErrorCheck(cudaMemcpy(*stream_field_d, stream_field, computational_field_size, cudaMemcpyHostToDevice));
	cudaErrorCheck(cudaMalloc(flag_field_d, flag_field_size));
	cudaErrorCheck(cudaMemcpy(*flag_field_d, flag_field, flag_field_size, cudaMemcpyHostToDevice));
}


void FreeDeviceFields(float **collide_field_d, float **stream_field_d,int **flag_field_d){
	cudaErrorCheck(cudaFree(*collide_field_d));
	cudaErrorCheck(cudaFree(*stream_field_d));
	cudaErrorCheck(cudaFree(*flag_field_d));
}


void CopyFieldsFromDevice(float *collide_field, float *stream_field, int xlength,
		float **collide_field_dd, float **stream_field_dd) {
	int num_cells = pow(xlength+2, D_LBM);
	size_t computational_field_size = Q_LBM*num_cells*sizeof(float);
	cudaErrorCheck(cudaMemcpy(collide_field, *collide_field_dd, computational_field_size, cudaMemcpyDeviceToHost));
	cudaErrorCheck(cudaMemcpy(stream_field, *stream_field_dd, computational_field_size, cudaMemcpyDeviceToHost));
}

lbm_solver_gpu.h

#ifndef _COLLISION_GPU_H_
#define _COLLISION_GPU_H_

/**
 * Performs streaming, collision and boundary treatment.
 */
void DoIteration(float *collide_field, float *stream_field, int *flag_field, float tau,
		float *wall_velocity, int xlength, float **collide_field_d, float **stream_field_d,
		int **flag_field_d, float *mlups_sum);

#endif

lbm_solver_gpu.cu

#include <math.h>
#include <stdio.h>

#include "lbm_solver_gpu.h"
#include "lbm_model.h"
#include "lbm_model_gpu.cuh"
#include "utils.h"
#include "utils_gpu.h"
#include "cell_computation_gpu.cuh"


__constant__ float tau_d, wall_velocity_d[D_LBM];
__constant__ int xlength_d, num_cells_d;
__device__ float *stream_field_d, *collide_field_d;


/**
 * Computes the post-collision distribution functions according to the BGK update rule and
 * stores the results again at the same position.
 */
__device__ void ComputePostCollisionDistributionsGpu(float *current_cell, float *feq){
	current_cell[0]=current_cell[0]-(current_cell[0]-feq[0])/tau_d;
	current_cell[1]=current_cell[1]-(current_cell[1]-feq[1])/tau_d;
	current_cell[2]=current_cell[2]-(current_cell[2]-feq[2])/tau_d;
	current_cell[3]=current_cell[3]-(current_cell[3]-feq[3])/tau_d;
	current_cell[4]=current_cell[4]-(current_cell[4]-feq[4])/tau_d;
	current_cell[5]=current_cell[5]-(current_cell[5]-feq[5])/tau_d;
	current_cell[6]=current_cell[6]-(current_cell[6]-feq[6])/tau_d;
	current_cell[7]=current_cell[7]-(current_cell[7]-feq[7])/tau_d;
	current_cell[8]=current_cell[8]-(current_cell[8]-feq[8])/tau_d;
	current_cell[9]=current_cell[9]-(current_cell[9]-feq[9])/tau_d;
	current_cell[10]=current_cell[10]-(current_cell[10]-feq[10])/tau_d;
	current_cell[11]=current_cell[11]-(current_cell[11]-feq[11])/tau_d;
	current_cell[12]=current_cell[12]-(current_cell[12]-feq[12])/tau_d;
	current_cell[13]=current_cell[13]-(current_cell[13]-feq[13])/tau_d;
	current_cell[14]=current_cell[14]-(current_cell[14]-feq[14])/tau_d;
	current_cell[15]=current_cell[15]-(current_cell[15]-feq[15])/tau_d;
	current_cell[16]=current_cell[16]-(current_cell[16]-feq[16])/tau_d;
	current_cell[17]=current_cell[17]-(current_cell[17]-feq[17])/tau_d;
	current_cell[18]=current_cell[18]-(current_cell[18]-feq[18])/tau_d;
}

/*
 * Performs streaming on cells.
 */
__global__ void DoStreaming(){
	int x = threadIdx.x+blockIdx.x*blockDim.x;
	int y = threadIdx.y+blockIdx.y*blockDim.y;
	int z = threadIdx.z+blockIdx.z*blockDim.z;
	int step=xlength_d+2, idx=x+y*step+z*step*step, nx, ny, nz;

	/* check that indices are within the bounds since there could be more threads than needed */
	if (0<x && x<(step-1) && 0<y && y<(step-1) && 0<z && z<(step-1)){
		nx=x-LATTICE_VELOCITIES_D[0][0];
		ny=y-LATTICE_VELOCITIES_D[0][1];
		nz=z-LATTICE_VELOCITIES_D[0][2];
		stream_field_d[Q_LBM*idx]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)];
		nx=x-LATTICE_VELOCITIES_D[1][0];
		ny=y-LATTICE_VELOCITIES_D[1][1];
		nz=z-LATTICE_VELOCITIES_D[1][2];
		stream_field_d[Q_LBM*idx+1]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+1];
		nx=x-LATTICE_VELOCITIES_D[2][0];
		ny=y-LATTICE_VELOCITIES_D[2][1];
		nz=z-LATTICE_VELOCITIES_D[2][2];
		stream_field_d[Q_LBM*idx+2]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+2];
		nx=x-LATTICE_VELOCITIES_D[3][0];
		ny=y-LATTICE_VELOCITIES_D[3][1];
		nz=z-LATTICE_VELOCITIES_D[3][2];
		stream_field_d[Q_LBM*idx+3]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+3];
		nx=x-LATTICE_VELOCITIES_D[4][0];
		ny=y-LATTICE_VELOCITIES_D[4][1];
		nz=z-LATTICE_VELOCITIES_D[4][2];
		stream_field_d[Q_LBM*idx+4]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+4];
		nx=x-LATTICE_VELOCITIES_D[5][0];
		ny=y-LATTICE_VELOCITIES_D[5][1];
		nz=z-LATTICE_VELOCITIES_D[5][2];
		stream_field_d[Q_LBM*idx+5]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+5];
		nx=x-LATTICE_VELOCITIES_D[6][0];
		ny=y-LATTICE_VELOCITIES_D[6][1];
		nz=z-LATTICE_VELOCITIES_D[6][2];
		stream_field_d[Q_LBM*idx+6]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+6];
		nx=x-LATTICE_VELOCITIES_D[7][0];
		ny=y-LATTICE_VELOCITIES_D[7][1];
		nz=z-LATTICE_VELOCITIES_D[7][2];
		stream_field_d[Q_LBM*idx+7]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+7];
		nx=x-LATTICE_VELOCITIES_D[8][0];
		ny=y-LATTICE_VELOCITIES_D[8][1];
		nz=z-LATTICE_VELOCITIES_D[8][2];
		stream_field_d[Q_LBM*idx+8]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+8];
		nx=x-LATTICE_VELOCITIES_D[9][0];
		ny=y-LATTICE_VELOCITIES_D[9][1];
		nz=z-LATTICE_VELOCITIES_D[9][2];
		stream_field_d[Q_LBM*idx+9]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+9];
		nx=x-LATTICE_VELOCITIES_D[10][0];
		ny=y-LATTICE_VELOCITIES_D[10][1];
		nz=z-LATTICE_VELOCITIES_D[10][2];
		stream_field_d[Q_LBM*idx+10]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+10];
		nx=x-LATTICE_VELOCITIES_D[11][0];
		ny=y-LATTICE_VELOCITIES_D[11][1];
		nz=z-LATTICE_VELOCITIES_D[11][2];
		stream_field_d[Q_LBM*idx+11]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+11];
		nx=x-LATTICE_VELOCITIES_D[12][0];
		ny=y-LATTICE_VELOCITIES_D[12][1];
		nz=z-LATTICE_VELOCITIES_D[12][2];
		stream_field_d[Q_LBM*idx+12]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+12];
		nx=x-LATTICE_VELOCITIES_D[13][0];
		ny=y-LATTICE_VELOCITIES_D[13][1];
		nz=z-LATTICE_VELOCITIES_D[13][2];
		stream_field_d[Q_LBM*idx+13]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+13];
		nx=x-LATTICE_VELOCITIES_D[14][0];
		ny=y-LATTICE_VELOCITIES_D[14][1];
		nz=z-LATTICE_VELOCITIES_D[14][2];
		stream_field_d[Q_LBM*idx+14]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+14];
		nx=x-LATTICE_VELOCITIES_D[15][0];
		ny=y-LATTICE_VELOCITIES_D[15][1];
		nz=z-LATTICE_VELOCITIES_D[15][2];
		stream_field_d[Q_LBM*idx+15]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+15];
		nx=x-LATTICE_VELOCITIES_D[16][0];
		ny=y-LATTICE_VELOCITIES_D[16][1];
		nz=z-LATTICE_VELOCITIES_D[16][2];
		stream_field_d[Q_LBM*idx+16]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+16];
		nx=x-LATTICE_VELOCITIES_D[17][0];
		ny=y-LATTICE_VELOCITIES_D[17][1];
		nz=z-LATTICE_VELOCITIES_D[17][2];
		stream_field_d[Q_LBM*idx+17]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+17];
		nx=x-LATTICE_VELOCITIES_D[18][0];
		ny=y-LATTICE_VELOCITIES_D[18][1];
		nz=z-LATTICE_VELOCITIES_D[18][2];
		stream_field_d[Q_LBM*idx+18]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+18];
	}
}


/*
 * Performs collision computation.
 */
__global__ void DoCollision(){
	int x = threadIdx.x+blockIdx.x*blockDim.x;
	int y = threadIdx.y+blockIdx.y*blockDim.y;
	int z = threadIdx.z+blockIdx.z*blockDim.z;
	int step=xlength_d+2, idx=x+y*step+z*step*step;
	float density, velocity[D_LBM], feq[Q_LBM], *current_cell_s;

	/* check that indices are within the bounds since there could be more threads than needed */
	if (0<x && x<(step-1) && 0<y && y<(step-1) && 0<z && z<(step-1)){
		current_cell_s=&collide_field_d[Q_LBM*idx];
		ComputeDensityGpu(current_cell_s,&density);
		ComputeVelocityGpu(current_cell_s,&density,velocity);
		ComputeFeqGpu(&density,velocity,feq);
		ComputePostCollisionDistributionsGpu(current_cell_s,feq);
	}
}


/*
 * Computes proper boundary values.
 */
__global__ void TreatBoundary(int *flag_field_d){
	int x = threadIdx.x+blockIdx.x*blockDim.x;
	int y = threadIdx.y+blockIdx.y*blockDim.y;
	int z = threadIdx.z+blockIdx.z*blockDim.z;

	int step=xlength_d+2, idx=x+y*step+z*step*step, nx, ny, nz, i, boundary_side=0, boundary_idx=100500;
	float density, dot_prod;

	if(idx<num_cells_d) {
		if(flag_field_d[idx] == BOTTOM_BOUNDARY) {
			boundary_side = BOTTOM_BOUNDARY;
			boundary_idx = BOTTOM_BOUNDARY_IDX;
		} else if (flag_field_d[idx] == LEFT_BOUNDARY) {
			boundary_side = LEFT_BOUNDARY;
			boundary_idx = LEFT_BOUNDARY_IDX;
		} else if (flag_field_d[idx] == RIGHT_BOUNDARY) {
			boundary_side = RIGHT_BOUNDARY;
			boundary_idx = RIGHT_BOUNDARY_IDX;
		} else if (flag_field_d[idx] == BACK_BOUNDARY) {
			boundary_side = BACK_BOUNDARY;
			boundary_idx = BACK_BOUNDARY_IDX;
		} else if (flag_field_d[idx] == FRONT_BOUNDARY) {
			boundary_side = FRONT_BOUNDARY;
			boundary_idx = FRONT_BOUNDARY_IDX;
		} else if (flag_field_d[idx] == LEFT_BOTTOM_EDGE) {
			boundary_side = LEFT_BOTTOM_EDGE;
			boundary_idx = 13;
		} else if (flag_field_d[idx] == RIGHT_BOTTOM_EDGE) {
			boundary_side = RIGHT_BOTTOM_EDGE;
			boundary_idx = 11;
		} else if (flag_field_d[idx] == BACK_BOTTOM_EDGE) {
			boundary_side = BACK_BOTTOM_EDGE;
			boundary_idx = 18;
		} else if (flag_field_d[idx] == FRONT_BOTTOM_EDGE) {
			boundary_side = FRONT_BOTTOM_EDGE;
			boundary_idx = 4;
		} else if (flag_field_d[idx] == LEFT_BACK_EDGE) {
			boundary_side = LEFT_BACK_EDGE;
			boundary_idx = 17;
		} else if (flag_field_d[idx] == LEFT_FRONT_EDGE) {
			boundary_side = LEFT_FRONT_EDGE;
			boundary_idx = 3;
		} else if (flag_field_d[idx] == RIGHT_BACK_EDGE) {
			boundary_side = RIGHT_BACK_EDGE;
			boundary_idx = 15;
		} else if (flag_field_d[idx] == RIGHT_FRONT_EDGE) {
			boundary_side = RIGHT_FRONT_EDGE;
			boundary_idx = 1;
		} else if (flag_field_d[idx] == LEFT_UPPER_EDGE) {
			boundary_side = LEFT_UPPER_EDGE;
			boundary_idx = 7;
		} else if (flag_field_d[idx] == RIGHT_UPPER_EDGE) {
			boundary_side = RIGHT_UPPER_EDGE;
			boundary_idx = 5;
		} else if (flag_field_d[idx] == BACK_UPPER_EDGE) {
			boundary_side = BACK_UPPER_EDGE;
			boundary_idx = 14;
		} else if (flag_field_d[idx] == FRONT_UPPER_EDGE) {
			boundary_side = FRONT_UPPER_EDGE;
			boundary_idx = 0;
		} else if (flag_field_d[idx] == TOP_BOUNDARY) {
			boundary_side = TOP_BOUNDARY;
			boundary_idx = TOP_BOUNDARY_IDX;
		}

		if( boundary_side==LEFT_BOUNDARY || boundary_side==RIGHT_BOUNDARY ||
				boundary_side==BOTTOM_BOUNDARY ||
				boundary_side==BACK_BOUNDARY || boundary_side==FRONT_BOUNDARY) {
			i = TREAT_BOUNDARY_INDECES[boundary_idx][0];
			nx=x+LATTICE_VELOCITIES_D[i][0];
			ny=y+LATTICE_VELOCITIES_D[i][1];
			nz=z+LATTICE_VELOCITIES_D[i][2];
			collide_field_d[Q_LBM*(x+y*step+z*step*step)+i]=
					collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)];
			i = TREAT_BOUNDARY_INDECES[boundary_idx][1];
			nx=x+LATTICE_VELOCITIES_D[i][0];
			ny=y+LATTICE_VELOCITIES_D[i][1];
			nz=z+LATTICE_VELOCITIES_D[i][2];
			collide_field_d[Q_LBM*(x+y*step+z*step*step)+i]=
					collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)];
			i = TREAT_BOUNDARY_INDECES[boundary_idx][2];
			nx=x+LATTICE_VELOCITIES_D[i][0];
			ny=y+LATTICE_VELOCITIES_D[i][1];
			nz=z+LATTICE_VELOCITIES_D[i][2];
			collide_field_d[Q_LBM*(x+y*step+z*step*step)+i]=
					collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)];
			i = TREAT_BOUNDARY_INDECES[boundary_idx][3];
			nx=x+LATTICE_VELOCITIES_D[i][0];
			ny=y+LATTICE_VELOCITIES_D[i][1];
			nz=z+LATTICE_VELOCITIES_D[i][2];
			collide_field_d[Q_LBM*(x+y*step+z*step*step)+i]=
					collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)];
			i = TREAT_BOUNDARY_INDECES[boundary_idx][4];
			nx=x+LATTICE_VELOCITIES_D[i][0];
			ny=y+LATTICE_VELOCITIES_D[i][1];
			nz=z+LATTICE_VELOCITIES_D[i][2];
			collide_field_d[Q_LBM*(x+y*step+z*step*step)+i]=
					collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)];
		} else if (boundary_side == LEFT_BOTTOM_EDGE || boundary_side == RIGHT_BOTTOM_EDGE ||
				boundary_side == BACK_BOTTOM_EDGE || boundary_side == FRONT_BOTTOM_EDGE ||
				boundary_side == LEFT_BACK_EDGE || boundary_side == LEFT_FRONT_EDGE ||
				boundary_side == RIGHT_BACK_EDGE || boundary_side == RIGHT_FRONT_EDGE) {
			nx=x+LATTICE_VELOCITIES_D[boundary_idx][0];
			ny=y+LATTICE_VELOCITIES_D[boundary_idx][1];
			nz=z+LATTICE_VELOCITIES_D[boundary_idx][2];
			collide_field_d[Q_LBM*(x+y*step+z*step*step)+boundary_idx]=
					collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(boundary_idx)];
		} else if(boundary_side == LEFT_UPPER_EDGE || boundary_side == RIGHT_UPPER_EDGE ||
				boundary_side == BACK_UPPER_EDGE || boundary_side == FRONT_UPPER_EDGE) {
			i = boundary_idx;
			nx=x+LATTICE_VELOCITIES_D[i][0];
			ny=y+LATTICE_VELOCITIES_D[i][1];
			nz=z+LATTICE_VELOCITIES_D[i][2];
			/* Compute density in the neighbour cell */
			ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density);
			/* Compute dot product */
			dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+
					LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+
					LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2];
			/* Assign the boudary cell value */
			collide_field_d[Q_LBM*(idx)+i]=
					collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+
					2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod;
		} else if(boundary_side == TOP_BOUNDARY) {
			i = TREAT_BOUNDARY_INDECES[boundary_idx][0];
			nx=x+LATTICE_VELOCITIES_D[i][0];
			ny=y+LATTICE_VELOCITIES_D[i][1];
			nz=z+LATTICE_VELOCITIES_D[i][2];
			ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density);
			dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+
					LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+
					LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2];
			collide_field_d[Q_LBM*(idx)+i]=
					collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+
					2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod;
			i = TREAT_BOUNDARY_INDECES[boundary_idx][1];
			nx=x+LATTICE_VELOCITIES_D[i][0];
			ny=y+LATTICE_VELOCITIES_D[i][1];
			nz=z+LATTICE_VELOCITIES_D[i][2];
			ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density);
			dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+
					LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+
					LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2];
			collide_field_d[Q_LBM*(idx)+i]=
					collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+
					2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod;
			i = TREAT_BOUNDARY_INDECES[boundary_idx][2];
			nx=x+LATTICE_VELOCITIES_D[i][0];
			ny=y+LATTICE_VELOCITIES_D[i][1];
			nz=z+LATTICE_VELOCITIES_D[i][2];
			ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density);
			dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+
					LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+
					LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2];
			collide_field_d[Q_LBM*(idx)+i]=
					collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+
					2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod;
			i = TREAT_BOUNDARY_INDECES[boundary_idx][3];
			nx=x+LATTICE_VELOCITIES_D[i][0];
			ny=y+LATTICE_VELOCITIES_D[i][1];
			nz=z+LATTICE_VELOCITIES_D[i][2];
			ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density);
			dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+
					LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+
					LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2];
			collide_field_d[Q_LBM*(idx)+i]=
					collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+
					2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod;
			i = TREAT_BOUNDARY_INDECES[boundary_idx][4];
			nx=x+LATTICE_VELOCITIES_D[i][0];
			ny=y+LATTICE_VELOCITIES_D[i][1];
			nz=z+LATTICE_VELOCITIES_D[i][2];
			ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density);
			dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+
					LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+
					LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2];
			collide_field_d[Q_LBM*(idx)+i]=
					collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+
					2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod;
		}
	}
}


/*
 * Performs pointer swap on GPU.
 */
__global__ void DoSwap(){
	float *swap=collide_field_d; collide_field_d=stream_field_d; stream_field_d=swap;
}


void DoIteration(float *collide_field, float *stream_field, int *flag_field, float tau,
		float *wall_velocity, int xlength, float **collide_field_dd, float **stream_field_dd,
		int **flag_field_d, float *mlups_sum){
	int num_cells = pow(xlength+2, D_LBM);
	clock_t mlups_time;

	/* initialize constant data */
	cudaErrorCheck(cudaMemcpyToSymbol(xlength_d, &xlength, sizeof(int), 0, cudaMemcpyHostToDevice));
	cudaErrorCheck(cudaMemcpyToSymbol(num_cells_d, &num_cells, sizeof(int), 0, cudaMemcpyHostToDevice));
	cudaErrorCheck(cudaMemcpyToSymbol(tau_d, &tau, sizeof(float), 0, cudaMemcpyHostToDevice));
	cudaErrorCheck(cudaMemcpyToSymbol(wall_velocity_d, wall_velocity, D_LBM*sizeof(float), 0, cudaMemcpyHostToDevice));

	cudaErrorCheck(cudaMemcpyToSymbol(collide_field_d, collide_field_dd, sizeof(*collide_field_dd), 0, cudaMemcpyHostToDevice));
	cudaErrorCheck(cudaMemcpyToSymbol(stream_field_d, stream_field_dd, sizeof(*stream_field_dd), 0, cudaMemcpyHostToDevice));

	/* define grid structure */
	dim3 block(BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE);
	dim3 grid((xlength+2+block.x-1)/block.x, (xlength+2+block.y-1)/block.y, (xlength+2+block.z-1)/block.z);

	mlups_time = clock();

	/* perform streaming */
	DoStreaming<<<grid,block>>>();
	cudaErrorCheck(cudaThreadSynchronize());
	cudaErrorCheck(cudaPeekAtLastError());

	/* Perform the swapping of collide and stream fields */
	DoSwap<<<1,1>>>();
	cudaErrorCheck(cudaThreadSynchronize());
	cudaErrorCheck(cudaPeekAtLastError());

	/* perform collision */
	DoCollision<<<grid,block>>>();
	cudaErrorCheck(cudaThreadSynchronize());
	cudaErrorCheck(cudaPeekAtLastError());

	/* perform boundary treatment */
	TreatBoundary<<<grid,block>>>(*flag_field_d);
	cudaErrorCheck(cudaPeekAtLastError());

	mlups_time = clock()-mlups_time;

	*mlups_sum += num_cells/(MLUPS_EXPONENT*(float)mlups_time/CLOCKS_PER_SEC);
	if(VERBOSE)
		printf("MLUPS: %f\n", num_cells/(MLUPS_EXPONENT*(float)mlups_time/CLOCKS_PER_SEC));

	/* copy data back to host */
	cudaErrorCheck(cudaMemcpyFromSymbol(collide_field_dd, collide_field_d, sizeof(*collide_field_dd), 0, cudaMemcpyDeviceToHost));
	cudaErrorCheck(cudaMemcpyFromSymbol(stream_field_dd, stream_field_d, sizeof(*stream_field_dd), 0, cudaMemcpyDeviceToHost));
}

cell_computation_gpu.h

#ifndef _CELL_COMPUTATION_GPU_CUH_
#define _CELL_COMPUTATION_GPU_CUH_


/**
 * Computes the density from the particle distribution functions stored at currentCell.
 * currentCell thus denotes the address of the first particle distribution function of the
 * respective cell. The result is stored in density.
 */
__device__ void ComputeDensityGpu(float *current_cell, float *density){
    *density=0;
    *density+=current_cell[0];
    *density+=current_cell[1];
    *density+=current_cell[2];
    *density+=current_cell[3];
    *density+=current_cell[4];
    *density+=current_cell[5];
    *density+=current_cell[6];
    *density+=current_cell[7];
    *density+=current_cell[8];
    *density+=current_cell[9];
    *density+=current_cell[10];
    *density+=current_cell[11];
    *density+=current_cell[12];
    *density+=current_cell[13];
    *density+=current_cell[14];
    *density+=current_cell[15];
    *density+=current_cell[16];
    *density+=current_cell[17];
    *density+=current_cell[18];
}


/**
 * Computes the velocity within currentCell and stores the result in velocity
 */
__device__ void ComputeVelocityGpu(float *current_cell, float *density, float *velocity){
    velocity[0]=0;
	velocity[0]+=current_cell[0]*LATTICE_VELOCITIES_D[0][0];
	velocity[0]+=current_cell[1]*LATTICE_VELOCITIES_D[1][0];
	velocity[0]+=current_cell[2]*LATTICE_VELOCITIES_D[2][0];
	velocity[0]+=current_cell[3]*LATTICE_VELOCITIES_D[3][0];
	velocity[0]+=current_cell[4]*LATTICE_VELOCITIES_D[4][0];
	velocity[0]+=current_cell[5]*LATTICE_VELOCITIES_D[5][0];
	velocity[0]+=current_cell[6]*LATTICE_VELOCITIES_D[6][0];
	velocity[0]+=current_cell[7]*LATTICE_VELOCITIES_D[7][0];
	velocity[0]+=current_cell[8]*LATTICE_VELOCITIES_D[8][0];
	velocity[0]+=current_cell[9]*LATTICE_VELOCITIES_D[9][0];
	velocity[0]+=current_cell[10]*LATTICE_VELOCITIES_D[10][0];
	velocity[0]+=current_cell[11]*LATTICE_VELOCITIES_D[11][0];
	velocity[0]+=current_cell[12]*LATTICE_VELOCITIES_D[12][0];
	velocity[0]+=current_cell[13]*LATTICE_VELOCITIES_D[13][0];
	velocity[0]+=current_cell[14]*LATTICE_VELOCITIES_D[14][0];
	velocity[0]+=current_cell[15]*LATTICE_VELOCITIES_D[15][0];
	velocity[0]+=current_cell[16]*LATTICE_VELOCITIES_D[16][0];
	velocity[0]+=current_cell[17]*LATTICE_VELOCITIES_D[17][0];
	velocity[0]+=current_cell[18]*LATTICE_VELOCITIES_D[18][0];
	velocity[0]/=*density;

	velocity[1]=0;
	velocity[1]+=current_cell[0]*LATTICE_VELOCITIES_D[0][1];
	velocity[1]+=current_cell[1]*LATTICE_VELOCITIES_D[1][1];
	velocity[1]+=current_cell[2]*LATTICE_VELOCITIES_D[2][1];
	velocity[1]+=current_cell[3]*LATTICE_VELOCITIES_D[3][1];
	velocity[1]+=current_cell[4]*LATTICE_VELOCITIES_D[4][1];
	velocity[1]+=current_cell[5]*LATTICE_VELOCITIES_D[5][1];
	velocity[1]+=current_cell[6]*LATTICE_VELOCITIES_D[6][1];
	velocity[1]+=current_cell[7]*LATTICE_VELOCITIES_D[7][1];
	velocity[1]+=current_cell[8]*LATTICE_VELOCITIES_D[8][1];
	velocity[1]+=current_cell[9]*LATTICE_VELOCITIES_D[9][1];
	velocity[1]+=current_cell[10]*LATTICE_VELOCITIES_D[10][1];
	velocity[1]+=current_cell[11]*LATTICE_VELOCITIES_D[11][1];
	velocity[1]+=current_cell[12]*LATTICE_VELOCITIES_D[12][1];
	velocity[1]+=current_cell[13]*LATTICE_VELOCITIES_D[13][1];
	velocity[1]+=current_cell[14]*LATTICE_VELOCITIES_D[14][1];
	velocity[1]+=current_cell[15]*LATTICE_VELOCITIES_D[15][1];
	velocity[1]+=current_cell[16]*LATTICE_VELOCITIES_D[16][1];
	velocity[1]+=current_cell[17]*LATTICE_VELOCITIES_D[17][1];
	velocity[1]+=current_cell[18]*LATTICE_VELOCITIES_D[18][1];
	velocity[1]/=*density;

	velocity[2]=0;
	velocity[2]+=current_cell[0]*LATTICE_VELOCITIES_D[0][2];
	velocity[2]+=current_cell[1]*LATTICE_VELOCITIES_D[1][2];
	velocity[2]+=current_cell[2]*LATTICE_VELOCITIES_D[2][2];
	velocity[2]+=current_cell[3]*LATTICE_VELOCITIES_D[3][2];
	velocity[2]+=current_cell[4]*LATTICE_VELOCITIES_D[4][2];
	velocity[2]+=current_cell[5]*LATTICE_VELOCITIES_D[5][2];
	velocity[2]+=current_cell[6]*LATTICE_VELOCITIES_D[6][2];
	velocity[2]+=current_cell[7]*LATTICE_VELOCITIES_D[7][2];
	velocity[2]+=current_cell[8]*LATTICE_VELOCITIES_D[8][2];
	velocity[2]+=current_cell[9]*LATTICE_VELOCITIES_D[9][2];
	velocity[2]+=current_cell[10]*LATTICE_VELOCITIES_D[10][2];
	velocity[2]+=current_cell[11]*LATTICE_VELOCITIES_D[11][2];
	velocity[2]+=current_cell[12]*LATTICE_VELOCITIES_D[12][2];
	velocity[2]+=current_cell[13]*LATTICE_VELOCITIES_D[13][2];
	velocity[2]+=current_cell[14]*LATTICE_VELOCITIES_D[14][2];
	velocity[2]+=current_cell[15]*LATTICE_VELOCITIES_D[15][2];
	velocity[2]+=current_cell[16]*LATTICE_VELOCITIES_D[16][2];
	velocity[2]+=current_cell[17]*LATTICE_VELOCITIES_D[17][2];
	velocity[2]+=current_cell[18]*LATTICE_VELOCITIES_D[18][2];
    velocity[2]/=*density;
}


/**
 * Computes the equilibrium distributions for all particle distribution functions of one
 * cell from density and velocity and stores the results in feq.
 */
__device__ void ComputeFeqGpu(float *density, float *velocity, float *feq){
    float s1, s2, s3=velocity[0]*velocity[0]+velocity[1]*velocity[1]+velocity[2]*velocity[2];
	s1 = LATTICE_VELOCITIES_D[0][0]*velocity[0]+LATTICE_VELOCITIES_D[0][1]*velocity[1]+LATTICE_VELOCITIES_D[0][2]*velocity[2];
	s2 = s1*s1;
	feq[0]=LATTICE_WEIGHTS_D[0]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[1][0]*velocity[0]+LATTICE_VELOCITIES_D[1][1]*velocity[1]+LATTICE_VELOCITIES_D[1][2]*velocity[2];
	s2 = s1*s1;
	feq[1]=LATTICE_WEIGHTS_D[1]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[2][0]*velocity[0]+LATTICE_VELOCITIES_D[2][1]*velocity[1]+LATTICE_VELOCITIES_D[2][2]*velocity[2];
	s2 = s1*s1;
	feq[2]=LATTICE_WEIGHTS_D[2]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[3][0]*velocity[0]+LATTICE_VELOCITIES_D[3][1]*velocity[1]+LATTICE_VELOCITIES_D[3][2]*velocity[2];
	s2 = s1*s1;
	feq[3]=LATTICE_WEIGHTS_D[3]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[4][0]*velocity[0]+LATTICE_VELOCITIES_D[4][1]*velocity[1]+LATTICE_VELOCITIES_D[4][2]*velocity[2];
	s2 = s1*s1;
	feq[4]=LATTICE_WEIGHTS_D[4]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[5][0]*velocity[0]+LATTICE_VELOCITIES_D[5][1]*velocity[1]+LATTICE_VELOCITIES_D[5][2]*velocity[2];
	s2 = s1*s1;
	feq[5]=LATTICE_WEIGHTS_D[5]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[6][0]*velocity[0]+LATTICE_VELOCITIES_D[6][1]*velocity[1]+LATTICE_VELOCITIES_D[6][2]*velocity[2];
	s2 = s1*s1;
	feq[6]=LATTICE_WEIGHTS_D[6]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[7][0]*velocity[0]+LATTICE_VELOCITIES_D[7][1]*velocity[1]+LATTICE_VELOCITIES_D[7][2]*velocity[2];
	s2 = s1*s1;
	feq[7]=LATTICE_WEIGHTS_D[7]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[8][0]*velocity[0]+LATTICE_VELOCITIES_D[8][1]*velocity[1]+LATTICE_VELOCITIES_D[8][2]*velocity[2];
	s2 = s1*s1;
	feq[8]=LATTICE_WEIGHTS_D[8]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[9][0]*velocity[0]+LATTICE_VELOCITIES_D[9][1]*velocity[1]+LATTICE_VELOCITIES_D[9][2]*velocity[2];
	s2 = s1*s1;
	feq[9]=LATTICE_WEIGHTS_D[9]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[10][0]*velocity[0]+LATTICE_VELOCITIES_D[10][1]*velocity[1]+LATTICE_VELOCITIES_D[10][2]*velocity[2];
	s2 = s1*s1;
	feq[10]=LATTICE_WEIGHTS_D[10]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[11][0]*velocity[0]+LATTICE_VELOCITIES_D[11][1]*velocity[1]+LATTICE_VELOCITIES_D[11][2]*velocity[2];
	s2 = s1*s1;
	feq[11]=LATTICE_WEIGHTS_D[11]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[12][0]*velocity[0]+LATTICE_VELOCITIES_D[12][1]*velocity[1]+LATTICE_VELOCITIES_D[12][2]*velocity[2];
	s2 = s1*s1;
	feq[12]=LATTICE_WEIGHTS_D[12]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[13][0]*velocity[0]+LATTICE_VELOCITIES_D[13][1]*velocity[1]+LATTICE_VELOCITIES_D[13][2]*velocity[2];
	s2 = s1*s1;
	feq[13]=LATTICE_WEIGHTS_D[13]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[14][0]*velocity[0]+LATTICE_VELOCITIES_D[14][1]*velocity[1]+LATTICE_VELOCITIES_D[14][2]*velocity[2];
	s2 = s1*s1;
	feq[14]=LATTICE_WEIGHTS_D[14]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[15][0]*velocity[0]+LATTICE_VELOCITIES_D[15][1]*velocity[1]+LATTICE_VELOCITIES_D[15][2]*velocity[2];
	s2 = s1*s1;
	feq[15]=LATTICE_WEIGHTS_D[15]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[16][0]*velocity[0]+LATTICE_VELOCITIES_D[16][1]*velocity[1]+LATTICE_VELOCITIES_D[16][2]*velocity[2];
	s2 = s1*s1;
	feq[16]=LATTICE_WEIGHTS_D[16]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[17][0]*velocity[0]+LATTICE_VELOCITIES_D[17][1]*velocity[1]+LATTICE_VELOCITIES_D[17][2]*velocity[2];
	s2 = s1*s1;
	feq[17]=LATTICE_WEIGHTS_D[17]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[18][0]*velocity[0]+LATTICE_VELOCITIES_D[18][1]*velocity[1]+LATTICE_VELOCITIES_D[18][2]*velocity[2];
	s2 = s1*s1;
	feq[18]=LATTICE_WEIGHTS_D[18]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
}


/**
 * Finds an inverse probability distribution for the provided lattice index.
 */
__device__ int InvGpu(int i){
    return (Q_LBM-1)-i;
}

#endif

cell_computation_gpu.cu

#ifndef _CELL_COMPUTATION_GPU_CUH_
#define _CELL_COMPUTATION_GPU_CUH_


/**
 * Computes the density from the particle distribution functions stored at currentCell.
 * currentCell thus denotes the address of the first particle distribution function of the
 * respective cell. The result is stored in density.
 */
__device__ void ComputeDensityGpu(float *current_cell, float *density){
    *density=0;
    *density+=current_cell[0];
    *density+=current_cell[1];
    *density+=current_cell[2];
    *density+=current_cell[3];
    *density+=current_cell[4];
    *density+=current_cell[5];
    *density+=current_cell[6];
    *density+=current_cell[7];
    *density+=current_cell[8];
    *density+=current_cell[9];
    *density+=current_cell[10];
    *density+=current_cell[11];
    *density+=current_cell[12];
    *density+=current_cell[13];
    *density+=current_cell[14];
    *density+=current_cell[15];
    *density+=current_cell[16];
    *density+=current_cell[17];
    *density+=current_cell[18];
}


/**
 * Computes the velocity within currentCell and stores the result in velocity
 */
__device__ void ComputeVelocityGpu(float *current_cell, float *density, float *velocity){
    velocity[0]=0;
	velocity[0]+=current_cell[0]*LATTICE_VELOCITIES_D[0][0];
	velocity[0]+=current_cell[1]*LATTICE_VELOCITIES_D[1][0];
	velocity[0]+=current_cell[2]*LATTICE_VELOCITIES_D[2][0];
	velocity[0]+=current_cell[3]*LATTICE_VELOCITIES_D[3][0];
	velocity[0]+=current_cell[4]*LATTICE_VELOCITIES_D[4][0];
	velocity[0]+=current_cell[5]*LATTICE_VELOCITIES_D[5][0];
	velocity[0]+=current_cell[6]*LATTICE_VELOCITIES_D[6][0];
	velocity[0]+=current_cell[7]*LATTICE_VELOCITIES_D[7][0];
	velocity[0]+=current_cell[8]*LATTICE_VELOCITIES_D[8][0];
	velocity[0]+=current_cell[9]*LATTICE_VELOCITIES_D[9][0];
	velocity[0]+=current_cell[10]*LATTICE_VELOCITIES_D[10][0];
	velocity[0]+=current_cell[11]*LATTICE_VELOCITIES_D[11][0];
	velocity[0]+=current_cell[12]*LATTICE_VELOCITIES_D[12][0];
	velocity[0]+=current_cell[13]*LATTICE_VELOCITIES_D[13][0];
	velocity[0]+=current_cell[14]*LATTICE_VELOCITIES_D[14][0];
	velocity[0]+=current_cell[15]*LATTICE_VELOCITIES_D[15][0];
	velocity[0]+=current_cell[16]*LATTICE_VELOCITIES_D[16][0];
	velocity[0]+=current_cell[17]*LATTICE_VELOCITIES_D[17][0];
	velocity[0]+=current_cell[18]*LATTICE_VELOCITIES_D[18][0];
	velocity[0]/=*density;

	velocity[1]=0;
	velocity[1]+=current_cell[0]*LATTICE_VELOCITIES_D[0][1];
	velocity[1]+=current_cell[1]*LATTICE_VELOCITIES_D[1][1];
	velocity[1]+=current_cell[2]*LATTICE_VELOCITIES_D[2][1];
	velocity[1]+=current_cell[3]*LATTICE_VELOCITIES_D[3][1];
	velocity[1]+=current_cell[4]*LATTICE_VELOCITIES_D[4][1];
	velocity[1]+=current_cell[5]*LATTICE_VELOCITIES_D[5][1];
	velocity[1]+=current_cell[6]*LATTICE_VELOCITIES_D[6][1];
	velocity[1]+=current_cell[7]*LATTICE_VELOCITIES_D[7][1];
	velocity[1]+=current_cell[8]*LATTICE_VELOCITIES_D[8][1];
	velocity[1]+=current_cell[9]*LATTICE_VELOCITIES_D[9][1];
	velocity[1]+=current_cell[10]*LATTICE_VELOCITIES_D[10][1];
	velocity[1]+=current_cell[11]*LATTICE_VELOCITIES_D[11][1];
	velocity[1]+=current_cell[12]*LATTICE_VELOCITIES_D[12][1];
	velocity[1]+=current_cell[13]*LATTICE_VELOCITIES_D[13][1];
	velocity[1]+=current_cell[14]*LATTICE_VELOCITIES_D[14][1];
	velocity[1]+=current_cell[15]*LATTICE_VELOCITIES_D[15][1];
	velocity[1]+=current_cell[16]*LATTICE_VELOCITIES_D[16][1];
	velocity[1]+=current_cell[17]*LATTICE_VELOCITIES_D[17][1];
	velocity[1]+=current_cell[18]*LATTICE_VELOCITIES_D[18][1];
	velocity[1]/=*density;

	velocity[2]=0;
	velocity[2]+=current_cell[0]*LATTICE_VELOCITIES_D[0][2];
	velocity[2]+=current_cell[1]*LATTICE_VELOCITIES_D[1][2];
	velocity[2]+=current_cell[2]*LATTICE_VELOCITIES_D[2][2];
	velocity[2]+=current_cell[3]*LATTICE_VELOCITIES_D[3][2];
	velocity[2]+=current_cell[4]*LATTICE_VELOCITIES_D[4][2];
	velocity[2]+=current_cell[5]*LATTICE_VELOCITIES_D[5][2];
	velocity[2]+=current_cell[6]*LATTICE_VELOCITIES_D[6][2];
	velocity[2]+=current_cell[7]*LATTICE_VELOCITIES_D[7][2];
	velocity[2]+=current_cell[8]*LATTICE_VELOCITIES_D[8][2];
	velocity[2]+=current_cell[9]*LATTICE_VELOCITIES_D[9][2];
	velocity[2]+=current_cell[10]*LATTICE_VELOCITIES_D[10][2];
	velocity[2]+=current_cell[11]*LATTICE_VELOCITIES_D[11][2];
	velocity[2]+=current_cell[12]*LATTICE_VELOCITIES_D[12][2];
	velocity[2]+=current_cell[13]*LATTICE_VELOCITIES_D[13][2];
	velocity[2]+=current_cell[14]*LATTICE_VELOCITIES_D[14][2];
	velocity[2]+=current_cell[15]*LATTICE_VELOCITIES_D[15][2];
	velocity[2]+=current_cell[16]*LATTICE_VELOCITIES_D[16][2];
	velocity[2]+=current_cell[17]*LATTICE_VELOCITIES_D[17][2];
	velocity[2]+=current_cell[18]*LATTICE_VELOCITIES_D[18][2];
    velocity[2]/=*density;
}


/**
 * Computes the equilibrium distributions for all particle distribution functions of one
 * cell from density and velocity and stores the results in feq.
 */
__device__ void ComputeFeqGpu(float *density, float *velocity, float *feq){
    float s1, s2, s3=velocity[0]*velocity[0]+velocity[1]*velocity[1]+velocity[2]*velocity[2];
	s1 = LATTICE_VELOCITIES_D[0][0]*velocity[0]+LATTICE_VELOCITIES_D[0][1]*velocity[1]+LATTICE_VELOCITIES_D[0][2]*velocity[2];
	s2 = s1*s1;
	feq[0]=LATTICE_WEIGHTS_D[0]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[1][0]*velocity[0]+LATTICE_VELOCITIES_D[1][1]*velocity[1]+LATTICE_VELOCITIES_D[1][2]*velocity[2];
	s2 = s1*s1;
	feq[1]=LATTICE_WEIGHTS_D[1]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[2][0]*velocity[0]+LATTICE_VELOCITIES_D[2][1]*velocity[1]+LATTICE_VELOCITIES_D[2][2]*velocity[2];
	s2 = s1*s1;
	feq[2]=LATTICE_WEIGHTS_D[2]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[3][0]*velocity[0]+LATTICE_VELOCITIES_D[3][1]*velocity[1]+LATTICE_VELOCITIES_D[3][2]*velocity[2];
	s2 = s1*s1;
	feq[3]=LATTICE_WEIGHTS_D[3]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[4][0]*velocity[0]+LATTICE_VELOCITIES_D[4][1]*velocity[1]+LATTICE_VELOCITIES_D[4][2]*velocity[2];
	s2 = s1*s1;
	feq[4]=LATTICE_WEIGHTS_D[4]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[5][0]*velocity[0]+LATTICE_VELOCITIES_D[5][1]*velocity[1]+LATTICE_VELOCITIES_D[5][2]*velocity[2];
	s2 = s1*s1;
	feq[5]=LATTICE_WEIGHTS_D[5]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[6][0]*velocity[0]+LATTICE_VELOCITIES_D[6][1]*velocity[1]+LATTICE_VELOCITIES_D[6][2]*velocity[2];
	s2 = s1*s1;
	feq[6]=LATTICE_WEIGHTS_D[6]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[7][0]*velocity[0]+LATTICE_VELOCITIES_D[7][1]*velocity[1]+LATTICE_VELOCITIES_D[7][2]*velocity[2];
	s2 = s1*s1;
	feq[7]=LATTICE_WEIGHTS_D[7]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[8][0]*velocity[0]+LATTICE_VELOCITIES_D[8][1]*velocity[1]+LATTICE_VELOCITIES_D[8][2]*velocity[2];
	s2 = s1*s1;
	feq[8]=LATTICE_WEIGHTS_D[8]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[9][0]*velocity[0]+LATTICE_VELOCITIES_D[9][1]*velocity[1]+LATTICE_VELOCITIES_D[9][2]*velocity[2];
	s2 = s1*s1;
	feq[9]=LATTICE_WEIGHTS_D[9]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[10][0]*velocity[0]+LATTICE_VELOCITIES_D[10][1]*velocity[1]+LATTICE_VELOCITIES_D[10][2]*velocity[2];
	s2 = s1*s1;
	feq[10]=LATTICE_WEIGHTS_D[10]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[11][0]*velocity[0]+LATTICE_VELOCITIES_D[11][1]*velocity[1]+LATTICE_VELOCITIES_D[11][2]*velocity[2];
	s2 = s1*s1;
	feq[11]=LATTICE_WEIGHTS_D[11]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[12][0]*velocity[0]+LATTICE_VELOCITIES_D[12][1]*velocity[1]+LATTICE_VELOCITIES_D[12][2]*velocity[2];
	s2 = s1*s1;
	feq[12]=LATTICE_WEIGHTS_D[12]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[13][0]*velocity[0]+LATTICE_VELOCITIES_D[13][1]*velocity[1]+LATTICE_VELOCITIES_D[13][2]*velocity[2];
	s2 = s1*s1;
	feq[13]=LATTICE_WEIGHTS_D[13]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[14][0]*velocity[0]+LATTICE_VELOCITIES_D[14][1]*velocity[1]+LATTICE_VELOCITIES_D[14][2]*velocity[2];
	s2 = s1*s1;
	feq[14]=LATTICE_WEIGHTS_D[14]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[15][0]*velocity[0]+LATTICE_VELOCITIES_D[15][1]*velocity[1]+LATTICE_VELOCITIES_D[15][2]*velocity[2];
	s2 = s1*s1;
	feq[15]=LATTICE_WEIGHTS_D[15]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[16][0]*velocity[0]+LATTICE_VELOCITIES_D[16][1]*velocity[1]+LATTICE_VELOCITIES_D[16][2]*velocity[2];
	s2 = s1*s1;
	feq[16]=LATTICE_WEIGHTS_D[16]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[17][0]*velocity[0]+LATTICE_VELOCITIES_D[17][1]*velocity[1]+LATTICE_VELOCITIES_D[17][2]*velocity[2];
	s2 = s1*s1;
	feq[17]=LATTICE_WEIGHTS_D[17]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);

	s1 = LATTICE_VELOCITIES_D[18][0]*velocity[0]+LATTICE_VELOCITIES_D[18][1]*velocity[1]+LATTICE_VELOCITIES_D[18][2]*velocity[2];
	s2 = s1*s1;
	feq[18]=LATTICE_WEIGHTS_D[18]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
}


/**
 * Finds an inverse probability distribution for the provided lattice index.
 */
__device__ int InvGpu(int i){
    return (Q_LBM-1)-i;
}

#endif


10 AM Meeting

lbm_model_gpu.cuh

#ifndef _LBM_MODEL_GPU_CUH_
#define _LBM_MODEL_GPU_CUH_

/* Restricts 3D blocks to have 512 threads (limits: 512 CC<2.x; 1024 CC>2.x) */
#define BLOCK_SIZE 8

/* Arbitrary boundary identifiers */
#define LEFT_BOUNDARY_IDX 0
#define RIGHT_BOUNDARY_IDX 1
#define BOTTOM_BOUNDARY_IDX 2
#define TOP_BOUNDARY_IDX 3
#define BACK_BOUNDARY_IDX 4
#define FRONT_BOUNDARY_IDX 5


__constant__ static const int LATTICE_VELOCITIES_D[19][3] = {
    {0,-1,-1},{-1,0,-1},{0,0,-1},{1,0,-1},{0,1,-1},{-1,-1,0},{0,-1,0},{1,-1,0},
    {-1,0,0}, {0,0,0},  {1,0,0}, {-1,1,0},{0,1,0}, {1,1,0},  {0,-1,1},{-1,0,1},
    {0,0,1},  {1,0,1},  {0,1,1}
};

__constant__ static const float LATTICE_WEIGHTS_D[19] = {
    1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0,
    2.0/36.0, 12.0/36.0,2.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0, 1.0/36.0,
    2.0/36.0, 1.0/36.0, 1.0/36.0
};

/**
 * Stores number of probability distributions which we need to copy in boundary treatment step.
 */
__constant__ static const float TREAT_BOUNDARY_INDECES[6][5] = {
	{3,7,10,13,17},
	{1,5,8,11,15},
	{4,11,12,13,18},
	{0,5,6,7,14},
	{14,15,16,17,18},
	{0,1,2,3,4}
};

#endif

CPU:

initialization.h

#ifndef _INITIALIZATION_H_
#define _INITIALIZATION_H_


/**
 * Reads the parameters for the lid driven cavity scenario from a configuration file
 */
void ReadParameters(int *xlength, float *tau, float *velocity_wall, int *timesteps,
		int *timesteps_per_plotting, int argc, char *argv[], int *gpu_enabled);


/**
 * Initializes the particle distribution functions and the flag field
 */
void InitialiseFields(float *collide_field, float *stream_field,int *flag_field, int xlength,
		int gpu_enabled);

#endif

initialization.c

#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

#include "initialization.h"
#include "lbm_model.h"
#include "utils.h"


/**
 * Prints help message that includes program usage instructions and control flags.
 */
void PrintHelpMessage(){
	printf("List of control flags:\n");
	printf("\t -gpu             all computations are to be performed on gpu\n");
	printf("\t -cpu             all computations are to be performed on cpu\n");
	printf("\t -help            prints this help message\n");
	printf("NOTE: Control flags are mutually exclusive and only one flag at a time is allowed\n");
	printf("Example program usage:\n");
	printf("\t./lbm-sim ./data/lbm.dat -gpu\n");
	exit(1);
}


void ReadParameters(int *xlength, float *tau, float *velocity_wall, int *timesteps,
		int *timesteps_per_plotting, int argc, char *argv[], int *gpu_enabled){
    float *velocity_wall_1, *velocity_wall_2, *velocity_wall_3;
    /* printing out help message */
    if(argc<3) PrintHelpMessage();
    if(!strcmp(argv[1], "-help") || !strcmp(argv[2], "-help")) PrintHelpMessage();

    /* checking parameters */
    if(access(argv[1], R_OK) != 0)
        ERROR("Provided configuration file path either doesn't exist or can not be read.");
    if(!strcmp(argv[2], "-gpu")) *gpu_enabled=1; else *gpu_enabled=0;

    /* reading parameters */
    READ_FLOAT(argv[1], *tau);

    velocity_wall_1=&velocity_wall[0];
    velocity_wall_2=&velocity_wall[1];
    velocity_wall_3=&velocity_wall[2];

    READ_FLOAT(argv[1], *velocity_wall_1);
    READ_FLOAT(argv[1], *velocity_wall_2);
    READ_FLOAT(argv[1], *velocity_wall_3);
    
    READ_INT(argv[1], *xlength);
    READ_INT(argv[1], *timesteps);
    READ_INT(argv[1], *timesteps_per_plotting);
}


void InitialiseFields(float *collide_field, float *stream_field, int *flag_field, int xlength,
		int gpu_enabled){
    int x,y,z,i,step=xlength+2;
    
    /* NOTE: We use z=xlength+1 as the moving wall */
    for(x=0;x<step;x++){
        for(y=0;y<step;y++){
            for(z=0;z<step;z++){
                /* Initializing flags */
            	if(gpu_enabled) {
					if(x==0 && y==0 && z==0)
						flag_field[x+y*step+z*step*step] = LEFT_BOTTOM_BACK_CORNER;
					else if(x==0 && y==0 && z==step-1)
						flag_field[x+y*step+z*step*step] = LEFT_BOTTOM_FRONT_CORNER;
					else if(x==0 && y==step-1 && z==0)
						flag_field[x+y*step+z*step*step] = LEFT_UPPER_BACK_CORNER;
					else if(x==0 && y==step-1 && z==step-1)
						flag_field[x+y*step+z*step*step] = LEFT_UPPER_FRONT_CORNER;
					else if(x==step-1 && y==0 && z==0)
						flag_field[x+y*step+z*step*step] = RIGHT_BOTTOM_BACK_CORNER;
					else if(x==step-1 && y==0 && z==step-1)
						flag_field[x+y*step+z*step*step] = RIGHT_BOTTOM_FRONT_CORNER;
					else if(x==step-1 && y==step-1 && z==0)
						flag_field[x+y*step+z*step*step] = RIGHT_UPPER_BACK_CORNER;
					else if(x==step-1 && y==step-1 && z==step-1)
						flag_field[x+y*step+z*step*step] = RIGHT_UPPER_FRONT_CORNER;
					else if(x==0 && y==0)
						flag_field[x+y*step+z*step*step]=LEFT_BOTTOM_EDGE;
					else if(x==xlength+1 && y==0)
						flag_field[x+y*step+z*step*step]=RIGHT_BOTTOM_EDGE;
					else if(y==0 && z==0)
						flag_field[x+y*step+z*step*step]=BACK_BOTTOM_EDGE;
					else if(y==0 && z==xlength+1)
						flag_field[x+y*step+z*step*step]=FRONT_BOTTOM_EDGE;

					else if(x==0 && z==0)
						flag_field[x+y*step+z*step*step]=LEFT_BACK_EDGE;
					else if(x==0 && z==xlength+1)
						flag_field[x+y*step+z*step*step]=LEFT_FRONT_EDGE;
					else if(x==xlength+1 && z==0)
						flag_field[x+y*step+z*step*step]=RIGHT_BACK_EDGE;
					else if(x==xlength+1 && z==xlength+1)
						flag_field[x+y*step+z*step*step]=RIGHT_FRONT_EDGE;
					else if(x==0 && y==xlength+1)
						flag_field[x+y*step+z*step*step]=LEFT_UPPER_EDGE;
					else if(x==xlength+1 && y==xlength+1)
						flag_field[x+y*step+z*step*step]=RIGHT_UPPER_EDGE;
					else if(y==xlength+1 && z==0)
						flag_field[x+y*step+z*step*step]=BACK_UPPER_EDGE;
					else if(y==xlength+1 && z==xlength+1)
						flag_field[x+y*step+z*step*step]=FRONT_UPPER_EDGE;


					else if(y==step-1 && x!=0 && x!=step-1 && z!=0 && z!=step-1)
						flag_field[x+y*step+z*step*step] = TOP_BOUNDARY;
					else if(y==0 && x!=0 && x!=step-1 && z!=0 && z!=step-1)
						flag_field[x+y*step+z*step*step] = BOTTOM_BOUNDARY;
					else if (x==0 && y!=0 && y!=step-1 && z!=0 && z!=step-1)
						flag_field[x+y*step+z*step*step] = LEFT_BOUNDARY;
					else if (x==step-1 && y!=0 && y!=step-1 && z!=0 && z!=step-1)
						flag_field[x+y*step+z*step*step] = RIGHT_BOUNDARY;
					else if (z==0 && y!=0 && y!=step-1 && x!=0 && x!=step-1)
						flag_field[x+y*step+z*step*step] = BACK_BOUNDARY;
					else if (z==step-1 && y!=0 && y!=step-1 && x!=0 && x!=step-1)
						flag_field[x+y*step+z*step*step] = FRONT_BOUNDARY;
					else
						flag_field[x+y*step+z*step*step]=FLUID;
            	} else {
            		if(y == xlength+1)
						flag_field[x+y*step+z*step*step]=MOVING_WALL;
					else if(x == 0 || x == xlength+1 || y == 0 || z == xlength+1 || z == 0)
						flag_field[x+y*step+z*step*step]=NO_SLIP;
					else
						flag_field[x+y*step+z*step*step]=FLUID;
            	}
                
                /* Initializing distributions for stream and collide fields */
                for(i=0;i<Q_LBM;i++){
                    /*
                     * NOTE:Stream field is initilized to 0s because that helps to
                     * track down mistakes and has no impact whatsoever to on the 
                     * computation further on.
                     */
                    stream_field[Q_LBM*(x+y*step+z*step*step)+i]=0;
                    collide_field[Q_LBM*(x+y*step+z*step*step)+i]=LATTICE_WEIGHTS[i];
                }
            }
        }
    }
}

streaming.h

#ifndef _STREAMING_H_
#define _STREAMING_H_

/**
 * Carries out the streaming step and writes the respective distribution functions from
 * collideField to streamField.
 */
void DoStreaming(float *collide_field, float *stream_field, int *flag_field, int xlength);

#endif

streaming.c

#include "streaming.h"
#include "lbm_model.h"


void DoStreaming(float *collide_field, float *stream_field, int *flag_field, int xlength){
    int x,nx,y,ny,z,nz,i,step=xlength+2;

    for(x=0;x<step;x++){
        for(y=0;y<step;y++){
            for(z=0;z<step;z++){
                if(flag_field[x+y*step+z*step*step]==FLUID){
                    for(i=0;i<Q_LBM;i++){
                        nx=x-LATTICE_VELOCITIES[i][0];
                        ny=y-LATTICE_VELOCITIES[i][1];
                        nz=z-LATTICE_VELOCITIES[i][2];
                        
                        stream_field[Q_LBM*(x+y*step+z*step*step)+i]=
                                collide_field[Q_LBM*(nx+ny*step+nz*step*step)+i];
                    }
                }
            }
        }
    }
}

collision.h

#ifndef _COLLISION_H_
#define _COLLISION_H_

/**
 * Carries out the whole local collision process. Computes density and velocity and equilibrium
 * distributions. Carries out BGK update.
 */
void DoCollision(float *collide_field, int *flag_field, float tau, int xlength);

#endif

collision.c

#include "collision.h"
#include "cell_computation.h"
#include "lbm_model.h"
#include "utils.h"


/**
 * Computes the post-collision distribution functions according to the BGK update rule and
 * stores the results again at the same position.
 */
void ComputePostCollisionDistributions(float *current_cell, float tau, const float *const feq){
    int i;
    for(i=0;i<Q_LBM;i++){
        current_cell[i]=current_cell[i]-(current_cell[i]-feq[i])/tau;
        
        /* Probability distribution function can not be less than 0 */
        if (current_cell[i] < 0)
            ERROR("Probability distribution function can not be negative.");
    }
}


void DoCollision(float *collide_field, int *flag_field, float tau, int xlength){
    float density, velocity[3], feq[Q_LBM], *currentCell;
    int x,y,z,step=xlength+2;
    
    for(x=1;x<step-1;x++){
        for(y=1;y<step-1;y++){
            for(z=1;z<step-1;z++){
                currentCell=&collide_field[Q_LBM*(x+y*step+z*step*step)];
                
                ComputeDensity(currentCell,&density);
                ComputeVelocity(currentCell,&density,velocity);
                ComputeFeq(&density,velocity,feq);
                ComputePostCollisionDistributions(currentCell,tau,feq);
            }
        }
    }
}

boundary.h

#ifndef _BOUNDARY_H_
#define _BOUNDARY_H_


/**
 * Handles the boundaries in our simulation setup
 */
void TreatBoundary(float *collide_field, int* flag_field, float *wall_velocity, int xlength);

#endif

boundary.c

#include "boundary.h"
#include "lbm_model.h"
#include "cell_computation.h"

/**
 * Finds an inverse probability distribution for the provided lattice index.
 */
int inv(int i){
    return (Q_LBM-1)-i;
}


void TreatBoundary(float *collide_field, int* flag_field, float *wall_velocity, int xlength){
    int x,nx,y,ny,z,nz,i,step=xlength+2;
    float density,dot_prod;
    
    for(x=0;x<step;x++){
        for(y=0;y<step;y++){
            for(z=0;z<step;z++){
                if(flag_field[x+y*step+z*step*step]!=FLUID){
                    for(i=0;i<Q_LBM;i++){
                        nx=x+LATTICE_VELOCITIES[i][0];
                        ny=y+LATTICE_VELOCITIES[i][1];
                        nz=z+LATTICE_VELOCITIES[i][2];

                        /* We don't need the values outside of our extended domain */
                        if(0<nx && nx<step-1 && 0<ny && ny<step-1 && 0<nz && nz<step-1){
                            if (flag_field[x+y*step+z*step*step]==MOVING_WALL){
                                /* Compute density in the neighbour cell */
                                ComputeDensity(&collide_field[Q_LBM*(nx+ny*step+nz*step*step)],&density);
                                /* Compute dot product */
                                dot_prod=LATTICE_VELOCITIES[i][0]*wall_velocity[0]+
                                        LATTICE_VELOCITIES[i][1]*wall_velocity[1]+
                                        LATTICE_VELOCITIES[i][2]*wall_velocity[2];
                                /* Assign the boudary cell value */
                                collide_field[Q_LBM*(x+y*step+z*step*step)+i]=
                                        collide_field[Q_LBM*(nx+ny*step+nz*step*step)+inv(i)]+
                                        2*LATTICE_WEIGHTS[i]*density*C_S_POW2_INV*dot_prod;
                            }else if(flag_field[x+y*step+z*step*step]==NO_SLIP){
                                collide_field[Q_LBM*(x+y*step+z*step*step)+i]=
                                        collide_field[Q_LBM*(nx+ny*step+nz*step*step)+inv(i)];
                            }
                        }
                    }
                }
            }
        }
    }
}

cell_computation.h

#ifndef _CELL_COMPUTATIONS_H_
#define _CELL_COMPUTATIONS_H_

/**
 * Computes the density from the particle distribution functions stored at currentCell.
 * currentCell thus denotes the address of the first particle distribution function of the
 * respective cell. The result is stored in density.
 */
void ComputeDensity(const float *const current_cell, float *density);

/**
 * Computes the velocity within currentCell and stores the result in velocity
 */
void ComputeVelocity(const float * const current_cell, const float * const density, float *velocity);

/**
 * Computes the equilibrium distributions for all particle distribution functions of one
 * cell from density and velocity and stores the results in feq.
 */
void ComputeFeq(const float * const density, const float * const velocity, float *feq);

#endif

cell_computation.c

#include "cell_computation.h"
#include "lbm_model.h"
#include "utils.h"

void ComputeDensity(const float *const current_cell, float *density){
    int i; *density=0;
    for(i=0;i<Q_LBM;i++)
        *density+=current_cell[i];

    /* Density should be close to a unit (ρ~1) */
    if((*density-1.0)>EPS)
        ERROR("Density dropped below error tolerance.");
}


void ComputeVelocity(const float * const current_cell, const float * const density, float *velocity){
    int i;
    /* NOTE:Indexes are hard coded to improve program performance */
    velocity[0]=0;
    velocity[1]=0;
    velocity[2]=0;
    
    for(i=0;i<Q_LBM;i++){
        velocity[0]+=current_cell[i]*LATTICE_VELOCITIES[i][0];
        velocity[1]+=current_cell[i]*LATTICE_VELOCITIES[i][1];
        velocity[2]+=current_cell[i]*LATTICE_VELOCITIES[i][2];
    }
    
    velocity[0]/=*density;
    velocity[1]/=*density;
    velocity[2]/=*density;
}


void ComputeFeq(const float * const density, const float * const velocity, float *feq){
    int i;
    float s1, s2, s3; /* summands */
    /* NOTE:Indexes are hard coded to improve program performance */
    for(i=0;i<Q_LBM;i++){
        s1 = LATTICE_VELOCITIES[i][0]*velocity[0]+LATTICE_VELOCITIES[i][1]*velocity[1]+
                LATTICE_VELOCITIES[i][2]*velocity[2];
        s2 = s1*s1;
        s3 = velocity[0]*velocity[0]+velocity[1]*velocity[1]+velocity[2]*velocity[2];
        
        feq[i]=LATTICE_WEIGHTS[i]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
        
        /* Probability distribution function can not be less than 0 */
        if (feq[i] < 0)
            ERROR("Probability distribution function can not be negative.");
    }
}

visualization.h

#ifndef _VISUALIZATION_H_
#define _VISUALIZATION_H_


/**
 * Writes the density and velocity field (derived from the distributions in collideField)
 * to a file determined by 'filename' and timestep 't'. You can re-use parts of the code
 * from visual.c (VTK output for Navier-Stokes solver) and modify it for 3D datasets.
 */
void WriteVtkOutput(const float * const collide_field, const int * const flag_field,
        const char * filename, unsigned int t, int xlength);


/**
 * Writes the provided stream or collision field to a file with specified identifiers.
 */
void WriteField(const float * const field, const char * filename, unsigned int t, const int xlength,
		const int rank);


/**
 * Writes the provided flag field to a file with specified identifiers.
 */
void writeFlagField(const int * const flagField, const char * filename, const int xlength, const int rank);


/**
 * Prints out the point by point values of the provided field (4D) to the stdout.
 * @param field
 *          linerized 4D array, with (x,y,z,i)=Q*(x+y*(ncell+2)+z*(ncell+2)*(ncell+2))+i
 * @param ncell
 *          number of inner cells, the ones which are there before adding a boundary layer
 */
void PrintField(float *field, int ncell);

#endif

visualization.c

#include <cstdio>

#include "lbm_model.h"
#include "visualization.h"
#include "cell_computation.h"
#include "utils.h"


void write_vtkHeader( FILE *fp, int xlength) {
    if( fp == NULL ){
        char szBuff[80];
        sprintf( szBuff, "Null pointer in write_vtkHeader" );
        ERROR( szBuff );
        return;
    }

    fprintf(fp,"# vtk DataFile Version 2.0\n");
    fprintf(fp,"generated by CFD-lab course output (written by Denys Korzh, Denys Sobchyshak, Ozel Christo Annamanthadoo  \n");
    fprintf(fp,"ASCII\n");
    fprintf(fp,"\n");
    fprintf(fp,"DATASET STRUCTURED_GRID\n");
    fprintf(fp,"DIMENSIONS  %i %i %i \n", xlength, xlength, xlength);
    fprintf(fp,"POINTS %i float\n", xlength*xlength*xlength );
    fprintf(fp,"\n");
}


void write_vtkPointCoordinates( FILE *fp, int xlength) {
    float originX = 0.0, originY = 0.0;
    int i = 0, j = 0, k = 0;

    for(k = 0; k < xlength; k++)
        for(j = 0; j < xlength; j++)
            for(i = 0; i < xlength; i++)
                fprintf(fp, "%f %f %f\n", originX+(i*1.0/xlength), originY+(j*1.0/xlength), 
                        originY+(k*1.0/xlength) );
}


void WriteVtkOutput(const float * const collideField, const int * const flagField, const char * filename, unsigned int t, int xlength) {
    int i, j, k, len = xlength+2; /* lexicographic order "[ Q * ( z*len*len + y*len + x) + i ]" */
    float velocity[3], density;

    char szFileName[80];
    FILE *fp=NULL;
    sprintf( szFileName, "%s.%i.vtk", filename, t );
    fp = fopen( szFileName, "w");
    if( fp == NULL ){
        char szBuff[80];
        sprintf( szBuff, "Failed to open %s", szFileName );
        ERROR( szBuff );
        return;
    }

    write_vtkHeader( fp, xlength);
    write_vtkPointCoordinates( fp, xlength);

    fprintf(fp,"POINT_DATA %i \n", xlength*xlength*xlength );
    fprintf(fp,"\n");
    fprintf(fp, "VECTORS velocity float\n");
    for(k = 1; k < xlength+1; k++) {
        for(j = 1; j < xlength+1; j++) {
            for(i = 1; i < xlength+1; i++) {
                ComputeDensity (&collideField[ 19 * ( k*len*len + j*len + i) ], &density);
                ComputeVelocity(&collideField[ 19 * ( k*len*len + j*len + i) ], &density, velocity);
                fprintf(fp, "%f %f %f\n",  velocity[0], velocity[1], velocity[2]);
            }
        }
    }

    fprintf(fp,"\n");
    fprintf(fp, "SCALARS density float 1 \n");
    fprintf(fp, "LOOKUP_TABLE default \n");
    for(k = 1; k < xlength+1; k++) {
        for(j = 1; j < xlength+1; j++) {
            for(i = 1; i < xlength+1; i++) {
                ComputeDensity (&collideField[ 19 * ( k*len*len + j*len + i) ], &density);
                fprintf(fp, "%f\n",  density);
            }
        }
    }

    if(fclose(fp)){
        char szBuff[80];
        sprintf( szBuff, "Failed to close %s", szFileName );
        ERROR( szBuff );
    }
}


void PrintField(float *field, int ncell){
    int x,y,z,i,step=ncell+2;

    for(x=0;x<step;x++){
        for(y=0;y<step;y++){
            for(z=0;z<step;z++){
                printf("(%d,%d,%d): ",x,y,z);
                for(i=0;i<Q_LBM;i++)
                    printf("%f ",field[Q_LBM*(x+y*step+z*step*step)+i]);
                printf("\n");
            }
        }
    }
}


void WriteField(const float * const field, const char * filename, unsigned int t, const int xlength, const int rank) {
	int x,y,z,i, stepX=xlength+2,stepY=xlength+2,stepZ=xlength+2;

	char szFileName[80];
	FILE *fp=NULL;
	sprintf( szFileName, "%s-rank%i.%i.out", filename, rank, t );
	fp = fopen( szFileName, "w");
	if( fp == NULL ){
		char szBuff[80];
		sprintf( szBuff, "Failed to open %s", szFileName );
		ERROR( szBuff );
		return;
	}

	for(z=0;z<stepZ;z++){
		for(x=0;x<stepX;x++){
			for(y=0;y<stepY;y++){
				fprintf(fp, "(%d,%d,%d): ",x,y,z);
					for(i=0;i<Q_LBM;i++)
					  fprintf(fp, "%f ",field[Q_LBM*(x+y*stepX+z*stepX*stepY)+i]);
				fprintf(fp, "\n");
			}
		}
	}

	if(fclose(fp)){
		char szBuff[80];
		sprintf( szBuff, "Failed to close %s", szFileName );
		ERROR( szBuff );
	}
}


void writeFlagField(const int * const flagField, const char * filename, const int xlength, const int rank) {
  int x,y,z, stepX=xlength+2,stepY=xlength+2,stepZ=xlength+2;

  char szFileName[80];
  FILE *fp=NULL;
  sprintf( szFileName, "%s-rank%i.out", filename, rank );
  fp = fopen( szFileName, "w");
  if( fp == NULL ){
    char szBuff[80];
    sprintf( szBuff, "Failed to open %s", szFileName );
    ERROR( szBuff );
    return;
  }

  fprintf(fp, "(y,z)\\x  ");
  for(x=0; x<stepX; x++) {
      fprintf(fp, "%2i ",x);
  }
  fprintf(fp, "\n-------");
  for(x=0; x<stepX; x++) {
      fprintf(fp, "---");
  }
  fprintf(fp, "\n");

  for(z=0;z<stepZ;z++){
      for(y=stepY-1;y>=0;y--){
          fprintf(fp, "(%2d,%2d): ",y,z);
          for(x=0;x<stepX;x++){
              fprintf(fp, "%2i ",flagField[x+y*stepX+z*stepX*stepY]);
          }
          fprintf(fp, "\n");
      }
      fprintf(fp, "\n");
  }

  if(fclose(fp)){
    char szBuff[80];
    sprintf( szBuff, "Failed to close %s", szFileName );
    ERROR( szBuff );
  }
}