LBM Code

main.c

#ifndef _MAIN_C_
#define _MAIN_C_

#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>

#include "lbm_model.h"
#include "initialization.h"
#include "initialization_gpu.h"
#include "streaming.h"
#include "collision.h"
#include "boundary.h"
#include "lbm_solver_gpu.h"
#include "visualization.h"
#include "utils.h"

int main(int argc, char *argv[]) {
	float *collide_field=NULL, *stream_field=NULL, *collide_field_d=NULL, *stream_field_d=NULL,
			*swap=NULL, tau, wall_velocity[D_LBM], mlups_sum=0.0f;
	int *flag_field=NULL, *flag_field_d=NULL, xlength, num_cells, t, timesteps,
			timesteps_per_plotting, gpu_enabled;
	clock_t mlups_time;
	size_t field_size;

	/* process parameters */
	ReadParameters(&xlength, &tau, wall_velocity, &timesteps, &timesteps_per_plotting, argc, argv,
			&gpu_enabled);

	/* check if provided parameters are legitimate */
	ValidateModel(wall_velocity, xlength, tau);

	/* initializing fields */
	num_cells = (int) pow(xlength + 2, D_LBM);
	field_size = Q_LBM*num_cells*sizeof(float);
	collide_field = (float*) malloc(field_size);
	stream_field = (float*) malloc(field_size);
	flag_field = (int*) malloc(num_cells*sizeof(int));
	InitialiseFields(collide_field, stream_field, flag_field, xlength, gpu_enabled);
	if (gpu_enabled)
		InitialiseDeviceFields(collide_field, stream_field, flag_field, xlength,
				&collide_field_d, &stream_field_d, &flag_field_d);

	for (t = 0; t < timesteps; t++) {
		printf("Time step: #%d\n", t);
		if (gpu_enabled){
			DoIteration(collide_field, stream_field, flag_field, tau, wall_velocity, xlength,
					&collide_field_d, &stream_field_d, &flag_field_d, &mlups_sum);
			/* Copy data from device to host only when we need VTK output */
			if (!(t%timesteps_per_plotting))
				CopyFieldsFromDevice(collide_field, stream_field, xlength,
						&collide_field_d, &stream_field_d);
		} else {
			mlups_time = clock();
			/* Copy pdfs from neighbouring cells into the stream field */
			DoStreaming(collide_field, stream_field, flag_field, xlength);
			/* Perform the swapping of collide and stream fields */
			swap = collide_field; collide_field = stream_field; stream_field = swap;
			/* Compute post collision distributions */
			DoCollision(collide_field, flag_field, tau, xlength);
			/* Treat boundaries */
			TreatBoundary(collide_field, flag_field, wall_velocity, xlength);
			mlups_time = clock()-mlups_time;
			/* Print out the MLUPS value */
			mlups_sum += num_cells/(MLUPS_EXPONENT*(float)mlups_time/CLOCKS_PER_SEC);
			if(VERBOSE)
				printf("MLUPS: %f\n", num_cells/(MLUPS_EXPONENT*(float)mlups_time/CLOCKS_PER_SEC));
		}
		/* Print out vtk output if needed */
		if (!(t%timesteps_per_plotting))
			WriteVtkOutput(collide_field, flag_field, "img/lbm-img", t, xlength);
	}

	printf("Average MLUPS: %f\n", mlups_sum/(t+1));

	if (VERBOSE) {
		if (gpu_enabled)
			CopyFieldsFromDevice(collide_field, stream_field, xlength, &collide_field_d, &stream_field_d);
		WriteField(collide_field, "img/collide-field", 0, xlength, gpu_enabled);
		writeFlagField(flag_field, "img/flag-field", xlength, gpu_enabled);
	}

	/* Free memory */
	free(collide_field);
	free(stream_field);
	free(flag_field);

	if (gpu_enabled)
		FreeDeviceFields(&collide_field_d, &stream_field_d, &flag_field_d);

	printf("Simulation complete.\n");
	return 0;
}

#endif
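
The MLUPS figure printed above is "million lattice-cell updates per second": the number of cells swept in one iteration divided by the wall-clock time of that sweep, scaled by MLUPS_EXPONENT (10^6). As a quick check of the arithmetic (hypothetical numbers, not measured output): for xlength = 30 the extended domain holds (30+2)^3 = 32768 cells, so a 2 ms sweep yields 32768/(10^6 * 0.002) ≈ 16.4 MLUPS. A minimal helper expressing the same formula (a sketch, assuming only the constants from lbm_model.h):

#include "lbm_model.h"

/* Million lattice-cell updates per second for one sweep. */
static float Mlups(int num_cells, float seconds){
	return num_cells/(MLUPS_EXPONENT*seconds);
}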

lbm_model.h

#ifndef _LBM_MODEL_H_
#define _LBM_MODEL_H_

#define VERBOSE 0
#define MLUPS_MIN_CELLS 300
#define MLUPS_EXPONENT 1000000
/* simulation parameters */
#define D_LBM 3
#define Q_LBM 19
/* arbitrary cell identifiers */
#define FLUID 0
#define NO_SLIP 1
#define MOVING_WALL 2
/* inverse powers of the lattice speed of sound, precomputed to avoid divisions */
#define EPS 0.05
#define C_S_POW2_INV 3.0
#define C_S_POW4_INV 9.0
/* reference values, not used in actual computation */
#define SQRT3 1.73205080756887729
static const float C_S = 1.0/SQRT3;
/* predefined simulation parameters */
static const int LATTICE_VELOCITIES[19][3] = {
    {0,-1,-1},{-1,0,-1},{0,0,-1},{1,0,-1},{0,1,-1},{-1,-1,0},{0,-1,0},{1,-1,0},
    {-1,0,0}, {0,0,0},  {1,0,0}, {-1,1,0},{0,1,0}, {1,1,0},  {0,-1,1},{-1,0,1},
    {0,0,1},  {1,0,1},  {0,1,1}
};
static const float LATTICE_WEIGHTS[19] = {
    1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 
    2.0/36.0, 12.0/36.0,2.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0, 1.0/36.0, 
    2.0/36.0, 1.0/36.0, 1.0/36.0
};

/* additional cell definitions */
#define LEFT_BOUNDARY 5100
#define RIGHT_BOUNDARY 5900
#define BOTTOM_BOUNDARY 5010
#define TOP_BOUNDARY 5090
#define BACK_BOUNDARY 5001
#define FRONT_BOUNDARY 5009

#define LEFT_BOTTOM_EDGE 5110
#define LEFT_UPPER_EDGE 5190
#define RIGHT_BOTTOM_EDGE 5910
#define RIGHT_UPPER_EDGE 5990

#define BACK_BOTTOM_EDGE 5011
#define BACK_UPPER_EDGE 5091
#define FRONT_BOTTOM_EDGE 5019
#define FRONT_UPPER_EDGE 5099

#define LEFT_BACK_EDGE 5101
#define LEFT_FRONT_EDGE 5109
#define RIGHT_BACK_EDGE 5901
#define RIGHT_FRONT_EDGE 5909

#define LEFT_BOTTOM_BACK_CORNER 5111
#define LEFT_BOTTOM_FRONT_CORNER 5119
#define LEFT_UPPER_BACK_CORNER 5191
#define LEFT_UPPER_FRONT_CORNER 5199
#define RIGHT_BOTTOM_BACK_CORNER 5911
#define RIGHT_BOTTOM_FRONT_CORNER 5919
#define RIGHT_UPPER_BACK_CORNER 5991
#define RIGHT_UPPER_FRONT_CORNER 5999

/**
 * Validates the configured physical model by calculating characteristic numbers
 */
void ValidateModel(float wall_velocity[D_LBM], int domain_size, float tau);

#endif
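
The D3Q19 constants above satisfy two invariants the rest of the code relies on: the weights sum to 1, and the velocity set is ordered so that directions i and 18-i are exact opposites, which is what the inv()/InvGpu() helpers further down exploit. A small standalone check (a verification sketch, not part of the build):

#include <assert.h>
#include <math.h>
#include "lbm_model.h"

void CheckLatticeConstants(void){
	int i, d;
	float weight_sum = 0.0;
	for (i = 0; i < Q_LBM; i++){
		weight_sum += LATTICE_WEIGHTS[i];
		/* opposite directions are stored mirror-symmetrically */
		for (d = 0; d < D_LBM; d++)
			assert(LATTICE_VELOCITIES[i][d] == -LATTICE_VELOCITIES[Q_LBM-1-i][d]);
	}
	assert(fabs(weight_sum - 1.0) < EPS);
}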


lbm_model.c

#include <math.h>
#include <stdio.h>

#include "lbm_model.h"
#include "utils.h"


void ValidateModel(float wall_velocity[D_LBM], int domain_size, float tau){
	float u_wall_length, mach_number, reynolds_number;
	/* Compute Mach number and Reynolds number */
	u_wall_length = sqrt(wall_velocity[0]*wall_velocity[0]+
			wall_velocity[1]*wall_velocity[1]+
			wall_velocity[2]*wall_velocity[2]);
	mach_number = u_wall_length*SQRT3;
	reynolds_number = u_wall_length*domain_size*C_S_POW2_INV/(tau-0.5);
	printf("Computed Mach number: %f\n", mach_number);
	printf("Computed Reynolds number: %f\n", reynolds_number);

	/* Check if characteristic numbers are correct */
	if (mach_number >= 1)
		ERROR("Computed Mach number is too large.");
	if (reynolds_number > 500)
		ERROR("Computed Reynolds number is too large for simulation to be run on a laptop/pc.");
}
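
Both characteristic numbers follow directly from lattice units. The lattice speed of sound is c_s = 1/√3 and the kinematic viscosity of the BGK model is ν = c_s²(τ − 1/2), so

$$\mathrm{Ma} = \frac{\lVert u_{wall} \rVert}{c_s} = \sqrt{3}\,\lVert u_{wall} \rVert,
\qquad
\mathrm{Re} = \frac{\lVert u_{wall} \rVert \, L}{\nu} = \frac{3\,\lVert u_{wall} \rVert \, L}{\tau - \tfrac{1}{2}},$$

where L is the domain size; the Re expression is exactly u_wall_length*domain_size*C_S_POW2_INV/(tau-0.5) in the code above.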

 

GPU:

initialization_gpu.h

#ifndef _INITIALIZATION_GPU_H_
#define _INITIALIZATION_GPU_H_

/**
 * Copies the host-side fields into GPU device memory
 */
void InitialiseDeviceFields(float *collide_field, float *stream_field,int *flag_field, int xlength,
		float **collide_field_d, float **stream_field_d,int **flag_field_d);

/**
 * Free GPU memory
 */
void FreeDeviceFields(float **collide_field_d, float **stream_field_d,int **flag_field_d);

/**
 * Copies field data from device memory back to the host.
 */
void CopyFieldsFromDevice(float *collide_field, float *stream_field, int xlength,
		float **collide_field_dd, float **stream_field_dd);

#endif

initialization_gpu.cu

#include "initialization_gpu.h"
#include "lbm_model.h"
#include "utils_gpu.h"


void InitialiseDeviceFields(float *collide_field, float *stream_field,int *flag_field, int xlength,
		float **collide_field_d, float **stream_field_d,int **flag_field_d){
	int num_cells = pow(xlength+2, D_LBM);
	size_t computational_field_size = Q_LBM*num_cells*sizeof(float);
	size_t flag_field_size = num_cells*sizeof(int);

	cudaErrorCheck(cudaMalloc(collide_field_d, computational_field_size));
	cudaErrorCheck(cudaMemcpy(*collide_field_d, collide_field, computational_field_size, cudaMemcpyHostToDevice));
	cudaErrorCheck(cudaMalloc(stream_field_d, computational_field_size));
	cudaErrorCheck(cudaMemcpy(*stream_field_d, stream_field, computational_field_size, cudaMemcpyHostToDevice));
	cudaErrorCheck(cudaMalloc(flag_field_d, flag_field_size));
	cudaErrorCheck(cudaMemcpy(*flag_field_d, flag_field, flag_field_size, cudaMemcpyHostToDevice));
}


void FreeDeviceFields(float **collide_field_d, float **stream_field_d,int **flag_field_d){
	cudaErrorCheck(cudaFree(*collide_field_d));
	cudaErrorCheck(cudaFree(*stream_field_d));
	cudaErrorCheck(cudaFree(*flag_field_d));
}


void CopyFieldsFromDevice(float *collide_field, float *stream_field, int xlength,
		float **collide_field_dd, float **stream_field_dd) {
	int num_cells = pow(xlength+2, D_LBM);
	size_t computational_field_size = Q_LBM*num_cells*sizeof(float);
	cudaErrorCheck(cudaMemcpy(collide_field, *collide_field_dd, computational_field_size, cudaMemcpyDeviceToHost));
	cudaErrorCheck(cudaMemcpy(stream_field, *stream_field_dd, computational_field_size, cudaMemcpyDeviceToHost));
}
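
utils_gpu.h is not listed here; cudaErrorCheck presumably wraps each CUDA runtime call and aborts on failure. A typical definition (an assumption about the project's macro, shown only so the listings above read self-contained):

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* Aborts with file/line context if a CUDA runtime call fails. */
#define cudaErrorCheck(call) do { \
	cudaError_t err = (call); \
	if (err != cudaSuccess) { \
		fprintf(stderr, "CUDA error %s at %s:%d\n", \
				cudaGetErrorString(err), __FILE__, __LINE__); \
		exit(EXIT_FAILURE); \
	} \
} while (0)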

lbm_solver_gpu.h

#ifndef _COLLISION_GPU_H_
#define _COLLISION_GPU_H_

/**
 * Performs streaming, collision and boundary treatment.
 */
void DoIteration(float *collide_field, float *stream_field, int *flag_field, float tau,
		float *wall_velocity, int xlength, float **collide_field_d, float **stream_field_d,
		int **flag_field_d, float *mlups_sum);

#endif

lbm_solver_gpu.cu

#include <math.h>
#include <stdio.h>

#include "lbm_solver_gpu.h"
#include "lbm_model.h"
#include "lbm_model_gpu.cuh"
#include "utils.h"
#include "utils_gpu.h"
#include "cell_computation_gpu.cuh"


__constant__ float tau_d, wall_velocity_d[D_LBM];
__constant__ int xlength_d, num_cells_d;
__device__ float *stream_field_d, *collide_field_d;


/**
 * Computes the post-collision distribution functions according to the BGK update rule and
 * stores the results again at the same position.
 */
__device__ void ComputePostCollisionDistributionsGpu(float *current_cell, float *feq){
	int i;
	/* constant trip count: the compiler fully unrolls this loop,
	 * matching the hand-unrolled original */
	#pragma unroll
	for (i = 0; i < Q_LBM; i++)
		current_cell[i] = current_cell[i] - (current_cell[i] - feq[i]) / tau_d;
}

/*
 * Performs streaming on cells.
 */
__global__ void DoStreaming(){
	int x = threadIdx.x+blockIdx.x*blockDim.x;
	int y = threadIdx.y+blockIdx.y*blockDim.y;
	int z = threadIdx.z+blockIdx.z*blockDim.z;
	int step=xlength_d+2, idx=x+y*step+z*step*step, nx, ny, nz, i;

	/* check that indices are within the bounds since there could be more threads than needed */
	if (0<x && x<(step-1) && 0<y && y<(step-1) && 0<z && z<(step-1)){
		/* pull each pdf from the neighbour its velocity points away from;
		 * the constant trip count lets the compiler unroll the loop */
		#pragma unroll
		for (i = 0; i < Q_LBM; i++){
			nx=x-LATTICE_VELOCITIES_D[i][0];
			ny=y-LATTICE_VELOCITIES_D[i][1];
			nz=z-LATTICE_VELOCITIES_D[i][2];
			stream_field_d[Q_LBM*idx+i]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+i];
		}
	}
}


/*
 * Performs collision computation.
 */
__global__ void DoCollision(){
	int x = threadIdx.x+blockIdx.x*blockDim.x;
	int y = threadIdx.y+blockIdx.y*blockDim.y;
	int z = threadIdx.z+blockIdx.z*blockDim.z;
	int step=xlength_d+2, idx=x+y*step+z*step*step;
	float density, velocity[D_LBM], feq[Q_LBM], *current_cell_s;

	/* check that indices are within the bounds since there could be more threads than needed */
	if (0<x && x<(step-1) && 0<y && y<(step-1) && 0<z && z<(step-1)){
		current_cell_s=&collide_field_d[Q_LBM*idx];
		ComputeDensityGpu(current_cell_s,&density);
		ComputeVelocityGpu(current_cell_s,&density,velocity);
		ComputeFeqGpu(&density,velocity,feq);
		ComputePostCollisionDistributionsGpu(current_cell_s,feq);
	}
}


/*
 * Computes proper boundary values.
 */
__global__ void TreatBoundary(int *flag_field_d){
	int x = threadIdx.x+blockIdx.x*blockDim.x;
	int y = threadIdx.y+blockIdx.y*blockDim.y;
	int z = threadIdx.z+blockIdx.z*blockDim.z;

	int step=xlength_d+2, idx=x+y*step+z*step*step, nx, ny, nz, i, j, boundary_side=0, boundary_idx=100500;
	float density, dot_prod;

	if(idx<num_cells_d) {
		if(flag_field_d[idx] == BOTTOM_BOUNDARY) {
			boundary_side = BOTTOM_BOUNDARY;
			boundary_idx = BOTTOM_BOUNDARY_IDX;
		} else if (flag_field_d[idx] == LEFT_BOUNDARY) {
			boundary_side = LEFT_BOUNDARY;
			boundary_idx = LEFT_BOUNDARY_IDX;
		} else if (flag_field_d[idx] == RIGHT_BOUNDARY) {
			boundary_side = RIGHT_BOUNDARY;
			boundary_idx = RIGHT_BOUNDARY_IDX;
		} else if (flag_field_d[idx] == BACK_BOUNDARY) {
			boundary_side = BACK_BOUNDARY;
			boundary_idx = BACK_BOUNDARY_IDX;
		} else if (flag_field_d[idx] == FRONT_BOUNDARY) {
			boundary_side = FRONT_BOUNDARY;
			boundary_idx = FRONT_BOUNDARY_IDX;
		} else if (flag_field_d[idx] == LEFT_BOTTOM_EDGE) {
			boundary_side = LEFT_BOTTOM_EDGE;
			boundary_idx = 13;
		} else if (flag_field_d[idx] == RIGHT_BOTTOM_EDGE) {
			boundary_side = RIGHT_BOTTOM_EDGE;
			boundary_idx = 11;
		} else if (flag_field_d[idx] == BACK_BOTTOM_EDGE) {
			boundary_side = BACK_BOTTOM_EDGE;
			boundary_idx = 18;
		} else if (flag_field_d[idx] == FRONT_BOTTOM_EDGE) {
			boundary_side = FRONT_BOTTOM_EDGE;
			boundary_idx = 4;
		} else if (flag_field_d[idx] == LEFT_BACK_EDGE) {
			boundary_side = LEFT_BACK_EDGE;
			boundary_idx = 17;
		} else if (flag_field_d[idx] == LEFT_FRONT_EDGE) {
			boundary_side = LEFT_FRONT_EDGE;
			boundary_idx = 3;
		} else if (flag_field_d[idx] == RIGHT_BACK_EDGE) {
			boundary_side = RIGHT_BACK_EDGE;
			boundary_idx = 15;
		} else if (flag_field_d[idx] == RIGHT_FRONT_EDGE) {
			boundary_side = RIGHT_FRONT_EDGE;
			boundary_idx = 1;
		} else if (flag_field_d[idx] == LEFT_UPPER_EDGE) {
			boundary_side = LEFT_UPPER_EDGE;
			boundary_idx = 7;
		} else if (flag_field_d[idx] == RIGHT_UPPER_EDGE) {
			boundary_side = RIGHT_UPPER_EDGE;
			boundary_idx = 5;
		} else if (flag_field_d[idx] == BACK_UPPER_EDGE) {
			boundary_side = BACK_UPPER_EDGE;
			boundary_idx = 14;
		} else if (flag_field_d[idx] == FRONT_UPPER_EDGE) {
			boundary_side = FRONT_UPPER_EDGE;
			boundary_idx = 0;
		} else if (flag_field_d[idx] == TOP_BOUNDARY) {
			boundary_side = TOP_BOUNDARY;
			boundary_idx = TOP_BOUNDARY_IDX;
		}

		if( boundary_side==LEFT_BOUNDARY || boundary_side==RIGHT_BOUNDARY ||
				boundary_side==BOTTOM_BOUNDARY ||
				boundary_side==BACK_BOUNDARY || boundary_side==FRONT_BOUNDARY) {
			/* bounce back the five pdfs that point into the domain from this face */
			for (j = 0; j < 5; j++) {
				i = TREAT_BOUNDARY_INDECES[boundary_idx][j];
				nx=x+LATTICE_VELOCITIES_D[i][0];
				ny=y+LATTICE_VELOCITIES_D[i][1];
				nz=z+LATTICE_VELOCITIES_D[i][2];
				collide_field_d[Q_LBM*idx+i]=
						collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)];
			}
		} else if (boundary_side == LEFT_BOTTOM_EDGE || boundary_side == RIGHT_BOTTOM_EDGE ||
				boundary_side == BACK_BOTTOM_EDGE || boundary_side == FRONT_BOTTOM_EDGE ||
				boundary_side == LEFT_BACK_EDGE || boundary_side == LEFT_FRONT_EDGE ||
				boundary_side == RIGHT_BACK_EDGE || boundary_side == RIGHT_FRONT_EDGE) {
			nx=x+LATTICE_VELOCITIES_D[boundary_idx][0];
			ny=y+LATTICE_VELOCITIES_D[boundary_idx][1];
			nz=z+LATTICE_VELOCITIES_D[boundary_idx][2];
			collide_field_d[Q_LBM*(x+y*step+z*step*step)+boundary_idx]=
					collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(boundary_idx)];
		} else if(boundary_side == LEFT_UPPER_EDGE || boundary_side == RIGHT_UPPER_EDGE ||
				boundary_side == BACK_UPPER_EDGE || boundary_side == FRONT_UPPER_EDGE) {
			i = boundary_idx;
			nx=x+LATTICE_VELOCITIES_D[i][0];
			ny=y+LATTICE_VELOCITIES_D[i][1];
			nz=z+LATTICE_VELOCITIES_D[i][2];
			/* Compute density in the neighbour cell */
			ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density);
			/* Compute dot product */
			dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+
					LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+
					LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2];
			/* Assign the boundary cell value */
			collide_field_d[Q_LBM*(idx)+i]=
					collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+
					2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod;
		} else if(boundary_side == TOP_BOUNDARY) {
			/* moving-wall treatment for the five pdfs entering through the lid */
			for (j = 0; j < 5; j++) {
				i = TREAT_BOUNDARY_INDECES[boundary_idx][j];
				nx=x+LATTICE_VELOCITIES_D[i][0];
				ny=y+LATTICE_VELOCITIES_D[i][1];
				nz=z+LATTICE_VELOCITIES_D[i][2];
				/* Compute density in the neighbour cell */
				ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density);
				dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+
						LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+
						LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2];
				collide_field_d[Q_LBM*idx+i]=
						collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+
						2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod;
			}
		}
	}
}


/*
 * Performs pointer swap on GPU.
 */
__global__ void DoSwap(){
	float *swap=collide_field_d; collide_field_d=stream_field_d; stream_field_d=swap;
}


void DoIteration(float *collide_field, float *stream_field, int *flag_field, float tau,
		float *wall_velocity, int xlength, float **collide_field_dd, float **stream_field_dd,
		int **flag_field_d, float *mlups_sum){
	int num_cells = pow(xlength+2, D_LBM);
	clock_t mlups_time;

	/* initialize constant data */
	cudaErrorCheck(cudaMemcpyToSymbol(xlength_d, &xlength, sizeof(int), 0, cudaMemcpyHostToDevice));
	cudaErrorCheck(cudaMemcpyToSymbol(num_cells_d, &num_cells, sizeof(int), 0, cudaMemcpyHostToDevice));
	cudaErrorCheck(cudaMemcpyToSymbol(tau_d, &tau, sizeof(float), 0, cudaMemcpyHostToDevice));
	cudaErrorCheck(cudaMemcpyToSymbol(wall_velocity_d, wall_velocity, D_LBM*sizeof(float), 0, cudaMemcpyHostToDevice));

	cudaErrorCheck(cudaMemcpyToSymbol(collide_field_d, collide_field_dd, sizeof(*collide_field_dd), 0, cudaMemcpyHostToDevice));
	cudaErrorCheck(cudaMemcpyToSymbol(stream_field_d, stream_field_dd, sizeof(*stream_field_dd), 0, cudaMemcpyHostToDevice));

	/* define grid structure */
	dim3 block(BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE);
	dim3 grid((xlength+2+block.x-1)/block.x, (xlength+2+block.y-1)/block.y, (xlength+2+block.z-1)/block.z);

	mlups_time = clock();

	/* perform streaming */
	DoStreaming<<<grid,block>>>();
	cudaErrorCheck(cudaDeviceSynchronize());
	cudaErrorCheck(cudaPeekAtLastError());

	/* Perform the swapping of collide and stream fields */
	DoSwap<<<1,1>>>();
	cudaErrorCheck(cudaDeviceSynchronize());
	cudaErrorCheck(cudaPeekAtLastError());

	/* perform collision */
	DoCollision<<<grid,block>>>();
	cudaErrorCheck(cudaDeviceSynchronize());
	cudaErrorCheck(cudaPeekAtLastError());

	/* perform boundary treatment */
	TreatBoundary<<<grid,block>>>(*flag_field_d);
	/* synchronize so that the timing below covers the kernel */
	cudaErrorCheck(cudaDeviceSynchronize());
	cudaErrorCheck(cudaPeekAtLastError());

	mlups_time = clock()-mlups_time;

	*mlups_sum += num_cells/(MLUPS_EXPONENT*(float)mlups_time/CLOCKS_PER_SEC);
	if(VERBOSE)
		printf("MLUPS: %f\n", num_cells/(MLUPS_EXPONENT*(float)mlups_time/CLOCKS_PER_SEC));

	/* copy data back to host */
	cudaErrorCheck(cudaMemcpyFromSymbol(collide_field_dd, collide_field_d, sizeof(*collide_field_dd), 0, cudaMemcpyDeviceToHost));
	cudaErrorCheck(cudaMemcpyFromSymbol(stream_field_dd, stream_field_d, sizeof(*stream_field_dd), 0, cudaMemcpyDeviceToHost));
}
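
DoSwap runs as a single-thread kernel because collide_field_d and stream_field_d are __device__ globals that host code cannot dereference directly. An equivalent host-side swap (a sketch using the same symbols, not the project's chosen approach) would exchange the pointers with symbol copies instead of a kernel launch:

/* Read both device symbols, then write them back exchanged. */
float *tmp_collide, *tmp_stream;
cudaErrorCheck(cudaMemcpyFromSymbol(&tmp_collide, collide_field_d, sizeof(float*)));
cudaErrorCheck(cudaMemcpyFromSymbol(&tmp_stream, stream_field_d, sizeof(float*)));
cudaErrorCheck(cudaMemcpyToSymbol(collide_field_d, &tmp_stream, sizeof(float*)));
cudaErrorCheck(cudaMemcpyToSymbol(stream_field_d, &tmp_collide, sizeof(float*)));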

 

cell_computation_gpu.cuh

#ifndef _CELL_COMPUTATION_GPU_CUH_
#define _CELL_COMPUTATION_GPU_CUH_


/**
 * Computes the density from the particle distribution functions stored at currentCell.
 * currentCell thus denotes the address of the first particle distribution function of the
 * respective cell. The result is stored in density.
 */
__device__ void ComputeDensityGpu(float *current_cell, float *density){
	int i;
	*density=0;
	/* constant trip count: unrolled by the compiler */
	#pragma unroll
	for (i = 0; i < Q_LBM; i++)
		*density += current_cell[i];
}


/**
 * Computes the velocity within currentCell and stores the result in velocity
 */
__device__ void ComputeVelocityGpu(float *current_cell, float *density, float *velocity){
	int i;
	velocity[0]=0;
	velocity[1]=0;
	velocity[2]=0;
	/* first moment of the distributions, normalized by density */
	#pragma unroll
	for (i = 0; i < Q_LBM; i++){
		velocity[0]+=current_cell[i]*LATTICE_VELOCITIES_D[i][0];
		velocity[1]+=current_cell[i]*LATTICE_VELOCITIES_D[i][1];
		velocity[2]+=current_cell[i]*LATTICE_VELOCITIES_D[i][2];
	}
	velocity[0]/=*density;
	velocity[1]/=*density;
	velocity[2]/=*density;
}


/**
 * Computes the equilibrium distributions for all particle distribution functions of one
 * cell from density and velocity and stores the results in feq.
 */
__device__ void ComputeFeqGpu(float *density, float *velocity, float *feq){
	int i;
	float s1, s2, s3=velocity[0]*velocity[0]+velocity[1]*velocity[1]+velocity[2]*velocity[2];
	/* s1 = e_i . u, s2 = (e_i . u)^2, s3 = u . u */
	#pragma unroll
	for (i = 0; i < Q_LBM; i++){
		s1 = LATTICE_VELOCITIES_D[i][0]*velocity[0]+LATTICE_VELOCITIES_D[i][1]*velocity[1]+
				LATTICE_VELOCITIES_D[i][2]*velocity[2];
		s2 = s1*s1;
		feq[i]=LATTICE_WEIGHTS_D[i]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
	}
}


/**
 * Finds an inverse probability distribution for the provided lattice index.
 */
__device__ int InvGpu(int i){
    return (Q_LBM-1)-i;
}

#endif

lbm_model_gpu.cuh

#ifndef _LBM_MODEL_GPU_CUH_
#define _LBM_MODEL_GPU_CUH_

/* 8^3 = 512 threads per 3D block (hardware limit: 512 for CC < 2.x, 1024 for CC >= 2.x) */
#define BLOCK_SIZE 8

/* Row indices into TREAT_BOUNDARY_INDECES below */
#define LEFT_BOUNDARY_IDX 0
#define RIGHT_BOUNDARY_IDX 1
#define BOTTOM_BOUNDARY_IDX 2
#define TOP_BOUNDARY_IDX 3
#define BACK_BOUNDARY_IDX 4
#define FRONT_BOUNDARY_IDX 5


__constant__ static const int LATTICE_VELOCITIES_D[19][3] = {
    {0,-1,-1},{-1,0,-1},{0,0,-1},{1,0,-1},{0,1,-1},{-1,-1,0},{0,-1,0},{1,-1,0},
    {-1,0,0}, {0,0,0},  {1,0,0}, {-1,1,0},{0,1,0}, {1,1,0},  {0,-1,1},{-1,0,1},
    {0,0,1},  {1,0,1},  {0,1,1}
};

__constant__ static const float LATTICE_WEIGHTS_D[19] = {
    1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0,
    2.0/36.0, 12.0/36.0,2.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0, 1.0/36.0,
    2.0/36.0, 1.0/36.0, 1.0/36.0
};

/**
 * Indices of the probability distributions that must be copied for each
 * boundary face during the boundary treatment step.
 */
__constant__ static const int TREAT_BOUNDARY_INDECES[6][5] = {
	{3,7,10,13,17},
	{1,5,8,11,15},
	{4,11,12,13,18},
	{0,5,6,7,14},
	{14,15,16,17,18},
	{0,1,2,3,4}
};

#endif
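
With BLOCK_SIZE 8 each block holds 8^3 = 512 threads, the per-block maximum on compute capability < 2.x hardware. Worked example: for xlength = 30 the extended domain is 32^3, so DoIteration launches a 4x4x4 grid; for xlength = 31 the domain is 33^3 and the rounded-up grid is 5x5x5, which is why every kernel re-checks its coordinates against the domain bounds. The rounding is the usual ceiling division (the same expression DoIteration uses):

/* ceil(domain/block) blocks per axis; surplus threads are masked
   out by the bounds checks inside the kernels */
int domain = xlength + 2;
dim3 block(BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE);
dim3 grid((domain + block.x - 1) / block.x,
		(domain + block.y - 1) / block.y,
		(domain + block.z - 1) / block.z);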

CPU:

initialization.h

#ifndef _INITIALIZATION_H_
#define _INITIALIZATION_H_


/**
 * Reads the parameters for the lid driven cavity scenario from a configuration file
 */
void ReadParameters(int *xlength, float *tau, float *velocity_wall, int *timesteps,
		int *timesteps_per_plotting, int argc, char *argv[], int *gpu_enabled);


/**
 * Initializes the particle distribution functions and the flag field
 */
void InitialiseFields(float *collide_field, float *stream_field,int *flag_field, int xlength,
		int gpu_enabled);

#endif

 

initialization.c

#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

#include "initialization.h"
#include "lbm_model.h"
#include "utils.h"


/**
 * Prints help message that includes program usage instructions and control flags.
 */
void PrintHelpMessage(){
	printf("List of control flags:\n");
	printf("\t -gpu             all computations are to be performed on gpu\n");
	printf("\t -cpu             all computations are to be performed on cpu\n");
	printf("\t -help            prints this help message\n");
	printf("NOTE: Control flags are mutually exclusive and only one flag at a time is allowed\n");
	printf("Example program usage:\n");
	printf("\t./lbm-sim ./data/lbm.dat -gpu\n");
	exit(1);
}


void ReadParameters(int *xlength, float *tau, float *velocity_wall, int *timesteps,
		int *timesteps_per_plotting, int argc, char *argv[], int *gpu_enabled){
    float *velocity_wall_1, *velocity_wall_2, *velocity_wall_3;
    /* printing out help message */
    if(argc<3) PrintHelpMessage();
    if(!strcmp(argv[1], "-help") || !strcmp(argv[2], "-help")) PrintHelpMessage();

    /* checking parameters */
    if(access(argv[1], R_OK) != 0)
        ERROR("Provided configuration file path either doesn't exist or can not be read.");
    if(!strcmp(argv[2], "-gpu")) *gpu_enabled=1; else *gpu_enabled=0;

    /* reading parameters */
    READ_FLOAT(argv[1], *tau);

    velocity_wall_1=&velocity_wall[0];
    velocity_wall_2=&velocity_wall[1];
    velocity_wall_3=&velocity_wall[2];

    READ_FLOAT(argv[1], *velocity_wall_1);
    READ_FLOAT(argv[1], *velocity_wall_2);
    READ_FLOAT(argv[1], *velocity_wall_3);
    
    READ_INT(argv[1], *xlength);
    READ_INT(argv[1], *timesteps);
    READ_INT(argv[1], *timesteps_per_plotting);
}


void InitialiseFields(float *collide_field, float *stream_field, int *flag_field, int xlength,
		int gpu_enabled){
    int x,y,z,i,step=xlength+2;
    
    /* NOTE: We use y=xlength+1 as the moving wall */
    for(x=0;x<step;x++){
        for(y=0;y<step;y++){
            for(z=0;z<step;z++){
                /* Initializing flags */
            	if(gpu_enabled) {
					if(x==0 && y==0 && z==0)
						flag_field[x+y*step+z*step*step] = LEFT_BOTTOM_BACK_CORNER;
					else if(x==0 && y==0 && z==step-1)
						flag_field[x+y*step+z*step*step] = LEFT_BOTTOM_FRONT_CORNER;
					else if(x==0 && y==step-1 && z==0)
						flag_field[x+y*step+z*step*step] = LEFT_UPPER_BACK_CORNER;
					else if(x==0 && y==step-1 && z==step-1)
						flag_field[x+y*step+z*step*step] = LEFT_UPPER_FRONT_CORNER;
					else if(x==step-1 && y==0 && z==0)
						flag_field[x+y*step+z*step*step] = RIGHT_BOTTOM_BACK_CORNER;
					else if(x==step-1 && y==0 && z==step-1)
						flag_field[x+y*step+z*step*step] = RIGHT_BOTTOM_FRONT_CORNER;
					else if(x==step-1 && y==step-1 && z==0)
						flag_field[x+y*step+z*step*step] = RIGHT_UPPER_BACK_CORNER;
					else if(x==step-1 && y==step-1 && z==step-1)
						flag_field[x+y*step+z*step*step] = RIGHT_UPPER_FRONT_CORNER;
					else if(x==0 && y==0)
						flag_field[x+y*step+z*step*step]=LEFT_BOTTOM_EDGE;
					else if(x==xlength+1 && y==0)
						flag_field[x+y*step+z*step*step]=RIGHT_BOTTOM_EDGE;
					else if(y==0 && z==0)
						flag_field[x+y*step+z*step*step]=BACK_BOTTOM_EDGE;
					else if(y==0 && z==xlength+1)
						flag_field[x+y*step+z*step*step]=FRONT_BOTTOM_EDGE;

					else if(x==0 && z==0)
						flag_field[x+y*step+z*step*step]=LEFT_BACK_EDGE;
					else if(x==0 && z==xlength+1)
						flag_field[x+y*step+z*step*step]=LEFT_FRONT_EDGE;
					else if(x==xlength+1 && z==0)
						flag_field[x+y*step+z*step*step]=RIGHT_BACK_EDGE;
					else if(x==xlength+1 && z==xlength+1)
						flag_field[x+y*step+z*step*step]=RIGHT_FRONT_EDGE;
					else if(x==0 && y==xlength+1)
						flag_field[x+y*step+z*step*step]=LEFT_UPPER_EDGE;
					else if(x==xlength+1 && y==xlength+1)
						flag_field[x+y*step+z*step*step]=RIGHT_UPPER_EDGE;
					else if(y==xlength+1 && z==0)
						flag_field[x+y*step+z*step*step]=BACK_UPPER_EDGE;
					else if(y==xlength+1 && z==xlength+1)
						flag_field[x+y*step+z*step*step]=FRONT_UPPER_EDGE;


					else if(y==step-1 && x!=0 && x!=step-1 && z!=0 && z!=step-1)
						flag_field[x+y*step+z*step*step] = TOP_BOUNDARY;
					else if(y==0 && x!=0 && x!=step-1 && z!=0 && z!=step-1)
						flag_field[x+y*step+z*step*step] = BOTTOM_BOUNDARY;
					else if (x==0 && y!=0 && y!=step-1 && z!=0 && z!=step-1)
						flag_field[x+y*step+z*step*step] = LEFT_BOUNDARY;
					else if (x==step-1 && y!=0 && y!=step-1 && z!=0 && z!=step-1)
						flag_field[x+y*step+z*step*step] = RIGHT_BOUNDARY;
					else if (z==0 && y!=0 && y!=step-1 && x!=0 && x!=step-1)
						flag_field[x+y*step+z*step*step] = BACK_BOUNDARY;
					else if (z==step-1 && y!=0 && y!=step-1 && x!=0 && x!=step-1)
						flag_field[x+y*step+z*step*step] = FRONT_BOUNDARY;
					else
						flag_field[x+y*step+z*step*step]=FLUID;
            	} else {
            		if(y == xlength+1)
						flag_field[x+y*step+z*step*step]=MOVING_WALL;
					else if(x == 0 || x == xlength+1 || y == 0 || z == xlength+1 || z == 0)
						flag_field[x+y*step+z*step*step]=NO_SLIP;
					else
						flag_field[x+y*step+z*step*step]=FLUID;
            	}
                
                /* Initializing distributions for stream and collide fields */
                for(i=0;i<Q_LBM;i++){
                    /*
                     * NOTE: The stream field is initialized to 0s because that
                     * helps to track down mistakes and has no impact on the
                     * computation further on.
                     */
                    stream_field[Q_LBM*(x+y*step+z*step*step)+i]=0;
                    collide_field[Q_LBM*(x+y*step+z*step*step)+i]=LATTICE_WEIGHTS[i];
                }
            }
        }
    }
}

 

streaming.h

#ifndef _STREAMING_H_
#define _STREAMING_H_

/**
 * Carries out the streaming step and writes the respective distribution functions from
 * collideField to streamField.
 */
void DoStreaming(float *collide_field, float *stream_field, int *flag_field, int xlength);

#endif

streaming.c

#include "streaming.h"
#include "lbm_model.h"


void DoStreaming(float *collide_field, float *stream_field, int *flag_field, int xlength){
    int x,nx,y,ny,z,nz,i,step=xlength+2;

    for(x=0;x<step;x++){
        for(y=0;y<step;y++){
            for(z=0;z<step;z++){
                if(flag_field[x+y*step+z*step*step]==FLUID){
                    for(i=0;i<Q_LBM;i++){
                        nx=x-LATTICE_VELOCITIES[i][0];
                        ny=y-LATTICE_VELOCITIES[i][1];
                        nz=z-LATTICE_VELOCITIES[i][2];
                        
                        stream_field[Q_LBM*(x+y*step+z*step*step)+i]=
                                collide_field[Q_LBM*(nx+ny*step+nz*step*step)+i];
                    }
                }
            }
        }
    }
}
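
The loop implements the pull form of streaming: each fluid cell gathers the post-collision value from the neighbour its i-th velocity points away from,

$$f_i(\mathbf{x},\, t+1) = f_i^{*}(\mathbf{x} - \mathbf{e}_i,\, t),$$

which is why the neighbour index is computed as x - LATTICE_VELOCITIES[i] rather than x + LATTICE_VELOCITIES[i].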

collision.h

#ifndef _COLLISION_H_
#define _COLLISION_H_

/**
 * Carries out the whole local collision process. Computes density and velocity and equilibrium
 * distributions. Carries out BGK update.
 */
void DoCollision(float *collide_field, int *flag_field, float tau, int xlength);

#endif

collision.c

#include "collision.h"
#include "cell_computation.h"
#include "lbm_model.h"
#include "utils.h"


/**
 * Computes the post-collision distribution functions according to the BGK update rule and
 * stores the results again at the same position.
 */
void ComputePostCollisionDistributions(float *current_cell, float tau, const float *const feq){
    int i;
    for(i=0;i<Q_LBM;i++){
        current_cell[i]=current_cell[i]-(current_cell[i]-feq[i])/tau;
        
        /* Probability distribution function cannot be less than 0 */
        if (current_cell[i] < 0)
            ERROR("Probability distribution function cannot be negative.");
    }
}


void DoCollision(float *collide_field, int *flag_field, float tau, int xlength){
    float density, velocity[3], feq[Q_LBM], *currentCell;
    int x,y,z,step=xlength+2;
    
    for(x=1;x<step-1;x++){
        for(y=1;y<step-1;y++){
            for(z=1;z<step-1;z++){
                currentCell=&collide_field[Q_LBM*(x+y*step+z*step*step)];
                
                ComputeDensity(currentCell,&density);
                ComputeVelocity(currentCell,&density,velocity);
                ComputeFeq(&density,velocity,feq);
                ComputePostCollisionDistributions(currentCell,tau,feq);
            }
        }
    }
}
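
Written out, the BGK update applied to every interior cell is

$$f_i \leftarrow f_i - \frac{f_i - f_i^{eq}}{\tau},
\qquad
f_i^{eq} = w_i\, \rho \left(1 + \frac{\mathbf{e}_i \cdot \mathbf{u}}{c_s^2} + \frac{(\mathbf{e}_i \cdot \mathbf{u})^2}{2 c_s^4} - \frac{\mathbf{u} \cdot \mathbf{u}}{2 c_s^2}\right),$$

with c_s² = 1/3; in ComputeFeq the terms e_i·u, (e_i·u)² and u·u appear as s1, s2 and s3, and C_S_POW2_INV and C_S_POW4_INV supply the inverse powers of c_s.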

 

boundary.h

#ifndef _BOUNDARY_H_
#define _BOUNDARY_H_


/**
 * Handles the boundaries in our simulation setup
 */
void TreatBoundary(float *collide_field, int* flag_field, float *wall_velocity, int xlength);

#endif

boundary.c

#include "boundary.h"
#include "lbm_model.h"
#include "cell_computation.h"

/**
 * Finds an inverse probability distribution for the provided lattice index.
 */
int inv(int i){
    return (Q_LBM-1)-i;
}


void TreatBoundary(float *collide_field, int* flag_field, float *wall_velocity, int xlength){
    int x,nx,y,ny,z,nz,i,step=xlength+2;
    float density,dot_prod;
    
    for(x=0;x<step;x++){
        for(y=0;y<step;y++){
            for(z=0;z<step;z++){
                if(flag_field[x+y*step+z*step*step]!=FLUID){
                    for(i=0;i<Q_LBM;i++){
                        nx=x+LATTICE_VELOCITIES[i][0];
                        ny=y+LATTICE_VELOCITIES[i][1];
                        nz=z+LATTICE_VELOCITIES[i][2];

                        /* We don't need the values outside of our extended domain */
                        if(0<nx && nx<step-1 && 0<ny && ny<step-1 && 0<nz && nz<step-1){
                            if (flag_field[x+y*step+z*step*step]==MOVING_WALL){
                                /* Compute density in the neighbour cell */
                                ComputeDensity(&collide_field[Q_LBM*(nx+ny*step+nz*step*step)],&density);
                                /* Compute dot product */
                                dot_prod=LATTICE_VELOCITIES[i][0]*wall_velocity[0]+
                                        LATTICE_VELOCITIES[i][1]*wall_velocity[1]+
                                        LATTICE_VELOCITIES[i][2]*wall_velocity[2];
                                /* Assign the boundary cell value */
                                collide_field[Q_LBM*(x+y*step+z*step*step)+i]=
                                        collide_field[Q_LBM*(nx+ny*step+nz*step*step)+inv(i)]+
                                        2*LATTICE_WEIGHTS[i]*density*C_S_POW2_INV*dot_prod;
                            }else if(flag_field[x+y*step+z*step*step]==NO_SLIP){
                                collide_field[Q_LBM*(x+y*step+z*step*step)+i]=
                                        collide_field[Q_LBM*(nx+ny*step+nz*step*step)+inv(i)];
                            }
                        }
                    }
                }
            }
        }
    }
}

 

cell_computation.h

#ifndef _CELL_COMPUTATIONS_H_
#define _CELL_COMPUTATIONS_H_

/**
 * Computes the density from the particle distribution functions stored at currentCell.
 * currentCell thus denotes the address of the first particle distribution function of the
 * respective cell. The result is stored in density.
 */
void ComputeDensity(const float *const current_cell, float *density);

/**
 * Computes the velocity within currentCell and stores the result in velocity
 */
void ComputeVelocity(const float * const current_cell, const float * const density, float *velocity);

/**
 * Computes the equilibrium distributions for all particle distribution functions of one
 * cell from density and velocity and stores the results in feq.
 */
void ComputeFeq(const float * const density, const float * const velocity, float *feq);

#endif

cell_computation.c

#include "cell_computation.h"
#include "lbm_model.h"
#include "utils.h"

void ComputeDensity(const float *const current_cell, float *density){
    int i; *density=0;
    for(i=0;i<Q_LBM;i++)
        *density+=current_cell[i];

    /* Density should stay close to unity (ρ ~ 1) */
    if(fabs(*density-1.0)>EPS)
        ERROR("Density deviation exceeds the error tolerance.");
}


void ComputeVelocity(const float * const current_cell, const float * const density, float *velocity){
    int i;
    /* NOTE: indices are hard-coded to improve performance */
    velocity[0]=0;
    velocity[1]=0;
    velocity[2]=0;
    
    for(i=0;i<Q_LBM;i++){
        velocity[0]+=current_cell[i]*LATTICE_VELOCITIES[i][0];
        velocity[1]+=current_cell[i]*LATTICE_VELOCITIES[i][1];
        velocity[2]+=current_cell[i]*LATTICE_VELOCITIES[i][2];
    }
    
    velocity[0]/=*density;
    velocity[1]/=*density;
    velocity[2]/=*density;
}


void ComputeFeq(const float * const density, const float * const velocity, float *feq){
    int i;
    float s1, s2, s3; /* summands */
    /* NOTE: indices are hard-coded to improve performance */
    for(i=0;i<Q_LBM;i++){
        s1 = LATTICE_VELOCITIES[i][0]*velocity[0]+LATTICE_VELOCITIES[i][1]*velocity[1]+
                LATTICE_VELOCITIES[i][2]*velocity[2];
        s2 = s1*s1;
        s3 = velocity[0]*velocity[0]+velocity[1]*velocity[1]+velocity[2]*velocity[2];
        
        feq[i]=LATTICE_WEIGHTS[i]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
        
        /* Probability distribution function can not be less than 0 */
        if (feq[i] < 0)
            ERROR("Probability distribution function can not be negative.");
    }
}

visualization.h

#ifndef _VISUALIZATION_H_
#define _VISUALIZATION_H_


/**
 * Writes the density and velocity field (derived from the distributions in collideField)
 * to a file determined by 'filename' and timestep 't'. You can re-use parts of the code
 * from visual.c (VTK output for Navier-Stokes solver) and modify it for 3D datasets.
 */
void WriteVtkOutput(const float * const collide_field, const int * const flag_field,
        const char * filename, unsigned int t, int xlength);


/**
 * Writes the provided stream or collision field to a file with specified identifiers.
 */
void WriteField(const float * const field, const char * filename, unsigned int t, const int xlength,
		const int rank);


/**
 * Writes the provided flag field to a file with specified identifiers.
 */
void writeFlagField(const int * const flagField, const char * filename, const int xlength, const int rank);


/**
 * Prints the point-by-point values of the provided field (4D) to stdout.
 * @param field
 *          linearized 4D array, with (x,y,z,i)=Q*(x+y*(ncell+2)+z*(ncell+2)*(ncell+2))+i
 * @param ncell
 *          number of inner cells, i.e. those present before the boundary layer is added
 */
void PrintField(float *field, int ncell);

#endif

visualization.c

#include <stdio.h>

#include "lbm_model.h"
#include "visualization.h"
#include "cell_computation.h"
#include "utils.h"


void write_vtkHeader( FILE *fp, int xlength) {
    if( fp == NULL ){
        char szBuff[80];
        sprintf( szBuff, "Null pointer in write_vtkHeader" );
        ERROR( szBuff );
        return;
    }

    fprintf(fp,"# vtk DataFile Version 2.0\n");
    fprintf(fp,"generated by CFD-lab course output (written by Denys Korzh, Denys Sobchyshak, Ozel Christo Annamanthadoo  \n");
    fprintf(fp,"ASCII\n");
    fprintf(fp,"\n");
    fprintf(fp,"DATASET STRUCTURED_GRID\n");
    fprintf(fp,"DIMENSIONS  %i %i %i \n", xlength, xlength, xlength);
    fprintf(fp,"POINTS %i float\n", xlength*xlength*xlength );
    fprintf(fp,"\n");
}


void write_vtkPointCoordinates( FILE *fp, int xlength) {
    float originX = 0.0, originY = 0.0, originZ = 0.0;
    int i = 0, j = 0, k = 0;

    for(k = 0; k < xlength; k++)
        for(j = 0; j < xlength; j++)
            for(i = 0; i < xlength; i++)
                fprintf(fp, "%f %f %f\n", originX+(i*1.0/xlength), originY+(j*1.0/xlength),
                        originZ+(k*1.0/xlength) );
}


void WriteVtkOutput(const float * const collideField, const int * const flagField, const char * filename, unsigned int t, int xlength) {
    int i, j, k, len = xlength+2; /* lexicographic order "[ Q * ( z*len*len + y*len + x) + i ]" */
    float velocity[3], density;

    char szFileName[80];
    FILE *fp=NULL;
    sprintf( szFileName, "%s.%i.vtk", filename, t );
    fp = fopen( szFileName, "w");
    if( fp == NULL ){
        char szBuff[80];
        sprintf( szBuff, "Failed to open %s", szFileName );
        ERROR( szBuff );
        return;
    }

    write_vtkHeader( fp, xlength);
    write_vtkPointCoordinates( fp, xlength);

    fprintf(fp,"POINT_DATA %i \n", xlength*xlength*xlength );
    fprintf(fp,"\n");
    fprintf(fp, "VECTORS velocity float\n");
    for(k = 1; k < xlength+1; k++) {
        for(j = 1; j < xlength+1; j++) {
            for(i = 1; i < xlength+1; i++) {
                ComputeDensity (&collideField[ Q_LBM * ( k*len*len + j*len + i) ], &density);
                ComputeVelocity(&collideField[ Q_LBM * ( k*len*len + j*len + i) ], &density, velocity);
                fprintf(fp, "%f %f %f\n",  velocity[0], velocity[1], velocity[2]);
            }
        }
    }

    fprintf(fp,"\n");
    fprintf(fp, "SCALARS density float 1 \n");
    fprintf(fp, "LOOKUP_TABLE default \n");
    for(k = 1; k < xlength+1; k++) {
        for(j = 1; j < xlength+1; j++) {
            for(i = 1; i < xlength+1; i++) {
                ComputeDensity (&collideField[ Q_LBM * ( k*len*len + j*len + i) ], &density);
                fprintf(fp, "%f\n",  density);
            }
        }
    }

    if(fclose(fp)){
        char szBuff[80];
        sprintf( szBuff, "Failed to close %s", szFileName );
        ERROR( szBuff );
    }
}


void PrintField(float *field, int ncell){
    int x,y,z,i,step=ncell+2;

    for(x=0;x<step;x++){
        for(y=0;y<step;y++){
            for(z=0;z<step;z++){
                printf("(%d,%d,%d): ",x,y,z);
                for(i=0;i<Q_LBM;i++)
                    printf("%f ",field[Q_LBM*(x+y*step+z*step*step)+i]);
                printf("\n");
            }
        }
    }
}


void WriteField(const float * const field, const char * filename, unsigned int t, const int xlength, const int rank) {
	int x,y,z,i, stepX=xlength+2,stepY=xlength+2,stepZ=xlength+2;

	char szFileName[80];
	FILE *fp=NULL;
	sprintf( szFileName, "%s-rank%i.%i.out", filename, rank, t );
	fp = fopen( szFileName, "w");
	if( fp == NULL ){
		char szBuff[80];
		sprintf( szBuff, "Failed to open %s", szFileName );
		ERROR( szBuff );
		return;
	}

	for(z=0;z<stepZ;z++){
		for(x=0;x<stepX;x++){
			for(y=0;y<stepY;y++){
				fprintf(fp, "(%d,%d,%d): ",x,y,z);
					for(i=0;i<Q_LBM;i++)
					  fprintf(fp, "%f ",field[Q_LBM*(x+y*stepX+z*stepX*stepY)+i]);
				fprintf(fp, "\n");
			}
		}
	}

	if(fclose(fp)){
		char szBuff[80];
		sprintf( szBuff, "Failed to close %s", szFileName );
		ERROR( szBuff );
	}
}


void writeFlagField(const int * const flagField, const char * filename, const int xlength, const int rank) {
  int x,y,z, stepX=xlength+2,stepY=xlength+2,stepZ=xlength+2;

  char szFileName[80];
  FILE *fp=NULL;
  sprintf( szFileName, "%s-rank%i.out", filename, rank );
  fp = fopen( szFileName, "w");
  if( fp == NULL ){
    char szBuff[80];
    sprintf( szBuff, "Failed to open %s", szFileName );
    ERROR( szBuff );
    return;
  }

  fprintf(fp, "(y,z)\\x  ");
  for(x=0; x<stepX; x++) {
      fprintf(fp, "%2i ",x);
  }
  fprintf(fp, "\n-------");
  for(x=0; x<stepX; x++) {
      fprintf(fp, "---");
  }
  fprintf(fp, "\n");

  for(z=0;z<stepZ;z++){
      for(y=stepY-1;y>=0;y--){
          fprintf(fp, "(%2d,%2d): ",y,z);
          for(x=0;x<stepX;x++){
              fprintf(fp, "%2i ",flagField[x+y*stepX+z*stepX*stepY]);
          }
          fprintf(fp, "\n");
      }
      fprintf(fp, "\n");
  }

  if(fclose(fp)){
    char szBuff[80];
    sprintf( szBuff, "Failed to close %s", szFileName );
    ERROR( szBuff );
  }
}

 

LBM GPU Gems 2005 – Flow Simulation with Complex Boundaries

Chapter 47. Flow Simulation with Complex Boundaries

Wei Li
Siemens Corporate Research

Zhe Fan
Stony Brook University

Xiaoming Wei
Stony Brook University

Arie Kaufman
Stony Brook University

47.1 Introduction

Physically based fluid flow simulation is commonplace in many prediction and scientific computation problems. It also greatly improves the visual fidelity in computer graphics applications. However, an accurate simulation is computationally expensive. In this chapter, we present a physically plausible yet fast fluid flow simulation approach based on the Lattice Boltzmann Method (LBM) (Chen and Doolen 1998). Figure 47-1 shows the results of fluid simulation around different obstacles. In Figure 47-1a, the obstacle is a static vase, while in Figure 47-1b, it is a sphere that moves toward the top right. Figure 47-1c and Figure 47-1d are two snapshots showing a jellyfish swimming from right to left. Note that when the jellyfish deforms its body, the liquid-solid boundary also deforms. To visualize the flow field, we inject a slice of colored particles and advect them according to the velocity field of the flow. All the computations (the simulation, the generation of the boundaries, the advection, and the rendering of the particles) are executed on the GPU in real time.

Figure 47-1. Flow Fields Simulated on the GPU Using Different Obstacles

The LBM model is composed of many nodes arranged on Cartesian grids, usually called a lattice. Each node is associated with several attributes. At discrete time steps, the attributes are updated to determine the dynamics of a flow field. The basic computation of the LBM naturally fits the stream-processing framework of the GPU, hence it is not very difficult to express the LBM equations using GPU programs. However, a straightforward translation results in a very slow implementation. In this chapter, we present various algorithm-level and machine-level optimizations. With the proper optimizations, a GPU-based flow simulation can be an order of magnitude faster than its CPU counterpart.

One of the advantages of the LBM is that it is relatively easy to handle complex, moving, and deformable boundaries. However, the boundaries must be voxelized to discrete boundary nodes that are aligned with the LBM lattice. Voxelization of moving and deforming boundaries primarily involves finding intersections of the boundary polygons with the LBM lattice. Voxelization must be repeated whenever the boundaries change. If CPU-based voxelization is used with GPU-based LBM, the transfer of boundary nodes from main memory to graphics memory becomes a bottleneck. To address these problems, we propose a fast voxelization algorithm on the GPU and use it to model the interaction of complex obstacles and even living objects with the flow.

47.2 The Lattice Boltzmann Method

Figure 47-2 shows a 2D model with the nodes represented as green dots. The figure shows only 12 nodes, but in practice, we need thousands or even millions of nodes to generate a delicate flow field. Each node is connected to its direct neighbors with vectors, denoted as e_qi and shown as black arrows in Figure 47-2. There is also an e_qi pointing to the node itself (not shown in the figure). The model in Figure 47-2 is called D2Q9, because each node has 9 e_qi vectors. Each node is associated with various attributes, such as packet distribution f_qi, density ρ, and velocity u. We don't need to worry about their physical meanings. All we need to know is that densities and velocities are derived from packet distributions, and velocities are the output that we need for our application. Interested readers may find more details about the LBM in the literature, such as Chen and Doolen 1998 and Wei et al. 2004. For each node, there is one packet distribution for every e_qi vector. Therefore, in this 2D model, each node has 9 packet distributions.
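
For concreteness, the nine e_qi vectors of a D2Q9 model can be written out directly; the following array is a standard choice (the names are illustrative, not taken from the chapter's code):

/* D2Q9 lattice velocities: one rest direction, four axis-aligned
   neighbors, and four diagonal neighbors. */
static const int E_D2Q9[9][2] = {
    { 0, 0},
    { 1, 0}, {-1, 0}, { 0, 1}, { 0,-1},
    { 1, 1}, {-1,-1}, { 1,-1}, {-1, 1}
};

/* One packet distribution f_qi per node and per direction, e.g.:
   float f[HEIGHT][WIDTH][9]; */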

Figure 47-2. A 2D LBM Model

We are most interested in 3D flow. The 3D model that we use in this chapter is called D3Q19 because it has 19 packet distributions for each node. These packet distributions account for most of the memory requirements of the LBM model. All the attributes are updated at every time step, using the values of the previous step. Every packet distribution moves along its e_qi vector to the neighbor node and replaces the packet distribution of the neighbor node, except the one pointing to the node itself, as indicated by the blue arrows in Figure 47-2. This is called propagation or streaming.

47.3 GPU-Based LBM

47.3.1 Algorithm Overview

To compute the LBM equations on graphics hardware, we divide the LBM lattice and group the packet distributions f_qi into arrays according to their velocity vectors e_qi. All packet distributions with the same velocity vector are grouped into an array, maintaining the layout of the original lattice. Figure 47-3 shows the division of a 2D model into a collection of arrays (one per velocity vector e_qi). The division of a 3D model is similar. We store the arrays as 2D textures. For a 2D model, all such arrays are naturally 2D; for a 3D model, each array forms a volume. The volume is treated as a set of slices and is stored as a stack of 2D textures or a large tiled 2D texture. All the other variables are stored similarly in 2D textures. To update these values at each time step, we render quadrilaterals mapped with those textures, so that a fragment program running at each texel computes the LBM equations.

Figure 47-3. Division of a D2Q9 Model According to Its Velocity Directions

Figure 47-4 is a diagram of the data flow of the LBM computation on the GPU; green boxes represent the textures storing lattice properties, and blue boxes represent the operations. An LBM update iteration starts with the packet distribution textures as inputs. Density and velocity textures are then dynamically generated from the distribution textures. Next, intermediate distributions, called equilibrium distributions, are obtained from the densities and velocities. New distributions are then computed from the input distributions and the equilibrium distributions according to the collision and the streaming equations. Finally, the boundary and outflow conditions are used to update the distribution textures. The updated distribution textures are then used as inputs for the next simulation step. Readers can find the details of these computations in Wei et al. 2004.

Figure 47-4. Flow Chart of LBM Computation on the GPU

47.3.2 Packing

During the simulation, textures are updated dynamically at every step by copying from or binding to the frame buffer (or an off-screen buffer). In current graphics hardware, it is most efficient to use 8-bit-per-component RGBA (or BGRA) textures. Each RGBA texel has four channels, so it can store up to four scalars or a vector with up to four components.

We pack multiple variables into each texel. For example, four packet distributions, fqi , from different directions are stored in a single RGBA texel. It is important to pack those variables involved in the same LBM equations into the same texture or even the same texel, if possible, in order to reduce the number of texture fetches. This also improves data locality and texture-cache coherence.

Table 47-1 lists the contents of the textures packed with the variables of the D3Q19 model, including densities, velocities, and packet distributions. In texture u, the three velocity components v_x, v_y, and v_z are stored in the RGB channels, while the density is stored in the alpha channel. The rows in Table 47-1 for textures f0 through f4 show the packing patterns of the packet distributions f_qi. The distribution in the direction (x, y, z) is written f(x,y,z). Note that we pack pairs of distributions of opposite directions into the same texture. There are two reasons for this. First, when we handle complex or moving boundaries, neighboring distributions at opposite directions are needed to evaluate the effects on a boundary link. Second, when programming the fragment processor, we typically need to pass the corresponding velocity vector e_qi as an argument of the fragment program. When opposite distributions are packed together, just two e_qi's are needed, instead of four, and the other two are easily inferred.

Table 47-1. Packed LBM Variables of the D3Q19 Model

Texture   R            G             B             A
u         v_x          v_y           v_z           ρ (density)
f0        f(1,0,0)     f(–1,0,0)     f(0,1,0)      f(0,–1,0)
f1        f(1,1,0)     f(–1,–1,0)    f(1,–1,0)     f(–1,1,0)
f2        f(1,0,1)     f(–1,0,–1)    f(1,0,–1)     f(–1,0,1)
f3        f(0,1,1)     f(0,–1,–1)    f(0,1,–1)     f(0,–1,1)
f4        f(0,0,1)     f(0,0,–1)     f(0,0,0)      unused
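
In C terms, the packing of Table 47-1 amounts to one RGBA texel per node and per texture row. A minimal sketch, with hypothetical names and with each flat volume stored as a linear array:

#define FLAT_W 512   /* flat-volume width in texels (hypothetical) */
#define FLAT_H 512   /* flat-volume height in texels (hypothetical) */

/* One RGBA texel: four packed scalars. */
typedef struct { float r, g, b, a; } Texel;

/* Per Table 47-1: u_tex packs (v_x, v_y, v_z, density); each f_tex[n]
   packs four distributions, with opposite directions sharing a texel,
   so a fragment program needs only two e_qi vectors per texture (the
   other two are their negations). */
Texel u_tex[FLAT_W * FLAT_H];
Texel f_tex[5][FLAT_W * FLAT_H];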

Instead of using a stack of 2D textures to represent a volume, we tile the slices into a large 2D texture that can be considered a "flat" volume. Similar approaches have been reported in the literature, for example, in Harris et al. 2003. One advantage of the flat volume is the reduced amount of texture switching. Using the flat volume is critical in two stages of our algorithms: particle advection and boundary node generation. The conversion from the volume coordinates (x, y, z) to the coordinates (u, v) of the flattened texture is as follows:

Equation 1

u = x + (z mod d) * W
v = y + floor(z / d) * H

where W and H are dimensions of each slice in the volume, and every set of d slices is tiled in a row along the x direction in the flat volume. The computation of Equation 1 can be done either in the fragment program or via a 1D texture lookup.
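
As a CPU-side sketch of Equation 1 (assuming the tiling just described; the helper name is hypothetical):

/* Map volume coordinates (x,y,z) to flat-volume texel coordinates (u,v).
   W, H are the slice dimensions; d slices are tiled per row. */
void VolumeToFlat(int x, int y, int z, int W, int H, int d, int *u, int *v) {
    *u = x + (z % d) * W;
    *v = y + (z / d) * H;
}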

47.3.3 Streaming

Recall that each packet distribution with nonzero velocity propagates to a neighbor node at every time step. On modern GPUs, the texture unit can fetch texels at arbitrary positions indicated by texture coordinates computed by the fragment program. If a distribution f_qi is propagated along vector e_qi, we simply fetch from the distribution texture at the position of the current node minus e_qi. Because the four channels are packed with four distributions with different velocity vectors, four fetches are needed for each fragment.

Figure 47-5 shows an example of the propagation of distribution texture f1, which is packed with f(1,1,0), f(–1,–1,0), f(1,–1,0), and f(–1,1,0). The fragment program fetches texels by adding the negative values of the velocity directions to the texture coordinates of the current fragment and extracting the proper color component.

Figure 47-5. Streaming of Packet Distribution

To propagate using the flat volume, for each channel we can add the 3D position of the fragment to the negative of the corresponding e_qi, then convert it to the texture coordinate of the flat volume according to Equation 1 before fetching. However, this requires executing Equation 1 four times per fragment. We can push the coordinate conversion to the vertex level, because for each channel inside a slice, the velocity vectors are the same. We can either assign each vertex of the proxy quad four texture coordinates containing the converted values or generate the texture coordinates with a vertex program.
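
The CPU streaming code (streaming.c above) performs the same gather for an array-of-structures layout. For the packed, flat-volume version, a per-texel sketch might look like the following, reusing the hypothetical Texel struct and VolumeToFlat helper sketched earlier; it handles texture f1 only and assumes an interior node, ignoring slice and lattice borders:

/* Pull-streaming for one texel of packed distribution texture f1.
   Per Table 47-1, f1 packs r=f(1,1,0), g=f(-1,-1,0), b=f(1,-1,0),
   a=f(-1,1,0); each channel fetches from (node - e_qi). */
void StreamTexelF1(const Texel *src, Texel *dst, int x, int y, int z,
                   int W, int H, int d) {
    int u, v, pitch = W * d;                         /* flat-texture width */
    VolumeToFlat(x - 1, y - 1, z, W, H, d, &u, &v);  /* e = ( 1, 1, 0) */
    dst->r = src[v * pitch + u].r;
    VolumeToFlat(x + 1, y + 1, z, W, H, d, &u, &v);  /* e = (-1,-1, 0) */
    dst->g = src[v * pitch + u].g;
    VolumeToFlat(x - 1, y + 1, z, W, H, d, &u, &v);  /* e = ( 1,-1, 0) */
    dst->b = src[v * pitch + u].b;
    VolumeToFlat(x + 1, y - 1, z, W, H, d, &u, &v);  /* e = (-1, 1, 0) */
    dst->a = src[v * pitch + u].a;
}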

47.4 GPU-Based Boundary Handling

The previous computations apply only to internal nodes, away from any obstacle boundaries. To make the simulation work properly, and to make the flow interact with obstacles, we need to consider boundary conditions. To handle an obstacle boundary, we need to compute the intersections of the boundary surface with all the LBM lattice links. For static obstacles, the intersections can be precomputed, whereas for moving or deformable boundaries, the intersection positions change dynamically. The boundary description can be either continuous, such as a polygonal mesh or a higher-order surface, or discrete, such as a voxel volume. Regardless, the handling of the boundary conditions requires discrete boundary nodes to be aligned with the LBM lattice. Even if the boundary representation is already volumetric, it has to be revoxelized whenever it moves or deforms. One solution is to compute the intersection and voxelization on the CPU and then transfer the computed volumetric boundary information from the main memory to the graphics memory. Unfortunately, both the computation and the data transfer are too time-consuming for interactive applications.

In the following sections, we first propose a general-purpose GPU-based voxelization algorithm that converts an arbitrary model into a Cartesian grid volume. Then we discuss the handling of three different boundary conditions, while focusing on arbitrary complex boundaries that can move and deform. The generation of the boundary nodes of arbitrary boundaries is performed by extending our general-purpose GPU-based voxelization.

47.4.1 GPU-Based Voxelization

An intuitive voxelization approach is the slicing method, which sets the near and far clip planes so that for each rendering pass, only the geometry falling into the slab between the two clip planes is rendered (Fang and Chen 2000). This creates one slice of the resulting volume. The clip planes are shifted to generate subsequent slices. Obviously, for this slicing method, the number of rendering passes is the same as the number of slices in the volume. In most cases, the boundaries are sparse in a volume. In other words, only a small percentage of voxels are intersected by the boundary surfaces. There is no need to voxelize the “empty” space that corresponds to nonboundary voxels.

Our GPU-based voxelization avoids this slicing. Instead, we adapt the idea of depth peeling (Everitt 2001) used for order-independent transparency, in which the depth layers in the scene are stripped away with successive rendering passes. In the first pass, the scene is rendered normally, and we obtain the layer of nearest fragments—or equivalently, voxels. From the second rendering pass, each fragment is compared with a depth texture obtained from the depth buffer of the previous pass. A fragment reaches the frame buffer only if its depth value is greater than that of the corresponding pixel in the depth texture, while the ordinary depth test is still enabled. Therefore, the second pass generates the second-nearest layer, and so on. The process continues until no fragment is farther away than the corresponding pixel in the depth texture. This condition is best determined by using a hardware occlusion query, which returns the number of pixels written to the frame buffer. The number of rendering passes of the depth-peeling algorithm is the number of layers plus one, which typically is significantly smaller than the number of slices.
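
The structure of the peeling loop can be sketched with the standard OpenGL 1.5 occlusion-query entry points; shader setup, depth-texture binding, and viewport shifting are elided, and RenderScene is a hypothetical helper, so this is an outline rather than the chapter's implementation:

GLuint query;
GLuint sample_count = 0;
glGenQueries(1, &query);
do {
    /* The fragment program compares against the depth texture of the
       previous pass (first pass: depth texture cleared). */
    glBeginQuery(GL_SAMPLES_PASSED, query);
    RenderScene();                           /* hypothetical helper */
    glEndQuery(GL_SAMPLES_PASSED);
    glGetQueryObjectuiv(query, GL_QUERY_RESULT, &sample_count);
    /* Copy the depth buffer into the depth texture and shift the
       viewport so the next layer is tiled beside this one. */
} while (sample_count > 0);
glDeleteQueries(1, &query);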

When rendering order-independent transparent objects, all the layer images are blended in depth order. In contrast, for voxelization, we want the layers to be separated, which is similar to a layered depth image (Shade et al. 1998). The pixels of the layered images are the attributes of the corresponding voxels. The first attribute is the 3D position, and the other attributes depend on the application. Assume that the maximum size along any of the major axes of the object being voxelized is D. We allocate an off-screen buffer with width and height equal to the maximum number of layers times D and the number of attributes times D, respectively. Then, between rendering passes, we translate the viewport so that the layer images do not overlap but are tiled as tightly as possible. We apply the peeling process three times; each time, the image plane is orthogonal to one of the major axes. That is, we perform the peeling from three orthogonal views to avoid missing voxels. As a result, some of the voxels may be rendered more than once. However, the replication does not affect the accuracy. The images containing the voxel attributes are then copied to a vertex array allocated in video memory (using OpenGL extensions such as ARB_pixel_buffer_object and ARB_vertex_buffer_object [NVIDIA 2004]). Note that different types of voxel attributes are copied to different locations inside the vertex array. We may avoid the copying if either render-to-vertex-array or vertex texture fetch is available. Figure 47-6 illustrates the process in 2D. The line segments are the boundary lines (surfaces in 3D). The small red, green, and blue boxes represent the voxels generated when the boundary lines are projected onto the layered off-screen buffers. Note that the red boundary will only result in two voxels if it is only rendered into layer X.

Figure 47-6. Voxelization Based on Depth Peeling

The vertex array is essentially an array of voxel positions and other attributes, which can generate all the boundary voxels for further processing. We use the vertex array because current GPU fragment programs cannot scatter (that is, the output location of a fragment is fixed). Therefore, we must rely on the transformation capability of vertex programs. We convert the vertex array into a flat volume by rendering each vertex as a point of size 1, so that each voxel has a footprint of 1 pixel. All the vertices of the array pass through a vertex program that translates each voxel properly according to its z coordinate, using equations similar to Equation 1. The frame buffers for the depth peeling are initialized with large numbers. If a pixel is not covered by any boundary voxels, then the corresponding vertex created from the pixel falls far away from the view frustum and is clipped.

47.4.2 Periodic Boundaries

For periodic boundaries, outgoing packet distributions wrap around and reenter the lattice from the opposite side. In practice, periodic boundaries are actually computed during the propagation of the packet distributions. If a periodic boundary face is orthogonal to the x or y axis, we call it an in-slice periodic boundary face, because a distribution on the face is copied to the opposite side of the lattice but stays inside the same xy slice. For in-slice periodic boundaries, we simply apply a modulo operation to the texture coordinates by the width or the height of each slice. If a periodic boundary face is perpendicular to the z axis, we call it an out-slice periodic boundary face, for which we need to copy distribution textures of one slice to another.

A naive implementation of the in-slice periodic boundary condition is to apply the modulo operation to the texture coordinates of all the distribution texels. However, this can be very slow. Therefore, as an optimization, we first propagate without the periodic boundary condition. Then we draw strips of single-texel width that cover only the boundary and apply the modulo operation. (See Chapter 34 of this book, "GPU Flow-Control Idioms," for more information on this technique.) In this way, the cost of the computation is negligible, because the periodic boundary nodes account for only a small percentage of the lattice.
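
The wrap itself is a modulo on the in-slice coordinate; a small sketch (hypothetical names):

/* Non-negative modulo: C's % can return negative values for negative
   operands, so fold the result back into [0, size). */
static int WrapCoord(int n, int size) {
    return (n % size + size) % size;
}

/* Example: source x coordinate for pull-streaming across a periodic
   face, where ex is the x component of e_qi and W the slice width:
   int src_x = WrapCoord(x - ex, W); */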

47.4.3 Outflow Boundaries

For outflow conditions, some packet distributions of the boundary nodes propagate outside of the lattice and can simply be discarded. However, these nodes also expect some packet distributions to propagate inward from fictitious nodes just outside the boundary. One solution is to copy the distribution of the internal nodes to those boundary nodes, according to Equations 2, 3, and 4, in which f^B(i,j,k) is the distribution of a boundary node in the velocity direction (i,j,k); f^I(i,j,k) is the distribution of the corresponding internal node; and i, j, k ∈ {–1, 0, 1}. For example, boundary nodes on slice 0 are copied from internal nodes two slices away, which are on slice 2.

For an outflow face orthogonal to the x axis:

Equation 2

f^B(i,j,k) = f^I(–i, j, k)

For an outflow face orthogonal to the y axis:

Equation 3

f^B(i,j,k) = f^I(i, –j, k)

For an outflow face orthogonal to the z axis:

Equation 4

f^B(i,j,k) = f^I(i, j, –k)

Similar to periodic boundaries, an outflow boundary condition is applied by drawing single-texel-wide stripes. Note that when packing the distributions, we guarantee that f(i,j,k) and f(–i,j,k) of the same node are in the same texture, as well as the pairs f(i,j,k) and f(i,–j,k), and f(i,j,k) and f(i,j,–k). You can easily verify these with Table 47-1. Therefore, each boundary distribution texture copies from only one distribution texture. To flip the distributions around the major axes, we use the swizzling operator to rearrange the order of the color elements.
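
Given the packing in Table 47-1, flipping around a major axis is a pure channel permutation. As an illustration derived from the table (not the chapter's code, and reusing the hypothetical Texel struct from above), copying texture f1 across an outflow face orthogonal to the x axis negates the x component of every direction, which swaps R with A and G with B:

/* Channel permutation for f1 under an x-flip (Equation 2).
   f1 packs r=f(1,1,0), g=f(-1,-1,0), b=f(1,-1,0), a=f(-1,1,0). */
Texel FlipXChannelsF1(Texel s) {
    Texel t;
    t.r = s.a;   /* f( 1, 1,0) <- f(-1, 1,0) */
    t.g = s.b;   /* f(-1,-1,0) <- f( 1,-1,0) */
    t.b = s.g;   /* f( 1,-1,0) <- f(-1,-1,0) */
    t.a = s.r;   /* f(-1, 1,0) <- f( 1, 1,0) */
    return t;
}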

47.4.4 Obstacle Boundaries

For obstacle boundaries, we adopt the improved bounce-back rule of Mei et al. 2000, which can handle deformable and moving boundaries. As indicated in Figure 47-7, all packet distributions with corresponding links that are intersected by a boundary are updated using the curved boundary equations rather than the standard LBM equations. For example, the link between the two nodes in Figure 47-7 is intersected by a boundary, hence the boundary rules are applied to the packet distributions marked in blue, f1 and f2. Again, we don't need to worry about the details of these equations; interested readers will find them in Mei et al. 2000. We just need to know that the boundary equations require two values in addition to the values of f1 and f2 at the previous time step. The first value is the exact intersection location, which is described by the ratio Δ, where Δ = t1/(t1 + t2) for f1, and Δ = t2/(t1 + t2) for f2. The second value is the moving speed at the intersection, u_w, which is useful for moving and deformable boundaries. Note that for each boundary link, the boundary condition affects two distributions, one on each side of the boundary. The two distributions are in opposite directions, but they are collinear with the link. We refer to them as boundary distributions.

Figure 47-7. Curved-Wall Boundary in LBM Lattice

To generate the boundary information, we first create a voxelization of the boundaries using the method described in the previous section. Besides the position of the voxels, we also need the wall velocity u_w, as well as the coefficients of the polygon plane equations, which will be used to compute Δ. That is, we need three attributes in total. To preserve accuracy, we treat these attributes as texture coordinates when rendering the vertex array in the next step.

In practice, we don’t explicitly generate the flat volume of the boundary voxels. Rather, we combine the generation with the computation of the boundary conditions, by rendering the boundary vertex array directly into the off-screen buffer containing the propagated new distributions and then applying the fragment program for complex boundary conditions. Note that in most cases only one node of each boundary link is covered by a voxel from the generated voxel array. However, we need each boundary node to receive a fragment so that the boundary distributions are updated. Therefore, for each packed distribution texture, we render the voxel array three times. In the first pass, they are rendered normally, covering those voxels in the generated voxel array. In the second pass, we first set the color mask so that only the R and G channels can be modified. Then we apply a translation to all the voxels using a vertex program. The translated offset is computed according to the following rule:

[Equation: translated offset in terms of flag1, flag2, and e_qi (image not reproduced)]

where flag1 = (pos_x, pos_y, pos_z, 1) · (A, B, C, D), flag2 = (A, B, C) · e_qi, and · represents a dot product. The coordinates (pos_x, pos_y, pos_z) represent the 3D position of the voxel without translation. The boundary surface inside the voxel is defined by Ax + By + Cz + D = 0, where (A, B, C) is a normalized plane normal pointing from the solid region to the liquid region. The value e_qi is the velocity vector associated with the distribution in the red channel. The third pass is similar to the second pass, except that this time the blue and alpha channels are modified, and e_qi is the velocity vector corresponding to the blue channel distribution.

In this way, all the boundary nodes are covered by the voxels. We then compute Δ at the beginning of the boundary-condition fragment program with the following equations:

Δ = –flag1 / flag2

The meanings of flag1 and flag2 are the same as before. Note that each channel computes its flag1, flag2, and Δ independently, with its own e_qi. A distribution is a boundary distribution only if, for the corresponding color channel, 1 ≥ Δ ≥ 0. If it is not a boundary distribution, the fragment program prevents modifying the corresponding color channel by assigning it the old value.
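
Since flag1 is the signed distance of the node from the boundary plane and flag2 is the projection of e_qi onto the plane normal, Δ follows as the ray-plane intersection parameter. A sketch of the per-channel test, derived from the definitions above (names hypothetical):

/* Per-channel boundary test. Plane: A*x + B*y + C*z + D = 0 with
   (A,B,C) normalized and pointing from solid to liquid; (ex,ey,ez)
   is the channel's e_qi. Returns 1 and writes delta if the link
   crosses the boundary within one lattice spacing. */
int IsBoundaryDistribution(float px, float py, float pz,
                           float A, float B, float C, float D,
                           float ex, float ey, float ez, float *delta) {
    float flag1 = A*px + B*py + C*pz + D;   /* node-plane distance */
    float flag2 = A*ex + B*ey + C*ez;       /* link projection on normal */
    if (flag2 == 0.0f)
        return 0;                           /* link parallel to surface */
    *delta = -flag1 / flag2;
    return (*delta >= 0.0f && *delta <= 1.0f);
}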

47.5 Visualization

To visualize the simulation, we inject particles into the flow field. The positions of the particles are stored in a texture and are updated every time step by a fragment program using the current velocity field. Similar to the generation of boundary voxels, the updated particle positions are then copied to a vertex array residing in the graphics memory for rendering. The whole simulation and rendering cycle is inside the GPU, hence there is no need to transfer large chunks of data between the main memory and the graphics memory. To better display the flow field, we arrange particles in a regular grid before injection and color them according to the position where they enter the flow field, as shown in Figure 47-1. Due to the requirements of the vertex array, the total number of particles is constant during the simulation. We use a fragment program to recycle particles that flow out of the border of the field or stay in a zero-velocity region and place them back at the inlet. If two particles coincide at exactly the same location at the same time, they will never separate during the advection. Visually, we will see fewer and fewer particles. To avoid this, we add a random offset to each particle when placing it at the inlet.

In 2D LBM, there is only one velocity slice, while in 3D LBM, the velocities form a volume. The advection fragment program fetches the velocity indicated by the current position of the particles. Therefore, the fragment program needs to access either a 3D texture or a 2D texture storing the flat volume. We chose the flat volume storage, which is much faster. Please note that if we stored the velocity as a stack of separate 2D textures, the advection would be very difficult.
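
An advection step is one velocity fetch plus an explicit Euler update. A CPU-analogue sketch using the document's lattice indexing (hypothetical names, nearest-node lookup, interior positions assumed):

/* Advect one particle through a velocity field stored as three floats
   per node, indexed as vel[3*(x + y*step + z*step*step)]. */
void AdvectParticle(float pos[3], const float *vel, int step, float dt) {
    int x = (int)(pos[0] + 0.5f);
    int y = (int)(pos[1] + 0.5f);
    int z = (int)(pos[2] + 0.5f);
    const float *u = &vel[3 * (x + y*step + z*step*step)];
    pos[0] += dt * u[0];
    pos[1] += dt * u[1];
    pos[2] += dt * u[2];
}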

47.6 Experimental Results

We have experimented with our GPU-based LBM implemented using OpenGL and Cg on an NVIDIA GeForce FX 5900 Ultra. This GPU has 256 MB of 425 MHz DDR SDRAM, and its core speed is 400 MHz. The host computer has a Pentium 4 2.53 GHz CPU with 1 GB PC800 RDRAM. All the results related to the GPU are based on 32-bit single-precision computation of the fragment pipeline. For comparison, we have also implemented a software version of the LBM simulation using single-precision floating point, and we measured its performance on the same machine.

Figure 47-8 shows a 2D flow field simulation based on the D2Q9 model with a lattice size of 256^2. We insert two circles (shown in red) into the field as the obstacles' boundary conditions. Vortices are generated because of the obstacles. The figure shows only a snapshot of the flow; the flow is relatively stable, but not constant. Figure 47-1 shows 3D flow simulations using the D3Q19 model with a lattice size of 50^3. Note that our simulation can handle arbitrarily complex and dynamic boundaries, which usually results in a complex flow field. We show only particles injected from a slit, to reduce clutter. The motion of the particles and the ability to change the viewing angle help in understanding the shape of the flow.

Figure 47-8. Particles Advected in a 2D Flow Field Based on the D2Q9 LBM Model

Figure 47-9 shows the time in milliseconds per step of the 3D LBM model as a function of the lattice size, running on both the CPU and the GPU. Note that both the x and y axes are in logarithmic scale. Figure 47-10 compares the two from a different perspective by showing the speedup factor. The time includes both simulation and visualization. Note that there is no need to transfer the velocity field or the particle positions between the main memory and the graphics memory. The time spent on advecting and rendering the particles is negligible with respect to the simulation. The speedup factor varies between 8 and 9 for most of the lattice sizes. When the lattice size approaches 128^3, the speedup is as high as 15. We expect even higher speedups with more recent GPUs, such as a GeForce 6800-class GPU. The step in the CPU timing curve, as well as in the GPU-versus-CPU speedup, is good evidence of their different cache behavior. When the grid size surpasses a certain threshold, cache misses significantly slow down the CPU version. Due to the mostly sequential access patterns of the LBM algorithm and the GPU's optimization for sequential access, the cache-miss rate on the GPU is relatively independent of the grid size, and we don't see such a performance jump. The performance of the CPU version actually drops dramatically for lattice sizes that are exact powers of two, possibly caused by 64K aliasing (Intel 2004). Therefore, we actually use lattices whose size is a power of two plus one for the measurements. The CPU-based LBM implementation does not utilize SSE.

Figure 47-9. Time per Step of a D3Q19 LBM Simulation

Figure 47-10. Speedup of the LBM on the GPU

47.7 Conclusion

We have presented a fluid flow simulation using the physically based Lattice Boltzmann Method, accelerated by programmable graphics hardware. The LBM simulation and its boundary conditions are of second-order accuracy in both time and space. Our GPU-based simulation using floating-point computation achieves the same accuracy as the CPU-based LBM simulation, yet it is much faster. Our experimental results have shown that the GPU version is significantly faster (for example, 8 to 15 times faster, for the D3Q19 model) for most lattice sizes in both 2D and 3D simulations. To attain these speeds, we have incorporated several optimization techniques into the GPU method. We have also proposed a GPU-based general-purpose voxelization algorithm and its extension to handle arbitrarily complex and dynamic boundary conditions in real time.

47.8 References

Other graphics researchers have explored the approach of solving the Navier-Stokes (NS) equations to simulate amorphous phenomena, such as gases and fluids: Foster and Metaxas 1997, Stam 1999, and Fedkiw et al. 2001. Readers may find a comparison of direct NS solvers with the LBM in Wei et al. 2004. Researchers have also developed GPU-accelerated conjugate gradient solvers that can be used for NS equations: Bolz et al. 2003, Krüger and Westermann 2003. There are also other works on using GPUs to compute fluid dynamics: Harris et al. 2003, Goodnight et al. 2003, and Harris 2004. More details of the GPU-accelerated LBM can be found in Li 2004.

Bolz, J., I. Farmer, E. Grinspun, and P. Schröder. 2003. “Sparse Matrix Solvers on the GPU: Conjugate Gradients and Multigrid.” ACM Transactions on Graphics (Proceedings of SIGGRAPH 2003) 22(3), pp. 917–924.

Chen, S., and G. D. Doolen. 1998. "Lattice Boltzmann Method for Fluid Flows." Annual Review of Fluid Mechanics 30, pp. 329–364.

Everitt, C. 2001. “Interactive Order-Independent Transparency.” NVIDIA technical report. Available online at http://developer.nvidia.com/object/Interactive_Order_Transparency.html

Fang, S., and H. Chen. 2000. “Hardware Accelerated Voxelization.” Computers and Graphics 24(3), pp. 433–442.

Fedkiw, R., J. Stam, and H. W. Jensen. 2001. “Visual Simulation of Smoke.” In Proceedings of SIGGRAPH 2001, pp. 15–22.

Foster, N., and D. Metaxas. 1997. “Modeling the Motion of a Hot, Turbulent Gas.” In Proceedings of SIGGRAPH 97, pp. 181–188.

Goodnight, N., C. Woolley, G. Lewin, D. Luebke, and G. Humphreys. 2003. “A Multigrid Solver for Boundary Value Problems Using Programmable Graphics Hardware.” In Proceedings of the SIGGRAPH /Eurographics Workshop on Graphics Hardware 2003, pp. 102–111.

Harris, M. 2004. “Fast Fluid Dynamics Simulation on the GPU.” In GPU Gems, edited by Randima Fernando, pp. 637–665. Addison-Wesley.

Harris, M., W. V. Baxter, T. Scheuermann, and A. Lastra. 2003. “Simulation of Cloud Dynamics on Graphics Hardware.” In Proceedings of the SIGGRAPH/Eurographics Workshop on Graphics Hardware 2003, pp. 92–101.

Intel. 2004. “Capacity Limits and Aliasing in Caches.” In Chapter 2 of IA-32 Intel Architecture Optimization Reference Manual, pp. 41–43. Available online at http://www.intel.com/design/pentium4/manuals/index_new.htm

Krüger, J., and R. Westermann. 2003. “Linear Algebra Operators for GPU Implementation of Numerical Algorithms.” ACM Transactions on Graphics (Proceedings of SIGGRAPH 2003) 22(3), pp. 908–916.

Li, W. 2004. “Accelerating Simulation and Visualization on Graphics Hardware.” Ph.D. dissertation, Computer Science Department, Stony Brook University.

Mei, R., W. Shyy, D. Yu, and L.-S. Luo. 2000. “Lattice Boltzmann Method for 3-D Flows with Curved Boundary.” Journal of Computational Physics 161, pp. 680–699.

NVIDIA Corporation. 2004. OpenGL Extension Specifications. Available online at http://developer.nvidia.com/object/nvidia_opengl_specs.html

Shade, J., S. J. Gortler, L. He, and R. Szeliski. 1998. "Layered Depth Images." In Proceedings of SIGGRAPH 98, pp. 231–242.

Stam, J. 1999. “Stable Fluids.” In Proceedings of SIGGRAPH 99, pp. 121–128.

Wei, X., W. Li, K. Mueller, and A. Kaufman. 2004. “The Lattice-Boltzmann Method for Gaseous Phenomena.” IEEE Transactions on Visualization and Computer Graphics 10(2), pp. 164–176.


