main.c
#ifndef _MAIN_C_
#define _MAIN_C_
#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include "lbm_model.h"
#include "initialization.h"
#include "initialization_gpu.h"
#include "streaming.h"
#include "collision.h"
#include "boundary.h"
#include "lbm_solver_gpu.h"
#include "visualization.h"
#include "utils.h"
int main(int argc, char *argv[]) {
float *collide_field=NULL, *stream_field=NULL, *collide_field_d=NULL, *stream_field_d=NULL,
*swap=NULL, tau, wall_velocity[D_LBM], num_cells, mlups_sum;
int *flag_field=NULL, *flag_field_d=NULL, xlength, t, timesteps, timesteps_per_plotting,
gpu_enabled;
clock_t mlups_time;
size_t field_size;
/* process parameters */
ReadParameters(&xlength, &tau, wall_velocity, ×teps, ×teps_per_plotting, argc, argv,
&gpu_enabled);
/* check if provided parameters are legitimate */
ValidateModel(wall_velocity, xlength, tau);
/* initializing fields */
num_cells = pow(xlength + 2, D_LBM);
field_size = Q_LBM*num_cells*sizeof(float);
collide_field = (float*) malloc(field_size);
stream_field = (float*) malloc(field_size);
flag_field = (int*) malloc(num_cells*sizeof(int));
InitialiseFields(collide_field, stream_field, flag_field, xlength, gpu_enabled);
InitialiseDeviceFields(collide_field, stream_field, flag_field, xlength, &collide_field_d, &stream_field_d, &flag_field_d);
for (t = 0; t < timesteps; t++) {
printf("Time step: #%d\n", t);
if (gpu_enabled){
DoIteration(collide_field, stream_field, flag_field, tau, wall_velocity, xlength,
&collide_field_d, &stream_field_d, &flag_field_d, &mlups_sum);
/* Copy data from devcice in memory only when we need VTK output */
if (!(t%timesteps_per_plotting))
CopyFieldsFromDevice(collide_field, stream_field, xlength,
&collide_field_d, &stream_field_d);
} else {
mlups_time = clock();
/* Copy pdfs from neighbouring cells into collide field */
DoStreaming(collide_field, stream_field, flag_field, xlength);
/* Perform the swapping of collide and stream fields */
swap = collide_field; collide_field = stream_field; stream_field = swap;
/* Compute post collision distributions */
DoCollision(collide_field, flag_field, tau, xlength);
/* Treat boundaries */
TreatBoundary(collide_field, flag_field, wall_velocity, xlength);
mlups_time = clock()-mlups_time;
/* Print out the MLUPS value */
mlups_sum += num_cells/(MLUPS_EXPONENT*(float)mlups_time/CLOCKS_PER_SEC);
if(VERBOSE)
printf("MLUPS: %f\n", num_cells/(MLUPS_EXPONENT*(float)mlups_time/CLOCKS_PER_SEC));
}
/* Print out vtk output if needed */
if (!(t%timesteps_per_plotting))
WriteVtkOutput(collide_field, flag_field, "img/lbm-img", t, xlength);
}
printf("Average MLUPS: %f\n", mlups_sum/(t+1));
if (VERBOSE) {
if(gpu_enabled)
CopyFieldsFromDevice(collide_field, stream_field, xlength, &collide_field_d, &stream_field_d);
WriteField(collide_field, "img/collide-field", 0, xlength, gpu_enabled);
writeFlagField(flag_field, "img/flag-field", xlength, gpu_enabled);
}
/* Free memory */
free(collide_field);
free(stream_field);
free(flag_field);
FreeDeviceFields(&collide_field_d, &stream_field_d, &flag_field_d);
printf("Simulation complete.\n");
return 0;
}
#endif
lbm_model.h
#ifndef _LBM_MODEL_H_
#define _LBM_MODEL_H_
#define VERBOSE 0
#define MLUPS_MIN_CELLS 300
#define MLUPS_EXPONENT 1000000
/* simulation parameters */
#define D_LBM 3
#define Q_LBM 19
/* arbitrary cell identifiers */
#define FLUID 0
#define NO_SLIP 1
#define MOVING_WALL 2
/* computation enhancing values */
#define EPS 0.05
#define C_S_POW2_INV 3.0
#define C_S_POW4_INV 9.0
/* reference values, not used in actual computation */
#define SQRT3 1.73205080756887729
static const float C_S = 1.0/SQRT3;
/* predefined simulation parameters */
static const int LATTICE_VELOCITIES[19][3] = {
{0,-1,-1},{-1,0,-1},{0,0,-1},{1,0,-1},{0,1,-1},{-1,-1,0},{0,-1,0},{1,-1,0},
{-1,0,0}, {0,0,0}, {1,0,0}, {-1,1,0},{0,1,0}, {1,1,0}, {0,-1,1},{-1,0,1},
{0,0,1}, {1,0,1}, {0,1,1}
};
static const float LATTICE_WEIGHTS[19] = {
1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0,
2.0/36.0, 12.0/36.0,2.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0, 1.0/36.0,
2.0/36.0, 1.0/36.0, 1.0/36.0
};
/* additional cell definitions */
#define LEFT_BOUNDARY 5100
#define RIGHT_BOUNDARY 5900
#define BOTTOM_BOUNDARY 5010
#define TOP_BOUNDARY 5090
#define BACK_BOUNDARY 5001
#define FRONT_BOUNDARY 5009
#define LEFT_BOTTOM_EDGE 5110
#define LEFT_UPPER_EDGE 5190
#define RIGHT_BOTTOM_EDGE 5910
#define RIGHT_UPPER_EDGE 5990
#define BACK_BOTTOM_EDGE 5011
#define BACK_UPPER_EDGE 5091
#define FRONT_BOTTOM_EDGE 5019
#define FRONT_UPPER_EDGE 5099
#define LEFT_BACK_EDGE 5101
#define LEFT_FRONT_EDGE 5109
#define RIGHT_BACK_EDGE 5901
#define RIGHT_FRONT_EDGE 5909
#define LEFT_BOTTOM_BACK_CORNER 5111
#define LEFT_BOTTOM_FRONT_CORNER 5119
#define LEFT_UPPER_BACK_CORNER 5191
#define LEFT_UPPER_FRONT_CORNER 5199
#define RIGHT_BOTTOM_BACK_CORNER 5911
#define RIGHT_BOTTOM_FRONT_CORNER 5919
#define RIGHT_UPPER_BACK_CORNER 5991
#define RIGHT_UPPER_FRONT_CORNER 5999
/**
* Validates the configured physical model by calculating characteristic numbers
*/
void ValidateModel(float wall_velocity[D_LBM], int domain_size, float tau);
#endif
lbm_model.c
#include <math.h>
#include <stdio.h>
#include "lbm_model.h"
#include "utils.h"
void ValidateModel(float wall_velocity[D_LBM], int domain_size, float tau){
float u_wall_length, mach_number, reynolds_number;
/* Compute Mach number and Reynolds number */
u_wall_length = sqrt(wall_velocity[0]*wall_velocity[0]+
wall_velocity[1]*wall_velocity[1]+
wall_velocity[2]*wall_velocity[2]);
mach_number = u_wall_length*SQRT3;
reynolds_number = u_wall_length*domain_size*C_S_POW2_INV/(tau-0.5);
printf("Computed Mach number: %f\n", mach_number);
printf("Computed Reynolds number: %f\n", reynolds_number);
/* Check if characteristic numbers are correct */
if (mach_number >= 1)
ERROR("Computed Mach number is too large.");
if (reynolds_number > 500)
ERROR("Computed Reynolds number is too large for simulation to be run on a laptop/pc.");
}
GPU:
initialization_gpu.h
#ifndef _INITIALIZATION_GPU_H_
#define _INITIALIZATION_GPU_H_
/**
* Copy everything from fields on DRAM in GPU
*/
void InitialiseDeviceFields(float *collide_field, float *stream_field,int *flag_field, int xlength,
float **collide_field_d, float **stream_field_d,int **flag_field_d);
/**
* Free GPU memory
*/
void FreeDeviceFields(float **collide_field_d, float **stream_field_d,int **flag_field_d);
/**
* Copy data from device in global memory.
*/
void CopyFieldsFromDevice(float *collide_field, float *stream_field, int xlength,
float **collide_field_dd, float **stream_field_dd);
#endif
initialization_gpu.cu
#include "initialization_gpu.h"
#include "lbm_model.h"
#include "utils_gpu.h"
void InitialiseDeviceFields(float *collide_field, float *stream_field,int *flag_field, int xlength,
float **collide_field_d, float **stream_field_d,int **flag_field_d){
int num_cells = pow(xlength+2, D_LBM);
size_t computational_field_size = Q_LBM*num_cells*sizeof(float);
size_t flag_field_size = num_cells*sizeof(int);
cudaErrorCheck(cudaMalloc(collide_field_d, computational_field_size));
cudaErrorCheck(cudaMemcpy(*collide_field_d, collide_field, computational_field_size, cudaMemcpyHostToDevice));
cudaErrorCheck(cudaMalloc(stream_field_d, computational_field_size));
cudaErrorCheck(cudaMemcpy(*stream_field_d, stream_field, computational_field_size, cudaMemcpyHostToDevice));
cudaErrorCheck(cudaMalloc(flag_field_d, flag_field_size));
cudaErrorCheck(cudaMemcpy(*flag_field_d, flag_field, flag_field_size, cudaMemcpyHostToDevice));
}
void FreeDeviceFields(float **collide_field_d, float **stream_field_d,int **flag_field_d){
cudaErrorCheck(cudaFree(*collide_field_d));
cudaErrorCheck(cudaFree(*stream_field_d));
cudaErrorCheck(cudaFree(*flag_field_d));
}
void CopyFieldsFromDevice(float *collide_field, float *stream_field, int xlength,
float **collide_field_dd, float **stream_field_dd) {
int num_cells = pow(xlength+2, D_LBM);
size_t computational_field_size = Q_LBM*num_cells*sizeof(float);
cudaErrorCheck(cudaMemcpy(collide_field, *collide_field_dd, computational_field_size, cudaMemcpyDeviceToHost));
cudaErrorCheck(cudaMemcpy(stream_field, *stream_field_dd, computational_field_size, cudaMemcpyDeviceToHost));
}
lbm_solver_gpu.h
#ifndef _COLLISION_GPU_H_
#define _COLLISION_GPU_H_
/**
* Performs streaming, collision and boundary treatment.
*/
void DoIteration(float *collide_field, float *stream_field, int *flag_field, float tau,
float *wall_velocity, int xlength, float **collide_field_d, float **stream_field_d,
int **flag_field_d, float *mlups_sum);
#endif
lbm_solver_gpu.cu
#include <math.h>
#include <stdio.h>
#include "lbm_solver_gpu.h"
#include "lbm_model.h"
#include "lbm_model_gpu.cuh"
#include "utils.h"
#include "utils_gpu.h"
#include "cell_computation_gpu.cuh"
__constant__ float tau_d, wall_velocity_d[D_LBM];
__constant__ int xlength_d, num_cells_d;
__device__ float *stream_field_d, *collide_field_d;
/**
* Computes the post-collision distribution functions according to the BGK update rule and
* stores the results again at the same position.
*/
__device__ void ComputePostCollisionDistributionsGpu(float *current_cell, float *feq){
current_cell[0]=current_cell[0]-(current_cell[0]-feq[0])/tau_d;
current_cell[1]=current_cell[1]-(current_cell[1]-feq[1])/tau_d;
current_cell[2]=current_cell[2]-(current_cell[2]-feq[2])/tau_d;
current_cell[3]=current_cell[3]-(current_cell[3]-feq[3])/tau_d;
current_cell[4]=current_cell[4]-(current_cell[4]-feq[4])/tau_d;
current_cell[5]=current_cell[5]-(current_cell[5]-feq[5])/tau_d;
current_cell[6]=current_cell[6]-(current_cell[6]-feq[6])/tau_d;
current_cell[7]=current_cell[7]-(current_cell[7]-feq[7])/tau_d;
current_cell[8]=current_cell[8]-(current_cell[8]-feq[8])/tau_d;
current_cell[9]=current_cell[9]-(current_cell[9]-feq[9])/tau_d;
current_cell[10]=current_cell[10]-(current_cell[10]-feq[10])/tau_d;
current_cell[11]=current_cell[11]-(current_cell[11]-feq[11])/tau_d;
current_cell[12]=current_cell[12]-(current_cell[12]-feq[12])/tau_d;
current_cell[13]=current_cell[13]-(current_cell[13]-feq[13])/tau_d;
current_cell[14]=current_cell[14]-(current_cell[14]-feq[14])/tau_d;
current_cell[15]=current_cell[15]-(current_cell[15]-feq[15])/tau_d;
current_cell[16]=current_cell[16]-(current_cell[16]-feq[16])/tau_d;
current_cell[17]=current_cell[17]-(current_cell[17]-feq[17])/tau_d;
current_cell[18]=current_cell[18]-(current_cell[18]-feq[18])/tau_d;
}
/*
* Performs streaming on cells.
*/
__global__ void DoStreaming(){
int x = threadIdx.x+blockIdx.x*blockDim.x;
int y = threadIdx.y+blockIdx.y*blockDim.y;
int z = threadIdx.z+blockIdx.z*blockDim.z;
int step=xlength_d+2, idx=x+y*step+z*step*step, nx, ny, nz;
/* check that indices are within the bounds since there could be more threads than needed */
if (0<x && x<(step-1) && 0<y && y<(step-1) && 0<z && z<(step-1)){
nx=x-LATTICE_VELOCITIES_D[0][0];
ny=y-LATTICE_VELOCITIES_D[0][1];
nz=z-LATTICE_VELOCITIES_D[0][2];
stream_field_d[Q_LBM*idx]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)];
nx=x-LATTICE_VELOCITIES_D[1][0];
ny=y-LATTICE_VELOCITIES_D[1][1];
nz=z-LATTICE_VELOCITIES_D[1][2];
stream_field_d[Q_LBM*idx+1]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+1];
nx=x-LATTICE_VELOCITIES_D[2][0];
ny=y-LATTICE_VELOCITIES_D[2][1];
nz=z-LATTICE_VELOCITIES_D[2][2];
stream_field_d[Q_LBM*idx+2]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+2];
nx=x-LATTICE_VELOCITIES_D[3][0];
ny=y-LATTICE_VELOCITIES_D[3][1];
nz=z-LATTICE_VELOCITIES_D[3][2];
stream_field_d[Q_LBM*idx+3]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+3];
nx=x-LATTICE_VELOCITIES_D[4][0];
ny=y-LATTICE_VELOCITIES_D[4][1];
nz=z-LATTICE_VELOCITIES_D[4][2];
stream_field_d[Q_LBM*idx+4]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+4];
nx=x-LATTICE_VELOCITIES_D[5][0];
ny=y-LATTICE_VELOCITIES_D[5][1];
nz=z-LATTICE_VELOCITIES_D[5][2];
stream_field_d[Q_LBM*idx+5]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+5];
nx=x-LATTICE_VELOCITIES_D[6][0];
ny=y-LATTICE_VELOCITIES_D[6][1];
nz=z-LATTICE_VELOCITIES_D[6][2];
stream_field_d[Q_LBM*idx+6]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+6];
nx=x-LATTICE_VELOCITIES_D[7][0];
ny=y-LATTICE_VELOCITIES_D[7][1];
nz=z-LATTICE_VELOCITIES_D[7][2];
stream_field_d[Q_LBM*idx+7]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+7];
nx=x-LATTICE_VELOCITIES_D[8][0];
ny=y-LATTICE_VELOCITIES_D[8][1];
nz=z-LATTICE_VELOCITIES_D[8][2];
stream_field_d[Q_LBM*idx+8]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+8];
nx=x-LATTICE_VELOCITIES_D[9][0];
ny=y-LATTICE_VELOCITIES_D[9][1];
nz=z-LATTICE_VELOCITIES_D[9][2];
stream_field_d[Q_LBM*idx+9]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+9];
nx=x-LATTICE_VELOCITIES_D[10][0];
ny=y-LATTICE_VELOCITIES_D[10][1];
nz=z-LATTICE_VELOCITIES_D[10][2];
stream_field_d[Q_LBM*idx+10]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+10];
nx=x-LATTICE_VELOCITIES_D[11][0];
ny=y-LATTICE_VELOCITIES_D[11][1];
nz=z-LATTICE_VELOCITIES_D[11][2];
stream_field_d[Q_LBM*idx+11]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+11];
nx=x-LATTICE_VELOCITIES_D[12][0];
ny=y-LATTICE_VELOCITIES_D[12][1];
nz=z-LATTICE_VELOCITIES_D[12][2];
stream_field_d[Q_LBM*idx+12]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+12];
nx=x-LATTICE_VELOCITIES_D[13][0];
ny=y-LATTICE_VELOCITIES_D[13][1];
nz=z-LATTICE_VELOCITIES_D[13][2];
stream_field_d[Q_LBM*idx+13]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+13];
nx=x-LATTICE_VELOCITIES_D[14][0];
ny=y-LATTICE_VELOCITIES_D[14][1];
nz=z-LATTICE_VELOCITIES_D[14][2];
stream_field_d[Q_LBM*idx+14]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+14];
nx=x-LATTICE_VELOCITIES_D[15][0];
ny=y-LATTICE_VELOCITIES_D[15][1];
nz=z-LATTICE_VELOCITIES_D[15][2];
stream_field_d[Q_LBM*idx+15]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+15];
nx=x-LATTICE_VELOCITIES_D[16][0];
ny=y-LATTICE_VELOCITIES_D[16][1];
nz=z-LATTICE_VELOCITIES_D[16][2];
stream_field_d[Q_LBM*idx+16]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+16];
nx=x-LATTICE_VELOCITIES_D[17][0];
ny=y-LATTICE_VELOCITIES_D[17][1];
nz=z-LATTICE_VELOCITIES_D[17][2];
stream_field_d[Q_LBM*idx+17]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+17];
nx=x-LATTICE_VELOCITIES_D[18][0];
ny=y-LATTICE_VELOCITIES_D[18][1];
nz=z-LATTICE_VELOCITIES_D[18][2];
stream_field_d[Q_LBM*idx+18]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+18];
}
}
/*
* Performs collision computation.
*/
__global__ void DoCollision(){
int x = threadIdx.x+blockIdx.x*blockDim.x;
int y = threadIdx.y+blockIdx.y*blockDim.y;
int z = threadIdx.z+blockIdx.z*blockDim.z;
int step=xlength_d+2, idx=x+y*step+z*step*step;
float density, velocity[D_LBM], feq[Q_LBM], *current_cell_s;
/* check that indices are within the bounds since there could be more threads than needed */
if (0<x && x<(step-1) && 0<y && y<(step-1) && 0<z && z<(step-1)){
current_cell_s=&collide_field_d[Q_LBM*idx];
ComputeDensityGpu(current_cell_s,&density);
ComputeVelocityGpu(current_cell_s,&density,velocity);
ComputeFeqGpu(&density,velocity,feq);
ComputePostCollisionDistributionsGpu(current_cell_s,feq);
}
}
/*
* Computes proper boundary values.
*/
__global__ void TreatBoundary(int *flag_field_d){
int x = threadIdx.x+blockIdx.x*blockDim.x;
int y = threadIdx.y+blockIdx.y*blockDim.y;
int z = threadIdx.z+blockIdx.z*blockDim.z;
int step=xlength_d+2, idx=x+y*step+z*step*step, nx, ny, nz, i, boundary_side=0, boundary_idx=100500;
float density, dot_prod;
if(idx<num_cells_d) {
if(flag_field_d[idx] == BOTTOM_BOUNDARY) {
boundary_side = BOTTOM_BOUNDARY;
boundary_idx = BOTTOM_BOUNDARY_IDX;
} else if (flag_field_d[idx] == LEFT_BOUNDARY) {
boundary_side = LEFT_BOUNDARY;
boundary_idx = LEFT_BOUNDARY_IDX;
} else if (flag_field_d[idx] == RIGHT_BOUNDARY) {
boundary_side = RIGHT_BOUNDARY;
boundary_idx = RIGHT_BOUNDARY_IDX;
} else if (flag_field_d[idx] == BACK_BOUNDARY) {
boundary_side = BACK_BOUNDARY;
boundary_idx = BACK_BOUNDARY_IDX;
} else if (flag_field_d[idx] == FRONT_BOUNDARY) {
boundary_side = FRONT_BOUNDARY;
boundary_idx = FRONT_BOUNDARY_IDX;
} else if (flag_field_d[idx] == LEFT_BOTTOM_EDGE) {
boundary_side = LEFT_BOTTOM_EDGE;
boundary_idx = 13;
} else if (flag_field_d[idx] == RIGHT_BOTTOM_EDGE) {
boundary_side = RIGHT_BOTTOM_EDGE;
boundary_idx = 11;
} else if (flag_field_d[idx] == BACK_BOTTOM_EDGE) {
boundary_side = BACK_BOTTOM_EDGE;
boundary_idx = 18;
} else if (flag_field_d[idx] == FRONT_BOTTOM_EDGE) {
boundary_side = FRONT_BOTTOM_EDGE;
boundary_idx = 4;
} else if (flag_field_d[idx] == LEFT_BACK_EDGE) {
boundary_side = LEFT_BACK_EDGE;
boundary_idx = 17;
} else if (flag_field_d[idx] == LEFT_FRONT_EDGE) {
boundary_side = LEFT_FRONT_EDGE;
boundary_idx = 3;
} else if (flag_field_d[idx] == RIGHT_BACK_EDGE) {
boundary_side = RIGHT_BACK_EDGE;
boundary_idx = 15;
} else if (flag_field_d[idx] == RIGHT_FRONT_EDGE) {
boundary_side = RIGHT_FRONT_EDGE;
boundary_idx = 1;
} else if (flag_field_d[idx] == LEFT_UPPER_EDGE) {
boundary_side = LEFT_UPPER_EDGE;
boundary_idx = 7;
} else if (flag_field_d[idx] == RIGHT_UPPER_EDGE) {
boundary_side = RIGHT_UPPER_EDGE;
boundary_idx = 5;
} else if (flag_field_d[idx] == BACK_UPPER_EDGE) {
boundary_side = BACK_UPPER_EDGE;
boundary_idx = 14;
} else if (flag_field_d[idx] == FRONT_UPPER_EDGE) {
boundary_side = FRONT_UPPER_EDGE;
boundary_idx = 0;
} else if (flag_field_d[idx] == TOP_BOUNDARY) {
boundary_side = TOP_BOUNDARY;
boundary_idx = TOP_BOUNDARY_IDX;
}
if( boundary_side==LEFT_BOUNDARY || boundary_side==RIGHT_BOUNDARY ||
boundary_side==BOTTOM_BOUNDARY ||
boundary_side==BACK_BOUNDARY || boundary_side==FRONT_BOUNDARY) {
i = TREAT_BOUNDARY_INDECES[boundary_idx][0];
nx=x+LATTICE_VELOCITIES_D[i][0];
ny=y+LATTICE_VELOCITIES_D[i][1];
nz=z+LATTICE_VELOCITIES_D[i][2];
collide_field_d[Q_LBM*(x+y*step+z*step*step)+i]=
collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)];
i = TREAT_BOUNDARY_INDECES[boundary_idx][1];
nx=x+LATTICE_VELOCITIES_D[i][0];
ny=y+LATTICE_VELOCITIES_D[i][1];
nz=z+LATTICE_VELOCITIES_D[i][2];
collide_field_d[Q_LBM*(x+y*step+z*step*step)+i]=
collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)];
i = TREAT_BOUNDARY_INDECES[boundary_idx][2];
nx=x+LATTICE_VELOCITIES_D[i][0];
ny=y+LATTICE_VELOCITIES_D[i][1];
nz=z+LATTICE_VELOCITIES_D[i][2];
collide_field_d[Q_LBM*(x+y*step+z*step*step)+i]=
collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)];
i = TREAT_BOUNDARY_INDECES[boundary_idx][3];
nx=x+LATTICE_VELOCITIES_D[i][0];
ny=y+LATTICE_VELOCITIES_D[i][1];
nz=z+LATTICE_VELOCITIES_D[i][2];
collide_field_d[Q_LBM*(x+y*step+z*step*step)+i]=
collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)];
i = TREAT_BOUNDARY_INDECES[boundary_idx][4];
nx=x+LATTICE_VELOCITIES_D[i][0];
ny=y+LATTICE_VELOCITIES_D[i][1];
nz=z+LATTICE_VELOCITIES_D[i][2];
collide_field_d[Q_LBM*(x+y*step+z*step*step)+i]=
collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)];
} else if (boundary_side == LEFT_BOTTOM_EDGE || boundary_side == RIGHT_BOTTOM_EDGE ||
boundary_side == BACK_BOTTOM_EDGE || boundary_side == FRONT_BOTTOM_EDGE ||
boundary_side == LEFT_BACK_EDGE || boundary_side == LEFT_FRONT_EDGE ||
boundary_side == RIGHT_BACK_EDGE || boundary_side == RIGHT_FRONT_EDGE) {
nx=x+LATTICE_VELOCITIES_D[boundary_idx][0];
ny=y+LATTICE_VELOCITIES_D[boundary_idx][1];
nz=z+LATTICE_VELOCITIES_D[boundary_idx][2];
collide_field_d[Q_LBM*(x+y*step+z*step*step)+boundary_idx]=
collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(boundary_idx)];
} else if(boundary_side == LEFT_UPPER_EDGE || boundary_side == RIGHT_UPPER_EDGE ||
boundary_side == BACK_UPPER_EDGE || boundary_side == FRONT_UPPER_EDGE) {
i = boundary_idx;
nx=x+LATTICE_VELOCITIES_D[i][0];
ny=y+LATTICE_VELOCITIES_D[i][1];
nz=z+LATTICE_VELOCITIES_D[i][2];
/* Compute density in the neighbour cell */
ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density);
/* Compute dot product */
dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+
LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+
LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2];
/* Assign the boudary cell value */
collide_field_d[Q_LBM*(idx)+i]=
collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+
2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod;
} else if(boundary_side == TOP_BOUNDARY) {
i = TREAT_BOUNDARY_INDECES[boundary_idx][0];
nx=x+LATTICE_VELOCITIES_D[i][0];
ny=y+LATTICE_VELOCITIES_D[i][1];
nz=z+LATTICE_VELOCITIES_D[i][2];
ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density);
dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+
LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+
LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2];
collide_field_d[Q_LBM*(idx)+i]=
collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+
2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod;
i = TREAT_BOUNDARY_INDECES[boundary_idx][1];
nx=x+LATTICE_VELOCITIES_D[i][0];
ny=y+LATTICE_VELOCITIES_D[i][1];
nz=z+LATTICE_VELOCITIES_D[i][2];
ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density);
dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+
LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+
LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2];
collide_field_d[Q_LBM*(idx)+i]=
collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+
2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod;
i = TREAT_BOUNDARY_INDECES[boundary_idx][2];
nx=x+LATTICE_VELOCITIES_D[i][0];
ny=y+LATTICE_VELOCITIES_D[i][1];
nz=z+LATTICE_VELOCITIES_D[i][2];
ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density);
dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+
LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+
LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2];
collide_field_d[Q_LBM*(idx)+i]=
collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+
2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod;
i = TREAT_BOUNDARY_INDECES[boundary_idx][3];
nx=x+LATTICE_VELOCITIES_D[i][0];
ny=y+LATTICE_VELOCITIES_D[i][1];
nz=z+LATTICE_VELOCITIES_D[i][2];
ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density);
dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+
LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+
LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2];
collide_field_d[Q_LBM*(idx)+i]=
collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+
2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod;
i = TREAT_BOUNDARY_INDECES[boundary_idx][4];
nx=x+LATTICE_VELOCITIES_D[i][0];
ny=y+LATTICE_VELOCITIES_D[i][1];
nz=z+LATTICE_VELOCITIES_D[i][2];
ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density);
dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+
LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+
LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2];
collide_field_d[Q_LBM*(idx)+i]=
collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+
2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod;
}
}
}
/*
* Performs pointer swap on GPU.
*/
__global__ void DoSwap(){
float *swap=collide_field_d; collide_field_d=stream_field_d; stream_field_d=swap;
}
void DoIteration(float *collide_field, float *stream_field, int *flag_field, float tau,
float *wall_velocity, int xlength, float **collide_field_dd, float **stream_field_dd,
int **flag_field_d, float *mlups_sum){
int num_cells = pow(xlength+2, D_LBM);
clock_t mlups_time;
/* initialize constant data */
cudaErrorCheck(cudaMemcpyToSymbol(xlength_d, &xlength, sizeof(int), 0, cudaMemcpyHostToDevice));
cudaErrorCheck(cudaMemcpyToSymbol(num_cells_d, &num_cells, sizeof(int), 0, cudaMemcpyHostToDevice));
cudaErrorCheck(cudaMemcpyToSymbol(tau_d, &tau, sizeof(float), 0, cudaMemcpyHostToDevice));
cudaErrorCheck(cudaMemcpyToSymbol(wall_velocity_d, wall_velocity, D_LBM*sizeof(float), 0, cudaMemcpyHostToDevice));
cudaErrorCheck(cudaMemcpyToSymbol(collide_field_d, collide_field_dd, sizeof(*collide_field_dd), 0, cudaMemcpyHostToDevice));
cudaErrorCheck(cudaMemcpyToSymbol(stream_field_d, stream_field_dd, sizeof(*stream_field_dd), 0, cudaMemcpyHostToDevice));
/* define grid structure */
dim3 block(BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE);
dim3 grid((xlength+2+block.x-1)/block.x, (xlength+2+block.y-1)/block.y, (xlength+2+block.z-1)/block.z);
mlups_time = clock();
/* perform streaming */
DoStreaming<<<grid,block>>>();
cudaErrorCheck(cudaThreadSynchronize());
cudaErrorCheck(cudaPeekAtLastError());
/* Perform the swapping of collide and stream fields */
DoSwap<<<1,1>>>();
cudaErrorCheck(cudaThreadSynchronize());
cudaErrorCheck(cudaPeekAtLastError());
/* perform collision */
DoCollision<<<grid,block>>>();
cudaErrorCheck(cudaThreadSynchronize());
cudaErrorCheck(cudaPeekAtLastError());
/* perform boundary treatment */
TreatBoundary<<<grid,block>>>(*flag_field_d);
cudaErrorCheck(cudaPeekAtLastError());
mlups_time = clock()-mlups_time;
*mlups_sum += num_cells/(MLUPS_EXPONENT*(float)mlups_time/CLOCKS_PER_SEC);
if(VERBOSE)
printf("MLUPS: %f\n", num_cells/(MLUPS_EXPONENT*(float)mlups_time/CLOCKS_PER_SEC));
/* copy data back to host */
cudaErrorCheck(cudaMemcpyFromSymbol(collide_field_dd, collide_field_d, sizeof(*collide_field_dd), 0, cudaMemcpyDeviceToHost));
cudaErrorCheck(cudaMemcpyFromSymbol(stream_field_dd, stream_field_d, sizeof(*stream_field_dd), 0, cudaMemcpyDeviceToHost));
}
cell_computation_gpu.h
#ifndef _CELL_COMPUTATION_GPU_CUH_
#define _CELL_COMPUTATION_GPU_CUH_
/**
* Computes the density from the particle distribution functions stored at currentCell.
* currentCell thus denotes the address of the first particle distribution function of the
* respective cell. The result is stored in density.
*/
__device__ void ComputeDensityGpu(float *current_cell, float *density){
*density=0;
*density+=current_cell[0];
*density+=current_cell[1];
*density+=current_cell[2];
*density+=current_cell[3];
*density+=current_cell[4];
*density+=current_cell[5];
*density+=current_cell[6];
*density+=current_cell[7];
*density+=current_cell[8];
*density+=current_cell[9];
*density+=current_cell[10];
*density+=current_cell[11];
*density+=current_cell[12];
*density+=current_cell[13];
*density+=current_cell[14];
*density+=current_cell[15];
*density+=current_cell[16];
*density+=current_cell[17];
*density+=current_cell[18];
}
/**
* Computes the velocity within currentCell and stores the result in velocity
*/
__device__ void ComputeVelocityGpu(float *current_cell, float *density, float *velocity){
velocity[0]=0;
velocity[0]+=current_cell[0]*LATTICE_VELOCITIES_D[0][0];
velocity[0]+=current_cell[1]*LATTICE_VELOCITIES_D[1][0];
velocity[0]+=current_cell[2]*LATTICE_VELOCITIES_D[2][0];
velocity[0]+=current_cell[3]*LATTICE_VELOCITIES_D[3][0];
velocity[0]+=current_cell[4]*LATTICE_VELOCITIES_D[4][0];
velocity[0]+=current_cell[5]*LATTICE_VELOCITIES_D[5][0];
velocity[0]+=current_cell[6]*LATTICE_VELOCITIES_D[6][0];
velocity[0]+=current_cell[7]*LATTICE_VELOCITIES_D[7][0];
velocity[0]+=current_cell[8]*LATTICE_VELOCITIES_D[8][0];
velocity[0]+=current_cell[9]*LATTICE_VELOCITIES_D[9][0];
velocity[0]+=current_cell[10]*LATTICE_VELOCITIES_D[10][0];
velocity[0]+=current_cell[11]*LATTICE_VELOCITIES_D[11][0];
velocity[0]+=current_cell[12]*LATTICE_VELOCITIES_D[12][0];
velocity[0]+=current_cell[13]*LATTICE_VELOCITIES_D[13][0];
velocity[0]+=current_cell[14]*LATTICE_VELOCITIES_D[14][0];
velocity[0]+=current_cell[15]*LATTICE_VELOCITIES_D[15][0];
velocity[0]+=current_cell[16]*LATTICE_VELOCITIES_D[16][0];
velocity[0]+=current_cell[17]*LATTICE_VELOCITIES_D[17][0];
velocity[0]+=current_cell[18]*LATTICE_VELOCITIES_D[18][0];
velocity[0]/=*density;
velocity[1]=0;
velocity[1]+=current_cell[0]*LATTICE_VELOCITIES_D[0][1];
velocity[1]+=current_cell[1]*LATTICE_VELOCITIES_D[1][1];
velocity[1]+=current_cell[2]*LATTICE_VELOCITIES_D[2][1];
velocity[1]+=current_cell[3]*LATTICE_VELOCITIES_D[3][1];
velocity[1]+=current_cell[4]*LATTICE_VELOCITIES_D[4][1];
velocity[1]+=current_cell[5]*LATTICE_VELOCITIES_D[5][1];
velocity[1]+=current_cell[6]*LATTICE_VELOCITIES_D[6][1];
velocity[1]+=current_cell[7]*LATTICE_VELOCITIES_D[7][1];
velocity[1]+=current_cell[8]*LATTICE_VELOCITIES_D[8][1];
velocity[1]+=current_cell[9]*LATTICE_VELOCITIES_D[9][1];
velocity[1]+=current_cell[10]*LATTICE_VELOCITIES_D[10][1];
velocity[1]+=current_cell[11]*LATTICE_VELOCITIES_D[11][1];
velocity[1]+=current_cell[12]*LATTICE_VELOCITIES_D[12][1];
velocity[1]+=current_cell[13]*LATTICE_VELOCITIES_D[13][1];
velocity[1]+=current_cell[14]*LATTICE_VELOCITIES_D[14][1];
velocity[1]+=current_cell[15]*LATTICE_VELOCITIES_D[15][1];
velocity[1]+=current_cell[16]*LATTICE_VELOCITIES_D[16][1];
velocity[1]+=current_cell[17]*LATTICE_VELOCITIES_D[17][1];
velocity[1]+=current_cell[18]*LATTICE_VELOCITIES_D[18][1];
velocity[1]/=*density;
velocity[2]=0;
velocity[2]+=current_cell[0]*LATTICE_VELOCITIES_D[0][2];
velocity[2]+=current_cell[1]*LATTICE_VELOCITIES_D[1][2];
velocity[2]+=current_cell[2]*LATTICE_VELOCITIES_D[2][2];
velocity[2]+=current_cell[3]*LATTICE_VELOCITIES_D[3][2];
velocity[2]+=current_cell[4]*LATTICE_VELOCITIES_D[4][2];
velocity[2]+=current_cell[5]*LATTICE_VELOCITIES_D[5][2];
velocity[2]+=current_cell[6]*LATTICE_VELOCITIES_D[6][2];
velocity[2]+=current_cell[7]*LATTICE_VELOCITIES_D[7][2];
velocity[2]+=current_cell[8]*LATTICE_VELOCITIES_D[8][2];
velocity[2]+=current_cell[9]*LATTICE_VELOCITIES_D[9][2];
velocity[2]+=current_cell[10]*LATTICE_VELOCITIES_D[10][2];
velocity[2]+=current_cell[11]*LATTICE_VELOCITIES_D[11][2];
velocity[2]+=current_cell[12]*LATTICE_VELOCITIES_D[12][2];
velocity[2]+=current_cell[13]*LATTICE_VELOCITIES_D[13][2];
velocity[2]+=current_cell[14]*LATTICE_VELOCITIES_D[14][2];
velocity[2]+=current_cell[15]*LATTICE_VELOCITIES_D[15][2];
velocity[2]+=current_cell[16]*LATTICE_VELOCITIES_D[16][2];
velocity[2]+=current_cell[17]*LATTICE_VELOCITIES_D[17][2];
velocity[2]+=current_cell[18]*LATTICE_VELOCITIES_D[18][2];
velocity[2]/=*density;
}
/**
* Computes the equilibrium distributions for all particle distribution functions of one
* cell from density and velocity and stores the results in feq.
*/
__device__ void ComputeFeqGpu(float *density, float *velocity, float *feq){
float s1, s2, s3=velocity[0]*velocity[0]+velocity[1]*velocity[1]+velocity[2]*velocity[2];
s1 = LATTICE_VELOCITIES_D[0][0]*velocity[0]+LATTICE_VELOCITIES_D[0][1]*velocity[1]+LATTICE_VELOCITIES_D[0][2]*velocity[2];
s2 = s1*s1;
feq[0]=LATTICE_WEIGHTS_D[0]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[1][0]*velocity[0]+LATTICE_VELOCITIES_D[1][1]*velocity[1]+LATTICE_VELOCITIES_D[1][2]*velocity[2];
s2 = s1*s1;
feq[1]=LATTICE_WEIGHTS_D[1]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[2][0]*velocity[0]+LATTICE_VELOCITIES_D[2][1]*velocity[1]+LATTICE_VELOCITIES_D[2][2]*velocity[2];
s2 = s1*s1;
feq[2]=LATTICE_WEIGHTS_D[2]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[3][0]*velocity[0]+LATTICE_VELOCITIES_D[3][1]*velocity[1]+LATTICE_VELOCITIES_D[3][2]*velocity[2];
s2 = s1*s1;
feq[3]=LATTICE_WEIGHTS_D[3]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[4][0]*velocity[0]+LATTICE_VELOCITIES_D[4][1]*velocity[1]+LATTICE_VELOCITIES_D[4][2]*velocity[2];
s2 = s1*s1;
feq[4]=LATTICE_WEIGHTS_D[4]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[5][0]*velocity[0]+LATTICE_VELOCITIES_D[5][1]*velocity[1]+LATTICE_VELOCITIES_D[5][2]*velocity[2];
s2 = s1*s1;
feq[5]=LATTICE_WEIGHTS_D[5]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[6][0]*velocity[0]+LATTICE_VELOCITIES_D[6][1]*velocity[1]+LATTICE_VELOCITIES_D[6][2]*velocity[2];
s2 = s1*s1;
feq[6]=LATTICE_WEIGHTS_D[6]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[7][0]*velocity[0]+LATTICE_VELOCITIES_D[7][1]*velocity[1]+LATTICE_VELOCITIES_D[7][2]*velocity[2];
s2 = s1*s1;
feq[7]=LATTICE_WEIGHTS_D[7]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[8][0]*velocity[0]+LATTICE_VELOCITIES_D[8][1]*velocity[1]+LATTICE_VELOCITIES_D[8][2]*velocity[2];
s2 = s1*s1;
feq[8]=LATTICE_WEIGHTS_D[8]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[9][0]*velocity[0]+LATTICE_VELOCITIES_D[9][1]*velocity[1]+LATTICE_VELOCITIES_D[9][2]*velocity[2];
s2 = s1*s1;
feq[9]=LATTICE_WEIGHTS_D[9]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[10][0]*velocity[0]+LATTICE_VELOCITIES_D[10][1]*velocity[1]+LATTICE_VELOCITIES_D[10][2]*velocity[2];
s2 = s1*s1;
feq[10]=LATTICE_WEIGHTS_D[10]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[11][0]*velocity[0]+LATTICE_VELOCITIES_D[11][1]*velocity[1]+LATTICE_VELOCITIES_D[11][2]*velocity[2];
s2 = s1*s1;
feq[11]=LATTICE_WEIGHTS_D[11]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[12][0]*velocity[0]+LATTICE_VELOCITIES_D[12][1]*velocity[1]+LATTICE_VELOCITIES_D[12][2]*velocity[2];
s2 = s1*s1;
feq[12]=LATTICE_WEIGHTS_D[12]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[13][0]*velocity[0]+LATTICE_VELOCITIES_D[13][1]*velocity[1]+LATTICE_VELOCITIES_D[13][2]*velocity[2];
s2 = s1*s1;
feq[13]=LATTICE_WEIGHTS_D[13]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[14][0]*velocity[0]+LATTICE_VELOCITIES_D[14][1]*velocity[1]+LATTICE_VELOCITIES_D[14][2]*velocity[2];
s2 = s1*s1;
feq[14]=LATTICE_WEIGHTS_D[14]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[15][0]*velocity[0]+LATTICE_VELOCITIES_D[15][1]*velocity[1]+LATTICE_VELOCITIES_D[15][2]*velocity[2];
s2 = s1*s1;
feq[15]=LATTICE_WEIGHTS_D[15]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[16][0]*velocity[0]+LATTICE_VELOCITIES_D[16][1]*velocity[1]+LATTICE_VELOCITIES_D[16][2]*velocity[2];
s2 = s1*s1;
feq[16]=LATTICE_WEIGHTS_D[16]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[17][0]*velocity[0]+LATTICE_VELOCITIES_D[17][1]*velocity[1]+LATTICE_VELOCITIES_D[17][2]*velocity[2];
s2 = s1*s1;
feq[17]=LATTICE_WEIGHTS_D[17]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[18][0]*velocity[0]+LATTICE_VELOCITIES_D[18][1]*velocity[1]+LATTICE_VELOCITIES_D[18][2]*velocity[2];
s2 = s1*s1;
feq[18]=LATTICE_WEIGHTS_D[18]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
}
/**
* Finds an inverse probability distribution for the provided lattice index.
*/
__device__ int InvGpu(int i){
return (Q_LBM-1)-i;
}
#endif
cell_computation_gpu.cu
#ifndef _CELL_COMPUTATION_GPU_CUH_
#define _CELL_COMPUTATION_GPU_CUH_
/**
* Computes the density from the particle distribution functions stored at currentCell.
* currentCell thus denotes the address of the first particle distribution function of the
* respective cell. The result is stored in density.
*/
__device__ void ComputeDensityGpu(float *current_cell, float *density){
*density=0;
*density+=current_cell[0];
*density+=current_cell[1];
*density+=current_cell[2];
*density+=current_cell[3];
*density+=current_cell[4];
*density+=current_cell[5];
*density+=current_cell[6];
*density+=current_cell[7];
*density+=current_cell[8];
*density+=current_cell[9];
*density+=current_cell[10];
*density+=current_cell[11];
*density+=current_cell[12];
*density+=current_cell[13];
*density+=current_cell[14];
*density+=current_cell[15];
*density+=current_cell[16];
*density+=current_cell[17];
*density+=current_cell[18];
}
/**
* Computes the velocity within currentCell and stores the result in velocity
*/
__device__ void ComputeVelocityGpu(float *current_cell, float *density, float *velocity){
velocity[0]=0;
velocity[0]+=current_cell[0]*LATTICE_VELOCITIES_D[0][0];
velocity[0]+=current_cell[1]*LATTICE_VELOCITIES_D[1][0];
velocity[0]+=current_cell[2]*LATTICE_VELOCITIES_D[2][0];
velocity[0]+=current_cell[3]*LATTICE_VELOCITIES_D[3][0];
velocity[0]+=current_cell[4]*LATTICE_VELOCITIES_D[4][0];
velocity[0]+=current_cell[5]*LATTICE_VELOCITIES_D[5][0];
velocity[0]+=current_cell[6]*LATTICE_VELOCITIES_D[6][0];
velocity[0]+=current_cell[7]*LATTICE_VELOCITIES_D[7][0];
velocity[0]+=current_cell[8]*LATTICE_VELOCITIES_D[8][0];
velocity[0]+=current_cell[9]*LATTICE_VELOCITIES_D[9][0];
velocity[0]+=current_cell[10]*LATTICE_VELOCITIES_D[10][0];
velocity[0]+=current_cell[11]*LATTICE_VELOCITIES_D[11][0];
velocity[0]+=current_cell[12]*LATTICE_VELOCITIES_D[12][0];
velocity[0]+=current_cell[13]*LATTICE_VELOCITIES_D[13][0];
velocity[0]+=current_cell[14]*LATTICE_VELOCITIES_D[14][0];
velocity[0]+=current_cell[15]*LATTICE_VELOCITIES_D[15][0];
velocity[0]+=current_cell[16]*LATTICE_VELOCITIES_D[16][0];
velocity[0]+=current_cell[17]*LATTICE_VELOCITIES_D[17][0];
velocity[0]+=current_cell[18]*LATTICE_VELOCITIES_D[18][0];
velocity[0]/=*density;
velocity[1]=0;
velocity[1]+=current_cell[0]*LATTICE_VELOCITIES_D[0][1];
velocity[1]+=current_cell[1]*LATTICE_VELOCITIES_D[1][1];
velocity[1]+=current_cell[2]*LATTICE_VELOCITIES_D[2][1];
velocity[1]+=current_cell[3]*LATTICE_VELOCITIES_D[3][1];
velocity[1]+=current_cell[4]*LATTICE_VELOCITIES_D[4][1];
velocity[1]+=current_cell[5]*LATTICE_VELOCITIES_D[5][1];
velocity[1]+=current_cell[6]*LATTICE_VELOCITIES_D[6][1];
velocity[1]+=current_cell[7]*LATTICE_VELOCITIES_D[7][1];
velocity[1]+=current_cell[8]*LATTICE_VELOCITIES_D[8][1];
velocity[1]+=current_cell[9]*LATTICE_VELOCITIES_D[9][1];
velocity[1]+=current_cell[10]*LATTICE_VELOCITIES_D[10][1];
velocity[1]+=current_cell[11]*LATTICE_VELOCITIES_D[11][1];
velocity[1]+=current_cell[12]*LATTICE_VELOCITIES_D[12][1];
velocity[1]+=current_cell[13]*LATTICE_VELOCITIES_D[13][1];
velocity[1]+=current_cell[14]*LATTICE_VELOCITIES_D[14][1];
velocity[1]+=current_cell[15]*LATTICE_VELOCITIES_D[15][1];
velocity[1]+=current_cell[16]*LATTICE_VELOCITIES_D[16][1];
velocity[1]+=current_cell[17]*LATTICE_VELOCITIES_D[17][1];
velocity[1]+=current_cell[18]*LATTICE_VELOCITIES_D[18][1];
velocity[1]/=*density;
velocity[2]=0;
velocity[2]+=current_cell[0]*LATTICE_VELOCITIES_D[0][2];
velocity[2]+=current_cell[1]*LATTICE_VELOCITIES_D[1][2];
velocity[2]+=current_cell[2]*LATTICE_VELOCITIES_D[2][2];
velocity[2]+=current_cell[3]*LATTICE_VELOCITIES_D[3][2];
velocity[2]+=current_cell[4]*LATTICE_VELOCITIES_D[4][2];
velocity[2]+=current_cell[5]*LATTICE_VELOCITIES_D[5][2];
velocity[2]+=current_cell[6]*LATTICE_VELOCITIES_D[6][2];
velocity[2]+=current_cell[7]*LATTICE_VELOCITIES_D[7][2];
velocity[2]+=current_cell[8]*LATTICE_VELOCITIES_D[8][2];
velocity[2]+=current_cell[9]*LATTICE_VELOCITIES_D[9][2];
velocity[2]+=current_cell[10]*LATTICE_VELOCITIES_D[10][2];
velocity[2]+=current_cell[11]*LATTICE_VELOCITIES_D[11][2];
velocity[2]+=current_cell[12]*LATTICE_VELOCITIES_D[12][2];
velocity[2]+=current_cell[13]*LATTICE_VELOCITIES_D[13][2];
velocity[2]+=current_cell[14]*LATTICE_VELOCITIES_D[14][2];
velocity[2]+=current_cell[15]*LATTICE_VELOCITIES_D[15][2];
velocity[2]+=current_cell[16]*LATTICE_VELOCITIES_D[16][2];
velocity[2]+=current_cell[17]*LATTICE_VELOCITIES_D[17][2];
velocity[2]+=current_cell[18]*LATTICE_VELOCITIES_D[18][2];
velocity[2]/=*density;
}
/**
* Computes the equilibrium distributions for all particle distribution functions of one
* cell from density and velocity and stores the results in feq.
*/
__device__ void ComputeFeqGpu(float *density, float *velocity, float *feq){
float s1, s2, s3=velocity[0]*velocity[0]+velocity[1]*velocity[1]+velocity[2]*velocity[2];
s1 = LATTICE_VELOCITIES_D[0][0]*velocity[0]+LATTICE_VELOCITIES_D[0][1]*velocity[1]+LATTICE_VELOCITIES_D[0][2]*velocity[2];
s2 = s1*s1;
feq[0]=LATTICE_WEIGHTS_D[0]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[1][0]*velocity[0]+LATTICE_VELOCITIES_D[1][1]*velocity[1]+LATTICE_VELOCITIES_D[1][2]*velocity[2];
s2 = s1*s1;
feq[1]=LATTICE_WEIGHTS_D[1]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[2][0]*velocity[0]+LATTICE_VELOCITIES_D[2][1]*velocity[1]+LATTICE_VELOCITIES_D[2][2]*velocity[2];
s2 = s1*s1;
feq[2]=LATTICE_WEIGHTS_D[2]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[3][0]*velocity[0]+LATTICE_VELOCITIES_D[3][1]*velocity[1]+LATTICE_VELOCITIES_D[3][2]*velocity[2];
s2 = s1*s1;
feq[3]=LATTICE_WEIGHTS_D[3]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[4][0]*velocity[0]+LATTICE_VELOCITIES_D[4][1]*velocity[1]+LATTICE_VELOCITIES_D[4][2]*velocity[2];
s2 = s1*s1;
feq[4]=LATTICE_WEIGHTS_D[4]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[5][0]*velocity[0]+LATTICE_VELOCITIES_D[5][1]*velocity[1]+LATTICE_VELOCITIES_D[5][2]*velocity[2];
s2 = s1*s1;
feq[5]=LATTICE_WEIGHTS_D[5]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[6][0]*velocity[0]+LATTICE_VELOCITIES_D[6][1]*velocity[1]+LATTICE_VELOCITIES_D[6][2]*velocity[2];
s2 = s1*s1;
feq[6]=LATTICE_WEIGHTS_D[6]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[7][0]*velocity[0]+LATTICE_VELOCITIES_D[7][1]*velocity[1]+LATTICE_VELOCITIES_D[7][2]*velocity[2];
s2 = s1*s1;
feq[7]=LATTICE_WEIGHTS_D[7]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[8][0]*velocity[0]+LATTICE_VELOCITIES_D[8][1]*velocity[1]+LATTICE_VELOCITIES_D[8][2]*velocity[2];
s2 = s1*s1;
feq[8]=LATTICE_WEIGHTS_D[8]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[9][0]*velocity[0]+LATTICE_VELOCITIES_D[9][1]*velocity[1]+LATTICE_VELOCITIES_D[9][2]*velocity[2];
s2 = s1*s1;
feq[9]=LATTICE_WEIGHTS_D[9]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[10][0]*velocity[0]+LATTICE_VELOCITIES_D[10][1]*velocity[1]+LATTICE_VELOCITIES_D[10][2]*velocity[2];
s2 = s1*s1;
feq[10]=LATTICE_WEIGHTS_D[10]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[11][0]*velocity[0]+LATTICE_VELOCITIES_D[11][1]*velocity[1]+LATTICE_VELOCITIES_D[11][2]*velocity[2];
s2 = s1*s1;
feq[11]=LATTICE_WEIGHTS_D[11]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[12][0]*velocity[0]+LATTICE_VELOCITIES_D[12][1]*velocity[1]+LATTICE_VELOCITIES_D[12][2]*velocity[2];
s2 = s1*s1;
feq[12]=LATTICE_WEIGHTS_D[12]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[13][0]*velocity[0]+LATTICE_VELOCITIES_D[13][1]*velocity[1]+LATTICE_VELOCITIES_D[13][2]*velocity[2];
s2 = s1*s1;
feq[13]=LATTICE_WEIGHTS_D[13]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[14][0]*velocity[0]+LATTICE_VELOCITIES_D[14][1]*velocity[1]+LATTICE_VELOCITIES_D[14][2]*velocity[2];
s2 = s1*s1;
feq[14]=LATTICE_WEIGHTS_D[14]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[15][0]*velocity[0]+LATTICE_VELOCITIES_D[15][1]*velocity[1]+LATTICE_VELOCITIES_D[15][2]*velocity[2];
s2 = s1*s1;
feq[15]=LATTICE_WEIGHTS_D[15]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[16][0]*velocity[0]+LATTICE_VELOCITIES_D[16][1]*velocity[1]+LATTICE_VELOCITIES_D[16][2]*velocity[2];
s2 = s1*s1;
feq[16]=LATTICE_WEIGHTS_D[16]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[17][0]*velocity[0]+LATTICE_VELOCITIES_D[17][1]*velocity[1]+LATTICE_VELOCITIES_D[17][2]*velocity[2];
s2 = s1*s1;
feq[17]=LATTICE_WEIGHTS_D[17]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
s1 = LATTICE_VELOCITIES_D[18][0]*velocity[0]+LATTICE_VELOCITIES_D[18][1]*velocity[1]+LATTICE_VELOCITIES_D[18][2]*velocity[2];
s2 = s1*s1;
feq[18]=LATTICE_WEIGHTS_D[18]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
}
/**
* Finds an inverse probability distribution for the provided lattice index.
*/
__device__ int InvGpu(int i){
return (Q_LBM-1)-i;
}
#endif
10 AM Meeting
lbm_model_gpu.cuh
#ifndef _LBM_MODEL_GPU_CUH_
#define _LBM_MODEL_GPU_CUH_
/* Restricts 3D blocks to have 512 threads (limits: 512 CC<2.x; 1024 CC>2.x) */
#define BLOCK_SIZE 8
/* Arbitrary boundary identifiers */
#define LEFT_BOUNDARY_IDX 0
#define RIGHT_BOUNDARY_IDX 1
#define BOTTOM_BOUNDARY_IDX 2
#define TOP_BOUNDARY_IDX 3
#define BACK_BOUNDARY_IDX 4
#define FRONT_BOUNDARY_IDX 5
__constant__ static const int LATTICE_VELOCITIES_D[19][3] = {
{0,-1,-1},{-1,0,-1},{0,0,-1},{1,0,-1},{0,1,-1},{-1,-1,0},{0,-1,0},{1,-1,0},
{-1,0,0}, {0,0,0}, {1,0,0}, {-1,1,0},{0,1,0}, {1,1,0}, {0,-1,1},{-1,0,1},
{0,0,1}, {1,0,1}, {0,1,1}
};
__constant__ static const float LATTICE_WEIGHTS_D[19] = {
1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0,
2.0/36.0, 12.0/36.0,2.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0, 1.0/36.0,
2.0/36.0, 1.0/36.0, 1.0/36.0
};
/**
* Stores number of probability distributions which we need to copy in boundary treatment step.
*/
__constant__ static const float TREAT_BOUNDARY_INDECES[6][5] = {
{3,7,10,13,17},
{1,5,8,11,15},
{4,11,12,13,18},
{0,5,6,7,14},
{14,15,16,17,18},
{0,1,2,3,4}
};
#endif
CPU:
initialization.h
#ifndef _INITIALIZATION_H_
#define _INITIALIZATION_H_
/**
* Reads the parameters for the lid driven cavity scenario from a configuration file
*/
void ReadParameters(int *xlength, float *tau, float *velocity_wall, int *timesteps,
int *timesteps_per_plotting, int argc, char *argv[], int *gpu_enabled);
/**
* Initializes the particle distribution functions and the flag field
*/
void InitialiseFields(float *collide_field, float *stream_field,int *flag_field, int xlength,
int gpu_enabled);
#endif
initialization.c
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "initialization.h"
#include "lbm_model.h"
#include "utils.h"
/**
* Prints help message that includes program usage instructions and control flags.
*/
void PrintHelpMessage(){
printf("List of control flags:\n");
printf("\t -gpu all computations are to be performed on gpu\n");
printf("\t -cpu all computations are to be performed on cpu\n");
printf("\t -help prints this help message\n");
printf("NOTE: Control flags are mutually exclusive and only one flag at a time is allowed\n");
printf("Example program usage:\n");
printf("\t./lbm-sim ./data/lbm.dat -gpu\n");
exit(1);
}
void ReadParameters(int *xlength, float *tau, float *velocity_wall, int *timesteps,
int *timesteps_per_plotting, int argc, char *argv[], int *gpu_enabled){
float *velocity_wall_1, *velocity_wall_2, *velocity_wall_3;
/* printing out help message */
if(argc<3) PrintHelpMessage();
if(!strcmp(argv[1], "-help") || !strcmp(argv[2], "-help")) PrintHelpMessage();
/* checking parameters */
if(access(argv[1], R_OK) != 0)
ERROR("Provided configuration file path either doesn't exist or can not be read.");
if(!strcmp(argv[2], "-gpu")) *gpu_enabled=1; else *gpu_enabled=0;
/* reading parameters */
READ_FLOAT(argv[1], *tau);
velocity_wall_1=&velocity_wall[0];
velocity_wall_2=&velocity_wall[1];
velocity_wall_3=&velocity_wall[2];
READ_FLOAT(argv[1], *velocity_wall_1);
READ_FLOAT(argv[1], *velocity_wall_2);
READ_FLOAT(argv[1], *velocity_wall_3);
READ_INT(argv[1], *xlength);
READ_INT(argv[1], *timesteps);
READ_INT(argv[1], *timesteps_per_plotting);
}
void InitialiseFields(float *collide_field, float *stream_field, int *flag_field, int xlength,
int gpu_enabled){
int x,y,z,i,step=xlength+2;
/* NOTE: We use z=xlength+1 as the moving wall */
for(x=0;x<step;x++){
for(y=0;y<step;y++){
for(z=0;z<step;z++){
/* Initializing flags */
if(gpu_enabled) {
if(x==0 && y==0 && z==0)
flag_field[x+y*step+z*step*step] = LEFT_BOTTOM_BACK_CORNER;
else if(x==0 && y==0 && z==step-1)
flag_field[x+y*step+z*step*step] = LEFT_BOTTOM_FRONT_CORNER;
else if(x==0 && y==step-1 && z==0)
flag_field[x+y*step+z*step*step] = LEFT_UPPER_BACK_CORNER;
else if(x==0 && y==step-1 && z==step-1)
flag_field[x+y*step+z*step*step] = LEFT_UPPER_FRONT_CORNER;
else if(x==step-1 && y==0 && z==0)
flag_field[x+y*step+z*step*step] = RIGHT_BOTTOM_BACK_CORNER;
else if(x==step-1 && y==0 && z==step-1)
flag_field[x+y*step+z*step*step] = RIGHT_BOTTOM_FRONT_CORNER;
else if(x==step-1 && y==step-1 && z==0)
flag_field[x+y*step+z*step*step] = RIGHT_UPPER_BACK_CORNER;
else if(x==step-1 && y==step-1 && z==step-1)
flag_field[x+y*step+z*step*step] = RIGHT_UPPER_FRONT_CORNER;
else if(x==0 && y==0)
flag_field[x+y*step+z*step*step]=LEFT_BOTTOM_EDGE;
else if(x==xlength+1 && y==0)
flag_field[x+y*step+z*step*step]=RIGHT_BOTTOM_EDGE;
else if(y==0 && z==0)
flag_field[x+y*step+z*step*step]=BACK_BOTTOM_EDGE;
else if(y==0 && z==xlength+1)
flag_field[x+y*step+z*step*step]=FRONT_BOTTOM_EDGE;
else if(x==0 && z==0)
flag_field[x+y*step+z*step*step]=LEFT_BACK_EDGE;
else if(x==0 && z==xlength+1)
flag_field[x+y*step+z*step*step]=LEFT_FRONT_EDGE;
else if(x==xlength+1 && z==0)
flag_field[x+y*step+z*step*step]=RIGHT_BACK_EDGE;
else if(x==xlength+1 && z==xlength+1)
flag_field[x+y*step+z*step*step]=RIGHT_FRONT_EDGE;
else if(x==0 && y==xlength+1)
flag_field[x+y*step+z*step*step]=LEFT_UPPER_EDGE;
else if(x==xlength+1 && y==xlength+1)
flag_field[x+y*step+z*step*step]=RIGHT_UPPER_EDGE;
else if(y==xlength+1 && z==0)
flag_field[x+y*step+z*step*step]=BACK_UPPER_EDGE;
else if(y==xlength+1 && z==xlength+1)
flag_field[x+y*step+z*step*step]=FRONT_UPPER_EDGE;
else if(y==step-1 && x!=0 && x!=step-1 && z!=0 && z!=step-1)
flag_field[x+y*step+z*step*step] = TOP_BOUNDARY;
else if(y==0 && x!=0 && x!=step-1 && z!=0 && z!=step-1)
flag_field[x+y*step+z*step*step] = BOTTOM_BOUNDARY;
else if (x==0 && y!=0 && y!=step-1 && z!=0 && z!=step-1)
flag_field[x+y*step+z*step*step] = LEFT_BOUNDARY;
else if (x==step-1 && y!=0 && y!=step-1 && z!=0 && z!=step-1)
flag_field[x+y*step+z*step*step] = RIGHT_BOUNDARY;
else if (z==0 && y!=0 && y!=step-1 && x!=0 && x!=step-1)
flag_field[x+y*step+z*step*step] = BACK_BOUNDARY;
else if (z==step-1 && y!=0 && y!=step-1 && x!=0 && x!=step-1)
flag_field[x+y*step+z*step*step] = FRONT_BOUNDARY;
else
flag_field[x+y*step+z*step*step]=FLUID;
} else {
if(y == xlength+1)
flag_field[x+y*step+z*step*step]=MOVING_WALL;
else if(x == 0 || x == xlength+1 || y == 0 || z == xlength+1 || z == 0)
flag_field[x+y*step+z*step*step]=NO_SLIP;
else
flag_field[x+y*step+z*step*step]=FLUID;
}
/* Initializing distributions for stream and collide fields */
for(i=0;i<Q_LBM;i++){
/*
* NOTE:Stream field is initilized to 0s because that helps to
* track down mistakes and has no impact whatsoever to on the
* computation further on.
*/
stream_field[Q_LBM*(x+y*step+z*step*step)+i]=0;
collide_field[Q_LBM*(x+y*step+z*step*step)+i]=LATTICE_WEIGHTS[i];
}
}
}
}
}
streaming.h
#ifndef _STREAMING_H_
#define _STREAMING_H_
/**
* Carries out the streaming step and writes the respective distribution functions from
* collideField to streamField.
*/
void DoStreaming(float *collide_field, float *stream_field, int *flag_field, int xlength);
#endif
streaming.c
#include "streaming.h"
#include "lbm_model.h"
void DoStreaming(float *collide_field, float *stream_field, int *flag_field, int xlength){
int x,nx,y,ny,z,nz,i,step=xlength+2;
for(x=0;x<step;x++){
for(y=0;y<step;y++){
for(z=0;z<step;z++){
if(flag_field[x+y*step+z*step*step]==FLUID){
for(i=0;i<Q_LBM;i++){
nx=x-LATTICE_VELOCITIES[i][0];
ny=y-LATTICE_VELOCITIES[i][1];
nz=z-LATTICE_VELOCITIES[i][2];
stream_field[Q_LBM*(x+y*step+z*step*step)+i]=
collide_field[Q_LBM*(nx+ny*step+nz*step*step)+i];
}
}
}
}
}
}
collision.h
#ifndef _COLLISION_H_
#define _COLLISION_H_
/**
* Carries out the whole local collision process. Computes density and velocity and equilibrium
* distributions. Carries out BGK update.
*/
void DoCollision(float *collide_field, int *flag_field, float tau, int xlength);
#endif
collision.c
#include "collision.h"
#include "cell_computation.h"
#include "lbm_model.h"
#include "utils.h"
/**
* Computes the post-collision distribution functions according to the BGK update rule and
* stores the results again at the same position.
*/
void ComputePostCollisionDistributions(float *current_cell, float tau, const float *const feq){
int i;
for(i=0;i<Q_LBM;i++){
current_cell[i]=current_cell[i]-(current_cell[i]-feq[i])/tau;
/* Probability distribution function can not be less than 0 */
if (current_cell[i] < 0)
ERROR("Probability distribution function can not be negative.");
}
}
void DoCollision(float *collide_field, int *flag_field, float tau, int xlength){
float density, velocity[3], feq[Q_LBM], *currentCell;
int x,y,z,step=xlength+2;
for(x=1;x<step-1;x++){
for(y=1;y<step-1;y++){
for(z=1;z<step-1;z++){
currentCell=&collide_field[Q_LBM*(x+y*step+z*step*step)];
ComputeDensity(currentCell,&density);
ComputeVelocity(currentCell,&density,velocity);
ComputeFeq(&density,velocity,feq);
ComputePostCollisionDistributions(currentCell,tau,feq);
}
}
}
}
boundary.h
#ifndef _BOUNDARY_H_
#define _BOUNDARY_H_
/**
* Handles the boundaries in our simulation setup
*/
void TreatBoundary(float *collide_field, int* flag_field, float *wall_velocity, int xlength);
#endif
boundary.c
#include "boundary.h"
#include "lbm_model.h"
#include "cell_computation.h"
/**
* Finds an inverse probability distribution for the provided lattice index.
*/
int inv(int i){
return (Q_LBM-1)-i;
}
void TreatBoundary(float *collide_field, int* flag_field, float *wall_velocity, int xlength){
int x,nx,y,ny,z,nz,i,step=xlength+2;
float density,dot_prod;
for(x=0;x<step;x++){
for(y=0;y<step;y++){
for(z=0;z<step;z++){
if(flag_field[x+y*step+z*step*step]!=FLUID){
for(i=0;i<Q_LBM;i++){
nx=x+LATTICE_VELOCITIES[i][0];
ny=y+LATTICE_VELOCITIES[i][1];
nz=z+LATTICE_VELOCITIES[i][2];
/* We don't need the values outside of our extended domain */
if(0<nx && nx<step-1 && 0<ny && ny<step-1 && 0<nz && nz<step-1){
if (flag_field[x+y*step+z*step*step]==MOVING_WALL){
/* Compute density in the neighbour cell */
ComputeDensity(&collide_field[Q_LBM*(nx+ny*step+nz*step*step)],&density);
/* Compute dot product */
dot_prod=LATTICE_VELOCITIES[i][0]*wall_velocity[0]+
LATTICE_VELOCITIES[i][1]*wall_velocity[1]+
LATTICE_VELOCITIES[i][2]*wall_velocity[2];
/* Assign the boudary cell value */
collide_field[Q_LBM*(x+y*step+z*step*step)+i]=
collide_field[Q_LBM*(nx+ny*step+nz*step*step)+inv(i)]+
2*LATTICE_WEIGHTS[i]*density*C_S_POW2_INV*dot_prod;
}else if(flag_field[x+y*step+z*step*step]==NO_SLIP){
collide_field[Q_LBM*(x+y*step+z*step*step)+i]=
collide_field[Q_LBM*(nx+ny*step+nz*step*step)+inv(i)];
}
}
}
}
}
}
}
}
cell_computation.h
#ifndef _CELL_COMPUTATIONS_H_
#define _CELL_COMPUTATIONS_H_
/**
* Computes the density from the particle distribution functions stored at currentCell.
* currentCell thus denotes the address of the first particle distribution function of the
* respective cell. The result is stored in density.
*/
void ComputeDensity(const float *const current_cell, float *density);
/**
* Computes the velocity within currentCell and stores the result in velocity
*/
void ComputeVelocity(const float * const current_cell, const float * const density, float *velocity);
/**
* Computes the equilibrium distributions for all particle distribution functions of one
* cell from density and velocity and stores the results in feq.
*/
void ComputeFeq(const float * const density, const float * const velocity, float *feq);
#endif
cell_computation.c
#include "cell_computation.h"
#include "lbm_model.h"
#include "utils.h"
void ComputeDensity(const float *const current_cell, float *density){
int i; *density=0;
for(i=0;i<Q_LBM;i++)
*density+=current_cell[i];
/* Density should be close to a unit (ρ~1) */
if((*density-1.0)>EPS)
ERROR("Density dropped below error tolerance.");
}
void ComputeVelocity(const float * const current_cell, const float * const density, float *velocity){
int i;
/* NOTE:Indexes are hard coded to improve program performance */
velocity[0]=0;
velocity[1]=0;
velocity[2]=0;
for(i=0;i<Q_LBM;i++){
velocity[0]+=current_cell[i]*LATTICE_VELOCITIES[i][0];
velocity[1]+=current_cell[i]*LATTICE_VELOCITIES[i][1];
velocity[2]+=current_cell[i]*LATTICE_VELOCITIES[i][2];
}
velocity[0]/=*density;
velocity[1]/=*density;
velocity[2]/=*density;
}
void ComputeFeq(const float * const density, const float * const velocity, float *feq){
int i;
float s1, s2, s3; /* summands */
/* NOTE:Indexes are hard coded to improve program performance */
for(i=0;i<Q_LBM;i++){
s1 = LATTICE_VELOCITIES[i][0]*velocity[0]+LATTICE_VELOCITIES[i][1]*velocity[1]+
LATTICE_VELOCITIES[i][2]*velocity[2];
s2 = s1*s1;
s3 = velocity[0]*velocity[0]+velocity[1]*velocity[1]+velocity[2]*velocity[2];
feq[i]=LATTICE_WEIGHTS[i]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0);
/* Probability distribution function can not be less than 0 */
if (feq[i] < 0)
ERROR("Probability distribution function can not be negative.");
}
}
visualization.h
#ifndef _VISUALIZATION_H_
#define _VISUALIZATION_H_
/**
* Writes the density and velocity field (derived from the distributions in collideField)
* to a file determined by 'filename' and timestep 't'. You can re-use parts of the code
* from visual.c (VTK output for Navier-Stokes solver) and modify it for 3D datasets.
*/
void WriteVtkOutput(const float * const collide_field, const int * const flag_field,
const char * filename, unsigned int t, int xlength);
/**
* Writes the provided stream or collision field to a file with specified identifiers.
*/
void WriteField(const float * const field, const char * filename, unsigned int t, const int xlength,
const int rank);
/**
* Writes the provided flag field to a file with specified identifiers.
*/
void writeFlagField(const int * const flagField, const char * filename, const int xlength, const int rank);
/**
* Prints out the point by point values of the provided field (4D) to the stdout.
* @param field
* linerized 4D array, with (x,y,z,i)=Q*(x+y*(ncell+2)+z*(ncell+2)*(ncell+2))+i
* @param ncell
* number of inner cells, the ones which are there before adding a boundary layer
*/
void PrintField(float *field, int ncell);
#endif