Tom Lombardo – What is The Good Future?
Eric P. Dollard – “True” Electromagnetism of Nikola Tesla
Dan Winter – Fractals, DNA and the Golden Ratio
https://youtu.be/4Rx35q-zJRk
Michael Talbot – Synchronicity and the Holographic Universe
GPU GEMS – Fluid Simulation
LBM Code
main.c
#ifndef _MAIN_C_ #define _MAIN_C_ #include <time.h> #include <math.h> #include <stdlib.h> #include <stdio.h> #include "lbm_model.h" #include "initialization.h" #include "initialization_gpu.h" #include "streaming.h" #include "collision.h" #include "boundary.h" #include "lbm_solver_gpu.h" #include "visualization.h" #include "utils.h" int main(int argc, char *argv[]) { float *collide_field=NULL, *stream_field=NULL, *collide_field_d=NULL, *stream_field_d=NULL, *swap=NULL, tau, wall_velocity[D_LBM], num_cells, mlups_sum; int *flag_field=NULL, *flag_field_d=NULL, xlength, t, timesteps, timesteps_per_plotting, gpu_enabled; clock_t mlups_time; size_t field_size; /* process parameters */ ReadParameters(&xlength, &tau, wall_velocity, ×teps, ×teps_per_plotting, argc, argv, &gpu_enabled); /* check if provided parameters are legitimate */ ValidateModel(wall_velocity, xlength, tau); /* initializing fields */ num_cells = pow(xlength + 2, D_LBM); field_size = Q_LBM*num_cells*sizeof(float); collide_field = (float*) malloc(field_size); stream_field = (float*) malloc(field_size); flag_field = (int*) malloc(num_cells*sizeof(int)); InitialiseFields(collide_field, stream_field, flag_field, xlength, gpu_enabled); InitialiseDeviceFields(collide_field, stream_field, flag_field, xlength, &collide_field_d, &stream_field_d, &flag_field_d); for (t = 0; t < timesteps; t++) { printf("Time step: #%d\n", t); if (gpu_enabled){ DoIteration(collide_field, stream_field, flag_field, tau, wall_velocity, xlength, &collide_field_d, &stream_field_d, &flag_field_d, &mlups_sum); /* Copy data from devcice in memory only when we need VTK output */ if (!(t%timesteps_per_plotting)) CopyFieldsFromDevice(collide_field, stream_field, xlength, &collide_field_d, &stream_field_d); } else { mlups_time = clock(); /* Copy pdfs from neighbouring cells into collide field */ DoStreaming(collide_field, stream_field, flag_field, xlength); /* Perform the swapping of collide and stream fields */ swap = collide_field; collide_field = stream_field; stream_field = swap; /* Compute post collision distributions */ DoCollision(collide_field, flag_field, tau, xlength); /* Treat boundaries */ TreatBoundary(collide_field, flag_field, wall_velocity, xlength); mlups_time = clock()-mlups_time; /* Print out the MLUPS value */ mlups_sum += num_cells/(MLUPS_EXPONENT*(float)mlups_time/CLOCKS_PER_SEC); if(VERBOSE) printf("MLUPS: %f\n", num_cells/(MLUPS_EXPONENT*(float)mlups_time/CLOCKS_PER_SEC)); } /* Print out vtk output if needed */ if (!(t%timesteps_per_plotting)) WriteVtkOutput(collide_field, flag_field, "img/lbm-img", t, xlength); } printf("Average MLUPS: %f\n", mlups_sum/(t+1)); if (VERBOSE) { if(gpu_enabled) CopyFieldsFromDevice(collide_field, stream_field, xlength, &collide_field_d, &stream_field_d); WriteField(collide_field, "img/collide-field", 0, xlength, gpu_enabled); writeFlagField(flag_field, "img/flag-field", xlength, gpu_enabled); } /* Free memory */ free(collide_field); free(stream_field); free(flag_field); FreeDeviceFields(&collide_field_d, &stream_field_d, &flag_field_d); printf("Simulation complete.\n"); return 0; } #endif
lbm_model.h
#ifndef _LBM_MODEL_H_ #define _LBM_MODEL_H_ #define VERBOSE 0 #define MLUPS_MIN_CELLS 300 #define MLUPS_EXPONENT 1000000 /* simulation parameters */ #define D_LBM 3 #define Q_LBM 19 /* arbitrary cell identifiers */ #define FLUID 0 #define NO_SLIP 1 #define MOVING_WALL 2 /* computation enhancing values */ #define EPS 0.05 #define C_S_POW2_INV 3.0 #define C_S_POW4_INV 9.0 /* reference values, not used in actual computation */ #define SQRT3 1.73205080756887729 static const float C_S = 1.0/SQRT3; /* predefined simulation parameters */ static const int LATTICE_VELOCITIES[19][3] = { {0,-1,-1},{-1,0,-1},{0,0,-1},{1,0,-1},{0,1,-1},{-1,-1,0},{0,-1,0},{1,-1,0}, {-1,0,0}, {0,0,0}, {1,0,0}, {-1,1,0},{0,1,0}, {1,1,0}, {0,-1,1},{-1,0,1}, {0,0,1}, {1,0,1}, {0,1,1} }; static const float LATTICE_WEIGHTS[19] = { 1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 2.0/36.0, 12.0/36.0,2.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0 }; /* additional cell definitions */ #define LEFT_BOUNDARY 5100 #define RIGHT_BOUNDARY 5900 #define BOTTOM_BOUNDARY 5010 #define TOP_BOUNDARY 5090 #define BACK_BOUNDARY 5001 #define FRONT_BOUNDARY 5009 #define LEFT_BOTTOM_EDGE 5110 #define LEFT_UPPER_EDGE 5190 #define RIGHT_BOTTOM_EDGE 5910 #define RIGHT_UPPER_EDGE 5990 #define BACK_BOTTOM_EDGE 5011 #define BACK_UPPER_EDGE 5091 #define FRONT_BOTTOM_EDGE 5019 #define FRONT_UPPER_EDGE 5099 #define LEFT_BACK_EDGE 5101 #define LEFT_FRONT_EDGE 5109 #define RIGHT_BACK_EDGE 5901 #define RIGHT_FRONT_EDGE 5909 #define LEFT_BOTTOM_BACK_CORNER 5111 #define LEFT_BOTTOM_FRONT_CORNER 5119 #define LEFT_UPPER_BACK_CORNER 5191 #define LEFT_UPPER_FRONT_CORNER 5199 #define RIGHT_BOTTOM_BACK_CORNER 5911 #define RIGHT_BOTTOM_FRONT_CORNER 5919 #define RIGHT_UPPER_BACK_CORNER 5991 #define RIGHT_UPPER_FRONT_CORNER 5999 /** * Validates the configured physical model by calculating characteristic numbers */ void ValidateModel(float wall_velocity[D_LBM], int domain_size, float tau); #endif
lbm_model.c
#include <math.h> #include <stdio.h> #include "lbm_model.h" #include "utils.h" void ValidateModel(float wall_velocity[D_LBM], int domain_size, float tau){ float u_wall_length, mach_number, reynolds_number; /* Compute Mach number and Reynolds number */ u_wall_length = sqrt(wall_velocity[0]*wall_velocity[0]+ wall_velocity[1]*wall_velocity[1]+ wall_velocity[2]*wall_velocity[2]); mach_number = u_wall_length*SQRT3; reynolds_number = u_wall_length*domain_size*C_S_POW2_INV/(tau-0.5); printf("Computed Mach number: %f\n", mach_number); printf("Computed Reynolds number: %f\n", reynolds_number); /* Check if characteristic numbers are correct */ if (mach_number >= 1) ERROR("Computed Mach number is too large."); if (reynolds_number > 500) ERROR("Computed Reynolds number is too large for simulation to be run on a laptop/pc."); }
GPU:
initialization_gpu.h
#ifndef _INITIALIZATION_GPU_H_ #define _INITIALIZATION_GPU_H_ /** * Copy everything from fields on DRAM in GPU */ void InitialiseDeviceFields(float *collide_field, float *stream_field,int *flag_field, int xlength, float **collide_field_d, float **stream_field_d,int **flag_field_d); /** * Free GPU memory */ void FreeDeviceFields(float **collide_field_d, float **stream_field_d,int **flag_field_d); /** * Copy data from device in global memory. */ void CopyFieldsFromDevice(float *collide_field, float *stream_field, int xlength, float **collide_field_dd, float **stream_field_dd); #endif
initialization_gpu.cu
#include "initialization_gpu.h" #include "lbm_model.h" #include "utils_gpu.h" void InitialiseDeviceFields(float *collide_field, float *stream_field,int *flag_field, int xlength, float **collide_field_d, float **stream_field_d,int **flag_field_d){ int num_cells = pow(xlength+2, D_LBM); size_t computational_field_size = Q_LBM*num_cells*sizeof(float); size_t flag_field_size = num_cells*sizeof(int); cudaErrorCheck(cudaMalloc(collide_field_d, computational_field_size)); cudaErrorCheck(cudaMemcpy(*collide_field_d, collide_field, computational_field_size, cudaMemcpyHostToDevice)); cudaErrorCheck(cudaMalloc(stream_field_d, computational_field_size)); cudaErrorCheck(cudaMemcpy(*stream_field_d, stream_field, computational_field_size, cudaMemcpyHostToDevice)); cudaErrorCheck(cudaMalloc(flag_field_d, flag_field_size)); cudaErrorCheck(cudaMemcpy(*flag_field_d, flag_field, flag_field_size, cudaMemcpyHostToDevice)); } void FreeDeviceFields(float **collide_field_d, float **stream_field_d,int **flag_field_d){ cudaErrorCheck(cudaFree(*collide_field_d)); cudaErrorCheck(cudaFree(*stream_field_d)); cudaErrorCheck(cudaFree(*flag_field_d)); } void CopyFieldsFromDevice(float *collide_field, float *stream_field, int xlength, float **collide_field_dd, float **stream_field_dd) { int num_cells = pow(xlength+2, D_LBM); size_t computational_field_size = Q_LBM*num_cells*sizeof(float); cudaErrorCheck(cudaMemcpy(collide_field, *collide_field_dd, computational_field_size, cudaMemcpyDeviceToHost)); cudaErrorCheck(cudaMemcpy(stream_field, *stream_field_dd, computational_field_size, cudaMemcpyDeviceToHost)); }
lbm_solver_gpu.h
#ifndef _COLLISION_GPU_H_ #define _COLLISION_GPU_H_ /** * Performs streaming, collision and boundary treatment. */ void DoIteration(float *collide_field, float *stream_field, int *flag_field, float tau, float *wall_velocity, int xlength, float **collide_field_d, float **stream_field_d, int **flag_field_d, float *mlups_sum); #endif
lbm_solver_gpu.cu
#include <math.h> #include <stdio.h> #include "lbm_solver_gpu.h" #include "lbm_model.h" #include "lbm_model_gpu.cuh" #include "utils.h" #include "utils_gpu.h" #include "cell_computation_gpu.cuh" __constant__ float tau_d, wall_velocity_d[D_LBM]; __constant__ int xlength_d, num_cells_d; __device__ float *stream_field_d, *collide_field_d; /** * Computes the post-collision distribution functions according to the BGK update rule and * stores the results again at the same position. */ __device__ void ComputePostCollisionDistributionsGpu(float *current_cell, float *feq){ current_cell[0]=current_cell[0]-(current_cell[0]-feq[0])/tau_d; current_cell[1]=current_cell[1]-(current_cell[1]-feq[1])/tau_d; current_cell[2]=current_cell[2]-(current_cell[2]-feq[2])/tau_d; current_cell[3]=current_cell[3]-(current_cell[3]-feq[3])/tau_d; current_cell[4]=current_cell[4]-(current_cell[4]-feq[4])/tau_d; current_cell[5]=current_cell[5]-(current_cell[5]-feq[5])/tau_d; current_cell[6]=current_cell[6]-(current_cell[6]-feq[6])/tau_d; current_cell[7]=current_cell[7]-(current_cell[7]-feq[7])/tau_d; current_cell[8]=current_cell[8]-(current_cell[8]-feq[8])/tau_d; current_cell[9]=current_cell[9]-(current_cell[9]-feq[9])/tau_d; current_cell[10]=current_cell[10]-(current_cell[10]-feq[10])/tau_d; current_cell[11]=current_cell[11]-(current_cell[11]-feq[11])/tau_d; current_cell[12]=current_cell[12]-(current_cell[12]-feq[12])/tau_d; current_cell[13]=current_cell[13]-(current_cell[13]-feq[13])/tau_d; current_cell[14]=current_cell[14]-(current_cell[14]-feq[14])/tau_d; current_cell[15]=current_cell[15]-(current_cell[15]-feq[15])/tau_d; current_cell[16]=current_cell[16]-(current_cell[16]-feq[16])/tau_d; current_cell[17]=current_cell[17]-(current_cell[17]-feq[17])/tau_d; current_cell[18]=current_cell[18]-(current_cell[18]-feq[18])/tau_d; } /* * Performs streaming on cells. */ __global__ void DoStreaming(){ int x = threadIdx.x+blockIdx.x*blockDim.x; int y = threadIdx.y+blockIdx.y*blockDim.y; int z = threadIdx.z+blockIdx.z*blockDim.z; int step=xlength_d+2, idx=x+y*step+z*step*step, nx, ny, nz; /* check that indices are within the bounds since there could be more threads than needed */ if (0<x && x<(step-1) && 0<y && y<(step-1) && 0<z && z<(step-1)){ nx=x-LATTICE_VELOCITIES_D[0][0]; ny=y-LATTICE_VELOCITIES_D[0][1]; nz=z-LATTICE_VELOCITIES_D[0][2]; stream_field_d[Q_LBM*idx]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)]; nx=x-LATTICE_VELOCITIES_D[1][0]; ny=y-LATTICE_VELOCITIES_D[1][1]; nz=z-LATTICE_VELOCITIES_D[1][2]; stream_field_d[Q_LBM*idx+1]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+1]; nx=x-LATTICE_VELOCITIES_D[2][0]; ny=y-LATTICE_VELOCITIES_D[2][1]; nz=z-LATTICE_VELOCITIES_D[2][2]; stream_field_d[Q_LBM*idx+2]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+2]; nx=x-LATTICE_VELOCITIES_D[3][0]; ny=y-LATTICE_VELOCITIES_D[3][1]; nz=z-LATTICE_VELOCITIES_D[3][2]; stream_field_d[Q_LBM*idx+3]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+3]; nx=x-LATTICE_VELOCITIES_D[4][0]; ny=y-LATTICE_VELOCITIES_D[4][1]; nz=z-LATTICE_VELOCITIES_D[4][2]; stream_field_d[Q_LBM*idx+4]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+4]; nx=x-LATTICE_VELOCITIES_D[5][0]; ny=y-LATTICE_VELOCITIES_D[5][1]; nz=z-LATTICE_VELOCITIES_D[5][2]; stream_field_d[Q_LBM*idx+5]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+5]; nx=x-LATTICE_VELOCITIES_D[6][0]; ny=y-LATTICE_VELOCITIES_D[6][1]; nz=z-LATTICE_VELOCITIES_D[6][2]; stream_field_d[Q_LBM*idx+6]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+6]; nx=x-LATTICE_VELOCITIES_D[7][0]; ny=y-LATTICE_VELOCITIES_D[7][1]; nz=z-LATTICE_VELOCITIES_D[7][2]; stream_field_d[Q_LBM*idx+7]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+7]; nx=x-LATTICE_VELOCITIES_D[8][0]; ny=y-LATTICE_VELOCITIES_D[8][1]; nz=z-LATTICE_VELOCITIES_D[8][2]; stream_field_d[Q_LBM*idx+8]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+8]; nx=x-LATTICE_VELOCITIES_D[9][0]; ny=y-LATTICE_VELOCITIES_D[9][1]; nz=z-LATTICE_VELOCITIES_D[9][2]; stream_field_d[Q_LBM*idx+9]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+9]; nx=x-LATTICE_VELOCITIES_D[10][0]; ny=y-LATTICE_VELOCITIES_D[10][1]; nz=z-LATTICE_VELOCITIES_D[10][2]; stream_field_d[Q_LBM*idx+10]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+10]; nx=x-LATTICE_VELOCITIES_D[11][0]; ny=y-LATTICE_VELOCITIES_D[11][1]; nz=z-LATTICE_VELOCITIES_D[11][2]; stream_field_d[Q_LBM*idx+11]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+11]; nx=x-LATTICE_VELOCITIES_D[12][0]; ny=y-LATTICE_VELOCITIES_D[12][1]; nz=z-LATTICE_VELOCITIES_D[12][2]; stream_field_d[Q_LBM*idx+12]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+12]; nx=x-LATTICE_VELOCITIES_D[13][0]; ny=y-LATTICE_VELOCITIES_D[13][1]; nz=z-LATTICE_VELOCITIES_D[13][2]; stream_field_d[Q_LBM*idx+13]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+13]; nx=x-LATTICE_VELOCITIES_D[14][0]; ny=y-LATTICE_VELOCITIES_D[14][1]; nz=z-LATTICE_VELOCITIES_D[14][2]; stream_field_d[Q_LBM*idx+14]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+14]; nx=x-LATTICE_VELOCITIES_D[15][0]; ny=y-LATTICE_VELOCITIES_D[15][1]; nz=z-LATTICE_VELOCITIES_D[15][2]; stream_field_d[Q_LBM*idx+15]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+15]; nx=x-LATTICE_VELOCITIES_D[16][0]; ny=y-LATTICE_VELOCITIES_D[16][1]; nz=z-LATTICE_VELOCITIES_D[16][2]; stream_field_d[Q_LBM*idx+16]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+16]; nx=x-LATTICE_VELOCITIES_D[17][0]; ny=y-LATTICE_VELOCITIES_D[17][1]; nz=z-LATTICE_VELOCITIES_D[17][2]; stream_field_d[Q_LBM*idx+17]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+17]; nx=x-LATTICE_VELOCITIES_D[18][0]; ny=y-LATTICE_VELOCITIES_D[18][1]; nz=z-LATTICE_VELOCITIES_D[18][2]; stream_field_d[Q_LBM*idx+18]=collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+18]; } } /* * Performs collision computation. */ __global__ void DoCollision(){ int x = threadIdx.x+blockIdx.x*blockDim.x; int y = threadIdx.y+blockIdx.y*blockDim.y; int z = threadIdx.z+blockIdx.z*blockDim.z; int step=xlength_d+2, idx=x+y*step+z*step*step; float density, velocity[D_LBM], feq[Q_LBM], *current_cell_s; /* check that indices are within the bounds since there could be more threads than needed */ if (0<x && x<(step-1) && 0<y && y<(step-1) && 0<z && z<(step-1)){ current_cell_s=&collide_field_d[Q_LBM*idx]; ComputeDensityGpu(current_cell_s,&density); ComputeVelocityGpu(current_cell_s,&density,velocity); ComputeFeqGpu(&density,velocity,feq); ComputePostCollisionDistributionsGpu(current_cell_s,feq); } } /* * Computes proper boundary values. */ __global__ void TreatBoundary(int *flag_field_d){ int x = threadIdx.x+blockIdx.x*blockDim.x; int y = threadIdx.y+blockIdx.y*blockDim.y; int z = threadIdx.z+blockIdx.z*blockDim.z; int step=xlength_d+2, idx=x+y*step+z*step*step, nx, ny, nz, i, boundary_side=0, boundary_idx=100500; float density, dot_prod; if(idx<num_cells_d) { if(flag_field_d[idx] == BOTTOM_BOUNDARY) { boundary_side = BOTTOM_BOUNDARY; boundary_idx = BOTTOM_BOUNDARY_IDX; } else if (flag_field_d[idx] == LEFT_BOUNDARY) { boundary_side = LEFT_BOUNDARY; boundary_idx = LEFT_BOUNDARY_IDX; } else if (flag_field_d[idx] == RIGHT_BOUNDARY) { boundary_side = RIGHT_BOUNDARY; boundary_idx = RIGHT_BOUNDARY_IDX; } else if (flag_field_d[idx] == BACK_BOUNDARY) { boundary_side = BACK_BOUNDARY; boundary_idx = BACK_BOUNDARY_IDX; } else if (flag_field_d[idx] == FRONT_BOUNDARY) { boundary_side = FRONT_BOUNDARY; boundary_idx = FRONT_BOUNDARY_IDX; } else if (flag_field_d[idx] == LEFT_BOTTOM_EDGE) { boundary_side = LEFT_BOTTOM_EDGE; boundary_idx = 13; } else if (flag_field_d[idx] == RIGHT_BOTTOM_EDGE) { boundary_side = RIGHT_BOTTOM_EDGE; boundary_idx = 11; } else if (flag_field_d[idx] == BACK_BOTTOM_EDGE) { boundary_side = BACK_BOTTOM_EDGE; boundary_idx = 18; } else if (flag_field_d[idx] == FRONT_BOTTOM_EDGE) { boundary_side = FRONT_BOTTOM_EDGE; boundary_idx = 4; } else if (flag_field_d[idx] == LEFT_BACK_EDGE) { boundary_side = LEFT_BACK_EDGE; boundary_idx = 17; } else if (flag_field_d[idx] == LEFT_FRONT_EDGE) { boundary_side = LEFT_FRONT_EDGE; boundary_idx = 3; } else if (flag_field_d[idx] == RIGHT_BACK_EDGE) { boundary_side = RIGHT_BACK_EDGE; boundary_idx = 15; } else if (flag_field_d[idx] == RIGHT_FRONT_EDGE) { boundary_side = RIGHT_FRONT_EDGE; boundary_idx = 1; } else if (flag_field_d[idx] == LEFT_UPPER_EDGE) { boundary_side = LEFT_UPPER_EDGE; boundary_idx = 7; } else if (flag_field_d[idx] == RIGHT_UPPER_EDGE) { boundary_side = RIGHT_UPPER_EDGE; boundary_idx = 5; } else if (flag_field_d[idx] == BACK_UPPER_EDGE) { boundary_side = BACK_UPPER_EDGE; boundary_idx = 14; } else if (flag_field_d[idx] == FRONT_UPPER_EDGE) { boundary_side = FRONT_UPPER_EDGE; boundary_idx = 0; } else if (flag_field_d[idx] == TOP_BOUNDARY) { boundary_side = TOP_BOUNDARY; boundary_idx = TOP_BOUNDARY_IDX; } if( boundary_side==LEFT_BOUNDARY || boundary_side==RIGHT_BOUNDARY || boundary_side==BOTTOM_BOUNDARY || boundary_side==BACK_BOUNDARY || boundary_side==FRONT_BOUNDARY) { i = TREAT_BOUNDARY_INDECES[boundary_idx][0]; nx=x+LATTICE_VELOCITIES_D[i][0]; ny=y+LATTICE_VELOCITIES_D[i][1]; nz=z+LATTICE_VELOCITIES_D[i][2]; collide_field_d[Q_LBM*(x+y*step+z*step*step)+i]= collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]; i = TREAT_BOUNDARY_INDECES[boundary_idx][1]; nx=x+LATTICE_VELOCITIES_D[i][0]; ny=y+LATTICE_VELOCITIES_D[i][1]; nz=z+LATTICE_VELOCITIES_D[i][2]; collide_field_d[Q_LBM*(x+y*step+z*step*step)+i]= collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]; i = TREAT_BOUNDARY_INDECES[boundary_idx][2]; nx=x+LATTICE_VELOCITIES_D[i][0]; ny=y+LATTICE_VELOCITIES_D[i][1]; nz=z+LATTICE_VELOCITIES_D[i][2]; collide_field_d[Q_LBM*(x+y*step+z*step*step)+i]= collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]; i = TREAT_BOUNDARY_INDECES[boundary_idx][3]; nx=x+LATTICE_VELOCITIES_D[i][0]; ny=y+LATTICE_VELOCITIES_D[i][1]; nz=z+LATTICE_VELOCITIES_D[i][2]; collide_field_d[Q_LBM*(x+y*step+z*step*step)+i]= collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]; i = TREAT_BOUNDARY_INDECES[boundary_idx][4]; nx=x+LATTICE_VELOCITIES_D[i][0]; ny=y+LATTICE_VELOCITIES_D[i][1]; nz=z+LATTICE_VELOCITIES_D[i][2]; collide_field_d[Q_LBM*(x+y*step+z*step*step)+i]= collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]; } else if (boundary_side == LEFT_BOTTOM_EDGE || boundary_side == RIGHT_BOTTOM_EDGE || boundary_side == BACK_BOTTOM_EDGE || boundary_side == FRONT_BOTTOM_EDGE || boundary_side == LEFT_BACK_EDGE || boundary_side == LEFT_FRONT_EDGE || boundary_side == RIGHT_BACK_EDGE || boundary_side == RIGHT_FRONT_EDGE) { nx=x+LATTICE_VELOCITIES_D[boundary_idx][0]; ny=y+LATTICE_VELOCITIES_D[boundary_idx][1]; nz=z+LATTICE_VELOCITIES_D[boundary_idx][2]; collide_field_d[Q_LBM*(x+y*step+z*step*step)+boundary_idx]= collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(boundary_idx)]; } else if(boundary_side == LEFT_UPPER_EDGE || boundary_side == RIGHT_UPPER_EDGE || boundary_side == BACK_UPPER_EDGE || boundary_side == FRONT_UPPER_EDGE) { i = boundary_idx; nx=x+LATTICE_VELOCITIES_D[i][0]; ny=y+LATTICE_VELOCITIES_D[i][1]; nz=z+LATTICE_VELOCITIES_D[i][2]; /* Compute density in the neighbour cell */ ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density); /* Compute dot product */ dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+ LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+ LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2]; /* Assign the boudary cell value */ collide_field_d[Q_LBM*(idx)+i]= collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+ 2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod; } else if(boundary_side == TOP_BOUNDARY) { i = TREAT_BOUNDARY_INDECES[boundary_idx][0]; nx=x+LATTICE_VELOCITIES_D[i][0]; ny=y+LATTICE_VELOCITIES_D[i][1]; nz=z+LATTICE_VELOCITIES_D[i][2]; ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density); dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+ LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+ LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2]; collide_field_d[Q_LBM*(idx)+i]= collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+ 2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod; i = TREAT_BOUNDARY_INDECES[boundary_idx][1]; nx=x+LATTICE_VELOCITIES_D[i][0]; ny=y+LATTICE_VELOCITIES_D[i][1]; nz=z+LATTICE_VELOCITIES_D[i][2]; ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density); dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+ LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+ LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2]; collide_field_d[Q_LBM*(idx)+i]= collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+ 2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod; i = TREAT_BOUNDARY_INDECES[boundary_idx][2]; nx=x+LATTICE_VELOCITIES_D[i][0]; ny=y+LATTICE_VELOCITIES_D[i][1]; nz=z+LATTICE_VELOCITIES_D[i][2]; ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density); dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+ LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+ LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2]; collide_field_d[Q_LBM*(idx)+i]= collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+ 2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod; i = TREAT_BOUNDARY_INDECES[boundary_idx][3]; nx=x+LATTICE_VELOCITIES_D[i][0]; ny=y+LATTICE_VELOCITIES_D[i][1]; nz=z+LATTICE_VELOCITIES_D[i][2]; ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density); dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+ LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+ LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2]; collide_field_d[Q_LBM*(idx)+i]= collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+ 2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod; i = TREAT_BOUNDARY_INDECES[boundary_idx][4]; nx=x+LATTICE_VELOCITIES_D[i][0]; ny=y+LATTICE_VELOCITIES_D[i][1]; nz=z+LATTICE_VELOCITIES_D[i][2]; ComputeDensityGpu(&collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)],&density); dot_prod=LATTICE_VELOCITIES_D[i][0]*wall_velocity_d[0]+ LATTICE_VELOCITIES_D[i][1]*wall_velocity_d[1]+ LATTICE_VELOCITIES_D[i][2]*wall_velocity_d[2]; collide_field_d[Q_LBM*(idx)+i]= collide_field_d[Q_LBM*(nx+ny*step+nz*step*step)+InvGpu(i)]+ 2*LATTICE_WEIGHTS_D[i]*density*C_S_POW2_INV*dot_prod; } } } /* * Performs pointer swap on GPU. */ __global__ void DoSwap(){ float *swap=collide_field_d; collide_field_d=stream_field_d; stream_field_d=swap; } void DoIteration(float *collide_field, float *stream_field, int *flag_field, float tau, float *wall_velocity, int xlength, float **collide_field_dd, float **stream_field_dd, int **flag_field_d, float *mlups_sum){ int num_cells = pow(xlength+2, D_LBM); clock_t mlups_time; /* initialize constant data */ cudaErrorCheck(cudaMemcpyToSymbol(xlength_d, &xlength, sizeof(int), 0, cudaMemcpyHostToDevice)); cudaErrorCheck(cudaMemcpyToSymbol(num_cells_d, &num_cells, sizeof(int), 0, cudaMemcpyHostToDevice)); cudaErrorCheck(cudaMemcpyToSymbol(tau_d, &tau, sizeof(float), 0, cudaMemcpyHostToDevice)); cudaErrorCheck(cudaMemcpyToSymbol(wall_velocity_d, wall_velocity, D_LBM*sizeof(float), 0, cudaMemcpyHostToDevice)); cudaErrorCheck(cudaMemcpyToSymbol(collide_field_d, collide_field_dd, sizeof(*collide_field_dd), 0, cudaMemcpyHostToDevice)); cudaErrorCheck(cudaMemcpyToSymbol(stream_field_d, stream_field_dd, sizeof(*stream_field_dd), 0, cudaMemcpyHostToDevice)); /* define grid structure */ dim3 block(BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE); dim3 grid((xlength+2+block.x-1)/block.x, (xlength+2+block.y-1)/block.y, (xlength+2+block.z-1)/block.z); mlups_time = clock(); /* perform streaming */ DoStreaming<<<grid,block>>>(); cudaErrorCheck(cudaThreadSynchronize()); cudaErrorCheck(cudaPeekAtLastError()); /* Perform the swapping of collide and stream fields */ DoSwap<<<1,1>>>(); cudaErrorCheck(cudaThreadSynchronize()); cudaErrorCheck(cudaPeekAtLastError()); /* perform collision */ DoCollision<<<grid,block>>>(); cudaErrorCheck(cudaThreadSynchronize()); cudaErrorCheck(cudaPeekAtLastError()); /* perform boundary treatment */ TreatBoundary<<<grid,block>>>(*flag_field_d); cudaErrorCheck(cudaPeekAtLastError()); mlups_time = clock()-mlups_time; *mlups_sum += num_cells/(MLUPS_EXPONENT*(float)mlups_time/CLOCKS_PER_SEC); if(VERBOSE) printf("MLUPS: %f\n", num_cells/(MLUPS_EXPONENT*(float)mlups_time/CLOCKS_PER_SEC)); /* copy data back to host */ cudaErrorCheck(cudaMemcpyFromSymbol(collide_field_dd, collide_field_d, sizeof(*collide_field_dd), 0, cudaMemcpyDeviceToHost)); cudaErrorCheck(cudaMemcpyFromSymbol(stream_field_dd, stream_field_d, sizeof(*stream_field_dd), 0, cudaMemcpyDeviceToHost)); }
cell_computation_gpu.h
#ifndef _CELL_COMPUTATION_GPU_CUH_ #define _CELL_COMPUTATION_GPU_CUH_ /** * Computes the density from the particle distribution functions stored at currentCell. * currentCell thus denotes the address of the first particle distribution function of the * respective cell. The result is stored in density. */ __device__ void ComputeDensityGpu(float *current_cell, float *density){ *density=0; *density+=current_cell[0]; *density+=current_cell[1]; *density+=current_cell[2]; *density+=current_cell[3]; *density+=current_cell[4]; *density+=current_cell[5]; *density+=current_cell[6]; *density+=current_cell[7]; *density+=current_cell[8]; *density+=current_cell[9]; *density+=current_cell[10]; *density+=current_cell[11]; *density+=current_cell[12]; *density+=current_cell[13]; *density+=current_cell[14]; *density+=current_cell[15]; *density+=current_cell[16]; *density+=current_cell[17]; *density+=current_cell[18]; } /** * Computes the velocity within currentCell and stores the result in velocity */ __device__ void ComputeVelocityGpu(float *current_cell, float *density, float *velocity){ velocity[0]=0; velocity[0]+=current_cell[0]*LATTICE_VELOCITIES_D[0][0]; velocity[0]+=current_cell[1]*LATTICE_VELOCITIES_D[1][0]; velocity[0]+=current_cell[2]*LATTICE_VELOCITIES_D[2][0]; velocity[0]+=current_cell[3]*LATTICE_VELOCITIES_D[3][0]; velocity[0]+=current_cell[4]*LATTICE_VELOCITIES_D[4][0]; velocity[0]+=current_cell[5]*LATTICE_VELOCITIES_D[5][0]; velocity[0]+=current_cell[6]*LATTICE_VELOCITIES_D[6][0]; velocity[0]+=current_cell[7]*LATTICE_VELOCITIES_D[7][0]; velocity[0]+=current_cell[8]*LATTICE_VELOCITIES_D[8][0]; velocity[0]+=current_cell[9]*LATTICE_VELOCITIES_D[9][0]; velocity[0]+=current_cell[10]*LATTICE_VELOCITIES_D[10][0]; velocity[0]+=current_cell[11]*LATTICE_VELOCITIES_D[11][0]; velocity[0]+=current_cell[12]*LATTICE_VELOCITIES_D[12][0]; velocity[0]+=current_cell[13]*LATTICE_VELOCITIES_D[13][0]; velocity[0]+=current_cell[14]*LATTICE_VELOCITIES_D[14][0]; velocity[0]+=current_cell[15]*LATTICE_VELOCITIES_D[15][0]; velocity[0]+=current_cell[16]*LATTICE_VELOCITIES_D[16][0]; velocity[0]+=current_cell[17]*LATTICE_VELOCITIES_D[17][0]; velocity[0]+=current_cell[18]*LATTICE_VELOCITIES_D[18][0]; velocity[0]/=*density; velocity[1]=0; velocity[1]+=current_cell[0]*LATTICE_VELOCITIES_D[0][1]; velocity[1]+=current_cell[1]*LATTICE_VELOCITIES_D[1][1]; velocity[1]+=current_cell[2]*LATTICE_VELOCITIES_D[2][1]; velocity[1]+=current_cell[3]*LATTICE_VELOCITIES_D[3][1]; velocity[1]+=current_cell[4]*LATTICE_VELOCITIES_D[4][1]; velocity[1]+=current_cell[5]*LATTICE_VELOCITIES_D[5][1]; velocity[1]+=current_cell[6]*LATTICE_VELOCITIES_D[6][1]; velocity[1]+=current_cell[7]*LATTICE_VELOCITIES_D[7][1]; velocity[1]+=current_cell[8]*LATTICE_VELOCITIES_D[8][1]; velocity[1]+=current_cell[9]*LATTICE_VELOCITIES_D[9][1]; velocity[1]+=current_cell[10]*LATTICE_VELOCITIES_D[10][1]; velocity[1]+=current_cell[11]*LATTICE_VELOCITIES_D[11][1]; velocity[1]+=current_cell[12]*LATTICE_VELOCITIES_D[12][1]; velocity[1]+=current_cell[13]*LATTICE_VELOCITIES_D[13][1]; velocity[1]+=current_cell[14]*LATTICE_VELOCITIES_D[14][1]; velocity[1]+=current_cell[15]*LATTICE_VELOCITIES_D[15][1]; velocity[1]+=current_cell[16]*LATTICE_VELOCITIES_D[16][1]; velocity[1]+=current_cell[17]*LATTICE_VELOCITIES_D[17][1]; velocity[1]+=current_cell[18]*LATTICE_VELOCITIES_D[18][1]; velocity[1]/=*density; velocity[2]=0; velocity[2]+=current_cell[0]*LATTICE_VELOCITIES_D[0][2]; velocity[2]+=current_cell[1]*LATTICE_VELOCITIES_D[1][2]; velocity[2]+=current_cell[2]*LATTICE_VELOCITIES_D[2][2]; velocity[2]+=current_cell[3]*LATTICE_VELOCITIES_D[3][2]; velocity[2]+=current_cell[4]*LATTICE_VELOCITIES_D[4][2]; velocity[2]+=current_cell[5]*LATTICE_VELOCITIES_D[5][2]; velocity[2]+=current_cell[6]*LATTICE_VELOCITIES_D[6][2]; velocity[2]+=current_cell[7]*LATTICE_VELOCITIES_D[7][2]; velocity[2]+=current_cell[8]*LATTICE_VELOCITIES_D[8][2]; velocity[2]+=current_cell[9]*LATTICE_VELOCITIES_D[9][2]; velocity[2]+=current_cell[10]*LATTICE_VELOCITIES_D[10][2]; velocity[2]+=current_cell[11]*LATTICE_VELOCITIES_D[11][2]; velocity[2]+=current_cell[12]*LATTICE_VELOCITIES_D[12][2]; velocity[2]+=current_cell[13]*LATTICE_VELOCITIES_D[13][2]; velocity[2]+=current_cell[14]*LATTICE_VELOCITIES_D[14][2]; velocity[2]+=current_cell[15]*LATTICE_VELOCITIES_D[15][2]; velocity[2]+=current_cell[16]*LATTICE_VELOCITIES_D[16][2]; velocity[2]+=current_cell[17]*LATTICE_VELOCITIES_D[17][2]; velocity[2]+=current_cell[18]*LATTICE_VELOCITIES_D[18][2]; velocity[2]/=*density; } /** * Computes the equilibrium distributions for all particle distribution functions of one * cell from density and velocity and stores the results in feq. */ __device__ void ComputeFeqGpu(float *density, float *velocity, float *feq){ float s1, s2, s3=velocity[0]*velocity[0]+velocity[1]*velocity[1]+velocity[2]*velocity[2]; s1 = LATTICE_VELOCITIES_D[0][0]*velocity[0]+LATTICE_VELOCITIES_D[0][1]*velocity[1]+LATTICE_VELOCITIES_D[0][2]*velocity[2]; s2 = s1*s1; feq[0]=LATTICE_WEIGHTS_D[0]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[1][0]*velocity[0]+LATTICE_VELOCITIES_D[1][1]*velocity[1]+LATTICE_VELOCITIES_D[1][2]*velocity[2]; s2 = s1*s1; feq[1]=LATTICE_WEIGHTS_D[1]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[2][0]*velocity[0]+LATTICE_VELOCITIES_D[2][1]*velocity[1]+LATTICE_VELOCITIES_D[2][2]*velocity[2]; s2 = s1*s1; feq[2]=LATTICE_WEIGHTS_D[2]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[3][0]*velocity[0]+LATTICE_VELOCITIES_D[3][1]*velocity[1]+LATTICE_VELOCITIES_D[3][2]*velocity[2]; s2 = s1*s1; feq[3]=LATTICE_WEIGHTS_D[3]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[4][0]*velocity[0]+LATTICE_VELOCITIES_D[4][1]*velocity[1]+LATTICE_VELOCITIES_D[4][2]*velocity[2]; s2 = s1*s1; feq[4]=LATTICE_WEIGHTS_D[4]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[5][0]*velocity[0]+LATTICE_VELOCITIES_D[5][1]*velocity[1]+LATTICE_VELOCITIES_D[5][2]*velocity[2]; s2 = s1*s1; feq[5]=LATTICE_WEIGHTS_D[5]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[6][0]*velocity[0]+LATTICE_VELOCITIES_D[6][1]*velocity[1]+LATTICE_VELOCITIES_D[6][2]*velocity[2]; s2 = s1*s1; feq[6]=LATTICE_WEIGHTS_D[6]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[7][0]*velocity[0]+LATTICE_VELOCITIES_D[7][1]*velocity[1]+LATTICE_VELOCITIES_D[7][2]*velocity[2]; s2 = s1*s1; feq[7]=LATTICE_WEIGHTS_D[7]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[8][0]*velocity[0]+LATTICE_VELOCITIES_D[8][1]*velocity[1]+LATTICE_VELOCITIES_D[8][2]*velocity[2]; s2 = s1*s1; feq[8]=LATTICE_WEIGHTS_D[8]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[9][0]*velocity[0]+LATTICE_VELOCITIES_D[9][1]*velocity[1]+LATTICE_VELOCITIES_D[9][2]*velocity[2]; s2 = s1*s1; feq[9]=LATTICE_WEIGHTS_D[9]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[10][0]*velocity[0]+LATTICE_VELOCITIES_D[10][1]*velocity[1]+LATTICE_VELOCITIES_D[10][2]*velocity[2]; s2 = s1*s1; feq[10]=LATTICE_WEIGHTS_D[10]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[11][0]*velocity[0]+LATTICE_VELOCITIES_D[11][1]*velocity[1]+LATTICE_VELOCITIES_D[11][2]*velocity[2]; s2 = s1*s1; feq[11]=LATTICE_WEIGHTS_D[11]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[12][0]*velocity[0]+LATTICE_VELOCITIES_D[12][1]*velocity[1]+LATTICE_VELOCITIES_D[12][2]*velocity[2]; s2 = s1*s1; feq[12]=LATTICE_WEIGHTS_D[12]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[13][0]*velocity[0]+LATTICE_VELOCITIES_D[13][1]*velocity[1]+LATTICE_VELOCITIES_D[13][2]*velocity[2]; s2 = s1*s1; feq[13]=LATTICE_WEIGHTS_D[13]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[14][0]*velocity[0]+LATTICE_VELOCITIES_D[14][1]*velocity[1]+LATTICE_VELOCITIES_D[14][2]*velocity[2]; s2 = s1*s1; feq[14]=LATTICE_WEIGHTS_D[14]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[15][0]*velocity[0]+LATTICE_VELOCITIES_D[15][1]*velocity[1]+LATTICE_VELOCITIES_D[15][2]*velocity[2]; s2 = s1*s1; feq[15]=LATTICE_WEIGHTS_D[15]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[16][0]*velocity[0]+LATTICE_VELOCITIES_D[16][1]*velocity[1]+LATTICE_VELOCITIES_D[16][2]*velocity[2]; s2 = s1*s1; feq[16]=LATTICE_WEIGHTS_D[16]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[17][0]*velocity[0]+LATTICE_VELOCITIES_D[17][1]*velocity[1]+LATTICE_VELOCITIES_D[17][2]*velocity[2]; s2 = s1*s1; feq[17]=LATTICE_WEIGHTS_D[17]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[18][0]*velocity[0]+LATTICE_VELOCITIES_D[18][1]*velocity[1]+LATTICE_VELOCITIES_D[18][2]*velocity[2]; s2 = s1*s1; feq[18]=LATTICE_WEIGHTS_D[18]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); } /** * Finds an inverse probability distribution for the provided lattice index. */ __device__ int InvGpu(int i){ return (Q_LBM-1)-i; } #endif
cell_computation_gpu.cu
#ifndef _CELL_COMPUTATION_GPU_CUH_ #define _CELL_COMPUTATION_GPU_CUH_ /** * Computes the density from the particle distribution functions stored at currentCell. * currentCell thus denotes the address of the first particle distribution function of the * respective cell. The result is stored in density. */ __device__ void ComputeDensityGpu(float *current_cell, float *density){ *density=0; *density+=current_cell[0]; *density+=current_cell[1]; *density+=current_cell[2]; *density+=current_cell[3]; *density+=current_cell[4]; *density+=current_cell[5]; *density+=current_cell[6]; *density+=current_cell[7]; *density+=current_cell[8]; *density+=current_cell[9]; *density+=current_cell[10]; *density+=current_cell[11]; *density+=current_cell[12]; *density+=current_cell[13]; *density+=current_cell[14]; *density+=current_cell[15]; *density+=current_cell[16]; *density+=current_cell[17]; *density+=current_cell[18]; } /** * Computes the velocity within currentCell and stores the result in velocity */ __device__ void ComputeVelocityGpu(float *current_cell, float *density, float *velocity){ velocity[0]=0; velocity[0]+=current_cell[0]*LATTICE_VELOCITIES_D[0][0]; velocity[0]+=current_cell[1]*LATTICE_VELOCITIES_D[1][0]; velocity[0]+=current_cell[2]*LATTICE_VELOCITIES_D[2][0]; velocity[0]+=current_cell[3]*LATTICE_VELOCITIES_D[3][0]; velocity[0]+=current_cell[4]*LATTICE_VELOCITIES_D[4][0]; velocity[0]+=current_cell[5]*LATTICE_VELOCITIES_D[5][0]; velocity[0]+=current_cell[6]*LATTICE_VELOCITIES_D[6][0]; velocity[0]+=current_cell[7]*LATTICE_VELOCITIES_D[7][0]; velocity[0]+=current_cell[8]*LATTICE_VELOCITIES_D[8][0]; velocity[0]+=current_cell[9]*LATTICE_VELOCITIES_D[9][0]; velocity[0]+=current_cell[10]*LATTICE_VELOCITIES_D[10][0]; velocity[0]+=current_cell[11]*LATTICE_VELOCITIES_D[11][0]; velocity[0]+=current_cell[12]*LATTICE_VELOCITIES_D[12][0]; velocity[0]+=current_cell[13]*LATTICE_VELOCITIES_D[13][0]; velocity[0]+=current_cell[14]*LATTICE_VELOCITIES_D[14][0]; velocity[0]+=current_cell[15]*LATTICE_VELOCITIES_D[15][0]; velocity[0]+=current_cell[16]*LATTICE_VELOCITIES_D[16][0]; velocity[0]+=current_cell[17]*LATTICE_VELOCITIES_D[17][0]; velocity[0]+=current_cell[18]*LATTICE_VELOCITIES_D[18][0]; velocity[0]/=*density; velocity[1]=0; velocity[1]+=current_cell[0]*LATTICE_VELOCITIES_D[0][1]; velocity[1]+=current_cell[1]*LATTICE_VELOCITIES_D[1][1]; velocity[1]+=current_cell[2]*LATTICE_VELOCITIES_D[2][1]; velocity[1]+=current_cell[3]*LATTICE_VELOCITIES_D[3][1]; velocity[1]+=current_cell[4]*LATTICE_VELOCITIES_D[4][1]; velocity[1]+=current_cell[5]*LATTICE_VELOCITIES_D[5][1]; velocity[1]+=current_cell[6]*LATTICE_VELOCITIES_D[6][1]; velocity[1]+=current_cell[7]*LATTICE_VELOCITIES_D[7][1]; velocity[1]+=current_cell[8]*LATTICE_VELOCITIES_D[8][1]; velocity[1]+=current_cell[9]*LATTICE_VELOCITIES_D[9][1]; velocity[1]+=current_cell[10]*LATTICE_VELOCITIES_D[10][1]; velocity[1]+=current_cell[11]*LATTICE_VELOCITIES_D[11][1]; velocity[1]+=current_cell[12]*LATTICE_VELOCITIES_D[12][1]; velocity[1]+=current_cell[13]*LATTICE_VELOCITIES_D[13][1]; velocity[1]+=current_cell[14]*LATTICE_VELOCITIES_D[14][1]; velocity[1]+=current_cell[15]*LATTICE_VELOCITIES_D[15][1]; velocity[1]+=current_cell[16]*LATTICE_VELOCITIES_D[16][1]; velocity[1]+=current_cell[17]*LATTICE_VELOCITIES_D[17][1]; velocity[1]+=current_cell[18]*LATTICE_VELOCITIES_D[18][1]; velocity[1]/=*density; velocity[2]=0; velocity[2]+=current_cell[0]*LATTICE_VELOCITIES_D[0][2]; velocity[2]+=current_cell[1]*LATTICE_VELOCITIES_D[1][2]; velocity[2]+=current_cell[2]*LATTICE_VELOCITIES_D[2][2]; velocity[2]+=current_cell[3]*LATTICE_VELOCITIES_D[3][2]; velocity[2]+=current_cell[4]*LATTICE_VELOCITIES_D[4][2]; velocity[2]+=current_cell[5]*LATTICE_VELOCITIES_D[5][2]; velocity[2]+=current_cell[6]*LATTICE_VELOCITIES_D[6][2]; velocity[2]+=current_cell[7]*LATTICE_VELOCITIES_D[7][2]; velocity[2]+=current_cell[8]*LATTICE_VELOCITIES_D[8][2]; velocity[2]+=current_cell[9]*LATTICE_VELOCITIES_D[9][2]; velocity[2]+=current_cell[10]*LATTICE_VELOCITIES_D[10][2]; velocity[2]+=current_cell[11]*LATTICE_VELOCITIES_D[11][2]; velocity[2]+=current_cell[12]*LATTICE_VELOCITIES_D[12][2]; velocity[2]+=current_cell[13]*LATTICE_VELOCITIES_D[13][2]; velocity[2]+=current_cell[14]*LATTICE_VELOCITIES_D[14][2]; velocity[2]+=current_cell[15]*LATTICE_VELOCITIES_D[15][2]; velocity[2]+=current_cell[16]*LATTICE_VELOCITIES_D[16][2]; velocity[2]+=current_cell[17]*LATTICE_VELOCITIES_D[17][2]; velocity[2]+=current_cell[18]*LATTICE_VELOCITIES_D[18][2]; velocity[2]/=*density; } /** * Computes the equilibrium distributions for all particle distribution functions of one * cell from density and velocity and stores the results in feq. */ __device__ void ComputeFeqGpu(float *density, float *velocity, float *feq){ float s1, s2, s3=velocity[0]*velocity[0]+velocity[1]*velocity[1]+velocity[2]*velocity[2]; s1 = LATTICE_VELOCITIES_D[0][0]*velocity[0]+LATTICE_VELOCITIES_D[0][1]*velocity[1]+LATTICE_VELOCITIES_D[0][2]*velocity[2]; s2 = s1*s1; feq[0]=LATTICE_WEIGHTS_D[0]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[1][0]*velocity[0]+LATTICE_VELOCITIES_D[1][1]*velocity[1]+LATTICE_VELOCITIES_D[1][2]*velocity[2]; s2 = s1*s1; feq[1]=LATTICE_WEIGHTS_D[1]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[2][0]*velocity[0]+LATTICE_VELOCITIES_D[2][1]*velocity[1]+LATTICE_VELOCITIES_D[2][2]*velocity[2]; s2 = s1*s1; feq[2]=LATTICE_WEIGHTS_D[2]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[3][0]*velocity[0]+LATTICE_VELOCITIES_D[3][1]*velocity[1]+LATTICE_VELOCITIES_D[3][2]*velocity[2]; s2 = s1*s1; feq[3]=LATTICE_WEIGHTS_D[3]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[4][0]*velocity[0]+LATTICE_VELOCITIES_D[4][1]*velocity[1]+LATTICE_VELOCITIES_D[4][2]*velocity[2]; s2 = s1*s1; feq[4]=LATTICE_WEIGHTS_D[4]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[5][0]*velocity[0]+LATTICE_VELOCITIES_D[5][1]*velocity[1]+LATTICE_VELOCITIES_D[5][2]*velocity[2]; s2 = s1*s1; feq[5]=LATTICE_WEIGHTS_D[5]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[6][0]*velocity[0]+LATTICE_VELOCITIES_D[6][1]*velocity[1]+LATTICE_VELOCITIES_D[6][2]*velocity[2]; s2 = s1*s1; feq[6]=LATTICE_WEIGHTS_D[6]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[7][0]*velocity[0]+LATTICE_VELOCITIES_D[7][1]*velocity[1]+LATTICE_VELOCITIES_D[7][2]*velocity[2]; s2 = s1*s1; feq[7]=LATTICE_WEIGHTS_D[7]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[8][0]*velocity[0]+LATTICE_VELOCITIES_D[8][1]*velocity[1]+LATTICE_VELOCITIES_D[8][2]*velocity[2]; s2 = s1*s1; feq[8]=LATTICE_WEIGHTS_D[8]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[9][0]*velocity[0]+LATTICE_VELOCITIES_D[9][1]*velocity[1]+LATTICE_VELOCITIES_D[9][2]*velocity[2]; s2 = s1*s1; feq[9]=LATTICE_WEIGHTS_D[9]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[10][0]*velocity[0]+LATTICE_VELOCITIES_D[10][1]*velocity[1]+LATTICE_VELOCITIES_D[10][2]*velocity[2]; s2 = s1*s1; feq[10]=LATTICE_WEIGHTS_D[10]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[11][0]*velocity[0]+LATTICE_VELOCITIES_D[11][1]*velocity[1]+LATTICE_VELOCITIES_D[11][2]*velocity[2]; s2 = s1*s1; feq[11]=LATTICE_WEIGHTS_D[11]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[12][0]*velocity[0]+LATTICE_VELOCITIES_D[12][1]*velocity[1]+LATTICE_VELOCITIES_D[12][2]*velocity[2]; s2 = s1*s1; feq[12]=LATTICE_WEIGHTS_D[12]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[13][0]*velocity[0]+LATTICE_VELOCITIES_D[13][1]*velocity[1]+LATTICE_VELOCITIES_D[13][2]*velocity[2]; s2 = s1*s1; feq[13]=LATTICE_WEIGHTS_D[13]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[14][0]*velocity[0]+LATTICE_VELOCITIES_D[14][1]*velocity[1]+LATTICE_VELOCITIES_D[14][2]*velocity[2]; s2 = s1*s1; feq[14]=LATTICE_WEIGHTS_D[14]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[15][0]*velocity[0]+LATTICE_VELOCITIES_D[15][1]*velocity[1]+LATTICE_VELOCITIES_D[15][2]*velocity[2]; s2 = s1*s1; feq[15]=LATTICE_WEIGHTS_D[15]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[16][0]*velocity[0]+LATTICE_VELOCITIES_D[16][1]*velocity[1]+LATTICE_VELOCITIES_D[16][2]*velocity[2]; s2 = s1*s1; feq[16]=LATTICE_WEIGHTS_D[16]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[17][0]*velocity[0]+LATTICE_VELOCITIES_D[17][1]*velocity[1]+LATTICE_VELOCITIES_D[17][2]*velocity[2]; s2 = s1*s1; feq[17]=LATTICE_WEIGHTS_D[17]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); s1 = LATTICE_VELOCITIES_D[18][0]*velocity[0]+LATTICE_VELOCITIES_D[18][1]*velocity[1]+LATTICE_VELOCITIES_D[18][2]*velocity[2]; s2 = s1*s1; feq[18]=LATTICE_WEIGHTS_D[18]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); } /** * Finds an inverse probability distribution for the provided lattice index. */ __device__ int InvGpu(int i){ return (Q_LBM-1)-i; } #endif 10 AM Meeting
lbm_model_gpu.cuh
#ifndef _LBM_MODEL_GPU_CUH_ #define _LBM_MODEL_GPU_CUH_ /* Restricts 3D blocks to have 512 threads (limits: 512 CC<2.x; 1024 CC>2.x) */ #define BLOCK_SIZE 8 /* Arbitrary boundary identifiers */ #define LEFT_BOUNDARY_IDX 0 #define RIGHT_BOUNDARY_IDX 1 #define BOTTOM_BOUNDARY_IDX 2 #define TOP_BOUNDARY_IDX 3 #define BACK_BOUNDARY_IDX 4 #define FRONT_BOUNDARY_IDX 5 __constant__ static const int LATTICE_VELOCITIES_D[19][3] = { {0,-1,-1},{-1,0,-1},{0,0,-1},{1,0,-1},{0,1,-1},{-1,-1,0},{0,-1,0},{1,-1,0}, {-1,0,0}, {0,0,0}, {1,0,0}, {-1,1,0},{0,1,0}, {1,1,0}, {0,-1,1},{-1,0,1}, {0,0,1}, {1,0,1}, {0,1,1} }; __constant__ static const float LATTICE_WEIGHTS_D[19] = { 1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 2.0/36.0, 12.0/36.0,2.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0, 1.0/36.0, 2.0/36.0, 1.0/36.0, 1.0/36.0 }; /** * Stores number of probability distributions which we need to copy in boundary treatment step. */ __constant__ static const float TREAT_BOUNDARY_INDECES[6][5] = { {3,7,10,13,17}, {1,5,8,11,15}, {4,11,12,13,18}, {0,5,6,7,14}, {14,15,16,17,18}, {0,1,2,3,4} }; #endif
CPU:
initialization.h
#ifndef _INITIALIZATION_H_ #define _INITIALIZATION_H_ /** * Reads the parameters for the lid driven cavity scenario from a configuration file */ void ReadParameters(int *xlength, float *tau, float *velocity_wall, int *timesteps, int *timesteps_per_plotting, int argc, char *argv[], int *gpu_enabled); /** * Initializes the particle distribution functions and the flag field */ void InitialiseFields(float *collide_field, float *stream_field,int *flag_field, int xlength, int gpu_enabled); #endif
initialization.c
#include <unistd.h> #include <string.h> #include <stdio.h> #include <stdlib.h> #include "initialization.h" #include "lbm_model.h" #include "utils.h" /** * Prints help message that includes program usage instructions and control flags. */ void PrintHelpMessage(){ printf("List of control flags:\n"); printf("\t -gpu all computations are to be performed on gpu\n"); printf("\t -cpu all computations are to be performed on cpu\n"); printf("\t -help prints this help message\n"); printf("NOTE: Control flags are mutually exclusive and only one flag at a time is allowed\n"); printf("Example program usage:\n"); printf("\t./lbm-sim ./data/lbm.dat -gpu\n"); exit(1); } void ReadParameters(int *xlength, float *tau, float *velocity_wall, int *timesteps, int *timesteps_per_plotting, int argc, char *argv[], int *gpu_enabled){ float *velocity_wall_1, *velocity_wall_2, *velocity_wall_3; /* printing out help message */ if(argc<3) PrintHelpMessage(); if(!strcmp(argv[1], "-help") || !strcmp(argv[2], "-help")) PrintHelpMessage(); /* checking parameters */ if(access(argv[1], R_OK) != 0) ERROR("Provided configuration file path either doesn't exist or can not be read."); if(!strcmp(argv[2], "-gpu")) *gpu_enabled=1; else *gpu_enabled=0; /* reading parameters */ READ_FLOAT(argv[1], *tau); velocity_wall_1=&velocity_wall[0]; velocity_wall_2=&velocity_wall[1]; velocity_wall_3=&velocity_wall[2]; READ_FLOAT(argv[1], *velocity_wall_1); READ_FLOAT(argv[1], *velocity_wall_2); READ_FLOAT(argv[1], *velocity_wall_3); READ_INT(argv[1], *xlength); READ_INT(argv[1], *timesteps); READ_INT(argv[1], *timesteps_per_plotting); } void InitialiseFields(float *collide_field, float *stream_field, int *flag_field, int xlength, int gpu_enabled){ int x,y,z,i,step=xlength+2; /* NOTE: We use z=xlength+1 as the moving wall */ for(x=0;x<step;x++){ for(y=0;y<step;y++){ for(z=0;z<step;z++){ /* Initializing flags */ if(gpu_enabled) { if(x==0 && y==0 && z==0) flag_field[x+y*step+z*step*step] = LEFT_BOTTOM_BACK_CORNER; else if(x==0 && y==0 && z==step-1) flag_field[x+y*step+z*step*step] = LEFT_BOTTOM_FRONT_CORNER; else if(x==0 && y==step-1 && z==0) flag_field[x+y*step+z*step*step] = LEFT_UPPER_BACK_CORNER; else if(x==0 && y==step-1 && z==step-1) flag_field[x+y*step+z*step*step] = LEFT_UPPER_FRONT_CORNER; else if(x==step-1 && y==0 && z==0) flag_field[x+y*step+z*step*step] = RIGHT_BOTTOM_BACK_CORNER; else if(x==step-1 && y==0 && z==step-1) flag_field[x+y*step+z*step*step] = RIGHT_BOTTOM_FRONT_CORNER; else if(x==step-1 && y==step-1 && z==0) flag_field[x+y*step+z*step*step] = RIGHT_UPPER_BACK_CORNER; else if(x==step-1 && y==step-1 && z==step-1) flag_field[x+y*step+z*step*step] = RIGHT_UPPER_FRONT_CORNER; else if(x==0 && y==0) flag_field[x+y*step+z*step*step]=LEFT_BOTTOM_EDGE; else if(x==xlength+1 && y==0) flag_field[x+y*step+z*step*step]=RIGHT_BOTTOM_EDGE; else if(y==0 && z==0) flag_field[x+y*step+z*step*step]=BACK_BOTTOM_EDGE; else if(y==0 && z==xlength+1) flag_field[x+y*step+z*step*step]=FRONT_BOTTOM_EDGE; else if(x==0 && z==0) flag_field[x+y*step+z*step*step]=LEFT_BACK_EDGE; else if(x==0 && z==xlength+1) flag_field[x+y*step+z*step*step]=LEFT_FRONT_EDGE; else if(x==xlength+1 && z==0) flag_field[x+y*step+z*step*step]=RIGHT_BACK_EDGE; else if(x==xlength+1 && z==xlength+1) flag_field[x+y*step+z*step*step]=RIGHT_FRONT_EDGE; else if(x==0 && y==xlength+1) flag_field[x+y*step+z*step*step]=LEFT_UPPER_EDGE; else if(x==xlength+1 && y==xlength+1) flag_field[x+y*step+z*step*step]=RIGHT_UPPER_EDGE; else if(y==xlength+1 && z==0) flag_field[x+y*step+z*step*step]=BACK_UPPER_EDGE; else if(y==xlength+1 && z==xlength+1) flag_field[x+y*step+z*step*step]=FRONT_UPPER_EDGE; else if(y==step-1 && x!=0 && x!=step-1 && z!=0 && z!=step-1) flag_field[x+y*step+z*step*step] = TOP_BOUNDARY; else if(y==0 && x!=0 && x!=step-1 && z!=0 && z!=step-1) flag_field[x+y*step+z*step*step] = BOTTOM_BOUNDARY; else if (x==0 && y!=0 && y!=step-1 && z!=0 && z!=step-1) flag_field[x+y*step+z*step*step] = LEFT_BOUNDARY; else if (x==step-1 && y!=0 && y!=step-1 && z!=0 && z!=step-1) flag_field[x+y*step+z*step*step] = RIGHT_BOUNDARY; else if (z==0 && y!=0 && y!=step-1 && x!=0 && x!=step-1) flag_field[x+y*step+z*step*step] = BACK_BOUNDARY; else if (z==step-1 && y!=0 && y!=step-1 && x!=0 && x!=step-1) flag_field[x+y*step+z*step*step] = FRONT_BOUNDARY; else flag_field[x+y*step+z*step*step]=FLUID; } else { if(y == xlength+1) flag_field[x+y*step+z*step*step]=MOVING_WALL; else if(x == 0 || x == xlength+1 || y == 0 || z == xlength+1 || z == 0) flag_field[x+y*step+z*step*step]=NO_SLIP; else flag_field[x+y*step+z*step*step]=FLUID; } /* Initializing distributions for stream and collide fields */ for(i=0;i<Q_LBM;i++){ /* * NOTE:Stream field is initilized to 0s because that helps to * track down mistakes and has no impact whatsoever to on the * computation further on. */ stream_field[Q_LBM*(x+y*step+z*step*step)+i]=0; collide_field[Q_LBM*(x+y*step+z*step*step)+i]=LATTICE_WEIGHTS[i]; } } } } }
streaming.h
#ifndef _STREAMING_H_ #define _STREAMING_H_ /** * Carries out the streaming step and writes the respective distribution functions from * collideField to streamField. */ void DoStreaming(float *collide_field, float *stream_field, int *flag_field, int xlength); #endif
streaming.c
#include "streaming.h" #include "lbm_model.h" void DoStreaming(float *collide_field, float *stream_field, int *flag_field, int xlength){ int x,nx,y,ny,z,nz,i,step=xlength+2; for(x=0;x<step;x++){ for(y=0;y<step;y++){ for(z=0;z<step;z++){ if(flag_field[x+y*step+z*step*step]==FLUID){ for(i=0;i<Q_LBM;i++){ nx=x-LATTICE_VELOCITIES[i][0]; ny=y-LATTICE_VELOCITIES[i][1]; nz=z-LATTICE_VELOCITIES[i][2]; stream_field[Q_LBM*(x+y*step+z*step*step)+i]= collide_field[Q_LBM*(nx+ny*step+nz*step*step)+i]; } } } } } }
collision.h
#ifndef _COLLISION_H_ #define _COLLISION_H_ /** * Carries out the whole local collision process. Computes density and velocity and equilibrium * distributions. Carries out BGK update. */ void DoCollision(float *collide_field, int *flag_field, float tau, int xlength); #endif
collision.c
#include "collision.h" #include "cell_computation.h" #include "lbm_model.h" #include "utils.h" /** * Computes the post-collision distribution functions according to the BGK update rule and * stores the results again at the same position. */ void ComputePostCollisionDistributions(float *current_cell, float tau, const float *const feq){ int i; for(i=0;i<Q_LBM;i++){ current_cell[i]=current_cell[i]-(current_cell[i]-feq[i])/tau; /* Probability distribution function can not be less than 0 */ if (current_cell[i] < 0) ERROR("Probability distribution function can not be negative."); } } void DoCollision(float *collide_field, int *flag_field, float tau, int xlength){ float density, velocity[3], feq[Q_LBM], *currentCell; int x,y,z,step=xlength+2; for(x=1;x<step-1;x++){ for(y=1;y<step-1;y++){ for(z=1;z<step-1;z++){ currentCell=&collide_field[Q_LBM*(x+y*step+z*step*step)]; ComputeDensity(currentCell,&density); ComputeVelocity(currentCell,&density,velocity); ComputeFeq(&density,velocity,feq); ComputePostCollisionDistributions(currentCell,tau,feq); } } } }
boundary.h
#ifndef _BOUNDARY_H_ #define _BOUNDARY_H_ /** * Handles the boundaries in our simulation setup */ void TreatBoundary(float *collide_field, int* flag_field, float *wall_velocity, int xlength); #endif
boundary.c
#include "boundary.h" #include "lbm_model.h" #include "cell_computation.h" /** * Finds an inverse probability distribution for the provided lattice index. */ int inv(int i){ return (Q_LBM-1)-i; } void TreatBoundary(float *collide_field, int* flag_field, float *wall_velocity, int xlength){ int x,nx,y,ny,z,nz,i,step=xlength+2; float density,dot_prod; for(x=0;x<step;x++){ for(y=0;y<step;y++){ for(z=0;z<step;z++){ if(flag_field[x+y*step+z*step*step]!=FLUID){ for(i=0;i<Q_LBM;i++){ nx=x+LATTICE_VELOCITIES[i][0]; ny=y+LATTICE_VELOCITIES[i][1]; nz=z+LATTICE_VELOCITIES[i][2]; /* We don't need the values outside of our extended domain */ if(0<nx && nx<step-1 && 0<ny && ny<step-1 && 0<nz && nz<step-1){ if (flag_field[x+y*step+z*step*step]==MOVING_WALL){ /* Compute density in the neighbour cell */ ComputeDensity(&collide_field[Q_LBM*(nx+ny*step+nz*step*step)],&density); /* Compute dot product */ dot_prod=LATTICE_VELOCITIES[i][0]*wall_velocity[0]+ LATTICE_VELOCITIES[i][1]*wall_velocity[1]+ LATTICE_VELOCITIES[i][2]*wall_velocity[2]; /* Assign the boudary cell value */ collide_field[Q_LBM*(x+y*step+z*step*step)+i]= collide_field[Q_LBM*(nx+ny*step+nz*step*step)+inv(i)]+ 2*LATTICE_WEIGHTS[i]*density*C_S_POW2_INV*dot_prod; }else if(flag_field[x+y*step+z*step*step]==NO_SLIP){ collide_field[Q_LBM*(x+y*step+z*step*step)+i]= collide_field[Q_LBM*(nx+ny*step+nz*step*step)+inv(i)]; } } } } } } } }
cell_computation.h
#ifndef _CELL_COMPUTATIONS_H_ #define _CELL_COMPUTATIONS_H_ /** * Computes the density from the particle distribution functions stored at currentCell. * currentCell thus denotes the address of the first particle distribution function of the * respective cell. The result is stored in density. */ void ComputeDensity(const float *const current_cell, float *density); /** * Computes the velocity within currentCell and stores the result in velocity */ void ComputeVelocity(const float * const current_cell, const float * const density, float *velocity); /** * Computes the equilibrium distributions for all particle distribution functions of one * cell from density and velocity and stores the results in feq. */ void ComputeFeq(const float * const density, const float * const velocity, float *feq); #endif
cell_computation.c
#include "cell_computation.h" #include "lbm_model.h" #include "utils.h" void ComputeDensity(const float *const current_cell, float *density){ int i; *density=0; for(i=0;i<Q_LBM;i++) *density+=current_cell[i]; /* Density should be close to a unit (ρ~1) */ if((*density-1.0)>EPS) ERROR("Density dropped below error tolerance."); } void ComputeVelocity(const float * const current_cell, const float * const density, float *velocity){ int i; /* NOTE:Indexes are hard coded to improve program performance */ velocity[0]=0; velocity[1]=0; velocity[2]=0; for(i=0;i<Q_LBM;i++){ velocity[0]+=current_cell[i]*LATTICE_VELOCITIES[i][0]; velocity[1]+=current_cell[i]*LATTICE_VELOCITIES[i][1]; velocity[2]+=current_cell[i]*LATTICE_VELOCITIES[i][2]; } velocity[0]/=*density; velocity[1]/=*density; velocity[2]/=*density; } void ComputeFeq(const float * const density, const float * const velocity, float *feq){ int i; float s1, s2, s3; /* summands */ /* NOTE:Indexes are hard coded to improve program performance */ for(i=0;i<Q_LBM;i++){ s1 = LATTICE_VELOCITIES[i][0]*velocity[0]+LATTICE_VELOCITIES[i][1]*velocity[1]+ LATTICE_VELOCITIES[i][2]*velocity[2]; s2 = s1*s1; s3 = velocity[0]*velocity[0]+velocity[1]*velocity[1]+velocity[2]*velocity[2]; feq[i]=LATTICE_WEIGHTS[i]*(*density)*(1+s1*C_S_POW2_INV+s2*C_S_POW4_INV/2.0-s3*C_S_POW2_INV/2.0); /* Probability distribution function can not be less than 0 */ if (feq[i] < 0) ERROR("Probability distribution function can not be negative."); } }
visualization.h
#ifndef _VISUALIZATION_H_ #define _VISUALIZATION_H_ /** * Writes the density and velocity field (derived from the distributions in collideField) * to a file determined by 'filename' and timestep 't'. You can re-use parts of the code * from visual.c (VTK output for Navier-Stokes solver) and modify it for 3D datasets. */ void WriteVtkOutput(const float * const collide_field, const int * const flag_field, const char * filename, unsigned int t, int xlength); /** * Writes the provided stream or collision field to a file with specified identifiers. */ void WriteField(const float * const field, const char * filename, unsigned int t, const int xlength, const int rank); /** * Writes the provided flag field to a file with specified identifiers. */ void writeFlagField(const int * const flagField, const char * filename, const int xlength, const int rank); /** * Prints out the point by point values of the provided field (4D) to the stdout. * @param field * linerized 4D array, with (x,y,z,i)=Q*(x+y*(ncell+2)+z*(ncell+2)*(ncell+2))+i * @param ncell * number of inner cells, the ones which are there before adding a boundary layer */ void PrintField(float *field, int ncell); #endif
visualization.c
#include <cstdio> #include "lbm_model.h" #include "visualization.h" #include "cell_computation.h" #include "utils.h" void write_vtkHeader( FILE *fp, int xlength) { if( fp == NULL ){ char szBuff[80]; sprintf( szBuff, "Null pointer in write_vtkHeader" ); ERROR( szBuff ); return; } fprintf(fp,"# vtk DataFile Version 2.0\n"); fprintf(fp,"generated by CFD-lab course output (written by Denys Korzh, Denys Sobchyshak, Ozel Christo Annamanthadoo \n"); fprintf(fp,"ASCII\n"); fprintf(fp,"\n"); fprintf(fp,"DATASET STRUCTURED_GRID\n"); fprintf(fp,"DIMENSIONS %i %i %i \n", xlength, xlength, xlength); fprintf(fp,"POINTS %i float\n", xlength*xlength*xlength ); fprintf(fp,"\n"); } void write_vtkPointCoordinates( FILE *fp, int xlength) { float originX = 0.0, originY = 0.0; int i = 0, j = 0, k = 0; for(k = 0; k < xlength; k++) for(j = 0; j < xlength; j++) for(i = 0; i < xlength; i++) fprintf(fp, "%f %f %f\n", originX+(i*1.0/xlength), originY+(j*1.0/xlength), originY+(k*1.0/xlength) ); } void WriteVtkOutput(const float * const collideField, const int * const flagField, const char * filename, unsigned int t, int xlength) { int i, j, k, len = xlength+2; /* lexicographic order "[ Q * ( z*len*len + y*len + x) + i ]" */ float velocity[3], density; char szFileName[80]; FILE *fp=NULL; sprintf( szFileName, "%s.%i.vtk", filename, t ); fp = fopen( szFileName, "w"); if( fp == NULL ){ char szBuff[80]; sprintf( szBuff, "Failed to open %s", szFileName ); ERROR( szBuff ); return; } write_vtkHeader( fp, xlength); write_vtkPointCoordinates( fp, xlength); fprintf(fp,"POINT_DATA %i \n", xlength*xlength*xlength ); fprintf(fp,"\n"); fprintf(fp, "VECTORS velocity float\n"); for(k = 1; k < xlength+1; k++) { for(j = 1; j < xlength+1; j++) { for(i = 1; i < xlength+1; i++) { ComputeDensity (&collideField[ 19 * ( k*len*len + j*len + i) ], &density); ComputeVelocity(&collideField[ 19 * ( k*len*len + j*len + i) ], &density, velocity); fprintf(fp, "%f %f %f\n", velocity[0], velocity[1], velocity[2]); } } } fprintf(fp,"\n"); fprintf(fp, "SCALARS density float 1 \n"); fprintf(fp, "LOOKUP_TABLE default \n"); for(k = 1; k < xlength+1; k++) { for(j = 1; j < xlength+1; j++) { for(i = 1; i < xlength+1; i++) { ComputeDensity (&collideField[ 19 * ( k*len*len + j*len + i) ], &density); fprintf(fp, "%f\n", density); } } } if(fclose(fp)){ char szBuff[80]; sprintf( szBuff, "Failed to close %s", szFileName ); ERROR( szBuff ); } } void PrintField(float *field, int ncell){ int x,y,z,i,step=ncell+2; for(x=0;x<step;x++){ for(y=0;y<step;y++){ for(z=0;z<step;z++){ printf("(%d,%d,%d): ",x,y,z); for(i=0;i<Q_LBM;i++) printf("%f ",field[Q_LBM*(x+y*step+z*step*step)+i]); printf("\n"); } } } } void WriteField(const float * const field, const char * filename, unsigned int t, const int xlength, const int rank) { int x,y,z,i, stepX=xlength+2,stepY=xlength+2,stepZ=xlength+2; char szFileName[80]; FILE *fp=NULL; sprintf( szFileName, "%s-rank%i.%i.out", filename, rank, t ); fp = fopen( szFileName, "w"); if( fp == NULL ){ char szBuff[80]; sprintf( szBuff, "Failed to open %s", szFileName ); ERROR( szBuff ); return; } for(z=0;z<stepZ;z++){ for(x=0;x<stepX;x++){ for(y=0;y<stepY;y++){ fprintf(fp, "(%d,%d,%d): ",x,y,z); for(i=0;i<Q_LBM;i++) fprintf(fp, "%f ",field[Q_LBM*(x+y*stepX+z*stepX*stepY)+i]); fprintf(fp, "\n"); } } } if(fclose(fp)){ char szBuff[80]; sprintf( szBuff, "Failed to close %s", szFileName ); ERROR( szBuff ); } } void writeFlagField(const int * const flagField, const char * filename, const int xlength, const int rank) { int x,y,z, stepX=xlength+2,stepY=xlength+2,stepZ=xlength+2; char szFileName[80]; FILE *fp=NULL; sprintf( szFileName, "%s-rank%i.out", filename, rank ); fp = fopen( szFileName, "w"); if( fp == NULL ){ char szBuff[80]; sprintf( szBuff, "Failed to open %s", szFileName ); ERROR( szBuff ); return; } fprintf(fp, "(y,z)\\x "); for(x=0; x<stepX; x++) { fprintf(fp, "%2i ",x); } fprintf(fp, "\n-------"); for(x=0; x<stepX; x++) { fprintf(fp, "---"); } fprintf(fp, "\n"); for(z=0;z<stepZ;z++){ for(y=stepY-1;y>=0;y--){ fprintf(fp, "(%2d,%2d): ",y,z); for(x=0;x<stepX;x++){ fprintf(fp, "%2i ",flagField[x+y*stepX+z*stepX*stepY]); } fprintf(fp, "\n"); } fprintf(fp, "\n"); } if(fclose(fp)){ char szBuff[80]; sprintf( szBuff, "Failed to close %s", szFileName ); ERROR( szBuff ); } }