1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210
| #include <math.h> #include <stdio.h> #include <stdlib.h> #include "timer.h" #include "check.h" #include <cuda_runtime.h>
#define SOFTENING 1e-9f #define BLOCK_SIZE 128 #define BLOCK_STEP 32
/* * Each body contains x, y, and z coordinate positions, * as well as velocities in the x, y, and z directions. */
typedef struct { float x, y, z, vx, vy, vz; } Body;
/* * Do not modify this function. A constraint of this exercise is * that it remain a host function. */
void randomizeBodies(float *data, int n) { for (int i = 0; i < n; i++) { data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f; } }
/* * This function calculates the gravitational impact of all bodies in the system * on all others, but does not update their positions. */
__global__ void bodyForce(Body *p, float dt, int n) { // 从全局内存获取本线程负责的物体。 int i = (threadIdx.x + blockIdx.x * blockDim.x) % n; Body body = p[i];
// 块级共享内存,用于缓存一个批次的施力物体。 __shared__ float3 tile[BLOCK_SIZE];
float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;
int nBlocks = n / BLOCK_SIZE; int k = blockIdx.x + blockIdx.x / nBlocks;
#pragma unroll 32 for (int swap = 0; swap < n / (BLOCK_STEP * BLOCK_SIZE); swap++) { k %= nBlocks;
// 从全局内存获取新一批物体,装入共享内存。 Body temp = p[k * BLOCK_SIZE + threadIdx.x]; tile[threadIdx.x] = make_float3(temp.x, temp.y, temp.z);
// 确保新一批物体已经全部装入。 __syncthreads();
#pragma unroll 32 // 叠加新一批物体施加在负责物体上的引力。 for (int j = 0; j < BLOCK_SIZE; j++) { float dx = tile[j].x - body.x; float dy = tile[j].y - body.y; float dz = tile[j].z - body.z; float distSqr = dx * dx + dy * dy + dz * dz + SOFTENING; float invDist = rsqrtf(distSqr); float invDist3 = invDist * invDist * invDist;
Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3; }
// 确保新一批物体已经全部消耗。 __syncthreads();
k += BLOCK_STEP; }
// 使用原子加法更新速度,避免竞态问题。 atomicAdd(&p[i].vx, dt * Fx); atomicAdd(&p[i].vy, dt * Fy); atomicAdd(&p[i].vz, dt * Fz); }
__global__ void integratePosition(Body *p, float dt, int n) { // 计算本线程负责的物体的下标。 int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= n) { return; }
// 更新坐标。 p[i].x += p[i].vx * dt; p[i].y += p[i].vy * dt; p[i].z += p[i].vz * dt; }
int main(const int argc, const char **argv) {
/* * Do not change the value for `nBodies` here. If you would like to modify it, * pass values into the command line. */
int nBodies = 2 << 11; int salt = 0; if (argc > 1) nBodies = 2 << atoi(argv[1]);
/* * This salt is for assessment reasons. Tampering with it will result in automatic failure. */
if (argc > 2) salt = atoi(argv[2]);
const float dt = 0.01f; // time step const int nIters = 10; // simulation iterations
int bytes = nBodies * sizeof(Body); float *buf; cudaMallocHost(&buf, bytes);
/* * As a constraint of this exercise, `randomizeBodies` must remain a host function. */
randomizeBodies(buf, 6 * nBodies); // Init pos / vel data
float *d_buf; cudaMalloc(&d_buf, bytes); Body *d_p = (Body *)d_buf; cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);
int nBlocks = nBodies / BLOCK_SIZE;
double totalTime = 0.0;
/* * This simulation will run for 10 cycles of time, calculating gravitational * interaction amongst bodies, and adjusting their positions to reflect. */
/*******************************************************************/ // Do not modify these 2 lines of code. for (int iter = 0; iter < nIters; iter++) { StartTimer(); /*******************************************************************/
/* * You will likely wish to refactor the work being done in `bodyForce`, * as well as the work to integrate the positions. */
bodyForce<<<nBlocks * BLOCK_STEP, BLOCK_SIZE>>>(d_p, dt, nBodies); // compute interbody forces
/* * This position integration cannot occur until this round of `bodyForce` has completed. * Also, the next round of `bodyForce` cannot begin until the integration is complete. */
integratePosition<<<nBlocks, BLOCK_SIZE>>>(d_p, dt, nBodies);
if (iter == nIters - 1) { cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost); }
/*******************************************************************/ // Do not modify the code in this section. const double tElapsed = GetTimer() / 1000.0; totalTime += tElapsed; }
double avgTime = totalTime / (double)(nIters); float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;
#ifdef ASSESS checkPerformance(buf, billionsOfOpsPerSecond, salt); #else checkAccuracy(buf, nBodies); printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, billionsOfOpsPerSecond); salt += 1; #endif /*******************************************************************/
/* * Feel free to modify code below. */
cudaFree(d_buf); cudaFreeHost(buf); }
|