0%

在模拟由N个对象组成的引力交互系统中,每个对象的位置与速度随时间演进而持续变化,这一挑战即为N体问题,目标在于精确预测任一时间点上每个对象的确切位置。
在三维空间背景下,每个对象通过坐标(x, y, z)及其沿x, y, z轴的速度分量(vx, vy, vz)来描述,且假设所有对象质量一致,简化为单位质量m=1。
引力作用遵循如下法则计算两物体间的相互吸引:
nbody
这里,Fx,Fy,Fz代表引力在三维坐标轴上的分量,dx,dy,dz为两物体在相应坐标轴上的距离差,d是两物体间总距离的欧氏距离,并加入了一个小的软化因子以避免除零异常。
速度更新依据总引力的积分:
nbody
其中,△vx,△vy,△vz为速度增量,dt代表极小的时间间隔。
随后,基于新的速度更新位置:
nbody
在传统的串行处理中,通过反复执行“累加引力→速度更新→位置更新”的流程来逐步推进模拟。
转至CUDA C并行环境,每个CUDA线程分别处理一个天体,首先通过cudaMemcpy将主存数据传输到GPU。各线程独立执行上述“引力累加、速度与位置更新”的循环,充分利用GPU的并行处理能力。计算完毕后,再次运用cudaMemcpy将结果数据从GPU回传至主存,完成并行模拟过程。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#include "check.h"
#include <cuda_runtime.h>

#define SOFTENING 1e-9f
#define BLOCK_SIZE 128
#define BLOCK_STEP 32

/*
* Each body contains x, y, and z coordinate positions,
* as well as velocities in the x, y, and z directions.
*/

typedef struct
{
float x, y, z, vx, vy, vz;
} Body;

/*
* Do not modify this function. A constraint of this exercise is
* that it remain a host function.
*/

void randomizeBodies(float *data, int n)
{
for (int i = 0; i < n; i++)
{
data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
}
}

/*
* This function calculates the gravitational impact of all bodies in the system
* on all others, but does not update their positions.
*/

__global__ void bodyForce(Body *p, float dt, int n)
{
// 从全局内存获取本线程负责的物体。
int i = (threadIdx.x + blockIdx.x * blockDim.x) % n;
Body body = p[i];

// 块级共享内存,用于缓存一个批次的施力物体。
__shared__ float3 tile[BLOCK_SIZE];

float Fx = 0.0f;
float Fy = 0.0f;
float Fz = 0.0f;

int nBlocks = n / BLOCK_SIZE;
int k = blockIdx.x + blockIdx.x / nBlocks;

#pragma unroll 32
for (int swap = 0; swap < n / (BLOCK_STEP * BLOCK_SIZE); swap++)
{
k %= nBlocks;

// 从全局内存获取新一批物体,装入共享内存。
Body temp = p[k * BLOCK_SIZE + threadIdx.x];
tile[threadIdx.x] = make_float3(temp.x, temp.y, temp.z);

// 确保新一批物体已经全部装入。
__syncthreads();

#pragma unroll 32
// 叠加新一批物体施加在负责物体上的引力。
for (int j = 0; j < BLOCK_SIZE; j++)
{
float dx = tile[j].x - body.x;
float dy = tile[j].y - body.y;
float dz = tile[j].z - body.z;
float distSqr = dx * dx + dy * dy + dz * dz + SOFTENING;
float invDist = rsqrtf(distSqr);
float invDist3 = invDist * invDist * invDist;

Fx += dx * invDist3;
Fy += dy * invDist3;
Fz += dz * invDist3;
}

// 确保新一批物体已经全部消耗。
__syncthreads();

k += BLOCK_STEP;
}

// 使用原子加法更新速度,避免竞态问题。
atomicAdd(&p[i].vx, dt * Fx);
atomicAdd(&p[i].vy, dt * Fy);
atomicAdd(&p[i].vz, dt * Fz);
}

__global__ void integratePosition(Body *p, float dt, int n)
{
// 计算本线程负责的物体的下标。
int i = threadIdx.x + blockIdx.x * blockDim.x;

if (i >= n)
{
return;
}

// 更新坐标。
p[i].x += p[i].vx * dt;
p[i].y += p[i].vy * dt;
p[i].z += p[i].vz * dt;
}

int main(const int argc, const char **argv)
{

/*
* Do not change the value for `nBodies` here. If you would like to modify it,
* pass values into the command line.
*/

int nBodies = 2 << 11;
int salt = 0;
if (argc > 1)
nBodies = 2 << atoi(argv[1]);

/*
* This salt is for assessment reasons. Tampering with it will result in automatic failure.
*/

if (argc > 2)
salt = atoi(argv[2]);

const float dt = 0.01f; // time step
const int nIters = 10; // simulation iterations

int bytes = nBodies * sizeof(Body);
float *buf;
cudaMallocHost(&buf, bytes);

/*
* As a constraint of this exercise, `randomizeBodies` must remain a host function.
*/

randomizeBodies(buf, 6 * nBodies); // Init pos / vel data

float *d_buf;
cudaMalloc(&d_buf, bytes);
Body *d_p = (Body *)d_buf;
cudaMemcpy(d_buf, buf, bytes, cudaMemcpyHostToDevice);

int nBlocks = nBodies / BLOCK_SIZE;

double totalTime = 0.0;

/*
* This simulation will run for 10 cycles of time, calculating gravitational
* interaction amongst bodies, and adjusting their positions to reflect.
*/

/*******************************************************************/
// Do not modify these 2 lines of code.
for (int iter = 0; iter < nIters; iter++)
{
StartTimer();
/*******************************************************************/

/*
* You will likely wish to refactor the work being done in `bodyForce`,
* as well as the work to integrate the positions.
*/

bodyForce<<<nBlocks * BLOCK_STEP, BLOCK_SIZE>>>(d_p, dt, nBodies); // compute interbody forces

/*
* This position integration cannot occur until this round of `bodyForce` has completed.
* Also, the next round of `bodyForce` cannot begin until the integration is complete.
*/

integratePosition<<<nBlocks, BLOCK_SIZE>>>(d_p, dt, nBodies);

if (iter == nIters - 1)
{
cudaMemcpy(buf, d_buf, bytes, cudaMemcpyDeviceToHost);
}

/*******************************************************************/
// Do not modify the code in this section.
const double tElapsed = GetTimer() / 1000.0;
totalTime += tElapsed;
}

double avgTime = totalTime / (double)(nIters);
float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;

#ifdef ASSESS
checkPerformance(buf, billionsOfOpsPerSecond, salt);
#else
checkAccuracy(buf, nBodies);
printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, billionsOfOpsPerSecond);
salt += 1;
#endif
/*******************************************************************/

/*
* Feel free to modify code below.
*/

cudaFree(d_buf);
cudaFreeHost(buf);
}

“检测到 #include 错误。请更新 includePath” 解决办法

用vscode时遇到了这样的报错
bug

通过修改编译器路径等方法都没用

win+R打开终端
输入

1
g++ -v -E -x c++ -

复制如下内容

bug

在vscode中摁ctrl+shift+p搜索json文件

bug

将刚才复制内容粘贴至includePath中,注意格式

bug

完成

bug

测试

printf("hello world");

蓝色
红色
绿色

猫是液体