The tests were run with the 2000 particles that chili has for default in his original version
NBody without SSE (chili's code)
Average is about 30 ms per frame
NBody with SSE SoA (chili's code)
Average is about 3 ms per frame
NBody with SSE AoS (my code)
Average is about 8 ms per frame
NBody with SSE SoAoS (my code)
Average is about 4 ms per frame
*SoAoS, a structure of Vec2 arrays, which are structures themselves. The difference here is each element such as position, velocity and acceleration vectors were arrays themselves and therefor could be loaded into the registers two at a time. The AoS version the elements of the register had to be set, and I was able to still load them two at a time, since it's a vec2 and two fit in a register.
I'm just going to copy paste the functions instead of uploading the project just to show what I did. You can download chili's nBody here.
NBodyAoS
Code: Select all
void Step()
{
__m128 grav = _mm_set1_ps(constGrav * mass);
__m128 mAccel = _mm_set1_ps(maxAccel);
__m128 temp0;
Vec2 temp[2];
for (int i = 0; i < nBodies; i += 2)
{
UINT ik = i + 1;
// Load position vectors for outer loop
__m128 iPos = _mm_setr_ps(body[i].pos.x, body[i].pos.y, body[ik].pos.x, body[ik].pos.y);
// Load velocity vectors for outer loop
__m128 iVel = _mm_setr_ps(body[i].vel.x, body[i].vel.y, body[ik].vel.x, body[ik].vel.y);
// Load acceleration vectors for the outer loop
__m128 iAccel = _mm_setr_ps(body[i].accel.x, body[i].accel.y, body[ik].accel.x, body[ik].accel.y);
for (UINT j = 1; j < nBodies; j += 2)
{
UINT jk = j + 1;
// Load position vectors for inner loop
__m128 jPos = _mm_setr_ps(body[j].pos.x, body[j].pos.y, body[jk].pos.x, body[jk].pos.y);
// Load acceleration vectors for the inner loop
__m128 jAccel = _mm_setr_ps(body[j].accel.x, body[j].accel.y, body[jk].accel.x, body[jk].accel.y);
// Calculate the mahattan distance between the two points
__m128 delta = _mm_sub_ps(jPos, iPos);
// Calculate the actual square distance using dot product
temp0 = _mm_mul_ps(delta, delta);
// X0, Y0, X1, Y1 -> Y0, X0, Y1, X1
__m128 temp1 = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(2, 3, 0, 1));
__m128 distSqr = _mm_add_ps(temp0, temp1);
// Calculate normal by multiplying delat by the recripacol square root
// of the actual square distance
__m128 normal = _mm_mul_ps(delta, _mm_rsqrt_ps(distSqr));
// Limit the force to maxAccel
__m128 force = _mm_min_ps(_mm_mul_ps(grav, _mm_rcp_ps(distSqr)), mAccel);
// Calculate the new acceleration vector
temp0 = _mm_mul_ps(normal, force);
// Subtract new acceleration to inner loop acceleration
jAccel = _mm_sub_ps(jAccel, temp0);
// Store updated acceleration
_mm_storeu_ps((float*)&temp, jAccel);
body[j].accel = temp[0];
body[jk].accel = temp[1];
}
// Add new acceleration to outer loop
iAccel = _mm_add_ps(iAccel, temp0);
// Add acceleration to velocity
iVel = _mm_add_ps(iVel, iAccel);
// Add velocity to position
iPos = _mm_add_ps(iPos, iVel);
// Store updated velocity and position and zero acceleration
_mm_store_ps((float*)&temp, iVel);
body[i].vel = temp[0];
body[ik].vel = temp[1];
_mm_store_ps((float*)&temp, iPos);
body[i].pos = temp[0];
body[ik].pos = temp[1];
_mm_store_ps((float*)&temp, _mm_setzero_ps());
body[i].accel = temp[0];
body[ik].accel = temp[1];
}
}
Code: Select all
void Step()
{
__m128 grav = _mm_set1_ps(constGrav * mass);
__m128 mAccel = _mm_set1_ps(maxAccel);
__m128 temp0;
for (int i = 0; i < nBodies; i += 2)
{
__m128 iPos = _mm_load_ps((float*)&body.pos[i]);
for (int j = 1; j < nBodies; j += 2)
{
// Load position vectors for inner loop
__m128 jPos = _mm_loadu_ps((float*)&body.pos[j]);
// Load acceleration vectors for the inner loop
__m128 jAccel = _mm_loadu_ps((float*)&body.accel[j]);
// Calculate the mahattan distance between the two points
__m128 delta = _mm_sub_ps(jPos, iPos);
// Calculate the actual square distance using dot product
temp0 = _mm_mul_ps(delta, delta);
// X0, Y0, X1, Y1 -> Y0, X0, Y1, X1
__m128 temp1 = _mm_shuffle_ps(temp0, temp0, _MM_SHUFFLE(2, 3, 0, 1));
__m128 distSqr = _mm_add_ps(temp0, temp1);
// Calculate normal by multiplying delat by the recripacol square root
// of the actual square distance
__m128 normal = _mm_mul_ps(delta, _mm_rsqrt_ps(distSqr));
// Limit the force to maxAccel
__m128 force = _mm_min_ps(_mm_mul_ps(grav, _mm_rcp_ps(distSqr)), mAccel);
// Calculate the new acceleration vector
temp0 = _mm_mul_ps(normal, force);
// Subtract new acceleration to inner loop acceleration
jAccel = _mm_sub_ps(jAccel, temp0);
// Store updated acceleration
_mm_storeu_ps((float*)&body.accel[j], jAccel);
}
// Load acceleration vectors for the outer loop
__m128 iAccel = _mm_load_ps((float*)&body.accel[i]);
// Add new acceleration to outer loop
iAccel = _mm_add_ps(iAccel, temp0);
// Load velocity vectors for outer loop
__m128 iVel = _mm_load_ps((float*)&body.vel[i]);
// Add acceleration to velocity
iVel = _mm_add_ps(iVel, iAccel);
// Add velocity to position
iPos = _mm_add_ps(iPos, iVel);
// Store updated velocity and position and zero acceleration
_mm_store_ps((float*)&body.vel[i], iVel);
_mm_store_ps((float*)&body.pos[i], iPos);
_mm_store_ps((float*)&body.accel[i], _mm_setzero_ps());
}
}