For the past couple of weeks I decided to go on a small journey. While I’m familiar with assembly, I’m no expert. I wanted to have first hand exposure to the common statement that compilers “will generally output faster assembly than handwritten assembly”.
To sum it up: yes.
Let me quickly set the scene for when I started: I had zero exposure to x86_64 before this. I had very small exposure to x86. I had zero exposure to GNU assembler. I did not know much about ELFs and had to learn calling conventions. I had exposure to assemblers. I had a basic understanding that binaries have a code and data section. I had exposure to writing assembly on much weaker systems.
It was a fun two weeks of internalizing that information. As the days went by I felt useful patterns arising again and again, and I’d write macros to reduce written code, which reduced possibilities for error and eased typing. With this confidence growing, I thought I had a good chance to defeat gcc.
My primary goal though was not speed, but asimple mental model. I believe I didn’t achieve either in these experiments but I feel the simple mental model goal is not yet totally destroyed.
First I want to say: wow, for at least x86_64, assembly is much easier to write than for something like a 6502. The saying that “processors are optimizing for C” is totally correct. What blew my mind the most was learning that most processors are equipped with special string processing instructions. Yes, strings. That’s with SSE4.2, which most processors will have today.
The last thing I want to mention is my perspective on my original belief has changed. Ok so maybe using assembly with macros is pointless in a reality with C, but hear me out. In a realm where C is not available, assembly and macros would certainly be just fine. In fact I still believe that there is an assembler to be built, where assembly, macros, types and smart memory tracing could become dominant. I think there is some interesting research to be done. To those who have already and will peak down this corridor, yes, I know the research exists in many forms. What I want to see is a single paper focusing on these 4 aspects of an assembly language.
Ok, onto the numbers and code! It’s a tad long but I’ve done code first, numbers second, so feel free to skip ahead.
The gcc options for the C code used was -O3 -lm, because my processor doesn’t support AVX (for those unfamiliar, instructions that operate even faster on many floating point numbers).
The code I was up against
#include #include #include #define pi 3.141592653589793 #define solar_mass (4 * pi * pi) #define year 365.24 #define for_k for(k = 0; k < 3; k++) typedef struct planet { double x[3], v[3], mass; } planet; void advance(int nbodies, struct planet *bodies, double dt, int steps) { int i, j; register planet *a, *b; double d[3], d2, mag; while (steps--) { for (a = bodies, i = 0; i < nbodies; a++, i++) { for (b = a + 1, j = i + 1; j < nbodies; b++, j++) { d[0] = a->x[0] - b->x[0]; d[1] = a->x[1] - b->x[1]; d[2] = a->x[2] - b->x[2]; d2 = d[0] * d[0] + d[1] * d[1] + d[2] * d[2]; mag = dt / (d2 * sqrt(d2)); a->v[0] -= d[0] * b->mass * mag; a->v[1] -= d[1] * b->mass * mag; a->v[2] -= d[2] * b->mass * mag; b->v[0] += d[0] * a->mass * mag; b->v[1] += d[1] * a->mass * mag; b->v[2] += d[2] * a->mass * mag; } } for (a = bodies, i = 0; i < nbodies; i++, a++) { a->x[0] += dt * a->v[0]; a->x[1] += dt * a->v[1]; a->x[2] += dt * a->v[2]; } } } double energy(int nbodies, planet *bodies) { double e, d[3]; int i, j, k; planet *a, *b; e = 0.0; for (i = 0, a = bodies; i < nbodies; a++, i++) { for_k { e += a->mass * a->v[k] * a->v[k] / 2; } for (j = i + 1, b = a + 1; j < nbodies; b++, j++) { for_k { d[k] = a->x[k] - b->x[k]; } e -= (a->mass * b->mass) / sqrt(d[0] * d[0] + d[1] * d[1] + d[2] * d[2]); } } return e; } void offset_momentum(int nbodies, planet *bodies) { int i, k; register planet *a, *b; for (i = 0; i < nbodies; i++) { for_k { bodies[0].v[k] -= bodies[i].v[k] * bodies[i].mass / solar_mass; } } } struct planet bodies[] = { { /* sun */ {0, 0, 0}, {0, 0, 0}, solar_mass }, { /* jupiter */ { 4.84143144246472090e+00, -1.16032004402742839e+00, -1.03622044471123109e-01 }, { 1.66007664274403694e-03 * year, 7.69901118419740425e-03 * year, -6.90460016972063023e-05 * year }, 9.54791938424326609e-04 * solar_mass }, { /* saturn */ { 8.34336671824457987e+00, 4.12479856412430479e+00, -4.03523417114321381e-01 }, { -2.76742510726862411e-03 * year, 4.99852801234917238e-03 * year, 2.30417297573763929e-05 * year }, 2.85885980666130812e-04 * solar_mass }, { /* uranus */ { 1.28943695621391310e+01, -1.51111514016986312e+01, -2.23307578892655734e-01 }, { 2.96460137564761618e-03 * year, 2.37847173959480950e-03 * year, -2.96589568540237556e-05 * year }, 4.36624404335156298e-05 * solar_mass }, { /* neptune */ { 1.53796971148509165e+01, -2.59193146099879641e+01, 1.79258772950371181e-01 }, { 2.68067772490389322e-03 * year, 1.62824170038242295e-03 * year, -9.51592254519715870e-05 * year }, 5.15138902046611451e-05 * solar_mass } }; #define N sizeof(bodies)/sizeof(planet) int main(int argc, char **argv) { int n = atoi(argv[1]); offset_momentum(N, bodies); printf("%.9fn", energy(N, bodies)); advance(N, bodies, 0.01, n); printf("%.9fn", energy(N, bodies)); return 0; }
My handwritten amateur assembly
.intel_syntax noprefix .include "list.macro" .section .data format_energy: .asciz "%.9fn" format_3f: .asciz "%f %f %fn" .align 16 initial_advance_value: .double 0.01, 0.01 .align 16 one: .double 1.0, 1.0 two: .double 2.0, 2.0 body_size = (8 * 10) bodies_length = 5 .align 16 bodies_data: /* x y, z, padding, vx, vy, vz, padding, mass, mass2 */ sun: .double 2.0, 3.0, 4.0, 0.0 .double 2.0, 3.0, 4.0, 0.0 .double 39.478418, 39.478418 jupiter: .double 4.841431, -1.160320, -0.103622, 0.0 .double 0.606326, 2.811987, -0.025218, 0.0 .double 0.037694, 0.037694 saturn: .double 8.343367, 4.124799, -0.403523, 0.0 .double -1.010774, 1.825662, 0.008416, 0.0 .double 0.011286, 0.011286 uranus: .double 12.894370, -15.111151, -0.223308, 0.0 .double 1.082791, 0.868713, -0.010833, 0.0 .double 0.001724, 0.001724 neptune: .double 15.379697, -25.919315, 0.179259, 0.0 .double 0.979091, 0.594699, -0.034756, 0.0 .double 0.002034, 0.002034 pairs_data: permutations sun, jupiter, saturn, uranus, neptune .section .text .global main main: sub rsp, 8 call offset_momentum call report_energy movapd xmm5, [initial_advance_value] mov r8, 5000000 call advance call report_energy mov rdi, 0; call exit add rsp, 8; ret report_neptune: mov rdi, offset format_3f movlpd xmm0, [neptune] movlpd xmm1, [neptune + (8 * 1)] movlpd xmm2, [neptune + (8 * 2)] mov al, 3 sub rsp, 8 call printf add rsp, 8 ret offset_momentum: /* () -> () */ i = rcx bodies = rdi mov bodies, offset bodies_data mov i, bodies_length 1: movapd xmm8, [bodies + (8 * 4)] movapd xmm11, [bodies + (8 * 6)] movapd xmm9, [bodies + (8 * 8)] mulpd xmm8, xmm9 mulpd xmm11, xmm9 divpd xmm8, [sun + (8 * 8)] divpd xmm11, [sun + (8 * 8)] movapd xmm12, [sun + (8 * 4)] movapd xmm13, [sun + (8 * 6)] subpd xmm12, xmm8 subpd xmm13, xmm11 movapd [sun + (8 * 4)], xmm12 movapd [sun + (8 * 6)], xmm13 add bodies, body_size dec rcx jne 1b ret report_energy: /* () -> () */ pairs = rdi; pairs_index = rcx _1 = rsi; _2 = rdx x1y1 = xmm15; z101 = xmm14 x2y2 = xmm13; z202 = xmm12 m1m1 = xmm11; m2m2 = xmm10 z0ee = xmm9; mov pairs, offset pairs_data mov pairs_index, 10 1: mov _1, [pairs] mov _2, [pairs + (8 * 1)] movapd x1y1, [_1] movapd z101, [_1 + (8 * 2)] movapd x2y2, [_2] movapd z202, [_2 + (8 * 2)] subpd x1y1, x2y2 subpd z101, z202 movapd m1m1, [_1 + (8 * 8)] movapd m2m2, [_2 + (8 * 8)] mulpd m1m1, m2m2 mulpd x1y1, x1y1 mulpd z101, z101 haddpd x1y1, z101 haddpd x1y1, x1y1 sqrtpd x1y1, x1y1 divpd m1m1, x1y1 subpd z0ee, m1m1 add pairs, (8 * 2) loop 1b bodies = rdi; bodies_length = rcx vxvy = xmm15; vzv0 = xmm14 mmmm = xmm13; mov bodies, offset bodies_data mov bodies_length, 5 1: movapd vxvy, [bodies + (8 * 4)] movapd vzv0, [bodies + (8 * 6)] movapd mmmm, [bodies + (8 * 8)] mulpd vxvy, vxvy mulpd vzv0, vzv0 haddpd vxvy, vzv0 haddpd vxvy, vxvy divpd vxvy, [two] mulpd vxvy, mmmm addpd z0ee, vxvy add bodies, (8 * 10) loop 1b mov rdi, offset format_energy movapd xmm0, z0ee mov al, 1 sub rsp, 8 call printf add rsp, 8 ret advance: /* dt: double, n: integer -> () */ dt = xmm5; n = r8 dto = xmm6 pairs = rdi; pairs_index = rcx _1 = rsi; _2 = rdx x1y1 = xmm15; z101 = xmm14 x2y2 = xmm13; z202 = xmm12 m1m1 = xmm11; m2m2 = xmm10 d0d1 = xmm8; d200 = xmm7 movapd dto, dt 1: mov pairs, offset pairs_data mov pairs_index, 10 2: movapd dt, dto mov _1, [pairs] mov _2, [pairs + (8 * 1)] movapd x1y1, [_1] movapd z101, [_1 + (8 * 2)] movapd x2y2, [_2] movapd z202, [_2 + (8 * 2)] subpd x1y1, x2y2 subpd z101, z202 movapd d0d1, x1y1 movapd d200, z101 mulpd x1y1, x1y1 mulpd z101, z101 haddpd x1y1, z101 haddpd x1y1, x1y1 sqrtpd z101, x1y1 mulpd x1y1, z101 divpd dt, x1y1 mag = dt; mag2 = xmm1 movapd mag2, mag movapd m2m2, [_2 + (8 * 8)] movapd x1y1, d0d1 movapd z101, d200 v1v2 = xmm9; v300 = xmm4 movapd v1v2, [_1 + (8 * 4)] movapd v300, [_1 + (8 * 6)] movapd m1m1, [_1 + (8 * 8)] movapd x2y2, d0d1 movapd z202, d200 v4v5 = xmm3; v600 = xmm2 movapd v4v5, [_2 + (8 * 4)] movapd v600, [_2 + (8 * 6)] mulpd m2m2, mag mulpd x1y1, m2m2 mulpd z101, m2m2 subpd v1v2, x1y1 subpd v300, z101 movapd [_1 + (8 * 4)], v1v2 movapd [_1 + (8 * 6)], v300 mulpd m1m1, mag2 mulpd x2y2, m1m1 mulpd z202, m1m1 addpd v4v5, x2y2 addpd v600, z202 movapd [_2 + (8 * 4)], v4v5 movapd [_2 + (8 * 6)], v600 add pairs, (8 * 2) dec pairs_index cmp pairs_index, 0 jne 2b bodies = rdi; bodies_length = rcx vxvy = xmm15; vzv0 = xmm14 mmmm = xmm13; mov bodies, offset bodies_data mov bodies_length, 5 movapd dt, dto 2: movapd vxvy, [bodies + (8 * 4)] movapd vzv0, [bodies + (8 * 6)] mulpd vxvy, dt mulpd vzv0, dt addpd vxvy, [bodies] addpd vzv0, [bodies + (8 * 2)] movapd [bodies], vxvy movapd [bodies + (8 * 2)], vzv0 add bodies, body_size loop 2b dec n cmp n, 0 jne 1b ret
The numbers
Theirs took on average 2.85s.
Mine took on average 4.64s.
The only hand optimization I tried was to “decouple” registers from some instructions so they can execute in parallel/speculatively I guess. The instruction “haddpd” apparently is slow too but changing it made about a 0.01s difference.
If you run objdump on the C, it comes out to using similar instructions to what I used. I haven’t looked particularly close at it but I’m sure someone could spot why mine is almost 2x as slow.
One thing I’d like to see is if someone could improve this in a simple way.
And well that’s that. I have more plans to explore this space. My next experiment is creating a lib of macros to deal with lists in an easy way… Not that they aren’t already pretty easily.