Sorted summary for file /box/solution-bin ---------------------------------------------- 38.70 /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:51 23.28 /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:53 12.77 /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:45 12.74 /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:46 5.90 /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:54 5.01 /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:52 1.13 /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:44 Samples | Source code & Disassembly of /box/solution-bin for cpu_core/cycles/P (441 samples, percent: local period) ------------------------------------------------------------------------------------------------------------------------- : : : : 3 Disassembly of section .text: : : 5 0000000000001180
: : 6 __m128i s = _mm_add_epi64(lo, hi); : 7 return (uint64_t)_mm_cvtsi128_si64(s) + : 8 (uint64_t)_mm_cvtsi128_si64(_mm_unpackhi_epi64(s, s)); : 9 } : : 11 int main() { 0 : 1180: push rbp 0 : 1181: mov rbp,rsp 0 : 1184: push r14 0 : 1186: push rbx 0 : 1187: sub rsp,0xb0 0 : 118e: lea rsi,[rbp-0xc0] : 18 constexpr uint8_t TARGET = 127; : 19 struct stat st; : 20 if (fstat(STDIN_FILENO, &st) != 0 || !S_ISREG(st.st_mode) || st.st_size <= 0) 0 : 1195: xor edi,edi 0 : 1197: call 1060 0 : 119c: mov ecx,eax 0 : 119e: mov eax,0x1 0 : 11a3: test ecx,ecx 0 : 11a5: je 11b3 : : 28 char outbuf[32]; : 29 int len = std::snprintf(outbuf, sizeof(outbuf), "%llu\n", (unsigned long long)cnt); : 30 (void)write(STDOUT_FILENO, outbuf, len); : 31 return 0; : 32 } 0 : 11a7: add rsp,0xb0 0 : 11ae: pop rbx 0 : 11af: pop r14 0 : 11b1: pop rbp 0 : 11b2: ret 0 : 11b3: mov ecx,0xf000 : 39 if (fstat(STDIN_FILENO, &st) != 0 || !S_ISREG(st.st_mode) || st.st_size <= 0) 0 : 11b8: and ecx,DWORD PTR [rbp-0xa8] 0 : 11be: cmp ecx,0x8000 0 : 11c4: jne 11a7 0 : 11c6: mov rbx,QWORD PTR [rbp-0x90] 0 : 11cd: test rbx,rbx 0 : 11d0: jle 11a7 : 46 void* p = mmap(nullptr, n, PROT_READ, MAP_PRIVATE, STDIN_FILENO, 0); 0 : 11d2: xor edi,edi 0 : 11d4: mov rsi,rbx 0 : 11d7: mov edx,0x1 0 : 11dc: mov ecx,0x2 0 : 11e1: xor r8d,r8d 0 : 11e4: xor r9d,r9d 0 : 11e7: call 1070 0 : 11ec: mov r14,rax 0 : 11ef: mov eax,0x1 : 56 if (p == MAP_FAILED) return 1; 0 : 11f4: cmp r14,0xffffffffffffffff 0 : 11f8: je 11a7 : 59 (void)madvise(p, n, MADV_SEQUENTIAL); 0 : 11fa: mov rdi,r14 0 : 11fd: mov rsi,rbx 0 : 1200: mov edx,0x2 0 : 1205: call 1040 0 : 120a: vpxor xmm0,xmm0,xmm0 : 65 while (i + BLOCK <= n) { 0 : 120e: cmp rbx,0x7f80 0 : 1215: jae 1222 0 : 1217: xor edx,edx 0 : 1219: vpxor xmm1,xmm1,xmm1 0 : 121d: jmp 137b 0 : 1222: vpxor xmm2,xmm2,xmm2 0 : 1226: mov ecx,0x7f80 0 : 122b: vpbroadcastb ymm3,BYTE PTR [rip+0xe24] # 2058 <_IO_stdin_used+0x58> 0 : 1234: mov rax,r14 0 : 1237: vpxor xmm1,xmm1,xmm1 0 : 123b: nop DWORD PTR [rax+rax*1+0x0] 0 : 1240: xor edx,edx 0 : 1242: vpxor xmm7,xmm7,xmm7 0 : 1246: vpxor xmm5,xmm5,xmm5 0 : 124a: vpxor xmm6,xmm6,xmm6 0 : 124e: vpxor xmm4,xmm4,xmm4 0 : 1252: data16 data16 data16 data16 cs nop WORD PTR [rax+rax*1+0x0] : 83 _mm_prefetch((const char*)(p2 + PREFETCH_DIST), _MM_HINT_T0); 20 : 1260: prefetcht0 BYTE PTR [rax+rdx*1+0x200] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:45 : 85 _mm_prefetch((const char*)(p2 + PREFETCH_DIST + 64), _MM_HINT_T0); 20 : 1268: prefetcht0 BYTE PTR [rax+rdx*1+0x240] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:46 : 87 a0 = _mm256_sub_epi8(a0, _mm256_cmpeq_epi8(v0, t)); 57 : 1270: vpcmpeqb ymm8,ymm3,YMMWORD PTR [rax+rdx*1] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:51 0 : 1275: vpsubb ymm7,ymm7,ymm8 : 90 a1 = _mm256_sub_epi8(a1, _mm256_cmpeq_epi8(v1, t)); 3 : 127a: vpcmpeqb ymm8,ymm3,YMMWORD PTR [rax+rdx*1+0x20] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:52 4 : 1280: vpsubb ymm5,ymm5,ymm8 : 93 a2 = _mm256_sub_epi8(a2, _mm256_cmpeq_epi8(v2, t)); 32 : 1285: vpcmpeqb ymm8,ymm3,YMMWORD PTR [rax+rdx*1+0x40] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:53 0 : 128b: vpsubb ymm6,ymm6,ymm8 : 96 a3 = _mm256_sub_epi8(a3, _mm256_cmpeq_epi8(v3, t)); 10 : 1290: vpcmpeqb ymm8,ymm3,YMMWORD PTR [rax+rdx*1+0x60] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:54 0 : 1296: vpsubb ymm4,ymm4,ymm8 : 99 _mm_prefetch((const char*)(p2 + PREFETCH_DIST), _MM_HINT_T0); 20 : 129b: prefetcht0 BYTE PTR [rax+rdx*1+0x280] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:45 : 101 _mm_prefetch((const char*)(p2 + PREFETCH_DIST + 64), _MM_HINT_T0); 16 : 12a3: prefetcht0 BYTE PTR [rax+rdx*1+0x2c0] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:46 : 103 a0 = _mm256_sub_epi8(a0, _mm256_cmpeq_epi8(v0, t)); 58 : 12ab: vpcmpeqb ymm8,ymm3,YMMWORD PTR [rax+rdx*1+0x80] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:51 : 105 a1 = _mm256_sub_epi8(a1, _mm256_cmpeq_epi8(v1, t)); 7 : 12b4: vpcmpeqb ymm9,ymm3,YMMWORD PTR [rax+rdx*1+0xa0] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:52 : 107 a0 = _mm256_sub_epi8(a0, _mm256_cmpeq_epi8(v0, t)); 0 : 12bd: vpsubb ymm7,ymm7,ymm8 : 109 a1 = _mm256_sub_epi8(a1, _mm256_cmpeq_epi8(v1, t)); 1 : 12c2: vpsubb ymm5,ymm5,ymm9 : 111 a2 = _mm256_sub_epi8(a2, _mm256_cmpeq_epi8(v2, t)); 38 : 12c7: vpcmpeqb ymm8,ymm3,YMMWORD PTR [rax+rdx*1+0xc0] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:53 : 113 a3 = _mm256_sub_epi8(a3, _mm256_cmpeq_epi8(v3, t)); 4 : 12d0: vpcmpeqb ymm9,ymm3,YMMWORD PTR [rax+rdx*1+0xe0] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:54 : 115 a2 = _mm256_sub_epi8(a2, _mm256_cmpeq_epi8(v2, t)); 0 : 12d9: vpsubb ymm6,ymm6,ymm8 : 117 a3 = _mm256_sub_epi8(a3, _mm256_cmpeq_epi8(v3, t)); 3 : 12de: vpsubb ymm4,ymm4,ymm9 : 119 _mm_prefetch((const char*)(p2 + PREFETCH_DIST), _MM_HINT_T0); 16 : 12e3: prefetcht0 BYTE PTR [rax+rdx*1+0x300] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:45 : 121 _mm_prefetch((const char*)(p2 + PREFETCH_DIST + 64), _MM_HINT_T0); 20 : 12eb: prefetcht0 BYTE PTR [rax+rdx*1+0x340] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:46 : 123 a0 = _mm256_sub_epi8(a0, _mm256_cmpeq_epi8(v0, t)); 53 : 12f3: vpcmpeqb ymm8,ymm3,YMMWORD PTR [rax+rdx*1+0x100] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:51 3 : 12fc: vpsubb ymm7,ymm7,ymm8 : 126 a1 = _mm256_sub_epi8(a1, _mm256_cmpeq_epi8(v1, t)); 5 : 1301: vpcmpeqb ymm8,ymm3,YMMWORD PTR [rax+rdx*1+0x120] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:52 3 : 130a: vpsubb ymm5,ymm5,ymm8 : 129 a2 = _mm256_sub_epi8(a2, _mm256_cmpeq_epi8(v2, t)); 30 : 130f: vpcmpeqb ymm8,ymm3,YMMWORD PTR [rax+rdx*1+0x140] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:53 3 : 1318: vpsubb ymm6,ymm6,ymm8 : 132 a3 = _mm256_sub_epi8(a3, _mm256_cmpeq_epi8(v3, t)); 6 : 131d: vpcmpeqb ymm8,ymm3,YMMWORD PTR [rax+rdx*1+0x160] // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:54 3 : 1326: vpsubb ymm4,ymm4,ymm8 : 135 for (size_t k = 0; k < ITERS; k++) { 1 : 132b: add rdx,0x180 0 : 1332: cmp rdx,0x7f80 5 : 1339: jne 1260 // /tmp/cpu-mode-161014-1777968459542848895-22/solution/solution.cpp:44 : 139 total = _mm256_add_epi64(total, _mm256_sad_epu8(a0, zero)); 0 : 133f: vpsadbw ymm7,ymm7,ymm2 0 : 1343: vpaddq ymm1,ymm7,ymm1 : 142 total = _mm256_add_epi64(total, _mm256_sad_epu8(a1, zero)); 0 : 1347: vpsadbw ymm5,ymm5,ymm2 : 144 total = _mm256_add_epi64(total, _mm256_sad_epu8(a2, zero)); 0 : 134b: vpsadbw ymm6,ymm6,ymm2 : 146 total = _mm256_add_epi64(total, _mm256_sad_epu8(a1, zero)); 0 : 134f: vpaddq ymm5,ymm5,ymm6 : 148 total = _mm256_add_epi64(total, _mm256_sad_epu8(a2, zero)); 0 : 1353: vpaddq ymm1,ymm1,ymm5 : 150 total = _mm256_add_epi64(total, _mm256_sad_epu8(a3, zero)); 0 : 1357: vpsadbw ymm4,ymm4,ymm2 0 : 135b: vpaddq ymm1,ymm1,ymm4 0 : 135f: mov rdx,rcx : 154 while (i + BLOCK <= n) { 0 : 1362: lea rsi,[rcx+0x7f80] 0 : 1369: add rax,0x7f80 0 : 136f: mov rcx,rsi 0 : 1372: cmp rsi,rbx 0 : 1375: jbe 1240 : 160 while (i + 32 <= n) { 0 : 137b: mov rax,rdx 0 : 137e: or rax,0x20 0 : 1382: cmp rax,rbx 0 : 1385: jbe 14d9 0 : 138b: mov rax,rdx : 166 total = _mm256_add_epi64(total, _mm256_sad_epu8(acc, zero)); 0 : 138e: vpxor xmm2,xmm2,xmm2 0 : 1392: vpsadbw ymm0,ymm0,ymm2 0 : 1396: vpaddq ymm0,ymm0,ymm1 : 170 __m128i hi = _mm256_extracti128_si256(v, 1); 0 : 139a: vextracti128 xmm1,ymm0,0x1 : 172 __m128i s = _mm_add_epi64(lo, hi); 0 : 13a0: vpaddq xmm0,xmm0,xmm1 : 174 return (uint64_t)_mm_cvtsi128_si64(s) + 0 : 13a4: vpshufd xmm1,xmm0,0xee 0 : 13a9: vpaddq xmm0,xmm0,xmm1 0 : 13ad: vmovq rcx,xmm0 : 178 for (; i < n; i++) cnt += (data[i] == TARGET); 0 : 13b2: mov rdx,rbx 0 : 13b5: sub rdx,rax 0 : 13b8: jbe 14a5 0 : 13be: cmp rdx,0x10 0 : 13c2: jb 1490 0 : 13c8: mov rsi,rdx 0 : 13cb: and rsi,0xfffffffffffffff0 0 : 13cf: vmovq xmm0,rcx 0 : 13d4: lea rcx,[rax+r14*1] 0 : 13d8: add rcx,0xc 0 : 13dc: add rax,rsi 0 : 13df: vpxor xmm1,xmm1,xmm1 0 : 13e3: xor edi,edi 0 : 13e5: vpbroadcastb xmm2,BYTE PTR [rip+0xc6a] # 2058 <_IO_stdin_used+0x58> 0 : 13ee: vpbroadcastq ymm3,QWORD PTR [rip+0xc59] # 2050 <_IO_stdin_used+0x50> 0 : 13f7: vpxor xmm4,xmm4,xmm4 0 : 13fb: vpxor xmm5,xmm5,xmm5 0 : 13ff: nop 0 : 1400: vmovd xmm6,DWORD PTR [rcx+rdi*1-0xc] 0 : 1406: vmovd xmm7,DWORD PTR [rcx+rdi*1-0x8] 0 : 140c: vmovd xmm8,DWORD PTR [rcx+rdi*1-0x4] 0 : 1412: vmovd xmm9,DWORD PTR [rcx+rdi*1] 0 : 1417: vpcmpeqb xmm6,xmm6,xmm2 0 : 141b: vpmovzxbq ymm6,xmm6 0 : 1420: vpand ymm6,ymm6,ymm3 0 : 1424: vpaddq ymm0,ymm0,ymm6 0 : 1428: vpcmpeqb xmm6,xmm7,xmm2 0 : 142c: vpmovzxbq ymm6,xmm6 0 : 1431: vpand ymm6,ymm6,ymm3 0 : 1435: vpaddq ymm1,ymm1,ymm6 0 : 1439: vpcmpeqb xmm6,xmm8,xmm2 0 : 143d: vpmovzxbq ymm6,xmm6 0 : 1442: vpand ymm6,ymm6,ymm3 0 : 1446: vpaddq ymm4,ymm4,ymm6 0 : 144a: vpcmpeqb xmm6,xmm9,xmm2 0 : 144e: vpmovzxbq ymm6,xmm6 0 : 1453: vpand ymm6,ymm6,ymm3 0 : 1457: vpaddq ymm5,ymm5,ymm6 0 : 145b: add rdi,0x10 0 : 145f: cmp rsi,rdi 0 : 1462: jne 1400 0 : 1464: vpaddq ymm0,ymm1,ymm0 0 : 1468: vpaddq ymm0,ymm4,ymm0 0 : 146c: vpaddq ymm0,ymm5,ymm0 0 : 1470: vextracti128 xmm1,ymm0,0x1 0 : 1476: vpaddq xmm0,xmm0,xmm1 0 : 147a: vpshufd xmm1,xmm0,0xee 0 : 147f: vpaddq xmm0,xmm0,xmm1 0 : 1483: vmovq rcx,xmm0 0 : 1488: cmp rdx,rsi 0 : 148b: je 14a5 0 : 148d: nop DWORD PTR [rax] 0 : 1490: xor edx,edx 0 : 1492: cmp BYTE PTR [r14+rax*1],0x7f 0 : 1497: sete dl 0 : 149a: add rcx,rdx 0 : 149d: inc rax 0 : 14a0: cmp rbx,rax 0 : 14a3: jne 1490 : 238 int len = std::snprintf(outbuf, sizeof(outbuf), "%llu\n", (unsigned long long)cnt); 0 : 14a5: lea rdx,[rip+0xbad] # 2059 <_IO_stdin_used+0x59> 0 : 14ac: lea rbx,[rbp-0x30] 0 : 14b0: mov esi,0x20 0 : 14b5: mov rdi,rbx 0 : 14b8: xor eax,eax 0 : 14ba: vzeroupper 0 : 14bd: call 1050 : 246 (void)write(STDOUT_FILENO, outbuf, len); 0 : 14c2: movsxd rdx,eax 0 : 14c5: mov edi,0x1 0 : 14ca: mov rsi,rbx 0 : 14cd: call 1030 0 : 14d2: xor eax,eax 0 : 14d4: jmp 11a7 0 : 14d9: vpxor xmm0,xmm0,xmm0 0 : 14dd: xor ecx,ecx 0 : 14df: vpbroadcastb ymm2,BYTE PTR [rip+0xb70] # 2058 <_IO_stdin_used+0x58> 0 : 14e8: jmp 1504 0 : 14ea: nop WORD PTR [rax+rax*1+0x0] : 258 while (i + 32 <= n) { 0 : 14f0: lea rax,[rdx+0x20] 0 : 14f4: add rdx,0x40 0 : 14f8: cmp rdx,rbx 0 : 14fb: mov rdx,rax 0 : 14fe: ja 138e : 264 __m256i m = _mm256_cmpeq_epi8(v, t); 0 : 1504: vpcmpeqb ymm3,ymm2,YMMWORD PTR [r14+rdx*1] : 266 acc = _mm256_sub_epi8(acc, m); 0 : 150a: vpsubb ymm0,ymm0,ymm3 : 268 if (++inner_iters == 255) { 0 : 150e: inc rcx 0 : 1511: cmp rcx,0xff 0 : 1518: jne 14f0 : 272 total = _mm256_add_epi64(total, _mm256_sad_epu8(acc, zero)); 0 : 151a: vpxor xmm3,xmm3,xmm3 0 : 151e: vpsadbw ymm0,ymm0,ymm3 0 : 1522: vpaddq ymm1,ymm0,ymm1 0 : 1526: vpxor xmm0,xmm0,xmm0 0 : 152a: xor ecx,ecx 0 : 152c: jmp 14f0