cpu.mode fastest code on the internet
solution

sol_3159247_1781243189810890613_3

C++ Scalar sparse merge dot 2 runs
01 source
Submitted source 723 bytes
Compiler clang++ Flags -O3 -march=native -std=c++20
show source
#include <cstddef>
#include <cstdint>

/// Sparse dot product over two index/weight array pairs.
///
/// Walks `a` and `b` together, always stepping past the smaller of the two
/// current indices. Whenever the current indices are equal, the matching
/// weights are widened to double, multiplied, and added to the running total.
/// The walk ends as soon as either index array is exhausted.
///
/// NOTE(review): this merge walk presumably expects both index arrays to be
/// sorted in ascending order — confirm against callers.
///
/// @param a          indices of the first vector (`a_length` entries)
/// @param b          indices of the second vector (`b_length` entries)
/// @param a_weights  weights, read at the same positions as `a`
/// @param b_weights  weights, read at the same positions as `b`
/// @param a_length   number of entries in `a`
/// @param b_length   number of entries in `b`
/// @param product    receives the accumulated sum (0.0 when nothing matches)
extern "C" void nk_sparse_dot_u32f32(
    const std::uint32_t *a,
    const std::uint32_t *b,
    const float *a_weights,
    const float *b_weights,
    std::size_t a_length,
    std::size_t b_length,
    double *product
) {
    double total = 0.0;
    std::size_t pos_a = 0;
    std::size_t pos_b = 0;

    for (;;) {
        // Stop once either side has no entries left to compare.
        if (pos_a >= a_length || pos_b >= b_length) {
            break;
        }

        const std::uint32_t left = a[pos_a];
        const std::uint32_t right = b[pos_b];

        // Skip whichever side currently holds the smaller index.
        if (left < right) {
            ++pos_a;
            continue;
        }
        if (right < left) {
            ++pos_b;
            continue;
        }

        // Indices are equal: accumulate in double precision, then advance both.
        total += static_cast<double>(a_weights[pos_a]) * static_cast<double>(b_weights[pos_b]);
        ++pos_a;
        ++pos_b;
    }

    *product = total;
}
02 jobs
Systems 02 jobs
03 counters
Performance counters 26 counters
cycles
173,486,259
Show more
branch_instructions
46,444,622
branch_misses
5,114,345
dtlb_load_misses.walk_completed
2,134
instructions
116,109,507
mem_bound_stalls.load_dram_hit
606,058
mem_bound_stalls.load_l2_hit
2,755,462
mem_bound_stalls.load_llc_hit
35,143
mem_inst_retired.split_loads
745
mem_load_retired.l1_miss
341,550
mem_load_retired.l2_miss
5,050
mem_load_retired.l3_miss
4,206
tma_backend_bound
4,791,487
tma_backend_bound_alloc_restrictions
235,871
tma_backend_bound_non_memory_scheduler
1,527,265
tma_backend_bound_register
860
tma_backend_bound_reorder_buffer
1,178,395
tma_backend_bound_serialization
1,341,867
tma_bad_speculation
463,848,607
tma_bad_speculation_branch_mispredict
463,755,929
tma_bad_speculation_machine_clears
92,678
tma_frontend_bandwidth
236,131,545
tma_frontend_bound
287,605,888
tma_frontend_latency
51,474,343
tma_memory_bound
326,495
tma_retiring
116,743,500
04 top down
Top-down analysis Gracemont E-core
05 profile
load profile