cpu.mode fastest code on the internet
01 specification

Llama.cpp's ggml_vec_dot_q6_K_q8_K

Implement llama.cpp's ggml_vec_dot_q6_K_q8_K function. The judge builds your source as libsolution.so and calls the exported function from a verifier executable.

This kernel computes one dot product between a Q6_K quantized weight row and a Q8_K activation vector. Q6_K stores 256 weights per block using 4 low bits, 2 high bits, per-group signed scales, and a half-precision super-block scale; Q8_K stores the runtime activations and group sums. The hot work is unpacking the 6-bit weights, applying the scales and the -32 zero point, multiplying against the signed Q8 values, and accumulating the result for quantized llama.cpp matmul.

The required symbol is extern "C" void ggml_vec_dot_q6_K_q8_K(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc).

The verifier supplies block_q6_K and block_q8_K buffers in llama.cpp layout with nrc = 1, then compares the printed aggregate against the scalar reference with a small floating-point tolerance.

minimal Rust solution
use std::ffi::c_void;

/// Number of quantized values held by one super-block; the per-block array
/// lengths in this file are all derived from it.
const QK_K: usize = 256;

/// One Q6_K super-block: `QK_K` weights stored as 6-bit values split into a
/// 4-bit low part and a 2-bit high part, plus per-group scales and one
/// half-precision super-block scale.
///
/// `#[repr(C)]` keeps the fields in declaration order so the struct can be
/// read directly from the buffer handed over by the C caller.
#[repr(C)]
struct BlockQ6K {
    /// Low 4 bits of each weight, two weights per byte (low and high nibble).
    ql: [u8; QK_K / 2],
    /// High 2 bits of each weight, four weights per byte.
    qh: [u8; QK_K / 4],
    /// Signed scale for each group of 16 weights.
    scales: [i8; QK_K / 16],
    /// Super-block scale as raw half-precision bits (decoded with `f16_to_f32`).
    d: u16,
}

/// One Q8_K super-block: `QK_K` signed 8-bit activations with a single `f32`
/// scale.
///
/// `#[repr(C)]` keeps the fields in declaration order so the struct can be
/// read directly from the buffer handed over by the C caller.
#[repr(C)]
struct BlockQ8K {
    /// Scale of the block; multiplied with the Q6_K super-block scale.
    d: f32,
    /// Quantized activation values.
    qs: [i8; QK_K],
    /// One entry per group of 16 values; described by the task text as group
    /// sums. NOTE(review): not read by the dot product in this file.
    bsums: [i16; QK_K / 16],
}

/// C-ABI entry point: dot product of one Q6_K weight row with one Q8_K
/// activation row, stored into `*s`.
///
/// Only `n`, `s`, `vx` and `vy` are used; the stride arguments (`_bs`, `_bx`,
/// `_by`) and `_nrc` are accepted for signature compatibility and ignored.
///
/// `n` is the number of values in the row; `n / QK_K` complete blocks are
/// processed. When that count is zero (including any non-positive `n`), `0.0`
/// is stored and neither `vx` nor `vy` is dereferenced.
///
/// # Safety
///
/// * `s` must be valid and properly aligned for writing one `f32`.
/// * When `n >= QK_K`, `vx` must point to `n / QK_K` initialized, properly
///   aligned `BlockQ6K` values and `vy` to the same number of `BlockQ8K`
///   values, both readable for the duration of the call.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn ggml_vec_dot_q6_K_q8_K(
    n: i32,
    s: *mut f32,
    _bs: usize,
    vx: *const c_void,
    _bx: usize,
    vy: *const c_void,
    _by: usize,
    _nrc: i32,
) {
    // A negative `n` converted with `as usize` would wrap to an enormous block
    // count and make the slice construction below undefined behaviour, so
    // treat every non-positive length as "no blocks".
    let nb = if n > 0 { n as usize / QK_K } else { 0 };

    let result = if nb == 0 {
        // Nothing to accumulate; skip slice construction entirely so that
        // null or dangling input pointers are never turned into references.
        0.0
    } else {
        // SAFETY: the caller guarantees `vx` points to `nb` valid, aligned
        // `BlockQ6K` values (see the `# Safety` section).
        let q6 = unsafe { std::slice::from_raw_parts(vx.cast::<BlockQ6K>(), nb) };
        // SAFETY: the caller guarantees `vy` points to `nb` valid, aligned
        // `BlockQ8K` values (see the `# Safety` section).
        let q8 = unsafe { std::slice::from_raw_parts(vy.cast::<BlockQ8K>(), nb) };
        reference_dot(q6, q8)
    };

    // SAFETY: the caller guarantees `s` is valid and aligned for one `f32`.
    unsafe { s.write(result) };
}

/// Scalar dot product over paired Q6_K / Q8_K blocks.
///
/// For every block pair the 6-bit weights are expanded to signed bytes
/// (zero point 32 removed), multiplied with the activations and the
/// per-group scale in integer arithmetic across eight lanes, and the lane
/// totals are then scaled by the product of the two block scales. The final
/// result is the sum of the eight floating-point lanes.
///
/// Pairs are formed with `zip`, so extra blocks on the longer side are ignored.
fn reference_dot(q6: &[BlockQ6K], q8: &[BlockQ8K]) -> f32 {
    let mut lane_totals = [0.0_f32; 8];

    for (weights, acts) in q6.iter().zip(q8.iter()) {
        // Expand the packed 6-bit weights into one signed byte each.
        // Every 128-value half consumes 64 bytes of `ql` and 32 bytes of `qh`.
        let mut unpacked = [0_i8; QK_K];
        for half in 0..QK_K / 128 {
            let low = &weights.ql[half * 64..half * 64 + 64];
            let high = &weights.qh[half * 32..half * 32 + 32];
            let dst = &mut unpacked[half * 128..half * 128 + 128];

            for i in 0..32 {
                // One `qh` byte carries the top two bits of four weights.
                let top = high[i];
                dst[i] = ((low[i] & 0x0f) | ((top & 3) << 4)) as i8 - 32;
                dst[i + 32] = ((low[i + 32] & 0x0f) | (((top >> 2) & 3) << 4)) as i8 - 32;
                dst[i + 64] = ((low[i] >> 4) | (((top >> 4) & 3) << 4)) as i8 - 32;
                dst[i + 96] = ((low[i + 32] >> 4) | (((top >> 6) & 3) << 4)) as i8 - 32;
            }
        }

        // Integer accumulation: values are walked in runs of eight, each run
        // landing in lanes 0..8; two consecutive runs share one group scale.
        let mut lane_acc = [0_i32; 8];
        let runs = unpacked.chunks_exact(8).zip(acts.qs.chunks_exact(8));
        for (run, (w_run, a_run)) in runs.enumerate() {
            let scale = i32::from(weights.scales[run / 2]);
            for ((acc, &w), &a) in lane_acc.iter_mut().zip(w_run.iter()).zip(a_run.iter()) {
                *acc += scale * i32::from(a) * i32::from(w);
            }
        }

        // Apply the combined block scale lane by lane.
        let block_scale = f16_to_f32(weights.d) * acts.d;
        for (total, &acc) in lane_totals.iter_mut().zip(lane_acc.iter()) {
            *total += block_scale * acc as f32;
        }
    }

    lane_totals.iter().sum()
}

/// Converts raw IEEE 754 half-precision bits into an `f32`.
///
/// Handles signed zero, subnormals (renormalized into a normal `f32`),
/// normal numbers, and infinities/NaNs (mantissa bits are carried over).
fn f16_to_f32(bits: u16) -> f32 {
    let half = u32::from(bits);
    let sign_bit = (half & 0x8000) << 16;
    let exp_field = (half >> 10) & 0x1f;
    let mantissa = half & 0x03ff;

    let magnitude = match (exp_field, mantissa) {
        // Signed zero: only the sign bit survives.
        (0, 0) => 0,
        // Subnormal: shift the mantissa until its leading one reaches the
        // implicit-bit position (bit 10), lowering the exponent by the same
        // amount. For a 10-bit non-zero mantissa the shift is 1..=10.
        (0, m) => {
            let shift = m.leading_zeros() - 21;
            let normalized = (m << shift) & 0x03ff;
            // Biased f32 exponent: (-14 - shift) + 127.
            ((113 - shift) << 23) | (normalized << 13)
        }
        // Infinity or NaN: all-ones exponent, mantissa payload preserved.
        (0x1f, m) => 0x7f80_0000 | (m << 13),
        // Normal number: rebias the exponent from 15 to 127.
        (e, m) => ((e + 112) << 23) | (m << 13),
    };

    f32::from_bits(sign_bit | magnitude)
}
02 scope / runtime over time
Lang
System
double-click zooms out
03 leaderboard
Leaderboard · top 8 click any row to expand · open multiple to compare
Rank User Lang Best Position in CDF Analysis When
04 submit
Your Solution
Single File
Sign in to submit.