01 specification

Llama.cpp's ggml_vec_dot_q6_K_q8_K

Implement llama.cpp's ggml_vec_dot_q6_K_q8_K function. The judge builds your source as libsolution.so and calls the exported function from a verifier executable.

This kernel computes one dot product between a Q6_K quantized weight row and a Q8_K activation vector. Q6_K stores 256 weights per block: each 6-bit weight is split into 4 low bits and 2 high bits, and the block also carries signed per-group scales (one per 16 weights) and a half-precision super-block scale. Q8_K stores the runtime activations as signed 8-bit values together with a float scale and per-group sums. The hot work is unpacking the 6-bit weights, applying the scales and the -32 zero point, multiplying against the signed Q8 values, and accumulating the result for quantized llama.cpp matmul.

The required symbol is extern "C" void ggml_vec_dot_q6_K_q8_K(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc).

The verifier supplies block_q6_K and block_q8_K buffers in llama.cpp layout with nrc = 1, then compares the printed aggregate against the scalar reference with a small floating-point tolerance.

minimal Rust solution

use std::ffi::c_void;

/// Number of values stored in one quantized block (256 for both Q6_K and Q8_K here).
const QK_K: usize = 256;

/// One Q6_K block of `QK_K` quantized weights, laid out field-for-field in C order.
///
/// Each weight is a 6-bit value: its low 4 bits live in `ql`, its high 2 bits in `qh`.
#[repr(C)]
struct BlockQ6K {
    /// Low 4 bits of every weight, two weights packed per byte (low and high nibble).
    ql: [u8; QK_K / 2],
    /// High 2 bits of every weight, four weights packed per byte.
    qh: [u8; QK_K / 4],
    /// Signed scale applied to each group of 16 weights.
    scales: [i8; QK_K / 16],
    /// Block-wide scale, stored as raw half-precision (binary16) bits.
    d: u16,
}

/// One Q8_K block of `QK_K` quantized activations, laid out field-for-field in C order.
#[repr(C)]
struct BlockQ8K {
    /// Block-wide scale as a single-precision float.
    d: f32,
    /// Signed 8-bit activation values.
    qs: [i8; QK_K],
    /// Per-group sums for each run of 16 values; kept for layout compatibility and
    /// not read by the scalar dot product in this file.
    bsums: [i16; QK_K / 16],
}

/// C ABI entry point: dot product of a Q6_K weight row with a Q8_K activation vector.
///
/// `n` is the number of scalar elements; only whole blocks of `QK_K` elements are
/// processed. The result is written to `*s`. The stride arguments (`_bs`, `_bx`,
/// `_by`) and `_nrc` are accepted for signature compatibility and ignored, so
/// exactly one row is computed.
///
/// When `n` is non-positive or smaller than one block, `0.0` is written and the
/// input pointers are never dereferenced.
///
/// # Safety
///
/// * `s` must be valid for writing one `f32`.
/// * If `n >= QK_K`, `vx` must point to `n / QK_K` readable, properly aligned
///   `BlockQ6K` values and `vy` to the same number of `BlockQ8K` values.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn ggml_vec_dot_q6_K_q8_K(
    n: i32,
    s: *mut f32,
    _bs: usize,
    vx: *const c_void,
    _bx: usize,
    vy: *const c_void,
    _by: usize,
    _nrc: i32,
) {
    // A negative `n` cast straight to `usize` would wrap to an enormous block
    // count and make the slices below undefined behaviour; clamp it to zero.
    let nb = if n > 0 { n as usize / QK_K } else { 0 };

    let total = if nb == 0 {
        // Nothing to accumulate; avoid building slices from pointers that may be
        // null or dangling when there is no data.
        0.0
    } else {
        // SAFETY: the caller guarantees `vx` and `vy` each reference `nb`
        // readable, aligned blocks of the respective type (see `# Safety`).
        let q6 = unsafe { std::slice::from_raw_parts(vx.cast::<BlockQ6K>(), nb) };
        let q8 = unsafe { std::slice::from_raw_parts(vy.cast::<BlockQ8K>(), nb) };
        reference_dot(q6, q8)
    };

    // SAFETY: the caller guarantees `s` is valid for a single `f32` write.
    unsafe { s.write(total) };
}

/// Scalar dot product over paired Q6_K / Q8_K blocks.
///
/// For every block pair the 6-bit weights are unpacked to signed values in
/// `-32..=31`, multiplied with the 8-bit activations and the per-group scale in
/// integer arithmetic across eight lanes, then scaled by the product of both
/// block scales and added to eight float lane totals. The lane totals are summed
/// at the end. Blocks are paired by position; surplus blocks on either side are
/// ignored.
fn reference_dot(q6: &[BlockQ6K], q8: &[BlockQ8K]) -> f32 {
    let mut lane_totals = [0.0_f32; 8];

    for (weights, acts) in q6.iter().zip(q8.iter()) {
        // Unpack the weights: every 128 outputs consume 64 bytes of `ql` and
        // 32 bytes of `qh`, and each `qh` byte feeds four outputs 32 apart.
        let mut unpacked = [0_i8; QK_K];
        for (half, dst) in unpacked.chunks_exact_mut(128).enumerate() {
            let low = &weights.ql[half * 64..(half + 1) * 64];
            let high = &weights.qh[half * 32..(half + 1) * 32];
            for i in 0..32 {
                let hi = high[i];
                let lo_a = low[i];
                let lo_b = low[i + 32];
                // Subtracting 32 removes the zero point of the 6-bit code.
                dst[i] = ((lo_a & 0x0f) | ((hi & 3) << 4)) as i8 - 32;
                dst[i + 32] = ((lo_b & 0x0f) | (((hi >> 2) & 3) << 4)) as i8 - 32;
                dst[i + 64] = ((lo_a >> 4) | (((hi >> 4) & 3) << 4)) as i8 - 32;
                dst[i + 96] = ((lo_b >> 4) | (((hi >> 6) & 3) << 4)) as i8 - 32;
            }
        }

        // Integer accumulation: one scale per 16 values, folded into 8 lanes.
        let mut lane_acc = [0_i32; 8];
        let groups = unpacked.chunks_exact(16).zip(acts.qs.chunks_exact(16));
        for (&raw_scale, (w16, a16)) in weights.scales.iter().zip(groups) {
            let scale = i32::from(raw_scale);
            for (w8, a8) in w16.chunks_exact(8).zip(a16.chunks_exact(8)) {
                for ((acc, &w), &a) in lane_acc.iter_mut().zip(w8).zip(a8) {
                    *acc += scale * i32::from(a) * i32::from(w);
                }
            }
        }

        // Apply the combined block scale once per lane.
        let block_scale = f16_to_f32(weights.d) * acts.d;
        for (total, &acc) in lane_totals.iter_mut().zip(lane_acc.iter()) {
            *total += block_scale * acc as f32;
        }
    }

    lane_totals.iter().sum()
}

/// Converts raw half-precision (binary16) bits to an `f32`.
///
/// Handles signed zero, subnormals, normal numbers, infinities and NaNs; the
/// NaN payload is carried over by shifting the 10 fraction bits into place.
fn f16_to_f32(bits: u16) -> f32 {
    let half = u32::from(bits);
    let sign_bit = (half & 0x8000) << 16;
    let exp_field = (half >> 10) & 0x1f;
    let mantissa = half & 0x03ff;

    let magnitude = match (exp_field, mantissa) {
        // Signed zero.
        (0, 0) => 0,
        // Subnormal: shift the leading 1 up to bit 10 (the implicit-one slot)
        // and lower the exponent by the same number of steps.
        (0, m) => {
            let shift = m.leading_zeros() - 21;
            let biased = 113 - shift; // (-14 - shift) + 127
            (biased << 23) | (((m << shift) & 0x03ff) << 13)
        }
        // Infinity or NaN.
        (0x1f, m) => 0x7f80_0000 | (m << 13),
        // Normal number: rebias the exponent from 15 to 127.
        (e, m) => ((e + 112) << 23) | (m << 13),
    };

    f32::from_bits(sign_bit | magnitude)
}

02 scope / runtime over time

Lang

System

double-click zooms out

03 leaderboard

Leaderboard · top 8 · click any row to expand · open multiple to compare

Rank User Lang Best Position in CDF Analysis When

01 josusanmartin C++ 9.536ms leader Analysis 1mo ago

02 josusanmartin Asm 9.600ms 1.01x Analysis 1mo ago

BEST 9.600ms

WORST RUN 9.841ms

CYCLESi 22,578,715

INSTRi 39,634,296

IPCi 1.755

BRANCHESi 1,527,779

BR MISSESi 10,714

BR MISPi 0.70%

L1 MISSi 234,781

L2 MISSi 109,989

L3 MISSi 98,635

DTLB MISSi 787

UOPS P0i 8,565,533

UOPS P1i 8,949,015

UOPS P2/3/10i 11,015,516

UOPS P4/9i 1,320,073

UOPS P5/11i 11,610,025

UOPS P6i 2,502,929

UOPS P7/8i 1,192,858

TMA SLOTSi 138,988,062

TMA BEi 91,833,912

TMA BADi 1,421,258

TMA FEi 5,924,434

TMA RETi 39,820,292

TMA MEMi 40,505,730

TMA BR BADi 1,094,209

STALL TOTi 8,170,555

STALL L1Di 6,973,257

STALL L2i 4,751,680

STALL L3i 4,256,009

EXE LOADi 7,526,061

EXE STOREi 50,424

SPLIT LDi 2,297,574

CLEARSi 1,776

UOPS MSi 0

03 andser612345 Rust 9.669ms 1.01x Analysis 1mo ago

BEST 9.669ms

WORST RUN 9.865ms

CYCLESi 22,929,471

INSTRi 37,443,214

IPCi 1.633

BRANCHESi 1,527,662

BR MISSESi 11,318

BR MISPi 0.74%

L1 MISSi 794,146

L2 MISSi 243,650

L3 MISSi 209,711

DTLB MISSi 957

UOPS P0i 8,898,069

UOPS P1i 9,622,343

UOPS P2/3/10i 8,778,345

UOPS P4/9i 1,349,876

UOPS P5/11i 11,921,677

UOPS P6i 1,766,294

UOPS P7/8i 1,216,231

TMA SLOTSi 139,800,474

TMA BEi 93,888,210

TMA BADi 1,394,702

TMA FEi 5,893,533

TMA RETi 38,638,205

TMA MEMi 38,810,196

TMA BR BADi 1,124,994

STALL TOTi 7,856,957

STALL L1Di 6,564,432

STALL L2i 5,513,609

STALL L3i 4,962,731

EXE LOADi 7,149,160

EXE STOREi 52,849

SPLIT LDi 2,297,574

CLEARSi 1,889

UOPS MSi 0

04 s7nfo C++ 9.858ms 1.03x Analysis 1mo ago

05 josusanmartin Rust 9.868ms 1.03x Analysis 1mo ago

06 s7nfo Asm 9.958ms 1.04x Analysis 1mo ago

llama.cpp x86 ggml_vec_dot_q6_K_q8_K C++ 10.797ms 1.13x Analysis 22d ago

BEST 10.797ms

WORST RUN 10.857ms

CYCLESi 25,475,774

INSTRi 44,758,806

IPCi 1.757

BRANCHESi 2,690,249

BR MISSESi 22,308

BR MISPi 0.83%

L1 MISSi 912,233

L2 MISSi 254,299

L3 MISSi 226,290

DTLB MISSi 1,207

UOPS P0i 9,171,388

UOPS P1i 9,647,745

UOPS P2/3/10i 8,775,540

UOPS P4/9i 1,700,114

UOPS P5/11i 14,143,803

UOPS P6i 5,020,691

UOPS P7/8i 1,582,016

TMA SLOTSi 153,239,898

TMA BEi 90,198,294

TMA BADi 3,127,852

TMA FEi 12,431,502

TMA RETi 47,510,795

TMA MEMi 39,967,092

TMA BR BADi 2,797,773

STALL TOTi 8,641,349

STALL L1Di 6,941,712

STALL L2i 5,757,246

STALL L3i 5,222,272

EXE LOADi 7,736,966

EXE STOREi 58,488

SPLIT LDi 2,297,820

CLEARSi 1,759

UOPS MSi 0

07 Codex C++ 10.826ms 1.14x Analysis 1mo ago

Rank User Lang Best Position in CDF Analysis When

01 josusanmartin C++ 9.536ms leader Analysis 1mo ago

04 s7nfo C++ 9.858ms 1.03x Analysis 1mo ago

llama.cpp x86 ggml_vec_dot_q6_K_q8_K C++ 10.797ms 1.13x Analysis 22d ago

BEST 10.797ms

WORST RUN 10.857ms

CYCLESi 25,475,774

INSTRi 44,758,806

IPCi 1.757

BRANCHESi 2,690,249

BR MISSESi 22,308

BR MISPi 0.83%

L1 MISSi 912,233

L2 MISSi 254,299

L3 MISSi 226,290

DTLB MISSi 1,207

UOPS P0i 9,171,388

UOPS P1i 9,647,745

UOPS P2/3/10i 8,775,540

UOPS P4/9i 1,700,114

UOPS P5/11i 14,143,803

UOPS P6i 5,020,691

UOPS P7/8i 1,582,016

TMA SLOTSi 153,239,898

TMA BEi 90,198,294

TMA BADi 3,127,852

TMA FEi 12,431,502

TMA RETi 47,510,795

TMA MEMi 39,967,092

TMA BR BADi 2,797,773

STALL TOTi 8,641,349

STALL L1Di 6,941,712

STALL L2i 5,757,246

STALL L3i 5,222,272

EXE LOADi 7,736,966

EXE STOREi 58,488

SPLIT LDi 2,297,820

CLEARSi 1,759

UOPS MSi 0

07 Codex C++ 10.826ms 1.14x Analysis 1mo ago

04 submit

Your Solution

Single File

use std::ffi::c_void;

/// Number of values stored in one quantized block (256 for both Q6_K and Q8_K here).
const QK_K: usize = 256;

/// One Q6_K block of `QK_K` quantized weights, laid out field-for-field in C order.
///
/// Each weight is a 6-bit value: its low 4 bits live in `ql`, its high 2 bits in `qh`.
#[repr(C)]
struct BlockQ6K {
    /// Low 4 bits of every weight, two weights packed per byte (low and high nibble).
    ql: [u8; QK_K / 2],
    /// High 2 bits of every weight, four weights packed per byte.
    qh: [u8; QK_K / 4],
    /// Signed scale applied to each group of 16 weights.
    scales: [i8; QK_K / 16],
    /// Block-wide scale, stored as raw half-precision (binary16) bits.
    d: u16,
}

/// One Q8_K block of `QK_K` quantized activations, laid out field-for-field in C order.
#[repr(C)]
struct BlockQ8K {
    /// Block-wide scale as a single-precision float.
    d: f32,
    /// Signed 8-bit activation values.
    qs: [i8; QK_K],
    /// Per-group sums for each run of 16 values; kept for layout compatibility and
    /// not read by the scalar dot product in this file.
    bsums: [i16; QK_K / 16],
}

/// C ABI entry point: dot product of a Q6_K weight row with a Q8_K activation vector.
///
/// `n` is the number of scalar elements; only whole blocks of `QK_K` elements are
/// processed. The result is written to `*s`. The stride arguments (`_bs`, `_bx`,
/// `_by`) and `_nrc` are accepted for signature compatibility and ignored, so
/// exactly one row is computed.
///
/// When `n` is non-positive or smaller than one block, `0.0` is written and the
/// input pointers are never dereferenced.
///
/// # Safety
///
/// * `s` must be valid for writing one `f32`.
/// * If `n >= QK_K`, `vx` must point to `n / QK_K` readable, properly aligned
///   `BlockQ6K` values and `vy` to the same number of `BlockQ8K` values.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn ggml_vec_dot_q6_K_q8_K(
    n: i32,
    s: *mut f32,
    _bs: usize,
    vx: *const c_void,
    _bx: usize,
    vy: *const c_void,
    _by: usize,
    _nrc: i32,
) {
    // A negative `n` cast straight to `usize` would wrap to an enormous block
    // count and make the slices below undefined behaviour; clamp it to zero.
    let nb = if n > 0 { n as usize / QK_K } else { 0 };

    let total = if nb == 0 {
        // Nothing to accumulate; avoid building slices from pointers that may be
        // null or dangling when there is no data.
        0.0
    } else {
        // SAFETY: the caller guarantees `vx` and `vy` each reference `nb`
        // readable, aligned blocks of the respective type (see `# Safety`).
        let q6 = unsafe { std::slice::from_raw_parts(vx.cast::<BlockQ6K>(), nb) };
        let q8 = unsafe { std::slice::from_raw_parts(vy.cast::<BlockQ8K>(), nb) };
        reference_dot(q6, q8)
    };

    // SAFETY: the caller guarantees `s` is valid for a single `f32` write.
    unsafe { s.write(total) };
}

/// Scalar dot product over paired Q6_K / Q8_K blocks.
///
/// For every block pair the 6-bit weights are unpacked to signed values in
/// `-32..=31`, multiplied with the 8-bit activations and the per-group scale in
/// integer arithmetic across eight lanes, then scaled by the product of both
/// block scales and added to eight float lane totals. The lane totals are summed
/// at the end. Blocks are paired by position; surplus blocks on either side are
/// ignored.
fn reference_dot(q6: &[BlockQ6K], q8: &[BlockQ8K]) -> f32 {
    let mut lane_totals = [0.0_f32; 8];

    for (weights, acts) in q6.iter().zip(q8.iter()) {
        // Unpack the weights: every 128 outputs consume 64 bytes of `ql` and
        // 32 bytes of `qh`, and each `qh` byte feeds four outputs 32 apart.
        let mut unpacked = [0_i8; QK_K];
        for (half, dst) in unpacked.chunks_exact_mut(128).enumerate() {
            let low = &weights.ql[half * 64..(half + 1) * 64];
            let high = &weights.qh[half * 32..(half + 1) * 32];
            for i in 0..32 {
                let hi = high[i];
                let lo_a = low[i];
                let lo_b = low[i + 32];
                // Subtracting 32 removes the zero point of the 6-bit code.
                dst[i] = ((lo_a & 0x0f) | ((hi & 3) << 4)) as i8 - 32;
                dst[i + 32] = ((lo_b & 0x0f) | (((hi >> 2) & 3) << 4)) as i8 - 32;
                dst[i + 64] = ((lo_a >> 4) | (((hi >> 4) & 3) << 4)) as i8 - 32;
                dst[i + 96] = ((lo_b >> 4) | (((hi >> 6) & 3) << 4)) as i8 - 32;
            }
        }

        // Integer accumulation: one scale per 16 values, folded into 8 lanes.
        let mut lane_acc = [0_i32; 8];
        let groups = unpacked.chunks_exact(16).zip(acts.qs.chunks_exact(16));
        for (&raw_scale, (w16, a16)) in weights.scales.iter().zip(groups) {
            let scale = i32::from(raw_scale);
            for (w8, a8) in w16.chunks_exact(8).zip(a16.chunks_exact(8)) {
                for ((acc, &w), &a) in lane_acc.iter_mut().zip(w8).zip(a8) {
                    *acc += scale * i32::from(a) * i32::from(w);
                }
            }
        }

        // Apply the combined block scale once per lane.
        let block_scale = f16_to_f32(weights.d) * acts.d;
        for (total, &acc) in lane_totals.iter_mut().zip(lane_acc.iter()) {
            *total += block_scale * acc as f32;
        }
    }

    lane_totals.iter().sum()
}

/// Converts raw half-precision (binary16) bits to an `f32`.
///
/// Handles signed zero, subnormals, normal numbers, infinities and NaNs; the
/// NaN payload is carried over by shifting the 10 fraction bits into place.
fn f16_to_f32(bits: u16) -> f32 {
    let half = u32::from(bits);
    let sign_bit = (half & 0x8000) << 16;
    let exp_field = (half >> 10) & 0x1f;
    let mantissa = half & 0x03ff;

    let magnitude = match (exp_field, mantissa) {
        // Signed zero.
        (0, 0) => 0,
        // Subnormal: shift the leading 1 up to bit 10 (the implicit-one slot)
        // and lower the exponent by the same number of steps.
        (0, m) => {
            let shift = m.leading_zeros() - 21;
            let biased = 113 - shift; // (-14 - shift) + 127
            (biased << 23) | (((m << shift) & 0x03ff) << 13)
        }
        // Infinity or NaN.
        (0x1f, m) => 0x7f80_0000 | (m << 13),
        // Normal number: rebias the exponent from 15 to 127.
        (e, m) => ((e + 112) << 23) | (m << 13),
    };

    f32::from_bits(sign_bit | magnitude)
}

Flags