Llama.cpp's ggml_vec_dot_q6_K_q8_K
Implement llama.cpp's ggml_vec_dot_q6_K_q8_K function. The judge builds your source as libsolution.so and calls the exported function from a verifier executable.
This kernel computes one dot product between a Q6_K quantized weight row and a Q8_K activation vector. Q6_K stores 256 weights per block using 4 low bits, 2 high bits, per-group signed scales, and a half-precision super-block scale; Q8_K stores the runtime activations and group sums. The hot work is unpacking the 6-bit weights, applying the scales and the -32 zero point, multiplying against the signed Q8 values, and accumulating the result for quantized llama.cpp matmul.
The required symbol is extern "C" void ggml_vec_dot_q6_K_q8_K(int n, float * s, size_t bs, const void * vx, size_t bx, const void * vy, size_t by, int nrc).
The verifier supplies block_q6_K and block_q8_K buffers in llama.cpp layout with nrc = 1, then compares the printed aggregate against the scalar reference with a small floating-point tolerance.
use std::ffi::c_void;
/// Number of quantized values held by one block; sizes the arrays in `BlockQ6K` and `BlockQ8K`.
const QK_K: usize = 256;
/// One Q6_K quantized weight block, declared with C layout so it can be read from a foreign buffer.
///
/// `reference_dot` rebuilds each 6-bit weight from a nibble of `ql` and a 2-bit field of `qh`,
/// then subtracts 32 to obtain a signed value.
#[repr(C)]
struct BlockQ6K {
/// Low 4 bits of the weights, two weights packed per byte.
ql: [u8; QK_K / 2],
/// High 2 bits of the weights, four weights packed per byte.
qh: [u8; QK_K / 4],
/// Signed scale applied to each group of 16 weights.
scales: [i8; QK_K / 16],
/// Block-wide scale stored as raw half-precision bits (decoded with `f16_to_f32`).
d: u16,
}
/// One Q8_K quantized activation block, declared with C layout so it can be read from a foreign buffer.
#[repr(C)]
struct BlockQ8K {
/// Block-wide scale; multiplied with the weight block's scale in `reference_dot`.
d: f32,
/// Signed 8-bit activation values.
qs: [i8; QK_K],
/// Per-group sums of the activations according to the format description; not read by the
/// code visible in this file.
bsums: [i16; QK_K / 16],
}
/// C-ABI entry point: dot product of one Q6_K weight row with one Q8_K activation row.
///
/// Interprets `vx` as `n / QK_K` consecutive [`BlockQ6K`] values and `vy` as the same number
/// of [`BlockQ8K`] values, and stores the scalar result in `*s`.
///
/// The stride arguments `_bs`, `_bx`, `_by` and the row count `_nrc` are accepted only for
/// signature compatibility and are not used. Elements past the last whole block are ignored.
/// When `n` is non-positive or smaller than one block, `0.0` is stored and neither `vx` nor
/// `vy` is read.
///
/// # Safety
///
/// - `s` must be valid for writing one `f32`.
/// - When `n >= QK_K`, `vx` must point to `n / QK_K` readable, properly aligned `BlockQ6K`
///   values and `vy` must point to `n / QK_K` readable, properly aligned `BlockQ8K` values,
///   and neither buffer may be mutated for the duration of the call.
#[unsafe(no_mangle)]
pub unsafe extern "C" fn ggml_vec_dot_q6_K_q8_K(
    n: i32,
    s: *mut f32,
    _bs: usize,
    vx: *const c_void,
    _bx: usize,
    vy: *const c_void,
    _by: usize,
    _nrc: i32,
) {
    // `n as usize` would sign-extend a negative length into an enormous block count,
    // so non-positive lengths are clamped to zero blocks before the cast.
    let nb = if n > 0 { n as usize / QK_K } else { 0 };

    let total = if nb == 0 {
        // Nothing to read: avoid forming slices from pointers that may be null.
        0.0
    } else {
        // SAFETY: the caller guarantees `vx` points to `nb` valid, aligned `BlockQ6K` values.
        let q6 = unsafe { std::slice::from_raw_parts(vx.cast::<BlockQ6K>(), nb) };
        // SAFETY: the caller guarantees `vy` points to `nb` valid, aligned `BlockQ8K` values.
        let q8 = unsafe { std::slice::from_raw_parts(vy.cast::<BlockQ8K>(), nb) };
        reference_dot(q6, q8)
    };

    // SAFETY: the caller guarantees `s` is valid for writing a single `f32`.
    unsafe { s.write(total) };
}
/// Scalar dot product over paired Q6_K weight blocks and Q8_K activation blocks.
///
/// Blocks are consumed pairwise; iteration stops at the shorter of the two slices.
/// For every pair the 6-bit weights are unpacked to signed bytes, multiplied with the
/// activations under the per-group scales into eight integer lanes, and the lanes are then
/// scaled by the product of both block scales and added to eight running float lanes.
/// The return value is the sum of those float lanes.
fn reference_dot(q6: &[BlockQ6K], q8: &[BlockQ8K]) -> f32 {
    let mut lane_totals = [0.0_f32; 8];

    for (weights, acts) in q6.iter().zip(q8) {
        // Step 1: expand the packed 6-bit weights into signed values in -32..=31.
        let mut unpacked = [0_i8; QK_K];
        for half in 0..QK_K / 128 {
            let low = &weights.ql[half * 64..half * 64 + 64];
            let high = &weights.qh[half * 32..half * 32 + 32];
            let dst = &mut unpacked[half * 128..half * 128 + 128];
            for i in 0..32 {
                let hi = high[i];
                // Each `qh` byte supplies the top two bits for four weights, one per
                // 32-wide quarter; the matching low nibbles come from two `ql` bytes.
                let nibbles = [
                    low[i] & 0x0f,
                    low[i + 32] & 0x0f,
                    low[i] >> 4,
                    low[i + 32] >> 4,
                ];
                for (quarter, &nibble) in nibbles.iter().enumerate() {
                    let top = (hi >> (2 * quarter)) & 3;
                    dst[quarter * 32 + i] = ((nibble | (top << 4)) as i8) - 32;
                }
            }
        }

        // Step 2: integer multiply-accumulate, 16 values per scale, folded into 8 lanes.
        let mut lane_acc = [0_i32; 8];
        for (group, &raw_scale) in weights.scales.iter().enumerate() {
            let scale = i32::from(raw_scale);
            let base = group * 16;
            let w16 = &unpacked[base..base + 16];
            let a16 = &acts.qs[base..base + 16];
            for (w8, a8) in w16.chunks_exact(8).zip(a16.chunks_exact(8)) {
                for ((acc, &w), &a) in lane_acc.iter_mut().zip(w8).zip(a8) {
                    *acc += scale * i32::from(a) * i32::from(w);
                }
            }
        }

        // Step 3: apply the combined block scale; the per-lane, per-block accumulation
        // order is kept so the floating-point result is unchanged.
        let d = f16_to_f32(weights.d) * acts.d;
        for (total, &acc) in lane_totals.iter_mut().zip(&lane_acc) {
            *total += d * acc as f32;
        }
    }

    lane_totals.iter().sum()
}
/// Widens raw IEEE 754 half-precision bits to an `f32`.
///
/// Signed zeros, subnormals, normal values, infinities and NaNs are all mapped by direct
/// bit manipulation; NaN payload bits are carried over into the upper mantissa bits.
fn f16_to_f32(bits: u16) -> f32 {
    let half = u32::from(bits);
    let sign_bit = (half & 0x8000) << 16;
    let exp_field = (half >> 10) & 0x1f;
    let mantissa = half & 0x03ff;

    let magnitude = match (exp_field, mantissa) {
        // Signed zero.
        (0, 0) => 0,
        // Subnormal: shift the leading one up to bit 10 (the implicit-one position) and
        // lower the exponent by the same amount.
        (0, m) => {
            let shift = m.leading_zeros() - 21;
            let normalized = (m << shift) & 0x03ff;
            ((127 - 14 - shift) << 23) | (normalized << 13)
        }
        // Infinity or NaN.
        (0x1f, m) => 0x7f80_0000 | (m << 13),
        // Normal number: rebias the exponent from 15 to 127.
        (e, m) => ((e + 127 - 15) << 23) | (m << 13),
    };

    f32::from_bits(sign_bit | magnitude)
}