solution
sol_3159247_1781243190131426109_12
01
source
Submitted source
8884 bytes
show source
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#if defined(__x86_64__) || defined(__i386__)
#include <immintrin.h>
#endif
// Number of quantized values held by one block; every array in block_q6_K and
// block_q8_K is sized from it.
static constexpr int QK_K = 256;
// Raw 16-bit storage for a half-precision value (1 sign, 5 exponent, 10 mantissa
// bits); it is only turned into a float through fp16_to_fp32.
using ggml_half = std::uint16_t;
// One Q6_K block: QK_K values of 6 bits each, stored split into a 4-bit and a
// 2-bit plane. The kernels rebuild each value as (low | high << 4) - 32, i.e. a
// signed number in [-32, 31]. Total size is 128 + 64 + 16 + 2 = 210 bytes.
struct block_q6_K {
// Low 4 bits of every value, two values per byte (read with `& 0x0f` and `>> 4`).
std::uint8_t ql[QK_K / 2];
// Upper 2 bits of every value, four values per byte (read with shifts 0/2/4/6 and `& 3`).
std::uint8_t qh[QK_K / 4];
// One signed 8-bit scale for each run of 16 consecutive values.
std::int8_t scales[QK_K / 16];
// Block-wide scale kept as raw half-precision bits; decoded with fp16_to_fp32.
ggml_half d;
};
// One Q8_K block: QK_K signed 8-bit values with a float scale.
// Total size is 4 + 256 + 32 = 292 bytes.
struct block_q8_K {
// Block-wide scale; the kernels multiply it with the decoded block_q6_K::d.
float d;
// The quantized values themselves.
std::int8_t qs[QK_K];
// One 16-bit entry per run of 16 values. Only the AVX2 kernel reads it: it is
// weighted by the Q6_K scales and multiplied by 32 to undo the +32 bias of the
// unsigned 6-bit weights.
// NOTE(review): presumably bsums[g] is the sum of qs[16*g .. 16*g + 15]; the code
// that fills this struct is not in this file - confirm against the producer.
std::int16_t bsums[QK_K / 16];
};
// Pin the exact byte sizes (the field sizes add up to these numbers, so no
// padding is allowed): the kernels step through arrays of these structs via
// pointers cast from void*.
static_assert(sizeof(block_q6_K) == 210);
static_assert(sizeof(block_q8_K) == 292);
// Decode a half-precision bit pattern (1 sign bit, 5 exponent bits, 10 mantissa
// bits) into the float with the same value.
//
// Signed zeros, subnormals, normal numbers and infinities are converted exactly.
// For a NaN the 10 mantissa bits are moved into the top of the float mantissa.
static float fp16_to_fp32(ggml_half h) {
    const std::uint32_t raw = h;
    const std::uint32_t sign_bit = (raw & 0x8000u) << 16;
    const std::uint32_t half_exp = (raw >> 10) & 0x1fu;
    std::uint32_t fraction = raw & 0x03ffu;

    // Start from a signed zero and OR in exponent/mantissa where needed.
    std::uint32_t encoded = sign_bit;
    if (half_exp == 31) {
        // Infinity (fraction == 0) or NaN (fraction != 0).
        encoded |= 0x7f800000u | (fraction << 13);
    } else if (half_exp != 0) {
        // Normal number: only the exponent bias changes (15 -> 127).
        encoded |= ((half_exp + (127 - 15)) << 23) | (fraction << 13);
    } else if (fraction != 0) {
        // Subnormal: shift the mantissa up until its leading one reaches the
        // implicit-bit position, lowering the exponent once per shift. The
        // starting point 127 - 14 is the float exponent field of 2^-14.
        std::uint32_t float_exp = 127 - 14;
        while ((fraction & 0x0400u) == 0) {
            fraction <<= 1;
            --float_exp;
        }
        encoded |= (float_exp << 23) | ((fraction & 0x03ffu) << 13);
    }

    float result;
    std::memcpy(&result, &encoded, sizeof(result));
    return result;
}
#if defined(__AVX2__)
// Some compilers do not provide _mm256_set_m128i.
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
// Add up the eight float lanes of `x` and return the scalar total.
// Reduction order: (upper 128-bit half + lower half), then the upper pair of
// that result onto the lower pair, then lane 1 onto lane 0.
static inline float hsum_float_8(const __m256 x) {
    const __m128 upper_half = _mm256_extractf128_ps(x, 1);
    const __m128 lower_half = _mm256_castps256_ps128(x);
    const __m128 four = _mm_add_ps(upper_half, lower_half);
    const __m128 two = _mm_add_ps(four, _mm_movehl_ps(four, four));
    const __m128 one = _mm_add_ss(two, _mm_movehdup_ps(two));
    return _mm_cvtss_f32(one);
}
// Return the 16-byte shuffle control number `i` (valid range 0..7): eight copies
// of byte index 2*i followed by eight copies of byte index 2*i + 1. Used with
// _mm_shuffle_epi8 it broadcasts two neighbouring scale bytes over 8 lanes each.
static inline __m128i get_scale_shuffle(int i) {
    // One row per control; rows are contiguous, so the memory layout is a flat
    // 128-byte table.
    static const std::uint8_t k_controls[8][16] = {
        { 0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  1},
        { 2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3},
        { 4,  4,  4,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  5,  5,  5},
        { 6,  6,  6,  6,  6,  6,  6,  6,  7,  7,  7,  7,  7,  7,  7,  7},
        { 8,  8,  8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9},
        {10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11},
        {12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13},
        {14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15},
    };
    const std::uint8_t * row = k_controls[i];
    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(row));
}
#endif
static float scalar_dot(int n, const block_q6_K * x, const block_q8_K * y) {
const int nb = n / QK_K;
float sums[8] = {};
for (int i = 0; i < nb; ++i) {
std::int8_t aux8[QK_K];
int out = 0;
int ql_offset = 0;
int qh_offset = 0;
for (int j = 0; j < QK_K; j += 128) {
(void) j;
for (int lane = 0; lane < 32; ++lane) {
aux8[out + lane] = std::int8_t(
(x[i].ql[ql_offset + lane] & 0x0f)
| (((x[i].qh[qh_offset + lane] >> 0) & 3) << 4)
) - 32;
aux8[out + 32 + lane] = std::int8_t(
(x[i].ql[ql_offset + 32 + lane] & 0x0f)
| (((x[i].qh[qh_offset + lane] >> 2) & 3) << 4)
) - 32;
aux8[out + 64 + lane] = std::int8_t(
(x[i].ql[ql_offset + lane] >> 4)
| (((x[i].qh[qh_offset + lane] >> 4) & 3) << 4)
) - 32;
aux8[out + 96 + lane] = std::int8_t(
(x[i].ql[ql_offset + 32 + lane] >> 4)
| (((x[i].qh[qh_offset + lane] >> 6) & 3) << 4)
) - 32;
}
out += 128;
ql_offset += 64;
qh_offset += 32;
}
std::int32_t aux32[8] = {};
int q8_offset = 0;
int aux_offset = 0;
for (int group = 0; group < QK_K / 16; ++group) {
const int scale = x[i].scales[group];
for (int half = 0; half < 2; ++half) {
(void) half;
for (int lane = 0; lane < 8; ++lane) {
aux32[lane] += scale
* int(y[i].qs[q8_offset + lane])
* int(aux8[aux_offset + lane]);
}
q8_offset += 8;
aux_offset += 8;
}
}
const float d = fp16_to_fp32(x[i].d) * y[i].d;
for (int lane = 0; lane < 8; ++lane) {
sums[lane] += d * float(aux32[lane]);
}
}
float sum = 0.0f;
for (float value : sums) {
sum += value;
}
return sum;
}
// Dot product of a row of Q6_K blocks with a row of Q8_K blocks.
//
//   n    number of values; asserted to be a multiple of QK_K
//   s    receives the single float result
//   vx   points at n / QK_K block_q6_K structs
//   vy   points at n / QK_K block_q8_K structs
//   nrc  asserted to be 1
//   bs, bx, by  accepted but never read
//
// With AVX2 enabled at compile time the vector kernel below is used, otherwise
// scalar_dot. The vector kernel multiplies the *unsigned* 6-bit weights and
// removes their +32 bias afterwards using block_q8_K::bsums.
// NOTE(review): that only agrees with scalar_dot if bsums[g] equals the sum of
// the 16 qs values of group g - presumably guaranteed by whoever fills
// block_q8_K; confirm against the caller.
extern "C" void ggml_vec_dot_q6_K_q8_K(
    int n,
    float * s,
    std::size_t bs,
    const void * vx,
    std::size_t bx,
    const void * vy,
    std::size_t by,
    int nrc
) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    (void) bs;
    (void) bx;
    (void) by;
    (void) nrc;

    const auto * x = static_cast<const block_q6_K *>(vx);
    const auto * y = static_cast<const block_q8_K *>(vy);

#if defined(__AVX2__)
    const int block_count = n / QK_K;

    // Byte masks: the low nibble of ql, and the four 2-bit fields of qh.
    const __m256i low_nibble = _mm256_set1_epi8(15);
    const __m256i hi_field_0 = _mm256_set1_epi8(3);
    const __m256i hi_field_1 = _mm256_set1_epi8(12);
    const __m256i hi_field_2 = _mm256_set1_epi8(48);
    const __m256i hi_field_3 = _mm256_set1_epi8(-64);  // bit pattern 0xC0

    __m256 total = _mm256_setzero_ps();

    for (int b = 0; b < block_count; ++b) {
        const block_q6_K & weights = x[b];
        const block_q8_K & activations = y[b];

        const __m128i scales8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(weights.scales));

        // Bias term: 32 * (scale * bsum), pairs of groups folded into each
        // 32-bit lane. Lanes do not line up with `dot`, which is fine because
        // only the horizontal sum of the accumulator is ever used.
        const __m256i group_sums = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(activations.bsums));
        const __m256i scales16 = _mm256_cvtepi8_epi16(scales8);
        const __m256i bias = _mm256_slli_epi32(_mm256_madd_epi16(group_sums, scales16), 5);

        __m256i dot = _mm256_setzero_si256();

        // Each pass handles 128 values: 64 bytes of ql, 32 of qh, 128 of qs
        // and 8 scales (four shuffle controls).
        for (int half = 0; half < QK_K / 128; ++half) {
            const std::uint8_t * lo = weights.ql + 64 * half;
            const std::uint8_t * hi = weights.qh + 32 * half;
            const std::int8_t * q8 = activations.qs + 128 * half;
            const int is = 4 * half;

            const __m256i lo_a = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(lo));
            const __m256i lo_b = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(lo + 32));
            const __m256i hi_bits = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(hi));

            // Move each 2-bit field of qh to bit positions 4..5. Masking before
            // the 16-bit shifts keeps bits from leaking between bytes.
            const __m256i top_0 = _mm256_slli_epi16(_mm256_and_si256(hi_bits, hi_field_0), 4);
            const __m256i top_1 = _mm256_slli_epi16(_mm256_and_si256(hi_bits, hi_field_1), 2);
            const __m256i top_2 = _mm256_and_si256(hi_bits, hi_field_2);
            const __m256i top_3 = _mm256_srli_epi16(_mm256_and_si256(hi_bits, hi_field_3), 2);

            // Unsigned 6-bit weights in 0..63, 32 per vector.
            const __m256i w_0 = _mm256_or_si256(_mm256_and_si256(lo_a, low_nibble), top_0);
            const __m256i w_1 = _mm256_or_si256(_mm256_and_si256(lo_b, low_nibble), top_1);
            const __m256i w_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(lo_a, 4), low_nibble), top_2);
            const __m256i w_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(lo_b, 4), low_nibble), top_3);

            const __m256i a_0 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(q8));
            const __m256i a_1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(q8 + 32));
            const __m256i a_2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(q8 + 64));
            const __m256i a_3 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(q8 + 96));

            // unsigned x signed byte products, adjacent pairs summed to int16
            const __m256i pair_0 = _mm256_maddubs_epi16(w_0, a_0);
            const __m256i pair_1 = _mm256_maddubs_epi16(w_1, a_1);
            const __m256i pair_2 = _mm256_maddubs_epi16(w_2, a_2);
            const __m256i pair_3 = _mm256_maddubs_epi16(w_3, a_3);

            // Broadcast the matching two scales over 8 int16 lanes each.
            const __m256i s_0 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales8, get_scale_shuffle(is + 0)));
            const __m256i s_1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales8, get_scale_shuffle(is + 1)));
            const __m256i s_2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales8, get_scale_shuffle(is + 2)));
            const __m256i s_3 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales8, get_scale_shuffle(is + 3)));

            const __m256i scaled_01 = _mm256_add_epi32(_mm256_madd_epi16(s_0, pair_0),
                                                       _mm256_madd_epi16(s_1, pair_1));
            const __m256i scaled_23 = _mm256_add_epi32(_mm256_madd_epi16(s_2, pair_2),
                                                       _mm256_madd_epi16(s_3, pair_3));
            dot = _mm256_add_epi32(dot, scaled_01);
            dot = _mm256_add_epi32(dot, scaled_23);
        }

        const float d = activations.d * fp16_to_fp32(weights.d);
        const __m256i unbiased = _mm256_sub_epi32(dot, bias);
        total = _mm256_add_ps(total, _mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(unbiased)));
    }

    *s = hsum_float_8(total);
#else
    *s = scalar_dot(n, x, y);
#endif
}
02
jobs
Systems
02 jobs
03
counters
Performance counters
31 counters
cycles: 25,475,774
Show more
branch_instructions: 2,690,249
branch_misses: 22,308
cycle_activity.stalls_l1d_miss: 6,941,712
cycle_activity.stalls_l2_miss: 5,757,246
cycle_activity.stalls_l3_miss: 5,222,272
cycle_activity.stalls_total: 8,641,349
dtlb_load_misses.walk_completed: 1,207
exe_activity.bound_on_loads: 7,736,966
exe_activity.bound_on_stores: 58,488
instructions: 44,758,806
machine_clears: 1,759
mem_inst_retired.split_loads: 2,297,820
mem_load_retired.l1_miss: 912,233
mem_load_retired.l2_miss: 254,299
mem_load_retired.l3_miss: 226,290
tma_backend_bound: 90,198,294
tma_bad_speculation: 3,127,852
tma_branch_mispredict_slots: 2,797,773
tma_frontend_bound: 12,431,502
tma_memory_bound: 39,967,092
tma_retiring: 47,510,795
tma_slots: 153,239,898
uops_dispatched.port_0: 9,171,388
uops_dispatched.port_1: 9,647,745
uops_dispatched.port_2_3_10: 8,775,540
uops_dispatched.port_4_9: 1,700,114
uops_dispatched.port_5_11: 14,143,803
uops_dispatched.port_6: 5,020,691
uops_dispatched.port_7_8: 1,582,016
uops_retired.ms: 0
04
top down
Top-down analysis
Raptor Cove P-core
05
profile
load profile
03
counters
Performance counters
26 counters
cycles: 31,579,231
Show more
branch_instructions: 2,689,912
branch_misses: 22,408
dtlb_load_misses.walk_completed: 2,948
instructions: 44,763,603
mem_bound_stalls.load_dram_hit: 7,043,204
mem_bound_stalls.load_l2_hit: 4,962,684
mem_bound_stalls.load_llc_hit: 168,680
mem_inst_retired.split_loads: 2,049,826
mem_load_retired.l1_miss: 4,776,324
mem_load_retired.l2_miss: 175,024
mem_load_retired.l3_miss: 159,568
tma_backend_bound: 76,409,297
tma_backend_bound_alloc_restrictions: 383,044
tma_backend_bound_non_memory_scheduler: 57,983,220
tma_backend_bound_register: 7,037,395
tma_backend_bound_reorder_buffer: 8,077,840
tma_backend_bound_serialization: 1,863,883
tma_bad_speculation: 5,145,236
tma_bad_speculation_branch_mispredict: 5,028,973
tma_bad_speculation_machine_clears: 116,263
tma_frontend_bandwidth: 4,211,558
tma_frontend_bound: 6,774,238
tma_frontend_latency: 2,562,680
tma_memory_bound: 524,360
tma_retiring: 69,814,753
04
top down
Top-down analysis
Gracemont E-core
05
profile
load profile