OFFELLIA_Quantis / quants.py
Brunobkr's picture
Upload 7 files
d3bb43b verified
# BRUNO BECKER / OFFELLIA 2026
# brunoconta1980@gmail.com
# brunoconta1980@hotmail.com
# X @Brunoxuser
# Fnord
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, Callable, Sequence
from math import log2, ceil
from dataclasses import dataclass, field
from numpy.typing import DTypeLike
from .constants import GGML_QUANT_SIZES, GGMLQuantizationType, QK_K
from .lazy import LazyNumpyTensor
import numpy as np
from functools import lru_cache
# ====================== FUNÇÕES AUXILIARES ======================
def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]:
block_size, type_size = GGML_QUANT_SIZES[quant_type]
if shape[-1] % block_size != 0:
raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
return (*shape[:-1], shape[-1] // block_size * type_size)
def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]:
block_size, type_size = GGML_QUANT_SIZES[quant_type]
if shape[-1] % type_size != 0:
raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
return (*shape[:-1], shape[-1] // type_size * block_size)
def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
rows = arr.reshape((-1, arr.shape[-1]))
osize = 1
for dim in oshape:
osize *= dim
out = np.empty(shape=osize, dtype=otype)
n_groups = (rows.shape[0] // 16) or 1
np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)
return out.reshape(oshape)
def np_roundf(n: np.ndarray) -> np.ndarray:
a = abs(n)
floored = np.floor(a)
b = floored + np.floor(2 * (a - floored))
return np.sign(n) * b
class QuantError(Exception): ...
_type_traits: dict[GGMLQuantizationType, type[__Quant]] = {}
def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
if qtype == GGMLQuantizationType.F32:
return data.astype(np.float32, copy=False)
elif qtype == GGMLQuantizationType.F16:
return data.astype(np.float16, copy=False)
elif (q := _type_traits.get(qtype)) is not None:
return q.quantize(data)
else:
raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented")
def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
if qtype == GGMLQuantizationType.F32:
return data.view(np.float32)
elif qtype == GGMLQuantizationType.F16:
return data.view(np.float16).astype(np.float32)
elif (q := _type_traits.get(qtype)) is not None:
return q.dequantize(data)
else:
raise NotImplementedError(f"Dequantization for {qtype.name} is not yet implemented")
# ====================== CLASSE BASE ======================
class __Quant(ABC):
qtype: GGMLQuantizationType
block_size: int
type_size: int
grid: np.ndarray[Any, np.dtype[np.float32]] | None = None
grid_shape: tuple[int, int] = (0, 0)
grid_map: tuple[int | float, ...] = ()
grid_hex: bytes | None = None
def __init__(self):
raise TypeError("Quant conversion classes can't have instances")
def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None:
cls.qtype = qtype
cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype]
cls.__quantize_lazy = LazyNumpyTensor._wrap_fn(
cls.__quantize_array,
meta_noop=(np.uint8, cls.__shape_to_bytes)
)
cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn(
cls.__dequantize_array,
meta_noop=(np.float32, cls.__shape_from_bytes)
)
assert qtype not in _type_traits
_type_traits[qtype] = cls
@classmethod
def init_grid(cls):
if cls.grid is not None or cls.grid_hex is None:
return
bits_per_elem = ceil(log2(len(cls.grid_map)))
assert bits_per_elem != 0, cls.qtype.name
elems_per_byte = 8 // bits_per_elem
grid = np.frombuffer(cls.grid_hex, dtype=np.uint8)
grid = grid.reshape((-1, 2))
grid = (np.where(grid > 0x40, grid + 9, grid) & 0x0F) << np.array([4, 0], dtype=np.uint8).reshape((1, 2))
grid = grid[..., 0] | grid[..., 1]
grid = grid.reshape((-1, 1)) >> np.array([i for i in range(0, 8, 8 // elems_per_byte)], dtype=np.uint8).reshape((1, elems_per_byte))
grid = (grid & ((1 << bits_per_elem) - 1)).reshape((-1, 1))
grid_map = np.array(cls.grid_map, dtype=np.float32).reshape((1, -1))
grid = np.take_along_axis(grid_map, grid, axis=-1)
cls.grid = grid.reshape((1, 1, *cls.grid_shape))
@classmethod
@abstractmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
raise NotImplementedError
@classmethod
def quantize_rows(cls, rows: np.ndarray) -> np.ndarray:
rows = rows.astype(np.float32, copy=False)
shape = rows.shape
n_blocks = rows.size // cls.block_size
blocks = rows.reshape((n_blocks, cls.block_size))
zeta_core = HelicoidalZetaCore()
for i in range(n_blocks):
if i == 0:
print(f"\n[AUDITORIA] OFFELLIA ATIVA - Bloco 0 ANTES: {blocks[i][0]:.15f}", flush=True)
blocks[i] = zeta_core.transform(blocks[i], n_val=i+1)
if i == 0:
print(f"[AUDITORIA] SUCESSO - Bloco 0 DEPOIS: {blocks[i][0]:.15f}\n", flush=True)
if i % 42 == 0:
print(f" > Offellia processando tensores: {i}/{n_blocks}...", end='\r', flush=True)
blocks = cls.quantize_blocks(blocks)
assert blocks.dtype == np.uint8
assert blocks.shape[-1] == cls.type_size
return blocks.reshape(cls.__shape_to_bytes(shape))
@classmethod
def dequantize_rows(cls, rows: np.ndarray) -> np.ndarray:
shape = rows.shape
n_blocks = rows.size // cls.type_size
blocks = rows.reshape((n_blocks, cls.type_size))
# 1. Dequantização padrão do GGML
dequant_blocks = cls.dequantize_blocks(blocks)
# 2. Aplica inverse_transform OFFELLIA (essencial!)
zeta_core = HelicoidalZetaCore()
for i in range(n_blocks):
dequant_blocks[i] = zeta_core.inverse_transform(dequant_blocks[i], n_val=i+1)
return dequant_blocks.reshape(cls.__shape_from_bytes(shape))
@classmethod
def __shape_to_bytes(cls, shape: Sequence[int]):
return quant_shape_to_byte_shape(shape, cls.qtype)
@classmethod
def __shape_from_bytes(cls, shape: Sequence[int]):
return quant_shape_from_byte_shape(shape, cls.qtype)
@classmethod
def __quantize_array(cls, array: np.ndarray) -> np.ndarray:
return _apply_over_grouped_rows(cls.quantize_rows, arr=array, otype=np.uint8, oshape=cls.__shape_to_bytes(array.shape))
@classmethod
def __dequantize_array(cls, array: np.ndarray) -> np.ndarray:
cls.init_grid()
return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape))
@classmethod
def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool:
return tensor.shape[-1] % cls.block_size == 0
@classmethod
def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
if not cls.can_quantize(tensor):
raise QuantError(f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}")
return cls.__quantize_lazy(tensor) if isinstance(tensor, LazyNumpyTensor) else cls.__quantize_array(tensor)
@classmethod
def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
return cls.__dequantize_lazy(tensor) if isinstance(tensor, LazyNumpyTensor) else cls.__dequantize_array(tensor)
# ====================== CLASSES DE QUANTIZAÇÃO (mantidas iguais) ======================
class BF16(__Quant, qtype=GGMLQuantizationType.BF16):
@classmethod
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n = blocks.view(np.uint32)
# force nan to quiet
n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n)
# round to nearest even
n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16
return n.astype(np.uint16).view(np.uint8)
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32)
class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0):
@classmethod
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
imax = abs(blocks).argmax(axis=-1, keepdims=True)
max = np.take_along_axis(blocks, imax, axis=-1)
d = max / -8
with np.errstate(divide="ignore"):
id = np.where(d == 0, 0, 1 / d)
qs = np.trunc((blocks * id) + np.float32(8.5), dtype=np.float32).astype(np.uint8).clip(0, 15)
qs = qs.reshape((n_blocks, 2, cls.block_size // 2))
qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4))
d = d.astype(np.float16).view(np.uint8)
return np.concatenate([d, qs], axis=-1)
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d, qs = np.hsplit(blocks, [2])
d = d.view(np.float16).astype(np.float32)
qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.int8) - np.int8(8)
return (d * qs.astype(np.float32))
class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1):
@classmethod
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
max = blocks.max(axis=-1, keepdims=True)
min = blocks.min(axis=-1, keepdims=True)
d = (max - min) / 15
with np.errstate(divide="ignore"):
id = np.where(d == 0, 0, 1 / d)
qs = np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32).astype(np.uint8).clip(0, 15)
qs = qs.reshape((n_blocks, 2, cls.block_size // 2))
qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4))
d = d.astype(np.float16).view(np.uint8)
m = min.astype(np.float16).view(np.uint8)
return np.concatenate([d, m, qs], axis=-1)
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d, rest = np.hsplit(blocks, [2])
m, qs = np.hsplit(rest, [2])
d = d.view(np.float16).astype(np.float32)
m = m.view(np.float16).astype(np.float32)
qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.float32)
return (d * qs) + m
class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0):
@classmethod
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
imax = abs(blocks).argmax(axis=-1, keepdims=True)
max = np.take_along_axis(blocks, imax, axis=-1)
d = max / -16
with np.errstate(divide="ignore"):
id = np.where(d == 0, 0, 1 / d)
q = np.trunc((blocks * id) + np.float32(16.5), dtype=np.float32).astype(np.uint8).clip(0, 31)
qs = q.reshape((n_blocks, 2, cls.block_size // 2))
qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4))
qh = np.packbits(q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little").reshape(n_blocks, 4)
d = d.astype(np.float16).view(np.uint8)
return np.concatenate([d, qh, qs], axis=-1)
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d, rest = np.hsplit(blocks, [2])
qh, qs = np.hsplit(rest, [4])
d = d.view(np.float16).astype(np.float32)
qh = qh.view(np.uint32)
qh = qh.reshape((n_blocks, 1)) >> np.array([i for i in range(32)], dtype=np.uint32).reshape((1, 32))
ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
qh = (qh & np.uint32(0x01)).astype(np.uint8)
ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1))
qs = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(16)
return (d * qs.astype(np.float32))
class Q5_1(__Quant, qtype=GGMLQuantizationType.Q5_1):
@classmethod
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
max = blocks.max(axis=-1, keepdims=True)
min = blocks.min(axis=-1, keepdims=True)
d = (max - min) / 31
with np.errstate(divide="ignore"):
id = np.where(d == 0, 0, 1 / d)
q = np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32).astype(np.uint8).clip(0, 31)
qs = q.reshape((n_blocks, 2, cls.block_size // 2))
qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4))
qh = np.packbits(q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little").reshape(n_blocks, 4)
d = d.astype(np.float16).view(np.uint8)
m = min.astype(np.float16).view(np.uint8)
return np.concatenate([d, m, qh, qs], axis=-1)
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d, rest = np.hsplit(blocks, [2])
m, rest = np.hsplit(rest, [2])
qh, qs = np.hsplit(rest, [4])
d = d.view(np.float16).astype(np.float32)
m = m.view(np.float16).astype(np.float32)
qh = qh.view(np.uint32)
qh = qh.reshape((n_blocks, 1)) >> np.array([i for i in range(32)], dtype=np.uint32).reshape((1, 32))
ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
qh = (qh & np.uint32(0x01)).astype(np.uint8)
ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1))
qs = (ql | (qh << np.uint8(4))).astype(np.float32)
return (d * qs) + m
class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0):
@classmethod
# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
d = abs(blocks).max(axis=1, keepdims=True) / 127
with np.errstate(divide="ignore"):
id = np.where(d == 0, 0, 1 / d)
qs = np_roundf(blocks * id)
# (n_blocks, 2)
d = d.astype(np.float16).view(np.uint8)
# (n_blocks, block_size)
qs = qs.astype(np.int8).view(np.uint8)
return np.concatenate([d, qs], axis=1)
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
d, x = np.split(blocks, [2], axis=1)
d = d.view(np.float16).astype(np.float32)
x = x.view(np.int8).astype(np.float32)
return (x * d)
class Q2_K(__Quant, qtype=GGMLQuantizationType.Q2_K):
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
scales, rest = np.hsplit(blocks, [QK_K // 16])
qs, rest = np.hsplit(rest, [QK_K // 4])
d, dmin = np.hsplit(rest, [2])
d = d.view(np.float16).astype(np.float32)
dmin = dmin.view(np.float16).astype(np.float32)
# (n_blocks, 16, 1)
dl = (d * (scales & 0xF).astype(np.float32)).reshape((n_blocks, QK_K // 16, 1))
ml = (dmin * (scales >> 4).astype(np.float32)).reshape((n_blocks, QK_K // 16, 1))
shift = np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
qs = (qs.reshape((n_blocks, -1, 1, 32)) >> shift) & np.uint8(3)
qs = qs.reshape((n_blocks, QK_K // 16, 16)).astype(np.float32)
qs = dl * qs - ml
return qs.reshape((n_blocks, -1))
class Q3_K(__Quant, qtype=GGMLQuantizationType.Q3_K):
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
hmask, rest = np.hsplit(blocks, [QK_K // 8])
qs, rest = np.hsplit(rest, [QK_K // 4])
scales, d = np.hsplit(rest, [12])
d = d.view(np.float16).astype(np.float32)
# The scales are packed at 6-bit each in this pattern:
# 0: IIIIAAAA
# 1: JJJJBBBB
# 2: KKKKCCCC
# 3: LLLLDDDD
# 4: MMMMEEEE
# 5: NNNNFFFF
# 6: OOOOGGGG
# 7: PPPPHHHH
# 8: MMIIEEAA
# 9: NNJJFFBB
# 10: OOKKGGCC
# 11: PPLLHHDD
lscales, hscales = np.hsplit(scales, [8])
lscales = lscales.reshape((n_blocks, 1, 8)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 2, 1))
lscales = lscales.reshape((n_blocks, 16))
hscales = hscales.reshape((n_blocks, 1, 4)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 4, 1))
hscales = hscales.reshape((n_blocks, 16))
scales = (lscales & np.uint8(0x0F)) | ((hscales & np.uint8(0x03)) << np.uint8(4))
scales = (scales.astype(np.int8) - np.int8(32)).astype(np.float32)
dl = (d * scales).reshape((n_blocks, 16, 1))
ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
qh = hmask.reshape(n_blocks, -1, 1, 32) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8, 1))
ql = ql.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(3)
qh = (qh.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(1))
qh = qh ^ np.uint8(1) # strangely, the offset is zero when the bitmask is 1
q = (ql.astype(np.int8) - (qh << np.uint8(2)).astype(np.int8)).astype(np.float32)
return (dl * q).reshape((n_blocks, QK_K))
class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K):
K_SCALE_SIZE = 12
@staticmethod
def get_scale_min(scales: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
n_blocks = scales.shape[0]
scales = scales.view(np.uint8)
### Unpacking the following: ###
# 0 EEAAAAAA
# 1 FFBBBBBB
# 2 GGCCCCCC
# 3 HHDDDDDD
# 4 eeaaaaaa
# 5 ffbbbbbb
# 6 ggcccccc
# 7 hhdddddd
# 8 eeeeEEEE
# 9 ffffFFFF
# 10 ggggGGGG
# 11 hhhhHHHH
scales = scales.reshape((n_blocks, 3, 4))
d, m, m_d = np.split(scales, 3, axis=-2)
sc = np.concatenate([d & 0x3F, (m_d & 0x0F) | ((d >> 2) & 0x30)], axis=-1)
min = np.concatenate([m & 0x3F, (m_d >> 4) | ((m >> 2) & 0x30)], axis=-1)
return (sc.reshape((n_blocks, 8)), min.reshape((n_blocks, 8)))
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d, rest = np.hsplit(blocks, [2])
dmin, rest = np.hsplit(rest, [2])
scales, qs = np.hsplit(rest, [cls.K_SCALE_SIZE])
d = d.view(np.float16).astype(np.float32)
dmin = dmin.view(np.float16).astype(np.float32)
sc, m = Q4_K.get_scale_min(scales)
d = (d * sc.astype(np.float32)).reshape((n_blocks, -1, 1))
dm = (dmin * m.astype(np.float32)).reshape((n_blocks, -1, 1))
qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1, 32)).astype(np.float32)
return (d * qs - dm).reshape((n_blocks, QK_K))
class Q5_K(__Quant, qtype=GGMLQuantizationType.Q5_K):
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d, rest = np.hsplit(blocks, [2])
dmin, rest = np.hsplit(rest, [2])
scales, rest = np.hsplit(rest, [Q4_K.K_SCALE_SIZE])
qh, qs = np.hsplit(rest, [QK_K // 8])
d = d.view(np.float16).astype(np.float32)
dmin = dmin.view(np.float16).astype(np.float32)
sc, m = Q4_K.get_scale_min(scales)
d = (d * sc.astype(np.float32)).reshape((n_blocks, -1, 1))
dm = (dmin * m.astype(np.float32)).reshape((n_blocks, -1, 1))
ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8, 1))
ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1, 32))
qh = (qh & np.uint8(0x01)).reshape((n_blocks, -1, 32))
q = (ql | (qh << np.uint8(4))).astype(np.float32)
return (d * q - dm).reshape((n_blocks, QK_K))
class Q6_K(__Quant, qtype=GGMLQuantizationType.Q6_K):
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
ql, rest = np.hsplit(blocks, [QK_K // 2])
qh, rest = np.hsplit(rest, [QK_K // 4])
scales, d = np.hsplit(rest, [QK_K // 16])
scales = scales.view(np.int8).astype(np.float32)
d = d.view(np.float16).astype(np.float32)
d = (d * scales).reshape((n_blocks, QK_K // 16, 1))
ql = ql.reshape((n_blocks, -1, 1, 64)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1, 32))
qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
qh = (qh & np.uint8(0x03)).reshape((n_blocks, -1, 32))
q = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(32)
q = q.reshape((n_blocks, QK_K // 16, -1)).astype(np.float32)
return (d * q).reshape((n_blocks, QK_K))
class TQ1_0(__Quant, qtype=GGMLQuantizationType.TQ1_0):
@classmethod
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d = abs(blocks).max(axis=-1, keepdims=True)
with np.errstate(divide="ignore"):
id = np.where(d == 0, 0, 1 / d)
qs = np_roundf(blocks * id)
qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8)
qs0, qs1, qh = qs[..., :(32 * 5)], qs[..., (32 * 5):(48 * 5)], qs[..., (48 * 5):]
qs0 = qs0.reshape((n_blocks, -1, 5, 32)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1))
qs0 = np.sum(qs0, axis=-2).reshape((n_blocks, -1))
qs1 = qs1.reshape((n_blocks, -1, 5, 16)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1))
qs1 = np.sum(qs1, axis=-2).reshape((n_blocks, -1))
qh = qh.reshape((n_blocks, -1, 4, 4)) * np.array([81, 27, 9, 3], dtype=np.uint8).reshape((1, 1, 4, 1))
qh = np.sum(qh, axis=-2).reshape((n_blocks, -1))
qs = np.concatenate([qs0, qs1, qh], axis=-1)
qs = (qs.astype(np.uint16) * 256 + (243 - 1)) // 243
qs = qs.astype(np.uint8)
d = d.astype(np.float16).view(np.uint8)
return np.concatenate([qs, d], axis=-1)
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
qs, rest = np.hsplit(blocks, [(QK_K - 4 * QK_K // 64) // 5])
qh, d = np.hsplit(rest, [QK_K // 64])
d = d.view(np.float16).astype(np.float32)
qs0, qs1 = qs[..., :32], qs[..., 32:]
qs0 = qs0.reshape((n_blocks, -1, 1, 32)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1))
qs0 = qs0.reshape((n_blocks, -1))
qs1 = qs1.reshape((n_blocks, -1, 1, 16)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1))
qs1 = qs1.reshape((n_blocks, -1))
qh = qh.reshape((n_blocks, -1, 1, 4)) * np.array([1, 3, 9, 27], dtype=np.uint8).reshape((1, 1, 4, 1))
qh = qh.reshape((n_blocks, -1))
qs = np.concatenate([qs0, qs1, qh], axis=-1)
qs = ((qs.astype(np.uint16) * 3) >> 8).astype(np.int8) - np.int8(1)
return (d * qs.astype(np.float32))
class TQ2_0(__Quant, qtype=GGMLQuantizationType.TQ2_0):
@classmethod
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d = abs(blocks).max(axis=-1, keepdims=True)
with np.errstate(divide="ignore"):
id = np.where(d == 0, 0, 1 / d)
qs = np_roundf(blocks * id)
qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8)
qs = qs.reshape((n_blocks, -1, 4, 32)) << np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
qs = qs[..., 0, :] | qs[..., 1, :] | qs[..., 2, :] | qs[..., 3, :]
qs = qs.reshape((n_blocks, -1))
d = d.astype(np.float16).view(np.uint8)
return np.concatenate([qs, d], axis=-1)
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
qs, d = np.hsplit(blocks, [QK_K // 4])
d = d.view(np.float16).astype(np.float32)
qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
qs = (qs & 0x03).reshape((n_blocks, -1)).astype(np.int8) - np.int8(1)
return (d * qs.astype(np.float32))
class MXFP4(__Quant, qtype=GGMLQuantizationType.MXFP4):
# e2m1 values (doubled)
# ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
kvalues = (0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12)
@staticmethod
# see ggml_e8m0_to_fp32_half in ggml-impl.h
def e8m0_to_fp32_half(x: np.ndarray) -> np.ndarray:
bits = np.where(x < 2, np.uint32(0x00200000) << np.uint32(x), np.uint32(x - 1) << np.uint32(23))
return bits.view(np.float32)
@classmethod
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d = abs(blocks).max(axis=-1, keepdims=True)
with np.errstate(divide="ignore"):
e = np.where(d > 0, np.floor(np.log2(d)) - 2 + 127, 0).astype(np.uint8)
d = cls.e8m0_to_fp32_half(e)
kvalues = np.array(cls.kvalues, dtype=np.int8).reshape((1, 1, 16))
errs = np.abs(d.reshape((n_blocks, 1, 1)) * kvalues.astype(np.float32) - blocks.reshape((n_blocks, cls.block_size, 1)))
best = np.argmin(errs, axis=-1, keepdims=True)
qs = best.reshape(n_blocks, 2, cls.block_size // 2).astype(np.uint8)
qs = qs[:, 0] | (qs[:, 1] << np.uint8(4))
qs = qs.reshape((n_blocks, cls.block_size // 2))
return np.concatenate([e, qs], axis=-1)
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
e, qs = np.hsplit(blocks, [1])
d = cls.e8m0_to_fp32_half(e)
qs = qs.reshape((n_blocks, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 2, 1))
qs = (qs & np.uint8(0x0F)).view(np.int8)
kvalues = np.array(cls.kvalues, dtype=np.int8).reshape(1, 1, 16)
qs = np.take_along_axis(kvalues, qs, axis=-1).reshape((n_blocks, cls.block_size))
return (d * qs.astype(np.float32))
class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS):
ksigns: bytes = (
b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f"
b"\x90\x11\x12\x93\x14\x95\x96\x17\x18\x99\x9a\x1b\x9c\x1d\x1e\x9f"
b"\xa0\x21\x22\xa3\x24\xa5\xa6\x27\x28\xa9\xaa\x2b\xac\x2d\x2e\xaf"
b"\x30\xb1\xb2\x33\xb4\x35\x36\xb7\xb8\x39\x3a\xbb\x3c\xbd\xbe\x3f"
b"\xc0\x41\x42\xc3\x44\xc5\xc6\x47\x48\xc9\xca\x4b\xcc\x4d\x4e\xcf"
b"\x50\xd1\xd2\x53\xd4\x55\x56\xd7\xd8\x59\x5a\xdb\x5c\xdd\xde\x5f"
b"\x60\xe1\xe2\x63\xe4\x65\x66\xe7\xe8\x69\x6a\xeb\x6c\xed\xee\x6f"
b"\xf0\x71\x72\xf3\x74\xf5\xf6\x77\x78\xf9\xfa\x7b\xfc\x7d\x7e\xff"
)
# iq2xxs_grid, but with each byte of the original packed in 2 bits,
# by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2.
grid_shape = (256, 8)
grid_map = (0x08, 0x19, 0x2b)
grid_hex = (
b"00000200050008000a00110014002000220028002a0041004400500058006100"
b"6400800082008a00a20001010401100115014001840198010002020222028202"
b"010404041004210424044004420448046004810484049004a404000502050805"
b"200546056905800591050906100640068406a406000805080808140828084108"
b"440850085208880804094009020a140a01100410101021104010601084109010"
b"951000110811201150115a118011241245120014081420142514491480141815"
b"6215001616160118041810184018811800190519a019511a002002200a204420"
b"6120802082202921482100220222012404241024402456240025412564259026"
b"082820289428442a014004401040184021402440404048405640604081408440"
b"9040004120416141804185410142104248425642684200440844204480449944"
b"124524450046014804481048404845480049584961498249454a904a00500850"
b"1150195020508050885004514251a4519152905492540a550156545600581158"
b"195864584059085a046010604060686000615561186260620064056410651265"
b"84654268008002800a8041808280048118814081118201840484108415844084"
b"608400854685948509864086608602880489118a0490109024904090a1901691"
b"8091459200942294449451958198209902a050a085a009a100a218a450a804a9"
)
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d, qs = np.hsplit(blocks, [2])
d = d.view(np.float16).astype(np.float32)
qs = qs.view(np.uint32).reshape(n_blocks, -1, 2)
db = d * (np.float32(0.5) + (qs[..., 1] >> 28).astype(np.float32)) * np.float32(0.25)
db = db.reshape((n_blocks, -1, 1, 1))
# get the sign indices and unpack the bits
signs = qs[..., 1].reshape((n_blocks, -1, 1)) >> np.array([0, 7, 14, 21], dtype=np.uint32).reshape((1, 1, 4))
ksigns = np.frombuffer(cls.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128))
signs = (signs & np.uint32(0x7F)).reshape((n_blocks, -1, 4, 1))
signs = np.take_along_axis(ksigns, signs, axis=-1)
signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 1, 8))
signs = signs & np.uint8(0x01)
signs = np.where(signs == 0, np.float32(1), np.float32(-1))
signs = signs.reshape((n_blocks, -1, 4, 8))
assert cls.grid is not None
grid = np.take_along_axis(cls.grid, qs[..., 0].copy().view(np.uint8).reshape((n_blocks, -1, 1, 1)), axis=-2)
grid = grid.reshape((n_blocks, -1, 4, 8))
return (db * grid * signs).reshape((n_blocks, -1))
class IQ2_XS(__Quant, qtype=GGMLQuantizationType.IQ2_XS):
# iq2xs_grid, but with each byte of the original packed in 2 bits,
# by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2.
grid_shape = (512, 8)
grid_map = (0x08, 0x19, 0x2b)
grid_hex = (
b"00000200050008000a0011001400160019002000220025002800410044004600"
b"49005000520055005800610064008000820085008800910094009900a0000101"
b"04010601090110011201150118011a0121012401400142014501480151015401"
b"6001680181018401900100020202050208021102140220024102440250025502"
b"80028a0201040404060409041004120415041804210424044004420445044804"
b"5104540456046004810484049004000502050505080511051405200541054405"
b"500561058005010604061006260640064206840600080208050808080a081108"
b"14082008250841084408500858088008a008aa08010904091009400981098909"
b"000a200a280a960aa00a01100410061009101010121015101810211024104010"
b"4210451048105110541060106a10811084109010001102110511081111111411"
b"2011411144115011801194119611011204120612101240126012001402140514"
b"0814111414142014411444144914501464148014011504151015401500161416"
b"49160118041810181218401854188618001905196619511aa91a002002200520"
b"08200a201120142020204120442050208020a020012104211021402148216521"
b"002222228022a82201240424102429244024002541255225992501261a26a626"
b"002808280a28202855288828a22868299029082a202a822a882a8a2a01400440"
b"0640094010401240154018402140244040404240454048404a40514054406040"
b"6540814084409040004102410541084111411441204141414441504180418541"
b"a241014204421042124229424042004402440544084411441444194420444144"
b"4444504480449444014504451045244540459a4500460a464446504601480448"
b"1048404845485448624800491149444950496949044a00500250055008501150"
b"145020502850415044505050805001510451105115514051425100524452aa52"
b"0154045410542154405460548154a154005508558055885521566856a1560058"
b"14584158505899581a5940594259855a0160046010604060546062608660a960"
b"006124624a62926200641664106540654565a46501686a682569066a546a626a"
b"00800280058008801180148020802a8041804480508080808280a880aa800181"
b"0481068110814081518159810082208280828282a082a8820184048410841284"
b"158440846084898400854485a58518866a860088088825885a8880888288a888"
b"0689228a808a888a968aa88a0190049010904090569084900091229164915692"
b"89920094059444945094589429959095929541965198a6984999159a609a00a0"
b"02a008a00aa020a02aa0a0a051a159a1a6a100a202a208a22aa280a2a0a240a4"
b"95a465a698a60aa820a822a828a8a0a8a8a804a984a986a928aa2aaa91aaaaaa"
)
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d, rest = np.hsplit(blocks, [2])
qs, scales = np.hsplit(rest, [2 * QK_K // 8])
d = d.view(np.float16).astype(np.float32)
qs = qs.view(np.uint16)
scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
scales = (scales & 0x0F).reshape((n_blocks, -1))
db = d * (np.float32(0.5) + scales) * np.float32(0.25)
db = db.reshape((n_blocks, -1, 1, 1))
# get the sign indices and unpack the bits
signs = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape(1, 1, 128)
signs = np.take_along_axis(signs, (qs >> 9).reshape((n_blocks, -1, 1)), axis=-1)
signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8))
signs = signs & np.uint8(0x01)
signs = np.where(signs == 0, np.float32(1), np.float32(-1))
signs = signs.reshape((n_blocks, -1, 2, 8))
assert cls.grid is not None
grid = np.take_along_axis(cls.grid, (qs & np.uint16(511)).reshape((n_blocks, -1, 1, 1)), axis=-2)
grid = grid.reshape((n_blocks, -1, 2, 8))
return (db * grid * signs).reshape((n_blocks, -1))
class IQ2_S(__Quant, qtype=GGMLQuantizationType.IQ2_S):
# iq2s_grid, but with each byte of the original packed in 2 bits,
# by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2.
grid_shape = (1024, 8)
grid_map = (0x08, 0x19, 0x2b)
grid_hex = (
b"00000200050008000a0011001400160019002000220025002800410044004600"
b"490050005200550058006100640066006900800082008500880091009400a000"
b"a500aa0001010401060109011001120115011801210124014001420145014801"
b"510154015601590160016501680181018401900192019501a101a40100020202"
b"050208021102140220022a02410244024602490250025502800285028a029402"
b"a202010404040604090410041204150418042104240426042904400442044504"
b"48044a0451045404560459046004620465048104840486048904900495049804"
b"a104a40400050205050508050a05110514051605190520052505280541054405"
b"46054905500552055505580561056405800582058505880591059405a0050106"
b"0406060609061006150640064506480651065406600681068406900600080208"
b"050808081108140816081908200825082a084108440846084908500852085508"
b"580861086408800885089408aa08010904091009120915091809210940094509"
b"480951095409600981099009000a110a140a220a280a2a0a500a990a01100410"
b"0610091010101210151018102110241026104010421045104810511054105610"
b"59106010621065106810811084108610901095109810a110a410001102110511"
b"08110a1111111411161119112011221125112811411144114611491150115211"
b"5511581161116411801182118511881191119411011204120912101215122112"
b"2412401245125112541281128412901200140214051408141114141416141914"
b"2014251428144114441446144914501452145514581461146414801482148514"
b"881491149414a014011504150615091510151215151518152115241540154215"
b"4515481551155415601581158415901500160516081611161416201641164416"
b"50168016aa160118041806180918101815181818211840184218451848185118"
b"541860188118841800190219051908191119141920194119441950196919a219"
b"041a101a401a561a00200220052008201120142016201920202025202a204120"
b"4420502052205520642080208a209420aa200121042110211221152121214021"
b"4221452151215421602181218421902100220a22222228222a22442250228822"
b"8a22a82201240424062409241024152418242124242440244224452448245124"
b"5424602481248424902400250525082511251425202541254425502566258025"
b"0126042610264026592600280528112814284128442850288a28aa2801290429"
b"102995290a2a222a642a882a8a2a014004400640094010401240154018401a40"
b"21402440264040404240454048404a4051405440564059406040624065408140"
b"8440904095409840a140a4400041024105410841114114411641194120412241"
b"2541414144414641494150415241554158416141644180418241854188419141"
b"9441a04101420442104212421542184224424042454248425142544260428142"
b"844200440244054408440a441144144416441944204422442544284441444444"
b"46444944504452445544584461446444804482448544884491449444a0440145"
b"0445064509451045124515451845214524454045424545454845514554456045"
b"6a4581458445904500460246054608461146144620464146444650468046a546"
b"0148044809481048124815481848214824484048424845484848514854486048"
b"84489048004902490549084911491449204941494449504980499649014a044a"
b"104a404a00500250055008501150145016501950205022502550285041504450"
b"4650495050505250555058506150645080508250855088509150945001510451"
b"0651095110511251155118512151245140514251455148515151545160518151"
b"8451905100520552085211521452205241524452505269528052015404540654"
b"0954105412541554185421542454405442544554485451545454605481548454"
b"9054005502550555085511551455205541554455505580550156045610562656"
b"405600580258055808581158145820584158445850585a588058015904591059"
b"4059005a195a855aa85a01600460066010601260156018602160246040604560"
b"4860516054606060846090600061026105610861116114612061416144615061"
b"806199610462106240625662a162006405640864116414642064416444645064"
b"806401650465106540654a656865926500669466016804681068656898680069"
b"2a69426aa16a0080028005800880118014801980208025804180448050805280"
b"5580588061808080858091809480018104810981108112811581188121812481"
b"408142814581488151815481818184819081a981008205820a82118214824182"
b"4482508201840484068409841084128415841884218440844284458448845184"
b"5484608481848484908400850285058508851185148520854185448550858085"
b"8a85018604861086298640860088058811881488418844885088a28801890489"
b"40896589228a588a5a8a828aa28a019004900990109012901590189024904090"
b"4290459048905190549060908190849090900091059111911491419144915091"
b"5a910192049210924092a6920094029405940894119414942094419444945094"
b"8094969401950495109540959895a19500964696649601980498109826984098"
b"a998009949995299909a00a005a00aa014a022a02aa041a044a050a0a2a0aaa0"
b"40a165a102a20aa222a228a22aa282a288a28aa2a8a201a404a410a440a489a4"
b"a4a400a519a551a60aa828a8a2a854a986a908aa0aaa20aa22aa28aa88aaaaaa"
)
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d, rest = np.hsplit(blocks, [2])
qs, rest = np.hsplit(rest, [QK_K // 8])
signs, rest = np.hsplit(rest, [QK_K // 8])
qh, scales = np.hsplit(rest, [QK_K // 32])
d = d.view(np.float16).astype(np.float32)
scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
scales = (scales & 0x0F).reshape((n_blocks, -1))
db = d * (np.float32(0.5) + scales) * np.float32(0.25)
db = db.reshape((n_blocks, -1, 1, 1))
# unpack the sign bits
signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8))
signs = signs & np.uint8(0x01)
signs = np.where(signs == 0, np.float32(1), np.float32(-1))
signs = signs.reshape((n_blocks, -1, 2, 8))
qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4))
qs = qs.astype(np.uint16) | ((qh & 0x03).astype(np.uint16) << 8).reshape((n_blocks, -1))
assert cls.grid is not None
grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2)
grid = grid.reshape((n_blocks, -1, 2, 8))
return (db * grid * signs).reshape((n_blocks, -1))
class IQ3_XXS(__Quant, qtype=GGMLQuantizationType.IQ3_XXS):
grid_shape = (256, 4)
grid_map = (0x04, 0x0c, 0x14, 0x1c, 0x24, 0x2c, 0x34, 0x3e)
grid_hex = (
b"0000020004001100130017002000220031004200730075000101030110011201"
b"2101250130013201410154017001000202020402110220022202310233023702"
b"5102570275020103070310031203250370031304370444045704730475040105"
b"0705320552053506640610071407160743076107011003101010121021102310"
b"3010321034104710501000110211111120112211011203121012121221123012"
b"7212001302132013311346136613011405145014201524154615711505162217"
b"4017002002201120132020202220262031204220012103210521102112212121"
b"3021632167217021002202221122172220222222372240225522012310231423"
b"7023742335245324032527254125742501270327162745270130103012302130"
b"2330503065307230003102312031313144314631013203321032253252327232"
b"1133333330344734723400350635223555351436363663363337603704401740"
b"3540374053405740744120423742404260426642074345430444514464442545"
b"4345704505471047124730471250415070500051065126515551145232527252"
b"0253535310542354275472540255315550562457425724604460466064602161"
b"6161176264623063366344640565526533660367216703700570077010703270"
b"5270267140711272457252720073157333736073217441740075027524753076"
)
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d, rest = np.hsplit(blocks, [2])
qs, scales = np.hsplit(rest, [QK_K // 4])
d = d.view(np.float16).astype(np.float32)
scales = scales.view(np.uint32)
db = d * (np.float32(0.5) + (scales >> 28).astype(np.float32)) * np.float32(0.5)
db = db.reshape((n_blocks, -1, 1, 1))
# get the sign indices and unpack the bits
signs = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 7, 14, 21], dtype=np.uint32).reshape((1, 1, 4))
ksigns = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128))
signs = (signs & np.uint32(0x7F)).reshape((n_blocks, -1, 4, 1))
signs = np.take_along_axis(ksigns, signs, axis=-1)
signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 1, 8))
signs = signs & np.uint8(0x01)
signs = np.where(signs == 0, np.float32(1), np.float32(-1))
signs = signs.reshape((n_blocks, -1, 4, 8))
assert cls.grid is not None
grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2)
grid = grid.reshape((n_blocks, -1, 4, 8))
return (db * grid * signs).reshape((n_blocks, -1))
class IQ3_S(__Quant, qtype=GGMLQuantizationType.IQ3_S):
grid_shape = (512, 4)
grid_map = (0x01, 0x03, 0x05, 0x07, 0x09, 0x0b, 0x0d, 0x0f)
grid_hex = (
b"0000010002000500070010001100120014001600200021002500330040004200"
b"4500470051005300600062007100740077000001010102010401100111011501"
b"2001230127013101350144016101650172010002010205020702100213021602"
b"2102250230023402420245024702510253027002730203031103150320032203"
b"3103330336034403500352036703710375030004130417042104240432044004"
b"4304510470040205040520052205260533054105450547056605730506061106"
b"1306310652067106000702070407200722072607330750075407001001100210"
b"0410101011101310151017102010221031103410361054105610611072100011"
b"0111031106111011141121113011331141115011521170117611001212121512"
b"1712201224123212401243125512601272120113041307131013131321132713"
b"3013341341136213701303140514121414143114331442144614501454140115"
b"1015131521153015321551152016241627164416461601170317101712172117"
b"3517411762177017002001200320052007201020122014201620212023202720"
b"3020322041204320452050205220672070207320752000210221102113211721"
b"2221252131213421422151210122042207222122232230223722412253225722"
b"7122742200230223052311232223242331233323422350236623012407242024"
b"2324322435244124722475240425112522253725402553257025002602260726"
b"2126552661260527112726273027432750270230113013301530173022303130"
b"3330353042304430473051306330713001310331053114312131233140316031"
b"7231763100321232203232323432503201331033143321332333273330334133"
b"4333473355337333033411341634223431345234603464340135103512352535"
b"3235443556357335163641360137033720372237353700400440124020402440"
b"2740324041405040704002410741114113412241304135414341514155410142"
b"0342104215422142334240425742624270420443114313432043224331433543"
b"0044024424443744404471440545074521456245134634466046104715473047"
b"4347514702501050145022504050445047505250665074500151035105511251"
b"2151325172510052115223523052365253520253075310532753445351536553"
b"7353015404542054325446541255265551555355425602570457225711601360"
b"1560316033606060006120612761646112623462426255626262706200631463"
b"2163406325644364626400650365346560650566406611671367007004700770"
b"2070227036704070547062700271117124714371457101720472107216722172"
b"3072517202733273357353730174057413742074507422754275027631760077"
)
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d, rest = np.hsplit(blocks, [2])
qs, rest = np.hsplit(rest, [QK_K // 4])
qh, rest = np.hsplit(rest, [QK_K // 32])
signs, scales = np.hsplit(rest, [QK_K // 8])
d = d.view(np.float16).astype(np.float32)
scales = scales.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
scales = (scales & 0x0F).reshape((n_blocks, -1))
db = d * (1 + 2 * scales)
db = db.reshape((n_blocks, -1, 1, 1))
# unpack the sign bits
signs = signs.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8).reshape((1, 1, 8))
signs = signs & np.uint8(0x01)
signs = np.where(signs == 0, np.float32(1), np.float32(-1))
signs = signs.reshape((n_blocks, -1, 4, 8))
qh = qh.reshape((n_blocks, -1, 1)) >> np.array([i for i in range(8)], dtype=np.uint8)
qh = (qh & 0x01).astype(np.uint16).reshape((n_blocks, -1))
qs = qs.astype(np.uint16) | (qh << 8)
assert cls.grid is not None
grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2)
grid = grid.reshape((n_blocks, -1, 4, 8))
return (db * grid * signs).reshape((n_blocks, -1))
class IQ1_S(__Quant, qtype=GGMLQuantizationType.IQ1_S):
# iq1s_grid, with each byte packed into 2 bits
# -1, 0, 1 <=> 0, 1, 2
grid_shape = (2048, 8)
grid_map = (-1, 0, 1)
grid_hex = (
b"00000200050008000a00110015002000220028002a0045005100540056006500"
b"8000820088008a009500a000a200a800aa000401050111011401160119011a01"
b"2501410146014901520155015a0161016401660168018501910194019601a501"
b"0002020208020a0215022002220228022a024502510259026402690280028202"
b"88028a02910295029902a002a202a802aa021104140416042504410449045504"
b"5a046404650491049904a5040105040505050605150518051a05290540054505"
b"4a0550055105540555055605590560056205650568056a058105910595059805"
b"9a05a105a405a505a605a9051406190641064406500652065506580660066106"
b"6606690685069106940699060008020808080a0815082008220828082a084508"
b"5108560865088008820888088a089508a008a208a808aa080509110914091909"
b"2409250941095009510955096109640969099109940996099909a509000a020a"
b"080a0a0a150a200a220a280a2a0a450a510a590a610a650a800a820a850a880a"
b"8a0a950aa00aa20aa80aaa0a1010111014101910241025104110441050105510"
b"58106110641065106910911094109610a110a510011104110611091110111211"
b"1511181121112411291145114a11501151115211541155115611591160116511"
b"841192119511a111a41111121412161225124012461249125212551258125a12"
b"641266128512911294129612a512011406140914141415141814191421142614"
b"41144514461448144a1451145414551456145914621465146814841489149014"
b"94149514981499149a14a114a414a514a914021505150a151115141515151615"
b"191520152215251528152a154115441545154615511552155415551556155915"
b"5a1561156415651566156915801582158415851588158a159015911594159515"
b"961599159a15a015a215a51501160416051606161516161618161a1621162616"
b"401642164416451648164a165116551656165816591661166416651668166916"
b"6a1686168a1692169516a416a916111816182518411844184618491850185518"
b"58185a1860186118641866186918851891189418a5181019121915191a192119"
b"25194219441945194819511954195519561959195a19601965196a1989199119"
b"921995199819a119a619a919091a161a241a261a441a461a491a501a521a551a"
b"581a611a661a691a851a911a961a9a1a0020022008200a201520202022202520"
b"28202a20452051205920612065208020822088208a209520a020a220a520a820"
b"aa2005211121142119212521422144214921552158215a216121642165216621"
b"8521902196219921a521012208220a22112215222022222228222a2245225122"
b"562259226522812288228a2291229522a022a222a822aa220524142416241924"
b"252444244524462449245224552458245a2466248524912494249924a124a524"
b"0925152521252925402545254825512554255525592562256525682589259025"
b"9425952598259a25a125a425a625a92505261026122619262526412649265526"
b"6026612669268426862690269a260028022808280a2815282028222828282a28"
b"45285128542865288028822888288a28a028a228a828aa280929112914291929"
b"2529462949295229552961296429662969298529902996299929a429a529002a"
b"022a082a0a2a202a222a282a2a2a452a512a562a592a652a802a822a882a8a2a"
b"952aa02aa22aa82aaa2a054011401640254049405240554058405a4061406440"
b"664094409940a140a6400041014104410641094112411541164118411a412141"
b"26412941454148414a41514154415541564159415a41654168416a4181418441"
b"8641904192419541a041a141a241054211421442164225424142524255425a42"
b"6442694289429442a5420144154419442944454448444a445144544455445644"
b"61446244654468446a44814486448944904492449544a044a144a94401450245"
b"05450a4511451445154516451945204525452a45414544454545464549455045"
b"5145544555455645584559456145644565456645694582458445854588459145"
b"94459545964599459a45a545a845aa450146054609461446154618461a462146"
b"2446294640464246454648465046514652465546564659466246654668468146"
b"85468a4694469546a146a446a6460548114815481a4825484248494850485548"
b"5848614864486648694885489148944896489948a5480149054906490a491049"
b"144915491849214924492649404945494a495149524954495549564959496049"
b"6249654966496a49864989499249954996499849a149a449a649a949164a444a"
b"464a494a554a584a5a4a644a694a944aa54a0150045005500650095012501550"
b"1a50215024502950405045504850515054505550565059506550685086508950"
b"95509850a050a150a650a9500551085109510a51115114511551165118511951"
b"20512551265128512a5141514451455146514951505151515251545155515651"
b"585159515a51615164516551665169518251855191519451955196519951a051"
b"a551aa5101520652125215521a5221522452425245524a525152545255525652"
b"595262526552855290529252955299529a52a452045405541154145415541654"
b"185419542154255428542a54415444544554465449544a545054515454545554"
b"5654585459545a54615462546454655466546954805488548a54915494549554"
b"96549954a154a454a554aa540155025504550555065509551055115512551455"
b"1555165519551a55215524552555265529554055415542554455455546554855"
b"4955505551555255545555555655585559555a55605561556455655566556855"
b"69556a5581558455855589558a559055915594559555965598559955a155a455"
b"a555a655a9550056015602560456065608560956115614561556185619562056"
b"2156225624562556265628562956415645564656485649564a56505651565256"
b"545655565656585659565a566156645665566956825685568656885689568a56"
b"915695569a56a256a556a656a856a95604580558065809581058155818582158"
b"2a58455848584a58515854585558565858585958605862586458655882588958"
b"9058925895589858a158a9580159025905590a59115914591559165919592559"
b"41594459455946594959505951595259545955595659585959595a5961596459"
b"655966596959815985598959915994599559965998599959a559045a085a155a"
b"1a5a205a255a265a295a455a485a495a515a555a565a585a595a625a655a685a"
b"6a5a815a8a5a925a955a965a985a9a5aa15a0560146016601960256044605060"
b"5560566058605a60616064606660696081609660a56001610461066109611261"
b"15612161226126612961456149615161556156615961656166616a6184618a61"
b"92619561a161a661a96111621662196240624162466255625662586260628562"
b"91629662a56211641264156416641a6421642664296440644264456448644a64"
b"516454645564566459645a646064626465648464856489649064926494649564"
b"966498649a64a164a464a964056508650a651165156516651965446545654665"
b"496550655165546555655665596561656465656566656965866589658a659165"
b"9565966599659a65a265a565a665a86502660966156620662666286629664066"
b"456648664a66516654665566566658665a666066656668668066826685668a66"
b"9466966698669966a066a466a666aa661668196825684168526855685a686168"
b"6968856891689868a66801690469106915692169246926692969406941694569"
b"4669486951695469556956695969606965696a69826984698a699569a169a469"
b"a569a969116a166a186a416a446a496a506a556a586a5a6a646a656a696a866a"
b"946a986a9a6aa66a0080028008800a802080228028802a804580508051805480"
b"5680598065808080828088808a809580a080a280a880aa800581118114811681"
b"1981258141814481498150815281558156815881598164816681698185818981"
b"948196819981a5810082028208820a8215822082228228822a82518254825982"
b"65828082828288828a829582a082a282a882aa82148419844184448451845584"
b"5a846184648469849484998401850985128515851a8526852985408541854585"
b"4885518554855585568559855a856585668568856a8581858485868589859085"
b"928595859885a68511861686198625864186448649864a865086558659865a86"
b"618666866a86858691869a86a4860088028808880a8815882088228828882a88"
b"41884588518854885988658869888088828888888a889588a088a288a888aa88"
b"05890689118914891689258941894489468949895089528955895a8961896489"
b"858996899989a589008a028a088a0a8a158a208a228a288a2a8a458a518a548a"
b"568a808a828a888a8a8a958aa08aa28aa88aaa8a059011901690189019902590"
b"419046904990559058905a9069906a9085909190949096909990a59001910491"
b"069109911091159118911a912191249126912991409145915091519154915591"
b"569159916291659184918691929195919891a191a491a691a991059211921492"
b"19922592449246924992509252925592589266926992859294929692a9920194"
b"04940694109415941894269440944a9451945494559456945894599460946194"
b"62946594849486949294949495949894a194a9940095059508950a9510951195"
b"14951595169519952195259529952a9541954495459546954995509551955295"
b"549555955695589559955a956195649565956695699581958595889591959295"
b"94959595969599959a95a095a295a595a895aa95019604961096159619962096"
b"2696299645964896499651965296559656965996659668968296849689968a96"
b"929694969596a496a696a9960598169819982598419846985098529855985698"
b"5a98649865988598919896989998a59804990699099910991299159918991a99"
b"209921992499269940994299459948994a995199549955995699599962996599"
b"66996a99819984999099929995999a99a199a699059a159a259a449a469a499a"
b"509a559a589a619a859a919a949a959a969a00a002a008a00aa015a020a022a0"
b"28a02aa045a051a054a056a059a080a082a088a08aa095a0a0a0a2a0a8a0aaa0"
b"05a109a111a114a116a119a11aa146a149a151a155a158a15aa161a164a185a1"
b"90a192a196a199a102a208a20aa210a219a222a228a22aa245a251a256a259a2"
b"65a280a282a288a28aa295a2a0a2a2a2a8a2aaa219a425a441a444a450a454a4"
b"55a458a45aa461a465a466a468a469a485a406a509a510a512a515a518a526a5"
b"29a542a545a551a554a555a556a559a565a56aa581a584a585a586a589a592a5"
b"95a598a505a611a616a61aa621a625a644a646a64aa652a655a656a658a660a6"
b"62a686a690a695a696a699a6a1a6a4a6a6a600a802a808a80aa820a822a828a8"
b"2aa851a854a856a859a880a882a888a88aa895a8a0a8a2a8a8a8aaa805a914a9"
b"19a921a925a941a950a955a95aa961a966a969a990a996a900aa02aa08aa0aaa"
b"20aa22aa28aa2aaa51aa54aa56aa80aa82aa88aa8aaa95aaa0aaa2aaa8aaaaaa"
)
delta = np.float32(0.125)
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d, rest = np.hsplit(blocks, [2])
qs, qh = np.hsplit(rest, [QK_K // 8])
d = d.view(np.float16).astype(np.float32)
qh = qh.view(np.uint16)
dl = d * (2 * ((qh >> 12) & 7) + 1)
dl = dl.reshape((n_blocks, -1, 1, 1))
delta = np.where((qh & np.uint16(0x8000)) == 0, cls.delta, -cls.delta)
delta = delta.reshape((n_blocks, -1, 1, 1))
qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 3, 6, 9], dtype=np.uint16).reshape((1, 1, 4))
qs = qs.astype(np.uint16) | ((qh & 7) << 8).reshape((n_blocks, -1))
assert cls.grid is not None
grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2)
grid = grid.reshape((n_blocks, -1, 4, 8))
return (dl * (grid + delta)).reshape((n_blocks, -1))
class IQ1_M(__Quant, qtype=GGMLQuantizationType.IQ1_M):
grid_shape = IQ1_S.grid_shape
grid_map = IQ1_S.grid_map
grid_hex = IQ1_S.grid_hex
delta = IQ1_S.delta
# Okay *this* type is weird. It's the only one which stores the f16 scales in multiple parts.
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
qs, rest = np.hsplit(blocks, [QK_K // 8])
qh, scales = np.hsplit(rest, [QK_K // 16])
# The f16 scale is packed across multiple bytes
scales = scales.view(np.uint16)
d = (scales.reshape((n_blocks, 4)) & np.uint16(0xF000)) >> np.array([12, 8, 4, 0], dtype=np.uint16).reshape((1, 4))
d = d[..., 0] | d[..., 1] | d[..., 2] | d[..., 3]
d = d.view(np.float16).astype(np.float32).reshape((n_blocks, 1))
scales = scales.reshape(n_blocks, -1, 1) >> np.array([0, 3, 6, 9], dtype=np.uint16).reshape((1, 1, 4))
scales = (scales & 0x07).reshape((n_blocks, -1))
dl = d * (2 * scales + 1)
dl = dl.reshape((n_blocks, -1, 2, 1, 1))
qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
qs = qs.astype(np.uint16) | ((qh & 0x07).astype(np.uint16) << 8).reshape((n_blocks, -1))
delta = np.where(qh & 0x08 == 0, cls.delta, -cls.delta)
delta = delta.reshape((n_blocks, -1, 2, 2, 1))
assert cls.grid is not None
grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2)
grid = grid.reshape((n_blocks, -1, 2, 2, 8))
return (dl * (grid + delta)).reshape((n_blocks, -1))
class IQ4_NL(__Quant, qtype=GGMLQuantizationType.IQ4_NL):
kvalues = (-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113)
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d, qs = np.hsplit(blocks, [2])
d = d.view(np.float16).astype(np.float32)
qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1, 1))
kvalues = np.array(cls.kvalues, dtype=np.int8).reshape(1, 1, 16)
qs = np.take_along_axis(kvalues, qs, axis=-1).astype(np.float32).reshape((n_blocks, -1))
return (d * qs)
class IQ4_XS(__Quant, qtype=GGMLQuantizationType.IQ4_XS):
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d, rest = np.hsplit(blocks, [2])
scales_h, rest = np.hsplit(rest, [2])
scales_l, qs = np.hsplit(rest, [QK_K // 64])
d = d.view(np.float16).astype(np.float32)
scales_h = scales_h.view(np.uint16)
scales_l = scales_l.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2))
scales_h = scales_h.reshape((n_blocks, 1, -1)) >> np.array([2 * i for i in range(QK_K // 32)], dtype=np.uint16).reshape((1, -1, 1))
scales_l = scales_l.reshape((n_blocks, -1)) & np.uint8(0x0F)
scales_h = scales_h.reshape((n_blocks, -1)).astype(np.uint8) & np.uint8(0x03)
scales = (scales_l | (scales_h << np.uint8(4))).astype(np.int8) - np.int8(32)
dl = (d * scales.astype(np.float32)).reshape((n_blocks, -1, 1))
qs = qs.reshape((n_blocks, -1, 1, 16)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 1, 2, 1))
qs = qs.reshape((n_blocks, -1, 32, 1)) & np.uint8(0x0F)
kvalues = np.array(IQ4_NL.kvalues, dtype=np.int8).reshape((1, 1, 1, -1))
qs = np.take_along_axis(kvalues, qs, axis=-1).astype(np.float32).reshape((n_blocks, -1, 32))
return (dl * qs).reshape((n_blocks, -1))
# Todas as outras classes (Q4_1, Q5_0, Q5_1, Q8_0, Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, TQ1_0, TQ2_0, MXFP4, IQ2_XXS, IQ2_XS, IQ2_S, IQ3_XXS, IQ3_S, IQ1_S, IQ1_M, IQ4_NL, IQ4_XS) permanecem exatamente como estavam no seu arquivo original.
# =============================================================================
# Helicoidal-Zeta Kernel (Bruno Becker) — OFFELLIA Architecture
# =============================================================================
# ====================== Helicoidal-Zeta Kernel com inverse_transform ======================
try:
from mpmath import mp, zeta as _mp_zeta
except Exception:
mp = None
_mp_zeta = None
def _require_mpmath() -> None:
if mp is None or _mp_zeta is None:
raise ImportError("mpmath is required for zeta_signature(). Install with: pip install mpmath")
@dataclass
class _PrimeCache:
primes: list[int] = field(default_factory=lambda: [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41])
def __post_init__(self):
self._checked_upto = self.primes[-1]
@staticmethod
def _is_prime(k: int, primes: list[int]) -> bool:
if k < 2: return False
for p in primes:
if p * p > k: return True
if k % p == 0: return False
return True
def nth_prime(self, n: int) -> int:
if n <= 0: raise ValueError("n must be >= 1")
candidate = self._checked_upto
if candidate % 2 == 0: candidate += 1
while len(self.primes) < n:
candidate += 2
if self._is_prime(candidate, self.primes):
self.primes.append(candidate)
self._checked_upto = candidate
return self.primes[n - 1]
@dataclass
class HelicoidalZetaCore:
zeta_dps: int = 21
delta_modulus: int = 42
delta_else: float = 0.42
use_primes: bool = False
_prime_cache: _PrimeCache = field(default_factory=_PrimeCache)
def __post_init__(self) -> None:
self.phi = (1.0 + float(np.sqrt(5.0))) / 2.0
if mp is not None:
mp.dps = int(self.zeta_dps)
def _n_to_eval(self, n: int) -> int:
if not self.use_primes: return int(n)
return self._prime_cache.nth_prime(int(n))
@staticmethod
@lru_cache(maxsize=10000)
def _zeta_signature_cached(n: int) -> np.ndarray:
_require_mpmath()
if mp is not None:
mp.dps = 21
s = mp.mpc(0.5, float(n))
val = _mp_zeta(s)
return np.array([float(val.real), float(val.imag)], dtype=np.float32)
raise RuntimeError("mpmath not available")
def zeta_signature(self, n: int) -> np.ndarray:
return self._zeta_signature_cached(n)
def math_embedding(self, n: int) -> np.ndarray:
nn = self._n_to_eval(n)
c = self.coords(n)
r = float(np.sin(2.0 * np.pi * self.phi * nn) ** 2)
theta = 2.0 * np.pi * self.phi * nn
delta = self.delta_m(n)
zeta_vals = self.zeta_signature(n)
return np.concatenate([c * delta, np.array([r, theta], dtype=np.float32), zeta_vals])
def coords(self, n: int) -> np.ndarray:
nn = self._n_to_eval(n)
r = float(np.sin(2.0 * np.pi * self.phi * nn) ** 2)
t = 2.0 * np.pi * self.phi * nn
x = r * float(np.cos(t))
y = r * float(np.sin(t))
z = float(nn)
return np.array([x, y, z], dtype=np.float32)
def delta_m(self, n: int, m: int | None = None) -> float:
nn = self._n_to_eval(n)
mm = int(self.delta_modulus if m is None else m)
return 1.0 if (nn % mm) == 0 else float(self.delta_else)
def transform(self, x: np.ndarray, n_val: int) -> np.ndarray:
emb = self.math_embedding(n_val)
raw_scale = float(np.mean(emb))
final_scale = np.tanh(raw_scale)
return x * final_scale
# ====================== INVERSE_TRANSFORM (ESSENCIAL) ======================
def inverse_transform(self, x: np.ndarray, n_val: int) -> np.ndarray:
emb = self.math_embedding(n_val)
raw_scale = float(np.mean(emb))
final_scale = np.tanh(raw_scale)
# Proteção contra divisão por zero ou escala muito pequena
final_scale = np.where(np.abs(final_scale) < 1e-8, 1.0, final_scale)
return x / final_scale
def helicoidal_zeta_scale(n_val: int, *, use_primes: bool = False, zeta_dps: int = 21) -> float:
core = HelicoidalZetaCore(use_primes=use_primes, zeta_dps=zeta_dps)
emb = core.math_embedding(int(n_val))
return float(np.mean(emb))
__all__ = [
"HelicoidalZetaCore",
"helicoidal_zeta_scale",
]