#!/usr/bin/env python3
"""
Convert Qwen3-4B BF16 safetensors → Proper Unary.
Reads safetensors raw bytes (no framework dependency for BF16).
(c) 2026 OpenTransformers Ltd / Scott Bisset
"""
import numpy as np
import json, os, sys, gc, shutil, struct, time
class SafeTensorReader:
    """Read tensors from a safetensors file one at a time (memory efficient).

    The safetensors layout is an 8-byte little-endian header length, a JSON
    header mapping tensor name -> {dtype, shape, data_offsets}, then raw
    tensor bytes.  Only the requested tensor's bytes are read per call, so
    peak memory stays near one tensor.  BF16/F16/F32 are supported; every
    tensor is returned as float32.

    Also usable as a context manager so the file handle cannot leak:
        with SafeTensorReader(path) as r: ...
    """

    def __init__(self, path):
        """Open *path* and parse the JSON header; tensor data is read lazily."""
        self.f = open(path, "rb")
        header_size = struct.unpack("<Q", self.f.read(8))[0]
        self.header = json.loads(self.f.read(header_size).decode("utf-8"))
        self.data_start = 8 + header_size
        # "__metadata__" is bookkeeping in the header, not a tensor entry.
        self._meta = {k: v for k, v in self.header.items() if k != "__metadata__"}

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def keys(self):
        """Return the tensor names present in this shard."""
        return list(self._meta.keys())

    def get(self, name):
        """Read tensor *name* and return it as a float32 ndarray.

        Raises KeyError for unknown names and ValueError for any dtype
        other than BF16/F16/F32.
        """
        meta = self._meta[name]
        dtype = meta["dtype"]
        shape = tuple(meta["shape"])
        start, end = meta["data_offsets"]
        self.f.seek(self.data_start + start)
        raw = self.f.read(end - start)
        if dtype == "BF16":
            # BF16 is the top 16 bits of an IEEE float32: widen and shift.
            u16 = np.frombuffer(raw, dtype=np.uint16)
            u32 = u16.astype(np.uint32) << 16
            return u32.view(np.float32).reshape(shape)
        elif dtype == "F16":
            return np.frombuffer(raw, dtype=np.float16).reshape(shape).astype(np.float32)
        elif dtype == "F32":
            # copy() detaches from the read-only frombuffer view.
            return np.frombuffer(raw, dtype=np.float32).reshape(shape).copy()
        else:
            raise ValueError(f"Unknown dtype {dtype}")

    def close(self):
        """Close the underlying file handle."""
        self.f.close()
def _pack_bits_u64(mask, chunks):
    """Pack a boolean (rows, cols) mask into (rows, chunks) uint64 words.

    Column j maps to bit (j % 64) of word (j // 64).  Each column owns a
    distinct bit within its word, so summing the disjoint bit weights is
    exactly equivalent to OR-ing the bits together.
    """
    rows, cols = mask.shape
    padded = np.zeros((rows, chunks * 64), dtype=np.uint64)
    padded[:, :cols] = mask  # bool -> 0/1; pad columns beyond cols stay 0
    weights = np.uint64(1) << np.arange(64, dtype=np.uint64)
    return (padded.reshape(rows, chunks, 64) * weights).sum(axis=2, dtype=np.uint64)


def encode_proper_unary(weight_f32, K):
    """Encode a 2D float32 matrix to proper unary.

    Each row is scaled so its absolute maximum maps to K; every entry's
    integer magnitude m in 0..K is stored in unary across K bit-planes
    (plane s has the bit set iff m > s), with a separate sign-bit plane.

    Returns (sign_bits, slot_planes, row_scales):
        sign_bits:   uint64 (rows, ceil(cols/64)) - bit set where weight < 0
        slot_planes: uint64 (K, rows, ceil(cols/64)) - unary magnitude planes
        row_scales:  float32 (rows,) - per-row dequantization step (absmax / K)
    """
    rows, cols = weight_f32.shape
    chunks = (cols + 63) // 64
    row_absmax = np.abs(weight_f32).max(axis=1).astype(np.float32)
    row_absmax = np.maximum(row_absmax, 1e-10)  # guard all-zero rows
    row_scales = (row_absmax / K).astype(np.float32)
    inv_scales = K / row_absmax
    magnitudes = np.clip(
        np.round(np.abs(weight_f32) * inv_scales[:, None]).astype(np.int32), 0, K)
    # Vectorized packing replaces the original O(cols * K) per-column
    # Python loop; the produced bit layout is identical.
    sign_bits = _pack_bits_u64(weight_f32 < 0, chunks)
    slot_planes = np.empty((K, rows, chunks), dtype=np.uint64)
    for s in range(K):
        slot_planes[s] = _pack_bits_u64(magnitudes > s, chunks)
    return sign_bits, slot_planes, row_scales
def convert_model(model_dir, output_dir, K=32):
    """Convert a safetensors model directory to the proper-unary format.

    Linear projection weights (q/k/v/o/gate/up/down) are unary-encoded into
    <name>.sign / <name>.slots / <name>.scales files; every other tensor is
    written as raw FP16 bytes (<name>.fp16).  Tokenizer/config JSON files
    are copied through, and a manifest.json records which tensors went
    which way.

    Parameters:
        model_dir:  HF-style directory containing config.json and the
                    safetensors shard(s) (sharded index optional).
        output_dir: destination directory, created if missing.
        K:          number of unary slot planes per weight.
    """
    os.makedirs(output_dir, exist_ok=True)
    # Fail fast when this does not look like a model directory (the
    # original parsed config.json without using it; keep the check only).
    config_path = os.path.join(model_dir, "config.json")
    if not os.path.isfile(config_path):
        raise FileNotFoundError(config_path)
    for f in ["config.json", "tokenizer.json", "tokenizer_config.json",
              "special_tokens_map.json", "generation_config.json"]:
        src = os.path.join(model_dir, f)
        if os.path.exists(src):
            shutil.copy2(src, output_dir)
    index_path = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_path):
        # Sharded model: collect the unique shard filenames from the index.
        with open(index_path) as fh:
            index = json.load(fh)
        shard_files = sorted(set(index["weight_map"].values()))
    else:
        shard_files = ["model.safetensors"]
    linear_names = ["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"]
    manifest = {"K": K, "format": "proper_unary", "unary": {}, "fp16": []}
    total_linear = 0
    total_size = 0
    for shard_name in shard_files:
        shard_path = os.path.join(model_dir, shard_name)
        print(f"\n=== {shard_name} ===", flush=True)
        reader = SafeTensorReader(shard_path)
        try:  # ensure the shard's file handle is closed even on failure
            print(f" {len(reader.keys())} tensors", flush=True)
            for key in sorted(reader.keys()):
                tensor = reader.get(key)
                fname = key.replace(".", "_")
                is_linear = any(ln + ".weight" in key for ln in linear_names)
                if is_linear and tensor.ndim == 2:
                    rows, cols = tensor.shape
                    t0 = time.time()
                    print(f" {key}: {rows}x{cols} K={K}...", end="", flush=True)
                    sign_bits, slot_planes, row_scales = encode_proper_unary(tensor, K)
                    dt = time.time() - t0
                    sign_bits.tofile(os.path.join(output_dir, fname + ".sign"))
                    slot_planes.tofile(os.path.join(output_dir, fname + ".slots"))
                    row_scales.tofile(os.path.join(output_dir, fname + ".scales"))
                    manifest["unary"][key] = [rows, cols]
                    sz = sign_bits.nbytes + slot_planes.nbytes + row_scales.nbytes
                    total_size += sz
                    total_linear += 1
                    # Size ratio vs. the original 2-byte-per-weight storage.
                    ratio = sz / (rows * cols * 2)
                    print(f" {sz/1e6:.1f}MB ({ratio:.1f}x) [{dt:.0f}s]", flush=True)
                    del sign_bits, slot_planes, row_scales
                else:
                    # Everything else (embeddings, norms, 1D tensors) is
                    # passed through as FP16 raw bytes.
                    t_f16 = tensor.astype(np.float16)
                    out_data = t_f16.view(np.uint16)
                    out_data.tofile(os.path.join(output_dir, fname + ".fp16"))
                    manifest["fp16"].append(key)
                    sz = out_data.nbytes
                    total_size += sz
                    print(f" {key}: {tensor.shape} -> FP16 ({sz/1e6:.1f}MB)", flush=True)
                    del t_f16, out_data
                # Drop the tensor before reading the next one to keep peak
                # memory near a single tensor.
                del tensor
        finally:
            reader.close()
        gc.collect()
    # Write the manifest through a context manager so it is flushed/closed.
    with open(os.path.join(output_dir, "manifest.json"), "w") as fh:
        json.dump(manifest, fh, indent=2)
    print(f"\n{'='*50}", flush=True)
    print(f"DONE: {total_linear} layers, K={K}", flush=True)
    print(f"Total: {total_size/1e9:.2f} GB (orig ~7.6 GB, ratio {total_size/7.6e9:.1f}x)", flush=True)
if __name__ == "__main__":
    # CLI: convert.py [model_dir] [output_dir] [K] — all arguments optional.
    argv = sys.argv[1:]
    defaults = ("/root/ternary_engine/qwen3-4b-thinking-hf",
                "/root/ternary_engine/qwen3-4b-proper-unary")
    model_dir = argv[0] if len(argv) > 0 else defaults[0]
    output_dir = argv[1] if len(argv) > 1 else defaults[1]
    K = int(argv[2]) if len(argv) > 2 else 32
    convert_model(model_dir, output_dir, K)