File size: 7,249 Bytes
19ed98b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 | #!/usr/bin/env python3
"""
Convert DeepSeek-R1-Distill-Qwen-1.5B to ternary format.
Stores linear weights as bitplanes (pos_mask, neg_mask) + per-row scale.
Embeddings and layernorms stay FP16. LM head stays FP16.
(c) 2026 OpenTransformers Ltd / Scott Bisset
"""
import os
import json
import struct
import numpy as np
from pathlib import Path
import time
def load_safetensors(model_dir):
    """Load every tensor from the ``*.safetensors`` files in a directory.

    Files are read in sorted filename order. Each tensor is upcast to
    float32 and converted to a NumPy array. If the same key occurs in more
    than one file, the value from the later file replaces the earlier one.

    Args:
        model_dir: Directory containing one or more ``*.safetensors`` files.

    Returns:
        dict mapping tensor name to a float32 ``numpy.ndarray``.

    Raises:
        FileNotFoundError: If no ``*.safetensors`` file is found in
            ``model_dir``.
    """
    shard_paths = sorted(Path(model_dir).glob("*.safetensors"))
    if not shard_paths:
        # Fail loudly rather than returning {} and letting the caller
        # "convert" an empty model without any error.
        raise FileNotFoundError(
            f"No .safetensors files found in {str(model_dir)!r}"
        )

    # Imported lazily so the safetensors/torch dependency is only required
    # once there is actually something to load.
    from safetensors.torch import load_file

    tensors = {}
    for shard in shard_paths:
        print(f"Loading {shard.name}...")
        for key, val in load_file(str(shard)).items():
            # Upcast in torch first, then hand the data to NumPy.
            tensors[key] = val.float().numpy()
    return tensors
def quantize_row_ternary(row, alpha=0.7):
    """Quantize one weight row to ternary {-1, 0, +1} and bit-pack it.

    An element becomes +1 when it is positive and ``>= alpha * mean(|row|)``,
    -1 when it is negative and ``<= -alpha * mean(|row|)``, and 0 otherwise.
    The row scale is the mean absolute value of the elements that were kept
    (non-zero after quantization), or 1.0 when no element was kept.

    Args:
        row: 1-D array-like of weights.
        alpha: Multiplier applied to the mean absolute value to obtain the
            threshold.

    Returns:
        Tuple ``(pos_bits, neg_bits, scale)``. ``pos_bits`` and ``neg_bits``
        are 1-D ``uint64`` arrays of length ``ceil(len(row) / 64)``; element
        ``j`` of the row is stored in bit ``j % 64`` of word ``j // 64``.
        Padding bits past the end of the row are zero. ``scale`` is a
        ``numpy.float32``.
    """
    row = np.asarray(row).astype(np.float32)
    magnitude = np.abs(row)
    threshold = alpha * np.mean(magnitude)

    # The explicit sign tests matter when the threshold is 0 (all-zero row or
    # alpha == 0): without them an exact zero satisfies both comparisons and
    # would be flagged as +1 and -1 at the same time.
    pos = (row >= threshold) & (row > 0)
    neg = (row <= -threshold) & (row < 0)
    nz_mask = pos | neg
    scale = np.mean(magnitude[nz_mask]) if nz_mask.any() else np.float32(1.0)

    # Pad to a multiple of 64 so the masks split into whole uint64 words.
    pad = -len(row) % 64
    if pad:
        filler = np.zeros(pad, dtype=bool)
        pos = np.concatenate([pos, filler])
        neg = np.concatenate([neg, filler])

    # packbits(bitorder="little") puts element 8k+j into bit j of byte k;
    # reading 8 such bytes as one little-endian uint64 puts element i into
    # bit i of the word, independent of the host byte order.
    pos_bits = (
        np.packbits(pos, bitorder="little").view("<u8").astype(np.uint64, copy=False)
    )
    neg_bits = (
        np.packbits(neg, bitorder="little").view("<u8").astype(np.uint64, copy=False)
    )
    return pos_bits, neg_bits, np.float32(scale)
def quantize_weight_matrix(weight, alpha=0.7):
    """Quantize a ``[out_dim, in_dim]`` weight matrix to ternary, row by row.

    For every row, an element becomes +1 when it is positive and
    ``>= alpha * mean(|row|)``, -1 when it is negative and
    ``<= -alpha * mean(|row|)``, and 0 otherwise. Each row's scale is the
    mean absolute value of its kept (non-zero) elements, or 1.0 when the row
    has none.

    Args:
        weight: 2-D array-like of shape ``[out_dim, in_dim]``.
        alpha: Multiplier applied to each row's mean absolute value to obtain
            that row's threshold.

    Returns:
        Tuple ``(all_pos, all_neg, scales, sparsity)``:

        * ``all_pos`` / ``all_neg``: ``uint64`` arrays of shape
          ``[out_dim, ceil(in_dim / 64)]``. Column ``j`` of a row is stored
          in bit ``j % 64`` of word ``j // 64``; padding bits are zero.
        * ``scales``: ``float32`` array of shape ``[out_dim]``.
        * ``sparsity``: fraction of elements quantized to 0.
    """
    w = np.asarray(weight, dtype=np.float32)
    out_dim, in_dim = w.shape
    abs_w = np.abs(w)

    # Per-row thresholds, kept as a column so they broadcast across columns.
    thresholds = alpha * abs_w.mean(axis=1, keepdims=True)

    # The explicit sign tests matter when a threshold is 0 (all-zero row or
    # alpha == 0): without them an exact zero satisfies both comparisons and
    # would be flagged as +1 and -1 at the same time.
    pos = (w >= thresholds) & (w > 0)
    neg = (w <= -thresholds) & (w < 0)
    nz = pos | neg

    # Per-row scale = mean |w| over kept elements. The sum is accumulated in
    # float64 and rounded to float32 on assignment; rows with nothing kept
    # retain the 1.0 default.
    counts = nz.sum(axis=1)
    kept_sums = abs_w.sum(axis=1, dtype=np.float64, where=nz)
    scales = np.ones(out_dim, dtype=np.float32)
    has_kept = counts > 0
    scales[has_kept] = kept_sums[has_kept] / counts[has_kept]

    total = out_dim * in_dim
    sparsity = 1.0 - counts.sum() / total

    # Pad columns to a multiple of 64 so each row splits into whole words.
    pad = -in_dim % 64
    if pad:
        filler = np.zeros((out_dim, pad), dtype=bool)
        pos = np.concatenate([pos, filler], axis=1)
        neg = np.concatenate([neg, filler], axis=1)

    # packbits(bitorder="little") puts column 8k+j into bit j of byte k;
    # reading 8 such bytes as one little-endian uint64 puts column i of a
    # 64-wide chunk into bit i of the word, independent of host byte order.
    all_pos = (
        np.packbits(pos, axis=1, bitorder="little")
        .view("<u8")
        .astype(np.uint64, copy=False)
    )
    all_neg = (
        np.packbits(neg, axis=1, bitorder="little")
        .view("<u8")
        .astype(np.uint64, copy=False)
    )
    return all_pos, all_neg, scales, sparsity
def save_ternary_model(tensors, output_dir, alpha=0.7):
    """Convert a tensor dict to the ternary on-disk format and write it out.

    Tensors whose name contains one of the attention/MLP projection weight
    names (``q_proj.weight`` ... ``down_proj.weight``) are quantized with
    ``quantize_weight_matrix``; every other tensor is stored as float16.

    Files written into ``output_dir`` (created if missing), where ``<name>``
    is the tensor key with ``.`` replaced by ``_``:

    * ``config.json`` - hard-coded model hyperparameters plus ``alpha``.
    * ``<name>.pos`` / ``<name>.neg`` / ``<name>.scales`` - raw
      ``ndarray.tofile`` dumps of the quantizer outputs.
    * ``<name>.fp16`` - raw float16 dump of a non-quantized tensor.
    * ``manifest.json`` - original shapes of the tensors in each group.

    Progress and a size summary are printed to stdout.

    Args:
        tensors: Mapping of tensor name to ``numpy.ndarray``.
        output_dir: Destination directory.
        alpha: Threshold multiplier forwarded to ``quantize_weight_matrix``
            and recorded in ``config.json``.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)

    # NOTE: these hyperparameters are hard-coded, not read from the source
    # checkpoint; the module docstring names DeepSeek-R1-Distill-Qwen-1.5B.
    config = {
        "hidden_size": 1536,
        "intermediate_size": 8960,
        "num_attention_heads": 12,
        "num_key_value_heads": 2,
        "num_hidden_layers": 28,
        "vocab_size": 151936,
        "head_dim": 128,
        "rope_theta": 1000000.0,
        "rms_norm_eps": 1e-6,
        "alpha": alpha,
    }

    # Split tensors into linear weights to ternarize and everything else
    # (embeddings, norms, biases, ...) that is kept as FP16.
    ternary_patterns = (
        'q_proj.weight', 'k_proj.weight', 'v_proj.weight',
        'o_proj.weight', 'gate_proj.weight', 'up_proj.weight',
        'down_proj.weight',
    )
    ternary_keys = []
    keep_keys = []
    for key in tensors:
        if any(p in key for p in ternary_patterns):
            ternary_keys.append(key)
        else:
            keep_keys.append(key)
    print(f"\nTernary layers: {len(ternary_keys)}")
    print(f"FP16 layers: {len(keep_keys)}")

    with open(os.path.join(output_dir, "config.json"), "w") as f:
        json.dump(config, f, indent=2)

    # Ternary weights
    total_ternary_bytes = 0
    total_original_bytes = 0
    for key in ternary_keys:
        w = tensors[key].astype(np.float32)
        total_original_bytes += w.nbytes

        t0 = time.perf_counter()
        pos, neg, scales, sparsity = quantize_weight_matrix(w, alpha)
        dt = time.perf_counter() - t0

        prefix = os.path.join(output_dir, key.replace(".", "_"))
        pos.tofile(prefix + ".pos")
        neg.tofile(prefix + ".neg")
        scales.tofile(prefix + ".scales")

        ternary_bytes = pos.nbytes + neg.nbytes + scales.nbytes
        total_ternary_bytes += ternary_bytes
        ratio = w.nbytes / ternary_bytes
        print(f" {key}: {w.shape} -> ternary ({ternary_bytes/1024:.0f}KB, "
              f"{ratio:.1f}x compression, {sparsity:.1%} sparse, {dt:.1f}s)")

    # FP16 weights
    total_fp16_bytes = 0
    for key in keep_keys:
        w = tensors[key].astype(np.float16)
        prefix = os.path.join(output_dir, key.replace(".", "_"))
        w.tofile(prefix + ".fp16")
        total_fp16_bytes += w.nbytes
        print(f" {key}: {w.shape} -> fp16 ({w.nbytes/1024:.0f}KB)")

    # The manifest records the ORIGINAL tensor shapes, not the packed ones.
    manifest = {
        "ternary": {k: list(tensors[k].shape) for k in ternary_keys},
        "fp16": {k: list(tensors[k].shape) for k in keep_keys},
    }
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    total_bytes = total_ternary_bytes + total_fp16_bytes
    orig_bytes = total_original_bytes + total_fp16_bytes
    print(f"\n=== Summary ===")
    print(f"Original FP32 linear weights: {total_original_bytes/1024/1024:.1f} MB")
    print(f"Ternary linear weights: {total_ternary_bytes/1024/1024:.1f} MB")
    print(f"FP16 other weights: {total_fp16_bytes/1024/1024:.1f} MB")
    print(f"Total model size: {total_bytes/1024/1024:.1f} MB")
    if total_bytes:
        print(f"Compression vs FP32: {orig_bytes/total_bytes:.1f}x")
    else:
        # Nothing was written (e.g. empty tensor dict); avoid dividing by 0.
        print("Compression vs FP32: n/a (no tensors written)")
if __name__ == "__main__":
    import sys

    # Up to three optional positional arguments: MODEL_DIR OUTPUT_DIR ALPHA.
    # Any argument that is not supplied falls back to its default below.
    cli_args = sys.argv[1:]
    given = len(cli_args)

    model_dir = cli_args[0] if given >= 1 else "deepseek-r1-1.5b-hf"
    output_dir = cli_args[1] if given >= 2 else "deepseek-r1-1.5b-ternary"
    if given >= 3:
        alpha = float(cli_args[2])
    else:
        alpha = 0.7

    print(f"Loading model from {model_dir}...")
    tensors = load_safetensors(model_dir)

    print(f"Converting to ternary (alpha={alpha})...")
    save_ternary_model(tensors, output_dir, alpha)

    print("Done!")
|