File size: 3,723 Bytes
19ed98b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python3
"""Packed unary converter: uint8 magnitudes + bitpacked signs + per-row scales."""
import os, json, sys, time
import numpy as np
from pathlib import Path

def load_safetensors(model_dir):
    """Load every ``*.safetensors`` shard under *model_dir*.

    Returns a dict mapping tensor name -> float32 numpy array. Shards are
    visited in sorted filename order; later shards overwrite duplicate keys.
    """
    # Imported lazily so the module can be inspected without torch/safetensors.
    from safetensors.torch import load_file
    result = {}
    for shard in sorted(Path(model_dir).glob("*.safetensors")):
        print(f"  Loading {shard.name}...")
        shard_tensors = load_file(str(shard))
        for name, tensor in shard_tensors.items():
            result[name] = tensor.float().numpy()
    return result

def quantize_packed(w, n_levels=7):
    """Quantize a 2-D weight matrix to the packed-unary format.

    Each row gets one float32 scale (row max-abs / n_levels); magnitudes are
    rounded to uint8 in [0, n_levels]; signs are bit-packed 64 columns per
    uint64 word (bit i of a word = sign of that chunk's column i, 1 = negative).

    Returns (mags, sign_words, scales, row_max_mag, mean_mag, zero_fraction):
      mags         uint8  (out_dim, in_dim)  quantized magnitudes
      sign_words   uint64 (out_dim, ceil(in_dim/64)) packed sign bits
      scales       float32 (out_dim,)        per-row dequant scales
      row_max_mag  uint8  (out_dim,)         max magnitude per row
      mean_mag     float                     average magnitude over all entries
      zero_fraction float                    fraction of zero magnitudes
    """
    rows, cols = w.shape
    n_chunks = -(-cols // 64)      # ceil division: 64 sign bits per word
    padded_cols = n_chunks * 64

    # Per-row absolute maximum; all-zero rows get scale 1/n_levels to avoid
    # dividing by zero (their magnitudes quantize to 0 anyway).
    abs_max = np.abs(w).max(axis=1, keepdims=True)
    abs_max = np.where(abs_max == 0, 1.0, abs_max)
    scales = (abs_max.ravel() / n_levels).astype(np.float32)

    # Integer magnitudes clipped to [0, n_levels].
    mags = np.clip(np.round(np.abs(w / scales[:, None])), 0, n_levels).astype(np.uint8)
    row_max_mag = mags.max(axis=1).astype(np.uint8)

    # Pad the negative-sign mask out to a multiple of 64 columns, then fold
    # each 64-column chunk into a single uint64 word.
    neg = w < 0
    if cols == padded_cols:
        neg_padded = neg
    else:
        neg_padded = np.zeros((rows, padded_cols), dtype=bool)
        neg_padded[:, :cols] = neg
    bit_weights = np.uint64(1) << np.arange(64, dtype=np.uint64)
    chunked = neg_padded.reshape(rows, n_chunks, 64).astype(np.uint64)
    sign_words = np.bitwise_or.reduce(chunked * bit_weights, axis=2)

    return mags, sign_words, scales, row_max_mag, np.mean(mags), np.mean(mags == 0)

def convert(tensors, output_dir, n_levels=7):
    """Write *tensors* to *output_dir* in the packed-unary on-disk layout.

    Linear projection weights (q/k/v/o/gate/up/down) are quantized via
    quantize_packed and stored as four raw files per tensor
    (.mags/.signs/.scales/.rmm); every other tensor is dumped as raw
    float16 (.fp16). Also emits config.json and a manifest.json of shapes.
    """
    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): model geometry is hard-coded rather than derived from the
    # input tensors; presumably matches the default 1.5B checkpoint — verify
    # before pointing this at a different model.
    config = {"hidden_size":1536,"intermediate_size":8960,"num_attention_heads":12,
              "num_key_value_heads":2,"num_hidden_layers":28,"vocab_size":151936,
              "head_dim":128,"rope_theta":1000000.0,"rms_norm_eps":1e-6,
              "n_levels":n_levels,"quant_type":"packed_unary"}
    with open(os.path.join(output_dir, "config.json"), "w") as f:
        json.dump(config, f, indent=2)

    proj_markers = ('q_proj.weight','k_proj.weight','v_proj.weight','o_proj.weight',
                    'gate_proj.weight','up_proj.weight','down_proj.weight')
    linear_keys = [k for k in tensors if any(m in k for m in proj_markers)]
    other_keys = [k for k in tensors if k not in linear_keys]

    total_packed = total_orig = 0
    avg_mags = []
    for key in linear_keys:
        w = tensors[key]
        total_orig += w.nbytes
        start = time.time()
        mags, sign_words, scales, row_max, avg_mag, _zero_frac = quantize_packed(w, n_levels)
        elapsed = time.time() - start
        # Tensor name doubles as the file stem, with dots flattened.
        stem = os.path.join(output_dir, key.replace(".", "_"))
        mags.tofile(stem + ".mags")
        sign_words.tofile(stem + ".signs")
        scales.tofile(stem + ".scales")
        row_max.tofile(stem + ".rmm")
        nbytes = mags.nbytes + sign_words.nbytes + scales.nbytes + row_max.nbytes
        total_packed += nbytes
        avg_mags.append(avg_mag)
        print(f"  {key}: {w.shape} -> {nbytes/1024:.0f}KB (avg_mag={avg_mag:.2f}, {elapsed:.1f}s)")

    total_fp16 = 0
    for key in other_keys:
        half = tensors[key].astype(np.float16)
        half.tofile(os.path.join(output_dir, key.replace(".", "_")) + ".fp16")
        total_fp16 += half.nbytes

    manifest = {"packed":{k:list(tensors[k].shape) for k in linear_keys},
                "fp16":{k:list(tensors[k].shape) for k in other_keys}}
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    print(f"\n=== PACKED UNARY ===")
    print(f"Packed linear: {total_packed/1e6:.1f} MB | FP16 other: {total_fp16/1e6:.1f} MB")
    print(f"Total: {(total_packed+total_fp16)/1e6:.1f} MB | Avg mag: {np.mean(avg_mags):.3f}")
    print(f"Expected speedup vs 7-plane: {7/np.mean(avg_mags):.1f}x")

if __name__ == "__main__":
    # Positional CLI args: [model_dir] [output_dir]; both optional.
    argv = sys.argv
    model_dir = argv[1] if len(argv) > 1 else "deepseek-r1-1.5b-hf"
    output_dir = argv[2] if len(argv) > 2 else "deepseek-r1-1.5b-packed"
    convert(load_safetensors(model_dir), output_dir)
    print("Done!")