File size: 6,271 Bytes
19ed98b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/env python3
"""
Convert Qwen3-4B BF16 safetensors → Proper Unary.
Reads safetensors raw bytes (no framework dependency for BF16).
(c) 2026 OpenTransformers Ltd / Scott Bisset
"""
import numpy as np
import json, os, sys, gc, shutil, struct, time

class SafeTensorReader:
    """Read tensors from a .safetensors file one at a time (memory efficient).

    File layout: an 8-byte little-endian header length, a JSON header mapping
    tensor names to ``{dtype, shape, data_offsets}``, then the raw tensor
    bytes. ``data_offsets`` are relative to the start of the data section.

    Also usable as a context manager so the file handle is closed even when
    conversion raises part-way through.
    """

    def __init__(self, path):
        self.f = open(path, "rb")
        # First 8 bytes: size of the JSON header that follows.
        header_size = struct.unpack("<Q", self.f.read(8))[0]
        self.header = json.loads(self.f.read(header_size).decode("utf-8"))
        self.data_start = 8 + header_size
        # "__metadata__" is file-level bookkeeping, not a tensor entry.
        self._meta = {k: v for k, v in self.header.items() if k != "__metadata__"}

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def keys(self):
        """Return the list of tensor names present in the file."""
        return list(self._meta.keys())

    def get(self, name):
        """Load one tensor by name and return it as a float32 ndarray.

        Raises:
            KeyError: if ``name`` is not in the file.
            ValueError: for dtypes other than BF16/F16/F32.
        """
        meta = self._meta[name]
        dtype = meta["dtype"]
        shape = tuple(meta["shape"])
        start, end = meta["data_offsets"]
        self.f.seek(self.data_start + start)
        raw = self.f.read(end - start)

        if dtype == "BF16":
            # BF16 is the top 16 bits of an IEEE float32: widen and shift up.
            u16 = np.frombuffer(raw, dtype=np.uint16)
            u32 = u16.astype(np.uint32) << 16
            return u32.view(np.float32).reshape(shape)
        elif dtype == "F16":
            # astype() already produces a fresh writable array.
            return np.frombuffer(raw, dtype=np.float16).reshape(shape).astype(np.float32)
        elif dtype == "F32":
            # .copy() so the result is writable (frombuffer views are read-only).
            return np.frombuffer(raw, dtype=np.float32).reshape(shape).copy()
        else:
            raise ValueError(f"Unknown dtype {dtype}")

    def close(self):
        """Close the underlying file handle."""
        self.f.close()

def encode_proper_unary(weight_f32, K):
    """Encode a 2D float32 matrix into proper-unary bit planes.

    Each row is scaled so its absmax maps to K; each weight magnitude m
    (0..K) sets bit j in the first m slot planes. Sign bits are stored
    separately. Columns are packed 64-per-uint64 word ("chunk").

    Args:
        weight_f32: (rows, cols) float32 weight matrix.
        K: number of unary slots (quantization resolution).

    Returns:
        sign_bits:   (rows, chunks) uint64 — bit j set iff weight < 0.
        slot_planes: (K, rows, chunks) uint64 — plane s has bit j set iff
                     the quantized magnitude of weight[r, j] exceeds s.
        row_scales:  (rows,) float32 per-row dequantization step.
    """
    rows, cols = weight_f32.shape
    chunks = (cols + 63) // 64

    # Per-row scale: absmax maps to K slots (floored to avoid divide-by-zero).
    row_absmax = np.abs(weight_f32).max(axis=1).astype(np.float32)
    row_absmax = np.maximum(row_absmax, 1e-10)
    row_scales = (row_absmax / K).astype(np.float32)

    inv_scales = K / row_absmax
    magnitudes = np.clip(
        np.round(np.abs(weight_f32) * inv_scales[:, None]).astype(np.int32), 0, K)

    sign_bits = np.zeros((rows, chunks), dtype=np.uint64)
    slot_planes = np.zeros((K, rows, chunks), dtype=np.uint64)

    negative = weight_f32 < 0
    # Planes above the largest magnitude stay all-zero; skip them entirely.
    max_mag = int(magnitudes.max()) if magnitudes.size else 0

    # Pack 64 columns at a time with vectorized bitwise ORs instead of a
    # Python loop over every column (the hot path for large matrices).
    for c in range(chunks):
        lo, hi = c * 64, min((c + 1) * 64, cols)
        bits = np.uint64(1) << np.arange(hi - lo, dtype=np.uint64)

        neg = negative[:, lo:hi]
        sign_bits[:, c] = np.bitwise_or.reduce(
            np.where(neg, bits[None, :], np.uint64(0)), axis=1)

        mag = magnitudes[:, lo:hi]
        for s in range(max_mag):
            active = mag > s
            slot_planes[s, :, c] = np.bitwise_or.reduce(
                np.where(active, bits[None, :], np.uint64(0)), axis=1)

    return sign_bits, slot_planes, row_scales

def convert_model(model_dir, output_dir, K=32):
    """Convert a safetensors model directory to the proper-unary format.

    2D linear-projection weights (q/k/v/o/gate/up/down) are encoded with
    encode_proper_unary; every other tensor is written as raw FP16 bits.
    One .sign/.slots/.scales (or .fp16) file per tensor plus a
    manifest.json are written into output_dir.

    Args:
        model_dir: directory holding config.json and *.safetensors shard(s).
        output_dir: destination directory (created if missing).
        K: number of unary slots per weight (quantization resolution).
    """
    os.makedirs(output_dir, exist_ok=True)

    # Load config up front so a missing/corrupt model dir fails fast.
    # (The parsed config is not otherwise used here.)
    with open(os.path.join(model_dir, "config.json")) as cf:
        config = json.load(cf)  # noqa: F841

    # Copy tokenizer/config sidecar files verbatim when present.
    for side in ["config.json", "tokenizer.json", "tokenizer_config.json",
                 "special_tokens_map.json", "generation_config.json"]:
        src = os.path.join(model_dir, side)
        if os.path.exists(src):
            shutil.copy2(src, output_dir)

    # Multi-shard models ship an index mapping tensor names to shard files.
    index_path = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_path):
        with open(index_path) as inf:
            index = json.load(inf)
        shard_files = sorted(set(index["weight_map"].values()))
    else:
        shard_files = ["model.safetensors"]

    linear_names = ["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"]

    manifest = {"K": K, "format": "proper_unary", "unary": {}, "fp16": []}
    total_linear = 0
    total_size = 0

    for shard_name in shard_files:
        shard_path = os.path.join(model_dir, shard_name)
        print(f"\n=== {shard_name} ===", flush=True)

        reader = SafeTensorReader(shard_path)
        try:
            print(f"  {len(reader.keys())} tensors", flush=True)

            for key in sorted(reader.keys()):
                tensor = reader.get(key)
                fname = key.replace(".", "_")

                is_linear = any(ln + ".weight" in key for ln in linear_names)

                if is_linear and tensor.ndim == 2:
                    rows, cols = tensor.shape
                    t0 = time.time()
                    print(f"  {key}: {rows}x{cols} K={K}...", end="", flush=True)

                    sign_bits, slot_planes, row_scales = encode_proper_unary(tensor, K)
                    dt = time.time() - t0

                    sign_bits.tofile(os.path.join(output_dir, fname + ".sign"))
                    slot_planes.tofile(os.path.join(output_dir, fname + ".slots"))
                    row_scales.tofile(os.path.join(output_dir, fname + ".scales"))

                    manifest["unary"][key] = [rows, cols]
                    sz = sign_bits.nbytes + slot_planes.nbytes + row_scales.nbytes
                    total_size += sz
                    total_linear += 1

                    # Size relative to the BF16 original (2 bytes/weight).
                    ratio = sz / (rows * cols * 2)
                    print(f" {sz/1e6:.1f}MB ({ratio:.1f}x) [{dt:.0f}s]", flush=True)

                    del sign_bits, slot_planes, row_scales
                else:
                    # Non-linear tensors (embeddings, norms, biases): raw FP16 bits.
                    t_f16 = tensor.astype(np.float16)
                    out_data = t_f16.view(np.uint16)
                    out_data.tofile(os.path.join(output_dir, fname + ".fp16"))
                    manifest["fp16"].append(key)
                    sz = out_data.nbytes
                    total_size += sz
                    print(f"  {key}: {tensor.shape} -> FP16 ({sz/1e6:.1f}MB)", flush=True)
                    del t_f16, out_data

                del tensor
        finally:
            # Close the shard even if encoding raised part-way through.
            reader.close()
        gc.collect()

    with open(os.path.join(output_dir, "manifest.json"), "w") as mf:
        json.dump(manifest, mf, indent=2)

    print(f"\n{'='*50}", flush=True)
    print(f"DONE: {total_linear} layers, K={K}", flush=True)
    print(f"Total: {total_size/1e9:.2f} GB (orig ~7.6 GB, ratio {total_size/7.6e9:.1f}x)", flush=True)

if __name__ == "__main__":
    model_dir = sys.argv[1] if len(sys.argv) > 1 else "/root/ternary_engine/qwen3-4b-thinking-hf"
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "/root/ternary_engine/qwen3-4b-proper-unary"
    K = int(sys.argv[3]) if len(sys.argv) > 3 else 32
    convert_model(model_dir, output_dir, K)