| | |
| | """Log-unary model loader. (c) 2026 OpenTransformers Ltd""" |
| | import ctypes, numpy as np, os, sys, json, time |
| |
|
def load_and_run(model_dir, prompt, max_tokens=32, temperature=0.0, top_p=0.9, a_planes=4):
    """Load a log-unary model into the native engine, generate, and print the text.

    Reads ``config.json`` and ``manifest.json`` from *model_dir*, loads
    ``log_unary_engine.so`` from the directory that contains *model_dir*,
    hands every weight file to the engine, tokenizes *prompt* with the
    ``transformers`` tokenizer stored in *model_dir*, and calls the engine's
    ``generate`` entry point. Progress and the decoded text are printed;
    nothing is returned.

    Args:
        model_dir: Directory holding the config, manifest, tokenizer files and
            the ``.fp16`` / ``.sign`` / ``.planes`` / ``.scales`` weight files.
        prompt: Text to encode and pass to the engine.
        max_tokens: Length of the output token buffer given to ``generate``.
        temperature: Value forwarded to ``generate`` as a C float.
        top_p: Value forwarded to ``generate`` as a C float.
        a_planes: Activation plane count forwarded to ``model_alloc``.

    Raises:
        MemoryError: If ``model_alloc`` returns a NULL handle.
        OSError: If the shared library, a JSON file, or a weight file cannot
            be opened.
        KeyError: If a required config or manifest key is missing.
    """

    def read_json(filename):
        # Use a context manager so the file handle is closed deterministically.
        with open(os.path.join(model_dir, filename)) as fh:
            return json.load(fh)

    config = read_json("config.json")
    manifest = read_json("manifest.json")

    w_planes = manifest["n_planes"]
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]
    inter = config["intermediate_size"]
    n_heads = config["num_attention_heads"]
    n_kv_heads = config["num_key_value_heads"]
    head_dim = config.get("head_dim", hidden // n_heads)
    vocab = config["vocab_size"]
    rope_theta = config.get("rope_theta", 10000.0)
    tie = 1 if config.get("tie_word_embeddings", False) else 0

    w_max = (1 << w_planes) - 1
    a_max = (1 << a_planes) - 1
    print(f"Config: {n_layers}L hidden={hidden} inter={inter} heads={n_heads}/{n_kv_heads}")
    print(f"Weight: {w_planes} log-planes ({2*w_max+1} levels)")
    print(f"Activation: {a_planes} log-planes ({2*a_max+1} levels)")
    print(f"Plane pairs: {w_planes * a_planes}")

    # The engine library sits next to the model directory, not inside it.
    engine = os.path.join(os.path.dirname(os.path.abspath(model_dir)), "log_unary_engine.so")
    lib = ctypes.CDLL(engine)

    u16p = ctypes.POINTER(ctypes.c_uint16)
    f32p = ctypes.POINTER(ctypes.c_float)
    u64p = ctypes.POINTER(ctypes.c_uint64)

    # Declare native signatures so ctypes converts and checks every argument.
    lib.model_alloc.restype = ctypes.c_void_p
    lib.model_alloc.argtypes = [ctypes.c_int] * 9 + [ctypes.c_float, ctypes.c_int]
    lib.forward_token.restype = ctypes.POINTER(ctypes.c_float)
    lib.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
    lib.generate.restype = ctypes.c_int
    lib.generate.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int), ctypes.c_int,
                             ctypes.POINTER(ctypes.c_int), ctypes.c_int,
                             ctypes.c_float, ctypes.c_float, ctypes.c_int]
    lib.model_set_embed.argtypes = [ctypes.c_void_p, u16p]
    lib.model_set_final_norm.argtypes = [ctypes.c_void_p, f32p]
    lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    lib.layer_set_qk_norm.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    # Seven projections, each passed as (sign, planes, scales, int, int),
    # followed by the weight plane count.
    lib.layer_set_linears.argtypes = (
        [ctypes.c_void_p, ctypes.c_int]
        + [u64p, u64p, f32p, ctypes.c_int, ctypes.c_int] * 7
        + [ctypes.c_int]
    )

    print("Allocating...")
    model = lib.model_alloc(w_planes, a_planes, hidden, inter, n_heads, n_kv_heads,
                            head_dim, n_layers, vocab, rope_theta, tie)
    if not model:
        # A c_void_p restype yields None for NULL; fail here instead of
        # passing a null handle to every subsequent native call.
        raise MemoryError("model_alloc returned NULL")

    # Holds every array whose buffer was handed to the engine so it is not
    # garbage-collected during this call.
    # NOTE(review): presumably the engine stores these pointers rather than
    # copying the data -- confirm against the C source.
    keepalive = []

    def tensor_path(name, ext):
        # On-disk file names use "_" wherever the tensor name has ".".
        return os.path.join(model_dir, name.replace(".", "_") + ext)

    def load_fp16(name):
        # Raw 16-bit words, passed through without conversion.
        raw = np.fromfile(tensor_path(name, ".fp16"), dtype=np.uint16)
        keepalive.append(raw)
        return raw.ctypes.data_as(u16p)

    def load_f32(name):
        # Stored as fp16 on disk; widened to float32 before being handed over.
        raw = np.fromfile(tensor_path(name, ".fp16"), dtype=np.uint16)
        widened = raw.view(np.float16).astype(np.float32)
        keepalive.append(widened)
        return widened.ctypes.data_as(f32p)

    def load_unary(name):
        sign = np.fromfile(tensor_path(name, ".sign"), dtype=np.uint64)
        planes = np.fromfile(tensor_path(name, ".planes"), dtype=np.uint64)
        scales = np.fromfile(tensor_path(name, ".scales"), dtype=np.float32)
        keepalive.extend([sign, planes, scales])
        return (sign.ctypes.data_as(u64p),
                planes.ctypes.data_as(u64p),
                scales.ctypes.data_as(f32p))

    lib.model_set_embed(model, load_fp16("model.embed_tokens.weight"))
    lib.model_set_final_norm(model, load_f32("model.norm.weight"))

    print(f"Loading {n_layers} layers...")
    unary_manifest = manifest["unary"]
    projections = ("self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj",
                   "self_attn.o_proj", "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj")
    for layer in range(n_layers):
        prefix = f"model.layers.{layer}"
        lib.layer_set_norms(model, layer,
                            load_f32(f"{prefix}.input_layernorm.weight"),
                            load_f32(f"{prefix}.post_attention_layernorm.weight"))

        # Q/K norm weights are optional; only the q_norm file is probed and
        # the k_norm file is expected to exist alongside it.
        if os.path.exists(tensor_path(f"{prefix}.self_attn.q_norm.weight", ".fp16")):
            lib.layer_set_qk_norm(model, layer,
                                  load_f32(f"{prefix}.self_attn.q_norm.weight"),
                                  load_f32(f"{prefix}.self_attn.k_norm.weight"))

        linear_args = [model, layer]
        for proj in projections:
            key = f"{prefix}.{proj}.weight"
            sign_ptr, planes_ptr, scales_ptr = load_unary(key)
            # The manifest stores two integers per tensor
            # (presumably its dimensions -- TODO confirm).
            entry = unary_manifest[key]
            linear_args.extend([sign_ptr, planes_ptr, scales_ptr, entry[0], entry[1]])
        linear_args.append(w_planes)
        lib.layer_set_linears(*linear_args)

        if (layer + 1) % 12 == 0 or layer == n_layers - 1:
            print(f" Layer {layer+1}/{n_layers}")

    print("Tokenizing...")
    # Imported lazily: transformers is only needed once the weights are loaded.
    from transformers import AutoTokenizer
    tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    ids = tok.encode(prompt)
    print(f"Prompt: {len(ids)} tokens")

    eos = config.get("eos_token_id", 151645)
    prompt_buf = (ctypes.c_int * len(ids))(*ids)
    out_buf = (ctypes.c_int * max_tokens)()

    print(f"\nGenerating (w={w_planes}log a={a_planes}log pairs={w_planes*a_planes})...")
    # perf_counter is monotonic, so the elapsed time can never be negative.
    t0 = time.perf_counter()
    n = lib.generate(model, prompt_buf, len(ids), out_buf, max_tokens,
                     ctypes.c_float(temperature), ctypes.c_float(top_p), eos)
    dt = time.perf_counter() - t0
    # Guard the throughput figure against a zero-length measurement.
    rate = n / dt if dt > 0 else float("inf")

    text = tok.decode([out_buf[i] for i in range(n)], skip_special_tokens=True)
    print(f"\n=== LOG-UNARY ({n} tok in {dt:.1f}s = {rate:.2f} tok/s) ===")
    print(text)
    print(f"\nDecode: {rate:.2f} tok/s")
| |
|
if __name__ == "__main__":
    # Usage: script [model_dir] [prompt] [max_tokens] [a_planes]
    # Every positional argument is optional and falls back to a default.
    cli = sys.argv[1:]
    model_dir_arg = cli[0] if len(cli) >= 1 else "qwen3-4b-log-unary"
    prompt_arg = cli[1] if len(cli) >= 2 else "What is 2+2? Think step by step."
    max_new = int(cli[2]) if len(cli) >= 3 else 32
    act_planes = int(cli[3]) if len(cli) >= 4 else 4
    load_and_run(model_dir_arg, prompt_arg, max_new, a_planes=act_planes)
| |
|