File size: 4,415 Bytes
#!/usr/bin/env python3
"""Test the full-unary popcount engine."""
import ctypes, numpy as np, os, time, sys
# Default thread count for OpenMP; an OMP_NUM_THREADS already exported by the
# caller takes precedence. This runs before the shared library is loaded below
# (presumably the engine's OpenMP runtime reads it at load/first use - confirm).
os.environ.setdefault("OMP_NUM_THREADS", "16")

# Command line: [MODEL_DIR [N_PLANES]]
MODEL_DIR = sys.argv[1] if len(sys.argv) > 1 else "deepseek-r1-1.5b-unary4"
HF_DIR = "deepseek-r1-1.5b-hf"
N_PLANES = int(sys.argv[2]) if len(sys.argv) > 2 else 4

# Path is relative to the current working directory.
lib = ctypes.CDLL("./unary_full.so")

_VP = ctypes.c_void_p
_INT = ctypes.c_int
_F32 = ctypes.c_float

# Explicit signatures so pointers are passed at full width instead of being
# truncated to ctypes' default C int.
lib.model_alloc.restype = _VP
lib.model_alloc.argtypes = [_INT]
lib.model_set_embed.argtypes = [_VP, _VP]
lib.model_set_final_norm.argtypes = [_VP, _VP]
lib.model_set_lm_head.argtypes = [_VP, _VP, _INT, _INT]
lib.layer_set_norms.argtypes = [_VP, _INT, _VP, _VP]
lib.layer_set_bias.argtypes = [_VP, _INT, _VP, _VP, _VP]

# (model, layer index), then seven groups of (three pointers, two ints) - one
# group per projection as passed by the loader in this file - then a final int.
lib.layer_set_linears.argtypes = (
    [_VP, _INT] + [_VP, _VP, _VP, _INT, _INT] * 7 + [_INT]
)

# As called in this file: model, prompt-id buffer, prompt length, output
# buffer, output capacity, two float sampling parameters, EOS token id.
lib.generate.restype = _INT
lib.generate.argtypes = [
    _VP, _VP, _INT,
    _VP, _INT,
    _F32, _F32, _INT,
]
lib.model_reset_cache.argtypes = [_VP]
lib.model_free.argtypes = [_VP]
# Arrays whose buffers have been handed to the C library. Holding them here
# stops NumPy from freeing memory the library may still point at.
_refs = []


def keep(a):
    """Pin *a* for the lifetime of the process and return its data address.

    The array is first normalised with ``np.ascontiguousarray`` so the
    returned address always refers to a dense C-ordered buffer. An array that
    is already C-contiguous is pinned as-is (no copy); a strided view is
    copied and the copy is what gets pinned.

    Intended for read-only inputs: if a copy is made, writes performed through
    the returned address are not visible in the caller's original array.

    Args:
        a: NumPy array (or array-like) to expose to native code.

    Returns:
        int: address of the first element of the pinned buffer.
    """
    a = np.ascontiguousarray(a)
    _refs.append(a)
    return a.ctypes.data
# Sizes hard-wired for the checkpoint this script drives.
N_LAYERS = 28
VOCAB_SIZE = 151936
HIDDEN_SIZE = 1536


def _load_fp16_as_f32(name):
    """Read MODEL_DIR/<name> as raw float16 values and widen them to float32."""
    path = os.path.join(MODEL_DIR, name)
    return np.fromfile(path, dtype=np.float16).astype(np.float32)


print(f"Loading model from {MODEL_DIR} (w_planes={N_PLANES})...")
m = lib.model_alloc(N_PLANES)
if not m:
    # A c_void_p restype turns a NULL return into None; passing that on to the
    # setters below would hand the library a NULL model pointer.
    raise MemoryError("model_alloc returned NULL")

# Embed + final norm + lm_head
# The embedding and lm_head files are handed over as raw 16-bit words
# (uint16), i.e. without converting the fp16 payload on the Python side.
embed = np.fromfile(os.path.join(MODEL_DIR, "model_embed_tokens_weight.fp16"), dtype=np.uint16)
lib.model_set_embed(m, keep(embed))
final_norm = _load_fp16_as_f32("model_norm_weight.fp16")
lib.model_set_final_norm(m, keep(final_norm))
lm_head = np.fromfile(os.path.join(MODEL_DIR, "lm_head_weight.fp16"), dtype=np.uint16)
lib.model_set_lm_head(m, keep(lm_head), VOCAB_SIZE, HIDDEN_SIZE)

# Projection order must match the argument order of layer_set_linears.
PROJS = ["self_attn_q_proj", "self_attn_k_proj", "self_attn_v_proj",
         "self_attn_o_proj", "mlp_gate_proj", "mlp_up_proj", "mlp_down_proj"]
# Two integer dimensions per projection, forwarded verbatim to the library
# (presumably (output width, input width) - confirm against the C side).
DIMS = {
    "self_attn_q_proj": (1536, 1536), "self_attn_k_proj": (256, 1536),
    "self_attn_v_proj": (256, 1536), "self_attn_o_proj": (1536, 1536),
    "mlp_gate_proj": (8960, 1536), "mlp_up_proj": (8960, 1536),
    "mlp_down_proj": (1536, 8960),
}

for layer in range(N_LAYERS):
    prefix = f"model_layers_{layer}_"

    input_norm = _load_fp16_as_f32(prefix + "input_layernorm_weight.fp16")
    post_attn_norm = _load_fp16_as_f32(prefix + "post_attention_layernorm_weight.fp16")
    lib.layer_set_norms(m, layer, keep(input_norm), keep(post_attn_norm))

    q_bias = _load_fp16_as_f32(prefix + "self_attn_q_proj_bias.fp16")
    k_bias = _load_fp16_as_f32(prefix + "self_attn_k_proj_bias.fp16")
    v_bias = _load_fp16_as_f32(prefix + "self_attn_v_proj_bias.fp16")
    lib.layer_set_bias(m, layer, keep(q_bias), keep(k_bias), keep(v_bias))

    # Flattened (sign, planes, scales, dim, dim) group for each projection.
    linear_args = []
    for proj in PROJS:
        base = os.path.join(MODEL_DIR, f"{prefix}{proj}_weight")
        sign = np.fromfile(base + ".sign", dtype=np.uint64)
        planes = np.fromfile(base + ".planes", dtype=np.uint64)
        scales = np.fromfile(base + ".scales", dtype=np.float32)
        dim_first, dim_second = DIMS[proj]
        linear_args.extend([keep(sign), keep(planes), keep(scales), dim_first, dim_second])
    lib.layer_set_linears(m, layer, *linear_args, N_PLANES)

    if (layer + 1) % 7 == 0:
        print(f" Layer {layer + 1}/{N_LAYERS}")

print("Model loaded!")
# Imported here rather than at the top of the file, so the model files are
# loaded before transformers is imported.
from transformers import AutoTokenizer

# Capacity of the output buffer and the token budget passed to generate().
MAX_NEW_TOKENS = 30

try:
    # NOTE(review): trust_remote_code=True lets the tokenizer repo execute its
    # own Python code; only point HF_DIR at a source you trust.
    tok = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)
    msg = [{"role": "user", "content": "What is 2+2?"}]
    # NOTE(review): assumes this returns a flat list of token ids - confirm
    # for the installed transformers version.
    ids = tok.apply_chat_template(msg, add_generation_prompt=True)
    arr = np.array(ids, dtype=np.int32)
    out = np.zeros(MAX_NEW_TOKENS, dtype=np.int32)

    lib.model_reset_cache(m)
    print(f"Prompt: {len(ids)} tokens, generating {MAX_NEW_TOKENS}...")

    # perf_counter is monotonic, so the interval cannot be skewed by wall
    # clock adjustments.
    t0 = time.perf_counter()
    # The two floats are presumably temperature and top-p - confirm on the C side.
    n = lib.generate(m, arr.ctypes.data, len(ids), out.ctypes.data, MAX_NEW_TOKENS,
                     ctypes.c_float(0.6), ctypes.c_float(0.9), tok.eos_token_id)
    dt = time.perf_counter() - t0

    # n is used as a slice bound into `out`; reject values outside the buffer
    # instead of silently decoding the wrong tokens.
    if not 0 <= n <= MAX_NEW_TOKENS:
        raise RuntimeError(f"generate returned unexpected token count {n}")

    text = tok.decode(out[:n].tolist(), skip_special_tokens=False)
    rate = n / dt if dt > 0 else float("inf")
    print(f"\n=== {n} tokens, {dt:.1f}s, {rate:.1f} tok/s ===")
    print(text)
    print("===")
finally:
    # Release the native model even when tokenisation or generation fails.
    lib.model_free(m)
|