File size: 3,863 Bytes
19ed98b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import ctypes, numpy as np, os, time

# Directory holding the exported unary-quantized weights
# (raw .fp16 / .sign / .planes / .scales files, one per tensor).
MODEL_DIR = "deepseek-r1-1.5b-unary"
# HF checkpoint directory — used only to load the tokenizer at the end.
HF_DIR = "deepseek-r1-1.5b-hf"
# The C inference engine. Every restype/argtypes declaration below must
# match the C function signatures exactly; ctypes cannot check this itself.
lib = ctypes.CDLL("./unary_engine.so")

# model_alloc(n_planes) -> opaque model handle
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]
# Weight setters: (model_handle, raw data pointer, ...). The numpy arrays
# backing these pointers must be kept alive Python-side (see keep()).
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
# lm_head setter: trailing ints are passed as (151936, 1536) below —
# presumably (vocab_size, hidden_dim); confirm against the C header.
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
# Per-layer setters: (model, layer_index, ...pointers...)
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]
# 7 linear projections per layer, each contributing
# (sign ptr, planes ptr, scales ptr, out_dim, in_dim), then the plane count.
lib.layer_set_linears.argtypes = [ctypes.c_void_p, ctypes.c_int] + [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]*7 + [ctypes.c_int]
# forward_token(model, token_id, position) -> pointer (unused by this script)
lib.forward_token.restype = ctypes.c_void_p
lib.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
# generate(model, prompt_ptr, prompt_len, out_ptr, max_new_tokens,
#          temperature, <float — likely top-p, confirm>, <int — likely stop
#          token id, confirm>) -> number of tokens generated
lib.generate.restype = ctypes.c_int
lib.generate.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_float, ctypes.c_float, ctypes.c_int]
lib.model_reset_cache.argtypes = [ctypes.c_void_p]

_refs = []
def keep(a):
    _refs.append(a)
    return a.ctypes.data

# Quantization bit-planes per weight tensor and transformer depth.
N_PLANES = 7
N_LAYERS = 28

# Per-layer projection names, in the exact order layer_set_linears expects.
PROJS = [
    'self_attn_q_proj',
    'self_attn_k_proj',
    'self_attn_v_proj',
    'self_attn_o_proj',
    'mlp_gate_proj',
    'mlp_up_proj',
    'mlp_down_proj',
]

# Projection name -> (out_features, in_features).
DIMS = {
    'self_attn_q_proj': (1536, 1536),
    'self_attn_k_proj': (256, 1536),
    'self_attn_v_proj': (256, 1536),
    'self_attn_o_proj': (1536, 1536),
    'mlp_gate_proj': (8960, 1536),
    'mlp_up_proj': (8960, 1536),
    'mlp_down_proj': (1536, 8960),
}

# Allocate the C-side model and stream every weight file into it.
m = lib.model_alloc(N_PLANES)

# Embedding table and LM head stay as raw fp16 bit patterns (loaded as
# uint16); the C engine reinterprets the bits itself. Norm weights are
# widened to float32 before being handed over.
e = np.fromfile(os.path.join(MODEL_DIR, 'model_embed_tokens_weight.fp16'), dtype=np.uint16)
lib.model_set_embed(m, keep(e))
n = np.fromfile(os.path.join(MODEL_DIR, 'model_norm_weight.fp16'), dtype=np.float16).astype(np.float32)
lib.model_set_final_norm(m, keep(n))
h = np.fromfile(os.path.join(MODEL_DIR, 'lm_head_weight.fp16'), dtype=np.uint16)
lib.model_set_lm_head(m, keep(h), 151936, 1536)

# `layer` instead of `l` (PEP 8 E741 ambiguous name).
for layer in range(N_LAYERS):
    inorm = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{layer}_input_layernorm_weight.fp16'), dtype=np.float16).astype(np.float32)
    pnorm = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{layer}_post_attention_layernorm_weight.fp16'), dtype=np.float16).astype(np.float32)
    lib.layer_set_norms(m, layer, keep(inorm), keep(pnorm))
    # Only q/k/v carry biases in this export.
    qb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{layer}_self_attn_q_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    kb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{layer}_self_attn_k_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    vb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{layer}_self_attn_v_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    lib.layer_set_bias(m, layer, keep(qb), keep(kb), keep(vb))
    # Flatten each projection's (sign, planes, scales, out_dim, in_dim)
    # quintuple into one argument list for layer_set_linears.
    args = []
    for proj in PROJS:
        base = os.path.join(MODEL_DIR, f'model_layers_{layer}_{proj}_weight')
        sign = np.fromfile(base + '.sign', dtype=np.uint64)
        planes = np.fromfile(base + '.planes', dtype=np.uint64)
        scales = np.fromfile(base + '.scales', dtype=np.float32)
        # out_dim/in_dim instead of the original `od, id`, which shadowed
        # the builtin id().
        out_dim, in_dim = DIMS[proj]
        args.extend([keep(sign), keep(planes), keep(scales), out_dim, in_dim])
    lib.layer_set_linears(m, layer, *args, N_PLANES)

# --- Smoke test: greedy-decode a few tokens from a bare BOS prompt --------
print("Loaded. Testing generate with greedy (temp=0)...")
lib.model_reset_cache(m)

prompt = np.array([1], dtype=np.int32)  # prompt is just the BOS token
out = np.zeros(8, dtype=np.int32)       # room for up to 8 generated ids

start = time.time()
ng = lib.generate(
    m,
    prompt.ctypes.data, 1,
    out.ctypes.data, 8,
    ctypes.c_float(0.0),   # temperature 0 -> greedy (per the print above)
    ctypes.c_float(0.9),   # presumably top-p; unused when greedy — confirm
    151643,                # presumably the stop/EOS token id — confirm
)
dt = time.time() - start
print(f"Generated {ng} tokens in {dt:.1f}s = {ng/dt:.1f} tok/s")
print(f"Token IDs: {out[:ng].tolist()}")

# Tokenizer is only needed for decoding, so the heavy import is deferred
# until after generation succeeds.
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)
print(f"Text: {tok.decode(out[:ng].tolist())}")