File size: 2,271 Bytes
19ed98b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/bin/bash
# Build and deploy ternary inference engine
# (c) 2026 OpenTransformers Ltd / Scott Bisset

# Abort on the first failing command and on any reference to an unset variable.
set -eu

# Every later step runs from WORKDIR and refers to ternary_kernel.c and
# convert.py by relative path, so those files must already be present there.
WORKDIR=/root/ternary_engine
# HuggingFace repo id of the checkpoint that gets downloaded.
MODEL_HF=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
# Local directory receiving the HuggingFace snapshot.
MODEL_HF_DIR=$WORKDIR/deepseek-r1-1.5b-hf
# Output directory handed to convert.py for the converted weights.
TERNARY_DIR=$WORKDIR/deepseek-r1-1.5b-ternary

echo "=== Ternary Inference Engine Build ==="
echo "Target: AVX-512 Skylake"
echo ""

# Quote the path so the script keeps working if WORKDIR is ever changed to a
# location containing spaces or glob characters.
mkdir -p -- "$WORKDIR"
cd -- "$WORKDIR" || { echo "error: cannot enter $WORKDIR" >&2; exit 1; }

# Step 1: Compile C kernel with AVX-512
echo "[1/4] Compiling AVX-512 kernel..."
# Fail with a clear message instead of a raw gcc error when the source file
# has not been copied into the working directory.
if [[ ! -f ternary_kernel.c ]]; then
  echo "error: ternary_kernel.c not found in $PWD" >&2
  exit 1
fi
# The linker resolves libraries in command-line order, so -lm must follow the
# source file that uses it; placed first, libm can be dropped (e.g. with
# --as-needed toolchains) and the .so then fails to load with undefined
# math symbols.
gcc -O3 -march=skylake-avx512 -mavx512f -mavx512bw -mavx512dq -mavx512vl \
    -shared -fPIC \
    -o ternary_kernel.so ternary_kernel.c \
    -lm
echo "  -> ternary_kernel.so built"
ls -lh ternary_kernel.so

# Step 2: Download model from HuggingFace
echo ""
echo "[2/4] Downloading model weights..."
# huggingface_hub is imported by the download snippet below, so install it
# explicitly rather than relying on it arriving as a transitive dependency.
# stderr is left visible so a failed install reports its reason before
# `set -e` aborts the script.
pip install --break-system-packages -q safetensors tokenizers huggingface_hub
# The repo id and target directory are passed as arguments rather than being
# interpolated into the Python source, so quotes or backslashes in either
# value cannot break (or inject into) the snippet.
python3 - "$MODEL_HF" "$MODEL_HF_DIR" <<'PY'
import sys

from huggingface_hub import snapshot_download

repo_id, local_dir = sys.argv[1], sys.argv[2]
# Documentation, text files and figures are excluded from the snapshot.
snapshot_download(repo_id, local_dir=local_dir,
    ignore_patterns=['*.md', '*.txt', 'figures/*'])
print('Download complete')
PY

# Step 3: Convert to ternary
echo ""
echo "[3/4] Converting to ternary format..."
# 0.7 is forwarded to convert.py as its third argument; presumably the
# ternarization threshold -- TODO confirm against convert.py.
python3 convert.py "$MODEL_HF_DIR" "$TERNARY_DIR" 0.7

# Step 4: Verify
echo ""
echo "[4/4] Verifying..."
# Report a missing output directory explicitly instead of letting it surface
# later as an opaque `du` failure.
if [[ ! -d "$TERNARY_DIR" ]]; then
  echo "error: conversion did not create $TERNARY_DIR" >&2
  exit 1
fi
# Show at most the first 20 entries, then the total size on disk.
ls -lh "$TERNARY_DIR"/ | head -20
echo ""
du -sh "$TERNARY_DIR"/
echo ""

# Quick test
echo "Running speed test..."
# Loads the freshly built kernel and converted model, then times five
# single-token forward passes. Paths are passed as arguments (not spliced into
# the Python source) so special characters in them cannot corrupt the snippet.
# The `inference` module is imported from the current directory.
python3 - "$WORKDIR/ternary_kernel.so" "$TERNARY_DIR" <<'PY'
import sys
import time

from inference import KVCache, TernaryQwen, load_kernel

kernel_path, model_dir = sys.argv[1], sys.argv[2]
kernel = load_kernel(kernel_path)
model = TernaryQwen(model_dir, kernel)

TOKEN_ID = 9707  # 'Hello'


def fresh_cache():
    """Build a new, empty KV cache sized from the loaded model."""
    return KVCache(model.n_layers, model.n_kv, model.head_dim)


# Warm up
model.forward_token(TOKEN_ID, fresh_cache(), 0)

# Benchmark single token; each run gets its own cache so no state carries
# over between measurements. perf_counter is monotonic and high-resolution,
# unlike the wall clock, which can jump.
times = []
for _ in range(5):
    cache = fresh_cache()
    t0 = time.perf_counter()
    model.forward_token(TOKEN_ID, cache, 0)
    times.append(time.perf_counter() - t0)

avg = sum(times) / len(times)
print(f'Single token forward: {avg*1000:.1f}ms ({1/avg:.1f} tok/s)')
print(f'Times: {[f"{t*1000:.1f}ms" for t in times]}')
PY

# Closing summary: a blank separator line, the completion banner, and the
# command line that launches the server against the converted model.
printf '\n'
printf '%s\n' "=== Build complete ==="
printf '%s\n' "To start server: cd $WORKDIR && TERNARY_MODEL_DIR=$TERNARY_DIR TOKENIZER_DIR=$MODEL_HF_DIR python3 server.py"