#!/bin/bash
# Build and deploy ternary inference engine
# (c) 2026 OpenTransformers Ltd / Scott Bisset
#
# Builds the C kernel, fetches the Hugging Face checkpoint, converts it to
# the ternary format and runs a short timing test. The script changes into
# $WORKDIR and reads ternary_kernel.c, convert.py and inference.py from there.

# -e: abort on the first failing step; -u: fail on a misspelled variable name.
set -eu

# Fixed locations; readonly guards against accidental reassignment later on.
readonly WORKDIR=/root/ternary_engine
readonly MODEL_HF=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
readonly MODEL_HF_DIR="$WORKDIR/deepseek-r1-1.5b-hf"
readonly TERNARY_DIR="$WORKDIR/deepseek-r1-1.5b-ternary"

echo "=== Ternary Inference Engine Build ==="
echo "Target: AVX-512 Skylake"
echo ""

# All later steps use paths relative to the working directory.
mkdir -p -- "$WORKDIR"
cd -- "$WORKDIR"
# Step 1: Compile C kernel with AVX-512
echo "[1/4] Compiling AVX-512 kernel..."
# Libraries are listed after the source file: the linker searches them in
# command-line order, so a leading -lm can be discarded (e.g. under
# --as-needed) before ternary_kernel.c has referenced anything from libm.
gcc -O3 -march=skylake-avx512 -mavx512f -mavx512bw -mavx512dq -mavx512vl \
  -shared -fPIC \
  -o ternary_kernel.so ternary_kernel.c \
  -lm
echo " -> ternary_kernel.so built"
ls -lh ternary_kernel.so
# Step 2: Download model from HuggingFace
echo ""
echo "[2/4] Downloading model weights..."
# huggingface_hub is imported by the download snippet, so install it
# explicitly. stderr is left visible: under `set -e` a failed install aborts
# the script, and the reason should not be thrown away.
pip install --break-system-packages -q safetensors tokenizers huggingface_hub

# The repo id and target directory are passed as arguments instead of being
# spliced into the Python source, so special characters in them cannot break
# the snippet.
python3 -c '
import sys

from huggingface_hub import snapshot_download

repo_id, local_dir = sys.argv[1:3]
snapshot_download(repo_id, local_dir=local_dir,
                  ignore_patterns=["*.md", "*.txt", "figures/*"])
print("Download complete")
' "$MODEL_HF" "$MODEL_HF_DIR"
# Step 3: Convert to ternary
# Blank separator line followed by the progress banner.
printf '\n%s\n' "[3/4] Converting to ternary format..."
# convert.py receives: source checkpoint dir, output dir, and the literal 0.7
# (its meaning is defined by convert.py).
python3 convert.py "${MODEL_HF_DIR}" "${TERNARY_DIR}" 0.7
# Step 4: Verify
echo ""
echo "[4/4] Verifying..."
# Show at most the first 20 lines of the listing of the converted output.
# Expansions are quoted so the path is always passed as a single argument.
ls -lh -- "$TERNARY_DIR/" | head -20
echo ""
# Total on-disk size of the converted model.
du -sh -- "$TERNARY_DIR/"
echo ""
# Quick test
echo "Running speed test..."
# Times a single-token forward pass five times and prints the average.
# The kernel and model paths are passed as arguments rather than spliced into
# the Python source. The snippet therefore uses only double-quoted strings,
# because it sits inside a single-quoted shell word.
python3 -c '
import sys
import time

from inference import KVCache, TernaryQwen, load_kernel

kernel_path, model_dir = sys.argv[1:3]
token_id = 9707  # "Hello" according to the original script comment
runs = 5

kernel = load_kernel(kernel_path)
model = TernaryQwen(model_dir, kernel)


def new_cache():
    # Every pass gets its own cache so runs do not share state.
    return KVCache(model.n_layers, model.n_kv, model.head_dim)


# Warm up: one untimed forward pass
model.forward_token(token_id, new_cache(), 0)

# Benchmark single token
times = []
for _ in range(runs):
    cache = new_cache()
    # perf_counter is monotonic and high resolution; wall-clock time is not.
    t0 = time.perf_counter()
    model.forward_token(token_id, cache, 0)
    times.append(time.perf_counter() - t0)

avg = sum(times) / len(times)
rate = 1 / avg if avg > 0 else float("inf")  # guard against a zero reading
print(f"Single token forward: {avg*1000:.1f}ms ({rate:.1f} tok/s)")
formatted = [f"{t*1000:.1f}ms" for t in times]
print(f"Times: {formatted}")
' "$WORKDIR/ternary_kernel.so" "$TERNARY_DIR"
# Final banner plus the command line for launching the server by hand.
echo ""
echo "=== Build complete ==="
echo "To start server: cd $WORKDIR && TERNARY_MODEL_DIR=$TERNARY_DIR TOKENIZER_DIR=$MODEL_HF_DIR python3 server.py"