#!/bin/bash
#
# Build script for the ternary inference engine. It compiles the AVX-512
# kernel, downloads the model weights, converts them to the ternary format,
# lists the converted output and runs a short single-token speed test.
#
# NOTE(review): every step runs inside $WORKDIR and refers to
# ternary_kernel.c, convert.py and inference.py by relative path, so those
# files are presumably expected to be present there already — confirm.

# Abort on the first failing command.
set -e

# Working directory, Hugging Face repo id, and the two model directories.
WORKDIR=/root/ternary_engine
MODEL_HF=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
MODEL_HF_DIR="$WORKDIR/deepseek-r1-1.5b-hf"
TERNARY_DIR="$WORKDIR/deepseek-r1-1.5b-ternary"

echo "=== Ternary Inference Engine Build ==="
echo "Target: AVX-512 Skylake"
echo ""

# Quoted so the path is never word-split or glob-expanded if WORKDIR changes.
mkdir -p "$WORKDIR"
cd "$WORKDIR"

# Step 1: build the shared library that holds the AVX-512 kernel.
echo "[1/4] Compiling AVX-512 kernel..."
# -lm comes after the source file: the linker resolves libraries in
# command-line order, so a library named before the code that needs it can be
# dropped (e.g. when the toolchain links with --as-needed), leaving libm
# symbols unresolved when the shared object is loaded.
gcc -O3 -march=skylake-avx512 -mavx512f -mavx512bw -mavx512dq -mavx512vl \
    -shared -fPIC \
    -o ternary_kernel.so ternary_kernel.c \
    -lm
echo " -> ternary_kernel.so built"
ls -lh ternary_kernel.so

# Step 2: fetch the Hugging Face checkpoint into $MODEL_HF_DIR.
echo ""
echo "[2/4] Downloading model weights..."
# huggingface_hub is imported directly below, so it is installed explicitly
# instead of relying on another package to pull it in. pip's stderr is
# discarded, so a failure is reported here rather than letting `set -e` end
# the script without any message.
pip install --break-system-packages -q safetensors tokenizers huggingface_hub 2>/dev/null || {
    echo "error: pip install failed" >&2
    exit 1
}
# The repo id and target directory are passed as arguments, and the heredoc
# delimiter is quoted, so the shell never splices text into the Python source.
python3 - "$MODEL_HF" "$MODEL_HF_DIR" <<'PY'
import sys

from huggingface_hub import snapshot_download

repo_id, local_dir = sys.argv[1:3]
# Skip documentation and figure files in the repo snapshot.
snapshot_download(repo_id, local_dir=local_dir,
                  ignore_patterns=['*.md', '*.txt', 'figures/*'])
print('Download complete')
PY

# Step 3: run the converter on the downloaded checkpoint.
printf '\n%s\n' "[3/4] Converting to ternary format..."
# Arguments: source directory, output directory, and 0.7, which is forwarded
# unchanged as convert.py's third argument (its meaning is defined there).
python3 convert.py "$MODEL_HF_DIR" "$TERNARY_DIR" 0.7

# Step 4: show what the conversion produced.
echo ""
echo "[4/4] Verifying..."
# List at most the first 20 lines of the directory listing. The path is quoted
# so it is never word-split or glob-expanded.
ls -lh "$TERNARY_DIR/" | head -20
echo ""
# Total on-disk size of the converted model.
du -sh "$TERNARY_DIR/"
echo ""

# Quick benchmark: time a single-token forward pass on the converted model.
echo "Running speed test..."
# The kernel path and model directory are passed as arguments, and the heredoc
# delimiter is quoted, so the shell performs no expansion inside the Python
# source and no quote escaping is needed.
python3 - "$WORKDIR/ternary_kernel.so" "$TERNARY_DIR" <<'PY'
import sys
import time

from inference import KVCache, TernaryQwen, load_kernel

kernel_path, model_dir = sys.argv[1:3]
kernel = load_kernel(kernel_path)
model = TernaryQwen(model_dir, kernel)


def fresh_cache():
    """Return a new KVCache built from the loaded model's dimensions."""
    return KVCache(model.n_layers, model.n_kv, model.head_dim)


# Warm up
model.forward_token(9707, fresh_cache(), 0)  # 'Hello'

# Benchmark single token: five runs, each with its own new cache.
times = []
for _ in range(5):
    cache = fresh_cache()
    # perf_counter() is the monotonic, high-resolution clock meant for
    # measuring short intervals; time.time() can jump with wall-clock changes.
    t0 = time.perf_counter()
    model.forward_token(9707, cache, 0)
    times.append(time.perf_counter() - t0)

avg = sum(times) / len(times)
print(f'Single token forward: {avg*1000:.1f}ms ({1/avg:.1f} tok/s)')
print(f'Times: {[f"{t*1000:.1f}ms" for t in times]}')
PY

# Closing banner, followed by the command line that starts the server.
printf '\n%s\n' "=== Build complete ==="
printf '%s\n' "To start server: cd $WORKDIR && TERNARY_MODEL_DIR=$TERNARY_DIR TOKENIZER_DIR=$MODEL_HF_DIR python3 server.py"