darkmaniac7 committed on
Commit
39e7656
·
verified ·
1 Parent(s): 2d9453c

Add runtime_config.json with optimal spec decode settings

Browse files
Files changed (1) hide show
  1. runtime_config.json +35 -0
runtime_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_comment": "Optimal runtime configuration for TokForge speculative decoding",
3
+ "recommended": {
4
+ "target_backend": "opencl",
5
+ "draft_backend": "cpu",
6
+ "draft_predict_length": 3,
7
+ "draft_thread_num": 2,
8
+ "draft_power": "high",
9
+ "draft_sampler_type": "greedy"
10
+ },
11
+ "compatible_targets": {
12
+ "qwen3_8b": {
13
+ "enabled": true,
14
+ "uplift": "+24-40% (3x avg, 500-tok prose)"
15
+ },
16
+ "qwen3_14b": {
17
+ "enabled": true,
18
+ "uplift": "+40-70% (3x avg, 500-tok prose)"
19
+ },
20
+ "qwen3_4b": {
21
+ "enabled": false,
22
+ "reason": "Degenerates — KL trained from 8B teacher"
23
+ },
24
+ "qwen3_5": {
25
+ "enabled": false,
26
+ "reason": "Not compatible — different architecture (LinearAttention)"
27
+ }
28
+ },
29
+ "notes": {
30
+ "thread_num": "Using 2 threads keeps the WALT governor on the performance cores. With 4+ threads the load is spread out and the governor drops to minimum frequency.",
31
+ "power_high": "Forces Android performance hint. Critical for draft model speed.",
32
+ "draft_backend_cpu": "CPU draft avoids GPU memory contention with OpenCL target. OpenCL draft OOMs on 16GB devices.",
33
+ "predict_length_3": "d=3 is optimal across all tested devices. d=2 is too conservative; d=4 wastes draft overhead."
34
+ }
35
+ }