Add runtime_config.json with optimal spec decode settings
Browse files- runtime_config.json +35 -0
runtime_config.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_comment": "Optimal runtime configuration for TokForge speculative decoding",
|
| 3 |
+
"recommended": {
|
| 4 |
+
"target_backend": "opencl",
|
| 5 |
+
"draft_backend": "cpu",
|
| 6 |
+
"draft_predict_length": 3,
|
| 7 |
+
"draft_thread_num": 2,
|
| 8 |
+
"draft_power": "high",
|
| 9 |
+
"draft_sampler_type": "greedy"
|
| 10 |
+
},
|
| 11 |
+
"compatible_targets": {
|
| 12 |
+
"qwen3_8b": {
|
| 13 |
+
"enabled": true,
|
| 14 |
+
"uplift": "+24-40% (3x avg, 500-tok prose)"
|
| 15 |
+
},
|
| 16 |
+
"qwen3_14b": {
|
| 17 |
+
"enabled": true,
|
| 18 |
+
"uplift": "+40-70% (3x avg, 500-tok prose)"
|
| 19 |
+
},
|
| 20 |
+
"qwen3_4b": {
|
| 21 |
+
"enabled": false,
|
| 22 |
+
"reason": "Degenerates — KL trained from 8B teacher"
|
| 23 |
+
},
|
| 24 |
+
"qwen3_5": {
|
| 25 |
+
"enabled": false,
|
| 26 |
+
"reason": "Not compatible — different architecture (LinearAttention)"
|
| 27 |
+
}
|
| 28 |
+
},
|
| 29 |
+
"notes": {
|
| 30 |
+
"thread_num": "Using 2 threads keeps the WALT governor on the performance cores. With 4+ threads the load spreads out and the governor drops to min frequency.",
|
| 31 |
+
"power_high": "Forces Android performance hint. Critical for draft model speed.",
|
| 32 |
+
"draft_backend_cpu": "CPU draft avoids GPU memory contention with OpenCL target. OpenCL draft OOMs on 16GB devices.",
|
| 33 |
+
"predict_length_3": "d=3 is optimal across all tested devices. d=2 is too conservative, and d=4 incurs wasted draft overhead."
|
| 34 |
+
}
|
| 35 |
+
}
|