{
  "_comment": "Optimal runtime configuration for TokForge speculative decoding",
  "recommended": {
    "target_backend": "opencl",
    "draft_backend": "cpu",
    "draft_predict_length": 3,
    "draft_thread_num": 2,
    "draft_power": "high",
    "draft_sampler_type": "greedy"
  },
  "compatible_targets": {
    "qwen3_8b": {
      "enabled": true,
      "uplift": "+24-40% (3x avg, 500-tok prose)"
    },
    "qwen3_14b": {
      "enabled": true,
      "uplift": "+40-70% (3x avg, 500-tok prose)"
    },
    "qwen3_4b": {
      "enabled": false,
      "reason": "Degenerates — KL trained from 8B teacher"
    },
    "qwen3_5": {
      "enabled": false,
      "reason": "Not compatible — different architecture (LinearAttention)"
    }
  },
  "notes": {
    "thread_num": "2 threads keeps WALT governor on performance cores. 4+ threads spread load and governor drops to min frequency.",
    "power_high": "Forces Android performance hint. Critical for draft model speed.",
    "draft_backend_cpu": "CPU draft avoids GPU memory contention with OpenCL target. OpenCL draft OOMs on 16GB devices.",
    "predict_length_3": "d=3 is optimal across all tested devices. d=2 too conservative, d=4 wastes draft overhead."
  }
}