File size: 1,361 Bytes
39e7656
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
{
    "_comment": "Optimal runtime configuration for TokForge speculative decoding",
    "recommended": {
        "target_backend": "opencl",
        "draft_backend": "cpu",
        "draft_predict_length": 3,
        "draft_thread_num": 2,
        "draft_power": "high",
        "draft_sampler_type": "greedy"
    },
    "compatible_targets": {
        "qwen3_8b": {
            "enabled": true,
            "uplift": "+24-40% (3x avg, 500-tok prose)"
        },
        "qwen3_14b": {
            "enabled": true,
            "uplift": "+40-70% (3x avg, 500-tok prose)"
        },
        "qwen3_4b": {
            "enabled": false,
            "reason": "Degenerates — KL trained from 8B teacher"
        },
        "qwen3_5": {
            "enabled": false,
            "reason": "Not compatible — different architecture (LinearAttention)"
        }
    },
    "notes": {
        "thread_num": "Using 2 threads keeps the WALT governor on the performance cores. With 4+ threads the load spreads out and the governor drops to minimum frequency.",
        "power_high": "Forces Android performance hint. Critical for draft model speed.",
        "draft_backend_cpu": "Running the draft on CPU avoids GPU memory contention with the OpenCL target. An OpenCL draft runs out of memory on 16GB devices.",
        "predict_length_3": "d=3 is optimal across all tested devices. d=2 is too conservative; d=4 wastes draft overhead."
    }
}