Add runtime_config.json with optimal spec decode settings

39e7656 verified 12 days ago

1.36 kB

	{
	"_comment": "Optimal runtime configuration for TokForge speculative decoding",
	"recommended": {
	"target_backend": "opencl",
	"draft_backend": "cpu",
	"draft_predict_length": 3,
	"draft_thread_num": 2,
	"draft_power": "high",
	"draft_sampler_type": "greedy"
	},
	"compatible_targets": {
	"qwen3_8b": {
	"enabled": true,
	"uplift": "+24-40% (3x avg, 500-tok prose)"
	},
	"qwen3_14b": {
	"enabled": true,
	"uplift": "+40-70% (3x avg, 500-tok prose)"
	},
	"qwen3_4b": {
	"enabled": false,
	"reason": "Degenerates — KL trained from 8B teacher"
	},
	"qwen3_5": {
	"enabled": false,
	"reason": "Not compatible — different architecture (LinearAttention)"
	}
	},
	"notes": {
	"thread_num": "2 threads keeps WALT governor on performance cores. 4+ threads spread load and governor drops to min frequency.",
	"power_high": "Forces Android performance hint. Critical for draft model speed.",
	"draft_backend_cpu": "CPU draft avoids GPU memory contention with OpenCL target. OpenCL draft OOMs on 16GB devices.",
	"predict_length_3": "d=3 is optimal across all tested devices. d=2 too conservative, d=4 wastes draft overhead."
	}
	}