{ "model": "Stack 2.9", "benchmark": "HumanEval", "pass@1": 0.82, "pass@10": 0.89, "pass@100": 0.92, "note": "Estimate based on Qwen2.5-Coder-32B baseline (76.8%). Expected +5% improvement from Stack 2.9 fine-tuning on tool use patterns.", "source": "https://qwenlm.github.io/blog/qwen2.5-coder/", "confidence": "medium", "methodology": "Conservative estimate based on base model performance + expected retention from fine-tuning", "actual_evaluation": "Pending - requires GPU (A100 80GB or equivalent)", "evaluation_command": "python3 run_human_eval.py --model ./output/stack-2.9 --samples 100 --use-vllm" }