narcolepticchicken
/

occ-stack

narcolepticchicken committed on 25 days ago

Commit

6a7c356

verified ·

1 Parent(s): 799bf90

Upload reports/humaneval_real_results.json

Files changed (1) hide show

reports/humaneval_real_results.json ADDED Viewed

+{
+  "model": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
+  "date": "2026-05-07",
+  "benchmark": "openai/openai_humaneval",
+  "num_problems": 164,
+  "pass1_short_tokens": 128,
+  "pass2_long_tokens": 1024,
+  "pass1_passed": 103,
+  "pass1_failed": 61,
+  "pass1_accuracy": 0.6280,
+  "pass1_tokens": 12859,
+  "pass2_passed": 20,
+  "pass2_failed": 41,
+  "pass2_tokens": 8184,
+  "final_passed": 123,
+  "final_accuracy": 0.7500,
+  "total_tokens": 21043,
+  "baseline_tokens": 167936,
+  "savings_percent": 87.5,
+  "failure_analysis": {
+    "syntax_errors_from_truncation": "majority of remaining failures",
+    "genuine_assert_errors": "~20% of failures",
+    "note": "Raising the first-pass token limit (pass1_short_tokens) from 128 to 256 would likely push pass@1 to 80%+"
+  },
+  "notes": "Completion format (no chat template), stop-token trimming at \\nclass \\ndef \\n# \\nif __name__ \\nprint(, clean_body strips leading/trailing blank lines"
+}