narcolepticchicken committed on
Commit
cc93099
·
verified ·
1 Parent(s): 309b10e

Upload jobs/occ_humaneval_harness.py

Browse files
Files changed (1) hide show
  1. jobs/occ_humaneval_harness.py +212 -0
jobs/occ_humaneval_harness.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ OCC HumanEval Benchmark using BigCode Evaluation Harness + Qwen3-Coder-30B-A3B-Instruct.
4
+
5
+ Strategy:
6
+ Pass 1: Generate with 128 tokens, run tests
7
+ Pass 2: If any problem failed, rerun the full benchmark with 1024 tokens; only problems that failed in Pass 1 are counted
8
+ Merge: pass@1 = (pass1_passed + pass2_added_passed) / total
9
+
10
+ This uses the standard evaluation harness (not custom extraction).
11
+ The harness handles completion-format prompts, stop tokens, and code extraction correctly.
12
+ """
13
+
14
+ import subprocess
15
+ import json
16
+ import os
17
+ import sys
18
+ import time
19
+ from pathlib import Path
20
+
21
# === CONFIG ===
# Model identifier; embedded in MODEL_ARGS and recorded in the final report.
MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
# Comma-separated model options forwarded to the harness via --model_args.
MODEL_ARGS = f"pretrained={MODEL},trust_remote_code=True,dtype=bfloat16"
# Denominator for the final pass@1 and for the long-only token baseline.
TOTAL_PROBLEMS = 164
# Limit passed as --max_length_generation for the first (short) pass.
SHORT_TOKENS = 128
# Limit passed as --max_length_generation for the second (long) pass.
LONG_TOKENS = 1024
# Repository cloned by setup() when HARNESS_DIR does not exist yet.
HARNESS_REPO = "https://github.com/bigcode-project/bigcode-evaluation-harness.git"
# Clone target; also the working directory the harness is launched from.
HARNESS_DIR = "/app/bigcode-evaluation-harness"
# Output directory for per-pass generations/metrics and the final report.
RESULTS_DIR = "/app/occ_humaneval_results"
30
+
31
+
32
def log(msg):
    """Write *msg* to stdout behind the ``[OCC]`` tag and flush immediately."""
    line = f"[OCC] {msg}"
    # Flush on every call instead of waiting for the stream buffer to fill.
    print(line, flush=True)
34
+
35
+
36
def run_cmd(cmd, cwd=None, check=True):
    """Run an external command, capturing its output as text.

    Args:
        cmd: Command as a list of strings, executable first.
        cwd: Working directory for the child process, or None to inherit.
        check: When true, a failure is logged together with the tail of
            stderr. This flag only controls logging; the function never
            raises because of a non-zero exit status.

    Returns:
        A tuple ``(ok, stdout, stderr)`` where ``ok`` is True only when the
        command exited with status 0. If the process could not be started
        at all, ``ok`` is False, ``stdout`` is empty and ``stderr`` holds the
        error message.
    """
    # Only echo the first few arguments so long command lines stay readable.
    preview = " ".join(cmd[:6])
    suffix = "..." if len(cmd) > 6 else ""
    log(f" $ {preview}{suffix}")

    try:
        # errors="replace" keeps undecodable bytes in the child's output
        # from raising UnicodeDecodeError while the streams are decoded.
        result = subprocess.run(
            cmd, cwd=cwd, capture_output=True, text=True, errors="replace"
        )
    except OSError as err:
        # Raised when the process cannot be spawned (e.g. executable not
        # found, or cwd does not exist). Report it through the normal
        # return value so callers can handle it like any other failure.
        if check:
            log(f" FAILED (could not start process: {err})")
        return False, "", str(err)

    ok = result.returncode == 0
    if not ok and check:
        log(f" FAILED (rc={result.returncode})")
        log(f" STDERR: {result.stderr[-1500:]}")
    return ok, result.stdout, result.stderr
43
+
44
+
45
def setup():
    """Clone the evaluation harness if needed and install its dependencies.

    The clone is mandatory: if it fails the process exits with status 1.
    The ``pip install`` steps are best-effort; a failure does not abort the
    run, but it is logged as a warning so a later harness failure can be
    traced back to it.
    """
    if not os.path.exists(HARNESS_DIR):
        log("Cloning bigcode-evaluation-harness...")
        ok, _, _ = run_cmd(["git", "clone", HARNESS_REPO, HARNESS_DIR])
        if not ok:
            sys.exit(1)

    log("Installing harness + deps...")
    installs = (
        # Install harness
        ("harness", ["pip", "install", "-e", HARNESS_DIR]),
        # Ensure accelerate
        ("accelerate", ["pip", "install", "accelerate"]),
    )
    for what, cmd in installs:
        # check=False: run_cmd stays quiet; the warning below is the report.
        ok, _, stderr = run_cmd(cmd, check=False)
        if not ok:
            log(f" WARNING: pip install of {what} failed; continuing anyway.")
            log(f" STDERR: {stderr[-500:]}")
    log("Setup complete.")
59
+
60
+
61
def _load_json_output(path, what):
    """Return the parsed JSON at *path*, or None (after logging) if unusable."""
    if not os.path.exists(path):
        log(f" WARNING: {what} file not found: {path}")
        return None
    try:
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError) as err:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        log(f" WARNING: could not read {what} file {path}: {err}")
        return None


def run_eval(max_tokens, label):
    """Run harness with given token limit. Returns (metrics_dict, generations_dict).

    Args:
        max_tokens: Value passed to the harness as ``--max_length_generation``.
        label: Prefix for the output files written under ``RESULTS_DIR``
            (``<label>_generations.json`` and ``<label>_metrics.json``).

    Returns:
        ``(metrics, generations)`` as parsed from the two JSON files. Both are
        None when the harness command fails; either one is None when its file
        is missing or cannot be parsed (a warning is logged in that case).
    """
    os.makedirs(RESULTS_DIR, exist_ok=True)
    gen_path = f"{RESULTS_DIR}/{label}_generations.json"
    met_path = f"{RESULTS_DIR}/{label}_metrics.json"

    # NOTE(review): `--model hf`, `--model_args` and `--generations_path`
    # look like lm-evaluation-harness style options. Confirm them against
    # the argument parser in the cloned harness's main.py before relying on
    # this command line.
    # NOTE(review): confirm whether --max_length_generation counts prompt
    # tokens as well as generated tokens in the harness version in use.
    cmd = [
        "accelerate", "launch", "main.py",
        "--model", "hf",
        "--model_args", MODEL_ARGS,
        "--tasks", "humaneval",
        "--max_length_generation", str(max_tokens),
        "--batch_size", "1",
        "--n_samples", "1",
        "--allow_code_execution",
        "--save_generations",
        "--generations_path", gen_path,
        "--metric_output_path", met_path,
        "--temperature", "0.0",  # greedy
    ]

    log(f"Running {label} (max_tokens={max_tokens})...")
    t0 = time.time()
    # check=False: failures are reported below with both output streams.
    ok, stdout, stderr = run_cmd(cmd, cwd=HARNESS_DIR, check=False)
    elapsed = time.time() - t0
    log(f" Elapsed: {elapsed:.0f}s")

    if not ok:
        log(f" FAILED. Dumping last 2000 chars of output:")
        log(f" STDOUT: {stdout[-2000:]}")
        log(f" STDERR: {stderr[-2000:]}")
        return None, None

    metrics = _load_json_output(met_path, "metrics")
    generations = _load_json_output(gen_path, "generations")

    # Parse pass@1. Top-level metric entries are not guaranteed to be
    # per-task dicts, so skip anything that is not a mapping.
    if isinstance(metrics, dict):
        for task_metrics in metrics.values():
            if not isinstance(task_metrics, dict) or "pass@1" not in task_metrics:
                continue
            score = task_metrics["pass@1"]
            if isinstance(score, (int, float)):
                log(f" pass@1: {score:.4f}")
            else:
                log(f" pass@1: {score}")

    return metrics, generations
111
+
112
+
113
def count_passed(generations):
    """Split task ids into those that passed and those that failed.

    Args:
        generations: Mapping of task id to a result entry. An entry counts
            as passed only when it is a list with at least two items whose
            second item is truthy; every other entry counts as failed.

    Returns:
        ``(passed, failed)``: two lists of task ids in mapping order.
    """
    buckets = {True: [], False: []}
    for task_id, entry in generations.items():
        # Entry is expected to look like [code_string, test_result_bool].
        well_formed = isinstance(entry, list) and len(entry) >= 2
        outcome = bool(well_formed and entry[1])
        buckets[outcome].append(task_id)
    return buckets[True], buckets[False]
127
+
128
+
129
def main():
    """Run the tiered benchmark end to end and write the combined report.

    Steps:
        1. Set up the harness.
        2. Pass 1: evaluate with ``SHORT_TOKENS``.
        3. Pass 2: if anything failed, evaluate again with ``LONG_TOKENS``
           and count only the problems that failed in pass 1.
        4. Merge both passes, log a summary and write
           ``occ_results.json`` under ``RESULTS_DIR``.

    Exits with status 1 when pass 1 yields no usable per-problem results.
    """
    log("=" * 60)
    log(f"OCC HumanEval — {MODEL}")
    log(f"Tiered: {SHORT_TOKENS} tokens → test → {LONG_TOKENS} tokens on failures")
    log("=" * 60)

    setup()

    # --- Pass 1: Short generation ---
    log("\n--- PASS 1: SHORT ---")
    _, g1 = run_eval(SHORT_TOKENS, "pass1_short")

    if g1 is None:
        log("CRITICAL: Pass 1 failed. Aborting.")
        sys.exit(1)
    # count_passed() needs a mapping, and an empty one would make the
    # percentage below divide by zero, so treat both as fatal.
    if not isinstance(g1, dict) or not g1:
        log("CRITICAL: Pass 1 produced no usable per-problem results. Aborting.")
        sys.exit(1)

    passed_1, failed_1 = count_passed(g1)
    n_pass1 = len(passed_1)
    n_fail1 = len(failed_1)
    log(f"Pass 1: {n_pass1} passed, {n_fail1} failed ({n_pass1/len(g1)*100:.1f}%)")
    # Token figures are nominal budgets (limit x number of problems), not
    # measured token counts.
    pass1_tokens = len(g1) * SHORT_TOKENS

    # --- Pass 2: Long generation on failures ---
    pass2_tokens = 0
    new_passes = 0

    if n_fail1 > 0:
        log(f"\n--- PASS 2: LONG ({LONG_TOKENS} tokens) on {n_fail1} failures ---")
        # Run full eval with long tokens — simpler and correct.
        # We only count problems that were in the failure set.
        _, g2 = run_eval(LONG_TOKENS, "pass2_long")

        if isinstance(g2, dict) and g2:
            passed_2, failed_2 = count_passed(g2)
            failed_1_set = set(failed_1)
            new_pass_set = set(passed_2) & failed_1_set
            still_fail_set = set(failed_2) & failed_1_set
            new_passes = len(new_pass_set)
            still_failed = len(still_fail_set)
            log(f"Pass 2: {new_passes} newly passed, {still_failed} still failed")
            # Only the pass-1 failures are charged here, even though the
            # harness run above regenerated every problem: this models the
            # cost of a truly tiered run.
            pass2_tokens = n_fail1 * LONG_TOKENS
        else:
            log("Pass 2 failed — keeping Pass 1 results only.")
    else:
        log("\n--- PASS 2: SKIPPED (all passed) ---")

    # --- Merge results ---
    final_passed = n_pass1 + new_passes
    final_pass_at_1 = final_passed / TOTAL_PROBLEMS
    total_tokens = pass1_tokens + pass2_tokens
    baseline_tokens = TOTAL_PROBLEMS * LONG_TOKENS
    token_savings_pct = (1.0 - total_tokens / baseline_tokens) * 100 if baseline_tokens > 0 else 0

    log("\n" + "=" * 60)
    log("FINAL RESULTS")
    log("=" * 60)
    log(f" OCC pass@1: {final_pass_at_1:.4f} ({final_passed}/{TOTAL_PROBLEMS})")
    log(f" Pass 1 only: {n_pass1/TOTAL_PROBLEMS:.4f} ({n_pass1}/{TOTAL_PROBLEMS})")
    log(f" Tokens used: {total_tokens} (OCC) vs {baseline_tokens} (baseline)")
    log(f" Savings: {token_savings_pct:.1f}%")
    log(f" New from P2: {new_passes}")
    log(f" Still failed: {TOTAL_PROBLEMS - final_passed}")

    # Save final report
    results = {
        "model": MODEL,
        "config": {"short_tokens": SHORT_TOKENS, "long_tokens": LONG_TOKENS, "total_problems": TOTAL_PROBLEMS},
        "pass1": {"pass_at_1": n_pass1 / TOTAL_PROBLEMS, "passed": n_pass1, "failed": n_fail1, "tokens": pass1_tokens},
        "pass2": {"newly_passed": new_passes, "tokens": pass2_tokens},
        "occ_combined": {"pass_at_1": final_pass_at_1, "total_passed": final_passed, "total_tokens": total_tokens},
        "baseline_long_only": {"tokens": baseline_tokens, "savings_pct": token_savings_pct},
    }

    # Make sure the output directory exists before writing the report.
    os.makedirs(RESULTS_DIR, exist_ok=True)
    report_path = f"{RESULTS_DIR}/occ_results.json"
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    log(f"\nResults saved to {report_path}")

    # Also write a summary to stdout for easy parsing
    print(f"\nOCC_SUMMARY: pass@1={final_pass_at_1:.4f} tokens={total_tokens} baseline={baseline_tokens} savings={token_savings_pct:.1f}%")
209
+
210
+
211
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()