Upload jobs/occ_humaneval_harness.py
Browse files- jobs/occ_humaneval_harness.py +212 -0
jobs/occ_humaneval_harness.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
OCC HumanEval Benchmark using BigCode Evaluation Harness + Qwen3-Coder-30B-A3B-Instruct.
|
| 4 |
+
|
| 5 |
+
Strategy:
|
| 6 |
+
Pass 1: Generate with 128 tokens, run tests
|
| 7 |
+
Pass 2: If anything failed, re-run with 1024 tokens (the whole benchmark is re-run; only Pass 1 failures are credited)
|
| 8 |
+
Merge: pass@1 = (pass1_passed + pass2_added_passed) / total
|
| 9 |
+
|
| 10 |
+
This uses the standard evaluation harness (not custom extraction).
|
| 11 |
+
The harness handles completion-format prompts, stop tokens, and code extraction correctly.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import subprocess
|
| 15 |
+
import json
|
| 16 |
+
import os
|
| 17 |
+
import sys
|
| 18 |
+
import time
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
|
| 21 |
+
# === CONFIG ===
# Model under evaluation and the argument string handed to the harness.
MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
MODEL_ARGS = ",".join(
    (
        f"pretrained={MODEL}",
        "trust_remote_code=True",
        "dtype=bfloat16",
    )
)

# Denominator used for pass@1 and for the long-only token baseline.
TOTAL_PROBLEMS = 164

# Token limits passed to the harness for the short first pass and the long
# second pass.
SHORT_TOKENS, LONG_TOKENS = 128, 1024

# Where the harness is cloned from / to, and where result files are written.
HARNESS_REPO = "https://github.com/bigcode-project/bigcode-evaluation-harness.git"
HARNESS_DIR = "/app/bigcode-evaluation-harness"
RESULTS_DIR = "/app/occ_humaneval_results"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def log(msg):
    """Print *msg* to stdout behind the ``[OCC]`` prefix, flushing immediately."""
    line = "[OCC] {}".format(msg)
    print(line, flush=True)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def run_cmd(cmd, cwd=None, check=True):
    """Run *cmd* as a subprocess and capture its output as text.

    Args:
        cmd: Argument list for the command (no shell is involved).
        cwd: Working directory for the child process, or None to inherit.
        check: When true, a non-zero exit or a launch failure is logged
            together with the tail of stderr. Nothing is raised either way.

    Returns:
        An ``(ok, stdout, stderr)`` tuple; ``ok`` is True only when the
        command exited with status 0.
    """
    # Show at most the first six arguments so long command lines stay readable.
    preview = " ".join(str(part) for part in cmd[:6])
    log(f" $ {preview}{'...' if len(cmd) > 6 else ''}")

    try:
        result = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True)
    except OSError as exc:
        # Missing executable or bad cwd: report it like any other failed
        # command instead of crashing the caller with a traceback.
        if check:
            log(f" FAILED (could not launch: {exc})")
        return False, "", str(exc)

    if result.returncode != 0 and check:
        log(f" FAILED (rc={result.returncode})")
        log(f" STDERR: {result.stderr[-1500:]}")
    return result.returncode == 0, result.stdout, result.stderr
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def setup():
    """Clone the evaluation harness if it is missing and install dependencies.

    Exits the process with status 1 when the clone fails. The pip installs
    are best-effort: a failure is logged as a warning and setup continues.
    """
    if not os.path.exists(HARNESS_DIR):
        log("Cloning bigcode-evaluation-harness...")
        ok, _, _ = run_cmd(["git", "clone", HARNESS_REPO, HARNESS_DIR])
        if not ok:
            sys.exit(1)

    log("Installing harness + deps...")
    install_cmds = (
        ["pip", "install", "-e", HARNESS_DIR],  # the harness itself, editable
        ["pip", "install", "accelerate"],  # launcher invoked by run_eval
    )
    for cmd in install_cmds:
        ok, _, stderr = run_cmd(cmd, check=False)
        if not ok:
            # Surface the failure rather than ignoring it; a later harness
            # error is much easier to diagnose with this line in the log.
            shown = " ".join(cmd)
            log(f" WARNING: '{shown}' failed; continuing. STDERR tail: {stderr[-500:]}")
    log("Setup complete.")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def run_eval(max_tokens, label):
    """Run the harness on HumanEval with the given token limit.

    Args:
        max_tokens: Value passed to ``--max_length_generation``.
        label: Prefix for this run's output files inside ``RESULTS_DIR``.

    Returns:
        ``(metrics, generations)`` as loaded from the run's JSON output
        files. Either element is None when its file is missing or
        unreadable; both are None when the harness command itself fails.
    """
    os.makedirs(RESULTS_DIR, exist_ok=True)
    gen_path = f"{RESULTS_DIR}/{label}_generations.json"
    met_path = f"{RESULTS_DIR}/{label}_metrics.json"

    # Drop outputs left by an earlier run so that a run which writes nothing
    # cannot be mistaken for a successful one.
    for stale in (gen_path, met_path):
        try:
            os.remove(stale)
        except FileNotFoundError:
            pass

    # NOTE(review): the flag names below (`--model hf`, `--model_args`,
    # `--generations_path`) and the meaning of `--max_length_generation`
    # (new tokens only vs. prompt + generation) should be confirmed against
    # the harness's own argument parser; they are passed through unchanged.
    cmd = [
        "accelerate", "launch", "main.py",
        "--model", "hf",
        "--model_args", MODEL_ARGS,
        "--tasks", "humaneval",
        "--max_length_generation", str(max_tokens),
        "--batch_size", "1",
        "--n_samples", "1",
        "--allow_code_execution",
        "--save_generations",
        "--generations_path", gen_path,
        "--metric_output_path", met_path,
        "--temperature", "0.0",  # greedy
    ]

    log(f"Running {label} (max_tokens={max_tokens})...")
    t0 = time.time()
    # check=False: the failure is reported below with a longer output tail.
    ok, stdout, stderr = run_cmd(cmd, cwd=HARNESS_DIR, check=False)
    elapsed = time.time() - t0
    log(f" Elapsed: {elapsed:.0f}s")

    if not ok:
        log(" FAILED. Dumping last 2000 chars of output:")
        log(f" STDOUT: {stdout[-2000:]}")
        log(f" STDERR: {stderr[-2000:]}")
        return None, None

    metrics = _load_json_or_none(met_path, "metrics")
    generations = _load_json_or_none(gen_path, "generations")

    # Parse pass@1: report it from every metrics section that carries one.
    if isinstance(metrics, dict):
        for section in metrics.values():
            if isinstance(section, dict) and "pass@1" in section:
                log(f" pass@1: {section['pass@1']:.4f}")

    return metrics, generations


def _load_json_or_none(path, what):
    """Return the parsed JSON at *path*, or None (after logging) if missing or invalid."""
    try:
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        log(f" WARNING: {what} file not found: {path}")
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and decoding errors.
        log(f" WARNING: could not read {what} file {path}: {exc}")
    return None
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def count_passed(generations):
    """Split task ids into those whose entry reports a pass and the rest.

    Args:
        generations: Mapping of task id to entry. An entry counts as passed
            only when it is a list with at least two items whose second item
            is truthy; anything else is treated as failed.

    Returns:
        ``(passed, failed)``: two lists of task ids, each in the mapping's
        iteration order.
    """
    passed, failed = [], []
    for task_id, entry in generations.items():
        well_formed = isinstance(entry, list) and len(entry) >= 2
        bucket = passed if well_formed and entry[1] else failed
        bucket.append(task_id)
    return passed, failed
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def main():
    """Run the tiered benchmark and write the merged report.

    Pass 1 evaluates with ``SHORT_TOKENS``. If any problem failed, pass 2
    re-runs the full evaluation with ``LONG_TOKENS`` and only problems from
    the pass 1 failure set are credited. The merged pass@1 and a token
    estimate are logged, saved as ``occ_results.json`` under ``RESULTS_DIR``
    and echoed as a single ``OCC_SUMMARY`` line on stdout.

    Exits with status 1 when pass 1 yields no usable generations.
    """
    log("=" * 60)
    log(f"OCC HumanEval — {MODEL}")
    log(f"Tiered: {SHORT_TOKENS} tokens → test → {LONG_TOKENS} tokens on failures")
    log("=" * 60)

    setup()

    # --- Pass 1: Short generation ---
    log("\n--- PASS 1: SHORT ---")
    _, g1 = run_eval(SHORT_TOKENS, "pass1_short")

    # A missing, empty or non-mapping result is unusable: count_passed needs
    # a mapping, and an empty one would divide by zero in the percentage.
    if not isinstance(g1, dict) or not g1:
        log("CRITICAL: Pass 1 failed. Aborting.")
        sys.exit(1)

    passed_1, failed_1 = count_passed(g1)
    n_pass1 = len(passed_1)
    n_fail1 = len(failed_1)
    log(f"Pass 1: {n_pass1} passed, {n_fail1} failed ({n_pass1/len(g1)*100:.1f}%)")
    if len(g1) != TOTAL_PROBLEMS:
        log(
            f"WARNING: Pass 1 returned {len(g1)} problems but "
            f"TOTAL_PROBLEMS={TOTAL_PROBLEMS}; pass@1 is still divided by TOTAL_PROBLEMS."
        )
    # Token figures are budget estimates (problems x token limit), not
    # measured usage.
    pass1_tokens = len(g1) * SHORT_TOKENS

    # --- Pass 2: Long generation on failures ---
    pass2_tokens = 0
    new_passes = 0

    if n_fail1 > 0:
        log(f"\n--- PASS 2: LONG ({LONG_TOKENS} tokens) on {n_fail1} failures ---")
        # Run full eval with long tokens — simpler and correct.
        # We only count problems that were in the failure set.
        _, g2 = run_eval(LONG_TOKENS, "pass2_long")

        if isinstance(g2, dict) and g2:
            passed_2, failed_2 = count_passed(g2)
            failed_1_set = set(failed_1)
            new_passes = len(failed_1_set.intersection(passed_2))
            still_failed = len(failed_1_set.intersection(failed_2))
            log(f"Pass 2: {new_passes} newly passed, {still_failed} still failed")
            # Only the pass 1 failures are charged at the long limit, which
            # models a rerun restricted to them even though the harness call
            # above covers every problem.
            pass2_tokens = n_fail1 * LONG_TOKENS
        else:
            log("Pass 2 failed — keeping Pass 1 results only.")
    else:
        log("\n--- PASS 2: SKIPPED (all passed) ---")

    # --- Merge results ---
    final_passed = n_pass1 + new_passes
    final_pass_at_1 = final_passed / TOTAL_PROBLEMS
    total_tokens = pass1_tokens + pass2_tokens
    baseline_tokens = TOTAL_PROBLEMS * LONG_TOKENS
    token_savings_pct = (1.0 - total_tokens / baseline_tokens) * 100 if baseline_tokens > 0 else 0

    log("\n" + "=" * 60)
    log("FINAL RESULTS")
    log("=" * 60)
    log(f" OCC pass@1: {final_pass_at_1:.4f} ({final_passed}/{TOTAL_PROBLEMS})")
    log(f" Pass 1 only: {n_pass1/TOTAL_PROBLEMS:.4f} ({n_pass1}/{TOTAL_PROBLEMS})")
    log(f" Tokens used: {total_tokens} (OCC) vs {baseline_tokens} (baseline)")
    log(f" Savings: {token_savings_pct:.1f}%")
    log(f" New from P2: {new_passes}")
    log(f" Still failed: {TOTAL_PROBLEMS - final_passed}")

    # Save final report
    results = {
        "model": MODEL,
        "config": {"short_tokens": SHORT_TOKENS, "long_tokens": LONG_TOKENS, "total_problems": TOTAL_PROBLEMS},
        "pass1": {"pass_at_1": n_pass1 / TOTAL_PROBLEMS, "passed": n_pass1, "failed": n_fail1, "tokens": pass1_tokens},
        "pass2": {"newly_passed": new_passes, "tokens": pass2_tokens},
        "occ_combined": {"pass_at_1": final_pass_at_1, "total_passed": final_passed, "total_tokens": total_tokens},
        "baseline_long_only": {"tokens": baseline_tokens, "savings_pct": token_savings_pct},
    }

    report_path = f"{RESULTS_DIR}/occ_results.json"
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    log(f"\nResults saved to {report_path}")

    # Also write a summary to stdout for easy parsing
    print(f"\nOCC_SUMMARY: pass@1={final_pass_at_1:.4f} tokens={total_tokens} baseline={baseline_tokens} savings={token_savings_pct:.1f}%")


if __name__ == "__main__":
    main()
|