prometheus04's picture
Upload training_scripts/phase3_eval.py with huggingface_hub
289b193 verified
#!/usr/bin/env python3
"""Phase 3: Evaluation on HumanEval+ and MBPP+"""
import subprocess
from pathlib import Path
# Config
CKPT_DIR = Path("./qwen3_pipeline/checkpoint")
RESULTS_DIR = Path("./qwen3_pipeline/results")
model_path = str(CKPT_DIR / "merged")
humaneval_log = RESULTS_DIR / "humaneval_plus.log"
mbpp_log = RESULTS_DIR / "mbpp_plus.log"

# Wall-clock limit applied to each evalplus run, in seconds (30 minutes).
EVAL_TIMEOUT_SECONDS = 1800
# Lower-case substrings; an output line containing any of them is echoed
# to the console as a "key result".
RESULT_KEYWORDS = ("pass@1", "base", "plus", "score")
BANNER = "=" * 70
# (display label, value passed to --dataset, log file), in run order.
BENCHMARKS = (
    ("HumanEval+", "humaneval", humaneval_log),
    ("MBPP+", "mbpp", mbpp_log),
)


def build_eval_command(model, dataset):
    """Return the ``evalplus.evaluate`` argument vector for one dataset.

    The command is built as a list so it can be executed without a shell:
    a model path containing spaces or shell metacharacters is passed
    through verbatim as a single argument.

    Args:
        model: Path of the model directory, passed to ``--model``.
        dataset: Dataset name passed to ``--dataset``.

    Returns:
        A list of strings suitable for ``subprocess.run``.
    """
    return [
        "evalplus.evaluate",
        "--model", model,
        "--dataset", dataset,
        "--backend", "vllm",
        "--greedy",
        "--tp", "1",
    ]


def extract_key_lines(output):
    """Return the lines of *output* that mention a result keyword.

    Matching is case-insensitive against ``RESULT_KEYWORDS``; the lines
    are returned unchanged and in their original order.
    """
    selected = []
    for line in output.split("\n"):
        lowered = line.lower()
        if any(keyword in lowered for keyword in RESULT_KEYWORDS):
            selected.append(line)
    return selected


def _to_text(data):
    """Coerce captured process output (``None``, bytes or str) to str.

    ``subprocess.TimeoutExpired`` may carry ``None`` or raw bytes even
    when the process was started in text mode.
    """
    if data is None:
        return ""
    if isinstance(data, bytes):
        return data.decode("utf-8", errors="replace")
    return data


def _write_log(log_path, stdout, stderr):
    """Write captured stdout and stderr to *log_path* as UTF-8 text."""
    with open(log_path, "w", encoding="utf-8") as f:
        f.write(stdout)
        f.write("\n\n=== STDERR ===\n\n")
        f.write(stderr)


def run_logged(label, argv, log_path, timeout=EVAL_TIMEOUT_SECONDS):
    """Run *argv*, save its output to *log_path* and echo the key lines.

    Args:
        label: Human-readable name used in console messages.
        argv: Command to execute, as a list (run without a shell).
        log_path: File that receives the captured stdout and stderr.
        timeout: Maximum run time in seconds.

    Returns:
        True only when the command ran to completion with exit status 0.
        A timeout, a non-zero exit status, or any error while starting
        the command or writing the log is reported on the console and
        yields False; no exception propagates to the caller.
    """
    try:
        result = subprocess.run(
            argv,
            capture_output=True,
            text=True,
            # A stray undecodable byte in the tool's output should not
            # abort the whole evaluation.
            errors="replace",
            timeout=timeout,
        )
        # Save full log
        _write_log(log_path, result.stdout, result.stderr)
    except subprocess.TimeoutExpired as exc:
        print(f"❌ {label} timed out ({timeout / 60:g} min)")
        # Keep whatever was captured before the kill; it is usually the
        # only clue as to where the run got stuck.
        try:
            _write_log(log_path, _to_text(exc.stdout), _to_text(exc.stderr))
            print(f"\nPartial log: {log_path}")
        except OSError as log_error:
            print(f" Could not save partial log: {log_error}")
        return False
    except Exception as exc:
        # Deliberately broad: one benchmark failing to start (e.g. the
        # executable is missing) must not prevent the next one running.
        print(f"❌ {label} failed: {exc}")
        return False

    if result.returncode != 0:
        # A crashed run must not be presented under a RESULTS banner.
        print(f"❌ {label} failed: {argv[0]} exited with status {result.returncode}")
        print(f"\nFull log: {log_path}")
        return False

    # Print key results
    print("\n" + BANNER)
    print(f"{label.upper()} RESULTS")
    print(BANNER)
    for line in extract_key_lines(result.stdout):
        print(line)
    print(f"\nFull log: {log_path}")
    return True


def run_benchmark(label, dataset, log_path):
    """Announce and run one evalplus benchmark against ``model_path``.

    Returns:
        True if the benchmark completed successfully, otherwise False.
    """
    print(BANNER)
    print(label.upper())
    print(BANNER)
    argv = build_eval_command(model_path, dataset)
    # Echo the command in shell form, with the model path quoted, so it
    # can be copied and pasted into a terminal.
    shown = " ".join(f'"{arg}"' if arg == model_path else arg for arg in argv)
    print(f"\nRunning: {shown}\n")
    return run_logged(label, argv, log_path)


def main():
    """Run every benchmark in ``BENCHMARKS`` and print a summary.

    Returns:
        Process exit status: 0 when all benchmarks succeeded, 1 when the
        merged model is missing or at least one benchmark failed.
    """
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    print(BANNER)
    print("PHASE 3: EVALUATION")
    print(BANNER)
    print(f"\nModel: {model_path}\n")

    # Check if model exists
    if not (CKPT_DIR / "merged").exists():
        print(f"❌ Model not found at {model_path}")
        print(" Did Phase 2 complete successfully?")
        return 1

    failed = []
    for position, (label, dataset, log_path) in enumerate(BENCHMARKS):
        if position:
            print()  # blank line between consecutive benchmark sections
        if not run_benchmark(label, dataset, log_path):
            failed.append(label)

    # Summary
    print("\n" + BANNER)
    if failed:
        print(f"❌ PHASE 3 INCOMPLETE: {', '.join(failed)} did not finish successfully")
        print(BANNER)
        print(f"\nAny logs that were written are in: {RESULTS_DIR}/")
        return 1
    print("✓ PHASE 3 COMPLETE")
    print(BANNER)
    print(f"\nResults saved to: {RESULTS_DIR}/")
    print(f" - {humaneval_log}")
    print(f" - {mbpp_log}")
    print("\n➡️ Next: python phase4_codet.py")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())