"""Phase 3: Evaluation on HumanEval+ and MBPP+.

Runs the ``evalplus.evaluate`` command-line tool against the merged
checkpoint once per benchmark, stores the captured stdout/stderr of every run
in ``RESULTS_DIR``, and echoes the output lines that look like scores.

The process exits with status 1 when the merged model is missing or when any
benchmark fails (non-zero exit code, timeout, or launch error), so a calling
pipeline can tell a failed evaluation from a successful one.
"""

import shlex
import subprocess
import sys
from pathlib import Path


CKPT_DIR = Path("./qwen3_pipeline/checkpoint")
RESULTS_DIR = Path("./qwen3_pipeline/results")

model_path = str(CKPT_DIR / "merged")

# Per-benchmark wall-clock limit handed to subprocess.run (30 minutes).
EVAL_TIMEOUT_SECONDS = 1800

# A line of evaluator output is echoed when it contains any of these
# substrings (matched case-insensitively).
SCORE_KEYWORDS = ("pass@1", "base", "plus", "score")

BANNER = "=" * 70


def build_eval_command(model, dataset):
    """Return the ``evalplus.evaluate`` argument list for one benchmark.

    Args:
        model: Path or identifier passed to ``--model``.
        dataset: Value passed to ``--dataset`` (``"humaneval"`` or ``"mbpp"``
            in this script).

    Returns:
        A list of arguments suitable for ``subprocess.run`` without a shell.
    """
    return [
        "evalplus.evaluate",
        "--model", model,
        "--dataset", dataset,
        "--backend", "vllm",
        "--greedy",
        "--tp", "1",
    ]


def extract_score_lines(output):
    """Return the lines of *output* that mention one of ``SCORE_KEYWORDS``."""
    return [
        line
        for line in output.split("\n")
        if any(keyword in line.lower() for keyword in SCORE_KEYWORDS)
    ]


def _as_text(stream):
    """Normalise captured process output to ``str``.

    ``subprocess.TimeoutExpired`` may carry ``None`` or raw ``bytes`` for the
    partial output, so both cases are handled here.
    """
    if stream is None:
        return ""
    if isinstance(stream, bytes):
        return stream.decode("utf-8", errors="replace")
    return stream


def write_log(log_path, stdout, stderr):
    """Write captured stdout and stderr to *log_path*, separated by a marker."""
    with open(log_path, "w", encoding="utf-8") as log_file:
        log_file.write(_as_text(stdout))
        log_file.write("\n\n=== STDERR ===\n\n")
        log_file.write(_as_text(stderr))


def run_benchmark(label, dataset, log_path, model=model_path,
                  timeout=EVAL_TIMEOUT_SECONDS):
    """Run one EvalPlus benchmark and report its outcome.

    Args:
        label: Human-readable benchmark name used in console messages.
        dataset: Dataset name passed to ``evalplus.evaluate --dataset``.
        log_path: File that receives the captured stdout/stderr.
        model: Model path passed to ``--model``.
        timeout: Seconds to wait before the run is abandoned.

    Returns:
        ``True`` if the evaluator exited with status 0, ``False`` otherwise.
    """
    print(BANNER)
    print(label.upper())
    print(BANNER)

    command = build_eval_command(model, dataset)
    printable = " ".join(shlex.quote(arg) for arg in command)
    print(f"\nRunning: {printable}\n")

    try:
        # An argument list (no shell) means the timeout kills the evaluator
        # itself rather than only an intermediate shell process.
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as exc:
        # Keep whatever was produced before the deadline for debugging.
        write_log(log_path, exc.stdout, exc.stderr)
        print(f"❌ {label} timed out ({timeout // 60} min)")
        print(f"\nFull log: {log_path}")
        return False
    except OSError as exc:
        # Raised e.g. when the evalplus.evaluate executable is not on PATH.
        print(f"❌ {label} failed: {exc}")
        return False

    write_log(log_path, result.stdout, result.stderr)

    if result.returncode != 0:
        print(f"❌ {label} failed: exit code {result.returncode}")
        print(f"\nFull log: {log_path}")
        return False

    print("\n" + BANNER)
    print(f"{label.upper()} RESULTS")
    print(BANNER)

    for line in extract_score_lines(result.stdout):
        print(line)

    print(f"\nFull log: {log_path}")
    return True


def main():
    """Evaluate the merged model on both benchmarks.

    Returns:
        Process exit status: 0 when every benchmark succeeded, 1 otherwise.
    """
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)

    print(BANNER)
    print("PHASE 3: EVALUATION")
    print(BANNER)
    print(f"\nModel: {model_path}\n")

    if not (CKPT_DIR / "merged").exists():
        print(f"❌ Model not found at {model_path}")
        print(" Did Phase 2 complete successfully?")
        return 1

    benchmarks = [
        ("HumanEval+", "humaneval", RESULTS_DIR / "humaneval_plus.log"),
        ("MBPP+", "mbpp", RESULTS_DIR / "mbpp_plus.log"),
    ]

    failed = []
    for index, (label, dataset, log_path) in enumerate(benchmarks):
        if index:
            print()
        # A failure in one benchmark does not prevent the next from running.
        if not run_benchmark(label, dataset, log_path):
            failed.append(label)

    print("\n" + BANNER)
    if failed:
        print(f"❌ PHASE 3 FINISHED WITH FAILURES: {', '.join(failed)}")
    else:
        print("✓ PHASE 3 COMPLETE")
    print(BANNER)

    print(f"\nResults saved to: {RESULTS_DIR}/")
    for _, _, log_path in benchmarks:
        # A log is absent when the evaluator could not even be launched.
        if log_path.exists():
            print(f" - {log_path}")

    if failed:
        return 1

    print("\n➡️ Next: python phase4_codet.py")
    return 0


if __name__ == "__main__":
    sys.exit(main())