#!/usr/bin/env python3 """Phase 3: Evaluation on HumanEval+ and MBPP+""" import subprocess from pathlib import Path # Config CKPT_DIR = Path("./qwen3_pipeline/checkpoint") RESULTS_DIR = Path("./qwen3_pipeline/results") RESULTS_DIR.mkdir(parents=True, exist_ok=True) model_path = str(CKPT_DIR / "merged") print("="*70) print("PHASE 3: EVALUATION") print("="*70) print(f"\nModel: {model_path}\n") # Check if model exists if not (CKPT_DIR / "merged").exists(): print(f"❌ Model not found at {model_path}") print(f" Did Phase 2 complete successfully?") exit(1) # HumanEval+ print("="*70) print("HUMANEVAL+") print("="*70) humaneval_log = RESULTS_DIR / "humaneval_plus.log" cmd_humaneval = ( f'evalplus.evaluate ' f'--model "{model_path}" ' f'--dataset humaneval ' f'--backend vllm ' f'--greedy ' f'--tp 1' ) print(f"\nRunning: {cmd_humaneval}\n") try: result = subprocess.run( cmd_humaneval, shell=True, capture_output=True, text=True, timeout=1800 # 30 min timeout ) # Save full log with open(humaneval_log, "w") as f: f.write(result.stdout) f.write("\n\n=== STDERR ===\n\n") f.write(result.stderr) # Print key results print("\n" + "="*70) print("HUMANEVAL+ RESULTS") print("="*70) for line in result.stdout.split("\n"): if any(k in line.lower() for k in ["pass@1", "base", "plus", "score"]): print(line) print(f"\nFull log: {humaneval_log}") except subprocess.TimeoutExpired: print("❌ HumanEval+ timed out (30 min)") except Exception as e: print(f"❌ HumanEval+ failed: {e}") # MBPP+ print("\n" + "="*70) print("MBPP+") print("="*70) mbpp_log = RESULTS_DIR / "mbpp_plus.log" cmd_mbpp = ( f'evalplus.evaluate ' f'--model "{model_path}" ' f'--dataset mbpp ' f'--backend vllm ' f'--greedy ' f'--tp 1' ) print(f"\nRunning: {cmd_mbpp}\n") try: result = subprocess.run( cmd_mbpp, shell=True, capture_output=True, text=True, timeout=1800 # 30 min timeout ) # Save full log with open(mbpp_log, "w") as f: f.write(result.stdout) f.write("\n\n=== STDERR ===\n\n") f.write(result.stderr) # Print key results print("\n" + "="*70) print("MBPP+ RESULTS") print("="*70) for line in result.stdout.split("\n"): if any(k in line.lower() for k in ["pass@1", "base", "plus", "score"]): print(line) print(f"\nFull log: {mbpp_log}") except subprocess.TimeoutExpired: print("❌ MBPP+ timed out (30 min)") except Exception as e: print(f"❌ MBPP+ failed: {e}") # Summary print("\n" + "="*70) print("✓ PHASE 3 COMPLETE") print("="*70) print(f"\nResults saved to: {RESULTS_DIR}/") print(f" - {humaneval_log}") print(f" - {mbpp_log}") print(f"\n➡️ Next: python phase4_codet.py")