File size: 2,958 Bytes

289b193

#!/usr/bin/env python3
"""Phase 3: Evaluation on HumanEval+ and MBPP+"""

import subprocess
from pathlib import Path

# Config
CKPT_DIR = Path("./qwen3_pipeline/checkpoint")
RESULTS_DIR = Path("./qwen3_pipeline/results")
MODEL_DIR = CKPT_DIR / "merged"
model_path = str(MODEL_DIR)  # original module-level name, kept for compatibility

EVAL_TIMEOUT_S = 1800  # 30 min per benchmark
BANNER = "=" * 70
# A stdout line is echoed to the console when it contains any of these
# substrings (case-insensitive); everything else only goes to the log file.
RESULT_KEYWORDS = ("pass@1", "base", "plus", "score")
# (display label, evalplus --dataset value, log file name)
BENCHMARKS = (
    ("HumanEval+", "humaneval", "humaneval_plus.log"),
    ("MBPP+", "mbpp", "mbpp_plus.log"),
)


def print_banner(title: str) -> None:
    """Print *title* framed by two separator rules."""
    print(BANNER)
    print(title)
    print(BANNER)


def build_eval_command(model: str, dataset: str) -> list:
    """Return the evalplus.evaluate argv for *model* on *dataset*.

    The command is an argument list rather than a shell string, so a model
    path containing spaces, quotes or shell metacharacters is passed through
    verbatim instead of being re-parsed by a shell.
    """
    return [
        "evalplus.evaluate",
        "--model", model,
        "--dataset", dataset,
        "--backend", "vllm",
        "--greedy",
        "--tp", "1",
    ]


def extract_key_lines(output: str) -> list:
    """Return the lines of *output* that mention one of RESULT_KEYWORDS."""
    return [
        line
        for line in output.splitlines()
        if any(keyword in line.lower() for keyword in RESULT_KEYWORDS)
    ]


def coerce_text(stream) -> str:
    """Normalize captured process output to ``str``.

    ``subprocess.TimeoutExpired`` exposes whatever was captured as bytes (or
    None when nothing was read), even when ``text=True`` was requested.
    """
    if stream is None:
        return ""
    if isinstance(stream, bytes):
        return stream.decode("utf-8", errors="replace")
    return stream


def write_log(log_path: Path, stdout: str, stderr: str) -> None:
    """Write stdout and stderr to *log_path*, separated by a STDERR marker."""
    with open(log_path, "w", encoding="utf-8") as f:
        f.write(stdout)
        f.write("\n\n=== STDERR ===\n\n")
        f.write(stderr)


def run_benchmark(
    label: str,
    dataset: str,
    log_path: Path,
    model: str = model_path,
    timeout: int = EVAL_TIMEOUT_S,
) -> bool:
    """Run one evalplus benchmark, save its log and print the key lines.

    Args:
        label: Name used in banners and messages (e.g. "HumanEval+").
        dataset: Value passed to ``--dataset``.
        log_path: File that receives the captured stdout and stderr.
        model: Value passed to ``--model``.
        timeout: Seconds to wait before the child process is killed.

    Returns:
        True only when the process ran to completion with exit code 0.
    """
    print_banner(label.upper())

    cmd = build_eval_command(model, dataset)
    print(f"\nRunning: {' '.join(cmd)}\n")

    timed_out = False
    returncode = None
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        stdout, stderr, returncode = proc.stdout, proc.stderr, proc.returncode
    except subprocess.TimeoutExpired as exc:
        # Keep whatever was produced before the kill; it is usually the only
        # clue as to where the run got stuck.
        timed_out = True
        stdout, stderr = coerce_text(exc.stdout), coerce_text(exc.stderr)
    except OSError as exc:
        # Without a shell, a missing or non-executable evalplus.evaluate
        # surfaces here instead of as exit status 127.
        print(f"❌ {label} failed: {exc}")
        return False

    try:
        write_log(log_path, stdout, stderr)
    except OSError as exc:
        # Best effort: a log that cannot be written must not hide the results.
        print(f"⚠️  Could not write {log_path}: {exc}")

    if timed_out:
        print(f"❌ {label} timed out ({timeout // 60} min)")
        print(f"   Partial log: {log_path}")
        return False

    print()
    print_banner(f"{label.upper()} RESULTS")
    for line in extract_key_lines(stdout):
        print(line)
    print(f"\nFull log: {log_path}")

    if returncode != 0:
        print(f"❌ {label} exited with code {returncode}")
        return False
    return True


def main(model_dir: Path = MODEL_DIR, results_dir: Path = RESULTS_DIR) -> int:
    """Evaluate the merged checkpoint on every benchmark in BENCHMARKS.

    Args:
        model_dir: Directory holding the merged model.
        results_dir: Directory that receives one log file per benchmark;
            created if missing.

    Returns:
        Process exit status: 0 when every benchmark succeeded, 1 when the
        model is missing or any benchmark failed or timed out.
    """
    print_banner("PHASE 3: EVALUATION")
    print(f"\nModel: {model_dir}\n")

    # Check if model exists
    if not model_dir.exists():
        print(f"❌ Model not found at {model_dir}")
        print("   Did Phase 2 complete successfully?")
        return 1

    results_dir.mkdir(parents=True, exist_ok=True)

    log_paths = []
    failed = []
    for label, dataset, log_name in BENCHMARKS:
        log_path = results_dir / log_name
        log_paths.append(log_path)
        if not run_benchmark(label, dataset, log_path, model=str(model_dir)):
            failed.append(label)
        print()

    # Summary
    if failed:
        print_banner("❌ PHASE 3 FINISHED WITH FAILURES")
        print(f"\nFailed: {', '.join(failed)}")
    else:
        print_banner("✓ PHASE 3 COMPLETE")
    print(f"\nResults saved to: {results_dir}/")
    for log_path in log_paths:
        print(f"  - {log_path}")

    if failed:
        return 1
    print("\n➡️  Next: python phase4_codet.py")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())