File size: 2,958 Bytes
289b193
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python3
"""Phase 3: Evaluation on HumanEval+ and MBPP+"""

import shlex
import subprocess
import sys
from pathlib import Path

# Config
# The model to evaluate is read from CKPT_DIR / "merged"; evaluation logs are
# written under RESULTS_DIR, which is created up front if it is missing.
CKPT_DIR = Path("./qwen3_pipeline/checkpoint")
RESULTS_DIR = Path("./qwen3_pipeline/results")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

model_path = str(CKPT_DIR / "merged")

print("=" * 70)
print("PHASE 3: EVALUATION")
print("=" * 70)
print(f"\nModel: {model_path}\n")

# Check if model exists; without it there is nothing to evaluate, so stop
# here with a non-zero exit status.
if not (CKPT_DIR / "merged").exists():
    print(f"❌ Model not found at {model_path}")
    print("   Did Phase 2 complete successfully?")
    # sys.exit instead of the bare exit(): the latter is an interactive
    # helper added by the `site` module and is not guaranteed to exist
    # (e.g. `python -S`, frozen/embedded interpreters).
    sys.exit(1)

# Benchmarks: HumanEval+ and MBPP+
EVAL_TIMEOUT_S = 1800  # per-benchmark wall-clock limit (30 min)
# Substrings (matched case-insensitively) that mark a stdout line as a
# result line worth echoing to the console.
RESULT_KEYWORDS = ("pass@1", "base", "plus", "score")


def _as_text(captured) -> str:
    """Normalize captured subprocess output to ``str``.

    ``TimeoutExpired.stdout`` / ``.stderr`` may be ``None`` or ``bytes`` even
    when the child was started in text mode, so both cases are handled.
    """
    if captured is None:
        return ""
    if isinstance(captured, bytes):
        return captured.decode("utf-8", errors="replace")
    return captured


def _run_evalplus(label: str, dataset: str, model: str, log_path: Path) -> bool:
    """Run ``evalplus.evaluate`` on one dataset, log its output, print results.

    Whatever output was captured (complete, partial after a timeout, or the
    launch error text) is written to ``log_path``. On success the stdout
    lines containing one of ``RESULT_KEYWORDS`` are echoed to the console.

    Args:
        label: Human-readable benchmark name used in console messages.
        dataset: Value passed to ``--dataset``.
        model: Value passed to ``--model``.
        log_path: File that receives the captured stdout and stderr.

    Returns:
        True only if the evaluator ran to completion with exit status 0 and
        its log could be written; False on timeout, launch failure, non-zero
        exit status, or log-write failure.
    """
    # Argument list + no shell: the model path reaches the child verbatim,
    # whatever characters (spaces, quotes, `$`) it contains.
    args = [
        "evalplus.evaluate",
        "--model", model,
        "--dataset", dataset,
        "--backend", "vllm",
        "--greedy",
        "--tp", "1",
    ]
    shown = " ".join(shlex.quote(arg) for arg in args)
    print(f"\nRunning: {shown}\n")

    error = None
    stdout = stderr = ""
    try:
        proc = subprocess.run(
            args,
            capture_output=True,
            text=True,
            errors="replace",  # a stray undecodable byte must not abort the run
            timeout=EVAL_TIMEOUT_S,
        )
    except subprocess.TimeoutExpired as exc:
        # Keep whatever was produced before the deadline for the log.
        stdout, stderr = _as_text(exc.stdout), _as_text(exc.stderr)
        error = f"timed out ({EVAL_TIMEOUT_S // 60} min)"
    except (OSError, subprocess.SubprocessError) as exc:
        # E.g. the evalplus.evaluate executable is not on PATH.
        stderr = str(exc)
        error = f"failed: {exc}"
    else:
        stdout, stderr = proc.stdout, proc.stderr
        if proc.returncode != 0:
            error = f"failed: exit code {proc.returncode}"

    # Save full log
    log_written = True
    try:
        with open(log_path, "w", encoding="utf-8") as f:
            f.write(stdout)
            f.write("\n\n=== STDERR ===\n\n")
            f.write(stderr)
    except OSError as exc:
        log_written = False
        if error is None:
            error = f"failed: could not write {log_path}: {exc}"
        else:
            print(f"⚠️  Could not write {log_path}: {exc}")

    if error is not None:
        print(f"❌ {label} {error}")
    else:
        # Print key results
        print("\n" + "=" * 70)
        print(f"{label.upper()} RESULTS")
        print("=" * 70)

        for line in stdout.splitlines():
            lowered = line.lower()
            if any(keyword in lowered for keyword in RESULT_KEYWORDS):
                print(line)

    if log_written:
        print(f"\nFull log: {log_path}")

    return error is None


humaneval_log = RESULTS_DIR / "humaneval_plus.log"
mbpp_log = RESULTS_DIR / "mbpp_plus.log"

# HumanEval+
print("=" * 70)
print("HUMANEVAL+")
print("=" * 70)
humaneval_ok = _run_evalplus("HumanEval+", "humaneval", model_path, humaneval_log)

# MBPP+ (runs even if HumanEval+ failed; the two benchmarks are independent)
print("\n" + "=" * 70)
print("MBPP+")
print("=" * 70)
mbpp_ok = _run_evalplus("MBPP+", "mbpp", model_path, mbpp_log)

# Summary
failed = [
    name
    for name, ok in (("HumanEval+", humaneval_ok), ("MBPP+", mbpp_ok))
    if not ok
]

print("\n" + "=" * 70)
if failed:
    print(f"❌ PHASE 3 FINISHED WITH FAILURES: {', '.join(failed)}")
else:
    print("✓ PHASE 3 COMPLETE")
print("=" * 70)
print(f"\nResults saved to: {RESULTS_DIR}/")
for log_file in (humaneval_log, mbpp_log):
    # Only advertise logs that are actually on disk.
    if log_file.exists():
        print(f"  - {log_file}")

if failed:
    # Report failure through the exit status as well, matching the non-zero
    # exit used when the model directory is missing.
    sys.exit(1)

print("\n➡️  Next: python phase4_codet.py")