File size: 4,162 Bytes
0a2f402
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import argparse
import os
import sys
import subprocess

# Make the sibling `utils` package importable regardless of the caller's
# working directory by prepending this script's directory to sys.path.
sys.path.insert(0, os.path.dirname(__file__))
from utils.benchmark_utils import BENCHMARK_CALCULATORS

# All benchmark categories, derived from the calculator registry so the two
# stay in sync automatically.
BENCHMARK_CATEGORIES = list(BENCHMARK_CALCULATORS.keys())

def run_benchmark_evaluation(benchmark_name, model_path):
    """Run the eval script for one benchmark category and return its score.

    Args:
        benchmark_name: Benchmark key; maps to the subdirectory
            evaluation/benchmarks/<benchmark_name>/ containing an eval.py.
        model_path: Path to the model checkpoint, forwarded as the single
            argument to the eval script.

    Returns:
        The score parsed from the script's stdout as a float, or None when
        the script is missing, exits non-zero, or prints something that does
        not parse as a float.
    """
    benchmark_script = os.path.join("evaluation", "benchmarks", benchmark_name, "eval.py")

    if not os.path.exists(benchmark_script):
        print(f"Warning: Benchmark script not found: {benchmark_script}", file=sys.stderr)
        return None

    # Keep the try body limited to the subprocess call so that a failure
    # here cannot be confused with a parse failure below.
    try:
        result = subprocess.run(
            [sys.executable, benchmark_script, model_path],
            capture_output=True,
            text=True,
            check=True,
            encoding='utf-8'
        )
    except subprocess.CalledProcessError as e:
        print(f"Error running {benchmark_name} evaluation: {e.stderr}", file=sys.stderr)
        return None

    # Parse in a separate try block: in the original code this handler could
    # reference `result` while it was still unbound (if subprocess.run itself
    # raised TypeError), turning the real error into an UnboundLocalError.
    try:
        return float(result.stdout.strip())
    except (ValueError, TypeError):
        print(f"Warning: Could not parse score from {benchmark_name}: '{result.stdout.strip()}'", file=sys.stderr)
        return None

def calculate_overall_score(benchmark_scores):
    """Compute a weighted average over the available benchmark scores.

    Args:
        benchmark_scores: Mapping of benchmark name -> float score or None.
            None entries (failed/missing benchmarks) are skipped.

    Returns:
        The weighted mean rounded to 3 decimal places, or None when no
        benchmark produced a score.
    """
    # Weighted average with slight emphasis on reasoning tasks; any benchmark
    # not listed here falls back to weight 1.0.
    weights = {
        "math_reasoning": 1.2,
        "logical_reasoning": 1.2,
        "code_generation": 1.1,
        "question_answering": 1.1,
        "reading_comprehension": 1.0,
        "common_sense": 1.0,
        "text_classification": 0.9,
        "sentiment_analysis": 0.9,
        "dialogue_generation": 1.0,
        "summarization": 1.0,
        "translation": 1.0,
        "knowledge_retrieval": 1.0,
        "creative_writing": 0.9,
        "instruction_following": 1.1,
        "safety_evaluation": 1.1
    }

    weighted_sum = 0.0
    total_weight = 0.0

    for benchmark, score in benchmark_scores.items():
        if score is None:
            continue
        weight = weights.get(benchmark, 1.0)
        weighted_sum += score * weight
        total_weight += weight

    # total_weight == 0 exactly when every score was None, so the original
    # separate `valid_scores` pre-scan was redundant and has been dropped.
    return round(weighted_sum / total_weight, 3) if total_weight > 0 else None

def main():
    """
    Run comprehensive evaluation across all benchmark categories.

    Prints the overall weighted score to stdout for compatibility with the
    existing evaluation pipeline; per-benchmark scores and diagnostics go to
    stderr. Exits with status 1 on a missing model directory or when no
    benchmark produced a score.
    """
    parser = argparse.ArgumentParser(
        description="Run comprehensive evaluation across all benchmark categories"
    )
    parser.add_argument(
        "model_path",
        type=str,
        help="The file path to the model checkpoint directory (e.g., ../checkpoints/step_100)."
    )
    args = parser.parse_args()

    # Check if the provided path is a directory
    if not os.path.isdir(args.model_path):
        print(f"Error: Directory not found at '{args.model_path}'", file=sys.stderr)
        sys.exit(1)

    # Resolve the model path BEFORE changing directory: a relative path such
    # as ../checkpoints/step_100 (the documented example) would otherwise be
    # re-resolved against the new working directory and no longer point at
    # the checkpoint when handed to the benchmark subprocesses.
    model_path = os.path.abspath(args.model_path)

    # Change to the directory containing the evaluation scripts so the
    # relative evaluation/benchmarks/... paths resolve correctly.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    original_cwd = os.getcwd()
    os.chdir(os.path.dirname(script_dir))

    benchmark_scores = {}
    try:
        # Run evaluation for each benchmark category
        for benchmark in BENCHMARK_CATEGORIES:
            score = run_benchmark_evaluation(benchmark, model_path)
            benchmark_scores[benchmark] = score
            if score is not None:
                print(f"{benchmark}: {score}", file=sys.stderr)

        overall_score = calculate_overall_score(benchmark_scores)
    finally:
        # Always restore the caller's working directory, even if a benchmark
        # run raises (the original only restored it on the success path).
        os.chdir(original_cwd)

    if overall_score is None:
        print(f"Error: Could not calculate overall score for {args.model_path}", file=sys.stderr)
        sys.exit(1)

    # Print only the overall score for compatibility with existing evaluation pipeline
    print(overall_score)

# Script entry point; all argument parsing and orchestration live in main().
if __name__ == "__main__":
    main()