dongbobo committed on
Commit
0a2f402
·
verified ·
1 Parent(s): f22a2af

Upload evaluation/eval.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. evaluation/eval.py +122 -0
evaluation/eval.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import sys
4
+ import subprocess
5
+
6
+ # Add utils to path
7
+ sys.path.insert(0, os.path.dirname(__file__))
8
+ from utils.benchmark_utils import BENCHMARK_CALCULATORS
9
+
10
+ # List of all benchmark categories
11
+ BENCHMARK_CATEGORIES = list(BENCHMARK_CALCULATORS.keys())
12
+
13
+ def run_benchmark_evaluation(benchmark_name, model_path):
14
+ """Run evaluation for a specific benchmark category"""
15
+ benchmark_script = os.path.join("evaluation", "benchmarks", benchmark_name, "eval.py")
16
+
17
+ if not os.path.exists(benchmark_script):
18
+ print(f"Warning: Benchmark script not found: {benchmark_script}", file=sys.stderr)
19
+ return None
20
+
21
+ try:
22
+ result = subprocess.run(
23
+ [sys.executable, benchmark_script, model_path],
24
+ capture_output=True,
25
+ text=True,
26
+ check=True,
27
+ encoding='utf-8'
28
+ )
29
+ score = float(result.stdout.strip())
30
+ return score
31
+ except subprocess.CalledProcessError as e:
32
+ print(f"Error running {benchmark_name} evaluation: {e.stderr}", file=sys.stderr)
33
+ return None
34
+ except (ValueError, TypeError):
35
+ print(f"Warning: Could not parse score from {benchmark_name}: '{result.stdout.strip()}'", file=sys.stderr)
36
+ return None
37
+
38
+ def calculate_overall_score(benchmark_scores):
39
+ """Calculate overall performance score from individual benchmarks"""
40
+ valid_scores = [score for score in benchmark_scores.values() if score is not None]
41
+ if not valid_scores:
42
+ return None
43
+
44
+ # Weighted average with slight emphasis on reasoning tasks
45
+ weights = {
46
+ "math_reasoning": 1.2,
47
+ "logical_reasoning": 1.2,
48
+ "code_generation": 1.1,
49
+ "question_answering": 1.1,
50
+ "reading_comprehension": 1.0,
51
+ "common_sense": 1.0,
52
+ "text_classification": 0.9,
53
+ "sentiment_analysis": 0.9,
54
+ "dialogue_generation": 1.0,
55
+ "summarization": 1.0,
56
+ "translation": 1.0,
57
+ "knowledge_retrieval": 1.0,
58
+ "creative_writing": 0.9,
59
+ "instruction_following": 1.1,
60
+ "safety_evaluation": 1.1
61
+ }
62
+
63
+ weighted_sum = 0
64
+ total_weight = 0
65
+
66
+ for benchmark, score in benchmark_scores.items():
67
+ if score is not None:
68
+ weight = weights.get(benchmark, 1.0)
69
+ weighted_sum += score * weight
70
+ total_weight += weight
71
+
72
+ return round(weighted_sum / total_weight, 3) if total_weight > 0 else None
73
+
74
+ def main():
75
+ """
76
+ Run comprehensive evaluation across all benchmark categories.
77
+ Returns the overall weighted score for compatibility with existing evaluation system.
78
+ """
79
+ parser = argparse.ArgumentParser(
80
+ description="Run comprehensive evaluation across all benchmark categories"
81
+ )
82
+ parser.add_argument(
83
+ "model_path",
84
+ type=str,
85
+ help="The file path to the model checkpoint directory (e.g., ../checkpoints/step_100)."
86
+ )
87
+ args = parser.parse_args()
88
+
89
+ # Check if the provided path is a directory
90
+ if not os.path.isdir(args.model_path):
91
+ print(f"Error: Directory not found at '{args.model_path}'", file=sys.stderr)
92
+ sys.exit(1)
93
+
94
+ # Change to the directory containing the evaluation scripts
95
+ script_dir = os.path.dirname(os.path.abspath(__file__))
96
+ original_cwd = os.getcwd()
97
+ os.chdir(os.path.dirname(script_dir))
98
+
99
+ benchmark_scores = {}
100
+
101
+ # Run evaluation for each benchmark category
102
+ for benchmark in BENCHMARK_CATEGORIES:
103
+ score = run_benchmark_evaluation(benchmark, args.model_path)
104
+ benchmark_scores[benchmark] = score
105
+ if score is not None:
106
+ print(f"{benchmark}: {score}", file=sys.stderr)
107
+
108
+ # Calculate overall score
109
+ overall_score = calculate_overall_score(benchmark_scores)
110
+
111
+ # Restore original working directory
112
+ os.chdir(original_cwd)
113
+
114
+ if overall_score is None:
115
+ print(f"Error: Could not calculate overall score for {args.model_path}", file=sys.stderr)
116
+ sys.exit(1)
117
+
118
+ # Print only the overall score for compatibility with existing evaluation pipeline
119
+ print(overall_score)
120
+
121
+ if __name__ == "__main__":
122
+ main()