"""Run every benchmark category against a model checkpoint and print a single
weighted overall score. Per-benchmark results are reported on stderr."""
import argparse
import os
import subprocess
import sys

# Make the sibling `utils` package importable when this script is run directly.
sys.path.insert(0, os.path.dirname(__file__))
from utils.benchmark_utils import BENCHMARK_CALCULATORS

BENCHMARK_CATEGORIES = list(BENCHMARK_CALCULATORS.keys())

def run_benchmark_evaluation(benchmark_name, model_path):
    """Run evaluation for a specific benchmark category."""
    benchmark_script = os.path.join("evaluation", "benchmarks", benchmark_name, "eval.py")

    if not os.path.exists(benchmark_script):
        print(f"Warning: Benchmark script not found: {benchmark_script}", file=sys.stderr)
        return None

    try:
        result = subprocess.run(
            [sys.executable, benchmark_script, model_path],
            capture_output=True,
            text=True,
            check=True,
            encoding='utf-8'
        )
        # Each benchmark script is expected to print a single numeric score on stdout.
        score = float(result.stdout.strip())
        return score
    except subprocess.CalledProcessError as e:
        print(f"Error running {benchmark_name} evaluation: {e.stderr}", file=sys.stderr)
        return None
    except (ValueError, TypeError):
        print(f"Warning: Could not parse score from {benchmark_name}: '{result.stdout.strip()}'", file=sys.stderr)
        return None

def calculate_overall_score(benchmark_scores):
    """Calculate overall performance score from individual benchmarks."""
    valid_scores = [score for score in benchmark_scores.values() if score is not None]
    if not valid_scores:
        return None

    # Relative importance of each category; benchmarks missing from this map
    # default to a weight of 1.0.
    weights = {
        "math_reasoning": 1.2,
        "logical_reasoning": 1.2,
        "code_generation": 1.1,
        "question_answering": 1.1,
        "reading_comprehension": 1.0,
        "common_sense": 1.0,
        "text_classification": 0.9,
        "sentiment_analysis": 0.9,
        "dialogue_generation": 1.0,
        "summarization": 1.0,
        "translation": 1.0,
        "knowledge_retrieval": 1.0,
        "creative_writing": 0.9,
        "instruction_following": 1.1,
        "safety_evaluation": 1.1,
    }

    # Weighted average over the benchmarks that produced a score:
    # sum(score_i * weight_i) / sum(weight_i).
    weighted_sum = 0
    total_weight = 0

    for benchmark, score in benchmark_scores.items():
        if score is not None:
            weight = weights.get(benchmark, 1.0)
            weighted_sum += score * weight
            total_weight += weight

    return round(weighted_sum / total_weight, 3) if total_weight > 0 else None

def main():
    """
    Run comprehensive evaluation across all benchmark categories.
    Prints the overall weighted score to stdout for compatibility with the
    existing evaluation system.
    """
    parser = argparse.ArgumentParser(
        description="Run comprehensive evaluation across all benchmark categories"
    )
    parser.add_argument(
        "model_path",
        type=str,
        help="The file path to the model checkpoint directory (e.g., ../checkpoints/step_100)."
    )
    args = parser.parse_args()

    if not os.path.isdir(args.model_path):
        print(f"Error: Directory not found at '{args.model_path}'", file=sys.stderr)
        sys.exit(1)

    # Resolve the checkpoint path before changing directories so a relative
    # path (e.g., ../checkpoints/step_100) still points at the right place
    # when handed to the benchmark subprocesses.
    args.model_path = os.path.abspath(args.model_path)

    # Benchmark scripts are located relative to the repository root, so run
    # from the parent of this script's directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    original_cwd = os.getcwd()
    os.chdir(os.path.dirname(script_dir))

    benchmark_scores = {}

    for benchmark in BENCHMARK_CATEGORIES:
        score = run_benchmark_evaluation(benchmark, args.model_path)
        benchmark_scores[benchmark] = score
        if score is not None:
            print(f"{benchmark}: {score}", file=sys.stderr)

    overall_score = calculate_overall_score(benchmark_scores)

    os.chdir(original_cwd)

    if overall_score is None:
        print(f"Error: Could not calculate overall score for {args.model_path}", file=sys.stderr)
        sys.exit(1)

    print(overall_score)


if __name__ == "__main__":
    main()
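
# Typical invocation (the script and checkpoint names below are illustrative):
#   python run_all_benchmarks.py ../checkpoints/step_100
# Per-benchmark scores are logged to stderr; only the weighted overall score
# is printed on stdout so callers can capture it directly.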