File size: 4,162 Bytes
0a2f402
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import argparse
import os
import sys
import subprocess

# Make the sibling `utils` package importable regardless of the caller's
# working directory by prepending this script's directory to sys.path.
sys.path.insert(0, os.path.dirname(__file__))
from utils.benchmark_utils import BENCHMARK_CALCULATORS

# All benchmark categories, derived from the calculator registry so the two
# stay in sync automatically.
BENCHMARK_CATEGORIES = list(BENCHMARK_CALCULATORS.keys())

def run_benchmark_evaluation(benchmark_name, model_path):
    """Run the eval script for one benchmark category and return its score.

    Args:
        benchmark_name: Benchmark key; maps to the subdirectory
            evaluation/benchmarks/<benchmark_name>/ containing an eval.py.
        model_path: Path to the model checkpoint, forwarded as the single
            argument to the eval script.

    Returns:
        The score parsed from the script's stdout as a float, or None when
        the script is missing, exits non-zero, or prints something that does
        not parse as a float.
    """
    benchmark_script = os.path.join("evaluation", "benchmarks", benchmark_name, "eval.py")

    if not os.path.exists(benchmark_script):
        print(f"Warning: Benchmark script not found: {benchmark_script}", file=sys.stderr)
        return None

    # Keep the try body limited to the subprocess call so that a failure
    # here cannot be confused with a parse failure below.
    try:
        result = subprocess.run(
            [sys.executable, benchmark_script, model_path],
            capture_output=True,
            text=True,
            check=True,
            encoding='utf-8'
        )
    except subprocess.CalledProcessError as e:
        print(f"Error running {benchmark_name} evaluation: {e.stderr}", file=sys.stderr)
        return None

    # Parse in a separate try block: in the original code this handler could
    # reference `result` while it was still unbound (if subprocess.run itself
    # raised TypeError), turning the real error into an UnboundLocalError.
    try:
        return float(result.stdout.strip())
    except (ValueError, TypeError):
        print(f"Warning: Could not parse score from {benchmark_name}: '{result.stdout.strip()}'", file=sys.stderr)
        return None

def calculate_overall_score(benchmark_scores):
    """Compute a weighted average over the available benchmark scores.

    Args:
        benchmark_scores: Mapping of benchmark name -> float score or None.
            None entries (failed/missing benchmarks) are skipped.

    Returns:
        The weighted mean rounded to 3 decimal places, or None when no
        benchmark produced a score.
    """
    # Weighted average with slight emphasis on reasoning tasks; any benchmark
    # not listed here falls back to weight 1.0.
    weights = {
        "math_reasoning": 1.2,
        "logical_reasoning": 1.2,
        "code_generation": 1.1,
        "question_answering": 1.1,
        "reading_comprehension": 1.0,
        "common_sense": 1.0,
        "text_classification": 0.9,
        "sentiment_analysis": 0.9,
        "dialogue_generation": 1.0,
        "summarization": 1.0,
        "translation": 1.0,
        "knowledge_retrieval": 1.0,
        "creative_writing": 0.9,
        "instruction_following": 1.1,
        "safety_evaluation": 1.1
    }

    weighted_sum = 0.0
    total_weight = 0.0

    for benchmark, score in benchmark_scores.items():
        if score is None:
            continue
        weight = weights.get(benchmark, 1.0)
        weighted_sum += score * weight
        total_weight += weight

    # total_weight == 0 exactly when every score was None, so the original
    # separate `valid_scores` pre-scan was redundant and has been dropped.
    return round(weighted_sum / total_weight, 3) if total_weight > 0 else None

def main():
    """
    Run comprehensive evaluation across all benchmark categories.

    Prints the overall weighted score to stdout for compatibility with the
    existing evaluation pipeline; per-benchmark scores and diagnostics go to
    stderr. Exits with status 1 on a missing model directory or when no
    benchmark produced a score.
    """
    parser = argparse.ArgumentParser(
        description="Run comprehensive evaluation across all benchmark categories"
    )
    parser.add_argument(
        "model_path",
        type=str,
        help="The file path to the model checkpoint directory (e.g., ../checkpoints/step_100)."
    )
    args = parser.parse_args()

    # Check if the provided path is a directory
    if not os.path.isdir(args.model_path):
        print(f"Error: Directory not found at '{args.model_path}'", file=sys.stderr)
        sys.exit(1)

    # Resolve the model path BEFORE changing directory: a relative path such
    # as ../checkpoints/step_100 (the documented example) would otherwise be
    # re-resolved against the new working directory and no longer point at
    # the checkpoint when handed to the benchmark subprocesses.
    model_path = os.path.abspath(args.model_path)

    # Change to the directory containing the evaluation scripts so the
    # relative evaluation/benchmarks/... paths resolve correctly.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    original_cwd = os.getcwd()
    os.chdir(os.path.dirname(script_dir))

    benchmark_scores = {}
    try:
        # Run evaluation for each benchmark category
        for benchmark in BENCHMARK_CATEGORIES:
            score = run_benchmark_evaluation(benchmark, model_path)
            benchmark_scores[benchmark] = score
            if score is not None:
                print(f"{benchmark}: {score}", file=sys.stderr)

        overall_score = calculate_overall_score(benchmark_scores)
    finally:
        # Always restore the caller's working directory, even if a benchmark
        # run raises (the original only restored it on the success path).
        os.chdir(original_cwd)

    if overall_score is None:
        print(f"Error: Could not calculate overall score for {args.model_path}", file=sys.stderr)
        sys.exit(1)

    # Print only the overall score for compatibility with existing evaluation pipeline
    print(overall_score)

# Script entry point; all argument parsing and orchestration live in main().
if __name__ == "__main__":
    main()