Upload evaluation/eval.py with huggingface_hub
evaluation/eval.py  ADDED  +122 -0

@@ -0,0 +1,122 @@
+import argparse
+import os
+import sys
+import subprocess
+
+# Add utils to path
+sys.path.insert(0, os.path.dirname(__file__))
+from utils.benchmark_utils import BENCHMARK_CALCULATORS
+
+# List of all benchmark categories
+BENCHMARK_CATEGORIES = list(BENCHMARK_CALCULATORS.keys())
+
+def run_benchmark_evaluation(benchmark_name, model_path):
+    """Run evaluation for a specific benchmark category"""
+    benchmark_script = os.path.join("evaluation", "benchmarks", benchmark_name, "eval.py")
+
+    if not os.path.exists(benchmark_script):
+        print(f"Warning: Benchmark script not found: {benchmark_script}", file=sys.stderr)
+        return None
+
+    try:
+        result = subprocess.run(
+            [sys.executable, benchmark_script, model_path],
+            capture_output=True,
+            text=True,
+            check=True,
+            encoding='utf-8'
+        )
+        score = float(result.stdout.strip())
+        return score
+    except subprocess.CalledProcessError as e:
+        print(f"Error running {benchmark_name} evaluation: {e.stderr}", file=sys.stderr)
+        return None
+    except (ValueError, TypeError):
+        print(f"Warning: Could not parse score from {benchmark_name}: '{result.stdout.strip()}'", file=sys.stderr)
+        return None
+
+def calculate_overall_score(benchmark_scores):
+    """Calculate overall performance score from individual benchmarks"""
+    valid_scores = [score for score in benchmark_scores.values() if score is not None]
+    if not valid_scores:
+        return None
+
+    # Weighted average with slight emphasis on reasoning tasks
+    weights = {
+        "math_reasoning": 1.2,
+        "logical_reasoning": 1.2,
+        "code_generation": 1.1,
+        "question_answering": 1.1,
+        "reading_comprehension": 1.0,
+        "common_sense": 1.0,
+        "text_classification": 0.9,
+        "sentiment_analysis": 0.9,
+        "dialogue_generation": 1.0,
+        "summarization": 1.0,
+        "translation": 1.0,
+        "knowledge_retrieval": 1.0,
+        "creative_writing": 0.9,
+        "instruction_following": 1.1,
+        "safety_evaluation": 1.1
+    }
+
+    weighted_sum = 0
+    total_weight = 0
+
+    for benchmark, score in benchmark_scores.items():
+        if score is not None:
+            weight = weights.get(benchmark, 1.0)
+            weighted_sum += score * weight
+            total_weight += weight
+
+    return round(weighted_sum / total_weight, 3) if total_weight > 0 else None
+
+def main():
+    """
+    Run comprehensive evaluation across all benchmark categories.
+    Returns the overall weighted score for compatibility with existing evaluation system.
+    """
+    parser = argparse.ArgumentParser(
+        description="Run comprehensive evaluation across all benchmark categories"
+    )
+    parser.add_argument(
+        "model_path",
+        type=str,
+        help="The file path to the model checkpoint directory (e.g., ../checkpoints/step_100)."
+    )
+    args = parser.parse_args()
+
+    # Check if the provided path is a directory
+    if not os.path.isdir(args.model_path):
+        print(f"Error: Directory not found at '{args.model_path}'", file=sys.stderr)
+        sys.exit(1)
+
+    # Change to the repository root (parent of this script's directory) so relative benchmark paths resolve
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    original_cwd = os.getcwd()
+    os.chdir(os.path.dirname(script_dir))
+
+    benchmark_scores = {}
+
+    # Run evaluation for each benchmark category
+    for benchmark in BENCHMARK_CATEGORIES:
+        score = run_benchmark_evaluation(benchmark, args.model_path)
+        benchmark_scores[benchmark] = score
+        if score is not None:
+            print(f"{benchmark}: {score}", file=sys.stderr)
+
+    # Calculate overall score
+    overall_score = calculate_overall_score(benchmark_scores)
+
+    # Restore original working directory
+    os.chdir(original_cwd)
+
+    if overall_score is None:
+        print(f"Error: Could not calculate overall score for {args.model_path}", file=sys.stderr)
+        sys.exit(1)
+
+    # Print only the overall score for compatibility with existing evaluation pipeline
+    print(overall_score)
+
+if __name__ == "__main__":
+    main()
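
A note on the contract this aggregator relies on: each per-category script at evaluation/benchmarks/<category>/eval.py is run as a subprocess with the checkpoint path as its single argument and is expected to print one float score to stdout; float(result.stdout.strip()) parses it, and a non-zero exit code or unparseable output causes that category to be skipped. The stub below is a hypothetical sketch of that contract, not part of this commit, and its scoring logic is a placeholder.

    # evaluation/benchmarks/<category>/eval.py -- hypothetical stub, not in this commit
    import sys

    def main():
        model_path = sys.argv[1]   # checkpoint directory passed by evaluation/eval.py
        # ... load the checkpoint and run this category's benchmark (omitted) ...
        score = 0.0                # placeholder; a real script computes this value
        print(score)               # stdout must contain only the numeric score

    if __name__ == "__main__":
        main()

The overall result is a weighted mean over the categories that returned a score: for example, if only math_reasoning = 0.80 (weight 1.2) and text_classification = 0.60 (weight 0.9) succeed, the aggregator prints round((0.80 * 1.2 + 0.60 * 0.9) / (1.2 + 0.9), 3) = 0.714. The aggregator itself is invoked as python eval.py <model_path>, where <model_path> is the checkpoint directory described in the argparse help.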