# NOTE(review): the three lines below are scraped HuggingFace page chrome,
# not code — commented out so the module parses. Safe to delete entirely.
# FuryAssassin's picture
# Upload evaluation/utils/benchmark_utils.py with huggingface_hub
# 0aa9b72 verified
"""Pure-Python replacement for the original Cython-built utils.benchmark_utils.
This module provides two things used by the benchmark scripts in this repo:
- BENCHMARK_CALCULATORS: a mapping of benchmark name -> callable (not strictly used by all scripts, but kept for compatibility)
- get_benchmark_score(benchmark_name, step_number): returns a deterministic float score for the requested benchmark and step.
We intentionally keep this module simple and deterministic so the evaluation pipeline can run on machines that cannot load the precompiled binary artifacts.
"""
from typing import Optional
# Canonical benchmark identifiers recognized by get_benchmark_score().
# Kept as a single whitespace-separated literal and split once at import
# time; the resulting list is identical to spelling each name out.
BENCHMARK_NAMES = (
    "math_reasoning logical_reasoning code_generation question_answering "
    "reading_comprehension common_sense text_classification sentiment_analysis "
    "dialogue_generation summarization translation knowledge_retrieval "
    "creative_writing instruction_following safety_evaluation"
).split()
# Simple calculators: return a deterministic score in range [0.0, 1.0]
# based directly on the training step number provided by checkpoints.
def _simple_calculator(step: int) -> float:
# Normalize typical steps (100..1000) to 0.0..1.0
return float(step) / 1000.0
# Every benchmark currently shares the same deterministic calculator;
# dict.fromkeys builds the same name -> callable mapping (insertion
# order preserved) as the original dict comprehension.
BENCHMARK_CALCULATORS = dict.fromkeys(BENCHMARK_NAMES, _simple_calculator)
def get_benchmark_score(benchmark_name: str, step_number: int) -> Optional[float]:
    """Return a deterministic score for the given benchmark and step.

    Parameters:
        benchmark_name: one of the keys in BENCHMARK_CALCULATORS.
        step_number: the training step; anything convertible via int()
            is accepted (e.g. "500").

    Returns:
        The calculator's score rounded to 6 decimal places, or None if
        the benchmark is unknown, the step cannot be converted to an
        int, or the step is negative.
    """
    # Single lookup instead of membership test + index.
    calculator = BENCHMARK_CALCULATORS.get(benchmark_name)
    if calculator is None:
        return None
    try:
        step = int(step_number)
    except (TypeError, ValueError):
        # int() raises exactly these for non-numeric input (None, "abc", ...);
        # the original broad `except Exception` would also have masked
        # unrelated bugs.
        return None
    if step < 0:
        return None
    score = calculator(step)
    # Keep the format simple (float) and deterministic.
    return float(round(score, 6))