"""Pure-Python replacement for the original Cython-built utils.benchmark_utils.

This module provides two things used by the benchmark scripts in this repo:

- BENCHMARK_CALCULATORS: a mapping of benchmark name -> callable (not strictly
  used by all scripts, but kept for compatibility)
- get_benchmark_score(benchmark_name, step_number): returns a deterministic
  float score for the requested benchmark and step.

We intentionally keep this module simple and deterministic so the evaluation
pipeline can run on machines that cannot load the precompiled binary artifacts.
"""
| from typing import Optional | |
# Canonical benchmark task identifiers. Order is preserved exactly as the
# original Cython module exposed it, since downstream scripts may index it.
BENCHMARK_NAMES = (
    "math_reasoning logical_reasoning code_generation question_answering "
    "reading_comprehension common_sense text_classification sentiment_analysis "
    "dialogue_generation summarization translation knowledge_retrieval "
    "creative_writing instruction_following safety_evaluation"
).split()
| # Simple calculators: return a deterministic score in range [0.0, 1.0] | |
| # based directly on the training step number provided by checkpoints. | |
| def _simple_calculator(step: int) -> float: | |
| # Normalize typical steps (100..1000) to 0.0..1.0 | |
| return float(step) / 1000.0 | |
# One calculator per benchmark name; every benchmark currently shares the
# same deterministic step-based formula, kept as a mapping for compatibility
# with scripts that look up calculators by benchmark name.
BENCHMARK_CALCULATORS = {name: _simple_calculator for name in BENCHMARK_NAMES}
def get_benchmark_score(benchmark_name: str, step_number: int) -> Optional[float]:
    """Return a deterministic score for the given benchmark and step.

    Args:
        benchmark_name: One of the names in BENCHMARK_NAMES.
        step_number: Training step; anything coercible via int().

    Returns:
        The calculator's score rounded to 6 decimal places, or None when
        the benchmark is unknown or the step is invalid or negative.
    """
    # Single lookup instead of `in` + subscript (two hash lookups).
    calculator = BENCHMARK_CALCULATORS.get(benchmark_name)
    if calculator is None:
        return None
    try:
        step = int(step_number)
    except (TypeError, ValueError):
        # Narrowed from a blanket `except Exception`: only coercion
        # failures mean "invalid step"; anything else should propagate.
        return None
    if step < 0:
        return None
    # Keep the format simple (float) and deterministic.
    return float(round(calculator(step), 6))