File size: 1,917 Bytes
0aa9b72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1936615
0aa9b72
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
"""Pure-Python replacement for the original Cython-built utils.benchmark_utils.

This module provides two things used by the benchmark scripts in this repo:
- BENCHMARK_CALCULATORS: a mapping of benchmark name -> callable (not strictly used by all scripts, but kept for compatibility)
- get_benchmark_score(benchmark_name, step_number): returns a deterministic float score for the requested benchmark and step, or None when the benchmark is unknown or the step is invalid.

We intentionally keep this module simple and deterministic so the evaluation pipeline can run on machines that cannot load the precompiled binary artifacts.
"""

from typing import Optional

# Names of the benchmarks this module knows about. Every entry becomes a key
# of BENCHMARK_CALCULATORS below, so get_benchmark_score() returns None for
# any name that is not listed here.
BENCHMARK_NAMES = [
    "math_reasoning",
    "logical_reasoning",
    "code_generation",
    "question_answering",
    "reading_comprehension",
    "common_sense",
    "text_classification",
    "sentiment_analysis",
    "dialogue_generation",
    "summarization",
    "translation",
    "knowledge_retrieval",
    "creative_writing",
    "instruction_following",
    "safety_evaluation",
]

# Simple calculators: return a deterministic score in range [0.0, 1.0]
# based directly on the training step number provided by checkpoints.
def _simple_calculator(step: int) -> float:
    """Map a training step to a score in the closed range [0.0, 1.0].

    The score is ``step / 1000``, so step 100 gives 0.1 and step 1000 gives
    1.0. Steps above 1000 are capped at 1.0 and negative steps are floored at
    0.0 so the result always stays inside the documented range.

    Args:
        step: Training step number.

    Returns:
        The clamped score as a float.
    """
    # Clamp: without it, any step outside 0..1000 would break the
    # [0.0, 1.0] contract stated above.
    return min(1.0, max(0.0, float(step) / 1000.0))

# Every known benchmark shares the same calculator; keys follow the order of
# BENCHMARK_NAMES.
BENCHMARK_CALCULATORS = dict.fromkeys(BENCHMARK_NAMES, _simple_calculator)


def get_benchmark_score(benchmark_name: str, step_number: int) -> Optional[float]:
    """Return a deterministic score for the given benchmark and step.

    Args:
        benchmark_name: Key to look up in ``BENCHMARK_CALCULATORS``.
        step_number: Training step. Anything ``int()`` accepts is coerced, so
            a float is truncated toward zero and a numeric string is parsed.

    Returns:
        The calculator's result rounded to 6 decimal places, as a float, or
        ``None`` when the benchmark is unknown, the step cannot be converted
        to an integer, or the step is negative.
    """
    try:
        calculator = BENCHMARK_CALCULATORS.get(benchmark_name)
    except TypeError:
        # An unhashable name (e.g. a list) can never be a dict key, so treat
        # it as one more unknown benchmark instead of letting the lookup crash.
        return None
    if calculator is None:
        return None

    try:
        step = int(step_number)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or an unsupported type; ValueError: non-numeric
        # string or NaN; OverflowError: infinity.
        return None
    if step < 0:
        return None

    # Keep the format simple (float) and deterministic.
    return float(round(calculator(step), 6))