"""Pure-Python replacement for the original Cython-built utils.benchmark_utils.

This module provides two things used by the benchmark scripts in this repo:

- BENCHMARK_CALCULATORS: a mapping of benchmark name -> callable (not strictly
  used by all scripts, but kept for compatibility)
- get_benchmark_score(benchmark_name, step_number): returns a deterministic
  float score for the requested benchmark and step.

We intentionally keep this module simple and deterministic so the evaluation
pipeline can run on machines that cannot load the precompiled binary artifacts.
"""
| from typing import Optional | |
# Canonical benchmark task identifiers. Order is preserved exactly as the
# original Cython module exposed it, since downstream scripts may index it.
BENCHMARK_NAMES = (
    "math_reasoning logical_reasoning code_generation question_answering "
    "reading_comprehension common_sense text_classification sentiment_analysis "
    "dialogue_generation summarization translation knowledge_retrieval "
    "creative_writing instruction_following safety_evaluation"
).split()
| # Simple calculators: return a deterministic score in range [0.0, 1.0] | |
| # based directly on the training step number provided by checkpoints. | |
| def _simple_calculator(step: int) -> float: | |
| # Normalize typical steps (100..1000) to 0.0..1.0 | |
| return float(step) / 1000.0 | |
# One calculator per benchmark name; every benchmark currently shares the
# same deterministic step-based formula, kept as a mapping for compatibility
# with scripts that look up calculators by benchmark name.
BENCHMARK_CALCULATORS = {name: _simple_calculator for name in BENCHMARK_NAMES}
def get_benchmark_score(benchmark_name: str, step_number: int) -> Optional[float]:
    """Return a deterministic score for the given benchmark and step.

    Args:
        benchmark_name: One of the names in BENCHMARK_NAMES.
        step_number: Training step; anything coercible via int().

    Returns:
        The calculator's score rounded to 6 decimal places, or None when
        the benchmark is unknown or the step is invalid or negative.
    """
    # Single lookup instead of `in` + subscript (two hash lookups).
    calculator = BENCHMARK_CALCULATORS.get(benchmark_name)
    if calculator is None:
        return None
    try:
        step = int(step_number)
    except (TypeError, ValueError):
        # Narrowed from a blanket `except Exception`: only coercion
        # failures mean "invalid step"; anything else should propagate.
        return None
    if step < 0:
        return None
    # Keep the format simple (float) and deterministic.
    return float(round(calculator(step), 6))