""" Column definitions for DeathMath Leaderboard table. """ import logging from dataclasses import dataclass, make_dataclass from enum import Enum logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") def fields(raw_class): """Extract non-dunder fields from a dataclass.""" return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"] @dataclass class Task: """Represents a benchmark task with its metrics.""" benchmark: str metric: str col_name: str class Tasks(Enum): """Available benchmark tasks for DeathMath leaderboard.""" math = Task("RussianMath", "score", "math_score") physics = Task("RussianPhysics", "score", "physics_score") combined = Task("Combined", "score", "score") @dataclass(frozen=True) class ColumnContent: """Configuration for a leaderboard table column.""" name: str type: str displayed_by_default: bool hidden: bool = False never_hidden: bool = False dummy: bool = False auto_eval_column_dict = [] auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("model", "markdown", True, never_hidden=True)]) auto_eval_column_dict.append(["score", ColumnContent, ColumnContent("score", "number", True)]) for task in Tasks: if task != Tasks.combined: auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]) auto_eval_column_dict.append(["total_tokens", ColumnContent, ColumnContent("total_tokens", "number", False)]) auto_eval_column_dict.append(["evaluation_time", ColumnContent, ColumnContent("evaluation_time", "number", False)]) auto_eval_column_dict.append(["system_prompt", ColumnContent, ColumnContent("system_prompt", "str", False)]) AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)