# Ray0202
# update 02.17.2026
# 949c705
from datetime import datetime, timezone
import json
import os
from typing import Optional
import gradio as gr
import pandas as pd
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.envs import RESULTS_PATH, SUBMISSIONS_PATH
from src.leaderboard.load_results import (
ResultsValidationError,
build_dataframe,
load_records,
validate_records,
)
from src.leaderboard.schema import SCHEMA
def load_leaderboard_data() -> tuple[pd.DataFrame, list[str], Optional[str]]:
    """Load results and build the leaderboard dataframe.

    Returns a ``(dataframe, column_order, error)`` triple. On success
    ``error`` is ``None``; when validation fails the dataframe is empty
    (schema identity + required-metric columns only) and ``error`` carries
    the message shown as a banner in the UI.
    """
    try:
        loaded = load_records(RESULTS_PATH)
        frame, ordered_columns = build_dataframe(loaded)
    except ResultsValidationError as exc:
        # Fall back to an empty frame so the app still renders the error.
        schema_columns = list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
        return pd.DataFrame(columns=schema_columns), schema_columns, str(exc)
    return frame, ordered_columns, None
# Loaded once at import time; LOAD_ERROR is None unless validation failed.
LEADERBOARD_DF, COLUMN_ORDER, LOAD_ERROR = load_leaderboard_data()
# Human-readable dataset names (UI display).
DATASET_DISPLAY_NAMES = ["FreshRetailNet", "PSML", "Causal Chambers", "MIMIC"]
# Display name -> column-name prefix used in the results schema.
# Note "Causal Chambers" drops the space in column names.
DATASET_PREFIX_MAP = {
    "FreshRetailNet": "FreshRetailNet",
    "PSML": "PSML",
    "Causal Chambers": "CausalChambers",
    "MIMIC": "MIMIC",
}
# Prefixes (with trailing underscore) that identify per-dataset metric columns.
DATASET_PREFIXES = [f"{prefix}_" for prefix in DATASET_PREFIX_MAP.values()]
def is_dataset_metric(column: str) -> bool:
    """Return True when *column* is a per-dataset metric (starts with a known dataset prefix)."""
    # str.startswith accepts a tuple of candidate prefixes.
    return column.startswith(tuple(DATASET_PREFIXES))
# Columns every view shows: identity fields plus the required metrics.
BASE_COLUMNS = list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
# Every per-dataset metric column present in the loaded data, in load order.
ALL_DATASET_COLUMNS = [c for c in COLUMN_ORDER if is_dataset_metric(c)]
# Forecast summary columns for the main tab. MIMIC-prefixed columns appear
# here too — presumably MIMIC has no cross-dataset aggregate; TODO confirm.
AGGREGATE_FORECAST_COLUMNS = [
    "overall_mcq_acc",
    "T2_MAE",
    "T2_sMAPE",
    "T4_MAE",
    "T4_sMAPE",
    "MIMIC_T2_OW_sMAPE",
    "MIMIC_T2_OW_RMSSE",
    "MIMIC_T4_OW_sMAPE",
    "MIMIC_T4_OW_RMSSE",
]
# Keep only the aggregate columns that actually exist in the loaded data.
AGGREGATE_COLUMNS = BASE_COLUMNS + [
    c for c in AGGREGATE_FORECAST_COLUMNS if c in COLUMN_ORDER
]
# NOTE(review): DISPLAY_ALL_COLUMNS is not referenced elsewhere in this file.
DISPLAY_ALL_COLUMNS = BASE_COLUMNS + ALL_DATASET_COLUMNS
BY_DOMAIN_COLUMNS = BASE_COLUMNS + ALL_DATASET_COLUMNS
# Hard cap on columns rendered in the By Domain tab — presumably for
# rendering performance (see the debugging note in the layout below).
BY_DOMAIN_MAX_COLUMNS = 40
def column_types(column_order: list[str]) -> list[str]:
    """Map each column name to its Leaderboard datatype.

    Identity fields render as "str"; every other column is "number".
    """
    identity = SCHEMA.identity_fields
    return ["str" if col in identity else "number" for col in column_order]
def init_leaderboard(dataframe, column_order):
    """Build a read-only Leaderboard widget restricted to *column_order*.

    The dataframe is reindexed to exactly *column_order* (missing columns
    become empty). Schema-required columns cannot be deselected, and the
    model/agent name columns are searchable when present.
    """
    if dataframe is None or dataframe.empty:
        dataframe = pd.DataFrame(columns=column_order)
    dataframe = dataframe.reindex(columns=column_order)

    required = list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
    locked = [c for c in required if c in column_order]
    searchable = [c for c in ("model_name", "agent_name") if c in column_order]

    return Leaderboard(
        value=dataframe,
        datatype=column_types(column_order),
        select_columns=SelectColumns(
            default_selection=column_order,
            cant_deselect=locked,
            label="Select Columns to Display:",
        ),
        search_columns=searchable,
        filter_columns=[
            ColumnFilter("agent_type", type="checkboxgroup", label="Agent type"),
        ],
        interactive=False,
    )
def save_submission(uploaded_file) -> str:
    """Validate an uploaded results file and persist it for manual review.

    Parameters
    ----------
    uploaded_file:
        Gradio file object (exposes the temp path via ``.name``) or a plain
        path string; ``None`` when nothing was uploaded.

    Returns
    -------
    str
        Markdown status message for the UI: a prompt, a validation error,
        or the saved-path confirmation.
    """
    if uploaded_file is None:
        return "Please upload a results file."
    # Gradio file components expose the temp path via ``.name``.
    file_path = uploaded_file.name if hasattr(uploaded_file, "name") else str(uploaded_file)
    try:
        records = load_records(file_path)
        validate_records(records)
    except ResultsValidationError as exc:
        return f"**Validation error:** {exc}"
    os.makedirs(SUBMISSIONS_PATH, exist_ok=True)
    now = datetime.now(timezone.utc)
    # Stored timestamp keeps the original second-resolution format; the
    # filename adds microseconds so two submissions landing in the same
    # second do not overwrite each other.
    timestamp = now.strftime("%Y%m%dT%H%M%SZ")
    out_path = os.path.join(
        SUBMISSIONS_PATH, f"submission_{now.strftime('%Y%m%dT%H%M%S_%fZ')}.json"
    )
    payload = {
        "submitted_at": timestamp,
        "source_filename": os.path.basename(file_path),
        "records": records,
    }
    # Explicit UTF-8 + ensure_ascii=False: platform-independent output and
    # readable non-ASCII model/agent names in the saved JSON.
    with open(out_path, "w", encoding="utf-8") as fp:
        json.dump(payload, fp, indent=2, ensure_ascii=False)
    return f"Submission received for review. Saved to `{out_path}`."
def build_example_record_markdown() -> str:
    """Render the first leaderboard record as a fenced-JSON markdown snippet.

    Best-effort on purpose: any failure (missing file, bad JSON, ...) is
    reported as plain text instead of raised, since this only feeds the
    help copy on the Submit tab.
    """
    try:
        records = load_records(RESULTS_PATH)
        if not records:
            return "No example data available."
        body = json.dumps(records[0], indent=2)
        return f"Example record (JSON):\n```json\n{body}\n```"
    except Exception as exc:
        return f"Could not load example record: {exc}"
# Computed once at import; embedded into the Submit tab help copy below.
EXAMPLE_RECORD_MD = build_example_record_markdown()
demo = gr.Blocks(css=custom_css, analytics_enabled=False)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    # Surface the import-time data-validation failure (if any) above the tabs.
    if LOAD_ERROR:
        gr.Markdown(f"**Data validation error:** {LOAD_ERROR}", elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Main view: identity/required columns plus aggregate forecast metrics.
        with gr.TabItem("πŸ… Leaderboard", elem_id="tab-leaderboard", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF, AGGREGATE_COLUMNS)
        # Per-dataset breakdown, capped at BY_DOMAIN_MAX_COLUMNS columns.
        with gr.TabItem("🧭 By Domain", elem_id="tab-by-domain", id=1):
            by_domain_columns = BY_DOMAIN_COLUMNS[:BY_DOMAIN_MAX_COLUMNS]
            by_domain_df = LEADERBOARD_DF.reindex(columns=by_domain_columns)
            init_leaderboard(by_domain_df, by_domain_columns)
            # Temporarily disabled for performance debugging.
        # Manual-review submission flow: validate, then save under SUBMISSIONS_PATH.
        with gr.TabItem("πŸ“€ Submit Results", elem_id="tab-submit", id=2):
            gr.Markdown(
                (
                    "Upload submission files for manual review.\n\n"
                    "Required files:\n"
                    "1. `results_on_dev_dataset.json`: task-level metrics in leaderboard format.\n"
                    "2. `results_on_test_dataset.json`: per-example test outputs with at least "
                    "`id`, `tier`, `source_dataset`, `label`, and `output` "
                    "(required when the sample contains forecasting).\n\n"
                    "Please also include model architecture code and LLM/system details for verification."
                ),
                elem_classes="markdown-text",
            )
            gr.Markdown(EXAMPLE_RECORD_MD, elem_classes="markdown-text")
            # NOTE(review): the UI accepts .zip/.rar, but save_submission passes the
            # path straight to load_records — confirm it can read archives.
            submission_file = gr.File(
                label="Submission package (.zip or .rar)",
                file_types=[".zip", ".rar"],
            )
            submit_button = gr.Button("Submit for Review")
            submission_status = gr.Markdown()
            submit_button.click(save_submission, [submission_file], submission_status)
        with gr.TabItem("πŸ“ About", elem_id="tab-about", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
            gr.Markdown(f"## Citation\n{CITATION_BUTTON_LABEL}", elem_classes="markdown-text")
            gr.Markdown(f"```bibtex\n{CITATION_BUTTON_TEXT.strip()}\n```", elem_classes="markdown-text")
    # Citation section hidden for now.
    # with gr.Row():
    #     with gr.Accordion("πŸ“™ Citation", open=False):
    #         citation_button = gr.Textbox(
    #             value=CITATION_BUTTON_TEXT,
    #             label=CITATION_BUTTON_LABEL,
    #             lines=20,
    #             elem_id="citation-button",
    #             show_copy_button=True,
    #         )
demo.launch()