# Gradio application for the benchmark leaderboard.
#
# Loads the validated results once at import time, derives the column sets
# shown in each tab, and builds a Blocks UI with leaderboard, per-domain,
# submission and about tabs.

from datetime import datetime, timezone
import json
import os
from typing import Optional

import gradio as gr
import pandas as pd
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.envs import RESULTS_PATH, SUBMISSIONS_PATH
from src.leaderboard.load_results import (
    ResultsValidationError,
    build_dataframe,
    load_records,
    validate_records,
)
from src.leaderboard.schema import SCHEMA


def load_leaderboard_data() -> tuple[pd.DataFrame, list[str], Optional[str]]:
    """Load the leaderboard table from ``RESULTS_PATH``.

    Returns:
        A ``(dataframe, column_order, error)`` triple. On success ``error`` is
        ``None``. If the results fail validation, an empty dataframe with the
        schema's identity and required-metric columns is returned together
        with the validation message, so the UI can still render.

    Only ``ResultsValidationError`` is handled here; any other exception from
    loading propagates to the caller.
    """
    try:
        records = load_records(RESULTS_PATH)
        df, column_order = build_dataframe(records)
        return df, column_order, None
    except ResultsValidationError as exc:
        fallback_cols = list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
        df = pd.DataFrame(columns=fallback_cols)
        return df, fallback_cols, str(exc)


LEADERBOARD_DF, COLUMN_ORDER, LOAD_ERROR = load_leaderboard_data()

DATASET_DISPLAY_NAMES = ["FreshRetailNet", "PSML", "Causal Chambers", "MIMIC"]

# Display name -> prefix used in per-dataset metric column names.
DATASET_PREFIX_MAP = {
    "FreshRetailNet": "FreshRetailNet",
    "PSML": "PSML",
    "Causal Chambers": "CausalChambers",
    "MIMIC": "MIMIC",
}
DATASET_PREFIXES = [f"{prefix}_" for prefix in DATASET_PREFIX_MAP.values()]


def is_dataset_metric(column: str) -> bool:
    """Return True if *column* starts with one of ``DATASET_PREFIXES``."""
    # str.startswith accepts a tuple of candidates, so one call covers all prefixes.
    return column.startswith(tuple(DATASET_PREFIXES))


BASE_COLUMNS = list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
ALL_DATASET_COLUMNS = [c for c in COLUMN_ORDER if is_dataset_metric(c)]

AGGREGATE_FORECAST_COLUMNS = [
    "overall_mcq_acc",
    "T2_MAE",
    "T2_sMAPE",
    "T4_MAE",
    "T4_sMAPE",
    "MIMIC_T2_OW_sMAPE",
    "MIMIC_T2_OW_RMSSE",
    "MIMIC_T4_OW_sMAPE",
    "MIMIC_T4_OW_RMSSE",
]
# Only aggregate columns that are actually present in the loaded data are shown.
AGGREGATE_COLUMNS = BASE_COLUMNS + [
    c for c in AGGREGATE_FORECAST_COLUMNS if c in COLUMN_ORDER
]

DISPLAY_ALL_COLUMNS = BASE_COLUMNS + ALL_DATASET_COLUMNS
BY_DOMAIN_COLUMNS = BASE_COLUMNS + ALL_DATASET_COLUMNS
# Upper bound on the number of columns rendered in the "By Domain" tab.
BY_DOMAIN_MAX_COLUMNS = 40


def column_types(column_order: list[str]) -> list[str]:
    """Map each column to a Leaderboard datatype.

    Schema identity fields are rendered as ``"str"``; every other column is
    treated as ``"number"``.
    """
    return [
        "str" if col in SCHEMA.identity_fields else "number"
        for col in column_order
    ]


def init_leaderboard(
    dataframe: Optional[pd.DataFrame], column_order: list[str]
) -> Leaderboard:
    """Build a read-only ``Leaderboard`` component for the given columns.

    Args:
        dataframe: Data to display. ``None`` or an empty frame is replaced by
            an empty frame that has the requested columns.
        column_order: Columns to show, in display order. The frame is
            reindexed to exactly these columns.

    Returns:
        The constructed ``Leaderboard`` component.
    """
    if dataframe is None or dataframe.empty:
        dataframe = pd.DataFrame(columns=column_order)
    dataframe = dataframe.reindex(columns=column_order)

    required_cols = list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
    cant_deselect = [c for c in required_cols if c in column_order]
    search_columns = [c for c in ["model_name", "agent_name"] if c in column_order]

    # Register the agent-type filter only when that column is displayed,
    # mirroring how search_columns is restricted to the visible columns.
    filter_columns = []
    if "agent_type" in column_order:
        filter_columns.append(
            ColumnFilter("agent_type", type="checkboxgroup", label="Agent type")
        )

    return Leaderboard(
        value=dataframe,
        datatype=column_types(column_order),
        select_columns=SelectColumns(
            default_selection=column_order,
            cant_deselect=cant_deselect,
            label="Select Columns to Display:",
        ),
        search_columns=search_columns,
        filter_columns=filter_columns,
        interactive=False,
    )


def _write_submission(payload: dict, timestamp: str) -> str:
    """Write *payload* as JSON to a new file in ``SUBMISSIONS_PATH``.

    The file is created exclusively so an existing submission is never
    overwritten; on a name collision (two submissions within the same second)
    a numeric suffix is appended. Returns the path that was written.
    """
    # Serialise first so a serialisation failure leaves no partial file behind.
    serialized = json.dumps(payload, indent=2)
    attempt = 0
    while True:
        suffix = f"_{attempt}" if attempt else ""
        out_path = os.path.join(
            SUBMISSIONS_PATH, f"submission_{timestamp}{suffix}.json"
        )
        try:
            with open(out_path, "x", encoding="utf-8") as fp:
                fp.write(serialized)
        except FileExistsError:
            attempt += 1
            continue
        return out_path


def save_submission(uploaded_file) -> str:
    """Validate an uploaded results file and store it for manual review.

    Args:
        uploaded_file: The value delivered by the upload component; either an
            object exposing a ``name`` path attribute or something convertible
            to a path string. ``None`` means nothing was uploaded.

    Returns:
        A Markdown status message describing success or the failure reason.
    """
    if uploaded_file is None:
        return "Please upload a results file."

    file_path = uploaded_file.name if hasattr(uploaded_file, "name") else str(uploaded_file)

    # NOTE(review): the upload widget only accepts .zip/.rar, yet the file is
    # parsed with load_records like the results file - confirm which format is
    # intended. Read/decoding failures are reported to the user instead of
    # escaping the callback as an unhandled exception.
    try:
        records = load_records(file_path)
        validate_records(records)
    except (ResultsValidationError, OSError, ValueError) as exc:
        return f"**Validation error:** {exc}"

    os.makedirs(SUBMISSIONS_PATH, exist_ok=True)
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    payload = {
        "submitted_at": timestamp,
        "source_filename": os.path.basename(file_path),
        "records": records,
    }
    out_path = _write_submission(payload, timestamp)
    return f"Submission received for review. Saved to `{out_path}`."


def build_example_record_markdown() -> str:
    """Return Markdown showing the first record of ``RESULTS_PATH`` as JSON.

    Falls back to an explanatory message when there are no records or when
    loading/serialising fails.
    """
    # Deliberately broad: this text is purely illustrative, so any failure is
    # turned into a message rather than preventing the app from starting.
    try:
        records = load_records(RESULTS_PATH)
        if not records:
            return "No example data available."
        example = records[0]
        return "Example record (JSON):\n```json\n" + json.dumps(example, indent=2) + "\n```"
    except Exception as exc:
        return f"Could not load example record: {exc}"


EXAMPLE_RECORD_MD = build_example_record_markdown()

demo = gr.Blocks(css=custom_css, analytics_enabled=False)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    # Surface a results validation failure at the top of the page.
    if LOAD_ERROR:
        gr.Markdown(f"**Data validation error:** {LOAD_ERROR}", elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="tab-leaderboard", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF, AGGREGATE_COLUMNS)

        with gr.TabItem("🧭 By Domain", elem_id="tab-by-domain", id=1):
            # Cap the number of per-dataset columns rendered in this tab.
            by_domain_columns = BY_DOMAIN_COLUMNS[:BY_DOMAIN_MAX_COLUMNS]
            by_domain_df = LEADERBOARD_DF.reindex(columns=by_domain_columns)
            init_leaderboard(by_domain_df, by_domain_columns)

        # Temporarily disabled for performance debugging.
        # NOTE(review): no disabled code follows the comment above - confirm
        # what it referred to, or remove it.

        with gr.TabItem("📤 Submit Results", elem_id="tab-submit", id=2):
            gr.Markdown(
                (
                    "Upload submission files for manual review.\n\n"
                    "Required files:\n"
                    "1. `results_on_dev_dataset.json`: task-level metrics in leaderboard format.\n"
                    "2. `results_on_test_dataset.json`: per-example test outputs with at least "
                    "`id`, `tier`, `source_dataset`, `label`, and `output` "
                    "(required when the sample contains forecasting).\n\n"
                    "Please also include model architecture code and LLM/system details for verification."
                ),
                elem_classes="markdown-text",
            )
            gr.Markdown(EXAMPLE_RECORD_MD, elem_classes="markdown-text")
            submission_file = gr.File(
                label="Submission package (.zip or .rar)",
                file_types=[".zip", ".rar"],
            )
            submit_button = gr.Button("Submit for Review")
            submission_status = gr.Markdown()
            submit_button.click(save_submission, [submission_file], submission_status)

        with gr.TabItem("📝 About", elem_id="tab-about", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
            gr.Markdown(f"## Citation\n{CITATION_BUTTON_LABEL}", elem_classes="markdown-text")
            gr.Markdown(f"```bibtex\n{CITATION_BUTTON_TEXT.strip()}\n```", elem_classes="markdown-text")

    # Citation section hidden for now.
    # with gr.Row():
    #     with gr.Accordion("📙 Citation", open=False):
    #         citation_button = gr.Textbox(
    #             value=CITATION_BUTTON_TEXT,
    #             label=CITATION_BUTTON_LABEL,
    #             lines=20,
    #             elem_id="citation-button",
    #             show_copy_button=True,
    #         )

demo.launch()