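"""Gradio app for the benchmark leaderboard.

Loads evaluation results from RESULTS_PATH, renders the leaderboard, by-domain,
submission, and about tabs, and stores uploaded submissions for manual review.
"""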
from datetime import datetime, timezone
import json
import os
from typing import Optional

import gradio as gr
import pandas as pd
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.envs import RESULTS_PATH, SUBMISSIONS_PATH
from src.leaderboard.load_results import (
    ResultsValidationError,
    build_dataframe,
    load_records,
    validate_records,
)
from src.leaderboard.schema import SCHEMA


def load_leaderboard_data() -> tuple[pd.DataFrame, list[str], Optional[str]]:
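    """Load results from RESULTS_PATH and build the leaderboard dataframe.

    Returns (dataframe, column_order, error_message), where error_message is
    None on success and holds the validation error text if loading fails.
    """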
    try:
        records = load_records(RESULTS_PATH)
        df, column_order = build_dataframe(records)
        return df, column_order, None
    except ResultsValidationError as exc:
        fallback_cols = list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
        df = pd.DataFrame(columns=fallback_cols)
        return df, fallback_cols, str(exc)


LEADERBOARD_DF, COLUMN_ORDER, LOAD_ERROR = load_leaderboard_data()
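
# Display names of the benchmark datasets and the column-name prefixes used
# for their per-dataset metric columns (e.g. "MIMIC_T2_OW_sMAPE").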
DATASET_DISPLAY_NAMES = ["FreshRetailNet", "PSML", "Causal Chambers", "MIMIC"]
DATASET_PREFIX_MAP = {
    "FreshRetailNet": "FreshRetailNet",
    "PSML": "PSML",
    "Causal Chambers": "CausalChambers",
    "MIMIC": "MIMIC",
}
DATASET_PREFIXES = [f"{prefix}_" for prefix in DATASET_PREFIX_MAP.values()]


def is_dataset_metric(column: str) -> bool:
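    """Return True when the column name starts with a known dataset prefix."""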
    return any(column.startswith(prefix) for prefix in DATASET_PREFIXES)


BASE_COLUMNS = list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
ALL_DATASET_COLUMNS = [c for c in COLUMN_ORDER if is_dataset_metric(c)]
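
# Metric columns surfaced on the main Leaderboard tab; only columns that are
# actually present in the loaded results are kept in AGGREGATE_COLUMNS.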
AGGREGATE_FORECAST_COLUMNS = [
    "overall_mcq_acc",
    "T2_MAE",
    "T2_sMAPE",
    "T4_MAE",
    "T4_sMAPE",
    "MIMIC_T2_OW_sMAPE",
    "MIMIC_T2_OW_RMSSE",
    "MIMIC_T4_OW_sMAPE",
    "MIMIC_T4_OW_RMSSE",
]
AGGREGATE_COLUMNS = BASE_COLUMNS + [
    c for c in AGGREGATE_FORECAST_COLUMNS if c in COLUMN_ORDER
]
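
# Expanded column sets including every per-dataset metric; the "By Domain"
# tab is truncated to the first BY_DOMAIN_MAX_COLUMNS columns.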
DISPLAY_ALL_COLUMNS = BASE_COLUMNS + ALL_DATASET_COLUMNS
BY_DOMAIN_COLUMNS = BASE_COLUMNS + ALL_DATASET_COLUMNS
BY_DOMAIN_MAX_COLUMNS = 40


def column_types(column_order: list[str]) -> list[str]:
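    """Map identity fields to "str" and every other column to "number"."""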
    types = []
    for col in column_order:
        if col in SCHEMA.identity_fields:
            types.append("str")
        else:
            types.append("number")
    return types


def init_leaderboard(dataframe, column_order):
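    """Build a Leaderboard component restricted to the given column order.

    Identity and required-metric columns cannot be deselected; rows can be
    searched by model/agent name and filtered by agent type.
    """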
    if dataframe is None or dataframe.empty:
        dataframe = pd.DataFrame(columns=column_order)
    dataframe = dataframe.reindex(columns=column_order)

    required_cols = list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
    cant_deselect = [c for c in required_cols if c in column_order]

    search_columns = [c for c in ["model_name", "agent_name"] if c in column_order]

    return Leaderboard(
        value=dataframe,
        datatype=column_types(column_order),
        select_columns=SelectColumns(
            default_selection=column_order,
            cant_deselect=cant_deselect,
            label="Select Columns to Display:",
        ),
        search_columns=search_columns,
        filter_columns=[
            ColumnFilter("agent_type", type="checkboxgroup", label="Agent type"),
        ],
        interactive=False,
    )


def save_submission(uploaded_file) -> str:
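    """Validate an uploaded results file and save it for manual review.

    Records are parsed and validated first; valid submissions are written to
    SUBMISSIONS_PATH as a timestamped JSON payload.
    """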
    if uploaded_file is None:
        return "Please upload a results file."

    file_path = uploaded_file.name if hasattr(uploaded_file, "name") else str(uploaded_file)

    try:
        records = load_records(file_path)
        validate_records(records)
    except ResultsValidationError as exc:
        return f"**Validation error:** {exc}"

    os.makedirs(SUBMISSIONS_PATH, exist_ok=True)
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out_path = os.path.join(SUBMISSIONS_PATH, f"submission_{timestamp}.json")
    payload = {
        "submitted_at": timestamp,
        "source_filename": os.path.basename(file_path),
        "records": records,
    }
    with open(out_path, "w") as fp:
        json.dump(payload, fp, indent=2)

    return f"Submission received for review. Saved to `{out_path}`."


def build_example_record_markdown() -> str:
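    """Render the first loaded record as a JSON example for the submit tab."""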
    try:
        records = load_records(RESULTS_PATH)
        if not records:
            return "No example data available."
        example = records[0]
        return "Example record (JSON):\n```json\n" + json.dumps(example, indent=2) + "\n```"
    except Exception as exc:
        return f"Could not load example record: {exc}"


EXAMPLE_RECORD_MD = build_example_record_markdown()
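

# Gradio UI: leaderboard tabs, a submission upload form, and an About page.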
demo = gr.Blocks(css=custom_css, analytics_enabled=False)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    if LOAD_ERROR:
        gr.Markdown(f"**Data validation error:** {LOAD_ERROR}", elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("Leaderboard", elem_id="tab-leaderboard", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF, AGGREGATE_COLUMNS)

        with gr.TabItem("By Domain", elem_id="tab-by-domain", id=1):
            by_domain_columns = BY_DOMAIN_COLUMNS[:BY_DOMAIN_MAX_COLUMNS]
            by_domain_df = LEADERBOARD_DF.reindex(columns=by_domain_columns)
            init_leaderboard(by_domain_df, by_domain_columns)

        with gr.TabItem("Submit Results", elem_id="tab-submit", id=2):
            gr.Markdown(
                (
                    "Upload submission files for manual review.\n\n"
                    "Required files:\n"
                    "1. `results_on_dev_dataset.json`: task-level metrics in leaderboard format.\n"
                    "2. `results_on_test_dataset.json`: per-example test outputs with at least "
                    "`id`, `tier`, `source_dataset`, `label`, and `output` "
                    "(`output` is required when the sample includes a forecasting task).\n\n"
                    "Please also include model architecture code and LLM/system details for verification."
                ),
                elem_classes="markdown-text",
            )
            gr.Markdown(EXAMPLE_RECORD_MD, elem_classes="markdown-text")
            submission_file = gr.File(
                label="Submission package (.zip or .rar)",
                file_types=[".zip", ".rar"],
            )
            submit_button = gr.Button("Submit for Review")
            submission_status = gr.Markdown()
            submit_button.click(save_submission, [submission_file], submission_status)

        with gr.TabItem("About", elem_id="tab-about", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
            gr.Markdown(f"## Citation\n{CITATION_BUTTON_LABEL}", elem_classes="markdown-text")
            gr.Markdown(f"```bibtex\n{CITATION_BUTTON_TEXT.strip()}\n```", elem_classes="markdown-text")


demo.launch()