Ray0202 committed on
Commit ·
1dd52d9
1
Parent(s): 004530b
update leaderboard
Browse files- README.md +9 -2
- app.py +65 -61
- data/results.json +237 -31
- src/about.py +23 -5
- src/leaderboard/load_results.py +91 -0
- src/leaderboard/schema.py +39 -1
README.md
CHANGED
|
@@ -40,14 +40,21 @@ Required fields per record:
|
|
| 40 |
"T2_acc": 0.0,
|
| 41 |
"T3_acc": 0.0,
|
| 42 |
"T4_acc": 0.0,
|
|
|
|
| 43 |
"T2_MAE": 0.0,
|
| 44 |
"T4_sMAPE": 0.0,
|
| 45 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
}
|
| 47 |
```
|
| 48 |
|
| 49 |
Notes:
|
| 50 |
-
- `T2_MAE`
|
|
|
|
|
|
|
| 51 |
- Any additional numeric columns are treated as optional domain metrics and will be shown.
|
| 52 |
- Records must have a consistent schema and numeric metric values.
|
| 53 |
|
|
|
|
| 40 |
"T2_acc": 0.0,
|
| 41 |
"T3_acc": 0.0,
|
| 42 |
"T4_acc": 0.0,
|
| 43 |
+
"T2_sMAPE": 0.0,
|
| 44 |
"T2_MAE": 0.0,
|
| 45 |
"T4_sMAPE": 0.0,
|
| 46 |
+
"T4_MAE": 0.0,
|
| 47 |
+
"FreshRetailNet_T2_sMAPE": 0.0,
|
| 48 |
+
"FreshRetailNet_T2_MAE": 0.0,
|
| 49 |
+
"MIMIC_T2_OW_sMAPE": 0.0,
|
| 50 |
+
"MIMIC_T2_OW_RMSSE": 0.0
|
| 51 |
}
|
| 52 |
```
|
| 53 |
|
| 54 |
Notes:
|
| 55 |
+
- `T2_sMAPE`, `T2_MAE`, `T4_sMAPE`, `T4_MAE` are optional (forecasting metrics).
|
| 56 |
+
- Dataset-level columns are optional and displayed if present.
|
| 57 |
+
- For MIMIC forecasting, only `OW_sMAPE` and `OW_RMSSE` are expected.
|
| 58 |
- Any additional numeric columns are treated as optional domain metrics and will be shown.
|
| 59 |
- Records must have a consistent schema and numeric metric values.
|
| 60 |
|
app.py
CHANGED
|
@@ -37,16 +37,41 @@ def load_leaderboard_data() -> tuple[pd.DataFrame, list[str], Optional[str]]:
|
|
| 37 |
|
| 38 |
|
| 39 |
LEADERBOARD_DF, COLUMN_ORDER, LOAD_ERROR = load_leaderboard_data()
|
| 40 |
-
METRIC_COLUMNS = [c for c in COLUMN_ORDER if c not in SCHEMA.identity_fields]
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
def column_types(column_order: list[str]) -> list[str]:
|
|
@@ -62,10 +87,13 @@ def column_types(column_order: list[str]) -> list[str]:
|
|
| 62 |
def init_leaderboard(dataframe, column_order):
|
| 63 |
if dataframe is None or dataframe.empty:
|
| 64 |
dataframe = pd.DataFrame(columns=column_order)
|
|
|
|
| 65 |
|
| 66 |
required_cols = list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
|
| 67 |
cant_deselect = [c for c in required_cols if c in column_order]
|
| 68 |
|
|
|
|
|
|
|
| 69 |
return Leaderboard(
|
| 70 |
value=dataframe,
|
| 71 |
datatype=column_types(column_order),
|
|
@@ -74,7 +102,7 @@ def init_leaderboard(dataframe, column_order):
|
|
| 74 |
cant_deselect=cant_deselect,
|
| 75 |
label="Select Columns to Display:",
|
| 76 |
),
|
| 77 |
-
search_columns=
|
| 78 |
filter_columns=[
|
| 79 |
ColumnFilter("agent_type", type="checkboxgroup", label="Agent type"),
|
| 80 |
],
|
|
@@ -82,30 +110,7 @@ def init_leaderboard(dataframe, column_order):
|
|
| 82 |
)
|
| 83 |
|
| 84 |
|
| 85 |
-
|
| 86 |
-
if not entry_a or not entry_b:
|
| 87 |
-
return pd.DataFrame(columns=["metric", "entry_a", "entry_b", "delta"])
|
| 88 |
-
row_a = COMPARE_LOOKUP.get(entry_a)
|
| 89 |
-
row_b = COMPARE_LOOKUP.get(entry_b)
|
| 90 |
-
if row_a is None or row_b is None:
|
| 91 |
-
return pd.DataFrame(columns=["metric", "entry_a", "entry_b", "delta"])
|
| 92 |
-
|
| 93 |
-
rows = []
|
| 94 |
-
for metric in METRIC_COLUMNS:
|
| 95 |
-
value_a = row_a.get(metric)
|
| 96 |
-
value_b = row_b.get(metric)
|
| 97 |
-
delta = None
|
| 98 |
-
if value_a is not None and value_b is not None:
|
| 99 |
-
delta = value_b - value_a
|
| 100 |
-
rows.append(
|
| 101 |
-
{
|
| 102 |
-
"metric": metric,
|
| 103 |
-
"entry_a": value_a,
|
| 104 |
-
"entry_b": value_b,
|
| 105 |
-
"delta": delta,
|
| 106 |
-
}
|
| 107 |
-
)
|
| 108 |
-
return pd.DataFrame.from_records(rows)
|
| 109 |
|
| 110 |
|
| 111 |
def save_submission(uploaded_file) -> str:
|
|
@@ -134,6 +139,17 @@ def save_submission(uploaded_file) -> str:
|
|
| 134 |
return f"Submission received for review. Saved to `{out_path}`."
|
| 135 |
|
| 136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
demo = gr.Blocks(css=custom_css)
|
| 138 |
with demo:
|
| 139 |
gr.HTML(TITLE)
|
|
@@ -143,31 +159,18 @@ with demo:
|
|
| 143 |
|
| 144 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 145 |
with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
|
| 146 |
-
leaderboard = init_leaderboard(LEADERBOARD_DF,
|
| 147 |
|
| 148 |
-
with gr.TabItem("
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
elem_classes="markdown-text",
|
| 152 |
-
)
|
| 153 |
-
with gr.Row():
|
| 154 |
-
entry_a = gr.Dropdown(choices=COMPARE_OPTIONS, label="Entry A", value=None)
|
| 155 |
-
entry_b = gr.Dropdown(choices=COMPARE_OPTIONS, label="Entry B", value=None)
|
| 156 |
-
compare_table = gr.Dataframe(
|
| 157 |
-
value=pd.DataFrame(columns=["metric", "entry_a", "entry_b", "delta"]),
|
| 158 |
-
headers=["metric", "entry_a", "entry_b", "delta"],
|
| 159 |
-
datatype=["str", "number", "number", "number"],
|
| 160 |
-
interactive=False,
|
| 161 |
-
row_count=10,
|
| 162 |
-
)
|
| 163 |
-
entry_a.change(compare_entries, [entry_a, entry_b], compare_table)
|
| 164 |
-
entry_b.change(compare_entries, [entry_a, entry_b], compare_table)
|
| 165 |
|
| 166 |
with gr.TabItem("📤 Submit Results", elem_id="llm-benchmark-tab-table", id=2):
|
| 167 |
gr.Markdown(
|
| 168 |
"Upload a results file for manual review. Approved results will be merged into the main dataset.",
|
| 169 |
elem_classes="markdown-text",
|
| 170 |
)
|
|
|
|
| 171 |
submission_file = gr.File(label="Results file (.json or .csv)", file_types=[".json", ".csv"])
|
| 172 |
submit_button = gr.Button("Submit for Review")
|
| 173 |
submission_status = gr.Markdown()
|
|
@@ -176,14 +179,15 @@ with demo:
|
|
| 176 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
|
| 177 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 178 |
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
|
|
|
| 188 |
|
| 189 |
demo.queue(default_concurrency_limit=40).launch()
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
LEADERBOARD_DF, COLUMN_ORDER, LOAD_ERROR = load_leaderboard_data()
|
|
|
|
| 40 |
|
| 41 |
+
DATASET_DISPLAY_NAMES = ["FreshRetailNet", "PSML", "Causal Chambers", "MIMIC"]
|
| 42 |
+
DATASET_PREFIX_MAP = {
|
| 43 |
+
"FreshRetailNet": "FreshRetailNet",
|
| 44 |
+
"PSML": "PSML",
|
| 45 |
+
"Causal Chambers": "CausalChambers",
|
| 46 |
+
"MIMIC": "MIMIC",
|
| 47 |
+
}
|
| 48 |
+
DATASET_PREFIXES = [f"{prefix}_" for prefix in DATASET_PREFIX_MAP.values()]
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def is_dataset_metric(column: str) -> bool:
    """Tell whether ``column`` is a dataset-level metric column.

    A column qualifies when its name begins with any entry of the
    module-level ``DATASET_PREFIXES`` list.
    """
    # str.startswith accepts a tuple of candidates and returns False for an
    # empty tuple, matching any() over an empty prefix list.
    return column.startswith(tuple(DATASET_PREFIXES))
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
BASE_COLUMNS = list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
|
| 56 |
+
ALL_DATASET_COLUMNS = [c for c in COLUMN_ORDER if is_dataset_metric(c)]
|
| 57 |
+
|
| 58 |
+
AGGREGATE_FORECAST_COLUMNS = [
|
| 59 |
+
"overall_mcq_acc",
|
| 60 |
+
"T2_MAE",
|
| 61 |
+
"T2_sMAPE",
|
| 62 |
+
"T4_MAE",
|
| 63 |
+
"T4_sMAPE",
|
| 64 |
+
"MIMIC_T2_OW_sMAPE",
|
| 65 |
+
"MIMIC_T2_OW_RMSSE",
|
| 66 |
+
"MIMIC_T4_OW_sMAPE",
|
| 67 |
+
"MIMIC_T4_OW_RMSSE",
|
| 68 |
+
]
|
| 69 |
+
AGGREGATE_COLUMNS = BASE_COLUMNS + [
|
| 70 |
+
c for c in AGGREGATE_FORECAST_COLUMNS if c in COLUMN_ORDER
|
| 71 |
+
]
|
| 72 |
+
|
| 73 |
+
DISPLAY_ALL_COLUMNS = BASE_COLUMNS + ALL_DATASET_COLUMNS
|
| 74 |
+
BY_DOMAIN_COLUMNS = BASE_COLUMNS + ALL_DATASET_COLUMNS
|
| 75 |
|
| 76 |
|
| 77 |
def column_types(column_order: list[str]) -> list[str]:
|
|
|
|
| 87 |
def init_leaderboard(dataframe, column_order):
|
| 88 |
if dataframe is None or dataframe.empty:
|
| 89 |
dataframe = pd.DataFrame(columns=column_order)
|
| 90 |
+
dataframe = dataframe.reindex(columns=column_order)
|
| 91 |
|
| 92 |
required_cols = list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
|
| 93 |
cant_deselect = [c for c in required_cols if c in column_order]
|
| 94 |
|
| 95 |
+
search_columns = [c for c in ["model_name", "agent_name"] if c in column_order]
|
| 96 |
+
|
| 97 |
return Leaderboard(
|
| 98 |
value=dataframe,
|
| 99 |
datatype=column_types(column_order),
|
|
|
|
| 102 |
cant_deselect=cant_deselect,
|
| 103 |
label="Select Columns to Display:",
|
| 104 |
),
|
| 105 |
+
search_columns=search_columns,
|
| 106 |
filter_columns=[
|
| 107 |
ColumnFilter("agent_type", type="checkboxgroup", label="Agent type"),
|
| 108 |
],
|
|
|
|
| 110 |
)
|
| 111 |
|
| 112 |
|
| 113 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
|
| 116 |
def save_submission(uploaded_file) -> str:
|
|
|
|
| 139 |
return f"Submission received for review. Saved to `{out_path}`."
|
| 140 |
|
| 141 |
|
| 142 |
+
def example_record_markdown() -> str:
    """Build the Markdown snippet that shows one sample record.

    Loads the records found at ``RESULTS_PATH`` through ``load_records`` and
    renders the first one as indented JSON inside a fenced code block.

    Returns:
        The rendered Markdown, a placeholder message when no records were
        loaded, or an error message when loading or serialising raised.
    """
    try:
        loaded = load_records(RESULTS_PATH)
        if loaded:
            rendered = json.dumps(loaded[0], indent=2)
            return "Example record (JSON):\n```json\n" + rendered + "\n```"
        return "No example data available."
    except Exception as exc:
        # Best-effort helper for the UI: surface the failure as text rather
        # than letting it propagate to the caller.
        return f"Could not load example record: {exc}"
|
| 151 |
+
|
| 152 |
+
|
| 153 |
demo = gr.Blocks(css=custom_css)
|
| 154 |
with demo:
|
| 155 |
gr.HTML(TITLE)
|
|
|
|
| 159 |
|
| 160 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
| 161 |
with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
|
| 162 |
+
leaderboard = init_leaderboard(LEADERBOARD_DF, AGGREGATE_COLUMNS)
|
| 163 |
|
| 164 |
+
with gr.TabItem("🧭 By Domain", elem_id="llm-benchmark-tab-table", id=1):
|
| 165 |
+
by_domain_df = LEADERBOARD_DF.reindex(columns=BY_DOMAIN_COLUMNS)
|
| 166 |
+
init_leaderboard(by_domain_df, BY_DOMAIN_COLUMNS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
with gr.TabItem("📤 Submit Results", elem_id="llm-benchmark-tab-table", id=2):
|
| 169 |
gr.Markdown(
|
| 170 |
"Upload a results file for manual review. Approved results will be merged into the main dataset.",
|
| 171 |
elem_classes="markdown-text",
|
| 172 |
)
|
| 173 |
+
gr.Markdown(example_record_markdown(), elem_classes="markdown-text")
|
| 174 |
submission_file = gr.File(label="Results file (.json or .csv)", file_types=[".json", ".csv"])
|
| 175 |
submit_button = gr.Button("Submit for Review")
|
| 176 |
submission_status = gr.Markdown()
|
|
|
|
| 179 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
|
| 180 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 181 |
|
| 182 |
+
# Citation section hidden for now.
|
| 183 |
+
# with gr.Row():
|
| 184 |
+
# with gr.Accordion("📙 Citation", open=False):
|
| 185 |
+
# citation_button = gr.Textbox(
|
| 186 |
+
# value=CITATION_BUTTON_TEXT,
|
| 187 |
+
# label=CITATION_BUTTON_LABEL,
|
| 188 |
+
# lines=20,
|
| 189 |
+
# elem_id="citation-button",
|
| 190 |
+
# show_copy_button=True,
|
| 191 |
+
# )
|
| 192 |
|
| 193 |
demo.queue(default_concurrency_limit=40).launch()
|
data/results.json
CHANGED
|
@@ -1,41 +1,247 @@
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
-
"
|
| 4 |
-
"agent_name": "TemporalAgent-A",
|
| 5 |
"agent_type": "single-LLM",
|
| 6 |
-
"base_model": "
|
| 7 |
-
"T1_acc":
|
| 8 |
-
"T2_acc":
|
| 9 |
-
"T3_acc":
|
| 10 |
-
"T4_acc":
|
| 11 |
-
"
|
| 12 |
-
"
|
| 13 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
},
|
| 15 |
{
|
| 16 |
-
"
|
| 17 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
"agent_type": "general agent",
|
| 19 |
-
"base_model": "
|
| 20 |
-
"T1_acc":
|
| 21 |
-
"T2_acc":
|
| 22 |
-
"T3_acc":
|
| 23 |
-
"T4_acc":
|
| 24 |
-
"
|
| 25 |
-
"
|
| 26 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
},
|
| 28 |
{
|
| 29 |
-
"
|
| 30 |
-
"
|
| 31 |
-
"
|
| 32 |
-
"
|
| 33 |
-
"
|
| 34 |
-
"
|
| 35 |
-
"
|
| 36 |
-
"
|
| 37 |
-
"
|
| 38 |
-
"
|
| 39 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
}
|
| 41 |
]
|
|
|
|
| 1 |
[
|
| 2 |
{
|
| 3 |
+
"agent_name": "Single LLM",
|
|
|
|
| 4 |
"agent_type": "single-LLM",
|
| 5 |
+
"base_model": "gpt-4o",
|
| 6 |
+
"T1_acc": null,
|
| 7 |
+
"T2_acc": null,
|
| 8 |
+
"T3_acc": null,
|
| 9 |
+
"T4_acc": null,
|
| 10 |
+
"FreshRetailNet_T1_acc": 0.6364,
|
| 11 |
+
"FreshRetailNet_T2_acc": 0.5227,
|
| 12 |
+
"FreshRetailNet_T3_acc": 0.0289,
|
| 13 |
+
"FreshRetailNet_T4_acc": 0.1364,
|
| 14 |
+
"PSML_T1_acc": 0.675,
|
| 15 |
+
"PSML_T2_acc": 0.2067,
|
| 16 |
+
"PSML_T3_acc": 0.348,
|
| 17 |
+
"PSML_T4_acc": 0.36,
|
| 18 |
+
"CausalChambers_T1_acc": 0.1333,
|
| 19 |
+
"CausalChambers_T2_acc": 0.2733,
|
| 20 |
+
"CausalChambers_T3_acc": 0.352,
|
| 21 |
+
"CausalChambers_T4_acc": 0.26,
|
| 22 |
+
"MIMIC_T1_acc": 0.4681,
|
| 23 |
+
"MIMIC_T2_acc": 0.2128,
|
| 24 |
+
"MIMIC_T3_acc": 0.3661,
|
| 25 |
+
"MIMIC_T4_acc": 0.2979,
|
| 26 |
+
"T2_sMAPE": null,
|
| 27 |
+
"T2_MAE": null,
|
| 28 |
+
"T2_OW_sMAPE_MIMIC": null,
|
| 29 |
+
"T2_OW_RMSSE_MIMIC": null,
|
| 30 |
+
"T4_sMAPE": null,
|
| 31 |
+
"T4_MAE": null,
|
| 32 |
+
"T4_OW_sMAPE_MIMIC": null,
|
| 33 |
+
"T4_OW_RMSSE_MIMIC": null,
|
| 34 |
+
"FreshRetailNet_T2_MAE": 0.12,
|
| 35 |
+
"FreshRetailNet_T2_sMAPE": 1.27,
|
| 36 |
+
"FreshRetailNet_T4_MAE": 0.34,
|
| 37 |
+
"FreshRetailNet_T4_sMAPE": 1.29,
|
| 38 |
+
"PSML_T2_MAE": 0.61,
|
| 39 |
+
"PSML_T2_sMAPE": 0.6,
|
| 40 |
+
"PSML_T4_MAE": 0.44,
|
| 41 |
+
"PSML_T4_sMAPE": 0.37,
|
| 42 |
+
"CausalChambers_T2_MAE": 2.48,
|
| 43 |
+
"CausalChambers_T2_OW_RMSSE": 0.0000257,
|
| 44 |
+
"CausalChambers_T4_MAE": 2.58,
|
| 45 |
+
"CausalChambers_T4_OW_RMSSE": 0.0000269,
|
| 46 |
+
"MIMIC_T2_OW_sMAPE": 15.2,
|
| 47 |
+
"MIMIC_T2_OW_RMSSE": 0.55,
|
| 48 |
+
"MIMIC_T4_OW_sMAPE": 16.86,
|
| 49 |
+
"MIMIC_T4_OW_RMSSE": 0.63
|
| 50 |
},
|
| 51 |
{
|
| 52 |
+
"agent_name": "TimeSeries Scientist",
|
| 53 |
+
"agent_type": "time-series-specific agent",
|
| 54 |
+
"base_model": "gpt-4o",
|
| 55 |
+
"T1_acc": null,
|
| 56 |
+
"T2_acc": null,
|
| 57 |
+
"T3_acc": null,
|
| 58 |
+
"T4_acc": null,
|
| 59 |
+
"FreshRetailNet_T1_acc": 0.3352,
|
| 60 |
+
"FreshRetailNet_T2_acc": 0.5682,
|
| 61 |
+
"FreshRetailNet_T3_acc": 0.0341,
|
| 62 |
+
"FreshRetailNet_T4_acc": 0.5682,
|
| 63 |
+
"PSML_T1_acc": 0.28,
|
| 64 |
+
"PSML_T2_acc": 0.2667,
|
| 65 |
+
"PSML_T3_acc": 0.216,
|
| 66 |
+
"PSML_T4_acc": 0.2733,
|
| 67 |
+
"CausalChambers_T1_acc": 0.2867,
|
| 68 |
+
"CausalChambers_T2_acc": 0.0267,
|
| 69 |
+
"CausalChambers_T3_acc": 0.216,
|
| 70 |
+
"CausalChambers_T4_acc": 0.0267,
|
| 71 |
+
"MIMIC_T1_acc": 0.1011,
|
| 72 |
+
"MIMIC_T2_acc": 0.234,
|
| 73 |
+
"MIMIC_T3_acc": 0.2887,
|
| 74 |
+
"MIMIC_T4_acc": 0.234,
|
| 75 |
+
"T2_sMAPE": null,
|
| 76 |
+
"T2_MAE": null,
|
| 77 |
+
"T2_OW_sMAPE_MIMIC": null,
|
| 78 |
+
"T2_OW_RMSSE_MIMIC": null,
|
| 79 |
+
"T4_sMAPE": null,
|
| 80 |
+
"T4_MAE": null,
|
| 81 |
+
"T4_OW_sMAPE_MIMIC": null,
|
| 82 |
+
"T4_OW_RMSSE_MIMIC": null,
|
| 83 |
+
"FreshRetailNet_T2_MAE": 0.35,
|
| 84 |
+
"FreshRetailNet_T2_sMAPE": 1.27,
|
| 85 |
+
"FreshRetailNet_T4_MAE": 0.51,
|
| 86 |
+
"FreshRetailNet_T4_sMAPE": 1.4,
|
| 87 |
+
"PSML_T2_MAE": 1.53,
|
| 88 |
+
"PSML_T2_sMAPE": 0.65,
|
| 89 |
+
"PSML_T4_MAE": 0.84,
|
| 90 |
+
"PSML_T4_sMAPE": 0.48,
|
| 91 |
+
"CausalChambers_T2_MAE": 2.44,
|
| 92 |
+
"CausalChambers_T2_OW_RMSSE": 0.0000253,
|
| 93 |
+
"CausalChambers_T4_MAE": 2.94,
|
| 94 |
+
"CausalChambers_T4_OW_RMSSE": 0.0000306,
|
| 95 |
+
"MIMIC_T2_OW_sMAPE": 15.81,
|
| 96 |
+
"MIMIC_T2_OW_RMSSE": 0.52,
|
| 97 |
+
"MIMIC_T4_OW_sMAPE": 17.18,
|
| 98 |
+
"MIMIC_T4_OW_RMSSE": 0.64
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"agent_name": "AgentScope",
|
| 102 |
"agent_type": "general agent",
|
| 103 |
+
"base_model": "gpt-4o",
|
| 104 |
+
"T1_acc": null,
|
| 105 |
+
"T2_acc": null,
|
| 106 |
+
"T3_acc": null,
|
| 107 |
+
"T4_acc": null,
|
| 108 |
+
"FreshRetailNet_T1_acc": 0.625,
|
| 109 |
+
"FreshRetailNet_T2_acc": 0.1212,
|
| 110 |
+
"FreshRetailNet_T3_acc": 0.1364,
|
| 111 |
+
"FreshRetailNet_T4_acc": 0.1894,
|
| 112 |
+
"PSML_T1_acc": 0.66,
|
| 113 |
+
"PSML_T2_acc": 0.2467,
|
| 114 |
+
"PSML_T3_acc": 0.272,
|
| 115 |
+
"PSML_T4_acc": 0.3533,
|
| 116 |
+
"CausalChambers_T1_acc": 0.12,
|
| 117 |
+
"CausalChambers_T2_acc": 0.46,
|
| 118 |
+
"CausalChambers_T3_acc": 0.44,
|
| 119 |
+
"CausalChambers_T4_acc": 0.32,
|
| 120 |
+
"MIMIC_T1_acc": 0.4468,
|
| 121 |
+
"MIMIC_T2_acc": 0.2128,
|
| 122 |
+
"MIMIC_T3_acc": 0.2395,
|
| 123 |
+
"MIMIC_T4_acc": 0.227,
|
| 124 |
+
"T2_sMAPE": null,
|
| 125 |
+
"T2_MAE": null,
|
| 126 |
+
"T2_OW_sMAPE_MIMIC": null,
|
| 127 |
+
"T2_OW_RMSSE_MIMIC": null,
|
| 128 |
+
"T4_sMAPE": null,
|
| 129 |
+
"T4_MAE": null,
|
| 130 |
+
"T4_OW_sMAPE_MIMIC": null,
|
| 131 |
+
"T4_OW_RMSSE_MIMIC": null,
|
| 132 |
+
"FreshRetailNet_T2_MAE": 0.12,
|
| 133 |
+
"FreshRetailNet_T2_sMAPE": 126.27,
|
| 134 |
+
"FreshRetailNet_T4_MAE": 0.2,
|
| 135 |
+
"FreshRetailNet_T4_sMAPE": 130.86,
|
| 136 |
+
"PSML_T2_MAE": 0.28,
|
| 137 |
+
"PSML_T2_sMAPE": 37.38,
|
| 138 |
+
"PSML_T4_MAE": 0.35,
|
| 139 |
+
"PSML_T4_sMAPE": 30.51,
|
| 140 |
+
"CausalChambers_T2_MAE": 2.76,
|
| 141 |
+
"CausalChambers_T2_OW_RMSSE": 0.00262,
|
| 142 |
+
"CausalChambers_T4_MAE": 2.66,
|
| 143 |
+
"CausalChambers_T4_OW_RMSSE": 0.00246,
|
| 144 |
+
"MIMIC_T2_OW_sMAPE": 11.05,
|
| 145 |
+
"MIMIC_T2_OW_RMSSE": 0.43,
|
| 146 |
+
"MIMIC_T4_OW_sMAPE": 12.02,
|
| 147 |
+
"MIMIC_T4_OW_RMSSE": 0.49
|
| 148 |
},
|
| 149 |
{
|
| 150 |
+
"agent_name": "MetaGPT",
|
| 151 |
+
"agent_type": "general agent",
|
| 152 |
+
"base_model": "gpt-4o",
|
| 153 |
+
"T1_acc": null,
|
| 154 |
+
"T2_acc": null,
|
| 155 |
+
"T3_acc": null,
|
| 156 |
+
"T4_acc": null,
|
| 157 |
+
"FreshRetailNet_T1_acc": 0.625,
|
| 158 |
+
"FreshRetailNet_T2_acc": 0.0909,
|
| 159 |
+
"FreshRetailNet_T3_acc": 0.0511,
|
| 160 |
+
"FreshRetailNet_T4_acc": 0.1439,
|
| 161 |
+
"PSML_T1_acc": 0.675,
|
| 162 |
+
"PSML_T2_acc": 0.2109,
|
| 163 |
+
"PSML_T3_acc": 0.22,
|
| 164 |
+
"PSML_T4_acc": 0.3133,
|
| 165 |
+
"CausalChambers_T1_acc": 0.1067,
|
| 166 |
+
"CausalChambers_T2_acc": 0.5933,
|
| 167 |
+
"CausalChambers_T3_acc": 0.452,
|
| 168 |
+
"CausalChambers_T4_acc": 0.16,
|
| 169 |
+
"MIMIC_T1_acc": 0.4574,
|
| 170 |
+
"MIMIC_T2_acc": 0.1702,
|
| 171 |
+
"MIMIC_T3_acc": 0.2897,
|
| 172 |
+
"MIMIC_T4_acc": 0.2553,
|
| 173 |
+
"T2_sMAPE": null,
|
| 174 |
+
"T2_MAE": null,
|
| 175 |
+
"T2_OW_sMAPE_MIMIC": null,
|
| 176 |
+
"T2_OW_RMSSE_MIMIC": null,
|
| 177 |
+
"T4_sMAPE": null,
|
| 178 |
+
"T4_MAE": null,
|
| 179 |
+
"T4_OW_sMAPE_MIMIC": null,
|
| 180 |
+
"T4_OW_RMSSE_MIMIC": null,
|
| 181 |
+
"FreshRetailNet_T2_MAE": 0.13,
|
| 182 |
+
"FreshRetailNet_T2_sMAPE": 126.59,
|
| 183 |
+
"FreshRetailNet_T4_MAE": 0.24,
|
| 184 |
+
"FreshRetailNet_T4_sMAPE": 127.22,
|
| 185 |
+
"PSML_T2_MAE": 0.34,
|
| 186 |
+
"PSML_T2_sMAPE": 24.74,
|
| 187 |
+
"PSML_T4_MAE": 0.4,
|
| 188 |
+
"PSML_T4_sMAPE": 43.47,
|
| 189 |
+
"CausalChambers_T2_MAE": 2.62,
|
| 190 |
+
"CausalChambers_T2_OW_RMSSE": 0.00272,
|
| 191 |
+
"CausalChambers_T4_MAE": 2.76,
|
| 192 |
+
"CausalChambers_T4_OW_RMSSE": 0.00287,
|
| 193 |
+
"MIMIC_T2_OW_sMAPE": 14.11,
|
| 194 |
+
"MIMIC_T2_OW_RMSSE": 0.53,
|
| 195 |
+
"MIMIC_T4_OW_sMAPE": 15.4,
|
| 196 |
+
"MIMIC_T4_OW_RMSSE": 0.63
|
| 197 |
+
},
|
| 198 |
+
{
|
| 199 |
+
"agent_name": "CAMEL",
|
| 200 |
+
"agent_type": "general agent",
|
| 201 |
+
"base_model": "gpt-4o",
|
| 202 |
+
"T1_acc": null,
|
| 203 |
+
"T2_acc": null,
|
| 204 |
+
"T3_acc": null,
|
| 205 |
+
"T4_acc": null,
|
| 206 |
+
"FreshRetailNet_T1_acc": 0.642,
|
| 207 |
+
"FreshRetailNet_T2_acc": 0.0076,
|
| 208 |
+
"FreshRetailNet_T3_acc": 0.0625,
|
| 209 |
+
"FreshRetailNet_T4_acc": 0.3106,
|
| 210 |
+
"PSML_T1_acc": 0.685,
|
| 211 |
+
"PSML_T2_acc": 0.14,
|
| 212 |
+
"PSML_T3_acc": 0.184,
|
| 213 |
+
"PSML_T4_acc": 0.3067,
|
| 214 |
+
"CausalChambers_T1_acc": 0.1,
|
| 215 |
+
"CausalChambers_T2_acc": 0.66,
|
| 216 |
+
"CausalChambers_T3_acc": 0.42,
|
| 217 |
+
"CausalChambers_T4_acc": 0.2667,
|
| 218 |
+
"MIMIC_T1_acc": 0.4681,
|
| 219 |
+
"MIMIC_T2_acc": 0.2057,
|
| 220 |
+
"MIMIC_T3_acc": 0.3014,
|
| 221 |
+
"MIMIC_T4_acc": 0.234,
|
| 222 |
+
"T2_sMAPE": null,
|
| 223 |
+
"T2_MAE": null,
|
| 224 |
+
"T2_OW_sMAPE_MIMIC": null,
|
| 225 |
+
"T2_OW_RMSSE_MIMIC": null,
|
| 226 |
+
"T4_sMAPE": null,
|
| 227 |
+
"T4_MAE": null,
|
| 228 |
+
"T4_OW_sMAPE_MIMIC": null,
|
| 229 |
+
"T4_OW_RMSSE_MIMIC": null,
|
| 230 |
+
"FreshRetailNet_T2_MAE": 0.13,
|
| 231 |
+
"FreshRetailNet_T2_sMAPE": 126.75,
|
| 232 |
+
"FreshRetailNet_T4_MAE": 0.28,
|
| 233 |
+
"FreshRetailNet_T4_sMAPE": 128.18,
|
| 234 |
+
"PSML_T2_MAE": 0.43,
|
| 235 |
+
"PSML_T2_sMAPE": 34.89,
|
| 236 |
+
"PSML_T4_MAE": 0.45,
|
| 237 |
+
"PSML_T4_sMAPE": 35.78,
|
| 238 |
+
"CausalChambers_T2_MAE": 2.99,
|
| 239 |
+
"CausalChambers_T2_OW_RMSSE": 0.00311,
|
| 240 |
+
"CausalChambers_T4_MAE": 2.5,
|
| 241 |
+
"CausalChambers_T4_OW_RMSSE": 0.0026,
|
| 242 |
+
"MIMIC_T2_OW_sMAPE": 12.02,
|
| 243 |
+
"MIMIC_T2_OW_RMSSE": 0.55,
|
| 244 |
+
"MIMIC_T4_OW_sMAPE": 15.74,
|
| 245 |
+
"MIMIC_T4_OW_RMSSE": 0.59
|
| 246 |
}
|
| 247 |
]
|
src/about.py
CHANGED
|
@@ -10,21 +10,39 @@ LLM_BENCHMARKS_TEXT = """
|
|
| 10 |
## What this leaderboard shows
|
| 11 |
|
| 12 |
- One row per evaluated agent configuration
|
| 13 |
-
- Task-family metrics for TemporalBench (T1–T4)
|
| 14 |
-
-
|
|
|
|
| 15 |
|
| 16 |
## Data requirements
|
| 17 |
|
| 18 |
Results are loaded from a local JSON or CSV file. Each record must include:
|
| 19 |
|
| 20 |
-
- Identity fields: `
|
| 21 |
-
- Required metrics: `T1_acc`, `T2_acc`, `T3_acc`, `T4_acc`
|
| 22 |
-
- Optional metrics:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
## Submission workflow
|
| 25 |
|
| 26 |
Uploads are stored locally for manual review. Approved results should be merged into
|
| 27 |
the main results file to appear on the leaderboard.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
"""
|
| 29 |
|
| 30 |
EVALUATION_QUEUE_TEXT = ""
|
|
|
|
| 10 |
## What this leaderboard shows
|
| 11 |
|
| 12 |
- One row per evaluated agent configuration
|
| 13 |
+
- Task-family MCQ metrics for TemporalBench (T1–T4)
|
| 14 |
+
- Forecasting metrics for T2/T4 (sMAPE, MAE) and MIMIC OW metrics when provided
|
| 15 |
+
- Dataset-level results for: FreshRetailNet, PSML, Causal Chambers, MIMIC
|
| 16 |
|
| 17 |
## Data requirements
|
| 18 |
|
| 19 |
Results are loaded from a local JSON or CSV file. Each record must include:
|
| 20 |
|
| 21 |
+
- Identity fields: `agent_name`, `agent_type`, `base_model`
|
| 22 |
+
- Required metrics: `T1_acc`, `T2_acc`, `T3_acc`, `T4_acc` (computed overall)
|
| 23 |
+
- Optional metrics:
|
| 24 |
+
- Overall forecasting: `T2_sMAPE`, `T2_MAE`, `T4_sMAPE`, `T4_MAE`
|
| 25 |
+
- MIMIC overall OW: `MIMIC_T2_OW_sMAPE`, `MIMIC_T2_OW_RMSSE`, `MIMIC_T4_OW_sMAPE`, `MIMIC_T4_OW_RMSSE`
|
| 26 |
+
- Dataset-level metrics: `<Dataset>_T{1..4}_acc` and forecasting metrics per dataset
|
| 27 |
+
|
| 28 |
+
## Overall computation
|
| 29 |
+
|
| 30 |
+
Overall T1–T4 accuracy and T2/T4 forecasting metrics are computed as weighted averages
|
| 31 |
+
from dataset-level results using question/series counts. Missing values are ignored.
|
| 32 |
|
| 33 |
## Submission workflow
|
| 34 |
|
| 35 |
Uploads are stored locally for manual review. Approved results should be merged into
|
| 36 |
the main results file to appear on the leaderboard.
|
| 37 |
+
|
| 38 |
+
## Data access
|
| 39 |
+
|
| 40 |
+
The dataset is available at:
|
| 41 |
+
```
|
| 42 |
+
https://huggingface.co/datasets/Melady/TemporalBench
|
| 43 |
+
```
|
| 44 |
+
It includes all test tasks and a `forecast_metrics_utils.py` file that documents the
|
| 45 |
+
standard metric computation utilities.
|
| 46 |
"""
|
| 47 |
|
| 48 |
EVALUATION_QUEUE_TEXT = ""
|
src/leaderboard/load_results.py
CHANGED
|
@@ -20,6 +20,14 @@ def _is_number(value) -> bool:
|
|
| 20 |
return math.isfinite(float(value))
|
| 21 |
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
def _load_json_records(path: str) -> list[dict]:
|
| 24 |
with open(path, "r") as fp:
|
| 25 |
data = json.load(fp)
|
|
@@ -80,6 +88,8 @@ def validate_records(records: Iterable[dict]) -> None:
|
|
| 80 |
for key, value in record.items():
|
| 81 |
if key in SCHEMA.identity_fields:
|
| 82 |
continue
|
|
|
|
|
|
|
| 83 |
if not _is_number(value):
|
| 84 |
raise ResultsValidationError(
|
| 85 |
f"Record {idx} metric '{key}' must be numeric."
|
|
@@ -114,5 +124,86 @@ def build_dataframe(records: list[dict]) -> tuple[pd.DataFrame, list[str]]:
|
|
| 114 |
metric_cols = infer_metric_columns(records)
|
| 115 |
column_order = list(SCHEMA.identity_fields) + metric_cols
|
| 116 |
df = pd.DataFrame.from_records(records)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
df = df[column_order]
|
| 118 |
return df, column_order
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
return math.isfinite(float(value))
|
| 21 |
|
| 22 |
|
| 23 |
+
def _is_missing(value) -> bool:
    """Return True when ``value`` represents an absent metric.

    ``None`` and NaN count as missing. NaN is detected for any real numeric
    type that ``math.isnan`` can convert (builtin ``float``, numpy scalar
    floats, ``decimal.Decimal``), not only for builtin ``float`` instances.

    Args:
        value: Any object taken from a record or a DataFrame cell.

    Returns:
        True for ``None`` or NaN; False for every other value, including
        strings and other non-numeric objects.
    """
    if value is None:
        return True
    try:
        return math.isnan(value)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric objects (TypeError), values that cannot be converted
        # to float (ValueError) and integers too large for a float
        # (OverflowError) are not NaN, hence not missing.
        return False
|
| 29 |
+
|
| 30 |
+
|
| 31 |
def _load_json_records(path: str) -> list[dict]:
|
| 32 |
with open(path, "r") as fp:
|
| 33 |
data = json.load(fp)
|
|
|
|
| 88 |
for key, value in record.items():
|
| 89 |
if key in SCHEMA.identity_fields:
|
| 90 |
continue
|
| 91 |
+
if _is_missing(value):
|
| 92 |
+
continue
|
| 93 |
if not _is_number(value):
|
| 94 |
raise ResultsValidationError(
|
| 95 |
f"Record {idx} metric '{key}' must be numeric."
|
|
|
|
| 124 |
metric_cols = infer_metric_columns(records)
|
| 125 |
column_order = list(SCHEMA.identity_fields) + metric_cols
|
| 126 |
df = pd.DataFrame.from_records(records)
|
| 127 |
+
df = apply_overall_metrics(df)
|
| 128 |
+
# Include computed columns (e.g., overall_mcq_acc) in display order if present.
|
| 129 |
+
for col in df.columns:
|
| 130 |
+
if col in SCHEMA.identity_fields:
|
| 131 |
+
continue
|
| 132 |
+
if col not in column_order:
|
| 133 |
+
column_order.append(col)
|
| 134 |
df = df[column_order]
|
| 135 |
return df, column_order
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
MCQ_QUESTIONS = {
|
| 139 |
+
"MIMIC": {"T1": 188, "T2": 141, "T3": 239, "T4": 141},
|
| 140 |
+
"PSML": {"T1": 200, "T2": 150, "T3": 250, "T4": 150},
|
| 141 |
+
"CausalChambers": {"T1": 150, "T2": 150, "T3": 250, "T4": 150},
|
| 142 |
+
"FreshRetailNet": {"T1": 176, "T2": 132, "T3": 176, "T4": 132},
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
FORECAST_SERIES = {
|
| 146 |
+
"MIMIC": {"T2": 282, "T4": 282},
|
| 147 |
+
"PSML": {"T2": 50, "T4": 50},
|
| 148 |
+
"CausalChambers": {"T2": 50, "T4": 50},
|
| 149 |
+
"FreshRetailNet": {"T2": 44, "T4": 44},
|
| 150 |
+
}
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def _weighted_avg(row: pd.Series, columns: list[str], weights: list[int]) -> float | None:
    """Compute the weighted mean of selected cells of ``row``.

    Args:
        row: Row whose cells are read with ``row.get``.
        columns: Names of the cells to average.
        weights: Weight for each entry of ``columns``, paired positionally.

    Returns:
        The weighted mean rounded to 4 decimals, or ``None`` when the
        accumulated weight is zero (for example when every cell is missing).
    """
    numerator = 0.0
    denominator = 0.0
    for name, weight in zip(columns, weights):
        cell = row.get(name)
        # Cells flagged by _is_missing contribute neither value nor weight.
        if not _is_missing(cell):
            numerator += float(cell) * weight
            denominator += weight
    return None if denominator == 0 else round(numerator / denominator, 4)
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def apply_overall_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with overall metric columns filled in.

    Each overall column is a row-wise weighted average (``_weighted_avg``)
    of the dataset-level columns that exist in the frame:

    * ``T1_acc`` .. ``T4_acc`` from ``<dataset>_<task>_acc``, weighted by
      ``MCQ_QUESTIONS``.
    * ``overall_mcq_acc`` from every ``<dataset>_<task>_acc`` column,
      weighted by ``MCQ_QUESTIONS``.
    * ``T2``/``T4`` ``sMAPE`` and ``MAE`` from ``<dataset>_<task>_<metric>``,
      weighted by ``FORECAST_SERIES`` and skipping the ``MIMIC`` dataset.

    A target column is written only when at least one of its source columns
    exists; an existing column of the same name is overwritten. The input
    frame is not modified.
    """
    out = df.copy()

    def _collect(table, task, suffix, skip=()):
        # Pair each existing "<dataset>_<task>_<suffix>" column with its weight.
        pairs = []
        for dataset, counts in table.items():
            if dataset in skip:
                continue
            name = f"{dataset}_{task}_{suffix}"
            if name in out.columns:
                pairs.append((name, counts[task]))
        return pairs

    def _store(target, pairs):
        # Nothing to aggregate: leave the target column untouched.
        if not pairs:
            return
        names = [name for name, _ in pairs]
        sizes = [size for _, size in pairs]
        out[target] = out.apply(lambda rec: _weighted_avg(rec, names, sizes), axis=1)

    # Per-task MCQ accuracy across datasets.
    for task in ("T1", "T2", "T3", "T4"):
        _store(f"{task}_acc", _collect(MCQ_QUESTIONS, task, "acc"))

    # Single MCQ accuracy over every dataset/task combination.
    every_pair = []
    for dataset, counts in MCQ_QUESTIONS.items():
        for task, size in counts.items():
            name = f"{dataset}_{task}_acc"
            if name in out.columns:
                every_pair.append((name, size))
    _store("overall_mcq_acc", every_pair)

    # sMAPE/MAE are defined for non-MIMIC datasets
    for task in ("T2", "T4"):
        for metric in ("sMAPE", "MAE"):
            _store(
                f"{task}_{metric}",
                _collect(FORECAST_SERIES, task, metric, skip=("MIMIC",)),
            )

    return out
|
src/leaderboard/schema.py
CHANGED
|
@@ -6,7 +6,6 @@ from dataclasses import dataclass
|
|
| 6 |
@dataclass(frozen=True)
|
| 7 |
class TemporalBenchSchema:
|
| 8 |
identity_fields: tuple[str, ...] = (
|
| 9 |
-
"model_name",
|
| 10 |
"agent_name",
|
| 11 |
"agent_type",
|
| 12 |
"base_model",
|
|
@@ -18,8 +17,47 @@ class TemporalBenchSchema:
|
|
| 18 |
"T4_acc",
|
| 19 |
)
|
| 20 |
optional_metrics: tuple[str, ...] = (
|
|
|
|
|
|
|
| 21 |
"T2_MAE",
|
|
|
|
|
|
|
| 22 |
"T4_sMAPE",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
)
|
| 24 |
|
| 25 |
|
|
|
|
| 6 |
@dataclass(frozen=True)
|
| 7 |
class TemporalBenchSchema:
|
| 8 |
identity_fields: tuple[str, ...] = (
|
|
|
|
| 9 |
"agent_name",
|
| 10 |
"agent_type",
|
| 11 |
"base_model",
|
|
|
|
| 17 |
"T4_acc",
|
| 18 |
)
|
| 19 |
optional_metrics: tuple[str, ...] = (
|
| 20 |
+
"overall_mcq_acc",
|
| 21 |
+
"T2_sMAPE",
|
| 22 |
"T2_MAE",
|
| 23 |
+
"T2_OW_sMAPE_MIMIC",
|
| 24 |
+
"T2_OW_RMSSE_MIMIC",
|
| 25 |
"T4_sMAPE",
|
| 26 |
+
"T4_MAE",
|
| 27 |
+
"T4_OW_sMAPE_MIMIC",
|
| 28 |
+
"T4_OW_RMSSE_MIMIC",
|
| 29 |
+
"FreshRetailNet_T1_acc",
|
| 30 |
+
"FreshRetailNet_T2_acc",
|
| 31 |
+
"FreshRetailNet_T3_acc",
|
| 32 |
+
"FreshRetailNet_T4_acc",
|
| 33 |
+
"PSML_T1_acc",
|
| 34 |
+
"PSML_T2_acc",
|
| 35 |
+
"PSML_T3_acc",
|
| 36 |
+
"PSML_T4_acc",
|
| 37 |
+
"CausalChambers_T1_acc",
|
| 38 |
+
"CausalChambers_T2_acc",
|
| 39 |
+
"CausalChambers_T3_acc",
|
| 40 |
+
"CausalChambers_T4_acc",
|
| 41 |
+
"MIMIC_T1_acc",
|
| 42 |
+
"MIMIC_T2_acc",
|
| 43 |
+
"MIMIC_T3_acc",
|
| 44 |
+
"MIMIC_T4_acc",
|
| 45 |
+
"FreshRetailNet_T2_sMAPE",
|
| 46 |
+
"FreshRetailNet_T2_MAE",
|
| 47 |
+
"PSML_T2_sMAPE",
|
| 48 |
+
"PSML_T2_MAE",
|
| 49 |
+
"CausalChambers_T2_sMAPE",
|
| 50 |
+
"CausalChambers_T2_MAE",
|
| 51 |
+
"MIMIC_T2_OW_sMAPE",
|
| 52 |
+
"MIMIC_T2_OW_RMSSE",
|
| 53 |
+
"FreshRetailNet_T4_sMAPE",
|
| 54 |
+
"FreshRetailNet_T4_MAE",
|
| 55 |
+
"PSML_T4_sMAPE",
|
| 56 |
+
"PSML_T4_MAE",
|
| 57 |
+
"CausalChambers_T4_sMAPE",
|
| 58 |
+
"CausalChambers_T4_MAE",
|
| 59 |
+
"MIMIC_T4_OW_sMAPE",
|
| 60 |
+
"MIMIC_T4_OW_RMSSE",
|
| 61 |
)
|
| 62 |
|
| 63 |
|