Ray0202 committed on
Commit
1dd52d9
·
1 Parent(s): 004530b

update leaderboard

Browse files
README.md CHANGED
@@ -40,14 +40,21 @@ Required fields per record:
40
  "T2_acc": 0.0,
41
  "T3_acc": 0.0,
42
  "T4_acc": 0.0,
 
43
  "T2_MAE": 0.0,
44
  "T4_sMAPE": 0.0,
45
- "Retail_T3_acc": 0.0
 
 
 
 
46
  }
47
  ```
48
 
49
  Notes:
50
- - `T2_MAE` and `T4_sMAPE` are optional.
 
 
51
  - Any additional numeric columns are treated as optional domain metrics and will be shown.
52
  - Records must have a consistent schema and numeric metric values.
53
 
 
40
  "T2_acc": 0.0,
41
  "T3_acc": 0.0,
42
  "T4_acc": 0.0,
43
+ "T2_sMAPE": 0.0,
44
  "T2_MAE": 0.0,
45
  "T4_sMAPE": 0.0,
46
+ "T4_MAE": 0.0,
47
+ "FreshRetailNet_T2_sMAPE": 0.0,
48
+ "FreshRetailNet_T2_MAE": 0.0,
49
+ "MIMIC_T2_OW_sMAPE": 0.0,
50
+ "MIMIC_T2_OW_RMSSE": 0.0
51
  }
52
  ```
53
 
54
  Notes:
55
+ - `T2_sMAPE`, `T2_MAE`, `T4_sMAPE`, `T4_MAE` are optional (forecasting metrics).
56
+ - Dataset-level columns are optional and displayed if present.
57
+ - For MIMIC forecasting, only `OW_sMAPE` and `OW_RMSSE` are expected.
58
  - Any additional numeric columns are treated as optional domain metrics and will be shown.
59
  - Records must have a consistent schema and numeric metric values.
60
 
app.py CHANGED
@@ -37,16 +37,41 @@ def load_leaderboard_data() -> tuple[pd.DataFrame, list[str], Optional[str]]:
37
 
38
 
39
  LEADERBOARD_DF, COLUMN_ORDER, LOAD_ERROR = load_leaderboard_data()
40
- METRIC_COLUMNS = [c for c in COLUMN_ORDER if c not in SCHEMA.identity_fields]
41
 
42
- COMPARE_OPTIONS = []
43
- COMPARE_LOOKUP = {}
44
- for idx, row in LEADERBOARD_DF.iterrows():
45
- label = (
46
- f"{row['agent_name']} | {row['model_name']} | {row['agent_type']} | {row['base_model']} ({idx})"
47
- )
48
- COMPARE_OPTIONS.append(label)
49
- COMPARE_LOOKUP[label] = row.to_dict()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
 
52
  def column_types(column_order: list[str]) -> list[str]:
@@ -62,10 +87,13 @@ def column_types(column_order: list[str]) -> list[str]:
62
  def init_leaderboard(dataframe, column_order):
63
  if dataframe is None or dataframe.empty:
64
  dataframe = pd.DataFrame(columns=column_order)
 
65
 
66
  required_cols = list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
67
  cant_deselect = [c for c in required_cols if c in column_order]
68
 
 
 
69
  return Leaderboard(
70
  value=dataframe,
71
  datatype=column_types(column_order),
@@ -74,7 +102,7 @@ def init_leaderboard(dataframe, column_order):
74
  cant_deselect=cant_deselect,
75
  label="Select Columns to Display:",
76
  ),
77
- search_columns=["model_name", "agent_name"],
78
  filter_columns=[
79
  ColumnFilter("agent_type", type="checkboxgroup", label="Agent type"),
80
  ],
@@ -82,30 +110,7 @@ def init_leaderboard(dataframe, column_order):
82
  )
83
 
84
 
85
- def compare_entries(entry_a: str, entry_b: str) -> pd.DataFrame:
86
- if not entry_a or not entry_b:
87
- return pd.DataFrame(columns=["metric", "entry_a", "entry_b", "delta"])
88
- row_a = COMPARE_LOOKUP.get(entry_a)
89
- row_b = COMPARE_LOOKUP.get(entry_b)
90
- if row_a is None or row_b is None:
91
- return pd.DataFrame(columns=["metric", "entry_a", "entry_b", "delta"])
92
-
93
- rows = []
94
- for metric in METRIC_COLUMNS:
95
- value_a = row_a.get(metric)
96
- value_b = row_b.get(metric)
97
- delta = None
98
- if value_a is not None and value_b is not None:
99
- delta = value_b - value_a
100
- rows.append(
101
- {
102
- "metric": metric,
103
- "entry_a": value_a,
104
- "entry_b": value_b,
105
- "delta": delta,
106
- }
107
- )
108
- return pd.DataFrame.from_records(rows)
109
 
110
 
111
  def save_submission(uploaded_file) -> str:
@@ -134,6 +139,17 @@ def save_submission(uploaded_file) -> str:
134
  return f"Submission received for review. Saved to `{out_path}`."
135
 
136
 
 
 
 
 
 
 
 
 
 
 
 
137
  demo = gr.Blocks(css=custom_css)
138
  with demo:
139
  gr.HTML(TITLE)
@@ -143,31 +159,18 @@ with demo:
143
 
144
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
145
  with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
146
- leaderboard = init_leaderboard(LEADERBOARD_DF, COLUMN_ORDER)
147
 
148
- with gr.TabItem("🔍 Compare", elem_id="llm-benchmark-tab-table", id=1):
149
- gr.Markdown(
150
- "Select two evaluated entries to compare their metrics side by side.",
151
- elem_classes="markdown-text",
152
- )
153
- with gr.Row():
154
- entry_a = gr.Dropdown(choices=COMPARE_OPTIONS, label="Entry A", value=None)
155
- entry_b = gr.Dropdown(choices=COMPARE_OPTIONS, label="Entry B", value=None)
156
- compare_table = gr.Dataframe(
157
- value=pd.DataFrame(columns=["metric", "entry_a", "entry_b", "delta"]),
158
- headers=["metric", "entry_a", "entry_b", "delta"],
159
- datatype=["str", "number", "number", "number"],
160
- interactive=False,
161
- row_count=10,
162
- )
163
- entry_a.change(compare_entries, [entry_a, entry_b], compare_table)
164
- entry_b.change(compare_entries, [entry_a, entry_b], compare_table)
165
 
166
  with gr.TabItem("📤 Submit Results", elem_id="llm-benchmark-tab-table", id=2):
167
  gr.Markdown(
168
  "Upload a results file for manual review. Approved results will be merged into the main dataset.",
169
  elem_classes="markdown-text",
170
  )
 
171
  submission_file = gr.File(label="Results file (.json or .csv)", file_types=[".json", ".csv"])
172
  submit_button = gr.Button("Submit for Review")
173
  submission_status = gr.Markdown()
@@ -176,14 +179,15 @@ with demo:
176
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
177
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
178
 
179
- with gr.Row():
180
- with gr.Accordion("📙 Citation", open=False):
181
- citation_button = gr.Textbox(
182
- value=CITATION_BUTTON_TEXT,
183
- label=CITATION_BUTTON_LABEL,
184
- lines=20,
185
- elem_id="citation-button",
186
- show_copy_button=True,
187
- )
 
188
 
189
  demo.queue(default_concurrency_limit=40).launch()
 
37
 
38
 
39
  LEADERBOARD_DF, COLUMN_ORDER, LOAD_ERROR = load_leaderboard_data()
 
40
 
41
+ DATASET_DISPLAY_NAMES = ["FreshRetailNet", "PSML", "Causal Chambers", "MIMIC"]
42
+ DATASET_PREFIX_MAP = {
43
+ "FreshRetailNet": "FreshRetailNet",
44
+ "PSML": "PSML",
45
+ "Causal Chambers": "CausalChambers",
46
+ "MIMIC": "MIMIC",
47
+ }
48
+ DATASET_PREFIXES = [f"{prefix}_" for prefix in DATASET_PREFIX_MAP.values()]
49
+
50
+
51
+ def is_dataset_metric(column: str) -> bool:
52
+ return any(column.startswith(prefix) for prefix in DATASET_PREFIXES)
53
+
54
+
55
+ BASE_COLUMNS = list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
56
+ ALL_DATASET_COLUMNS = [c for c in COLUMN_ORDER if is_dataset_metric(c)]
57
+
58
+ AGGREGATE_FORECAST_COLUMNS = [
59
+ "overall_mcq_acc",
60
+ "T2_MAE",
61
+ "T2_sMAPE",
62
+ "T4_MAE",
63
+ "T4_sMAPE",
64
+ "MIMIC_T2_OW_sMAPE",
65
+ "MIMIC_T2_OW_RMSSE",
66
+ "MIMIC_T4_OW_sMAPE",
67
+ "MIMIC_T4_OW_RMSSE",
68
+ ]
69
+ AGGREGATE_COLUMNS = BASE_COLUMNS + [
70
+ c for c in AGGREGATE_FORECAST_COLUMNS if c in COLUMN_ORDER
71
+ ]
72
+
73
+ DISPLAY_ALL_COLUMNS = BASE_COLUMNS + ALL_DATASET_COLUMNS
74
+ BY_DOMAIN_COLUMNS = BASE_COLUMNS + ALL_DATASET_COLUMNS
75
 
76
 
77
  def column_types(column_order: list[str]) -> list[str]:
 
87
  def init_leaderboard(dataframe, column_order):
88
  if dataframe is None or dataframe.empty:
89
  dataframe = pd.DataFrame(columns=column_order)
90
+ dataframe = dataframe.reindex(columns=column_order)
91
 
92
  required_cols = list(SCHEMA.identity_fields) + list(SCHEMA.required_metrics)
93
  cant_deselect = [c for c in required_cols if c in column_order]
94
 
95
+ search_columns = [c for c in ["model_name", "agent_name"] if c in column_order]
96
+
97
  return Leaderboard(
98
  value=dataframe,
99
  datatype=column_types(column_order),
 
102
  cant_deselect=cant_deselect,
103
  label="Select Columns to Display:",
104
  ),
105
+ search_columns=search_columns,
106
  filter_columns=[
107
  ColumnFilter("agent_type", type="checkboxgroup", label="Agent type"),
108
  ],
 
110
  )
111
 
112
 
113
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
 
116
  def save_submission(uploaded_file) -> str:
 
139
  return f"Submission received for review. Saved to `{out_path}`."
140
 
141
 
142
+ def example_record_markdown() -> str:
143
+ try:
144
+ records = load_records(RESULTS_PATH)
145
+ if not records:
146
+ return "No example data available."
147
+ example = records[0]
148
+ return "Example record (JSON):\n```json\n" + json.dumps(example, indent=2) + "\n```"
149
+ except Exception as exc:
150
+ return f"Could not load example record: {exc}"
151
+
152
+
153
  demo = gr.Blocks(css=custom_css)
154
  with demo:
155
  gr.HTML(TITLE)
 
159
 
160
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
161
  with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
162
+ leaderboard = init_leaderboard(LEADERBOARD_DF, AGGREGATE_COLUMNS)
163
 
164
+ with gr.TabItem("🧭 By Domain", elem_id="llm-benchmark-tab-table", id=1):
165
+ by_domain_df = LEADERBOARD_DF.reindex(columns=BY_DOMAIN_COLUMNS)
166
+ init_leaderboard(by_domain_df, BY_DOMAIN_COLUMNS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
  with gr.TabItem("📤 Submit Results", elem_id="llm-benchmark-tab-table", id=2):
169
  gr.Markdown(
170
  "Upload a results file for manual review. Approved results will be merged into the main dataset.",
171
  elem_classes="markdown-text",
172
  )
173
+ gr.Markdown(example_record_markdown(), elem_classes="markdown-text")
174
  submission_file = gr.File(label="Results file (.json or .csv)", file_types=[".json", ".csv"])
175
  submit_button = gr.Button("Submit for Review")
176
  submission_status = gr.Markdown()
 
179
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
180
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
181
 
182
+ # Citation section hidden for now.
183
+ # with gr.Row():
184
+ # with gr.Accordion("📙 Citation", open=False):
185
+ # citation_button = gr.Textbox(
186
+ # value=CITATION_BUTTON_TEXT,
187
+ # label=CITATION_BUTTON_LABEL,
188
+ # lines=20,
189
+ # elem_id="citation-button",
190
+ # show_copy_button=True,
191
+ # )
192
 
193
  demo.queue(default_concurrency_limit=40).launch()
data/results.json CHANGED
@@ -1,41 +1,247 @@
1
  [
2
  {
3
- "model_name": "demo-model-1",
4
- "agent_name": "TemporalAgent-A",
5
  "agent_type": "single-LLM",
6
- "base_model": "demo-base-1",
7
- "T1_acc": 71.2,
8
- "T2_acc": 64.5,
9
- "T3_acc": 69.8,
10
- "T4_acc": 62.3,
11
- "T2_MAE": 0.41,
12
- "T4_sMAPE": 0.22,
13
- "Retail_T3_acc": 70.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  },
15
  {
16
- "model_name": "demo-model-2",
17
- "agent_name": "TemporalAgent-B",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  "agent_type": "general agent",
19
- "base_model": "demo-base-2",
20
- "T1_acc": 75.4,
21
- "T2_acc": 66.7,
22
- "T3_acc": 72.9,
23
- "T4_acc": 65.8,
24
- "T2_MAE": 0.38,
25
- "T4_sMAPE": 0.20,
26
- "MIMIC_T3_acc": 71.6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  },
28
  {
29
- "model_name": "demo-model-3",
30
- "agent_name": "TemporalAgent-C",
31
- "agent_type": "time-series-specific agent",
32
- "base_model": "demo-base-3",
33
- "T1_acc": 69.9,
34
- "T2_acc": 63.2,
35
- "T3_acc": 68.4,
36
- "T4_acc": 61.7,
37
- "T2_MAE": 0.44,
38
- "T4_sMAPE": 0.24,
39
- "PSML_T3_acc": 67.9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  }
41
  ]
 
1
  [
2
  {
3
+ "agent_name": "Single LLM",
 
4
  "agent_type": "single-LLM",
5
+ "base_model": "gpt-4o",
6
+ "T1_acc": null,
7
+ "T2_acc": null,
8
+ "T3_acc": null,
9
+ "T4_acc": null,
10
+ "FreshRetailNet_T1_acc": 0.6364,
11
+ "FreshRetailNet_T2_acc": 0.5227,
12
+ "FreshRetailNet_T3_acc": 0.0289,
13
+ "FreshRetailNet_T4_acc": 0.1364,
14
+ "PSML_T1_acc": 0.675,
15
+ "PSML_T2_acc": 0.2067,
16
+ "PSML_T3_acc": 0.348,
17
+ "PSML_T4_acc": 0.36,
18
+ "CausalChambers_T1_acc": 0.1333,
19
+ "CausalChambers_T2_acc": 0.2733,
20
+ "CausalChambers_T3_acc": 0.352,
21
+ "CausalChambers_T4_acc": 0.26,
22
+ "MIMIC_T1_acc": 0.4681,
23
+ "MIMIC_T2_acc": 0.2128,
24
+ "MIMIC_T3_acc": 0.3661,
25
+ "MIMIC_T4_acc": 0.2979,
26
+ "T2_sMAPE": null,
27
+ "T2_MAE": null,
28
+ "T2_OW_sMAPE_MIMIC": null,
29
+ "T2_OW_RMSSE_MIMIC": null,
30
+ "T4_sMAPE": null,
31
+ "T4_MAE": null,
32
+ "T4_OW_sMAPE_MIMIC": null,
33
+ "T4_OW_RMSSE_MIMIC": null,
34
+ "FreshRetailNet_T2_MAE": 0.12,
35
+ "FreshRetailNet_T2_sMAPE": 1.27,
36
+ "FreshRetailNet_T4_MAE": 0.34,
37
+ "FreshRetailNet_T4_sMAPE": 1.29,
38
+ "PSML_T2_MAE": 0.61,
39
+ "PSML_T2_sMAPE": 0.6,
40
+ "PSML_T4_MAE": 0.44,
41
+ "PSML_T4_sMAPE": 0.37,
42
+ "CausalChambers_T2_MAE": 2.48,
43
+ "CausalChambers_T2_OW_RMSSE": 0.0000257,
44
+ "CausalChambers_T4_MAE": 2.58,
45
+ "CausalChambers_T4_OW_RMSSE": 0.0000269,
46
+ "MIMIC_T2_OW_sMAPE": 15.2,
47
+ "MIMIC_T2_OW_RMSSE": 0.55,
48
+ "MIMIC_T4_OW_sMAPE": 16.86,
49
+ "MIMIC_T4_OW_RMSSE": 0.63
50
  },
51
  {
52
+ "agent_name": "TimeSeries Scientist",
53
+ "agent_type": "time-series-specific agent",
54
+ "base_model": "gpt-4o",
55
+ "T1_acc": null,
56
+ "T2_acc": null,
57
+ "T3_acc": null,
58
+ "T4_acc": null,
59
+ "FreshRetailNet_T1_acc": 0.3352,
60
+ "FreshRetailNet_T2_acc": 0.5682,
61
+ "FreshRetailNet_T3_acc": 0.0341,
62
+ "FreshRetailNet_T4_acc": 0.5682,
63
+ "PSML_T1_acc": 0.28,
64
+ "PSML_T2_acc": 0.2667,
65
+ "PSML_T3_acc": 0.216,
66
+ "PSML_T4_acc": 0.2733,
67
+ "CausalChambers_T1_acc": 0.2867,
68
+ "CausalChambers_T2_acc": 0.0267,
69
+ "CausalChambers_T3_acc": 0.216,
70
+ "CausalChambers_T4_acc": 0.0267,
71
+ "MIMIC_T1_acc": 0.1011,
72
+ "MIMIC_T2_acc": 0.234,
73
+ "MIMIC_T3_acc": 0.2887,
74
+ "MIMIC_T4_acc": 0.234,
75
+ "T2_sMAPE": null,
76
+ "T2_MAE": null,
77
+ "T2_OW_sMAPE_MIMIC": null,
78
+ "T2_OW_RMSSE_MIMIC": null,
79
+ "T4_sMAPE": null,
80
+ "T4_MAE": null,
81
+ "T4_OW_sMAPE_MIMIC": null,
82
+ "T4_OW_RMSSE_MIMIC": null,
83
+ "FreshRetailNet_T2_MAE": 0.35,
84
+ "FreshRetailNet_T2_sMAPE": 1.27,
85
+ "FreshRetailNet_T4_MAE": 0.51,
86
+ "FreshRetailNet_T4_sMAPE": 1.4,
87
+ "PSML_T2_MAE": 1.53,
88
+ "PSML_T2_sMAPE": 0.65,
89
+ "PSML_T4_MAE": 0.84,
90
+ "PSML_T4_sMAPE": 0.48,
91
+ "CausalChambers_T2_MAE": 2.44,
92
+ "CausalChambers_T2_OW_RMSSE": 0.0000253,
93
+ "CausalChambers_T4_MAE": 2.94,
94
+ "CausalChambers_T4_OW_RMSSE": 0.0000306,
95
+ "MIMIC_T2_OW_sMAPE": 15.81,
96
+ "MIMIC_T2_OW_RMSSE": 0.52,
97
+ "MIMIC_T4_OW_sMAPE": 17.18,
98
+ "MIMIC_T4_OW_RMSSE": 0.64
99
+ },
100
+ {
101
+ "agent_name": "AgentScope",
102
  "agent_type": "general agent",
103
+ "base_model": "gpt-4o",
104
+ "T1_acc": null,
105
+ "T2_acc": null,
106
+ "T3_acc": null,
107
+ "T4_acc": null,
108
+ "FreshRetailNet_T1_acc": 0.625,
109
+ "FreshRetailNet_T2_acc": 0.1212,
110
+ "FreshRetailNet_T3_acc": 0.1364,
111
+ "FreshRetailNet_T4_acc": 0.1894,
112
+ "PSML_T1_acc": 0.66,
113
+ "PSML_T2_acc": 0.2467,
114
+ "PSML_T3_acc": 0.272,
115
+ "PSML_T4_acc": 0.3533,
116
+ "CausalChambers_T1_acc": 0.12,
117
+ "CausalChambers_T2_acc": 0.46,
118
+ "CausalChambers_T3_acc": 0.44,
119
+ "CausalChambers_T4_acc": 0.32,
120
+ "MIMIC_T1_acc": 0.4468,
121
+ "MIMIC_T2_acc": 0.2128,
122
+ "MIMIC_T3_acc": 0.2395,
123
+ "MIMIC_T4_acc": 0.227,
124
+ "T2_sMAPE": null,
125
+ "T2_MAE": null,
126
+ "T2_OW_sMAPE_MIMIC": null,
127
+ "T2_OW_RMSSE_MIMIC": null,
128
+ "T4_sMAPE": null,
129
+ "T4_MAE": null,
130
+ "T4_OW_sMAPE_MIMIC": null,
131
+ "T4_OW_RMSSE_MIMIC": null,
132
+ "FreshRetailNet_T2_MAE": 0.12,
133
+ "FreshRetailNet_T2_sMAPE": 126.27,
134
+ "FreshRetailNet_T4_MAE": 0.2,
135
+ "FreshRetailNet_T4_sMAPE": 130.86,
136
+ "PSML_T2_MAE": 0.28,
137
+ "PSML_T2_sMAPE": 37.38,
138
+ "PSML_T4_MAE": 0.35,
139
+ "PSML_T4_sMAPE": 30.51,
140
+ "CausalChambers_T2_MAE": 2.76,
141
+ "CausalChambers_T2_OW_RMSSE": 0.00262,
142
+ "CausalChambers_T4_MAE": 2.66,
143
+ "CausalChambers_T4_OW_RMSSE": 0.00246,
144
+ "MIMIC_T2_OW_sMAPE": 11.05,
145
+ "MIMIC_T2_OW_RMSSE": 0.43,
146
+ "MIMIC_T4_OW_sMAPE": 12.02,
147
+ "MIMIC_T4_OW_RMSSE": 0.49
148
  },
149
  {
150
+ "agent_name": "MetaGPT",
151
+ "agent_type": "general agent",
152
+ "base_model": "gpt-4o",
153
+ "T1_acc": null,
154
+ "T2_acc": null,
155
+ "T3_acc": null,
156
+ "T4_acc": null,
157
+ "FreshRetailNet_T1_acc": 0.625,
158
+ "FreshRetailNet_T2_acc": 0.0909,
159
+ "FreshRetailNet_T3_acc": 0.0511,
160
+ "FreshRetailNet_T4_acc": 0.1439,
161
+ "PSML_T1_acc": 0.675,
162
+ "PSML_T2_acc": 0.2109,
163
+ "PSML_T3_acc": 0.22,
164
+ "PSML_T4_acc": 0.3133,
165
+ "CausalChambers_T1_acc": 0.1067,
166
+ "CausalChambers_T2_acc": 0.5933,
167
+ "CausalChambers_T3_acc": 0.452,
168
+ "CausalChambers_T4_acc": 0.16,
169
+ "MIMIC_T1_acc": 0.4574,
170
+ "MIMIC_T2_acc": 0.1702,
171
+ "MIMIC_T3_acc": 0.2897,
172
+ "MIMIC_T4_acc": 0.2553,
173
+ "T2_sMAPE": null,
174
+ "T2_MAE": null,
175
+ "T2_OW_sMAPE_MIMIC": null,
176
+ "T2_OW_RMSSE_MIMIC": null,
177
+ "T4_sMAPE": null,
178
+ "T4_MAE": null,
179
+ "T4_OW_sMAPE_MIMIC": null,
180
+ "T4_OW_RMSSE_MIMIC": null,
181
+ "FreshRetailNet_T2_MAE": 0.13,
182
+ "FreshRetailNet_T2_sMAPE": 126.59,
183
+ "FreshRetailNet_T4_MAE": 0.24,
184
+ "FreshRetailNet_T4_sMAPE": 127.22,
185
+ "PSML_T2_MAE": 0.34,
186
+ "PSML_T2_sMAPE": 24.74,
187
+ "PSML_T4_MAE": 0.4,
188
+ "PSML_T4_sMAPE": 43.47,
189
+ "CausalChambers_T2_MAE": 2.62,
190
+ "CausalChambers_T2_OW_RMSSE": 0.00272,
191
+ "CausalChambers_T4_MAE": 2.76,
192
+ "CausalChambers_T4_OW_RMSSE": 0.00287,
193
+ "MIMIC_T2_OW_sMAPE": 14.11,
194
+ "MIMIC_T2_OW_RMSSE": 0.53,
195
+ "MIMIC_T4_OW_sMAPE": 15.4,
196
+ "MIMIC_T4_OW_RMSSE": 0.63
197
+ },
198
+ {
199
+ "agent_name": "CAMEL",
200
+ "agent_type": "general agent",
201
+ "base_model": "gpt-4o",
202
+ "T1_acc": null,
203
+ "T2_acc": null,
204
+ "T3_acc": null,
205
+ "T4_acc": null,
206
+ "FreshRetailNet_T1_acc": 0.642,
207
+ "FreshRetailNet_T2_acc": 0.0076,
208
+ "FreshRetailNet_T3_acc": 0.0625,
209
+ "FreshRetailNet_T4_acc": 0.3106,
210
+ "PSML_T1_acc": 0.685,
211
+ "PSML_T2_acc": 0.14,
212
+ "PSML_T3_acc": 0.184,
213
+ "PSML_T4_acc": 0.3067,
214
+ "CausalChambers_T1_acc": 0.1,
215
+ "CausalChambers_T2_acc": 0.66,
216
+ "CausalChambers_T3_acc": 0.42,
217
+ "CausalChambers_T4_acc": 0.2667,
218
+ "MIMIC_T1_acc": 0.4681,
219
+ "MIMIC_T2_acc": 0.2057,
220
+ "MIMIC_T3_acc": 0.3014,
221
+ "MIMIC_T4_acc": 0.234,
222
+ "T2_sMAPE": null,
223
+ "T2_MAE": null,
224
+ "T2_OW_sMAPE_MIMIC": null,
225
+ "T2_OW_RMSSE_MIMIC": null,
226
+ "T4_sMAPE": null,
227
+ "T4_MAE": null,
228
+ "T4_OW_sMAPE_MIMIC": null,
229
+ "T4_OW_RMSSE_MIMIC": null,
230
+ "FreshRetailNet_T2_MAE": 0.13,
231
+ "FreshRetailNet_T2_sMAPE": 126.75,
232
+ "FreshRetailNet_T4_MAE": 0.28,
233
+ "FreshRetailNet_T4_sMAPE": 128.18,
234
+ "PSML_T2_MAE": 0.43,
235
+ "PSML_T2_sMAPE": 34.89,
236
+ "PSML_T4_MAE": 0.45,
237
+ "PSML_T4_sMAPE": 35.78,
238
+ "CausalChambers_T2_MAE": 2.99,
239
+ "CausalChambers_T2_OW_RMSSE": 0.00311,
240
+ "CausalChambers_T4_MAE": 2.5,
241
+ "CausalChambers_T4_OW_RMSSE": 0.0026,
242
+ "MIMIC_T2_OW_sMAPE": 12.02,
243
+ "MIMIC_T2_OW_RMSSE": 0.55,
244
+ "MIMIC_T4_OW_sMAPE": 15.74,
245
+ "MIMIC_T4_OW_RMSSE": 0.59
246
  }
247
  ]
src/about.py CHANGED
@@ -10,21 +10,39 @@ LLM_BENCHMARKS_TEXT = """
10
  ## What this leaderboard shows
11
 
12
  - One row per evaluated agent configuration
13
- - Task-family metrics for TemporalBench (T1–T4)
14
- - Optional domain-level metrics when provided (e.g., Retail_T3_acc)
 
15
 
16
  ## Data requirements
17
 
18
  Results are loaded from a local JSON or CSV file. Each record must include:
19
 
20
- - Identity fields: `model_name`, `agent_name`, `agent_type`, `base_model`
21
- - Required metrics: `T1_acc`, `T2_acc`, `T3_acc`, `T4_acc`
22
- - Optional metrics: `T2_MAE`, `T4_sMAPE`, and any additional numeric columns
 
 
 
 
 
 
 
 
23
 
24
  ## Submission workflow
25
 
26
  Uploads are stored locally for manual review. Approved results should be merged into
27
  the main results file to appear on the leaderboard.
 
 
 
 
 
 
 
 
 
28
  """
29
 
30
  EVALUATION_QUEUE_TEXT = ""
 
10
  ## What this leaderboard shows
11
 
12
  - One row per evaluated agent configuration
13
+ - Task-family MCQ metrics for TemporalBench (T1–T4)
14
+ - Forecasting metrics for T2/T4 (sMAPE, MAE) and MIMIC OW metrics when provided
15
+ - Dataset-level results for: FreshRetailNet, PSML, Causal Chambers, MIMIC
16
 
17
  ## Data requirements
18
 
19
  Results are loaded from a local JSON or CSV file. Each record must include:
20
 
21
+ - Identity fields: `agent_name`, `agent_type`, `base_model`
22
+ - Required metrics: `T1_acc`, `T2_acc`, `T3_acc`, `T4_acc` (computed overall)
23
+ - Optional metrics:
24
+ - Overall forecasting: `T2_sMAPE`, `T2_MAE`, `T4_sMAPE`, `T4_MAE`
25
+ - MIMIC overall OW: `MIMIC_T2_OW_sMAPE`, `MIMIC_T2_OW_RMSSE`, `MIMIC_T4_OW_sMAPE`, `MIMIC_T4_OW_RMSSE`
26
+ - Dataset-level metrics: `<Dataset>_T{1..4}_acc` and forecasting metrics per dataset
27
+
28
+ ## Overall computation
29
+
30
+ Overall T1–T4 accuracy and T2/T4 forecasting metrics are computed as weighted averages
31
+ from dataset-level results using question/series counts. Missing values are ignored.
32
 
33
  ## Submission workflow
34
 
35
  Uploads are stored locally for manual review. Approved results should be merged into
36
  the main results file to appear on the leaderboard.
37
+
38
+ ## Data access
39
+
40
+ The dataset is available at:
41
+ ```
42
+ https://huggingface.co/datasets/Melady/TemporalBench
43
+ ```
44
+ It includes all test tasks and a `forecast_metrics_utils.py` file that documents the
45
+ standard metric computation utilities.
46
  """
47
 
48
  EVALUATION_QUEUE_TEXT = ""
src/leaderboard/load_results.py CHANGED
@@ -20,6 +20,14 @@ def _is_number(value) -> bool:
20
  return math.isfinite(float(value))
21
 
22
 
 
 
 
 
 
 
 
 
23
  def _load_json_records(path: str) -> list[dict]:
24
  with open(path, "r") as fp:
25
  data = json.load(fp)
@@ -80,6 +88,8 @@ def validate_records(records: Iterable[dict]) -> None:
80
  for key, value in record.items():
81
  if key in SCHEMA.identity_fields:
82
  continue
 
 
83
  if not _is_number(value):
84
  raise ResultsValidationError(
85
  f"Record {idx} metric '{key}' must be numeric."
@@ -114,5 +124,86 @@ def build_dataframe(records: list[dict]) -> tuple[pd.DataFrame, list[str]]:
114
  metric_cols = infer_metric_columns(records)
115
  column_order = list(SCHEMA.identity_fields) + metric_cols
116
  df = pd.DataFrame.from_records(records)
 
 
 
 
 
 
 
117
  df = df[column_order]
118
  return df, column_order
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  return math.isfinite(float(value))
21
 
22
 
23
+ def _is_missing(value) -> bool:
24
+ if value is None:
25
+ return True
26
+ if isinstance(value, float) and math.isnan(value):
27
+ return True
28
+ return False
29
+
30
+
31
  def _load_json_records(path: str) -> list[dict]:
32
  with open(path, "r") as fp:
33
  data = json.load(fp)
 
88
  for key, value in record.items():
89
  if key in SCHEMA.identity_fields:
90
  continue
91
+ if _is_missing(value):
92
+ continue
93
  if not _is_number(value):
94
  raise ResultsValidationError(
95
  f"Record {idx} metric '{key}' must be numeric."
 
124
  metric_cols = infer_metric_columns(records)
125
  column_order = list(SCHEMA.identity_fields) + metric_cols
126
  df = pd.DataFrame.from_records(records)
127
+ df = apply_overall_metrics(df)
128
+ # Include computed columns (e.g., overall_mcq_acc) in display order if present.
129
+ for col in df.columns:
130
+ if col in SCHEMA.identity_fields:
131
+ continue
132
+ if col not in column_order:
133
+ column_order.append(col)
134
  df = df[column_order]
135
  return df, column_order
136
+
137
+
138
+ MCQ_QUESTIONS = {
139
+ "MIMIC": {"T1": 188, "T2": 141, "T3": 239, "T4": 141},
140
+ "PSML": {"T1": 200, "T2": 150, "T3": 250, "T4": 150},
141
+ "CausalChambers": {"T1": 150, "T2": 150, "T3": 250, "T4": 150},
142
+ "FreshRetailNet": {"T1": 176, "T2": 132, "T3": 176, "T4": 132},
143
+ }
144
+
145
+ FORECAST_SERIES = {
146
+ "MIMIC": {"T2": 282, "T4": 282},
147
+ "PSML": {"T2": 50, "T4": 50},
148
+ "CausalChambers": {"T2": 50, "T4": 50},
149
+ "FreshRetailNet": {"T2": 44, "T4": 44},
150
+ }
151
+
152
+
153
+ def _weighted_avg(row: pd.Series, columns: list[str], weights: list[int]) -> float | None:
154
+ total = 0.0
155
+ total_w = 0.0
156
+ for col, w in zip(columns, weights):
157
+ val = row.get(col)
158
+ if _is_missing(val):
159
+ continue
160
+ total += float(val) * w
161
+ total_w += w
162
+ if total_w == 0:
163
+ return None
164
+ return round(total / total_w, 4)
165
+
166
+
167
+ def apply_overall_metrics(df: pd.DataFrame) -> pd.DataFrame:
168
+ df = df.copy()
169
+
170
+ for task in ["T1", "T2", "T3", "T4"]:
171
+ cols = []
172
+ weights = []
173
+ for dataset, task_weights in MCQ_QUESTIONS.items():
174
+ col = f"{dataset}_{task}_acc"
175
+ if col in df.columns:
176
+ cols.append(col)
177
+ weights.append(task_weights[task])
178
+ if cols:
179
+ df[f"{task}_acc"] = df.apply(lambda r: _weighted_avg(r, cols, weights), axis=1)
180
+
181
+ overall_cols = []
182
+ overall_weights = []
183
+ for dataset, task_weights in MCQ_QUESTIONS.items():
184
+ for task, weight in task_weights.items():
185
+ col = f"{dataset}_{task}_acc"
186
+ if col in df.columns:
187
+ overall_cols.append(col)
188
+ overall_weights.append(weight)
189
+ if overall_cols:
190
+ df["overall_mcq_acc"] = df.apply(
191
+ lambda r: _weighted_avg(r, overall_cols, overall_weights), axis=1
192
+ )
193
+
194
+ for task in ["T2", "T4"]:
195
+ # sMAPE/MAE are defined for non-MIMIC datasets
196
+ for metric in ["sMAPE", "MAE"]:
197
+ cols = []
198
+ weights = []
199
+ for dataset, task_weights in FORECAST_SERIES.items():
200
+ if dataset == "MIMIC":
201
+ continue
202
+ col = f"{dataset}_{task}_{metric}"
203
+ if col in df.columns:
204
+ cols.append(col)
205
+ weights.append(task_weights[task])
206
+ if cols:
207
+ df[f"{task}_{metric}"] = df.apply(lambda r: _weighted_avg(r, cols, weights), axis=1)
208
+
209
+ return df
src/leaderboard/schema.py CHANGED
@@ -6,7 +6,6 @@ from dataclasses import dataclass
6
  @dataclass(frozen=True)
7
  class TemporalBenchSchema:
8
  identity_fields: tuple[str, ...] = (
9
- "model_name",
10
  "agent_name",
11
  "agent_type",
12
  "base_model",
@@ -18,8 +17,47 @@ class TemporalBenchSchema:
18
  "T4_acc",
19
  )
20
  optional_metrics: tuple[str, ...] = (
 
 
21
  "T2_MAE",
 
 
22
  "T4_sMAPE",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  )
24
 
25
 
 
6
  @dataclass(frozen=True)
7
  class TemporalBenchSchema:
8
  identity_fields: tuple[str, ...] = (
 
9
  "agent_name",
10
  "agent_type",
11
  "base_model",
 
17
  "T4_acc",
18
  )
19
  optional_metrics: tuple[str, ...] = (
20
+ "overall_mcq_acc",
21
+ "T2_sMAPE",
22
  "T2_MAE",
23
+ "T2_OW_sMAPE_MIMIC",
24
+ "T2_OW_RMSSE_MIMIC",
25
  "T4_sMAPE",
26
+ "T4_MAE",
27
+ "T4_OW_sMAPE_MIMIC",
28
+ "T4_OW_RMSSE_MIMIC",
29
+ "FreshRetailNet_T1_acc",
30
+ "FreshRetailNet_T2_acc",
31
+ "FreshRetailNet_T3_acc",
32
+ "FreshRetailNet_T4_acc",
33
+ "PSML_T1_acc",
34
+ "PSML_T2_acc",
35
+ "PSML_T3_acc",
36
+ "PSML_T4_acc",
37
+ "CausalChambers_T1_acc",
38
+ "CausalChambers_T2_acc",
39
+ "CausalChambers_T3_acc",
40
+ "CausalChambers_T4_acc",
41
+ "MIMIC_T1_acc",
42
+ "MIMIC_T2_acc",
43
+ "MIMIC_T3_acc",
44
+ "MIMIC_T4_acc",
45
+ "FreshRetailNet_T2_sMAPE",
46
+ "FreshRetailNet_T2_MAE",
47
+ "PSML_T2_sMAPE",
48
+ "PSML_T2_MAE",
49
+ "CausalChambers_T2_sMAPE",
50
+ "CausalChambers_T2_MAE",
51
+ "MIMIC_T2_OW_sMAPE",
52
+ "MIMIC_T2_OW_RMSSE",
53
+ "FreshRetailNet_T4_sMAPE",
54
+ "FreshRetailNet_T4_MAE",
55
+ "PSML_T4_sMAPE",
56
+ "PSML_T4_MAE",
57
+ "CausalChambers_T4_sMAPE",
58
+ "CausalChambers_T4_MAE",
59
+ "MIMIC_T4_OW_sMAPE",
60
+ "MIMIC_T4_OW_RMSSE",
61
  )
62
 
63