fishmingyu committed
Commit e839e6a · 1 Parent(s): 9e93468

init raw app

Files changed (6)
  1. .gitignore +2 -0
  2. README.md +6 -7
  3. app.py +330 -0
  4. data/method_data.json +160 -0
  5. data/model_data.json +94 -0
  6. requirements.txt +4 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+__pycache__
+*.DS_Store
README.md CHANGED
@@ -1,12 +1,11 @@
 ---
-title: AMA Bench
-emoji: 👀
-colorFrom: purple
-colorTo: blue
+title: AMA-Bench Leaderboard
+emoji: 🧠
+colorFrom: blue
+colorTo: purple
 sdk: gradio
-sdk_version: 6.5.1
+sdk_version: 5.23.3
 app_file: app.py
 pinned: false
+license: apache-2.0
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,330 @@
+import gradio as gr
+import pandas as pd
+import json
+import numpy as np
+import plotly.graph_objects as go
+
+# ---------------------------------------------------------------------------
+# Data loading
+# ---------------------------------------------------------------------------
+
+def load_data(path):
+    with open(path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+MODEL_DATA = load_data("data/model_data.json")
+METHOD_DATA = load_data("data/method_data.json")
+
+METRICS = ["Recall", "Causal Inference", "State Updating", "State Abstraction"]
+ALL_METRICS = METRICS + ["Average"]
+
+# ---------------------------------------------------------------------------
+# DataFrame helpers
+# ---------------------------------------------------------------------------
+
+def build_dataframe(data):
+    """Build a pandas DataFrame showing Accuracy (F1) for each metric."""
+    rows = []
+    for entry in data["entries"]:
+        row = {"Method": entry["method"]}
+        if entry.get("category"):
+            row["Category"] = entry["category"]
+        for m in ALL_METRICS:
+            acc = entry["scores"][m]["accuracy"]
+            f1 = entry["scores"][m]["f1"]
+            row[m] = f"{acc:.4f} ({f1:.4f})"
+        # Store raw average accuracy for sorting
+        row["_sort_avg"] = entry["scores"]["Average"]["accuracy"]
+        rows.append(row)
+
+    df = pd.DataFrame(rows)
+    df = df.sort_values("_sort_avg", ascending=False).reset_index(drop=True)
+    df = df.drop(columns=["_sort_avg"])
+    return df
+
+
+def build_chart_dataframe(data):
+    """Build a DataFrame with raw numeric Accuracy values for charting."""
+    rows = []
+    for entry in data["entries"]:
+        row = {"Method": entry["method"]}
+        for m in ALL_METRICS:
+            row[f"{m} (Acc)"] = entry["scores"][m]["accuracy"]
+        row["_sort_avg"] = entry["scores"]["Average"]["accuracy"]
+        rows.append(row)
+
+    df = pd.DataFrame(rows)
+    df = df.sort_values("_sort_avg", ascending=False).reset_index(drop=True)
+    df = df.drop(columns=["_sort_avg"])
+    return df
+
+
+def add_medals(df):
+    """Add medal emojis to the top-3 Method names."""
+    df = df.copy()
+    medals = ["\U0001f947", "\U0001f948", "\U0001f949"]
+    for i in range(min(3, len(df))):
+        df.loc[i, "Method"] = f"{medals[i]} {df.loc[i, 'Method']}"
+    return df
+
+
+# ---------------------------------------------------------------------------
+# Chart helpers
+# ---------------------------------------------------------------------------
+
+BAR_COLORS = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA"]
+
+
+def make_bar_chart(chart_df, title=""):
+    """Create a grouped vertical bar chart showing Accuracy per metric."""
+    fig = go.Figure()
+
+    for i, m in enumerate(METRICS):
+        fig.add_trace(go.Bar(
+            x=chart_df["Method"],
+            y=chart_df[f"{m} (Acc)"],
+            name=m,
+            marker_color=BAR_COLORS[i % len(BAR_COLORS)],
+        ))
+
+    # Wrap long titles to 2 lines
+    if len(title) > 60:
+        mid = len(title) // 2
+        space_pos = title.find(" ", mid)
+        if space_pos == -1:
+            space_pos = title.rfind(" ", 0, mid)
+        if space_pos != -1:
+            title = title[:space_pos] + "<br>" + title[space_pos + 1:]
+
+    fig.update_layout(
+        barmode="group",
+        title=dict(text=title, x=0.5, font=dict(size=14)),
+        yaxis=dict(title="Accuracy", range=[0, 1]),
+        xaxis=dict(tickangle=-45),
+        height=500,
+        margin=dict(l=60, r=40, t=100, b=140),
+        legend=dict(
+            orientation="h", yanchor="bottom", y=1.02,
+            xanchor="center", x=0.5, font=dict(size=12),
+        ),
+        bargap=0.2,
+        bargroupgap=0.05,
+    )
+    return fig
+
+
+# ---------------------------------------------------------------------------
+# Update functions
+# ---------------------------------------------------------------------------
+
+def update_leaderboard(data, top_n):
+    """Return (display_df, bar_fig) for a given data source."""
+    df = build_dataframe(data)
+    chart_df = build_chart_dataframe(data)
+
+    df = df.head(int(top_n))
+    chart_df = chart_df.head(int(top_n))
+
+    display_df = add_medals(df)
+
+    title = data.get("title", "Score Breakdown")
+    bar = make_bar_chart(chart_df, title)
+
+    return display_df, bar
+
+
+def update_model_leaderboard(top_n):
+    return update_leaderboard(MODEL_DATA, top_n)
+
+
+def update_method_leaderboard(top_n):
+    return update_leaderboard(METHOD_DATA, top_n)
+
+
+# ---------------------------------------------------------------------------
+# App
+# ---------------------------------------------------------------------------
+
+CSS = """
+html, body {
+    overflow-y: auto !important;
+    width: 100% !important;
+}
+.gradio-container {
+    max-width: 1200px !important;
+    margin: auto !important;
+}
+.header-banner {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    color: white;
+    padding: 24px 32px;
+    border-radius: 12px;
+    margin-bottom: 16px;
+    text-align: center;
+}
+.header-banner h1 { margin: 0 0 8px 0; font-size: 2em; }
+.header-banner p { margin: 0; font-size: 1.1em; opacity: 0.9; }
+.dark .header-banner {
+    background: linear-gradient(135deg, #434190 0%, #553c6b 100%);
+}
+.table-container {
+    border-radius: 8px;
+    box-shadow: 0 2px 10px rgba(0,0,0,0.08);
+}
+.tip-text {
+    font-size: 13px; color: #666; font-style: italic; margin-top: 4px;
+}
+.dark .tip-text { color: #aaa; }
+.metric-note {
+    background: #f0f4ff; padding: 10px 16px; border-radius: 8px;
+    border-left: 4px solid #667eea; margin-bottom: 12px; font-size: 14px;
+}
+.dark .metric-note {
+    background: #2d2d44; border-left-color: #764ba2;
+}
+"""
+
+
+def build_app():
+    with gr.Blocks(css=CSS, title="AMA-Bench Leaderboard") as demo:
+
+        # Header
+        gr.HTML("""
+        <div class="header-banner">
+            <h1>AMA-Bench Leaderboard</h1>
+            <p>Agent Memory Assessment Benchmark &mdash; Evaluating LLMs and Memory Methods on Cognitive Tasks</p>
+        </div>
+        """)
+
+        with gr.Tabs():
+            # ============================================================
+            # Tab 1: Model Leaderboard
+            # ============================================================
+            with gr.Tab("Model Leaderboard"):
+                gr.Markdown("""
+                <div class="metric-note">
+                Comparing <strong>LLM models</strong> across 4 cognitive tasks: Recall, Causal Inference, State Updating, and State Abstraction.
+                Results are reported as <strong>Accuracy (F1)</strong>. Sorted by Average Accuracy.
+                </div>
+                """)
+
+                with gr.Row():
+                    model_top_n = gr.Slider(
+                        minimum=1,
+                        maximum=len(MODEL_DATA["entries"]),
+                        step=1,
+                        value=len(MODEL_DATA["entries"]),
+                        label="Number of models to display",
+                    )
+
+                # Chart
+                with gr.Row():
+                    gr.Markdown("### Data Visualization")
+                model_bar = gr.Plot(label="Score Breakdown")
+                gr.Markdown("*Click a legend entry to hide or show that metric; double-click to isolate it.*", elem_classes="tip-text")
+
+                # Table
+                with gr.Row():
+                    gr.Markdown("### Detailed Results")
+                init_model_df, _ = update_model_leaderboard(len(MODEL_DATA["entries"]))
+                model_table = gr.DataFrame(
+                    value=init_model_df,
+                    elem_classes="table-container",
+                    show_row_numbers=True,
+                    show_fullscreen_button=True,
+                    show_search="search",
+                    interactive=False,
+                )
+
+                # Wire events
+                model_top_n.change(
+                    update_model_leaderboard,
+                    inputs=[model_top_n],
+                    outputs=[model_table, model_bar],
+                )
+
+                # Populate the initially empty plot when the page loads
+                demo.load(
+                    update_model_leaderboard,
+                    inputs=[model_top_n],
+                    outputs=[model_table, model_bar],
+                )
+
+            # ============================================================
+            # Tab 2: Method Leaderboard
+            # ============================================================
+            with gr.Tab("Method Leaderboard"):
+                gr.Markdown("""
+                <div class="metric-note">
+                Comparing <strong>RAG &amp; Agent Memory methods</strong> (base model: Qwen-32B) across 4 cognitive tasks.
+                Results are reported as <strong>Accuracy (F1)</strong>. Sorted by Average Accuracy.
+                </div>
+                """)
+
+                with gr.Row():
+                    method_top_n = gr.Slider(
+                        minimum=1,
+                        maximum=len(METHOD_DATA["entries"]),
+                        step=1,
+                        value=len(METHOD_DATA["entries"]),
+                        label="Number of methods to display",
+                    )
+
+                # Chart
+                with gr.Row():
+                    gr.Markdown("### Data Visualization")
+                method_bar = gr.Plot(label="Score Breakdown")
+                gr.Markdown("*Click a legend entry to hide or show that metric; double-click to isolate it.*", elem_classes="tip-text")
+
+                # Table
+                with gr.Row():
+                    gr.Markdown("### Detailed Results")
+                init_method_df, _ = update_method_leaderboard(len(METHOD_DATA["entries"]))
+                method_table = gr.DataFrame(
+                    value=init_method_df,
+                    elem_classes="table-container",
+                    show_row_numbers=True,
+                    show_fullscreen_button=True,
+                    show_search="search",
+                    interactive=False,
+                )
+
+                # Wire events
+                method_top_n.change(
+                    update_method_leaderboard,
+                    inputs=[method_top_n],
+                    outputs=[method_table, method_bar],
+                )
+
+                # Populate the initially empty plot when the page loads
+                demo.load(
+                    update_method_leaderboard,
+                    inputs=[method_top_n],
+                    outputs=[method_table, method_bar],
+                )
+
+            # ============================================================
+            # Tab 3: About
+            # ============================================================
+            with gr.Tab("About"):
+                gr.Markdown("""
+                ## AMA-Bench: Agent Memory Assessment Benchmark
+
+                AMA-Bench evaluates the memory capabilities of LLMs and memory-augmented agents across four cognitive dimensions:
+                **Recall** (retrieving stored information), **Causal Inference** (cause-and-effect reasoning), **State Updating** (tracking evolving states), and **State Abstraction** (forming higher-level representations).
+
+                **Benchmarks** &mdash; We evaluate on two complementary subsets:
+                (1) **Real-world Subset:** 2,496 QA pairs.
+                (2) **Synthetic Subset:** 1,200 QA pairs stratified across five trajectory lengths (8K, 16K, 32K, 64K, and 128K tokens), with 240 samples per length.
+
+                **Leaderboard Tabs** &mdash; The *Model Leaderboard* compares LLM models directly; the *Method Leaderboard* compares RAG and Agent Memory methods built on Qwen-32B as the base model.
+
+                **Metrics** &mdash; Results are reported as **Accuracy (F1)**.
+
+                ---
+                *For questions or submissions, please open a discussion in the Community tab.*
+                """)
+
+    return demo
+
+
+if __name__ == "__main__":
+    demo_app = build_app()
+    demo_app.launch(debug=True, show_error=True)
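For reviewers who want to sanity-check the data plumbing without launching the Gradio UI, a minimal sketch: it reuses the functions defined in app.py above, assumes the repo root as working directory, and `method_scores.html` is just an illustrative output name.

```python
# Sketch: exercise the leaderboard pipeline headlessly (no UI, no server).
# Importing app runs load_data(), so data/*.json must resolve from the cwd.
from app import update_method_leaderboard

df, fig = update_method_leaderboard(top_n=5)  # top-5 methods by average accuracy
print(df.to_string(index=False))              # formatted "Accuracy (F1)" table
fig.write_html("method_scores.html")          # inspect the grouped bar chart in a browser
```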
data/method_data.json ADDED
@@ -0,0 +1,160 @@
+{
+  "title": "Performance comparison of Agent Memory and RAG methods (base model: Qwen-32B) on real-world subset",
+  "metrics": ["Recall", "Causal Inference", "State Updating", "State Abstraction", "Average"],
+  "entries": [
+    {
+      "method": "BM25",
+      "category": "RAG",
+      "scores": {
+        "Recall": {"accuracy": 0.3301, "f1": 0.1465},
+        "Causal Inference": {"accuracy": 0.4264, "f1": 0.1549},
+        "State Updating": {"accuracy": 0.3450, "f1": 0.1325},
+        "State Abstraction": {"accuracy": 0.2498, "f1": 0.1623},
+        "Average": {"accuracy": 0.3436, "f1": 0.1475}
+      }
+    },
+    {
+      "method": "Qwen3-Emb-4B",
+      "category": "RAG",
+      "scores": {
+        "Recall": {"accuracy": 0.4843, "f1": 0.1590},
+        "Causal Inference": {"accuracy": 0.4974, "f1": 0.1549},
+        "State Updating": {"accuracy": 0.3520, "f1": 0.1353},
+        "State Abstraction": {"accuracy": 0.3011, "f1": 0.1610},
+        "Average": {"accuracy": 0.4227, "f1": 0.1522}
+      }
+    },
+    {
+      "method": "GraphRAG",
+      "category": "RAG",
+      "scores": {
+        "Recall": {"accuracy": 0.3077, "f1": 0.2769},
+        "Causal Inference": {"accuracy": 0.3905, "f1": 0.2634},
+        "State Updating": {"accuracy": 0.3140, "f1": 0.2551},
+        "State Abstraction": {"accuracy": 0.2879, "f1": 0.2588},
+        "Average": {"accuracy": 0.3258, "f1": 0.2650}
+      }
+    },
+    {
+      "method": "HippoRAG2",
+      "category": "RAG",
+      "scores": {
+        "Recall": {"accuracy": 0.4579, "f1": 0.2356},
+        "Causal Inference": {"accuracy": 0.5080, "f1": 0.1966},
+        "State Updating": {"accuracy": 0.4403, "f1": 0.1892},
+        "State Abstraction": {"accuracy": 0.3538, "f1": 0.1785},
+        "Average": {"accuracy": 0.4480, "f1": 0.2048}
+      }
+    },
+    {
+      "method": "MemAgent",
+      "category": "Agent Memory",
+      "scores": {
+        "Recall": {"accuracy": 0.2550, "f1": 0.1489},
+        "Causal Inference": {"accuracy": 0.3380, "f1": 0.1606},
+        "State Updating": {"accuracy": 0.2849, "f1": 0.1432},
+        "State Abstraction": {"accuracy": 0.2202, "f1": 0.1655},
+        "Average": {"accuracy": 0.2768, "f1": 0.1530}
+      }
+    },
+    {
+      "method": "Mem1",
+      "category": "Agent Memory",
+      "scores": {
+        "Recall": {"accuracy": 0.1180, "f1": 0.1857},
+        "Causal Inference": {"accuracy": 0.1427, "f1": 0.1732},
+        "State Updating": {"accuracy": 0.1205, "f1": 0.1659},
+        "State Abstraction": {"accuracy": 0.1080, "f1": 0.2042},
+        "Average": {"accuracy": 0.1229, "f1": 0.1807}
+      }
+    },
+    {
+      "method": "Amem",
+      "category": "Agent Memory",
+      "scores": {
+        "Recall": {"accuracy": 0.3084, "f1": 0.2707},
+        "Causal Inference": {"accuracy": 0.3653, "f1": 0.2731},
+        "State Updating": {"accuracy": 0.3088, "f1": 0.2480},
+        "State Abstraction": {"accuracy": 0.2873, "f1": 0.2953},
+        "Average": {"accuracy": 0.3186, "f1": 0.2695}
+      }
+    },
+    {
+      "method": "Mem0",
+      "category": "Agent Memory",
+      "scores": {
+        "Recall": {"accuracy": 0.2011, "f1": 0.2413},
+        "Causal Inference": {"accuracy": 0.2645, "f1": 0.2443},
+        "State Updating": {"accuracy": 0.2101, "f1": 0.2225},
+        "State Abstraction": {"accuracy": 0.1516, "f1": 0.2241},
+        "Average": {"accuracy": 0.2104, "f1": 0.2343}
+      }
+    },
+    {
+      "method": "MemoRAG",
+      "category": "Agent Memory",
+      "scores": {
+        "Recall": {"accuracy": 0.4708, "f1": 0.1789},
+        "Causal Inference": {"accuracy": 0.5497, "f1": 0.1811},
+        "State Updating": {"accuracy": 0.4257, "f1": 0.1713},
+        "State Abstraction": {"accuracy": 0.3659, "f1": 0.2073},
+        "Average": {"accuracy": 0.4606, "f1": 0.1822}
+      }
+    },
+    {
+      "method": "MemGPT",
+      "category": "Agent Memory",
+      "scores": {
+        "Recall": {"accuracy": 0.3289, "f1": 0.1318},
+        "Causal Inference": {"accuracy": 0.4404, "f1": 0.1475},
+        "State Updating": {"accuracy": 0.2809, "f1": 0.1259},
+        "State Abstraction": {"accuracy": 0.2526, "f1": 0.1431},
+        "Average": {"accuracy": 0.3304, "f1": 0.1359}
+      }
+    },
+    {
+      "method": "Mem-alpha",
+      "category": "Agent Memory",
+      "scores": {
+        "Recall": {"accuracy": 0.2876, "f1": 0.2325},
+        "Causal Inference": {"accuracy": 0.4172, "f1": 0.1993},
+        "State Updating": {"accuracy": 0.3064, "f1": 0.2000},
+        "State Abstraction": {"accuracy": 0.2171, "f1": 0.2135},
+        "Average": {"accuracy": 0.3117, "f1": 0.2130}
+      }
+    },
+    {
+      "method": "MemoryBank",
+      "category": "Agent Memory",
+      "scores": {
+        "Recall": {"accuracy": 0.3231, "f1": 0.3128},
+        "Causal Inference": {"accuracy": 0.4100, "f1": 0.2861},
+        "State Updating": {"accuracy": 0.3006, "f1": 0.2678},
+        "State Abstraction": {"accuracy": 0.3332, "f1": 0.3011},
+        "Average": {"accuracy": 0.3397, "f1": 0.2928}
+      }
+    },
+    {
+      "method": "Simple Mem",
+      "category": "Agent Memory",
+      "scores": {
+        "Recall": {"accuracy": 0.2012, "f1": 0.2039},
+        "Causal Inference": {"accuracy": 0.1884, "f1": 0.1612},
+        "State Updating": {"accuracy": 0.1764, "f1": 0.1594},
+        "State Abstraction": {"accuracy": 0.1373, "f1": 0.1689},
+        "Average": {"accuracy": 0.1811, "f1": 0.1764}
+      }
+    },
+    {
+      "method": "AMA Agent",
+      "category": "Agent Memory",
+      "scores": {
+        "Recall": {"accuracy": 0.6238, "f1": 0.3280},
+        "Causal Inference": {"accuracy": 0.6145, "f1": 0.3103},
+        "State Updating": {"accuracy": 0.5305, "f1": 0.2625},
+        "State Abstraction": {"accuracy": 0.4719, "f1": 0.2825},
+        "Average": {"accuracy": 0.5722, "f1": 0.2992}
+      }
+    }
+  ]
+}
data/model_data.json ADDED
@@ -0,0 +1,94 @@
+{
+  "title": "Performance of different models on real-world subset",
+  "metrics": ["Recall", "Causal Inference", "State Updating", "State Abstraction", "Average"],
+  "entries": [
+    {
+      "method": "Claude Haiku 3.5",
+      "category": null,
+      "scores": {
+        "Recall": {"accuracy": 0.4943, "f1": 0.3510},
+        "Causal Inference": {"accuracy": 0.4507, "f1": 0.2792},
+        "State Updating": {"accuracy": 0.4287, "f1": 0.3015},
+        "State Abstraction": {"accuracy": 0.3090, "f1": 0.2648},
+        "Average": {"accuracy": 0.4361, "f1": 0.3067}
+      }
+    },
+    {
+      "method": "GPT-5-mini",
+      "category": null,
+      "scores": {
+        "Recall": {"accuracy": 0.6951, "f1": 0.4010},
+        "Causal Inference": {"accuracy": 0.7157, "f1": 0.3027},
+        "State Updating": {"accuracy": 0.6575, "f1": 0.3288},
+        "State Abstraction": {"accuracy": 0.6235, "f1": 0.3262},
+        "Average": {"accuracy": 0.6784, "f1": 0.3464}
+      }
+    },
+    {
+      "method": "GPT 5.2",
+      "category": null,
+      "scores": {
+        "Recall": {"accuracy": 0.7741, "f1": 0.4758},
+        "Causal Inference": {"accuracy": 0.8047, "f1": 0.3512},
+        "State Updating": {"accuracy": 0.6563, "f1": 0.3686},
+        "State Abstraction": {"accuracy": 0.6037, "f1": 0.3582},
+        "Average": {"accuracy": 0.7226, "f1": 0.3988}
+      }
+    },
+    {
+      "method": "Gemini 2.5 Flash",
+      "category": null,
+      "scores": {
+        "Recall": {"accuracy": 0.5834, "f1": 0.3682},
+        "Causal Inference": {"accuracy": 0.5087, "f1": 0.2628},
+        "State Updating": {"accuracy": 0.5000, "f1": 0.2395},
+        "State Abstraction": {"accuracy": 0.4196, "f1": 0.2361},
+        "Average": {"accuracy": 0.5168, "f1": 0.2878}
+      }
+    },
+    {
+      "method": "Qwen2.5-14B-1M",
+      "category": null,
+      "scores": {
+        "Recall": {"accuracy": 0.5570, "f1": 0.4157},
+        "Causal Inference": {"accuracy": 0.4111, "f1": 0.3209},
+        "State Updating": {"accuracy": 0.4728, "f1": 0.3348},
+        "State Abstraction": {"accuracy": 0.3368, "f1": 0.3560},
+        "Average": {"accuracy": 0.4638, "f1": 0.3622}
+      }
+    },
+    {
+      "method": "Qwen3-32B",
+      "category": null,
+      "scores": {
+        "Recall": {"accuracy": 0.6149, "f1": 0.4074},
+        "Causal Inference": {"accuracy": 0.5178, "f1": 0.3289},
+        "State Updating": {"accuracy": 0.4903, "f1": 0.3334},
+        "State Abstraction": {"accuracy": 0.3657, "f1": 0.3172},
+        "Average": {"accuracy": 0.5181, "f1": 0.3545}
+      }
+    },
+    {
+      "method": "Qwen3-14B",
+      "category": null,
+      "scores": {
+        "Recall": {"accuracy": 0.5675, "f1": 0.3636},
+        "Causal Inference": {"accuracy": 0.4430, "f1": 0.2931},
+        "State Updating": {"accuracy": 0.4502, "f1": 0.3204},
+        "State Abstraction": {"accuracy": 0.3176, "f1": 0.2716},
+        "Average": {"accuracy": 0.4659, "f1": 0.3203}
+      }
+    },
+    {
+      "method": "Qwen3-8B",
+      "category": null,
+      "scores": {
+        "Recall": {"accuracy": 0.5024, "f1": 0.3801},
+        "Causal Inference": {"accuracy": 0.3776, "f1": 0.2830},
+        "State Updating": {"accuracy": 0.3987, "f1": 0.3177},
+        "State Abstraction": {"accuracy": 0.2923, "f1": 0.2792},
+        "Average": {"accuracy": 0.4109, "f1": 0.3240}
+      }
+    }
+  ]
+}
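Both data files share the same schema: app.py indexes `entry["scores"][metric]["accuracy"]` and `["f1"]` for every metric including `Average`, so a missing key only surfaces as an error at render time. A small validation sketch one could run before committing new results (a hypothetical helper, not part of this commit):

```python
import json

# The five keys build_dataframe() expects under each entry's "scores".
REQUIRED = ["Recall", "Causal Inference", "State Updating", "State Abstraction", "Average"]

def validate(path):
    """Fail loudly if any entry is missing an accuracy/f1 pair for a required metric."""
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    for entry in data["entries"]:
        for metric in REQUIRED:
            scores = entry["scores"][metric]  # KeyError here means the metric is absent
            assert {"accuracy", "f1"} <= scores.keys(), f"{entry['method']}: {metric} incomplete"
    print(f"{path}: {len(data['entries'])} entries OK")

validate("data/method_data.json")
validate("data/model_data.json")
```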
requirements.txt ADDED
@@ -0,0 +1,4 @@
+gradio==5.23.3
+pandas>=2.0.0
+plotly>=5.15.0
+numpy>=1.24.0
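To try the Space locally, `pip install -r requirements.txt` followed by `python app.py` should be all that is needed; the `gradio==5.23.3` pin matches the `sdk_version` declared in the README front matter, which keeps local behavior aligned with the Hugging Face Spaces runtime.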