uuuhjb committed on
Commit
bb0a764
·
1 Parent(s): d8b2e03

add submit function

Browse files
Files changed (7) hide show
  1. .gitignore +3 -1
  2. app.py +144 -58
  3. data/agent_capability.json +17 -2
  4. data/model_capability.json +17 -2
  5. data/model_domain.json +78 -78
  6. scorer.py +20 -39
  7. utils.py +8 -4
.gitignore CHANGED
@@ -1,2 +1,4 @@
1
  __pycache__
2
- *.DS_Store
 
 
 
1
  __pycache__
2
+ *.DS_Store
3
+ submissions/
4
+ test/
app.py CHANGED
@@ -17,7 +17,7 @@ except ImportError:
17
  def format_log(msg): return f"✅ {msg}"
18
 
19
  try:
20
- from scorer import score_submission, extract_uppercase_letters
21
  except ImportError:
22
  score_submission = None
23
  extract_uppercase_letters = None
@@ -165,36 +165,28 @@ COLORS = [
165
  # ---------------------------------------------------------------------------
166
 
167
  def calculate_f1_score(predictions, references):
168
- """Calculate F1 score for multi-label classification."""
169
  if not predictions or not references:
170
  return 0.0
171
 
172
- if extract_uppercase_letters is None:
173
- # Fallback implementation
174
- def extract_letters(text):
175
- return ''.join(sorted(set(c for c in str(text) if c.isupper() and c.isalpha())))
176
- extract_fn = extract_letters
177
- else:
178
- extract_fn = extract_uppercase_letters
179
-
180
  total_precision = 0.0
181
  total_recall = 0.0
182
  count = 0
183
 
184
  for pred, ref in zip(predictions, references):
185
- pred_set = set(extract_fn(pred))
186
- ref_set = set(extract_fn(ref))
187
 
188
- if not pred_set and not ref_set:
189
  total_precision += 1.0
190
  total_recall += 1.0
191
  count += 1
192
- elif not pred_set or not ref_set:
193
  count += 1
194
  else:
195
- intersection = len(pred_set & ref_set)
196
- precision = intersection / len(pred_set) if pred_set else 0
197
- recall = intersection / len(ref_set) if ref_set else 0
198
  total_precision += precision
199
  total_recall += recall
200
  count += 1
@@ -208,54 +200,96 @@ def calculate_f1_score(predictions, references):
208
  if avg_precision + avg_recall == 0:
209
  return 0.0
210
 
211
- f1 = 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall)
212
- return f1
213
 
214
 
215
  def update_json_with_submission(model_name, scores_by_metric, scored_submissions, is_agent=False, model_family=""):
216
- """Update JSON files with new submission data."""
217
  try:
218
  if is_agent:
219
  capability_file = "data/agent_capability.json"
220
- domain_file = "data/agent_domain.json"
221
  else:
222
  capability_file = "data/model_capability.json"
223
- domain_file = "data/model_domain.json"
224
 
225
- # Load existing data
226
  with open(capability_file, 'r', encoding='utf-8') as f:
227
  capability_data = json.load(f)
228
 
229
- # Update capability data
230
  for capability in METRICS:
231
- if capability in scores_by_metric and capability in capability_data:
232
- metric_data = scores_by_metric[capability]
233
-
234
- # Get submissions for this capability
235
- capability_submissions = [
236
- s for s in scored_submissions
237
- if s.get('metric_category') == capability
238
- ]
239
-
240
- # Calculate F1
241
- if capability_submissions:
242
- predictions = [s.get('answer', '') for s in capability_submissions]
243
- references = [s.get('reference_answer', '') for s in capability_submissions]
244
- f1 = calculate_f1_score(predictions, references)
245
- else:
246
- f1 = 0.0
247
 
248
- capability_data[capability][model_name] = {
249
- "accuracy": metric_data['accuracy'],
250
- "model_family": model_family,
251
- "f1": f1
252
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
- # Save updated data
255
  with open(capability_file, 'w', encoding='utf-8') as f:
256
  json.dump(capability_data, f, indent=2, ensure_ascii=False)
257
-
258
  print(f"✓ Updated {capability_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  return True
260
 
261
  except Exception as e:
@@ -315,7 +349,7 @@ def add_new_submission(model, submission_type, url, file, organisation, mail, mo
315
  print(f"✓ Overall accuracy: {average_accuracy:.4f}")
316
  for metric_name, metric_data in scores_by_metric.items():
317
  if metric_name != "Average":
318
- print(f" {metric_name}: {metric_data['accuracy']:.4f} ({metric_data['correct']}/{metric_data['count']})")
319
 
320
  # Save locally
321
  submission_dir = f"submissions/{organisation}_{model}"
@@ -340,7 +374,7 @@ def add_new_submission(model, submission_type, url, file, organisation, mail, mo
340
  metric_name: {
341
  "accuracy": float(metric_data["accuracy"]),
342
  "count": int(metric_data["count"]),
343
- "correct": int(metric_data["correct"])
344
  }
345
  for metric_name, metric_data in scores_by_metric.items()
346
  }
@@ -361,24 +395,73 @@ def add_new_submission(model, submission_type, url, file, organisation, mail, mo
361
  if update_success:
362
  print("✓ Updated leaderboard JSON files")
363
  # Reload data
364
- global AGENT_CAPABILITY, AGENT_DOMAIN, MODEL_CAPABILITY, MODEL_DOMAIN
365
  if is_agent:
366
  AGENT_CAPABILITY = load_json_data("data/agent_capability.json")
367
  AGENT_DOMAIN = load_json_data("data/agent_domain.json")
368
  else:
369
  MODEL_CAPABILITY = load_json_data("data/model_capability.json")
370
  MODEL_DOMAIN = load_json_data("data/model_domain.json")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
 
372
  # Format message
373
  message = f"✅ **Submission successful!**\n\n"
374
  message += f"**{'Agent' if is_agent else 'Model'}:** {model}\n"
375
  message += f"**Organisation:** {organisation}\n"
376
  message += f"**Overall Accuracy:** {average_accuracy:.4f}\n\n"
 
377
  message += "**Scores by Capability:**\n"
378
  for metric_name in METRICS:
379
  if metric_name in scores_by_metric:
380
- metric_data = scores_by_metric[metric_name]
381
- message += f"- **{metric_name}:** {metric_data['accuracy']:.4f} ({metric_data['correct']}/{metric_data['count']})\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
382
 
383
  message += f"\n**Submission ID:** {timestamp}\n"
384
  if update_success:
@@ -1014,16 +1097,19 @@ def build_app():
1014
  gr.Markdown("""
1015
  **Submission Format:**
1016
 
1017
- Your JSONL file should contain one prediction per line:
1018
  ```json
1019
- {"episode_id": "ep_001", "question": "What is X?", "answer": "A"}
1020
- {"episode_id": "ep_002", "question": "What is Y?", "answer": "BC"}
 
1021
  ```
1022
 
1023
  **Required fields:**
1024
  - `episode_id`: Episode identifier
1025
- - `question`: The question text
1026
- - `answer`: Your model's answer (uppercase letters: A, B, AB, etc.)
 
 
1027
  """)
1028
 
1029
  with gr.Row():
@@ -1101,4 +1187,4 @@ Results are reported as **Accuracy** and **F1 Score**:
1101
 
1102
  if __name__ == "__main__":
1103
  demo_app = build_app()
1104
- demo_app.launch(debug=True, show_error=True)
 
17
  def format_log(msg): return f"✅ {msg}"
18
 
19
  try:
20
+ from scorer import score_submission
21
  except ImportError:
22
  score_submission = None
23
  extract_uppercase_letters = None
 
165
  # ---------------------------------------------------------------------------
166
 
167
  def calculate_f1_score(predictions, references):
168
+ """Calculate token-level F1 score for string answers."""
169
  if not predictions or not references:
170
  return 0.0
171
 
 
 
 
 
 
 
 
 
172
  total_precision = 0.0
173
  total_recall = 0.0
174
  count = 0
175
 
176
  for pred, ref in zip(predictions, references):
177
+ pred_tokens = set(str(pred).strip().lower().split())
178
+ ref_tokens = set(str(ref).strip().lower().split())
179
 
180
+ if not pred_tokens and not ref_tokens:
181
  total_precision += 1.0
182
  total_recall += 1.0
183
  count += 1
184
+ elif not pred_tokens or not ref_tokens:
185
  count += 1
186
  else:
187
+ intersection = len(pred_tokens & ref_tokens)
188
+ precision = intersection / len(pred_tokens)
189
+ recall = intersection / len(ref_tokens)
190
  total_precision += precision
191
  total_recall += recall
192
  count += 1
 
200
  if avg_precision + avg_recall == 0:
201
  return 0.0
202
 
203
+ return 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall)
 
204
 
205
 
206
  def update_json_with_submission(model_name, scores_by_metric, scored_submissions, is_agent=False, model_family=""):
207
+ """Update capability and domain JSON files with new submission data."""
208
  try:
209
  if is_agent:
210
  capability_file = "data/agent_capability.json"
211
+ domain_file = "data/agent_domain.json"
212
  else:
213
  capability_file = "data/model_capability.json"
214
+ domain_file = "data/model_domain.json"
215
 
216
+ # ── 1. Update capability file ────────────────────────────────────
217
  with open(capability_file, 'r', encoding='utf-8') as f:
218
  capability_data = json.load(f)
219
 
 
220
  for capability in METRICS:
221
+ if capability not in scores_by_metric or capability not in capability_data:
222
+ continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
+ metric_data = scores_by_metric[capability]
225
+
226
+ # submissions belonging to this capability
227
+ cap_subs = [s for s in scored_submissions if s.get('metric_category') == capability]
228
+
229
+ # F1 calculated from this capability's predictions vs references
230
+ if cap_subs:
231
+ predictions = [s.get('answer', '') for s in cap_subs]
232
+ references = [s.get('reference_answer', '') for s in cap_subs]
233
+ f1 = calculate_f1_score(predictions, references)
234
+ else:
235
+ f1 = 0.0
236
+
237
+ capability_data[capability][model_name] = {
238
+ "accuracy": metric_data['accuracy'],
239
+ "model_family": model_family,
240
+ "f1": f1,
241
+ }
242
 
 
243
  with open(capability_file, 'w', encoding='utf-8') as f:
244
  json.dump(capability_data, f, indent=2, ensure_ascii=False)
 
245
  print(f"✓ Updated {capability_file}")
246
+
247
+ # ── 2. Update domain file ────────────────────────────────────────
248
+ with open(domain_file, 'r', encoding='utf-8') as f:
249
+ domain_data = json.load(f)
250
+
251
+ # Group scored_submissions by domain
252
+ from collections import defaultdict
253
+ domain_groups = defaultdict(list)
254
+ for s in scored_submissions:
255
+ dom = s.get('domain', '').strip().upper()
256
+ if dom:
257
+ domain_groups[dom].append(s)
258
+
259
+ # Known domain keys in the JSON (may differ in capitalisation/alias)
260
+ DOMAIN_KEY_MAP = {
261
+ "GAMING": "GAMING",
262
+ "GAME": "GAMING",
263
+ "EMBODIED_AI": "EMBODIED_AI",
264
+ "WEB": "WEB",
265
+ "TEXT2SQL": "TEXT2SQL",
266
+ "OPENWORLD_QA": "OPENWORLD_QA",
267
+ "SOFTWARE": "SOFTWARE",
268
+ "SOFTWARE_ENGINEER": "SOFTWARE",
269
+ }
270
+
271
+ for raw_domain, subs in domain_groups.items():
272
+ json_domain = DOMAIN_KEY_MAP.get(raw_domain)
273
+ if json_domain is None or json_domain not in domain_data:
274
+ continue # unknown domain, skip
275
+
276
+ scores = [s.get('score', 0.0) for s in subs]
277
+ accuracy = float(np.mean(scores)) if scores else 0.0
278
+
279
+ predictions = [s.get('answer', '') for s in subs]
280
+ references = [s.get('reference_answer', '') for s in subs]
281
+ f1 = calculate_f1_score(predictions, references)
282
+
283
+ domain_data[json_domain][model_name] = {
284
+ "accuracy": accuracy,
285
+ "model_family": model_family,
286
+ "f1": f1,
287
+ }
288
+
289
+ with open(domain_file, 'w', encoding='utf-8') as f:
290
+ json.dump(domain_data, f, indent=2, ensure_ascii=False)
291
+ print(f"✓ Updated {domain_file}")
292
+
293
  return True
294
 
295
  except Exception as e:
 
349
  print(f"✓ Overall accuracy: {average_accuracy:.4f}")
350
  for metric_name, metric_data in scores_by_metric.items():
351
  if metric_name != "Average":
352
+ print(f" {metric_name}: {metric_data['accuracy']:.4f} ({metric_data.get('correct', 0)}/{metric_data['count']})")
353
 
354
  # Save locally
355
  submission_dir = f"submissions/{organisation}_{model}"
 
374
  metric_name: {
375
  "accuracy": float(metric_data["accuracy"]),
376
  "count": int(metric_data["count"]),
377
+ "correct": int(metric_data.get("correct", 0))
378
  }
379
  for metric_name, metric_data in scores_by_metric.items()
380
  }
 
395
  if update_success:
396
  print("✓ Updated leaderboard JSON files")
397
  # Reload data
398
+ global AGENT_CAPABILITY, AGENT_DOMAIN, MODEL_CAPABILITY, MODEL_DOMAIN, model_domain_filtered
399
  if is_agent:
400
  AGENT_CAPABILITY = load_json_data("data/agent_capability.json")
401
  AGENT_DOMAIN = load_json_data("data/agent_domain.json")
402
  else:
403
  MODEL_CAPABILITY = load_json_data("data/model_capability.json")
404
  MODEL_DOMAIN = load_json_data("data/model_domain.json")
405
+ # Recompute filtered model domain view
406
+ _model_items = set()
407
+ for _cap_data in MODEL_CAPABILITY.values():
408
+ _model_items.update(_cap_data.keys())
409
+ model_domain_filtered = filter_data_by_items(MODEL_DOMAIN, _model_items)
410
+ if not any(len(v) > 0 for v in model_domain_filtered.values()):
411
+ model_domain_filtered = {}
412
+
413
+ # Compute per-domain scores from scored_submissions
414
+ from collections import defaultdict
415
+ domain_groups = defaultdict(list)
416
+ for s in scored_submissions:
417
+ dom = s.get("domain", "").strip().upper()
418
+ if dom:
419
+ domain_groups[dom].append(s)
420
+
421
+ domain_scores = {}
422
+ for dom, subs in sorted(domain_groups.items()):
423
+ scores_list = [s.get("score", 0.0) for s in subs]
424
+ preds = [s.get("answer", "") for s in subs]
425
+ refs = [s.get("reference_answer", "") for s in subs]
426
+ domain_scores[dom] = {
427
+ "accuracy": float(np.mean(scores_list)) if scores_list else 0.0,
428
+ "f1": calculate_f1_score(preds, refs),
429
+ "correct": int(sum(scores_list)),
430
+ "count": len(scores_list),
431
+ }
432
 
433
  # Format message
434
  message = f"✅ **Submission successful!**\n\n"
435
  message += f"**{'Agent' if is_agent else 'Model'}:** {model}\n"
436
  message += f"**Organisation:** {organisation}\n"
437
  message += f"**Overall Accuracy:** {average_accuracy:.4f}\n\n"
438
+
439
  message += "**Scores by Capability:**\n"
440
  for metric_name in METRICS:
441
  if metric_name in scores_by_metric:
442
+ md = scores_by_metric[metric_name]
443
+ # compute F1 for this capability
444
+ cap_subs = [s for s in scored_submissions if s.get("metric_category") == metric_name]
445
+ if cap_subs:
446
+ preds = [s.get("answer", "") for s in cap_subs]
447
+ refs = [s.get("reference_answer", "") for s in cap_subs]
448
+ cap_f1 = calculate_f1_score(preds, refs)
449
+ else:
450
+ cap_f1 = 0.0
451
+ message += (
452
+ f"- **{metric_name}:** Accuracy {md['accuracy']:.4f}"
453
+ f" ({md.get('correct', 0)}/{md['count']})"
454
+ f", F1 {cap_f1:.4f}\n"
455
+ )
456
+
457
+ if domain_scores:
458
+ message += "\n**Scores by Domain:**\n"
459
+ for dom, ds in domain_scores.items():
460
+ message += (
461
+ f"- **{dom}:** Accuracy {ds['accuracy']:.4f}"
462
+ f" ({ds['correct']}/{ds['count']})"
463
+ f", F1 {ds['f1']:.4f}\n"
464
+ )
465
 
466
  message += f"\n**Submission ID:** {timestamp}\n"
467
  if update_success:
 
1097
  gr.Markdown("""
1098
  **Submission Format:**
1099
 
1100
+ Your JSONL file should contain one question-answer pair per line:
1101
  ```json
1102
+ {"episode_id": "ep_001", "question": "What is X?", "answer": "your answer"}
1103
+ {"episode_id": "ep_001", "question": "What happened next?", "answer": "another answer"}
1104
+ {"episode_id": "ep_002", "question": "What is the goal?", "answer": "yet another answer"}
1105
  ```
1106
 
1107
  **Required fields:**
1108
  - `episode_id`: Episode identifier
1109
+ - `question`: Question text (must match exactly the question in the dataset)
1110
+ - `answer`: Your model's predicted free-form string answer
1111
+
1112
+ Each `episode_id` + `question` pair must be unique. Answers are evaluated by case-insensitive exact string match.
1113
  """)
1114
 
1115
  with gr.Row():
 
1187
 
1188
  if __name__ == "__main__":
1189
  demo_app = build_app()
1190
+ demo_app.launch(debug=True, show_error=True)
data/agent_capability.json CHANGED
@@ -66,7 +66,7 @@
66
  "f1": 0.4152833333333333
67
  }
68
  },
69
- "Casual Inference": {
70
  "Qwen3-Embedding-4B": {
71
  "accuracy": 0.48618333333333336,
72
  "model_family": "Qwen3-32B",
@@ -131,6 +131,11 @@
131
  "accuracy": 0.5399999999999999,
132
  "model_family": "Qwen3-32B",
133
  "f1": 0.34326666666666666
 
 
 
 
 
134
  }
135
  },
136
  "State Updating": {
@@ -198,9 +203,14 @@
198
  "accuracy": 0.48335,
199
  "model_family": "Qwen3-32B",
200
  "f1": 0.3447166666666666
 
 
 
 
 
201
  }
202
  },
203
- "State abstraction": {
204
  "Qwen3-Embedding-4B": {
205
  "accuracy": 0.3022666666666667,
206
  "model_family": "Qwen3-32B",
@@ -265,6 +275,11 @@
265
  "accuracy": 0.37979999999999997,
266
  "model_family": "Qwen3-32B",
267
  "f1": 0.3152333333333333
 
 
 
 
 
268
  }
269
  }
270
  }
 
66
  "f1": 0.4152833333333333
67
  }
68
  },
69
+ "Causal Inference": {
70
  "Qwen3-Embedding-4B": {
71
  "accuracy": 0.48618333333333336,
72
  "model_family": "Qwen3-32B",
 
131
  "accuracy": 0.5399999999999999,
132
  "model_family": "Qwen3-32B",
133
  "f1": 0.34326666666666666
134
+ },
135
+ "1": {
136
+ "accuracy": 0.0,
137
+ "model_family": "1",
138
+ "f1": 0.0
139
  }
140
  },
141
  "State Updating": {
 
203
  "accuracy": 0.48335,
204
  "model_family": "Qwen3-32B",
205
  "f1": 0.3447166666666666
206
+ },
207
+ "1": {
208
+ "accuracy": 1.0,
209
+ "model_family": "1",
210
+ "f1": 1.0
211
  }
212
  },
213
+ "State Abstraction": {
214
  "Qwen3-Embedding-4B": {
215
  "accuracy": 0.3022666666666667,
216
  "model_family": "Qwen3-32B",
 
275
  "accuracy": 0.37979999999999997,
276
  "model_family": "Qwen3-32B",
277
  "f1": 0.3152333333333333
278
+ },
279
+ "1": {
280
+ "accuracy": 0.0,
281
+ "model_family": "1",
282
+ "f1": 0.0
283
  }
284
  }
285
  }
data/model_capability.json CHANGED
@@ -145,7 +145,7 @@
145
  "f1": 0.3065
146
  }
147
  },
148
- "Casual Inference": {
149
  "Claude Haiku 3.5": {
150
  "accuracy": 0.4799333333333333,
151
  "f1": 0.29278333333333334
@@ -289,6 +289,11 @@
289
  "AMA-agent (Ours) (8B)": {
290
  "accuracy": 0.4806166666666667,
291
  "f1": 0.23224999999999998
 
 
 
 
 
292
  }
293
  },
294
  "State Updating": {
@@ -435,9 +440,14 @@
435
  "AMA-agent (Ours) (8B)": {
436
  "accuracy": 0.43645,
437
  "f1": 0.21893333333333334
 
 
 
 
 
438
  }
439
  },
440
- "State abstraction": {
441
  "Claude Haiku 3.5": {
442
  "accuracy": 0.32758333333333334,
443
  "f1": 0.2684166666666667
@@ -581,6 +591,11 @@
581
  "AMA-agent (Ours) (8B)": {
582
  "accuracy": 0.37873333333333337,
583
  "f1": 0.21493333333333334
 
 
 
 
 
584
  }
585
  }
586
  }
 
145
  "f1": 0.3065
146
  }
147
  },
148
+ "Causal Inference": {
149
  "Claude Haiku 3.5": {
150
  "accuracy": 0.4799333333333333,
151
  "f1": 0.29278333333333334
 
289
  "AMA-agent (Ours) (8B)": {
290
  "accuracy": 0.4806166666666667,
291
  "f1": 0.23224999999999998
292
+ },
293
+ "1": {
294
+ "accuracy": 0.0,
295
+ "model_family": "1",
296
+ "f1": 0.0
297
  }
298
  },
299
  "State Updating": {
 
440
  "AMA-agent (Ours) (8B)": {
441
  "accuracy": 0.43645,
442
  "f1": 0.21893333333333334
443
+ },
444
+ "1": {
445
+ "accuracy": 1.0,
446
+ "model_family": "1",
447
+ "f1": 1.0
448
  }
449
  },
450
+ "State Abstraction": {
451
  "Claude Haiku 3.5": {
452
  "accuracy": 0.32758333333333334,
453
  "f1": 0.2684166666666667
 
591
  "AMA-agent (Ours) (8B)": {
592
  "accuracy": 0.37873333333333337,
593
  "f1": 0.21493333333333334
594
+ },
595
+ "1": {
596
+ "accuracy": 0.0,
597
+ "model_family": "1",
598
+ "f1": 0.0
599
  }
600
  }
601
  }
data/model_domain.json CHANGED
@@ -1,401 +1,401 @@
1
  {
2
  "GAMING": {
3
- "Qwen3-Embedding-4B": {
4
  "accuracy": 0.5157,
5
  "model_family": "Qwen3-32B",
6
  "f1": 0.2195
7
  },
8
- "GRAPHRAG": {
9
  "accuracy": 0.5595249999999999,
10
  "model_family": "Qwen3-32B",
11
  "f1": 0.288175
12
  },
13
- "Hipporag2": {
14
  "accuracy": 0.60555,
15
  "model_family": "Qwen3-32B",
16
  "f1": 0.2273
17
  },
18
- "Memagent": {
19
  "accuracy": 0.31775,
20
  "model_family": "Qwen3-32B",
21
  "f1": 0.22945
22
  },
23
- "Mem1": {
24
  "accuracy": 0.225875,
25
  "model_family": "Qwen3-32B",
26
  "f1": 0.18155
27
  },
28
- "Amem": {
29
  "accuracy": 0.4247,
30
  "model_family": "Qwen3-32B",
31
  "f1": 0.343125
32
  },
33
- "Mem0": {
34
  "accuracy": 0.39085000000000003,
35
  "model_family": "Qwen3-32B",
36
  "f1": 0.346
37
  },
38
- "Memorag": {
39
  "accuracy": 0.557625,
40
  "model_family": "Qwen3-32B",
41
  "f1": 0.257875
42
  },
43
- "Memgpt": {
44
  "accuracy": 0.435425,
45
  "model_family": "Qwen3-32B",
46
  "f1": 0.318475
47
  },
48
- "Mem-alpha": {
49
  "accuracy": 0.43895,
50
  "model_family": "Qwen3-32B",
51
  "f1": 0.319875
52
  },
53
- "Memorybank": {
54
  "accuracy": 0.43885,
55
  "model_family": "Qwen3-32B",
56
  "f1": 0.325325
57
  },
58
- "Simple mem": {
59
  "accuracy": 0.288775,
60
  "model_family": "Qwen3-32B",
61
  "f1": 0.163
62
  },
63
- "Long context": {
64
  "accuracy": 0.5355,
65
  "model_family": "Qwen3-32B",
66
  "f1": 0.321775
67
  }
68
  },
69
  "EMBODIED_AI": {
70
- "Qwen3-Embedding-4B": {
71
  "accuracy": 0.204325,
72
  "model_family": "Qwen3-32B",
73
  "f1": 0.1353
74
  },
75
- "GRAPHRAG": {
76
  "accuracy": 0.1476,
77
  "model_family": "Qwen3-32B",
78
  "f1": 0.3799
79
  },
80
- "Hipporag2": {
81
  "accuracy": 0.17627500000000002,
82
  "model_family": "Qwen3-32B",
83
  "f1": 0.181875
84
  },
85
- "Memagent": {
86
  "accuracy": 0.10617499999999999,
87
  "model_family": "Qwen3-32B",
88
  "f1": 0.144975
89
  },
90
- "Mem1": {
91
  "accuracy": 0.03355,
92
  "model_family": "Qwen3-32B",
93
  "f1": 0.22445
94
  },
95
- "Amem": {
96
  "accuracy": 0.183975,
97
  "model_family": "Qwen3-32B",
98
  "f1": 0.3524
99
  },
100
- "Mem0": {
101
  "accuracy": 0.11109999999999999,
102
  "model_family": "Qwen3-32B",
103
  "f1": 0.27005
104
  },
105
- "Memorag": {
106
  "accuracy": 0.085425,
107
  "model_family": "Qwen3-32B",
108
  "f1": 0.17677500000000002
109
  },
110
- "Memgpt": {
111
  "accuracy": 0.1122,
112
  "model_family": "Qwen3-32B",
113
  "f1": 0.10405
114
  },
115
- "Mem-alpha": {
116
  "accuracy": 0.15515,
117
  "model_family": "Qwen3-32B",
118
  "f1": 0.23735
119
  },
120
- "Memorybank": {
121
  "accuracy": 0.16025,
122
  "model_family": "Qwen3-32B",
123
  "f1": 0.426475
124
  },
125
- "Simple mem": {
126
  "accuracy": 0.045975,
127
  "model_family": "Qwen3-32B",
128
  "f1": 0.2284
129
  },
130
- "Long context": {
131
  "accuracy": 0.48185,
132
  "model_family": "Qwen3-32B",
133
  "f1": 0.56
134
  }
135
  },
136
  "WEB": {
137
- "Qwen3-Embedding-4B": {
138
  "accuracy": 0.2872,
139
  "model_family": "Qwen3-32B",
140
  "f1": 0.08535000000000001
141
  },
142
- "GRAPHRAG": {
143
  "accuracy": 0.420675,
144
  "model_family": "Qwen3-32B",
145
  "f1": 0.268075
146
  },
147
- "Hipporag2": {
148
  "accuracy": 0.3761,
149
  "model_family": "Qwen3-32B",
150
  "f1": 0.120125
151
  },
152
- "Memagent": {
153
  "accuracy": 0.263975,
154
  "model_family": "Qwen3-32B",
155
  "f1": 0.09065
156
  },
157
- "Mem1": {
158
  "accuracy": 0.131275,
159
  "model_family": "Qwen3-32B",
160
  "f1": 0.1518
161
  },
162
- "Amem": {
163
  "accuracy": 0.391525,
164
  "model_family": "Qwen3-32B",
165
  "f1": 0.2294
166
  },
167
- "Mem0": {
168
  "accuracy": 0.2705,
169
  "model_family": "Qwen3-32B",
170
  "f1": 0.21675
171
  },
172
- "Memorag": {
173
  "accuracy": 0.364975,
174
  "model_family": "Qwen3-32B",
175
  "f1": 0.108075
176
  },
177
- "Memgpt": {
178
  "accuracy": 0.327975,
179
  "model_family": "Qwen3-32B",
180
  "f1": 0.07105
181
  },
182
- "Mem-alpha": {
183
  "accuracy": 0.362925,
184
  "model_family": "Qwen3-32B",
185
  "f1": 0.15944999999999998
186
  },
187
- "Memorybank": {
188
  "accuracy": 0.401775,
189
  "model_family": "Qwen3-32B",
190
  "f1": 0.23704999999999998
191
  },
192
- "Simple mem": {
193
  "accuracy": 0.13974999999999999,
194
  "model_family": "Qwen3-32B",
195
  "f1": 0.1679
196
  },
197
- "Long context": {
198
  "accuracy": 0.554275,
199
  "model_family": "Qwen3-32B",
200
  "f1": 0.348075
201
  }
202
  },
203
  "TEXT2SQL": {
204
- "Qwen3-Embedding-4B": {
205
  "accuracy": 0.4164,
206
  "model_family": "Qwen3-32B",
207
  "f1": 0.249325
208
  },
209
- "GRAPHRAG": {
210
  "accuracy": 0.21665,
211
  "model_family": "Qwen3-32B",
212
  "f1": 0.221675
213
  },
214
- "Hipporag2": {
215
  "accuracy": 0.46267499999999995,
216
  "model_family": "Qwen3-32B",
217
  "f1": 0.26935
218
  },
219
- "Memagent": {
220
  "accuracy": 0.245375,
221
  "model_family": "Qwen3-32B",
222
  "f1": 0.245375
223
  },
224
- "Mem1": {
225
  "accuracy": 0.06465,
226
  "model_family": "Qwen3-32B",
227
  "f1": 0.19990000000000002
228
  },
229
- "Amem": {
230
  "accuracy": 0.31405,
231
  "model_family": "Qwen3-32B",
232
  "f1": 0.289625
233
  },
234
- "Mem0": {
235
  "accuracy": 0.1192,
236
  "model_family": "Qwen3-32B",
237
  "f1": 0.2326
238
  },
239
- "Memorag": {
240
  "accuracy": 0.619,
241
  "model_family": "Qwen3-32B",
242
  "f1": 0.296475
243
  },
244
- "Memgpt": {
245
  "accuracy": 0.206875,
246
  "model_family": "Qwen3-32B",
247
  "f1": 0.178975
248
  },
249
- "Mem-alpha": {
250
  "accuracy": 0.30065,
251
  "model_family": "Qwen3-32B",
252
  "f1": 0.26505
253
  },
254
- "Memorybank": {
255
  "accuracy": 0.23855,
256
  "model_family": "Qwen3-32B",
257
  "f1": 0.28355
258
  },
259
- "Simple mem": {
260
  "accuracy": 0.192575,
261
  "model_family": "Qwen3-32B",
262
  "f1": 0.157225
263
  },
264
- "Long context": {
265
  "accuracy": 0.456075,
266
  "model_family": "Qwen3-32B",
267
  "f1": 0.295275
268
  }
269
  },
270
  "OPENWORLD_QA": {
271
- "Qwen3-Embedding-4B": {
272
  "accuracy": 0.399125,
273
  "model_family": "Qwen3-32B",
274
  "f1": 0.0837
275
  },
276
- "GRAPHRAG": {
277
  "accuracy": 0.31845,
278
  "model_family": "Qwen3-32B",
279
  "f1": 0.22635
280
  },
281
- "Hipporag2": {
282
  "accuracy": 0.45825,
283
  "model_family": "Qwen3-32B",
284
  "f1": 0.2362
285
  },
286
- "Memagent": {
287
  "accuracy": 0.158225,
288
  "model_family": "Qwen3-32B",
289
  "f1": 0.0704
290
  },
291
- "Mem1": {
292
  "accuracy": 0.12065000000000001,
293
  "model_family": "Qwen3-32B",
294
  "f1": 0.15005
295
  },
296
- "Amem": {
297
  "accuracy": 0.29359999999999997,
298
  "model_family": "Qwen3-32B",
299
  "f1": 0.2079
300
  },
301
- "Mem0": {
302
  "accuracy": 0.16197499999999998,
303
  "model_family": "Qwen3-32B",
304
  "f1": 0.1604
305
  },
306
- "Memorag": {
307
  "accuracy": 0.411375,
308
  "model_family": "Qwen3-32B",
309
  "f1": 0.093675
310
  },
311
- "Memgpt": {
312
  "accuracy": 0.3155,
313
  "model_family": "Qwen3-32B",
314
  "f1": 0.0595
315
  },
316
- "Mem-alpha": {
317
  "accuracy": 0.2301,
318
  "model_family": "Qwen3-32B",
319
  "f1": 0.13345
320
  },
321
- "Memorybank": {
322
  "accuracy": 0.3486,
323
  "model_family": "Qwen3-32B",
324
  "f1": 0.2519
325
  },
326
- "Simple mem": {
327
  "accuracy": 0.12154999999999999,
328
  "model_family": "Qwen3-32B",
329
  "f1": 0.1312
330
  },
331
- "Long context": {
332
  "accuracy": 0.49785,
333
  "model_family": "Qwen3-32B",
334
  "f1": 0.3349
335
  }
336
  },
337
  "SOFTWARE": {
338
- "Qwen3-Embedding-4B": {
339
  "accuracy": 0.599025,
340
  "model_family": "Qwen3-32B",
341
  "f1": 0.083575
342
  },
343
- "GRAPHRAG": {
344
  "accuracy": 0.348875,
345
  "model_family": "Qwen3-32B",
346
  "f1": 0.229825
347
  },
348
- "Hipporag2": {
349
  "accuracy": 0.5299,
350
  "model_family": "Qwen3-32B",
351
  "f1": 0.1279
352
  },
353
- "Memagent": {
354
  "accuracy": 0.53965,
355
  "model_family": "Qwen3-32B",
356
  "f1": 0.09085
357
  },
358
- "Mem1": {
359
  "accuracy": 0.18595,
360
  "model_family": "Qwen3-32B",
361
  "f1": 0.17527500000000001
362
  },
363
- "Amem": {
364
  "accuracy": 0.29615,
365
  "model_family": "Qwen3-32B",
366
  "f1": 0.20395
367
  },
368
- "Mem0": {
369
  "accuracy": 0.2366,
370
  "model_family": "Qwen3-32B",
371
  "f1": 0.176975
372
  },
373
- "Memorag": {
374
  "accuracy": 0.55005,
375
  "model_family": "Qwen3-32B",
376
  "f1": 0.10707499999999999
377
  },
378
- "Memgpt": {
379
  "accuracy": 0.599125,
380
  "model_family": "Qwen3-32B",
381
  "f1": 0.066575
382
  },
383
- "Mem-alpha": {
384
  "accuracy": 0.3476,
385
  "model_family": "Qwen3-32B",
386
  "f1": 0.12492500000000001
387
  },
388
- "Memorybank": {
389
  "accuracy": 0.5072,
390
  "model_family": "Qwen3-32B",
391
  "f1": 0.240875
392
  },
393
- "Simple mem": {
394
  "accuracy": 0.2431,
395
  "model_family": "Qwen3-32B",
396
  "f1": 0.2005
397
  },
398
- "Long context": {
399
  "accuracy": 0.4847,
400
  "model_family": "Qwen3-32B",
401
  "f1": 0.267725
 
1
  {
2
  "GAMING": {
3
+ "Qwen3-Embedding-4B (32B)": {
4
  "accuracy": 0.5157,
5
  "model_family": "Qwen3-32B",
6
  "f1": 0.2195
7
  },
8
+ "GRAPHRAG (32B)": {
9
  "accuracy": 0.5595249999999999,
10
  "model_family": "Qwen3-32B",
11
  "f1": 0.288175
12
  },
13
+ "Hipporag2 (32B)": {
14
  "accuracy": 0.60555,
15
  "model_family": "Qwen3-32B",
16
  "f1": 0.2273
17
  },
18
+ "Memagent (32B)": {
19
  "accuracy": 0.31775,
20
  "model_family": "Qwen3-32B",
21
  "f1": 0.22945
22
  },
23
+ "Mem1 (32B)": {
24
  "accuracy": 0.225875,
25
  "model_family": "Qwen3-32B",
26
  "f1": 0.18155
27
  },
28
+ "Amem (32B)": {
29
  "accuracy": 0.4247,
30
  "model_family": "Qwen3-32B",
31
  "f1": 0.343125
32
  },
33
+ "Mem0 (32B)": {
34
  "accuracy": 0.39085000000000003,
35
  "model_family": "Qwen3-32B",
36
  "f1": 0.346
37
  },
38
+ "Memorag (32B)": {
39
  "accuracy": 0.557625,
40
  "model_family": "Qwen3-32B",
41
  "f1": 0.257875
42
  },
43
+ "Memgpt (32B)": {
44
  "accuracy": 0.435425,
45
  "model_family": "Qwen3-32B",
46
  "f1": 0.318475
47
  },
48
+ "Mem-alpha (32B)": {
49
  "accuracy": 0.43895,
50
  "model_family": "Qwen3-32B",
51
  "f1": 0.319875
52
  },
53
+ "Memorybank (32B)": {
54
  "accuracy": 0.43885,
55
  "model_family": "Qwen3-32B",
56
  "f1": 0.325325
57
  },
58
+ "Simple mem (32B)": {
59
  "accuracy": 0.288775,
60
  "model_family": "Qwen3-32B",
61
  "f1": 0.163
62
  },
63
+ "Long context (32B)": {
64
  "accuracy": 0.5355,
65
  "model_family": "Qwen3-32B",
66
  "f1": 0.321775
67
  }
68
  },
69
  "EMBODIED_AI": {
70
+ "Qwen3-Embedding-4B (32B)": {
71
  "accuracy": 0.204325,
72
  "model_family": "Qwen3-32B",
73
  "f1": 0.1353
74
  },
75
+ "GRAPHRAG (32B)": {
76
  "accuracy": 0.1476,
77
  "model_family": "Qwen3-32B",
78
  "f1": 0.3799
79
  },
80
+ "Hipporag2 (32B)": {
81
  "accuracy": 0.17627500000000002,
82
  "model_family": "Qwen3-32B",
83
  "f1": 0.181875
84
  },
85
+ "Memagent (32B)": {
86
  "accuracy": 0.10617499999999999,
87
  "model_family": "Qwen3-32B",
88
  "f1": 0.144975
89
  },
90
+ "Mem1 (32B)": {
91
  "accuracy": 0.03355,
92
  "model_family": "Qwen3-32B",
93
  "f1": 0.22445
94
  },
95
+ "Amem (32B)": {
96
  "accuracy": 0.183975,
97
  "model_family": "Qwen3-32B",
98
  "f1": 0.3524
99
  },
100
+ "Mem0 (32B)": {
101
  "accuracy": 0.11109999999999999,
102
  "model_family": "Qwen3-32B",
103
  "f1": 0.27005
104
  },
105
+ "Memorag (32B)": {
106
  "accuracy": 0.085425,
107
  "model_family": "Qwen3-32B",
108
  "f1": 0.17677500000000002
109
  },
110
+ "Memgpt (32B)": {
111
  "accuracy": 0.1122,
112
  "model_family": "Qwen3-32B",
113
  "f1": 0.10405
114
  },
115
+ "Mem-alpha (32B)": {
116
  "accuracy": 0.15515,
117
  "model_family": "Qwen3-32B",
118
  "f1": 0.23735
119
  },
120
+ "Memorybank (32B)": {
121
  "accuracy": 0.16025,
122
  "model_family": "Qwen3-32B",
123
  "f1": 0.426475
124
  },
125
+ "Simple mem (32B)": {
126
  "accuracy": 0.045975,
127
  "model_family": "Qwen3-32B",
128
  "f1": 0.2284
129
  },
130
+ "Long context (32B)": {
131
  "accuracy": 0.48185,
132
  "model_family": "Qwen3-32B",
133
  "f1": 0.56
134
  }
135
  },
136
  "WEB": {
137
+ "Qwen3-Embedding-4B (32B)": {
138
  "accuracy": 0.2872,
139
  "model_family": "Qwen3-32B",
140
  "f1": 0.08535000000000001
141
  },
142
+ "GRAPHRAG (32B)": {
143
  "accuracy": 0.420675,
144
  "model_family": "Qwen3-32B",
145
  "f1": 0.268075
146
  },
147
+ "Hipporag2 (32B)": {
148
  "accuracy": 0.3761,
149
  "model_family": "Qwen3-32B",
150
  "f1": 0.120125
151
  },
152
+ "Memagent (32B)": {
153
  "accuracy": 0.263975,
154
  "model_family": "Qwen3-32B",
155
  "f1": 0.09065
156
  },
157
+ "Mem1 (32B)": {
158
  "accuracy": 0.131275,
159
  "model_family": "Qwen3-32B",
160
  "f1": 0.1518
161
  },
162
+ "Amem (32B)": {
163
  "accuracy": 0.391525,
164
  "model_family": "Qwen3-32B",
165
  "f1": 0.2294
166
  },
167
+ "Mem0 (32B)": {
168
  "accuracy": 0.2705,
169
  "model_family": "Qwen3-32B",
170
  "f1": 0.21675
171
  },
172
+ "Memorag (32B)": {
173
  "accuracy": 0.364975,
174
  "model_family": "Qwen3-32B",
175
  "f1": 0.108075
176
  },
177
+ "Memgpt (32B)": {
178
  "accuracy": 0.327975,
179
  "model_family": "Qwen3-32B",
180
  "f1": 0.07105
181
  },
182
+ "Mem-alpha (32B)": {
183
  "accuracy": 0.362925,
184
  "model_family": "Qwen3-32B",
185
  "f1": 0.15944999999999998
186
  },
187
+ "Memorybank (32B)": {
188
  "accuracy": 0.401775,
189
  "model_family": "Qwen3-32B",
190
  "f1": 0.23704999999999998
191
  },
192
+ "Simple mem (32B)": {
193
  "accuracy": 0.13974999999999999,
194
  "model_family": "Qwen3-32B",
195
  "f1": 0.1679
196
  },
197
+ "Long context (32B)": {
198
  "accuracy": 0.554275,
199
  "model_family": "Qwen3-32B",
200
  "f1": 0.348075
201
  }
202
  },
203
  "TEXT2SQL": {
204
+ "Qwen3-Embedding-4B (32B)": {
205
  "accuracy": 0.4164,
206
  "model_family": "Qwen3-32B",
207
  "f1": 0.249325
208
  },
209
+ "GRAPHRAG (32B)": {
210
  "accuracy": 0.21665,
211
  "model_family": "Qwen3-32B",
212
  "f1": 0.221675
213
  },
214
+ "Hipporag2 (32B)": {
215
  "accuracy": 0.46267499999999995,
216
  "model_family": "Qwen3-32B",
217
  "f1": 0.26935
218
  },
219
+ "Memagent (32B)": {
220
  "accuracy": 0.245375,
221
  "model_family": "Qwen3-32B",
222
  "f1": 0.245375
223
  },
224
+ "Mem1 (32B)": {
225
  "accuracy": 0.06465,
226
  "model_family": "Qwen3-32B",
227
  "f1": 0.19990000000000002
228
  },
229
+ "Amem (32B)": {
230
  "accuracy": 0.31405,
231
  "model_family": "Qwen3-32B",
232
  "f1": 0.289625
233
  },
234
+ "Mem0 (32B)": {
235
  "accuracy": 0.1192,
236
  "model_family": "Qwen3-32B",
237
  "f1": 0.2326
238
  },
239
+ "Memorag (32B)": {
240
  "accuracy": 0.619,
241
  "model_family": "Qwen3-32B",
242
  "f1": 0.296475
243
  },
244
+ "Memgpt (32B)": {
245
  "accuracy": 0.206875,
246
  "model_family": "Qwen3-32B",
247
  "f1": 0.178975
248
  },
249
+ "Mem-alpha (32B)": {
250
  "accuracy": 0.30065,
251
  "model_family": "Qwen3-32B",
252
  "f1": 0.26505
253
  },
254
+ "Memorybank (32B)": {
255
  "accuracy": 0.23855,
256
  "model_family": "Qwen3-32B",
257
  "f1": 0.28355
258
  },
259
+ "Simple mem (32B)": {
260
  "accuracy": 0.192575,
261
  "model_family": "Qwen3-32B",
262
  "f1": 0.157225
263
  },
264
+ "Long context (32B)": {
265
  "accuracy": 0.456075,
266
  "model_family": "Qwen3-32B",
267
  "f1": 0.295275
268
  }
269
  },
270
  "OPENWORLD_QA": {
271
+ "Qwen3-Embedding-4B (32B)": {
272
  "accuracy": 0.399125,
273
  "model_family": "Qwen3-32B",
274
  "f1": 0.0837
275
  },
276
+ "GRAPHRAG (32B)": {
277
  "accuracy": 0.31845,
278
  "model_family": "Qwen3-32B",
279
  "f1": 0.22635
280
  },
281
+ "Hipporag2 (32B)": {
282
  "accuracy": 0.45825,
283
  "model_family": "Qwen3-32B",
284
  "f1": 0.2362
285
  },
286
+ "Memagent (32B)": {
287
  "accuracy": 0.158225,
288
  "model_family": "Qwen3-32B",
289
  "f1": 0.0704
290
  },
291
+ "Mem1 (32B)": {
292
  "accuracy": 0.12065000000000001,
293
  "model_family": "Qwen3-32B",
294
  "f1": 0.15005
295
  },
296
+ "Amem (32B)": {
297
  "accuracy": 0.29359999999999997,
298
  "model_family": "Qwen3-32B",
299
  "f1": 0.2079
300
  },
301
+ "Mem0 (32B)": {
302
  "accuracy": 0.16197499999999998,
303
  "model_family": "Qwen3-32B",
304
  "f1": 0.1604
305
  },
306
+ "Memorag (32B)": {
307
  "accuracy": 0.411375,
308
  "model_family": "Qwen3-32B",
309
  "f1": 0.093675
310
  },
311
+ "Memgpt (32B)": {
312
  "accuracy": 0.3155,
313
  "model_family": "Qwen3-32B",
314
  "f1": 0.0595
315
  },
316
+ "Mem-alpha (32B)": {
317
  "accuracy": 0.2301,
318
  "model_family": "Qwen3-32B",
319
  "f1": 0.13345
320
  },
321
+ "Memorybank (32B)": {
322
  "accuracy": 0.3486,
323
  "model_family": "Qwen3-32B",
324
  "f1": 0.2519
325
  },
326
+ "Simple mem (32B)": {
327
  "accuracy": 0.12154999999999999,
328
  "model_family": "Qwen3-32B",
329
  "f1": 0.1312
330
  },
331
+ "Long context (32B)": {
332
  "accuracy": 0.49785,
333
  "model_family": "Qwen3-32B",
334
  "f1": 0.3349
335
  }
336
  },
337
  "SOFTWARE": {
338
+ "Qwen3-Embedding-4B (32B)": {
339
  "accuracy": 0.599025,
340
  "model_family": "Qwen3-32B",
341
  "f1": 0.083575
342
  },
343
+ "GRAPHRAG (32B)": {
344
  "accuracy": 0.348875,
345
  "model_family": "Qwen3-32B",
346
  "f1": 0.229825
347
  },
348
+ "Hipporag2 (32B)": {
349
  "accuracy": 0.5299,
350
  "model_family": "Qwen3-32B",
351
  "f1": 0.1279
352
  },
353
+ "Memagent (32B)": {
354
  "accuracy": 0.53965,
355
  "model_family": "Qwen3-32B",
356
  "f1": 0.09085
357
  },
358
+ "Mem1 (32B)": {
359
  "accuracy": 0.18595,
360
  "model_family": "Qwen3-32B",
361
  "f1": 0.17527500000000001
362
  },
363
+ "Amem (32B)": {
364
  "accuracy": 0.29615,
365
  "model_family": "Qwen3-32B",
366
  "f1": 0.20395
367
  },
368
+ "Mem0 (32B)": {
369
  "accuracy": 0.2366,
370
  "model_family": "Qwen3-32B",
371
  "f1": 0.176975
372
  },
373
+ "Memorag (32B)": {
374
  "accuracy": 0.55005,
375
  "model_family": "Qwen3-32B",
376
  "f1": 0.10707499999999999
377
  },
378
+ "Memgpt (32B)": {
379
  "accuracy": 0.599125,
380
  "model_family": "Qwen3-32B",
381
  "f1": 0.066575
382
  },
383
+ "Mem-alpha (32B)": {
384
  "accuracy": 0.3476,
385
  "model_family": "Qwen3-32B",
386
  "f1": 0.12492500000000001
387
  },
388
+ "Memorybank (32B)": {
389
  "accuracy": 0.5072,
390
  "model_family": "Qwen3-32B",
391
  "f1": 0.240875
392
  },
393
+ "Simple mem (32B)": {
394
  "accuracy": 0.2431,
395
  "model_family": "Qwen3-32B",
396
  "f1": 0.2005
397
  },
398
+ "Long context (32B)": {
399
  "accuracy": 0.4847,
400
  "model_family": "Qwen3-32B",
401
  "f1": 0.267725
scorer.py CHANGED
@@ -1,55 +1,31 @@
1
  """
2
  Scoring functions for AMA-Bench submissions.
3
 
4
- This module implements evaluation logic for multiple-choice questions,
5
- calculating accuracy by comparing uppercase letters in answers.
6
  """
7
 
8
  import re
9
  from typing import Union, List, Dict
10
 
11
 
12
- def extract_uppercase_letters(text: str) -> str:
13
- """
14
- Extract all uppercase letters from text.
15
-
16
- Used for multiple-choice answer comparison where answers are like
17
- "A", "B", "AB", "ACD", etc.
18
-
19
- Args:
20
- text: Input text containing answer choices
21
-
22
- Returns:
23
- String of uppercase letters only, sorted alphabetically
24
- """
25
- if not isinstance(text, str):
26
- text = str(text)
27
-
28
- # Extract all uppercase letters
29
- letters = [c for c in text if c.isupper() and c.isalpha()]
30
-
31
- # Sort and join to ensure consistent ordering
32
- return ''.join(sorted(set(letters)))
33
 
34
 
35
- def multiple_choice_accuracy(prediction: str, reference: str) -> float:
36
  """
37
- Calculate accuracy for multiple-choice answers.
38
-
39
- Compares uppercase letters extracted from both prediction and reference.
40
- Returns 1.0 if they match exactly, 0.0 otherwise.
41
 
42
  Args:
43
- prediction: Model's predicted answer
44
- reference: Ground truth reference answer
45
 
46
  Returns:
47
- 1.0 if exact match, 0.0 otherwise
48
  """
49
- pred_letters = extract_uppercase_letters(prediction)
50
- ref_letters = extract_uppercase_letters(reference)
51
-
52
- return 1.0 if pred_letters == ref_letters else 0.0
53
 
54
 
55
  def calculate_accuracy(scores: List[float]) -> Dict[str, float]:
@@ -63,7 +39,7 @@ def calculate_accuracy(scores: List[float]) -> Dict[str, float]:
63
  Dictionary with accuracy metric
64
  """
65
  if not scores:
66
- return {"accuracy": 0.0, "count": 0}
67
 
68
  import numpy as np
69
 
@@ -97,6 +73,10 @@ def score_submission(
97
  "Causal": "Causal Inference",
98
  "State": "State Updating",
99
  "Abstraction": "State Abstraction",
 
 
 
 
100
  }
101
 
102
  # Initialize scores by metric
@@ -128,8 +108,8 @@ def score_submission(
128
  reference = gt_info["answer"]
129
  qa_type = gt_info.get("type", "Recall")
130
 
131
- # Calculate accuracy
132
- score = multiple_choice_accuracy(answer, reference)
133
 
134
  # Map question type to metric category
135
  metric_category = "Recall" # default
@@ -150,6 +130,7 @@ def score_submission(
150
  "score": score,
151
  "reference_answer": reference,
152
  "metric_category": metric_category,
 
153
  })
154
 
155
  # Calculate metrics for each category
@@ -163,4 +144,4 @@ def score_submission(
163
  return {
164
  "scores": results,
165
  "scored_submissions": scored_submissions,
166
- }
 
1
  """
2
  Scoring functions for AMA-Bench submissions.
3
 
4
+ This module implements evaluation logic for string answers,
5
+ calculating accuracy by exact string match (case-insensitive).
6
  """
7
 
8
  import re
9
  from typing import Union, List, Dict
10
 
11
 
12
+ def normalize_answer(text: str) -> str:
13
+ """Normalize answer string for comparison (lowercase, strip whitespace)."""
14
+ return str(text).strip().lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
 
17
+ def string_exact_match(prediction: str, reference: str) -> float:
18
  """
19
+ Calculate accuracy for string answers using exact match.
 
 
 
20
 
21
  Args:
22
+ prediction: Model's predicted answer string
23
+ reference: Ground truth reference answer string
24
 
25
  Returns:
26
+ 1.0 if normalized strings match exactly, 0.0 otherwise
27
  """
28
+ return 1.0 if normalize_answer(prediction) == normalize_answer(reference) else 0.0
 
 
 
29
 
30
 
31
  def calculate_accuracy(scores: List[float]) -> Dict[str, float]:
 
39
  Dictionary with accuracy metric
40
  """
41
  if not scores:
42
+ return {"accuracy": 0.0, "count": 0, "correct": 0}
43
 
44
  import numpy as np
45
 
 
73
  "Causal": "Causal Inference",
74
  "State": "State Updating",
75
  "Abstraction": "State Abstraction",
76
+ "A": "Recall",
77
+ "B": "Causal Inference",
78
+ "C": "State Updating",
79
+ "D": "State Abstraction",
80
  }
81
 
82
  # Initialize scores by metric
 
108
  reference = gt_info["answer"]
109
  qa_type = gt_info.get("type", "Recall")
110
 
111
+ # Calculate accuracy via exact string match
112
+ score = string_exact_match(answer, reference)
113
 
114
  # Map question type to metric category
115
  metric_category = "Recall" # default
 
130
  "score": score,
131
  "reference_answer": reference,
132
  "metric_category": metric_category,
133
+ "domain": gt_info.get("domain", "") if gt_info else "",
134
  })
135
 
136
  # Calculate metrics for each category
 
144
  return {
145
  "scores": results,
146
  "scored_submissions": scored_submissions,
147
+ }
utils.py CHANGED
@@ -107,6 +107,7 @@ def load_groundtruth(dataset_name: str, token: str = None) -> Dict[str, str]:
107
 
108
  for row in dataset:
109
  episode_id = row.get("episode_id", "")
 
110
  qa_pairs = row.get("qa_pairs", [])
111
 
112
  for qa in qa_pairs:
@@ -119,7 +120,8 @@ def load_groundtruth(dataset_name: str, token: str = None) -> Dict[str, str]:
119
  groundtruth[key] = {
120
  "answer": answer,
121
  "type": qa_type,
122
- "sub_type": qa.get("sub_type", "")
 
123
  }
124
 
125
  except Exception as hf_error:
@@ -128,7 +130,7 @@ def load_groundtruth(dataset_name: str, token: str = None) -> Dict[str, str]:
128
 
129
  # Fallback to local file
130
  import json
131
- local_path = "test/test.jsonl"
132
 
133
  try:
134
  with open(local_path, 'r', encoding='utf-8') as f:
@@ -139,6 +141,7 @@ def load_groundtruth(dataset_name: str, token: str = None) -> Dict[str, str]:
139
 
140
  data = json.loads(line)
141
  episode_id = data.get("episode_id", "")
 
142
  qa_pairs = data.get("qa_pairs", [])
143
 
144
  for qa in qa_pairs:
@@ -151,7 +154,8 @@ def load_groundtruth(dataset_name: str, token: str = None) -> Dict[str, str]:
151
  groundtruth[key] = {
152
  "answer": answer,
153
  "type": qa_type,
154
- "sub_type": qa.get("sub_type", "")
 
155
  }
156
 
157
  print(f"Loaded from local file: {local_path}")
@@ -221,4 +225,4 @@ def validate_submission_file(file_path: str) -> tuple:
221
  except FileNotFoundError:
222
  return False, "File not found.", []
223
  except Exception as e:
224
- return False, f"Error reading file: {str(e)}", []
 
107
 
108
  for row in dataset:
109
  episode_id = row.get("episode_id", "")
110
+ domain = row.get("domain", "")
111
  qa_pairs = row.get("qa_pairs", [])
112
 
113
  for qa in qa_pairs:
 
120
  groundtruth[key] = {
121
  "answer": answer,
122
  "type": qa_type,
123
+ "sub_type": qa.get("sub_type", ""),
124
+ "domain": domain,
125
  }
126
 
127
  except Exception as hf_error:
 
130
 
131
  # Fallback to local file
132
  import json
133
+ local_path = "test/open_end_qa_set.jsonl"
134
 
135
  try:
136
  with open(local_path, 'r', encoding='utf-8') as f:
 
141
 
142
  data = json.loads(line)
143
  episode_id = data.get("episode_id", "")
144
+ domain = data.get("domain", "")
145
  qa_pairs = data.get("qa_pairs", [])
146
 
147
  for qa in qa_pairs:
 
154
  groundtruth[key] = {
155
  "answer": answer,
156
  "type": qa_type,
157
+ "sub_type": qa.get("sub_type", ""),
158
+ "domain": domain,
159
  }
160
 
161
  print(f"Loaded from local file: {local_path}")
 
225
  except FileNotFoundError:
226
  return False, "File not found.", []
227
  except Exception as e:
228
+ return False, f"Error reading file: {str(e)}", []