Spaces:
Running
Running
add submit function
Browse files- .gitignore +3 -1
- app.py +144 -58
- data/agent_capability.json +17 -2
- data/model_capability.json +17 -2
- data/model_domain.json +78 -78
- scorer.py +20 -39
- utils.py +8 -4
.gitignore
CHANGED
|
@@ -1,2 +1,4 @@
|
|
| 1 |
__pycache__
|
| 2 |
-
*.DS_Store
|
|
|
|
|
|
|
|
|
| 1 |
__pycache__
|
| 2 |
+
*.DS_Store
|
| 3 |
+
submissions/
|
| 4 |
+
test/
|
app.py
CHANGED
|
@@ -17,7 +17,7 @@ except ImportError:
|
|
| 17 |
def format_log(msg): return f"✅ {msg}"
|
| 18 |
|
| 19 |
try:
|
| 20 |
-
from scorer import score_submission
|
| 21 |
except ImportError:
|
| 22 |
score_submission = None
|
| 23 |
extract_uppercase_letters = None
|
|
@@ -165,36 +165,28 @@ COLORS = [
|
|
| 165 |
# ---------------------------------------------------------------------------
|
| 166 |
|
| 167 |
def calculate_f1_score(predictions, references):
|
| 168 |
-
"""Calculate F1 score for
|
| 169 |
if not predictions or not references:
|
| 170 |
return 0.0
|
| 171 |
|
| 172 |
-
if extract_uppercase_letters is None:
|
| 173 |
-
# Fallback implementation
|
| 174 |
-
def extract_letters(text):
|
| 175 |
-
return ''.join(sorted(set(c for c in str(text) if c.isupper() and c.isalpha())))
|
| 176 |
-
extract_fn = extract_letters
|
| 177 |
-
else:
|
| 178 |
-
extract_fn = extract_uppercase_letters
|
| 179 |
-
|
| 180 |
total_precision = 0.0
|
| 181 |
total_recall = 0.0
|
| 182 |
count = 0
|
| 183 |
|
| 184 |
for pred, ref in zip(predictions, references):
|
| 185 |
-
|
| 186 |
-
|
| 187 |
|
| 188 |
-
if not
|
| 189 |
total_precision += 1.0
|
| 190 |
total_recall += 1.0
|
| 191 |
count += 1
|
| 192 |
-
elif not
|
| 193 |
count += 1
|
| 194 |
else:
|
| 195 |
-
intersection = len(
|
| 196 |
-
precision = intersection / len(
|
| 197 |
-
recall = intersection / len(
|
| 198 |
total_precision += precision
|
| 199 |
total_recall += recall
|
| 200 |
count += 1
|
|
@@ -208,54 +200,96 @@ def calculate_f1_score(predictions, references):
|
|
| 208 |
if avg_precision + avg_recall == 0:
|
| 209 |
return 0.0
|
| 210 |
|
| 211 |
-
|
| 212 |
-
return f1
|
| 213 |
|
| 214 |
|
| 215 |
def update_json_with_submission(model_name, scores_by_metric, scored_submissions, is_agent=False, model_family=""):
|
| 216 |
-
"""Update JSON files with new submission data."""
|
| 217 |
try:
|
| 218 |
if is_agent:
|
| 219 |
capability_file = "data/agent_capability.json"
|
| 220 |
-
domain_file
|
| 221 |
else:
|
| 222 |
capability_file = "data/model_capability.json"
|
| 223 |
-
domain_file
|
| 224 |
|
| 225 |
-
#
|
| 226 |
with open(capability_file, 'r', encoding='utf-8') as f:
|
| 227 |
capability_data = json.load(f)
|
| 228 |
|
| 229 |
-
# Update capability data
|
| 230 |
for capability in METRICS:
|
| 231 |
-
if capability in scores_by_metric
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
# Get submissions for this capability
|
| 235 |
-
capability_submissions = [
|
| 236 |
-
s for s in scored_submissions
|
| 237 |
-
if s.get('metric_category') == capability
|
| 238 |
-
]
|
| 239 |
-
|
| 240 |
-
# Calculate F1
|
| 241 |
-
if capability_submissions:
|
| 242 |
-
predictions = [s.get('answer', '') for s in capability_submissions]
|
| 243 |
-
references = [s.get('reference_answer', '') for s in capability_submissions]
|
| 244 |
-
f1 = calculate_f1_score(predictions, references)
|
| 245 |
-
else:
|
| 246 |
-
f1 = 0.0
|
| 247 |
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
-
# Save updated data
|
| 255 |
with open(capability_file, 'w', encoding='utf-8') as f:
|
| 256 |
json.dump(capability_data, f, indent=2, ensure_ascii=False)
|
| 257 |
-
|
| 258 |
print(f"✓ Updated {capability_file}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
return True
|
| 260 |
|
| 261 |
except Exception as e:
|
|
@@ -315,7 +349,7 @@ def add_new_submission(model, submission_type, url, file, organisation, mail, mo
|
|
| 315 |
print(f"✓ Overall accuracy: {average_accuracy:.4f}")
|
| 316 |
for metric_name, metric_data in scores_by_metric.items():
|
| 317 |
if metric_name != "Average":
|
| 318 |
-
print(f" {metric_name}: {metric_data['accuracy']:.4f} ({metric_data
|
| 319 |
|
| 320 |
# Save locally
|
| 321 |
submission_dir = f"submissions/{organisation}_{model}"
|
|
@@ -340,7 +374,7 @@ def add_new_submission(model, submission_type, url, file, organisation, mail, mo
|
|
| 340 |
metric_name: {
|
| 341 |
"accuracy": float(metric_data["accuracy"]),
|
| 342 |
"count": int(metric_data["count"]),
|
| 343 |
-
"correct": int(metric_data
|
| 344 |
}
|
| 345 |
for metric_name, metric_data in scores_by_metric.items()
|
| 346 |
}
|
|
@@ -361,24 +395,73 @@ def add_new_submission(model, submission_type, url, file, organisation, mail, mo
|
|
| 361 |
if update_success:
|
| 362 |
print("✓ Updated leaderboard JSON files")
|
| 363 |
# Reload data
|
| 364 |
-
global AGENT_CAPABILITY, AGENT_DOMAIN, MODEL_CAPABILITY, MODEL_DOMAIN
|
| 365 |
if is_agent:
|
| 366 |
AGENT_CAPABILITY = load_json_data("data/agent_capability.json")
|
| 367 |
AGENT_DOMAIN = load_json_data("data/agent_domain.json")
|
| 368 |
else:
|
| 369 |
MODEL_CAPABILITY = load_json_data("data/model_capability.json")
|
| 370 |
MODEL_DOMAIN = load_json_data("data/model_domain.json")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
|
| 372 |
# Format message
|
| 373 |
message = f"✅ **Submission successful!**\n\n"
|
| 374 |
message += f"**{'Agent' if is_agent else 'Model'}:** {model}\n"
|
| 375 |
message += f"**Organisation:** {organisation}\n"
|
| 376 |
message += f"**Overall Accuracy:** {average_accuracy:.4f}\n\n"
|
|
|
|
| 377 |
message += "**Scores by Capability:**\n"
|
| 378 |
for metric_name in METRICS:
|
| 379 |
if metric_name in scores_by_metric:
|
| 380 |
-
|
| 381 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 382 |
|
| 383 |
message += f"\n**Submission ID:** {timestamp}\n"
|
| 384 |
if update_success:
|
|
@@ -1014,16 +1097,19 @@ def build_app():
|
|
| 1014 |
gr.Markdown("""
|
| 1015 |
**Submission Format:**
|
| 1016 |
|
| 1017 |
-
Your JSONL file should contain one
|
| 1018 |
```json
|
| 1019 |
-
{"episode_id": "ep_001", "question": "What is X?", "answer": "
|
| 1020 |
-
{"episode_id": "
|
|
|
|
| 1021 |
```
|
| 1022 |
|
| 1023 |
**Required fields:**
|
| 1024 |
- `episode_id`: Episode identifier
|
| 1025 |
-
- `question`:
|
| 1026 |
-
- `answer`: Your model's
|
|
|
|
|
|
|
| 1027 |
""")
|
| 1028 |
|
| 1029 |
with gr.Row():
|
|
@@ -1101,4 +1187,4 @@ Results are reported as **Accuracy** and **F1 Score**:
|
|
| 1101 |
|
| 1102 |
if __name__ == "__main__":
|
| 1103 |
demo_app = build_app()
|
| 1104 |
-
demo_app.launch(debug=True, show_error=True)
|
|
|
|
| 17 |
def format_log(msg): return f"✅ {msg}"
|
| 18 |
|
| 19 |
try:
|
| 20 |
+
from scorer import score_submission
|
| 21 |
except ImportError:
|
| 22 |
score_submission = None
|
| 23 |
extract_uppercase_letters = None
|
|
|
|
| 165 |
# ---------------------------------------------------------------------------
|
| 166 |
|
| 167 |
def calculate_f1_score(predictions, references):
|
| 168 |
+
"""Calculate token-level F1 score for string answers."""
|
| 169 |
if not predictions or not references:
|
| 170 |
return 0.0
|
| 171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
total_precision = 0.0
|
| 173 |
total_recall = 0.0
|
| 174 |
count = 0
|
| 175 |
|
| 176 |
for pred, ref in zip(predictions, references):
|
| 177 |
+
pred_tokens = set(str(pred).strip().lower().split())
|
| 178 |
+
ref_tokens = set(str(ref).strip().lower().split())
|
| 179 |
|
| 180 |
+
if not pred_tokens and not ref_tokens:
|
| 181 |
total_precision += 1.0
|
| 182 |
total_recall += 1.0
|
| 183 |
count += 1
|
| 184 |
+
elif not pred_tokens or not ref_tokens:
|
| 185 |
count += 1
|
| 186 |
else:
|
| 187 |
+
intersection = len(pred_tokens & ref_tokens)
|
| 188 |
+
precision = intersection / len(pred_tokens)
|
| 189 |
+
recall = intersection / len(ref_tokens)
|
| 190 |
total_precision += precision
|
| 191 |
total_recall += recall
|
| 192 |
count += 1
|
|
|
|
| 200 |
if avg_precision + avg_recall == 0:
|
| 201 |
return 0.0
|
| 202 |
|
| 203 |
+
return 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall)
|
|
|
|
| 204 |
|
| 205 |
|
| 206 |
def update_json_with_submission(model_name, scores_by_metric, scored_submissions, is_agent=False, model_family=""):
|
| 207 |
+
"""Update capability and domain JSON files with new submission data."""
|
| 208 |
try:
|
| 209 |
if is_agent:
|
| 210 |
capability_file = "data/agent_capability.json"
|
| 211 |
+
domain_file = "data/agent_domain.json"
|
| 212 |
else:
|
| 213 |
capability_file = "data/model_capability.json"
|
| 214 |
+
domain_file = "data/model_domain.json"
|
| 215 |
|
| 216 |
+
# ── 1. Update capability file ────────────────────────────────────
|
| 217 |
with open(capability_file, 'r', encoding='utf-8') as f:
|
| 218 |
capability_data = json.load(f)
|
| 219 |
|
|
|
|
| 220 |
for capability in METRICS:
|
| 221 |
+
if capability not in scores_by_metric or capability not in capability_data:
|
| 222 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
+
metric_data = scores_by_metric[capability]
|
| 225 |
+
|
| 226 |
+
# submissions belonging to this capability
|
| 227 |
+
cap_subs = [s for s in scored_submissions if s.get('metric_category') == capability]
|
| 228 |
+
|
| 229 |
+
# F1 calculated from this capability's predictions vs references
|
| 230 |
+
if cap_subs:
|
| 231 |
+
predictions = [s.get('answer', '') for s in cap_subs]
|
| 232 |
+
references = [s.get('reference_answer', '') for s in cap_subs]
|
| 233 |
+
f1 = calculate_f1_score(predictions, references)
|
| 234 |
+
else:
|
| 235 |
+
f1 = 0.0
|
| 236 |
+
|
| 237 |
+
capability_data[capability][model_name] = {
|
| 238 |
+
"accuracy": metric_data['accuracy'],
|
| 239 |
+
"model_family": model_family,
|
| 240 |
+
"f1": f1,
|
| 241 |
+
}
|
| 242 |
|
|
|
|
| 243 |
with open(capability_file, 'w', encoding='utf-8') as f:
|
| 244 |
json.dump(capability_data, f, indent=2, ensure_ascii=False)
|
|
|
|
| 245 |
print(f"✓ Updated {capability_file}")
|
| 246 |
+
|
| 247 |
+
# ── 2. Update domain file ────────────────────────────────────────
|
| 248 |
+
with open(domain_file, 'r', encoding='utf-8') as f:
|
| 249 |
+
domain_data = json.load(f)
|
| 250 |
+
|
| 251 |
+
# Group scored_submissions by domain
|
| 252 |
+
from collections import defaultdict
|
| 253 |
+
domain_groups = defaultdict(list)
|
| 254 |
+
for s in scored_submissions:
|
| 255 |
+
dom = s.get('domain', '').strip().upper()
|
| 256 |
+
if dom:
|
| 257 |
+
domain_groups[dom].append(s)
|
| 258 |
+
|
| 259 |
+
# Known domain keys in the JSON (may differ in capitalisation/alias)
|
| 260 |
+
DOMAIN_KEY_MAP = {
|
| 261 |
+
"GAMING": "GAMING",
|
| 262 |
+
"GAME": "GAMING",
|
| 263 |
+
"EMBODIED_AI": "EMBODIED_AI",
|
| 264 |
+
"WEB": "WEB",
|
| 265 |
+
"TEXT2SQL": "TEXT2SQL",
|
| 266 |
+
"OPENWORLD_QA": "OPENWORLD_QA",
|
| 267 |
+
"SOFTWARE": "SOFTWARE",
|
| 268 |
+
"SOFTWARE_ENGINEER": "SOFTWARE",
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
for raw_domain, subs in domain_groups.items():
|
| 272 |
+
json_domain = DOMAIN_KEY_MAP.get(raw_domain)
|
| 273 |
+
if json_domain is None or json_domain not in domain_data:
|
| 274 |
+
continue # unknown domain, skip
|
| 275 |
+
|
| 276 |
+
scores = [s.get('score', 0.0) for s in subs]
|
| 277 |
+
accuracy = float(np.mean(scores)) if scores else 0.0
|
| 278 |
+
|
| 279 |
+
predictions = [s.get('answer', '') for s in subs]
|
| 280 |
+
references = [s.get('reference_answer', '') for s in subs]
|
| 281 |
+
f1 = calculate_f1_score(predictions, references)
|
| 282 |
+
|
| 283 |
+
domain_data[json_domain][model_name] = {
|
| 284 |
+
"accuracy": accuracy,
|
| 285 |
+
"model_family": model_family,
|
| 286 |
+
"f1": f1,
|
| 287 |
+
}
|
| 288 |
+
|
| 289 |
+
with open(domain_file, 'w', encoding='utf-8') as f:
|
| 290 |
+
json.dump(domain_data, f, indent=2, ensure_ascii=False)
|
| 291 |
+
print(f"✓ Updated {domain_file}")
|
| 292 |
+
|
| 293 |
return True
|
| 294 |
|
| 295 |
except Exception as e:
|
|
|
|
| 349 |
print(f"✓ Overall accuracy: {average_accuracy:.4f}")
|
| 350 |
for metric_name, metric_data in scores_by_metric.items():
|
| 351 |
if metric_name != "Average":
|
| 352 |
+
print(f" {metric_name}: {metric_data['accuracy']:.4f} ({metric_data.get('correct', 0)}/{metric_data['count']})")
|
| 353 |
|
| 354 |
# Save locally
|
| 355 |
submission_dir = f"submissions/{organisation}_{model}"
|
|
|
|
| 374 |
metric_name: {
|
| 375 |
"accuracy": float(metric_data["accuracy"]),
|
| 376 |
"count": int(metric_data["count"]),
|
| 377 |
+
"correct": int(metric_data.get("correct", 0))
|
| 378 |
}
|
| 379 |
for metric_name, metric_data in scores_by_metric.items()
|
| 380 |
}
|
|
|
|
| 395 |
if update_success:
|
| 396 |
print("✓ Updated leaderboard JSON files")
|
| 397 |
# Reload data
|
| 398 |
+
global AGENT_CAPABILITY, AGENT_DOMAIN, MODEL_CAPABILITY, MODEL_DOMAIN, model_domain_filtered
|
| 399 |
if is_agent:
|
| 400 |
AGENT_CAPABILITY = load_json_data("data/agent_capability.json")
|
| 401 |
AGENT_DOMAIN = load_json_data("data/agent_domain.json")
|
| 402 |
else:
|
| 403 |
MODEL_CAPABILITY = load_json_data("data/model_capability.json")
|
| 404 |
MODEL_DOMAIN = load_json_data("data/model_domain.json")
|
| 405 |
+
# Recompute filtered model domain view
|
| 406 |
+
_model_items = set()
|
| 407 |
+
for _cap_data in MODEL_CAPABILITY.values():
|
| 408 |
+
_model_items.update(_cap_data.keys())
|
| 409 |
+
model_domain_filtered = filter_data_by_items(MODEL_DOMAIN, _model_items)
|
| 410 |
+
if not any(len(v) > 0 for v in model_domain_filtered.values()):
|
| 411 |
+
model_domain_filtered = {}
|
| 412 |
+
|
| 413 |
+
# Compute per-domain scores from scored_submissions
|
| 414 |
+
from collections import defaultdict
|
| 415 |
+
domain_groups = defaultdict(list)
|
| 416 |
+
for s in scored_submissions:
|
| 417 |
+
dom = s.get("domain", "").strip().upper()
|
| 418 |
+
if dom:
|
| 419 |
+
domain_groups[dom].append(s)
|
| 420 |
+
|
| 421 |
+
domain_scores = {}
|
| 422 |
+
for dom, subs in sorted(domain_groups.items()):
|
| 423 |
+
scores_list = [s.get("score", 0.0) for s in subs]
|
| 424 |
+
preds = [s.get("answer", "") for s in subs]
|
| 425 |
+
refs = [s.get("reference_answer", "") for s in subs]
|
| 426 |
+
domain_scores[dom] = {
|
| 427 |
+
"accuracy": float(np.mean(scores_list)) if scores_list else 0.0,
|
| 428 |
+
"f1": calculate_f1_score(preds, refs),
|
| 429 |
+
"correct": int(sum(scores_list)),
|
| 430 |
+
"count": len(scores_list),
|
| 431 |
+
}
|
| 432 |
|
| 433 |
# Format message
|
| 434 |
message = f"✅ **Submission successful!**\n\n"
|
| 435 |
message += f"**{'Agent' if is_agent else 'Model'}:** {model}\n"
|
| 436 |
message += f"**Organisation:** {organisation}\n"
|
| 437 |
message += f"**Overall Accuracy:** {average_accuracy:.4f}\n\n"
|
| 438 |
+
|
| 439 |
message += "**Scores by Capability:**\n"
|
| 440 |
for metric_name in METRICS:
|
| 441 |
if metric_name in scores_by_metric:
|
| 442 |
+
md = scores_by_metric[metric_name]
|
| 443 |
+
# compute F1 for this capability
|
| 444 |
+
cap_subs = [s for s in scored_submissions if s.get("metric_category") == metric_name]
|
| 445 |
+
if cap_subs:
|
| 446 |
+
preds = [s.get("answer", "") for s in cap_subs]
|
| 447 |
+
refs = [s.get("reference_answer", "") for s in cap_subs]
|
| 448 |
+
cap_f1 = calculate_f1_score(preds, refs)
|
| 449 |
+
else:
|
| 450 |
+
cap_f1 = 0.0
|
| 451 |
+
message += (
|
| 452 |
+
f"- **{metric_name}:** Accuracy {md['accuracy']:.4f}"
|
| 453 |
+
f" ({md.get('correct', 0)}/{md['count']})"
|
| 454 |
+
f", F1 {cap_f1:.4f}\n"
|
| 455 |
+
)
|
| 456 |
+
|
| 457 |
+
if domain_scores:
|
| 458 |
+
message += "\n**Scores by Domain:**\n"
|
| 459 |
+
for dom, ds in domain_scores.items():
|
| 460 |
+
message += (
|
| 461 |
+
f"- **{dom}:** Accuracy {ds['accuracy']:.4f}"
|
| 462 |
+
f" ({ds['correct']}/{ds['count']})"
|
| 463 |
+
f", F1 {ds['f1']:.4f}\n"
|
| 464 |
+
)
|
| 465 |
|
| 466 |
message += f"\n**Submission ID:** {timestamp}\n"
|
| 467 |
if update_success:
|
|
|
|
| 1097 |
gr.Markdown("""
|
| 1098 |
**Submission Format:**
|
| 1099 |
|
| 1100 |
+
Your JSONL file should contain one question-answer pair per line:
|
| 1101 |
```json
|
| 1102 |
+
{"episode_id": "ep_001", "question": "What is X?", "answer": "your answer"}
|
| 1103 |
+
{"episode_id": "ep_001", "question": "What happened next?", "answer": "another answer"}
|
| 1104 |
+
{"episode_id": "ep_002", "question": "What is the goal?", "answer": "yet another answer"}
|
| 1105 |
```
|
| 1106 |
|
| 1107 |
**Required fields:**
|
| 1108 |
- `episode_id`: Episode identifier
|
| 1109 |
+
- `question`: Question text (must match exactly the question in the dataset)
|
| 1110 |
+
- `answer`: Your model's predicted free-form string answer
|
| 1111 |
+
|
| 1112 |
+
Each `episode_id` + `question` pair must be unique. Answers are evaluated by case-insensitive exact string match.
|
| 1113 |
""")
|
| 1114 |
|
| 1115 |
with gr.Row():
|
|
|
|
| 1187 |
|
| 1188 |
if __name__ == "__main__":
|
| 1189 |
demo_app = build_app()
|
| 1190 |
+
demo_app.launch(debug=True, show_error=True)
|
data/agent_capability.json
CHANGED
|
@@ -66,7 +66,7 @@
|
|
| 66 |
"f1": 0.4152833333333333
|
| 67 |
}
|
| 68 |
},
|
| 69 |
-
"
|
| 70 |
"Qwen3-Embedding-4B": {
|
| 71 |
"accuracy": 0.48618333333333336,
|
| 72 |
"model_family": "Qwen3-32B",
|
|
@@ -131,6 +131,11 @@
|
|
| 131 |
"accuracy": 0.5399999999999999,
|
| 132 |
"model_family": "Qwen3-32B",
|
| 133 |
"f1": 0.34326666666666666
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
}
|
| 135 |
},
|
| 136 |
"State Updating": {
|
|
@@ -198,9 +203,14 @@
|
|
| 198 |
"accuracy": 0.48335,
|
| 199 |
"model_family": "Qwen3-32B",
|
| 200 |
"f1": 0.3447166666666666
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
}
|
| 202 |
},
|
| 203 |
-
"State
|
| 204 |
"Qwen3-Embedding-4B": {
|
| 205 |
"accuracy": 0.3022666666666667,
|
| 206 |
"model_family": "Qwen3-32B",
|
|
@@ -265,6 +275,11 @@
|
|
| 265 |
"accuracy": 0.37979999999999997,
|
| 266 |
"model_family": "Qwen3-32B",
|
| 267 |
"f1": 0.3152333333333333
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
}
|
| 269 |
}
|
| 270 |
}
|
|
|
|
| 66 |
"f1": 0.4152833333333333
|
| 67 |
}
|
| 68 |
},
|
| 69 |
+
"Causal Inference": {
|
| 70 |
"Qwen3-Embedding-4B": {
|
| 71 |
"accuracy": 0.48618333333333336,
|
| 72 |
"model_family": "Qwen3-32B",
|
|
|
|
| 131 |
"accuracy": 0.5399999999999999,
|
| 132 |
"model_family": "Qwen3-32B",
|
| 133 |
"f1": 0.34326666666666666
|
| 134 |
+
},
|
| 135 |
+
"1": {
|
| 136 |
+
"accuracy": 0.0,
|
| 137 |
+
"model_family": "1",
|
| 138 |
+
"f1": 0.0
|
| 139 |
}
|
| 140 |
},
|
| 141 |
"State Updating": {
|
|
|
|
| 203 |
"accuracy": 0.48335,
|
| 204 |
"model_family": "Qwen3-32B",
|
| 205 |
"f1": 0.3447166666666666
|
| 206 |
+
},
|
| 207 |
+
"1": {
|
| 208 |
+
"accuracy": 1.0,
|
| 209 |
+
"model_family": "1",
|
| 210 |
+
"f1": 1.0
|
| 211 |
}
|
| 212 |
},
|
| 213 |
+
"State Abstraction": {
|
| 214 |
"Qwen3-Embedding-4B": {
|
| 215 |
"accuracy": 0.3022666666666667,
|
| 216 |
"model_family": "Qwen3-32B",
|
|
|
|
| 275 |
"accuracy": 0.37979999999999997,
|
| 276 |
"model_family": "Qwen3-32B",
|
| 277 |
"f1": 0.3152333333333333
|
| 278 |
+
},
|
| 279 |
+
"1": {
|
| 280 |
+
"accuracy": 0.0,
|
| 281 |
+
"model_family": "1",
|
| 282 |
+
"f1": 0.0
|
| 283 |
}
|
| 284 |
}
|
| 285 |
}
|
data/model_capability.json
CHANGED
|
@@ -145,7 +145,7 @@
|
|
| 145 |
"f1": 0.3065
|
| 146 |
}
|
| 147 |
},
|
| 148 |
-
"
|
| 149 |
"Claude Haiku 3.5": {
|
| 150 |
"accuracy": 0.4799333333333333,
|
| 151 |
"f1": 0.29278333333333334
|
|
@@ -289,6 +289,11 @@
|
|
| 289 |
"AMA-agent (Ours) (8B)": {
|
| 290 |
"accuracy": 0.4806166666666667,
|
| 291 |
"f1": 0.23224999999999998
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
}
|
| 293 |
},
|
| 294 |
"State Updating": {
|
|
@@ -435,9 +440,14 @@
|
|
| 435 |
"AMA-agent (Ours) (8B)": {
|
| 436 |
"accuracy": 0.43645,
|
| 437 |
"f1": 0.21893333333333334
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
}
|
| 439 |
},
|
| 440 |
-
"State
|
| 441 |
"Claude Haiku 3.5": {
|
| 442 |
"accuracy": 0.32758333333333334,
|
| 443 |
"f1": 0.2684166666666667
|
|
@@ -581,6 +591,11 @@
|
|
| 581 |
"AMA-agent (Ours) (8B)": {
|
| 582 |
"accuracy": 0.37873333333333337,
|
| 583 |
"f1": 0.21493333333333334
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
}
|
| 585 |
}
|
| 586 |
}
|
|
|
|
| 145 |
"f1": 0.3065
|
| 146 |
}
|
| 147 |
},
|
| 148 |
+
"Causal Inference": {
|
| 149 |
"Claude Haiku 3.5": {
|
| 150 |
"accuracy": 0.4799333333333333,
|
| 151 |
"f1": 0.29278333333333334
|
|
|
|
| 289 |
"AMA-agent (Ours) (8B)": {
|
| 290 |
"accuracy": 0.4806166666666667,
|
| 291 |
"f1": 0.23224999999999998
|
| 292 |
+
},
|
| 293 |
+
"1": {
|
| 294 |
+
"accuracy": 0.0,
|
| 295 |
+
"model_family": "1",
|
| 296 |
+
"f1": 0.0
|
| 297 |
}
|
| 298 |
},
|
| 299 |
"State Updating": {
|
|
|
|
| 440 |
"AMA-agent (Ours) (8B)": {
|
| 441 |
"accuracy": 0.43645,
|
| 442 |
"f1": 0.21893333333333334
|
| 443 |
+
},
|
| 444 |
+
"1": {
|
| 445 |
+
"accuracy": 1.0,
|
| 446 |
+
"model_family": "1",
|
| 447 |
+
"f1": 1.0
|
| 448 |
}
|
| 449 |
},
|
| 450 |
+
"State Abstraction": {
|
| 451 |
"Claude Haiku 3.5": {
|
| 452 |
"accuracy": 0.32758333333333334,
|
| 453 |
"f1": 0.2684166666666667
|
|
|
|
| 591 |
"AMA-agent (Ours) (8B)": {
|
| 592 |
"accuracy": 0.37873333333333337,
|
| 593 |
"f1": 0.21493333333333334
|
| 594 |
+
},
|
| 595 |
+
"1": {
|
| 596 |
+
"accuracy": 0.0,
|
| 597 |
+
"model_family": "1",
|
| 598 |
+
"f1": 0.0
|
| 599 |
}
|
| 600 |
}
|
| 601 |
}
|
data/model_domain.json
CHANGED
|
@@ -1,401 +1,401 @@
|
|
| 1 |
{
|
| 2 |
"GAMING": {
|
| 3 |
-
"Qwen3-Embedding-4B": {
|
| 4 |
"accuracy": 0.5157,
|
| 5 |
"model_family": "Qwen3-32B",
|
| 6 |
"f1": 0.2195
|
| 7 |
},
|
| 8 |
-
"GRAPHRAG": {
|
| 9 |
"accuracy": 0.5595249999999999,
|
| 10 |
"model_family": "Qwen3-32B",
|
| 11 |
"f1": 0.288175
|
| 12 |
},
|
| 13 |
-
"Hipporag2": {
|
| 14 |
"accuracy": 0.60555,
|
| 15 |
"model_family": "Qwen3-32B",
|
| 16 |
"f1": 0.2273
|
| 17 |
},
|
| 18 |
-
"Memagent": {
|
| 19 |
"accuracy": 0.31775,
|
| 20 |
"model_family": "Qwen3-32B",
|
| 21 |
"f1": 0.22945
|
| 22 |
},
|
| 23 |
-
"Mem1": {
|
| 24 |
"accuracy": 0.225875,
|
| 25 |
"model_family": "Qwen3-32B",
|
| 26 |
"f1": 0.18155
|
| 27 |
},
|
| 28 |
-
"Amem": {
|
| 29 |
"accuracy": 0.4247,
|
| 30 |
"model_family": "Qwen3-32B",
|
| 31 |
"f1": 0.343125
|
| 32 |
},
|
| 33 |
-
"Mem0": {
|
| 34 |
"accuracy": 0.39085000000000003,
|
| 35 |
"model_family": "Qwen3-32B",
|
| 36 |
"f1": 0.346
|
| 37 |
},
|
| 38 |
-
"Memorag": {
|
| 39 |
"accuracy": 0.557625,
|
| 40 |
"model_family": "Qwen3-32B",
|
| 41 |
"f1": 0.257875
|
| 42 |
},
|
| 43 |
-
"Memgpt": {
|
| 44 |
"accuracy": 0.435425,
|
| 45 |
"model_family": "Qwen3-32B",
|
| 46 |
"f1": 0.318475
|
| 47 |
},
|
| 48 |
-
"Mem-alpha": {
|
| 49 |
"accuracy": 0.43895,
|
| 50 |
"model_family": "Qwen3-32B",
|
| 51 |
"f1": 0.319875
|
| 52 |
},
|
| 53 |
-
"Memorybank": {
|
| 54 |
"accuracy": 0.43885,
|
| 55 |
"model_family": "Qwen3-32B",
|
| 56 |
"f1": 0.325325
|
| 57 |
},
|
| 58 |
-
"Simple mem": {
|
| 59 |
"accuracy": 0.288775,
|
| 60 |
"model_family": "Qwen3-32B",
|
| 61 |
"f1": 0.163
|
| 62 |
},
|
| 63 |
-
"Long context": {
|
| 64 |
"accuracy": 0.5355,
|
| 65 |
"model_family": "Qwen3-32B",
|
| 66 |
"f1": 0.321775
|
| 67 |
}
|
| 68 |
},
|
| 69 |
"EMBODIED_AI": {
|
| 70 |
-
"Qwen3-Embedding-4B": {
|
| 71 |
"accuracy": 0.204325,
|
| 72 |
"model_family": "Qwen3-32B",
|
| 73 |
"f1": 0.1353
|
| 74 |
},
|
| 75 |
-
"GRAPHRAG": {
|
| 76 |
"accuracy": 0.1476,
|
| 77 |
"model_family": "Qwen3-32B",
|
| 78 |
"f1": 0.3799
|
| 79 |
},
|
| 80 |
-
"Hipporag2": {
|
| 81 |
"accuracy": 0.17627500000000002,
|
| 82 |
"model_family": "Qwen3-32B",
|
| 83 |
"f1": 0.181875
|
| 84 |
},
|
| 85 |
-
"Memagent": {
|
| 86 |
"accuracy": 0.10617499999999999,
|
| 87 |
"model_family": "Qwen3-32B",
|
| 88 |
"f1": 0.144975
|
| 89 |
},
|
| 90 |
-
"Mem1": {
|
| 91 |
"accuracy": 0.03355,
|
| 92 |
"model_family": "Qwen3-32B",
|
| 93 |
"f1": 0.22445
|
| 94 |
},
|
| 95 |
-
"Amem": {
|
| 96 |
"accuracy": 0.183975,
|
| 97 |
"model_family": "Qwen3-32B",
|
| 98 |
"f1": 0.3524
|
| 99 |
},
|
| 100 |
-
"Mem0": {
|
| 101 |
"accuracy": 0.11109999999999999,
|
| 102 |
"model_family": "Qwen3-32B",
|
| 103 |
"f1": 0.27005
|
| 104 |
},
|
| 105 |
-
"Memorag": {
|
| 106 |
"accuracy": 0.085425,
|
| 107 |
"model_family": "Qwen3-32B",
|
| 108 |
"f1": 0.17677500000000002
|
| 109 |
},
|
| 110 |
-
"Memgpt": {
|
| 111 |
"accuracy": 0.1122,
|
| 112 |
"model_family": "Qwen3-32B",
|
| 113 |
"f1": 0.10405
|
| 114 |
},
|
| 115 |
-
"Mem-alpha": {
|
| 116 |
"accuracy": 0.15515,
|
| 117 |
"model_family": "Qwen3-32B",
|
| 118 |
"f1": 0.23735
|
| 119 |
},
|
| 120 |
-
"Memorybank": {
|
| 121 |
"accuracy": 0.16025,
|
| 122 |
"model_family": "Qwen3-32B",
|
| 123 |
"f1": 0.426475
|
| 124 |
},
|
| 125 |
-
"Simple mem": {
|
| 126 |
"accuracy": 0.045975,
|
| 127 |
"model_family": "Qwen3-32B",
|
| 128 |
"f1": 0.2284
|
| 129 |
},
|
| 130 |
-
"Long context": {
|
| 131 |
"accuracy": 0.48185,
|
| 132 |
"model_family": "Qwen3-32B",
|
| 133 |
"f1": 0.56
|
| 134 |
}
|
| 135 |
},
|
| 136 |
"WEB": {
|
| 137 |
-
"Qwen3-Embedding-4B": {
|
| 138 |
"accuracy": 0.2872,
|
| 139 |
"model_family": "Qwen3-32B",
|
| 140 |
"f1": 0.08535000000000001
|
| 141 |
},
|
| 142 |
-
"GRAPHRAG": {
|
| 143 |
"accuracy": 0.420675,
|
| 144 |
"model_family": "Qwen3-32B",
|
| 145 |
"f1": 0.268075
|
| 146 |
},
|
| 147 |
-
"Hipporag2": {
|
| 148 |
"accuracy": 0.3761,
|
| 149 |
"model_family": "Qwen3-32B",
|
| 150 |
"f1": 0.120125
|
| 151 |
},
|
| 152 |
-
"Memagent": {
|
| 153 |
"accuracy": 0.263975,
|
| 154 |
"model_family": "Qwen3-32B",
|
| 155 |
"f1": 0.09065
|
| 156 |
},
|
| 157 |
-
"Mem1": {
|
| 158 |
"accuracy": 0.131275,
|
| 159 |
"model_family": "Qwen3-32B",
|
| 160 |
"f1": 0.1518
|
| 161 |
},
|
| 162 |
-
"Amem": {
|
| 163 |
"accuracy": 0.391525,
|
| 164 |
"model_family": "Qwen3-32B",
|
| 165 |
"f1": 0.2294
|
| 166 |
},
|
| 167 |
-
"Mem0": {
|
| 168 |
"accuracy": 0.2705,
|
| 169 |
"model_family": "Qwen3-32B",
|
| 170 |
"f1": 0.21675
|
| 171 |
},
|
| 172 |
-
"Memorag": {
|
| 173 |
"accuracy": 0.364975,
|
| 174 |
"model_family": "Qwen3-32B",
|
| 175 |
"f1": 0.108075
|
| 176 |
},
|
| 177 |
-
"Memgpt": {
|
| 178 |
"accuracy": 0.327975,
|
| 179 |
"model_family": "Qwen3-32B",
|
| 180 |
"f1": 0.07105
|
| 181 |
},
|
| 182 |
-
"Mem-alpha": {
|
| 183 |
"accuracy": 0.362925,
|
| 184 |
"model_family": "Qwen3-32B",
|
| 185 |
"f1": 0.15944999999999998
|
| 186 |
},
|
| 187 |
-
"Memorybank": {
|
| 188 |
"accuracy": 0.401775,
|
| 189 |
"model_family": "Qwen3-32B",
|
| 190 |
"f1": 0.23704999999999998
|
| 191 |
},
|
| 192 |
-
"Simple mem": {
|
| 193 |
"accuracy": 0.13974999999999999,
|
| 194 |
"model_family": "Qwen3-32B",
|
| 195 |
"f1": 0.1679
|
| 196 |
},
|
| 197 |
-
"Long context": {
|
| 198 |
"accuracy": 0.554275,
|
| 199 |
"model_family": "Qwen3-32B",
|
| 200 |
"f1": 0.348075
|
| 201 |
}
|
| 202 |
},
|
| 203 |
"TEXT2SQL": {
|
| 204 |
-
"Qwen3-Embedding-4B": {
|
| 205 |
"accuracy": 0.4164,
|
| 206 |
"model_family": "Qwen3-32B",
|
| 207 |
"f1": 0.249325
|
| 208 |
},
|
| 209 |
-
"GRAPHRAG": {
|
| 210 |
"accuracy": 0.21665,
|
| 211 |
"model_family": "Qwen3-32B",
|
| 212 |
"f1": 0.221675
|
| 213 |
},
|
| 214 |
-
"Hipporag2": {
|
| 215 |
"accuracy": 0.46267499999999995,
|
| 216 |
"model_family": "Qwen3-32B",
|
| 217 |
"f1": 0.26935
|
| 218 |
},
|
| 219 |
-
"Memagent": {
|
| 220 |
"accuracy": 0.245375,
|
| 221 |
"model_family": "Qwen3-32B",
|
| 222 |
"f1": 0.245375
|
| 223 |
},
|
| 224 |
-
"Mem1": {
|
| 225 |
"accuracy": 0.06465,
|
| 226 |
"model_family": "Qwen3-32B",
|
| 227 |
"f1": 0.19990000000000002
|
| 228 |
},
|
| 229 |
-
"Amem": {
|
| 230 |
"accuracy": 0.31405,
|
| 231 |
"model_family": "Qwen3-32B",
|
| 232 |
"f1": 0.289625
|
| 233 |
},
|
| 234 |
-
"Mem0": {
|
| 235 |
"accuracy": 0.1192,
|
| 236 |
"model_family": "Qwen3-32B",
|
| 237 |
"f1": 0.2326
|
| 238 |
},
|
| 239 |
-
"Memorag": {
|
| 240 |
"accuracy": 0.619,
|
| 241 |
"model_family": "Qwen3-32B",
|
| 242 |
"f1": 0.296475
|
| 243 |
},
|
| 244 |
-
"Memgpt": {
|
| 245 |
"accuracy": 0.206875,
|
| 246 |
"model_family": "Qwen3-32B",
|
| 247 |
"f1": 0.178975
|
| 248 |
},
|
| 249 |
-
"Mem-alpha": {
|
| 250 |
"accuracy": 0.30065,
|
| 251 |
"model_family": "Qwen3-32B",
|
| 252 |
"f1": 0.26505
|
| 253 |
},
|
| 254 |
-
"Memorybank": {
|
| 255 |
"accuracy": 0.23855,
|
| 256 |
"model_family": "Qwen3-32B",
|
| 257 |
"f1": 0.28355
|
| 258 |
},
|
| 259 |
-
"Simple mem": {
|
| 260 |
"accuracy": 0.192575,
|
| 261 |
"model_family": "Qwen3-32B",
|
| 262 |
"f1": 0.157225
|
| 263 |
},
|
| 264 |
-
"Long context": {
|
| 265 |
"accuracy": 0.456075,
|
| 266 |
"model_family": "Qwen3-32B",
|
| 267 |
"f1": 0.295275
|
| 268 |
}
|
| 269 |
},
|
| 270 |
"OPENWORLD_QA": {
|
| 271 |
-
"Qwen3-Embedding-4B": {
|
| 272 |
"accuracy": 0.399125,
|
| 273 |
"model_family": "Qwen3-32B",
|
| 274 |
"f1": 0.0837
|
| 275 |
},
|
| 276 |
-
"GRAPHRAG": {
|
| 277 |
"accuracy": 0.31845,
|
| 278 |
"model_family": "Qwen3-32B",
|
| 279 |
"f1": 0.22635
|
| 280 |
},
|
| 281 |
-
"Hipporag2": {
|
| 282 |
"accuracy": 0.45825,
|
| 283 |
"model_family": "Qwen3-32B",
|
| 284 |
"f1": 0.2362
|
| 285 |
},
|
| 286 |
-
"Memagent": {
|
| 287 |
"accuracy": 0.158225,
|
| 288 |
"model_family": "Qwen3-32B",
|
| 289 |
"f1": 0.0704
|
| 290 |
},
|
| 291 |
-
"Mem1": {
|
| 292 |
"accuracy": 0.12065000000000001,
|
| 293 |
"model_family": "Qwen3-32B",
|
| 294 |
"f1": 0.15005
|
| 295 |
},
|
| 296 |
-
"Amem": {
|
| 297 |
"accuracy": 0.29359999999999997,
|
| 298 |
"model_family": "Qwen3-32B",
|
| 299 |
"f1": 0.2079
|
| 300 |
},
|
| 301 |
-
"Mem0": {
|
| 302 |
"accuracy": 0.16197499999999998,
|
| 303 |
"model_family": "Qwen3-32B",
|
| 304 |
"f1": 0.1604
|
| 305 |
},
|
| 306 |
-
"Memorag": {
|
| 307 |
"accuracy": 0.411375,
|
| 308 |
"model_family": "Qwen3-32B",
|
| 309 |
"f1": 0.093675
|
| 310 |
},
|
| 311 |
-
"Memgpt": {
|
| 312 |
"accuracy": 0.3155,
|
| 313 |
"model_family": "Qwen3-32B",
|
| 314 |
"f1": 0.0595
|
| 315 |
},
|
| 316 |
-
"Mem-alpha": {
|
| 317 |
"accuracy": 0.2301,
|
| 318 |
"model_family": "Qwen3-32B",
|
| 319 |
"f1": 0.13345
|
| 320 |
},
|
| 321 |
-
"Memorybank": {
|
| 322 |
"accuracy": 0.3486,
|
| 323 |
"model_family": "Qwen3-32B",
|
| 324 |
"f1": 0.2519
|
| 325 |
},
|
| 326 |
-
"Simple mem": {
|
| 327 |
"accuracy": 0.12154999999999999,
|
| 328 |
"model_family": "Qwen3-32B",
|
| 329 |
"f1": 0.1312
|
| 330 |
},
|
| 331 |
-
"Long context": {
|
| 332 |
"accuracy": 0.49785,
|
| 333 |
"model_family": "Qwen3-32B",
|
| 334 |
"f1": 0.3349
|
| 335 |
}
|
| 336 |
},
|
| 337 |
"SOFTWARE": {
|
| 338 |
-
"Qwen3-Embedding-4B": {
|
| 339 |
"accuracy": 0.599025,
|
| 340 |
"model_family": "Qwen3-32B",
|
| 341 |
"f1": 0.083575
|
| 342 |
},
|
| 343 |
-
"GRAPHRAG": {
|
| 344 |
"accuracy": 0.348875,
|
| 345 |
"model_family": "Qwen3-32B",
|
| 346 |
"f1": 0.229825
|
| 347 |
},
|
| 348 |
-
"Hipporag2": {
|
| 349 |
"accuracy": 0.5299,
|
| 350 |
"model_family": "Qwen3-32B",
|
| 351 |
"f1": 0.1279
|
| 352 |
},
|
| 353 |
-
"Memagent": {
|
| 354 |
"accuracy": 0.53965,
|
| 355 |
"model_family": "Qwen3-32B",
|
| 356 |
"f1": 0.09085
|
| 357 |
},
|
| 358 |
-
"Mem1": {
|
| 359 |
"accuracy": 0.18595,
|
| 360 |
"model_family": "Qwen3-32B",
|
| 361 |
"f1": 0.17527500000000001
|
| 362 |
},
|
| 363 |
-
"Amem": {
|
| 364 |
"accuracy": 0.29615,
|
| 365 |
"model_family": "Qwen3-32B",
|
| 366 |
"f1": 0.20395
|
| 367 |
},
|
| 368 |
-
"Mem0": {
|
| 369 |
"accuracy": 0.2366,
|
| 370 |
"model_family": "Qwen3-32B",
|
| 371 |
"f1": 0.176975
|
| 372 |
},
|
| 373 |
-
"Memorag": {
|
| 374 |
"accuracy": 0.55005,
|
| 375 |
"model_family": "Qwen3-32B",
|
| 376 |
"f1": 0.10707499999999999
|
| 377 |
},
|
| 378 |
-
"Memgpt": {
|
| 379 |
"accuracy": 0.599125,
|
| 380 |
"model_family": "Qwen3-32B",
|
| 381 |
"f1": 0.066575
|
| 382 |
},
|
| 383 |
-
"Mem-alpha": {
|
| 384 |
"accuracy": 0.3476,
|
| 385 |
"model_family": "Qwen3-32B",
|
| 386 |
"f1": 0.12492500000000001
|
| 387 |
},
|
| 388 |
-
"Memorybank": {
|
| 389 |
"accuracy": 0.5072,
|
| 390 |
"model_family": "Qwen3-32B",
|
| 391 |
"f1": 0.240875
|
| 392 |
},
|
| 393 |
-
"Simple mem": {
|
| 394 |
"accuracy": 0.2431,
|
| 395 |
"model_family": "Qwen3-32B",
|
| 396 |
"f1": 0.2005
|
| 397 |
},
|
| 398 |
-
"Long context": {
|
| 399 |
"accuracy": 0.4847,
|
| 400 |
"model_family": "Qwen3-32B",
|
| 401 |
"f1": 0.267725
|
|
|
|
| 1 |
{
|
| 2 |
"GAMING": {
|
| 3 |
+
"Qwen3-Embedding-4B (32B)": {
|
| 4 |
"accuracy": 0.5157,
|
| 5 |
"model_family": "Qwen3-32B",
|
| 6 |
"f1": 0.2195
|
| 7 |
},
|
| 8 |
+
"GRAPHRAG (32B)": {
|
| 9 |
"accuracy": 0.5595249999999999,
|
| 10 |
"model_family": "Qwen3-32B",
|
| 11 |
"f1": 0.288175
|
| 12 |
},
|
| 13 |
+
"Hipporag2 (32B)": {
|
| 14 |
"accuracy": 0.60555,
|
| 15 |
"model_family": "Qwen3-32B",
|
| 16 |
"f1": 0.2273
|
| 17 |
},
|
| 18 |
+
"Memagent (32B)": {
|
| 19 |
"accuracy": 0.31775,
|
| 20 |
"model_family": "Qwen3-32B",
|
| 21 |
"f1": 0.22945
|
| 22 |
},
|
| 23 |
+
"Mem1 (32B)": {
|
| 24 |
"accuracy": 0.225875,
|
| 25 |
"model_family": "Qwen3-32B",
|
| 26 |
"f1": 0.18155
|
| 27 |
},
|
| 28 |
+
"Amem (32B)": {
|
| 29 |
"accuracy": 0.4247,
|
| 30 |
"model_family": "Qwen3-32B",
|
| 31 |
"f1": 0.343125
|
| 32 |
},
|
| 33 |
+
"Mem0 (32B)": {
|
| 34 |
"accuracy": 0.39085000000000003,
|
| 35 |
"model_family": "Qwen3-32B",
|
| 36 |
"f1": 0.346
|
| 37 |
},
|
| 38 |
+
"Memorag (32B)": {
|
| 39 |
"accuracy": 0.557625,
|
| 40 |
"model_family": "Qwen3-32B",
|
| 41 |
"f1": 0.257875
|
| 42 |
},
|
| 43 |
+
"Memgpt (32B)": {
|
| 44 |
"accuracy": 0.435425,
|
| 45 |
"model_family": "Qwen3-32B",
|
| 46 |
"f1": 0.318475
|
| 47 |
},
|
| 48 |
+
"Mem-alpha (32B)": {
|
| 49 |
"accuracy": 0.43895,
|
| 50 |
"model_family": "Qwen3-32B",
|
| 51 |
"f1": 0.319875
|
| 52 |
},
|
| 53 |
+
"Memorybank (32B)": {
|
| 54 |
"accuracy": 0.43885,
|
| 55 |
"model_family": "Qwen3-32B",
|
| 56 |
"f1": 0.325325
|
| 57 |
},
|
| 58 |
+
"Simple mem (32B)": {
|
| 59 |
"accuracy": 0.288775,
|
| 60 |
"model_family": "Qwen3-32B",
|
| 61 |
"f1": 0.163
|
| 62 |
},
|
| 63 |
+
"Long context (32B)": {
|
| 64 |
"accuracy": 0.5355,
|
| 65 |
"model_family": "Qwen3-32B",
|
| 66 |
"f1": 0.321775
|
| 67 |
}
|
| 68 |
},
|
| 69 |
"EMBODIED_AI": {
|
| 70 |
+
"Qwen3-Embedding-4B (32B)": {
|
| 71 |
"accuracy": 0.204325,
|
| 72 |
"model_family": "Qwen3-32B",
|
| 73 |
"f1": 0.1353
|
| 74 |
},
|
| 75 |
+
"GRAPHRAG (32B)": {
|
| 76 |
"accuracy": 0.1476,
|
| 77 |
"model_family": "Qwen3-32B",
|
| 78 |
"f1": 0.3799
|
| 79 |
},
|
| 80 |
+
"Hipporag2 (32B)": {
|
| 81 |
"accuracy": 0.17627500000000002,
|
| 82 |
"model_family": "Qwen3-32B",
|
| 83 |
"f1": 0.181875
|
| 84 |
},
|
| 85 |
+
"Memagent (32B)": {
|
| 86 |
"accuracy": 0.10617499999999999,
|
| 87 |
"model_family": "Qwen3-32B",
|
| 88 |
"f1": 0.144975
|
| 89 |
},
|
| 90 |
+
"Mem1 (32B)": {
|
| 91 |
"accuracy": 0.03355,
|
| 92 |
"model_family": "Qwen3-32B",
|
| 93 |
"f1": 0.22445
|
| 94 |
},
|
| 95 |
+
"Amem (32B)": {
|
| 96 |
"accuracy": 0.183975,
|
| 97 |
"model_family": "Qwen3-32B",
|
| 98 |
"f1": 0.3524
|
| 99 |
},
|
| 100 |
+
"Mem0 (32B)": {
|
| 101 |
"accuracy": 0.11109999999999999,
|
| 102 |
"model_family": "Qwen3-32B",
|
| 103 |
"f1": 0.27005
|
| 104 |
},
|
| 105 |
+
"Memorag (32B)": {
|
| 106 |
"accuracy": 0.085425,
|
| 107 |
"model_family": "Qwen3-32B",
|
| 108 |
"f1": 0.17677500000000002
|
| 109 |
},
|
| 110 |
+
"Memgpt (32B)": {
|
| 111 |
"accuracy": 0.1122,
|
| 112 |
"model_family": "Qwen3-32B",
|
| 113 |
"f1": 0.10405
|
| 114 |
},
|
| 115 |
+
"Mem-alpha (32B)": {
|
| 116 |
"accuracy": 0.15515,
|
| 117 |
"model_family": "Qwen3-32B",
|
| 118 |
"f1": 0.23735
|
| 119 |
},
|
| 120 |
+
"Memorybank (32B)": {
|
| 121 |
"accuracy": 0.16025,
|
| 122 |
"model_family": "Qwen3-32B",
|
| 123 |
"f1": 0.426475
|
| 124 |
},
|
| 125 |
+
"Simple mem (32B)": {
|
| 126 |
"accuracy": 0.045975,
|
| 127 |
"model_family": "Qwen3-32B",
|
| 128 |
"f1": 0.2284
|
| 129 |
},
|
| 130 |
+
"Long context (32B)": {
|
| 131 |
"accuracy": 0.48185,
|
| 132 |
"model_family": "Qwen3-32B",
|
| 133 |
"f1": 0.56
|
| 134 |
}
|
| 135 |
},
|
| 136 |
"WEB": {
|
| 137 |
+
"Qwen3-Embedding-4B (32B)": {
|
| 138 |
"accuracy": 0.2872,
|
| 139 |
"model_family": "Qwen3-32B",
|
| 140 |
"f1": 0.08535000000000001
|
| 141 |
},
|
| 142 |
+
"GRAPHRAG (32B)": {
|
| 143 |
"accuracy": 0.420675,
|
| 144 |
"model_family": "Qwen3-32B",
|
| 145 |
"f1": 0.268075
|
| 146 |
},
|
| 147 |
+
"Hipporag2 (32B)": {
|
| 148 |
"accuracy": 0.3761,
|
| 149 |
"model_family": "Qwen3-32B",
|
| 150 |
"f1": 0.120125
|
| 151 |
},
|
| 152 |
+
"Memagent (32B)": {
|
| 153 |
"accuracy": 0.263975,
|
| 154 |
"model_family": "Qwen3-32B",
|
| 155 |
"f1": 0.09065
|
| 156 |
},
|
| 157 |
+
"Mem1 (32B)": {
|
| 158 |
"accuracy": 0.131275,
|
| 159 |
"model_family": "Qwen3-32B",
|
| 160 |
"f1": 0.1518
|
| 161 |
},
|
| 162 |
+
"Amem (32B)": {
|
| 163 |
"accuracy": 0.391525,
|
| 164 |
"model_family": "Qwen3-32B",
|
| 165 |
"f1": 0.2294
|
| 166 |
},
|
| 167 |
+
"Mem0 (32B)": {
|
| 168 |
"accuracy": 0.2705,
|
| 169 |
"model_family": "Qwen3-32B",
|
| 170 |
"f1": 0.21675
|
| 171 |
},
|
| 172 |
+
"Memorag (32B)": {
|
| 173 |
"accuracy": 0.364975,
|
| 174 |
"model_family": "Qwen3-32B",
|
| 175 |
"f1": 0.108075
|
| 176 |
},
|
| 177 |
+
"Memgpt (32B)": {
|
| 178 |
"accuracy": 0.327975,
|
| 179 |
"model_family": "Qwen3-32B",
|
| 180 |
"f1": 0.07105
|
| 181 |
},
|
| 182 |
+
"Mem-alpha (32B)": {
|
| 183 |
"accuracy": 0.362925,
|
| 184 |
"model_family": "Qwen3-32B",
|
| 185 |
"f1": 0.15944999999999998
|
| 186 |
},
|
| 187 |
+
"Memorybank (32B)": {
|
| 188 |
"accuracy": 0.401775,
|
| 189 |
"model_family": "Qwen3-32B",
|
| 190 |
"f1": 0.23704999999999998
|
| 191 |
},
|
| 192 |
+
"Simple mem (32B)": {
|
| 193 |
"accuracy": 0.13974999999999999,
|
| 194 |
"model_family": "Qwen3-32B",
|
| 195 |
"f1": 0.1679
|
| 196 |
},
|
| 197 |
+
"Long context (32B)": {
|
| 198 |
"accuracy": 0.554275,
|
| 199 |
"model_family": "Qwen3-32B",
|
| 200 |
"f1": 0.348075
|
| 201 |
}
|
| 202 |
},
|
| 203 |
"TEXT2SQL": {
|
| 204 |
+
"Qwen3-Embedding-4B (32B)": {
|
| 205 |
"accuracy": 0.4164,
|
| 206 |
"model_family": "Qwen3-32B",
|
| 207 |
"f1": 0.249325
|
| 208 |
},
|
| 209 |
+
"GRAPHRAG (32B)": {
|
| 210 |
"accuracy": 0.21665,
|
| 211 |
"model_family": "Qwen3-32B",
|
| 212 |
"f1": 0.221675
|
| 213 |
},
|
| 214 |
+
"Hipporag2 (32B)": {
|
| 215 |
"accuracy": 0.46267499999999995,
|
| 216 |
"model_family": "Qwen3-32B",
|
| 217 |
"f1": 0.26935
|
| 218 |
},
|
| 219 |
+
"Memagent (32B)": {
|
| 220 |
"accuracy": 0.245375,
|
| 221 |
"model_family": "Qwen3-32B",
|
| 222 |
"f1": 0.245375
|
| 223 |
},
|
| 224 |
+
"Mem1 (32B)": {
|
| 225 |
"accuracy": 0.06465,
|
| 226 |
"model_family": "Qwen3-32B",
|
| 227 |
"f1": 0.19990000000000002
|
| 228 |
},
|
| 229 |
+
"Amem (32B)": {
|
| 230 |
"accuracy": 0.31405,
|
| 231 |
"model_family": "Qwen3-32B",
|
| 232 |
"f1": 0.289625
|
| 233 |
},
|
| 234 |
+
"Mem0 (32B)": {
|
| 235 |
"accuracy": 0.1192,
|
| 236 |
"model_family": "Qwen3-32B",
|
| 237 |
"f1": 0.2326
|
| 238 |
},
|
| 239 |
+
"Memorag (32B)": {
|
| 240 |
"accuracy": 0.619,
|
| 241 |
"model_family": "Qwen3-32B",
|
| 242 |
"f1": 0.296475
|
| 243 |
},
|
| 244 |
+
"Memgpt (32B)": {
|
| 245 |
"accuracy": 0.206875,
|
| 246 |
"model_family": "Qwen3-32B",
|
| 247 |
"f1": 0.178975
|
| 248 |
},
|
| 249 |
+
"Mem-alpha (32B)": {
|
| 250 |
"accuracy": 0.30065,
|
| 251 |
"model_family": "Qwen3-32B",
|
| 252 |
"f1": 0.26505
|
| 253 |
},
|
| 254 |
+
"Memorybank (32B)": {
|
| 255 |
"accuracy": 0.23855,
|
| 256 |
"model_family": "Qwen3-32B",
|
| 257 |
"f1": 0.28355
|
| 258 |
},
|
| 259 |
+
"Simple mem (32B)": {
|
| 260 |
"accuracy": 0.192575,
|
| 261 |
"model_family": "Qwen3-32B",
|
| 262 |
"f1": 0.157225
|
| 263 |
},
|
| 264 |
+
"Long context (32B)": {
|
| 265 |
"accuracy": 0.456075,
|
| 266 |
"model_family": "Qwen3-32B",
|
| 267 |
"f1": 0.295275
|
| 268 |
}
|
| 269 |
},
|
| 270 |
"OPENWORLD_QA": {
|
| 271 |
+
"Qwen3-Embedding-4B (32B)": {
|
| 272 |
"accuracy": 0.399125,
|
| 273 |
"model_family": "Qwen3-32B",
|
| 274 |
"f1": 0.0837
|
| 275 |
},
|
| 276 |
+
"GRAPHRAG (32B)": {
|
| 277 |
"accuracy": 0.31845,
|
| 278 |
"model_family": "Qwen3-32B",
|
| 279 |
"f1": 0.22635
|
| 280 |
},
|
| 281 |
+
"Hipporag2 (32B)": {
|
| 282 |
"accuracy": 0.45825,
|
| 283 |
"model_family": "Qwen3-32B",
|
| 284 |
"f1": 0.2362
|
| 285 |
},
|
| 286 |
+
"Memagent (32B)": {
|
| 287 |
"accuracy": 0.158225,
|
| 288 |
"model_family": "Qwen3-32B",
|
| 289 |
"f1": 0.0704
|
| 290 |
},
|
| 291 |
+
"Mem1 (32B)": {
|
| 292 |
"accuracy": 0.12065000000000001,
|
| 293 |
"model_family": "Qwen3-32B",
|
| 294 |
"f1": 0.15005
|
| 295 |
},
|
| 296 |
+
"Amem (32B)": {
|
| 297 |
"accuracy": 0.29359999999999997,
|
| 298 |
"model_family": "Qwen3-32B",
|
| 299 |
"f1": 0.2079
|
| 300 |
},
|
| 301 |
+
"Mem0 (32B)": {
|
| 302 |
"accuracy": 0.16197499999999998,
|
| 303 |
"model_family": "Qwen3-32B",
|
| 304 |
"f1": 0.1604
|
| 305 |
},
|
| 306 |
+
"Memorag (32B)": {
|
| 307 |
"accuracy": 0.411375,
|
| 308 |
"model_family": "Qwen3-32B",
|
| 309 |
"f1": 0.093675
|
| 310 |
},
|
| 311 |
+
"Memgpt (32B)": {
|
| 312 |
"accuracy": 0.3155,
|
| 313 |
"model_family": "Qwen3-32B",
|
| 314 |
"f1": 0.0595
|
| 315 |
},
|
| 316 |
+
"Mem-alpha (32B)": {
|
| 317 |
"accuracy": 0.2301,
|
| 318 |
"model_family": "Qwen3-32B",
|
| 319 |
"f1": 0.13345
|
| 320 |
},
|
| 321 |
+
"Memorybank (32B)": {
|
| 322 |
"accuracy": 0.3486,
|
| 323 |
"model_family": "Qwen3-32B",
|
| 324 |
"f1": 0.2519
|
| 325 |
},
|
| 326 |
+
"Simple mem (32B)": {
|
| 327 |
"accuracy": 0.12154999999999999,
|
| 328 |
"model_family": "Qwen3-32B",
|
| 329 |
"f1": 0.1312
|
| 330 |
},
|
| 331 |
+
"Long context (32B)": {
|
| 332 |
"accuracy": 0.49785,
|
| 333 |
"model_family": "Qwen3-32B",
|
| 334 |
"f1": 0.3349
|
| 335 |
}
|
| 336 |
},
|
| 337 |
"SOFTWARE": {
|
| 338 |
+
"Qwen3-Embedding-4B (32B)": {
|
| 339 |
"accuracy": 0.599025,
|
| 340 |
"model_family": "Qwen3-32B",
|
| 341 |
"f1": 0.083575
|
| 342 |
},
|
| 343 |
+
"GRAPHRAG (32B)": {
|
| 344 |
"accuracy": 0.348875,
|
| 345 |
"model_family": "Qwen3-32B",
|
| 346 |
"f1": 0.229825
|
| 347 |
},
|
| 348 |
+
"Hipporag2 (32B)": {
|
| 349 |
"accuracy": 0.5299,
|
| 350 |
"model_family": "Qwen3-32B",
|
| 351 |
"f1": 0.1279
|
| 352 |
},
|
| 353 |
+
"Memagent (32B)": {
|
| 354 |
"accuracy": 0.53965,
|
| 355 |
"model_family": "Qwen3-32B",
|
| 356 |
"f1": 0.09085
|
| 357 |
},
|
| 358 |
+
"Mem1 (32B)": {
|
| 359 |
"accuracy": 0.18595,
|
| 360 |
"model_family": "Qwen3-32B",
|
| 361 |
"f1": 0.17527500000000001
|
| 362 |
},
|
| 363 |
+
"Amem (32B)": {
|
| 364 |
"accuracy": 0.29615,
|
| 365 |
"model_family": "Qwen3-32B",
|
| 366 |
"f1": 0.20395
|
| 367 |
},
|
| 368 |
+
"Mem0 (32B)": {
|
| 369 |
"accuracy": 0.2366,
|
| 370 |
"model_family": "Qwen3-32B",
|
| 371 |
"f1": 0.176975
|
| 372 |
},
|
| 373 |
+
"Memorag (32B)": {
|
| 374 |
"accuracy": 0.55005,
|
| 375 |
"model_family": "Qwen3-32B",
|
| 376 |
"f1": 0.10707499999999999
|
| 377 |
},
|
| 378 |
+
"Memgpt (32B)": {
|
| 379 |
"accuracy": 0.599125,
|
| 380 |
"model_family": "Qwen3-32B",
|
| 381 |
"f1": 0.066575
|
| 382 |
},
|
| 383 |
+
"Mem-alpha (32B)": {
|
| 384 |
"accuracy": 0.3476,
|
| 385 |
"model_family": "Qwen3-32B",
|
| 386 |
"f1": 0.12492500000000001
|
| 387 |
},
|
| 388 |
+
"Memorybank (32B)": {
|
| 389 |
"accuracy": 0.5072,
|
| 390 |
"model_family": "Qwen3-32B",
|
| 391 |
"f1": 0.240875
|
| 392 |
},
|
| 393 |
+
"Simple mem (32B)": {
|
| 394 |
"accuracy": 0.2431,
|
| 395 |
"model_family": "Qwen3-32B",
|
| 396 |
"f1": 0.2005
|
| 397 |
},
|
| 398 |
+
"Long context (32B)": {
|
| 399 |
"accuracy": 0.4847,
|
| 400 |
"model_family": "Qwen3-32B",
|
| 401 |
"f1": 0.267725
|
scorer.py
CHANGED
|
@@ -1,55 +1,31 @@
|
|
| 1 |
"""
|
| 2 |
Scoring functions for AMA-Bench submissions.
|
| 3 |
|
| 4 |
-
This module implements evaluation logic for
|
| 5 |
-
calculating accuracy by
|
| 6 |
"""
|
| 7 |
|
| 8 |
import re
|
| 9 |
from typing import Union, List, Dict
|
| 10 |
|
| 11 |
|
| 12 |
-
def
|
| 13 |
-
"""
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
Used for multiple-choice answer comparison where answers are like
|
| 17 |
-
"A", "B", "AB", "ACD", etc.
|
| 18 |
-
|
| 19 |
-
Args:
|
| 20 |
-
text: Input text containing answer choices
|
| 21 |
-
|
| 22 |
-
Returns:
|
| 23 |
-
String of uppercase letters only, sorted alphabetically
|
| 24 |
-
"""
|
| 25 |
-
if not isinstance(text, str):
|
| 26 |
-
text = str(text)
|
| 27 |
-
|
| 28 |
-
# Extract all uppercase letters
|
| 29 |
-
letters = [c for c in text if c.isupper() and c.isalpha()]
|
| 30 |
-
|
| 31 |
-
# Sort and join to ensure consistent ordering
|
| 32 |
-
return ''.join(sorted(set(letters)))
|
| 33 |
|
| 34 |
|
| 35 |
-
def
|
| 36 |
"""
|
| 37 |
-
Calculate accuracy for
|
| 38 |
-
|
| 39 |
-
Compares uppercase letters extracted from both prediction and reference.
|
| 40 |
-
Returns 1.0 if they match exactly, 0.0 otherwise.
|
| 41 |
|
| 42 |
Args:
|
| 43 |
-
prediction: Model's predicted answer
|
| 44 |
-
reference: Ground truth reference answer
|
| 45 |
|
| 46 |
Returns:
|
| 47 |
-
1.0 if
|
| 48 |
"""
|
| 49 |
-
|
| 50 |
-
ref_letters = extract_uppercase_letters(reference)
|
| 51 |
-
|
| 52 |
-
return 1.0 if pred_letters == ref_letters else 0.0
|
| 53 |
|
| 54 |
|
| 55 |
def calculate_accuracy(scores: List[float]) -> Dict[str, float]:
|
|
@@ -63,7 +39,7 @@ def calculate_accuracy(scores: List[float]) -> Dict[str, float]:
|
|
| 63 |
Dictionary with accuracy metric
|
| 64 |
"""
|
| 65 |
if not scores:
|
| 66 |
-
return {"accuracy": 0.0, "count": 0}
|
| 67 |
|
| 68 |
import numpy as np
|
| 69 |
|
|
@@ -97,6 +73,10 @@ def score_submission(
|
|
| 97 |
"Causal": "Causal Inference",
|
| 98 |
"State": "State Updating",
|
| 99 |
"Abstraction": "State Abstraction",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
}
|
| 101 |
|
| 102 |
# Initialize scores by metric
|
|
@@ -128,8 +108,8 @@ def score_submission(
|
|
| 128 |
reference = gt_info["answer"]
|
| 129 |
qa_type = gt_info.get("type", "Recall")
|
| 130 |
|
| 131 |
-
# Calculate accuracy
|
| 132 |
-
score =
|
| 133 |
|
| 134 |
# Map question type to metric category
|
| 135 |
metric_category = "Recall" # default
|
|
@@ -150,6 +130,7 @@ def score_submission(
|
|
| 150 |
"score": score,
|
| 151 |
"reference_answer": reference,
|
| 152 |
"metric_category": metric_category,
|
|
|
|
| 153 |
})
|
| 154 |
|
| 155 |
# Calculate metrics for each category
|
|
@@ -163,4 +144,4 @@ def score_submission(
|
|
| 163 |
return {
|
| 164 |
"scores": results,
|
| 165 |
"scored_submissions": scored_submissions,
|
| 166 |
-
}
|
|
|
|
| 1 |
"""
|
| 2 |
Scoring functions for AMA-Bench submissions.
|
| 3 |
|
| 4 |
+
This module implements evaluation logic for string answers,
|
| 5 |
+
calculating accuracy by exact string match (case-insensitive).
|
| 6 |
"""
|
| 7 |
|
| 8 |
import re
|
| 9 |
from typing import Union, List, Dict
|
| 10 |
|
| 11 |
|
| 12 |
+
def normalize_answer(text: str) -> str:
|
| 13 |
+
"""Normalize answer string for comparison (lowercase, strip whitespace)."""
|
| 14 |
+
return str(text).strip().lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
+
def string_exact_match(prediction: str, reference: str) -> float:
|
| 18 |
"""
|
| 19 |
+
Calculate accuracy for string answers using exact match.
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
Args:
|
| 22 |
+
prediction: Model's predicted answer string
|
| 23 |
+
reference: Ground truth reference answer string
|
| 24 |
|
| 25 |
Returns:
|
| 26 |
+
1.0 if normalized strings match exactly, 0.0 otherwise
|
| 27 |
"""
|
| 28 |
+
return 1.0 if normalize_answer(prediction) == normalize_answer(reference) else 0.0
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
def calculate_accuracy(scores: List[float]) -> Dict[str, float]:
|
|
|
|
| 39 |
Dictionary with accuracy metric
|
| 40 |
"""
|
| 41 |
if not scores:
|
| 42 |
+
return {"accuracy": 0.0, "count": 0, "correct": 0}
|
| 43 |
|
| 44 |
import numpy as np
|
| 45 |
|
|
|
|
| 73 |
"Causal": "Causal Inference",
|
| 74 |
"State": "State Updating",
|
| 75 |
"Abstraction": "State Abstraction",
|
| 76 |
+
"A": "Recall",
|
| 77 |
+
"B": "Causal Inference",
|
| 78 |
+
"C": "State Updating",
|
| 79 |
+
"D": "State Abstraction",
|
| 80 |
}
|
| 81 |
|
| 82 |
# Initialize scores by metric
|
|
|
|
| 108 |
reference = gt_info["answer"]
|
| 109 |
qa_type = gt_info.get("type", "Recall")
|
| 110 |
|
| 111 |
+
# Calculate accuracy via exact string match
|
| 112 |
+
score = string_exact_match(answer, reference)
|
| 113 |
|
| 114 |
# Map question type to metric category
|
| 115 |
metric_category = "Recall" # default
|
|
|
|
| 130 |
"score": score,
|
| 131 |
"reference_answer": reference,
|
| 132 |
"metric_category": metric_category,
|
| 133 |
+
"domain": gt_info.get("domain", "") if gt_info else "",
|
| 134 |
})
|
| 135 |
|
| 136 |
# Calculate metrics for each category
|
|
|
|
| 144 |
return {
|
| 145 |
"scores": results,
|
| 146 |
"scored_submissions": scored_submissions,
|
| 147 |
+
}
|
utils.py
CHANGED
|
@@ -107,6 +107,7 @@ def load_groundtruth(dataset_name: str, token: str = None) -> Dict[str, str]:
|
|
| 107 |
|
| 108 |
for row in dataset:
|
| 109 |
episode_id = row.get("episode_id", "")
|
|
|
|
| 110 |
qa_pairs = row.get("qa_pairs", [])
|
| 111 |
|
| 112 |
for qa in qa_pairs:
|
|
@@ -119,7 +120,8 @@ def load_groundtruth(dataset_name: str, token: str = None) -> Dict[str, str]:
|
|
| 119 |
groundtruth[key] = {
|
| 120 |
"answer": answer,
|
| 121 |
"type": qa_type,
|
| 122 |
-
"sub_type": qa.get("sub_type", "")
|
|
|
|
| 123 |
}
|
| 124 |
|
| 125 |
except Exception as hf_error:
|
|
@@ -128,7 +130,7 @@ def load_groundtruth(dataset_name: str, token: str = None) -> Dict[str, str]:
|
|
| 128 |
|
| 129 |
# Fallback to local file
|
| 130 |
import json
|
| 131 |
-
local_path = "test/
|
| 132 |
|
| 133 |
try:
|
| 134 |
with open(local_path, 'r', encoding='utf-8') as f:
|
|
@@ -139,6 +141,7 @@ def load_groundtruth(dataset_name: str, token: str = None) -> Dict[str, str]:
|
|
| 139 |
|
| 140 |
data = json.loads(line)
|
| 141 |
episode_id = data.get("episode_id", "")
|
|
|
|
| 142 |
qa_pairs = data.get("qa_pairs", [])
|
| 143 |
|
| 144 |
for qa in qa_pairs:
|
|
@@ -151,7 +154,8 @@ def load_groundtruth(dataset_name: str, token: str = None) -> Dict[str, str]:
|
|
| 151 |
groundtruth[key] = {
|
| 152 |
"answer": answer,
|
| 153 |
"type": qa_type,
|
| 154 |
-
"sub_type": qa.get("sub_type", "")
|
|
|
|
| 155 |
}
|
| 156 |
|
| 157 |
print(f"Loaded from local file: {local_path}")
|
|
@@ -221,4 +225,4 @@ def validate_submission_file(file_path: str) -> tuple:
|
|
| 221 |
except FileNotFoundError:
|
| 222 |
return False, "File not found.", []
|
| 223 |
except Exception as e:
|
| 224 |
-
return False, f"Error reading file: {str(e)}", []
|
|
|
|
| 107 |
|
| 108 |
for row in dataset:
|
| 109 |
episode_id = row.get("episode_id", "")
|
| 110 |
+
domain = row.get("domain", "")
|
| 111 |
qa_pairs = row.get("qa_pairs", [])
|
| 112 |
|
| 113 |
for qa in qa_pairs:
|
|
|
|
| 120 |
groundtruth[key] = {
|
| 121 |
"answer": answer,
|
| 122 |
"type": qa_type,
|
| 123 |
+
"sub_type": qa.get("sub_type", ""),
|
| 124 |
+
"domain": domain,
|
| 125 |
}
|
| 126 |
|
| 127 |
except Exception as hf_error:
|
|
|
|
| 130 |
|
| 131 |
# Fallback to local file
|
| 132 |
import json
|
| 133 |
+
local_path = "test/open_end_qa_set.jsonl"
|
| 134 |
|
| 135 |
try:
|
| 136 |
with open(local_path, 'r', encoding='utf-8') as f:
|
|
|
|
| 141 |
|
| 142 |
data = json.loads(line)
|
| 143 |
episode_id = data.get("episode_id", "")
|
| 144 |
+
domain = data.get("domain", "")
|
| 145 |
qa_pairs = data.get("qa_pairs", [])
|
| 146 |
|
| 147 |
for qa in qa_pairs:
|
|
|
|
| 154 |
groundtruth[key] = {
|
| 155 |
"answer": answer,
|
| 156 |
"type": qa_type,
|
| 157 |
+
"sub_type": qa.get("sub_type", ""),
|
| 158 |
+
"domain": domain,
|
| 159 |
}
|
| 160 |
|
| 161 |
print(f"Loaded from local file: {local_path}")
|
|
|
|
| 225 |
except FileNotFoundError:
|
| 226 |
return False, "File not found.", []
|
| 227 |
except Exception as e:
|
| 228 |
+
return False, f"Error reading file: {str(e)}", []
|