uuuhjb committed on
Commit
bb0a764
·
1 Parent(s): d8b2e03

add submit function

Browse files
Files changed (7) hide show
  1. .gitignore +3 -1
  2. app.py +144 -58
  3. data/agent_capability.json +17 -2
  4. data/model_capability.json +17 -2
  5. data/model_domain.json +78 -78
  6. scorer.py +20 -39
  7. utils.py +8 -4
.gitignore CHANGED
@@ -1,2 +1,4 @@
1
  __pycache__
2
- *.DS_Store
 
 
 
1
  __pycache__
2
+ *.DS_Store
3
+ submissions/
4
+ test/
app.py CHANGED
@@ -17,7 +17,7 @@ except ImportError:
17
  def format_log(msg): return f"✅ {msg}"
18
 
19
  try:
20
- from scorer import score_submission, extract_uppercase_letters
21
  except ImportError:
22
  score_submission = None
23
  extract_uppercase_letters = None
@@ -165,36 +165,28 @@ COLORS = [
165
  # ---------------------------------------------------------------------------
166
 
167
  def calculate_f1_score(predictions, references):
168
- """Calculate F1 score for multi-label classification."""
169
  if not predictions or not references:
170
  return 0.0
171
 
172
- if extract_uppercase_letters is None:
173
- # Fallback implementation
174
- def extract_letters(text):
175
- return ''.join(sorted(set(c for c in str(text) if c.isupper() and c.isalpha())))
176
- extract_fn = extract_letters
177
- else:
178
- extract_fn = extract_uppercase_letters
179
-
180
  total_precision = 0.0
181
  total_recall = 0.0
182
  count = 0
183
 
184
  for pred, ref in zip(predictions, references):
185
- pred_set = set(extract_fn(pred))
186
- ref_set = set(extract_fn(ref))
187
 
188
- if not pred_set and not ref_set:
189
  total_precision += 1.0
190
  total_recall += 1.0
191
  count += 1
192
- elif not pred_set or not ref_set:
193
  count += 1
194
  else:
195
- intersection = len(pred_set & ref_set)
196
- precision = intersection / len(pred_set) if pred_set else 0
197
- recall = intersection / len(ref_set) if ref_set else 0
198
  total_precision += precision
199
  total_recall += recall
200
  count += 1
@@ -208,54 +200,96 @@ def calculate_f1_score(predictions, references):
208
  if avg_precision + avg_recall == 0:
209
  return 0.0
210
 
211
- f1 = 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall)
212
- return f1
213
 
214
 
215
  def update_json_with_submission(model_name, scores_by_metric, scored_submissions, is_agent=False, model_family=""):
216
- """Update JSON files with new submission data."""
217
  try:
218
  if is_agent:
219
  capability_file = "data/agent_capability.json"
220
- domain_file = "data/agent_domain.json"
221
  else:
222
  capability_file = "data/model_capability.json"
223
- domain_file = "data/model_domain.json"
224
 
225
- # Load existing data
226
  with open(capability_file, 'r', encoding='utf-8') as f:
227
  capability_data = json.load(f)
228
 
229
- # Update capability data
230
  for capability in METRICS:
231
- if capability in scores_by_metric and capability in capability_data:
232
- metric_data = scores_by_metric[capability]
233
-
234
- # Get submissions for this capability
235
- capability_submissions = [
236
- s for s in scored_submissions
237
- if s.get('metric_category') == capability
238
- ]
239
-
240
- # Calculate F1
241
- if capability_submissions:
242
- predictions = [s.get('answer', '') for s in capability_submissions]
243
- references = [s.get('reference_answer', '') for s in capability_submissions]
244
- f1 = calculate_f1_score(predictions, references)
245
- else:
246
- f1 = 0.0
247
 
248
- capability_data[capability][model_name] = {
249
- "accuracy": metric_data['accuracy'],
250
- "model_family": model_family,
251
- "f1": f1
252
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
- # Save updated data
255
  with open(capability_file, 'w', encoding='utf-8') as f:
256
  json.dump(capability_data, f, indent=2, ensure_ascii=False)
257
-
258
  print(f"✓ Updated {capability_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  return True
260
 
261
  except Exception as e:
@@ -315,7 +349,7 @@ def add_new_submission(model, submission_type, url, file, organisation, mail, mo
315
  print(f"✓ Overall accuracy: {average_accuracy:.4f}")
316
  for metric_name, metric_data in scores_by_metric.items():
317
  if metric_name != "Average":
318
- print(f" {metric_name}: {metric_data['accuracy']:.4f} ({metric_data['correct']}/{metric_data['count']})")
319
 
320
  # Save locally
321
  submission_dir = f"submissions/{organisation}_{model}"
@@ -340,7 +374,7 @@ def add_new_submission(model, submission_type, url, file, organisation, mail, mo
340
  metric_name: {
341
  "accuracy": float(metric_data["accuracy"]),
342
  "count": int(metric_data["count"]),
343
- "correct": int(metric_data["correct"])
344
  }
345
  for metric_name, metric_data in scores_by_metric.items()
346
  }
@@ -361,24 +395,73 @@ def add_new_submission(model, submission_type, url, file, organisation, mail, mo
361
  if update_success:
362
  print("✓ Updated leaderboard JSON files")
363
  # Reload data
364
- global AGENT_CAPABILITY, AGENT_DOMAIN, MODEL_CAPABILITY, MODEL_DOMAIN
365
  if is_agent:
366
  AGENT_CAPABILITY = load_json_data("data/agent_capability.json")
367
  AGENT_DOMAIN = load_json_data("data/agent_domain.json")
368
  else:
369
  MODEL_CAPABILITY = load_json_data("data/model_capability.json")
370
  MODEL_DOMAIN = load_json_data("data/model_domain.json")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
 
372
  # Format message
373
  message = f"✅ **Submission successful!**\n\n"
374
  message += f"**{'Agent' if is_agent else 'Model'}:** {model}\n"
375
  message += f"**Organisation:** {organisation}\n"
376
  message += f"**Overall Accuracy:** {average_accuracy:.4f}\n\n"
 
377
  message += "**Scores by Capability:**\n"
378
  for metric_name in METRICS:
379
  if metric_name in scores_by_metric:
380
- metric_data = scores_by_metric[metric_name]
381
- message += f"- **{metric_name}:** {metric_data['accuracy']:.4f} ({metric_data['correct']}/{metric_data['count']})\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
382
 
383
  message += f"\n**Submission ID:** {timestamp}\n"
384
  if update_success:
@@ -1014,16 +1097,19 @@ def build_app():
1014
  gr.Markdown("""
1015
  **Submission Format:**
1016
 
1017
- Your JSONL file should contain one prediction per line:
1018
  ```json
1019
- {"episode_id": "ep_001", "question": "What is X?", "answer": "A"}
1020
- {"episode_id": "ep_002", "question": "What is Y?", "answer": "BC"}
 
1021
  ```
1022
 
1023
  **Required fields:**
1024
  - `episode_id`: Episode identifier
1025
- - `question`: The question text
1026
- - `answer`: Your model's answer (uppercase letters: A, B, AB, etc.)
 
 
1027
  """)
1028
 
1029
  with gr.Row():
@@ -1101,4 +1187,4 @@ Results are reported as **Accuracy** and **F1 Score**:
1101
 
1102
  if __name__ == "__main__":
1103
  demo_app = build_app()
1104
- demo_app.launch(debug=True, show_error=True)
 
17
  def format_log(msg): return f"✅ {msg}"
18
 
19
  try:
20
+ from scorer import score_submission
21
  except ImportError:
22
  score_submission = None
23
  extract_uppercase_letters = None
 
165
  # ---------------------------------------------------------------------------
166
 
167
  def calculate_f1_score(predictions, references):
168
+ """Calculate token-level F1 score for string answers."""
169
  if not predictions or not references:
170
  return 0.0
171
 
 
 
 
 
 
 
 
 
172
  total_precision = 0.0
173
  total_recall = 0.0
174
  count = 0
175
 
176
  for pred, ref in zip(predictions, references):
177
+ pred_tokens = set(str(pred).strip().lower().split())
178
+ ref_tokens = set(str(ref).strip().lower().split())
179
 
180
+ if not pred_tokens and not ref_tokens:
181
  total_precision += 1.0
182
  total_recall += 1.0
183
  count += 1
184
+ elif not pred_tokens or not ref_tokens:
185
  count += 1
186
  else:
187
+ intersection = len(pred_tokens & ref_tokens)
188
+ precision = intersection / len(pred_tokens)
189
+ recall = intersection / len(ref_tokens)
190
  total_precision += precision
191
  total_recall += recall
192
  count += 1
 
200
  if avg_precision + avg_recall == 0:
201
  return 0.0
202
 
203
+ return 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall)
 
204
 
205
 
206
  def update_json_with_submission(model_name, scores_by_metric, scored_submissions, is_agent=False, model_family=""):
207
+ """Update capability and domain JSON files with new submission data."""
208
  try:
209
  if is_agent:
210
  capability_file = "data/agent_capability.json"
211
+ domain_file = "data/agent_domain.json"
212
  else:
213
  capability_file = "data/model_capability.json"
214
+ domain_file = "data/model_domain.json"
215
 
216
+ # ── 1. Update capability file ────────────────────────────────────
217
  with open(capability_file, 'r', encoding='utf-8') as f:
218
  capability_data = json.load(f)
219
 
 
220
  for capability in METRICS:
221
+ if capability not in scores_by_metric or capability not in capability_data:
222
+ continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
+ metric_data = scores_by_metric[capability]
225
+
226
+ # submissions belonging to this capability
227
+ cap_subs = [s for s in scored_submissions if s.get('metric_category') == capability]
228
+
229
+ # F1 calculated from this capability's predictions vs references
230
+ if cap_subs:
231
+ predictions = [s.get('answer', '') for s in cap_subs]
232
+ references = [s.get('reference_answer', '') for s in cap_subs]
233
+ f1 = calculate_f1_score(predictions, references)
234
+ else:
235
+ f1 = 0.0
236
+
237
+ capability_data[capability][model_name] = {
238
+ "accuracy": metric_data['accuracy'],
239
+ "model_family": model_family,
240
+ "f1": f1,
241
+ }
242
 
 
243
  with open(capability_file, 'w', encoding='utf-8') as f:
244
  json.dump(capability_data, f, indent=2, ensure_ascii=False)
 
245
  print(f"✓ Updated {capability_file}")
246
+
247
+ # ── 2. Update domain file ────────────────────────────────────────
248
+ with open(domain_file, 'r', encoding='utf-8') as f:
249
+ domain_data = json.load(f)
250
+
251
+ # Group scored_submissions by domain
252
+ from collections import defaultdict
253
+ domain_groups = defaultdict(list)
254
+ for s in scored_submissions:
255
+ dom = s.get('domain', '').strip().upper()
256
+ if dom:
257
+ domain_groups[dom].append(s)
258
+
259
+ # Known domain keys in the JSON (may differ in capitalisation/alias)
260
+ DOMAIN_KEY_MAP = {
261
+ "GAMING": "GAMING",
262
+ "GAME": "GAMING",
263
+ "EMBODIED_AI": "EMBODIED_AI",
264
+ "WEB": "WEB",
265
+ "TEXT2SQL": "TEXT2SQL",
266
+ "OPENWORLD_QA": "OPENWORLD_QA",
267
+ "SOFTWARE": "SOFTWARE",
268
+ "SOFTWARE_ENGINEER": "SOFTWARE",
269
+ }
270
+
271
+ for raw_domain, subs in domain_groups.items():
272
+ json_domain = DOMAIN_KEY_MAP.get(raw_domain)
273
+ if json_domain is None or json_domain not in domain_data:
274
+ continue # unknown domain, skip
275
+
276
+ scores = [s.get('score', 0.0) for s in subs]
277
+ accuracy = float(np.mean(scores)) if scores else 0.0
278
+
279
+ predictions = [s.get('answer', '') for s in subs]
280
+ references = [s.get('reference_answer', '') for s in subs]
281
+ f1 = calculate_f1_score(predictions, references)
282
+
283
+ domain_data[json_domain][model_name] = {
284
+ "accuracy": accuracy,
285
+ "model_family": model_family,
286
+ "f1": f1,
287
+ }
288
+
289
+ with open(domain_file, 'w', encoding='utf-8') as f:
290
+ json.dump(domain_data, f, indent=2, ensure_ascii=False)
291
+ print(f"✓ Updated {domain_file}")
292
+
293
  return True
294
 
295
  except Exception as e:
 
349
  print(f"✓ Overall accuracy: {average_accuracy:.4f}")
350
  for metric_name, metric_data in scores_by_metric.items():
351
  if metric_name != "Average":
352
+ print(f" {metric_name}: {metric_data['accuracy']:.4f} ({metric_data.get('correct', 0)}/{metric_data['count']})")
353
 
354
  # Save locally
355
  submission_dir = f"submissions/{organisation}_{model}"
 
374
  metric_name: {
375
  "accuracy": float(metric_data["accuracy"]),
376
  "count": int(metric_data["count"]),
377
+ "correct": int(metric_data.get("correct", 0))
378
  }
379
  for metric_name, metric_data in scores_by_metric.items()
380
  }
 
395
  if update_success:
396
  print("✓ Updated leaderboard JSON files")
397
  # Reload data
398
+ global AGENT_CAPABILITY, AGENT_DOMAIN, MODEL_CAPABILITY, MODEL_DOMAIN, model_domain_filtered
399
  if is_agent:
400
  AGENT_CAPABILITY = load_json_data("data/agent_capability.json")
401
  AGENT_DOMAIN = load_json_data("data/agent_domain.json")
402
  else:
403
  MODEL_CAPABILITY = load_json_data("data/model_capability.json")
404
  MODEL_DOMAIN = load_json_data("data/model_domain.json")
405
+ # Recompute filtered model domain view
406
+ _model_items = set()
407
+ for _cap_data in MODEL_CAPABILITY.values():
408
+ _model_items.update(_cap_data.keys())
409
+ model_domain_filtered = filter_data_by_items(MODEL_DOMAIN, _model_items)
410
+ if not any(len(v) > 0 for v in model_domain_filtered.values()):
411
+ model_domain_filtered = {}
412
+
413
+ # Compute per-domain scores from scored_submissions
414
+ from collections import defaultdict
415
+ domain_groups = defaultdict(list)
416
+ for s in scored_submissions:
417
+ dom = s.get("domain", "").strip().upper()
418
+ if dom:
419
+ domain_groups[dom].append(s)
420
+
421
+ domain_scores = {}
422
+ for dom, subs in sorted(domain_groups.items()):
423
+ scores_list = [s.get("score", 0.0) for s in subs]
424
+ preds = [s.get("answer", "") for s in subs]
425
+ refs = [s.get("reference_answer", "") for s in subs]
426
+ domain_scores[dom] = {
427
+ "accuracy": float(np.mean(scores_list)) if scores_list else 0.0,
428
+ "f1": calculate_f1_score(preds, refs),
429
+ "correct": int(sum(scores_list)),
430
+ "count": len(scores_list),
431
+ }
432
 
433
  # Format message
434
  message = f"✅ **Submission successful!**\n\n"
435
  message += f"**{'Agent' if is_agent else 'Model'}:** {model}\n"
436
  message += f"**Organisation:** {organisation}\n"
437
  message += f"**Overall Accuracy:** {average_accuracy:.4f}\n\n"
438
+
439
  message += "**Scores by Capability:**\n"
440
  for metric_name in METRICS:
441
  if metric_name in scores_by_metric:
442
+ md = scores_by_metric[metric_name]
443
+ # compute F1 for this capability
444
+ cap_subs = [s for s in scored_submissions if s.get("metric_category") == metric_name]
445
+ if cap_subs:
446
+ preds = [s.get("answer", "") for s in cap_subs]
447
+ refs = [s.get("reference_answer", "") for s in cap_subs]
448
+ cap_f1 = calculate_f1_score(preds, refs)
449
+ else:
450
+ cap_f1 = 0.0
451
+ message += (
452
+ f"- **{metric_name}:** Accuracy {md['accuracy']:.4f}"
453
+ f" ({md.get('correct', 0)}/{md['count']})"
454
+ f", F1 {cap_f1:.4f}\n"
455
+ )
456
+
457
+ if domain_scores:
458
+ message += "\n**Scores by Domain:**\n"
459
+ for dom, ds in domain_scores.items():
460
+ message += (
461
+ f"- **{dom}:** Accuracy {ds['accuracy']:.4f}"
462
+ f" ({ds['correct']}/{ds['count']})"
463
+ f", F1 {ds['f1']:.4f}\n"
464
+ )
465
 
466
  message += f"\n**Submission ID:** {timestamp}\n"
467
  if update_success:
 
1097
  gr.Markdown("""
1098
  **Submission Format:**
1099
 
1100
+ Your JSONL file should contain one question-answer pair per line:
1101
  ```json
1102
+ {"episode_id": "ep_001", "question": "What is X?", "answer": "your answer"}
1103
+ {"episode_id": "ep_001", "question": "What happened next?", "answer": "another answer"}
1104
+ {"episode_id": "ep_002", "question": "What is the goal?", "answer": "yet another answer"}
1105
  ```
1106
 
1107
  **Required fields:**
1108
  - `episode_id`: Episode identifier
1109
+ - `question`: Question text (must match exactly the question in the dataset)
1110
+ - `answer`: Your model's predicted free-form string answer
1111
+
1112
+ Each `episode_id` + `question` pair must be unique. Answers are evaluated by case-insensitive exact string match.
1113
  """)
1114
 
1115
  with gr.Row():
 
1187
 
1188
  if __name__ == "__main__":
1189
  demo_app = build_app()
1190
+ demo_app.launch(debug=True, show_error=True)
data/agent_capability.json CHANGED
@@ -66,7 +66,7 @@
66
  "f1": 0.4152833333333333
67
  }
68
  },
69
- "Casual Inference": {
70
  "Qwen3-Embedding-4B": {
71
  "accuracy": 0.48618333333333336,
72
  "model_family": "Qwen3-32B",
@@ -131,6 +131,11 @@
131
  "accuracy": 0.5399999999999999,
132
  "model_family": "Qwen3-32B",
133
  "f1": 0.34326666666666666
 
 
 
 
 
134
  }
135
  },
136
  "State Updating": {
@@ -198,9 +203,14 @@
198
  "accuracy": 0.48335,
199
  "model_family": "Qwen3-32B",
200
  "f1": 0.3447166666666666
 
 
 
 
 
201
  }
202
  },
203
- "State abstraction": {
204
  "Qwen3-Embedding-4B": {
205
  "accuracy": 0.3022666666666667,
206
  "model_family": "Qwen3-32B",
@@ -265,6 +275,11 @@
265
  "accuracy": 0.37979999999999997,
266
  "model_family": "Qwen3-32B",
267
  "f1": 0.3152333333333333
 
 
 
 
 
268
  }
269
  }
270
  }
 
66
  "f1": 0.4152833333333333
67
  }
68
  },
69
+ "Causal Inference": {
70
  "Qwen3-Embedding-4B": {
71
  "accuracy": 0.48618333333333336,
72
  "model_family": "Qwen3-32B",
 
131
  "accuracy": 0.5399999999999999,
132
  "model_family": "Qwen3-32B",
133
  "f1": 0.34326666666666666
134
+ },
135
+ "1": {
136
+ "accuracy": 0.0,
137
+ "model_family": "1",
138
+ "f1": 0.0
139
  }
140
  },
141
  "State Updating": {
 
203
  "accuracy": 0.48335,
204
  "model_family": "Qwen3-32B",
205
  "f1": 0.3447166666666666
206
+ },
207
+ "1": {
208
+ "accuracy": 1.0,
209
+ "model_family": "1",
210
+ "f1": 1.0
211
  }
212
  },
213
+ "State Abstraction": {
214
  "Qwen3-Embedding-4B": {
215
  "accuracy": 0.3022666666666667,
216
  "model_family": "Qwen3-32B",
 
275
  "accuracy": 0.37979999999999997,
276
  "model_family": "Qwen3-32B",
277
  "f1": 0.3152333333333333
278
+ },
279
+ "1": {
280
+ "accuracy": 0.0,
281
+ "model_family": "1",
282
+ "f1": 0.0
283
  }
284
  }
285
  }
data/model_capability.json CHANGED
@@ -145,7 +145,7 @@
145
  "f1": 0.3065
146
  }
147
  },
148
- "Casual Inference": {
149
  "Claude Haiku 3.5": {
150
  "accuracy": 0.4799333333333333,
151
  "f1": 0.29278333333333334
@@ -289,6 +289,11 @@
289
  "AMA-agent (Ours) (8B)": {
290
  "accuracy": 0.4806166666666667,
291
  "f1": 0.23224999999999998
 
 
 
 
 
292
  }
293
  },
294
  "State Updating": {
@@ -435,9 +440,14 @@
435
  "AMA-agent (Ours) (8B)": {
436
  "accuracy": 0.43645,
437
  "f1": 0.21893333333333334
 
 
 
 
 
438
  }
439
  },
440
- "State abstraction": {
441
  "Claude Haiku 3.5": {
442
  "accuracy": 0.32758333333333334,
443
  "f1": 0.2684166666666667
@@ -581,6 +591,11 @@
581
  "AMA-agent (Ours) (8B)": {
582
  "accuracy": 0.37873333333333337,
583
  "f1": 0.21493333333333334
 
 
 
 
 
584
  }
585
  }
586
  }
 
145
  "f1": 0.3065
146
  }
147
  },
148
+ "Causal Inference": {
149
  "Claude Haiku 3.5": {
150
  "accuracy": 0.4799333333333333,
151
  "f1": 0.29278333333333334
 
289
  "AMA-agent (Ours) (8B)": {
290
  "accuracy": 0.4806166666666667,
291
  "f1": 0.23224999999999998
292
+ },
293
+ "1": {
294
+ "accuracy": 0.0,
295
+ "model_family": "1",
296
+ "f1": 0.0
297
  }
298
  },
299
  "State Updating": {
 
440
  "AMA-agent (Ours) (8B)": {
441
  "accuracy": 0.43645,
442
  "f1": 0.21893333333333334
443
+ },
444
+ "1": {
445
+ "accuracy": 1.0,
446
+ "model_family": "1",
447
+ "f1": 1.0
448
  }
449
  },
450
+ "State Abstraction": {
451
  "Claude Haiku 3.5": {
452
  "accuracy": 0.32758333333333334,
453
  "f1": 0.2684166666666667
 
591
  "AMA-agent (Ours) (8B)": {
592
  "accuracy": 0.37873333333333337,
593
  "f1": 0.21493333333333334
594
+ },
595
+ "1": {
596
+ "accuracy": 0.0,
597
+ "model_family": "1",
598
+ "f1": 0.0
599
  }
600
  }
601
  }
data/model_domain.json CHANGED
@@ -1,401 +1,401 @@
1
  {
2
  "GAMING": {
3
- "Qwen3-Embedding-4B": {
4
  "accuracy": 0.5157,
5
  "model_family": "Qwen3-32B",
6
  "f1": 0.2195
7
  },
8
- "GRAPHRAG": {
9
  "accuracy": 0.5595249999999999,
10
  "model_family": "Qwen3-32B",
11
  "f1": 0.288175
12
  },
13
- "Hipporag2": {
14
  "accuracy": 0.60555,
15
  "model_family": "Qwen3-32B",
16
  "f1": 0.2273
17
  },
18
- "Memagent": {
19
  "accuracy": 0.31775,
20
  "model_family": "Qwen3-32B",
21
  "f1": 0.22945
22
  },
23
- "Mem1": {
24
  "accuracy": 0.225875,
25
  "model_family": "Qwen3-32B",
26
  "f1": 0.18155
27
  },
28
- "Amem": {
29
  "accuracy": 0.4247,
30
  "model_family": "Qwen3-32B",
31
  "f1": 0.343125
32
  },
33
- "Mem0": {
34
  "accuracy": 0.39085000000000003,
35
  "model_family": "Qwen3-32B",
36
  "f1": 0.346
37
  },
38
- "Memorag": {
39
  "accuracy": 0.557625,
40
  "model_family": "Qwen3-32B",
41
  "f1": 0.257875
42
  },
43
- "Memgpt": {
44
  "accuracy": 0.435425,
45
  "model_family": "Qwen3-32B",
46
  "f1": 0.318475
47
  },
48
- "Mem-alpha": {
49
  "accuracy": 0.43895,
50
  "model_family": "Qwen3-32B",
51
  "f1": 0.319875
52
  },
53
- "Memorybank": {
54
  "accuracy": 0.43885,
55
  "model_family": "Qwen3-32B",
56
  "f1": 0.325325
57
  },
58
- "Simple mem": {
59
  "accuracy": 0.288775,
60
  "model_family": "Qwen3-32B",
61
  "f1": 0.163
62
  },
63
- "Long context": {
64
  "accuracy": 0.5355,
65
  "model_family": "Qwen3-32B",
66
  "f1": 0.321775
67
  }
68
  },
69
  "EMBODIED_AI": {
70
- "Qwen3-Embedding-4B": {
71
  "accuracy": 0.204325,
72
  "model_family": "Qwen3-32B",
73
  "f1": 0.1353
74
  },
75
- "GRAPHRAG": {
76
  "accuracy": 0.1476,
77
  "model_family": "Qwen3-32B",
78
  "f1": 0.3799
79
  },
80
- "Hipporag2": {
81
  "accuracy": 0.17627500000000002,
82
  "model_family": "Qwen3-32B",
83
  "f1": 0.181875
84
  },
85
- "Memagent": {
86
  "accuracy": 0.10617499999999999,
87
  "model_family": "Qwen3-32B",
88
  "f1": 0.144975
89
  },
90
- "Mem1": {
91
  "accuracy": 0.03355,
92
  "model_family": "Qwen3-32B",
93
  "f1": 0.22445
94
  },
95
- "Amem": {
96
  "accuracy": 0.183975,
97
  "model_family": "Qwen3-32B",
98
  "f1": 0.3524
99
  },
100
- "Mem0": {
101
  "accuracy": 0.11109999999999999,
102
  "model_family": "Qwen3-32B",
103
  "f1": 0.27005
104
  },
105
- "Memorag": {
106
  "accuracy": 0.085425,
107
  "model_family": "Qwen3-32B",
108
  "f1": 0.17677500000000002
109
  },
110
- "Memgpt": {
111
  "accuracy": 0.1122,
112
  "model_family": "Qwen3-32B",
113
  "f1": 0.10405
114
  },
115
- "Mem-alpha": {
116
  "accuracy": 0.15515,
117
  "model_family": "Qwen3-32B",
118
  "f1": 0.23735
119
  },
120
- "Memorybank": {
121
  "accuracy": 0.16025,
122
  "model_family": "Qwen3-32B",
123
  "f1": 0.426475
124
  },
125
- "Simple mem": {
126
  "accuracy": 0.045975,
127
  "model_family": "Qwen3-32B",
128
  "f1": 0.2284
129
  },
130
- "Long context": {
131
  "accuracy": 0.48185,
132
  "model_family": "Qwen3-32B",
133
  "f1": 0.56
134
  }
135
  },
136
  "WEB": {
137
- "Qwen3-Embedding-4B": {
138
  "accuracy": 0.2872,
139
  "model_family": "Qwen3-32B",
140
  "f1": 0.08535000000000001
141
  },
142
- "GRAPHRAG": {
143
  "accuracy": 0.420675,
144
  "model_family": "Qwen3-32B",
145
  "f1": 0.268075
146
  },
147
- "Hipporag2": {
148
  "accuracy": 0.3761,
149
  "model_family": "Qwen3-32B",
150
  "f1": 0.120125
151
  },
152
- "Memagent": {
153
  "accuracy": 0.263975,
154
  "model_family": "Qwen3-32B",
155
  "f1": 0.09065
156
  },
157
- "Mem1": {
158
  "accuracy": 0.131275,
159
  "model_family": "Qwen3-32B",
160
  "f1": 0.1518
161
  },
162
- "Amem": {
163
  "accuracy": 0.391525,
164
  "model_family": "Qwen3-32B",
165
  "f1": 0.2294
166
  },
167
- "Mem0": {
168
  "accuracy": 0.2705,
169
  "model_family": "Qwen3-32B",
170
  "f1": 0.21675
171
  },
172
- "Memorag": {
173
  "accuracy": 0.364975,
174
  "model_family": "Qwen3-32B",
175
  "f1": 0.108075
176
  },
177
- "Memgpt": {
178
  "accuracy": 0.327975,
179
  "model_family": "Qwen3-32B",
180
  "f1": 0.07105
181
  },
182
- "Mem-alpha": {
183
  "accuracy": 0.362925,
184
  "model_family": "Qwen3-32B",
185
  "f1": 0.15944999999999998
186
  },
187
- "Memorybank": {
188
  "accuracy": 0.401775,
189
  "model_family": "Qwen3-32B",
190
  "f1": 0.23704999999999998
191
  },
192
- "Simple mem": {
193
  "accuracy": 0.13974999999999999,
194
  "model_family": "Qwen3-32B",
195
  "f1": 0.1679
196
  },
197
- "Long context": {
198
  "accuracy": 0.554275,
199
  "model_family": "Qwen3-32B",
200
  "f1": 0.348075
201
  }
202
  },
203
  "TEXT2SQL": {
204
- "Qwen3-Embedding-4B": {
205
  "accuracy": 0.4164,
206
  "model_family": "Qwen3-32B",
207
  "f1": 0.249325
208
  },
209
- "GRAPHRAG": {
210
  "accuracy": 0.21665,
211
  "model_family": "Qwen3-32B",
212
  "f1": 0.221675
213
  },
214
- "Hipporag2": {
215
  "accuracy": 0.46267499999999995,
216
  "model_family": "Qwen3-32B",
217
  "f1": 0.26935
218
  },
219
- "Memagent": {
220
  "accuracy": 0.245375,
221
  "model_family": "Qwen3-32B",
222
  "f1": 0.245375
223
  },
224
- "Mem1": {
225
  "accuracy": 0.06465,
226
  "model_family": "Qwen3-32B",
227
  "f1": 0.19990000000000002
228
  },
229
- "Amem": {
230
  "accuracy": 0.31405,
231
  "model_family": "Qwen3-32B",
232
  "f1": 0.289625
233
  },
234
- "Mem0": {
235
  "accuracy": 0.1192,
236
  "model_family": "Qwen3-32B",
237
  "f1": 0.2326
238
  },
239
- "Memorag": {
240
  "accuracy": 0.619,
241
  "model_family": "Qwen3-32B",
242
  "f1": 0.296475
243
  },
244
- "Memgpt": {
245
  "accuracy": 0.206875,
246
  "model_family": "Qwen3-32B",
247
  "f1": 0.178975
248
  },
249
- "Mem-alpha": {
250
  "accuracy": 0.30065,
251
  "model_family": "Qwen3-32B",
252
  "f1": 0.26505
253
  },
254
- "Memorybank": {
255
  "accuracy": 0.23855,
256
  "model_family": "Qwen3-32B",
257
  "f1": 0.28355
258
  },
259
- "Simple mem": {
260
  "accuracy": 0.192575,
261
  "model_family": "Qwen3-32B",
262
  "f1": 0.157225
263
  },
264
- "Long context": {
265
  "accuracy": 0.456075,
266
  "model_family": "Qwen3-32B",
267
  "f1": 0.295275
268
  }
269
  },
270
  "OPENWORLD_QA": {
271
- "Qwen3-Embedding-4B": {
272
  "accuracy": 0.399125,
273
  "model_family": "Qwen3-32B",
274
  "f1": 0.0837
275
  },
276
- "GRAPHRAG": {
277
  "accuracy": 0.31845,
278
  "model_family": "Qwen3-32B",
279
  "f1": 0.22635
280
  },
281
- "Hipporag2": {
282
  "accuracy": 0.45825,
283
  "model_family": "Qwen3-32B",
284
  "f1": 0.2362
285
  },
286
- "Memagent": {
287
  "accuracy": 0.158225,
288
  "model_family": "Qwen3-32B",
289
  "f1": 0.0704
290
  },
291
- "Mem1": {
292
  "accuracy": 0.12065000000000001,
293
  "model_family": "Qwen3-32B",
294
  "f1": 0.15005
295
  },
296
- "Amem": {
297
  "accuracy": 0.29359999999999997,
298
  "model_family": "Qwen3-32B",
299
  "f1": 0.2079
300
  },
301
- "Mem0": {
302
  "accuracy": 0.16197499999999998,
303
  "model_family": "Qwen3-32B",
304
  "f1": 0.1604
305
  },
306
- "Memorag": {
307
  "accuracy": 0.411375,
308
  "model_family": "Qwen3-32B",
309
  "f1": 0.093675
310
  },
311
- "Memgpt": {
312
  "accuracy": 0.3155,
313
  "model_family": "Qwen3-32B",
314
  "f1": 0.0595
315
  },
316
- "Mem-alpha": {
317
  "accuracy": 0.2301,
318
  "model_family": "Qwen3-32B",
319
  "f1": 0.13345
320
  },
321
- "Memorybank": {
322
  "accuracy": 0.3486,
323
  "model_family": "Qwen3-32B",
324
  "f1": 0.2519
325
  },
326
- "Simple mem": {
327
  "accuracy": 0.12154999999999999,
328
  "model_family": "Qwen3-32B",
329
  "f1": 0.1312
330
  },
331
- "Long context": {
332
  "accuracy": 0.49785,
333
  "model_family": "Qwen3-32B",
334
  "f1": 0.3349
335
  }
336
  },
337
  "SOFTWARE": {
338
- "Qwen3-Embedding-4B": {
339
  "accuracy": 0.599025,
340
  "model_family": "Qwen3-32B",
341
  "f1": 0.083575
342
  },
343
- "GRAPHRAG": {
344
  "accuracy": 0.348875,
345
  "model_family": "Qwen3-32B",
346
  "f1": 0.229825
347
  },
348
- "Hipporag2": {
349
  "accuracy": 0.5299,
350
  "model_family": "Qwen3-32B",
351
  "f1": 0.1279
352
  },
353
- "Memagent": {
354
  "accuracy": 0.53965,
355
  "model_family": "Qwen3-32B",
356
  "f1": 0.09085
357
  },
358
- "Mem1": {
359
  "accuracy": 0.18595,
360
  "model_family": "Qwen3-32B",
361
  "f1": 0.17527500000000001
362
  },
363
- "Amem": {
364
  "accuracy": 0.29615,
365
  "model_family": "Qwen3-32B",
366
  "f1": 0.20395
367
  },
368
- "Mem0": {
369
  "accuracy": 0.2366,
370
  "model_family": "Qwen3-32B",
371
  "f1": 0.176975
372
  },
373
- "Memorag": {
374
  "accuracy": 0.55005,
375
  "model_family": "Qwen3-32B",
376
  "f1": 0.10707499999999999
377
  },
378
- "Memgpt": {
379
  "accuracy": 0.599125,
380
  "model_family": "Qwen3-32B",
381
  "f1": 0.066575
382
  },
383
- "Mem-alpha": {
384
  "accuracy": 0.3476,
385
  "model_family": "Qwen3-32B",
386
  "f1": 0.12492500000000001
387
  },
388
- "Memorybank": {
389
  "accuracy": 0.5072,
390
  "model_family": "Qwen3-32B",
391
  "f1": 0.240875
392
  },
393
- "Simple mem": {
394
  "accuracy": 0.2431,
395
  "model_family": "Qwen3-32B",
396
  "f1": 0.2005
397
  },
398
- "Long context": {
399
  "accuracy": 0.4847,
400
  "model_family": "Qwen3-32B",
401
  "f1": 0.267725
 
1
  {
2
  "GAMING": {
3
+ "Qwen3-Embedding-4B (32B)": {
4
  "accuracy": 0.5157,
5
  "model_family": "Qwen3-32B",
6
  "f1": 0.2195
7
  },
8
+ "GRAPHRAG (32B)": {
9
  "accuracy": 0.5595249999999999,
10
  "model_family": "Qwen3-32B",
11
  "f1": 0.288175
12
  },
13
+ "Hipporag2 (32B)": {
14
  "accuracy": 0.60555,
15
  "model_family": "Qwen3-32B",
16
  "f1": 0.2273
17
  },
18
+ "Memagent (32B)": {
19
  "accuracy": 0.31775,
20
  "model_family": "Qwen3-32B",
21
  "f1": 0.22945
22
  },
23
+ "Mem1 (32B)": {
24
  "accuracy": 0.225875,
25
  "model_family": "Qwen3-32B",
26
  "f1": 0.18155
27
  },
28
+ "Amem (32B)": {
29
  "accuracy": 0.4247,
30
  "model_family": "Qwen3-32B",
31
  "f1": 0.343125
32
  },
33
+ "Mem0 (32B)": {
34
  "accuracy": 0.39085000000000003,
35
  "model_family": "Qwen3-32B",
36
  "f1": 0.346
37
  },
38
+ "Memorag (32B)": {
39
  "accuracy": 0.557625,
40
  "model_family": "Qwen3-32B",
41
  "f1": 0.257875
42
  },
43
+ "Memgpt (32B)": {
44
  "accuracy": 0.435425,
45
  "model_family": "Qwen3-32B",
46
  "f1": 0.318475
47
  },
48
+ "Mem-alpha (32B)": {
49
  "accuracy": 0.43895,
50
  "model_family": "Qwen3-32B",
51
  "f1": 0.319875
52
  },
53
+ "Memorybank (32B)": {
54
  "accuracy": 0.43885,
55
  "model_family": "Qwen3-32B",
56
  "f1": 0.325325
57
  },
58
+ "Simple mem (32B)": {
59
  "accuracy": 0.288775,
60
  "model_family": "Qwen3-32B",
61
  "f1": 0.163
62
  },
63
+ "Long context (32B)": {
64
  "accuracy": 0.5355,
65
  "model_family": "Qwen3-32B",
66
  "f1": 0.321775
67
  }
68
  },
69
  "EMBODIED_AI": {
70
+ "Qwen3-Embedding-4B (32B)": {
71
  "accuracy": 0.204325,
72
  "model_family": "Qwen3-32B",
73
  "f1": 0.1353
74
  },
75
+ "GRAPHRAG (32B)": {
76
  "accuracy": 0.1476,
77
  "model_family": "Qwen3-32B",
78
  "f1": 0.3799
79
  },
80
+ "Hipporag2 (32B)": {
81
  "accuracy": 0.17627500000000002,
82
  "model_family": "Qwen3-32B",
83
  "f1": 0.181875
84
  },
85
+ "Memagent (32B)": {
86
  "accuracy": 0.10617499999999999,
87
  "model_family": "Qwen3-32B",
88
  "f1": 0.144975
89
  },
90
+ "Mem1 (32B)": {
91
  "accuracy": 0.03355,
92
  "model_family": "Qwen3-32B",
93
  "f1": 0.22445
94
  },
95
+ "Amem (32B)": {
96
  "accuracy": 0.183975,
97
  "model_family": "Qwen3-32B",
98
  "f1": 0.3524
99
  },
100
+ "Mem0 (32B)": {
101
  "accuracy": 0.11109999999999999,
102
  "model_family": "Qwen3-32B",
103
  "f1": 0.27005
104
  },
105
+ "Memorag (32B)": {
106
  "accuracy": 0.085425,
107
  "model_family": "Qwen3-32B",
108
  "f1": 0.17677500000000002
109
  },
110
+ "Memgpt (32B)": {
111
  "accuracy": 0.1122,
112
  "model_family": "Qwen3-32B",
113
  "f1": 0.10405
114
  },
115
+ "Mem-alpha (32B)": {
116
  "accuracy": 0.15515,
117
  "model_family": "Qwen3-32B",
118
  "f1": 0.23735
119
  },
120
+ "Memorybank (32B)": {
121
  "accuracy": 0.16025,
122
  "model_family": "Qwen3-32B",
123
  "f1": 0.426475
124
  },
125
+ "Simple mem (32B)": {
126
  "accuracy": 0.045975,
127
  "model_family": "Qwen3-32B",
128
  "f1": 0.2284
129
  },
130
+ "Long context (32B)": {
131
  "accuracy": 0.48185,
132
  "model_family": "Qwen3-32B",
133
  "f1": 0.56
134
  }
135
  },
136
  "WEB": {
137
+ "Qwen3-Embedding-4B (32B)": {
138
  "accuracy": 0.2872,
139
  "model_family": "Qwen3-32B",
140
  "f1": 0.08535000000000001
141
  },
142
+ "GRAPHRAG (32B)": {
143
  "accuracy": 0.420675,
144
  "model_family": "Qwen3-32B",
145
  "f1": 0.268075
146
  },
147
+ "Hipporag2 (32B)": {
148
  "accuracy": 0.3761,
149
  "model_family": "Qwen3-32B",
150
  "f1": 0.120125
151
  },
152
+ "Memagent (32B)": {
153
  "accuracy": 0.263975,
154
  "model_family": "Qwen3-32B",
155
  "f1": 0.09065
156
  },
157
+ "Mem1 (32B)": {
158
  "accuracy": 0.131275,
159
  "model_family": "Qwen3-32B",
160
  "f1": 0.1518
161
  },
162
+ "Amem (32B)": {
163
  "accuracy": 0.391525,
164
  "model_family": "Qwen3-32B",
165
  "f1": 0.2294
166
  },
167
+ "Mem0 (32B)": {
168
  "accuracy": 0.2705,
169
  "model_family": "Qwen3-32B",
170
  "f1": 0.21675
171
  },
172
+ "Memorag (32B)": {
173
  "accuracy": 0.364975,
174
  "model_family": "Qwen3-32B",
175
  "f1": 0.108075
176
  },
177
+ "Memgpt (32B)": {
178
  "accuracy": 0.327975,
179
  "model_family": "Qwen3-32B",
180
  "f1": 0.07105
181
  },
182
+ "Mem-alpha (32B)": {
183
  "accuracy": 0.362925,
184
  "model_family": "Qwen3-32B",
185
  "f1": 0.15944999999999998
186
  },
187
+ "Memorybank (32B)": {
188
  "accuracy": 0.401775,
189
  "model_family": "Qwen3-32B",
190
  "f1": 0.23704999999999998
191
  },
192
+ "Simple mem (32B)": {
193
  "accuracy": 0.13974999999999999,
194
  "model_family": "Qwen3-32B",
195
  "f1": 0.1679
196
  },
197
+ "Long context (32B)": {
198
  "accuracy": 0.554275,
199
  "model_family": "Qwen3-32B",
200
  "f1": 0.348075
201
  }
202
  },
203
  "TEXT2SQL": {
204
+ "Qwen3-Embedding-4B (32B)": {
205
  "accuracy": 0.4164,
206
  "model_family": "Qwen3-32B",
207
  "f1": 0.249325
208
  },
209
+ "GRAPHRAG (32B)": {
210
  "accuracy": 0.21665,
211
  "model_family": "Qwen3-32B",
212
  "f1": 0.221675
213
  },
214
+ "Hipporag2 (32B)": {
215
  "accuracy": 0.46267499999999995,
216
  "model_family": "Qwen3-32B",
217
  "f1": 0.26935
218
  },
219
+ "Memagent (32B)": {
220
  "accuracy": 0.245375,
221
  "model_family": "Qwen3-32B",
222
  "f1": 0.245375
223
  },
224
+ "Mem1 (32B)": {
225
  "accuracy": 0.06465,
226
  "model_family": "Qwen3-32B",
227
  "f1": 0.19990000000000002
228
  },
229
+ "Amem (32B)": {
230
  "accuracy": 0.31405,
231
  "model_family": "Qwen3-32B",
232
  "f1": 0.289625
233
  },
234
+ "Mem0 (32B)": {
235
  "accuracy": 0.1192,
236
  "model_family": "Qwen3-32B",
237
  "f1": 0.2326
238
  },
239
+ "Memorag (32B)": {
240
  "accuracy": 0.619,
241
  "model_family": "Qwen3-32B",
242
  "f1": 0.296475
243
  },
244
+ "Memgpt (32B)": {
245
  "accuracy": 0.206875,
246
  "model_family": "Qwen3-32B",
247
  "f1": 0.178975
248
  },
249
+ "Mem-alpha (32B)": {
250
  "accuracy": 0.30065,
251
  "model_family": "Qwen3-32B",
252
  "f1": 0.26505
253
  },
254
+ "Memorybank (32B)": {
255
  "accuracy": 0.23855,
256
  "model_family": "Qwen3-32B",
257
  "f1": 0.28355
258
  },
259
+ "Simple mem (32B)": {
260
  "accuracy": 0.192575,
261
  "model_family": "Qwen3-32B",
262
  "f1": 0.157225
263
  },
264
+ "Long context (32B)": {
265
  "accuracy": 0.456075,
266
  "model_family": "Qwen3-32B",
267
  "f1": 0.295275
268
  }
269
  },
270
  "OPENWORLD_QA": {
271
+ "Qwen3-Embedding-4B (32B)": {
272
  "accuracy": 0.399125,
273
  "model_family": "Qwen3-32B",
274
  "f1": 0.0837
275
  },
276
+ "GRAPHRAG (32B)": {
277
  "accuracy": 0.31845,
278
  "model_family": "Qwen3-32B",
279
  "f1": 0.22635
280
  },
281
+ "Hipporag2 (32B)": {
282
  "accuracy": 0.45825,
283
  "model_family": "Qwen3-32B",
284
  "f1": 0.2362
285
  },
286
+ "Memagent (32B)": {
287
  "accuracy": 0.158225,
288
  "model_family": "Qwen3-32B",
289
  "f1": 0.0704
290
  },
291
+ "Mem1 (32B)": {
292
  "accuracy": 0.12065000000000001,
293
  "model_family": "Qwen3-32B",
294
  "f1": 0.15005
295
  },
296
+ "Amem (32B)": {
297
  "accuracy": 0.29359999999999997,
298
  "model_family": "Qwen3-32B",
299
  "f1": 0.2079
300
  },
301
+ "Mem0 (32B)": {
302
  "accuracy": 0.16197499999999998,
303
  "model_family": "Qwen3-32B",
304
  "f1": 0.1604
305
  },
306
+ "Memorag (32B)": {
307
  "accuracy": 0.411375,
308
  "model_family": "Qwen3-32B",
309
  "f1": 0.093675
310
  },
311
+ "Memgpt (32B)": {
312
  "accuracy": 0.3155,
313
  "model_family": "Qwen3-32B",
314
  "f1": 0.0595
315
  },
316
+ "Mem-alpha (32B)": {
317
  "accuracy": 0.2301,
318
  "model_family": "Qwen3-32B",
319
  "f1": 0.13345
320
  },
321
+ "Memorybank (32B)": {
322
  "accuracy": 0.3486,
323
  "model_family": "Qwen3-32B",
324
  "f1": 0.2519
325
  },
326
+ "Simple mem (32B)": {
327
  "accuracy": 0.12154999999999999,
328
  "model_family": "Qwen3-32B",
329
  "f1": 0.1312
330
  },
331
+ "Long context (32B)": {
332
  "accuracy": 0.49785,
333
  "model_family": "Qwen3-32B",
334
  "f1": 0.3349
335
  }
336
  },
337
  "SOFTWARE": {
338
+ "Qwen3-Embedding-4B (32B)": {
339
  "accuracy": 0.599025,
340
  "model_family": "Qwen3-32B",
341
  "f1": 0.083575
342
  },
343
+ "GRAPHRAG (32B)": {
344
  "accuracy": 0.348875,
345
  "model_family": "Qwen3-32B",
346
  "f1": 0.229825
347
  },
348
+ "Hipporag2 (32B)": {
349
  "accuracy": 0.5299,
350
  "model_family": "Qwen3-32B",
351
  "f1": 0.1279
352
  },
353
+ "Memagent (32B)": {
354
  "accuracy": 0.53965,
355
  "model_family": "Qwen3-32B",
356
  "f1": 0.09085
357
  },
358
+ "Mem1 (32B)": {
359
  "accuracy": 0.18595,
360
  "model_family": "Qwen3-32B",
361
  "f1": 0.17527500000000001
362
  },
363
+ "Amem (32B)": {
364
  "accuracy": 0.29615,
365
  "model_family": "Qwen3-32B",
366
  "f1": 0.20395
367
  },
368
+ "Mem0 (32B)": {
369
  "accuracy": 0.2366,
370
  "model_family": "Qwen3-32B",
371
  "f1": 0.176975
372
  },
373
+ "Memorag (32B)": {
374
  "accuracy": 0.55005,
375
  "model_family": "Qwen3-32B",
376
  "f1": 0.10707499999999999
377
  },
378
+ "Memgpt (32B)": {
379
  "accuracy": 0.599125,
380
  "model_family": "Qwen3-32B",
381
  "f1": 0.066575
382
  },
383
+ "Mem-alpha (32B)": {
384
  "accuracy": 0.3476,
385
  "model_family": "Qwen3-32B",
386
  "f1": 0.12492500000000001
387
  },
388
+ "Memorybank (32B)": {
389
  "accuracy": 0.5072,
390
  "model_family": "Qwen3-32B",
391
  "f1": 0.240875
392
  },
393
+ "Simple mem (32B)": {
394
  "accuracy": 0.2431,
395
  "model_family": "Qwen3-32B",
396
  "f1": 0.2005
397
  },
398
+ "Long context (32B)": {
399
  "accuracy": 0.4847,
400
  "model_family": "Qwen3-32B",
401
  "f1": 0.267725
scorer.py CHANGED
@@ -1,55 +1,31 @@
1
  """
2
  Scoring functions for AMA-Bench submissions.
3
 
4
- This module implements evaluation logic for multiple-choice questions,
5
- calculating accuracy by comparing uppercase letters in answers.
6
  """
7
 
8
  import re
9
  from typing import Union, List, Dict
10
 
11
 
12
- def extract_uppercase_letters(text: str) -> str:
13
- """
14
- Extract all uppercase letters from text.
15
-
16
- Used for multiple-choice answer comparison where answers are like
17
- "A", "B", "AB", "ACD", etc.
18
-
19
- Args:
20
- text: Input text containing answer choices
21
-
22
- Returns:
23
- String of uppercase letters only, sorted alphabetically
24
- """
25
- if not isinstance(text, str):
26
- text = str(text)
27
-
28
- # Extract all uppercase letters
29
- letters = [c for c in text if c.isupper() and c.isalpha()]
30
-
31
- # Sort and join to ensure consistent ordering
32
- return ''.join(sorted(set(letters)))
33
 
34
 
35
- def multiple_choice_accuracy(prediction: str, reference: str) -> float:
36
  """
37
- Calculate accuracy for multiple-choice answers.
38
-
39
- Compares uppercase letters extracted from both prediction and reference.
40
- Returns 1.0 if they match exactly, 0.0 otherwise.
41
 
42
  Args:
43
- prediction: Model's predicted answer
44
- reference: Ground truth reference answer
45
 
46
  Returns:
47
- 1.0 if exact match, 0.0 otherwise
48
  """
49
- pred_letters = extract_uppercase_letters(prediction)
50
- ref_letters = extract_uppercase_letters(reference)
51
-
52
- return 1.0 if pred_letters == ref_letters else 0.0
53
 
54
 
55
  def calculate_accuracy(scores: List[float]) -> Dict[str, float]:
@@ -63,7 +39,7 @@ def calculate_accuracy(scores: List[float]) -> Dict[str, float]:
63
  Dictionary with accuracy metric
64
  """
65
  if not scores:
66
- return {"accuracy": 0.0, "count": 0}
67
 
68
  import numpy as np
69
 
@@ -97,6 +73,10 @@ def score_submission(
97
  "Causal": "Causal Inference",
98
  "State": "State Updating",
99
  "Abstraction": "State Abstraction",
 
 
 
 
100
  }
101
 
102
  # Initialize scores by metric
@@ -128,8 +108,8 @@ def score_submission(
128
  reference = gt_info["answer"]
129
  qa_type = gt_info.get("type", "Recall")
130
 
131
- # Calculate accuracy
132
- score = multiple_choice_accuracy(answer, reference)
133
 
134
  # Map question type to metric category
135
  metric_category = "Recall" # default
@@ -150,6 +130,7 @@ def score_submission(
150
  "score": score,
151
  "reference_answer": reference,
152
  "metric_category": metric_category,
 
153
  })
154
 
155
  # Calculate metrics for each category
@@ -163,4 +144,4 @@ def score_submission(
163
  return {
164
  "scores": results,
165
  "scored_submissions": scored_submissions,
166
- }
 
1
  """
2
  Scoring functions for AMA-Bench submissions.
3
 
4
+ This module implements evaluation logic for string answers,
5
+ calculating accuracy by exact string match (case-insensitive).
6
  """
7
 
8
  import re
9
  from typing import Union, List, Dict
10
 
11
 
12
+ def normalize_answer(text: str) -> str:
13
+ """Normalize answer string for comparison (lowercase, strip whitespace)."""
14
+ return str(text).strip().lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
 
17
+ def string_exact_match(prediction: str, reference: str) -> float:
18
  """
19
+ Calculate accuracy for string answers using exact match.
 
 
 
20
 
21
  Args:
22
+ prediction: Model's predicted answer string
23
+ reference: Ground truth reference answer string
24
 
25
  Returns:
26
+ 1.0 if normalized strings match exactly, 0.0 otherwise
27
  """
28
+ return 1.0 if normalize_answer(prediction) == normalize_answer(reference) else 0.0
 
 
 
29
 
30
 
31
  def calculate_accuracy(scores: List[float]) -> Dict[str, float]:
 
39
  Dictionary with accuracy metric
40
  """
41
  if not scores:
42
+ return {"accuracy": 0.0, "count": 0, "correct": 0}
43
 
44
  import numpy as np
45
 
 
73
  "Causal": "Causal Inference",
74
  "State": "State Updating",
75
  "Abstraction": "State Abstraction",
76
+ "A": "Recall",
77
+ "B": "Causal Inference",
78
+ "C": "State Updating",
79
+ "D": "State Abstraction",
80
  }
81
 
82
  # Initialize scores by metric
 
108
  reference = gt_info["answer"]
109
  qa_type = gt_info.get("type", "Recall")
110
 
111
+ # Calculate accuracy via exact string match
112
+ score = string_exact_match(answer, reference)
113
 
114
  # Map question type to metric category
115
  metric_category = "Recall" # default
 
130
  "score": score,
131
  "reference_answer": reference,
132
  "metric_category": metric_category,
133
+ "domain": gt_info.get("domain", "") if gt_info else "",
134
  })
135
 
136
  # Calculate metrics for each category
 
144
  return {
145
  "scores": results,
146
  "scored_submissions": scored_submissions,
147
+ }
utils.py CHANGED
@@ -107,6 +107,7 @@ def load_groundtruth(dataset_name: str, token: str = None) -> Dict[str, str]:
107
 
108
  for row in dataset:
109
  episode_id = row.get("episode_id", "")
 
110
  qa_pairs = row.get("qa_pairs", [])
111
 
112
  for qa in qa_pairs:
@@ -119,7 +120,8 @@ def load_groundtruth(dataset_name: str, token: str = None) -> Dict[str, str]:
119
  groundtruth[key] = {
120
  "answer": answer,
121
  "type": qa_type,
122
- "sub_type": qa.get("sub_type", "")
 
123
  }
124
 
125
  except Exception as hf_error:
@@ -128,7 +130,7 @@ def load_groundtruth(dataset_name: str, token: str = None) -> Dict[str, str]:
128
 
129
  # Fallback to local file
130
  import json
131
- local_path = "test/test.jsonl"
132
 
133
  try:
134
  with open(local_path, 'r', encoding='utf-8') as f:
@@ -139,6 +141,7 @@ def load_groundtruth(dataset_name: str, token: str = None) -> Dict[str, str]:
139
 
140
  data = json.loads(line)
141
  episode_id = data.get("episode_id", "")
 
142
  qa_pairs = data.get("qa_pairs", [])
143
 
144
  for qa in qa_pairs:
@@ -151,7 +154,8 @@ def load_groundtruth(dataset_name: str, token: str = None) -> Dict[str, str]:
151
  groundtruth[key] = {
152
  "answer": answer,
153
  "type": qa_type,
154
- "sub_type": qa.get("sub_type", "")
 
155
  }
156
 
157
  print(f"Loaded from local file: {local_path}")
@@ -221,4 +225,4 @@ def validate_submission_file(file_path: str) -> tuple:
221
  except FileNotFoundError:
222
  return False, "File not found.", []
223
  except Exception as e:
224
- return False, f"Error reading file: {str(e)}", []
 
107
 
108
  for row in dataset:
109
  episode_id = row.get("episode_id", "")
110
+ domain = row.get("domain", "")
111
  qa_pairs = row.get("qa_pairs", [])
112
 
113
  for qa in qa_pairs:
 
120
  groundtruth[key] = {
121
  "answer": answer,
122
  "type": qa_type,
123
+ "sub_type": qa.get("sub_type", ""),
124
+ "domain": domain,
125
  }
126
 
127
  except Exception as hf_error:
 
130
 
131
  # Fallback to local file
132
  import json
133
+ local_path = "test/open_end_qa_set.jsonl"
134
 
135
  try:
136
  with open(local_path, 'r', encoding='utf-8') as f:
 
141
 
142
  data = json.loads(line)
143
  episode_id = data.get("episode_id", "")
144
+ domain = data.get("domain", "")
145
  qa_pairs = data.get("qa_pairs", [])
146
 
147
  for qa in qa_pairs:
 
154
  groundtruth[key] = {
155
  "answer": answer,
156
  "type": qa_type,
157
+ "sub_type": qa.get("sub_type", ""),
158
+ "domain": domain,
159
  }
160
 
161
  print(f"Loaded from local file: {local_path}")
 
225
  except FileNotFoundError:
226
  return False, "File not found.", []
227
  except Exception as e:
228
+ return False, f"Error reading file: {str(e)}", []