Spaces:

KeenWoo
/

AD_Multimodal_Chatbot

Sleeping

KeenWoo committed on Sep 13

Commit

ee63446

verified ·

1 Parent(s): ae48925

Update evaluate.py

Files changed (1) hide show

evaluate.py CHANGED Viewed

@@ -3,6 +3,7 @@
 import os
 import json
 import time
 import pandas as pd
 from typing import List, Dict, Any
@@ -101,33 +102,33 @@ def _parse_judge_json(raw_str: str) -> dict | None:
     except (json.JSONDecodeError, AttributeError):
         return None
-    # --- NEW: helpers for categorisation and error-class labelling ---
-    def _categorize_test(test_id: str) -> str:
-        tid = (test_id or "").lower()
-        if "synonym" in tid: return "synonym"
-        if "multi_fact" in tid or "multi-hop" in tid or "multihop" in tid: return "multi_fact"
-        if "omission" in tid: return "omission"
-        if "hallucination" in tid: return "hallucination"
-        if "time" in tid or "temporal" in tid: return "temporal"
-        if "context" in tid: return "context_disambig"
-        return "baseline"
-    def _classify_error(gt: str, gen: str) -> str:
-        import re
-        gt = (gt or "").strip().lower()
-        gen = (gen or "").strip().lower()
-        if not gen:
-            return "empty"
-        if not gt:
-            return "hallucination" if gen else "empty"
-        if gt in gen:
-            return "paraphrase"
-        gt_tokens = set([t for t in re.split(r'\W+', gt) if t])
-        gen_tokens = set([t for t in re.split(r'\W+', gen) if t])
-        overlap = len(gt_tokens & gen_tokens) / max(1, len(gt_tokens))
-        if overlap >= 0.3:
-            return "omission"
-        return "contradiction"
 def run_comprehensive_evaluation(

 import os
 import json
 import time
+import re  # <-- ADD THIS IMPORT
 import pandas as pd
 from typing import List, Dict, Any
     except (json.JSONDecodeError, AttributeError):
         return None
+# --- NEW: helpers for categorisation and error-class labelling ---
+def _categorize_test(test_id: str) -> str:
+    tid = (test_id or "").lower()
+    if "synonym" in tid: return "synonym"
+    if "multi_fact" in tid or "multi-hop" in tid or "multihop" in tid: return "multi_fact"
+    if "omission" in tid: return "omission"
+    if "hallucination" in tid: return "hallucination"
+    if "time" in tid or "temporal" in tid: return "temporal"
+    if "context" in tid: return "context_disambig"
+    return "baseline"
+def _classify_error(gt: str, gen: str) -> str:
+    import re
+    gt = (gt or "").strip().lower()
+    gen = (gen or "").strip().lower()
+    if not gen:
+        return "empty"
+    if not gt:
+        return "hallucination" if gen else "empty"
+    if gt in gen:
+        return "paraphrase"
+    gt_tokens = set([t for t in re.split(r'\W+', gt) if t])
+    gen_tokens = set([t for t in re.split(r'\W+', gen) if t])
+    overlap = len(gt_tokens & gen_tokens) / max(1, len(gt_tokens))
+    if overlap >= 0.3:
+        return "omission"
+    return "contradiction"
 def run_comprehensive_evaluation(