Spaces:
Sleeping
Sleeping
Update evaluate.py
Browse files- evaluate.py +28 -27
evaluate.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
| 3 |
import os
|
| 4 |
import json
|
| 5 |
import time
|
|
|
|
| 6 |
import pandas as pd
|
| 7 |
from typing import List, Dict, Any
|
| 8 |
|
|
@@ -101,33 +102,33 @@ def _parse_judge_json(raw_str: str) -> dict | None:
|
|
| 101 |
except (json.JSONDecodeError, AttributeError):
|
| 102 |
return None
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
|
| 132 |
|
| 133 |
def run_comprehensive_evaluation(
|
|
|
|
| 3 |
import os
|
| 4 |
import json
|
| 5 |
import time
|
| 6 |
+
import re # <-- ADD THIS IMPORT
|
| 7 |
import pandas as pd
|
| 8 |
from typing import List, Dict, Any
|
| 9 |
|
|
|
|
| 102 |
except (json.JSONDecodeError, AttributeError):
|
| 103 |
return None
|
| 104 |
|
| 105 |
+
# --- NEW: helpers for categorisation and error-class labelling ---
|
| 106 |
+
def _categorize_test(test_id: str) -> str:
|
| 107 |
+
tid = (test_id or "").lower()
|
| 108 |
+
if "synonym" in tid: return "synonym"
|
| 109 |
+
if "multi_fact" in tid or "multi-hop" in tid or "multihop" in tid: return "multi_fact"
|
| 110 |
+
if "omission" in tid: return "omission"
|
| 111 |
+
if "hallucination" in tid: return "hallucination"
|
| 112 |
+
if "time" in tid or "temporal" in tid: return "temporal"
|
| 113 |
+
if "context" in tid: return "context_disambig"
|
| 114 |
+
return "baseline"
|
| 115 |
+
|
| 116 |
+
def _classify_error(gt: str, gen: str) -> str:
|
| 117 |
+
import re
|
| 118 |
+
gt = (gt or "").strip().lower()
|
| 119 |
+
gen = (gen or "").strip().lower()
|
| 120 |
+
if not gen:
|
| 121 |
+
return "empty"
|
| 122 |
+
if not gt:
|
| 123 |
+
return "hallucination" if gen else "empty"
|
| 124 |
+
if gt in gen:
|
| 125 |
+
return "paraphrase"
|
| 126 |
+
gt_tokens = set([t for t in re.split(r'\W+', gt) if t])
|
| 127 |
+
gen_tokens = set([t for t in re.split(r'\W+', gen) if t])
|
| 128 |
+
overlap = len(gt_tokens & gen_tokens) / max(1, len(gt_tokens))
|
| 129 |
+
if overlap >= 0.3:
|
| 130 |
+
return "omission"
|
| 131 |
+
return "contradiction"
|
| 132 |
|
| 133 |
|
| 134 |
def run_comprehensive_evaluation(
|