KeenWoo committed on
Commit
ee63446
·
verified ·
1 Parent(s): ae48925

Update evaluate.py

Browse files
Files changed (1) hide show
  1. evaluate.py +28 -27
evaluate.py CHANGED
@@ -3,6 +3,7 @@
3
  import os
4
  import json
5
  import time
 
6
  import pandas as pd
7
  from typing import List, Dict, Any
8
 
@@ -101,33 +102,33 @@ def _parse_judge_json(raw_str: str) -> dict | None:
101
  except (json.JSONDecodeError, AttributeError):
102
  return None
103
 
104
- # --- NEW: helpers for categorisation and error-class labelling ---
105
def _categorize_test(test_id: str) -> str:
    """Map a test identifier to a coarse test category.

    The identifier is lower-cased and scanned for known keywords. Rules are
    checked in order and the first match wins, so an id containing both
    "synonym" and "omission" is reported as "synonym".

    Args:
        test_id: Identifier of the test case. ``None`` and other falsy
            values are treated as an empty id; non-string values are
            converted with ``str()`` before matching.

    Returns:
        One of "synonym", "multi_fact", "omission", "hallucination",
        "temporal", "context_disambig", or "baseline" when no keyword
        matches.
    """
    # Ordered (category, keywords) rules; the order encodes priority.
    # Matching is plain substring search, so "time" also matches ids such
    # as "runtime".
    rules = (
        ("synonym", ("synonym",)),
        ("multi_fact", ("multi_fact", "multi-hop", "multihop")),
        ("omission", ("omission",)),
        ("hallucination", ("hallucination",)),
        ("temporal", ("time", "temporal")),
        ("context_disambig", ("context",)),
    )
    tid = str(test_id or "").lower()
    for category, keywords in rules:
        if any(keyword in tid for keyword in keywords):
            return category
    return "baseline"
114
-
115
def _classify_error(gt: str, gen: str, *, min_overlap: float = 0.3) -> str:
    """Assign a heuristic error label to a generated answer.

    Both strings are stripped and lower-cased, then compared in this order:

    * empty generated answer -> "empty"
    * empty ground truth with a non-empty answer -> "hallucination"
    * ground truth occurs verbatim inside the answer -> "paraphrase"
    * the share of ground-truth word tokens found in the answer is at
      least ``min_overlap`` -> "omission"
    * anything else -> "contradiction"

    Args:
        gt: Ground-truth answer. Falsy values count as empty; non-string
            values are converted with ``str()``.
        gen: Generated answer, normalised the same way as ``gt``.
        min_overlap: Minimum fraction of ground-truth tokens that must
            appear in ``gen`` for the "omission" label.

    Returns:
        One of "empty", "hallucination", "paraphrase", "omission" or
        "contradiction".
    """
    # Kept function-local: the module imports visible for this revision do
    # not include `re`.
    import re

    gt = str(gt or "").strip().lower()
    gen = str(gen or "").strip().lower()
    if not gen:
        return "empty"
    if not gt:
        # gen is known to be non-empty here, so no further check is needed.
        return "hallucination"
    if gt in gen:
        return "paraphrase"
    # Compare unique word tokens; empty pieces produced by splitting at the
    # string edges are dropped.
    gt_tokens = {tok for tok in re.split(r'\W+', gt) if tok}
    gen_tokens = {tok for tok in re.split(r'\W+', gen) if tok}
    # max(1, ...) guards against a ground truth made only of punctuation.
    overlap = len(gt_tokens & gen_tokens) / max(1, len(gt_tokens))
    return "omission" if overlap >= min_overlap else "contradiction"
131
 
132
 
133
  def run_comprehensive_evaluation(
 
3
  import os
4
  import json
5
  import time
6
+ import re # <-- ADD THIS IMPORT
7
  import pandas as pd
8
  from typing import List, Dict, Any
9
 
 
102
  except (json.JSONDecodeError, AttributeError):
103
  return None
104
 
105
+ # --- NEW: helpers for categorisation and error-class labelling ---
106
def _categorize_test(test_id: str) -> str:
    """Map a test identifier to a coarse test category.

    The identifier is lower-cased and scanned for known keywords. Rules are
    checked in order and the first match wins, so an id containing both
    "synonym" and "omission" is reported as "synonym".

    Args:
        test_id: Identifier of the test case. ``None`` and other falsy
            values are treated as an empty id; non-string values are
            converted with ``str()`` before matching.

    Returns:
        One of "synonym", "multi_fact", "omission", "hallucination",
        "temporal", "context_disambig", or "baseline" when no keyword
        matches.
    """
    # Ordered (category, keywords) rules; the order encodes priority.
    # Matching is plain substring search, so "time" also matches ids such
    # as "runtime".
    rules = (
        ("synonym", ("synonym",)),
        ("multi_fact", ("multi_fact", "multi-hop", "multihop")),
        ("omission", ("omission",)),
        ("hallucination", ("hallucination",)),
        ("temporal", ("time", "temporal")),
        ("context_disambig", ("context",)),
    )
    tid = str(test_id or "").lower()
    for category, keywords in rules:
        if any(keyword in tid for keyword in keywords):
            return category
    return "baseline"
115
+
116
def _classify_error(gt: str, gen: str, *, min_overlap: float = 0.3) -> str:
    """Assign a heuristic error label to a generated answer.

    Both strings are stripped and lower-cased, then compared in this order:

    * empty generated answer -> "empty"
    * empty ground truth with a non-empty answer -> "hallucination"
    * ground truth occurs verbatim inside the answer -> "paraphrase"
    * the share of ground-truth word tokens found in the answer is at
      least ``min_overlap`` -> "omission"
    * anything else -> "contradiction"

    Relies on the module-level ``import re``.

    Args:
        gt: Ground-truth answer. Falsy values count as empty; non-string
            values are converted with ``str()``.
        gen: Generated answer, normalised the same way as ``gt``.
        min_overlap: Minimum fraction of ground-truth tokens that must
            appear in ``gen`` for the "omission" label.

    Returns:
        One of "empty", "hallucination", "paraphrase", "omission" or
        "contradiction".
    """
    gt = str(gt or "").strip().lower()
    gen = str(gen or "").strip().lower()
    if not gen:
        return "empty"
    if not gt:
        # gen is known to be non-empty here, so no further check is needed.
        return "hallucination"
    if gt in gen:
        return "paraphrase"
    # Compare unique word tokens; empty pieces produced by splitting at the
    # string edges are dropped.
    gt_tokens = {tok for tok in re.split(r'\W+', gt) if tok}
    gen_tokens = {tok for tok in re.split(r'\W+', gen) if tok}
    # max(1, ...) guards against a ground truth made only of punctuation.
    overlap = len(gt_tokens & gen_tokens) / max(1, len(gt_tokens))
    return "omission" if overlap >= min_overlap else "contradiction"
132
 
133
 
134
  def run_comprehensive_evaluation(