KeenWoo committed on
Commit 63529fe · verified · 1 Parent(s): a7dc53b

Update evaluate.py

Files changed (1):
  1. evaluate.py  +96 -0
evaluate.py CHANGED
@@ -101,6 +101,35 @@ def _parse_judge_json(raw_str: str) -> dict | None:
     except (json.JSONDecodeError, AttributeError):
         return None
 
+# --- NEW: helpers for categorisation and error-class labelling ---
+def _categorize_test(test_id: str) -> str:
+    tid = (test_id or "").lower()
+    if "synonym" in tid: return "synonym"
+    if "multi_fact" in tid or "multi-hop" in tid or "multihop" in tid: return "multi_fact"
+    if "omission" in tid: return "omission"
+    if "hallucination" in tid: return "hallucination"
+    if "time" in tid or "temporal" in tid: return "temporal"
+    if "context" in tid: return "context_disambig"
+    return "baseline"
+
+def _classify_error(gt: str, gen: str) -> str:
+    import re
+    gt = (gt or "").strip().lower()
+    gen = (gen or "").strip().lower()
+    if not gen:
+        return "empty"
+    if not gt:
+        return "hallucination" if gen else "empty"
+    if gt in gen:
+        return "paraphrase"
+    gt_tokens = set([t for t in re.split(r'\W+', gt) if t])
+    gen_tokens = set([t for t in re.split(r'\W+', gen) if t])
+    overlap = len(gt_tokens & gen_tokens) / max(1, len(gt_tokens))
+    if overlap >= 0.3:
+        return "omission"
+    return "contradiction"
+
+
 def run_comprehensive_evaluation(
     vs_general: FAISS,
     vs_personal: FAISS,
@@ -212,16 +241,23 @@ def run_comprehensive_evaluation(
 
         answer_correctness_score = None
         ground_truth_answer = ground_truth.get("ground_truth_answer")
+        error_class = None  # initialise  # NEW
+
         if ground_truth_answer and "ERROR" not in answer_text:
             try:
                 judge_msg = ANSWER_CORRECTNESS_JUDGE_PROMPT.format(ground_truth_answer=ground_truth_answer, generated_answer=answer_text)
                 raw_correctness = call_llm([{"role": "user", "content": judge_msg}], temperature=0.0)
                 correctness_data = _parse_judge_json(raw_correctness)
+
                 if correctness_data and "correctness_score" in correctness_data:
                     answer_correctness_score = float(correctness_data["correctness_score"])
+
             except Exception as e:
                 print(f"ERROR during answer correctness judging: {e}")
 
+        # --- NEW: derive error class for diagnostics ---
+        error_class = _classify_error(ground_truth_answer, answer_text)
+
         faithfulness = None
         source_docs = response.get("source_documents", [])
         if source_docs and "ERROR" not in answer_text:
@@ -241,6 +277,9 @@ def run_comprehensive_evaluation(
         sources_pretty = ", ".join(sorted(s)) if (s:=actual_sources_set) else ""
         results.append({
             "test_id": fx.get("test_id", "N/A"), "title": fx.get("title", "N/A"),
+            # NEW for debugging
+            "category": _categorize_test(test_id), "error_class": error_class,
+            # END
             "route_correct": "✅" if route_correct else "❌", "expected_route": expected_route, "actual_route": actual_route,
             "behavior_f1": f"{behavior_metrics['f1_score']:.2f}", "emotion_f1": f"{emotion_metrics['f1_score']:.2f}",
             "topic_f1": f"{topic_metrics['f1_score']:.2f}", "context_f1": f"{context_metrics['f1_score']:.2f}",
@@ -262,6 +301,63 @@ def run_comprehensive_evaluation(
     df = df[[c for c in cols if c in df.columns]]
     df.to_csv(output_path, index=False, encoding="utf-8")
     print(f"Evaluation results saved to {output_path}")
+
+
+    # --- NEW: write detailed results to a log file instead of CSV ---
+    log_path = Path(__file__).parent / "evaluation_log.txt"
+    with open(log_path, "a", encoding="utf-8") as logf:
+        logf.write("\n===== Detailed Evaluation Run =====\n")
+        logf.write(df.to_string(index=False))
+        logf.write("\n\n")
+
+    # --- NEW: per-category averages ---
+    try:
+        cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
+        print("\n📊 Correctness by Category:")
+        print(cat_means.to_string(index=False))
+        with open("evaluation_log.txt", "a", encoding="utf-8") as logf:
+            logf.write("\n📊 Correctness by Category:\n")
+            logf.write(cat_means.to_string(index=False))
+            logf.write("\n")
+    except Exception as e:
+        print(f"WARNING: Could not compute category breakdown: {e}")
+
+    # --- NEW: confusion-style matrix ---
+    try:
+        confusion = pd.crosstab(df.get("category", []), df.get("error_class", []),
+                                rownames=["Category"], colnames=["Error Class"], dropna=False)
+        print("\n📊 Error Class Distribution by Category:")
+        print(confusion.to_string())
+        with open("evaluation_log.txt", "a", encoding="utf-8") as logf:
+            logf.write("\n📊 Error Class Distribution by Category:\n")
+            logf.write(confusion.to_string())
+            logf.write("\n")
+    except Exception as e:
+        print(f"WARNING: Could not build confusion matrix: {e}")
+
+
+    # NEW: save detailed results
+    df.to_csv("evaluation_results_detailed.csv", index=False, encoding="utf-8")
+
+    # NEW: per-category averages
+    try:
+        cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
+        print("\n📊 Correctness by Category:")
+        print(cat_means.to_string(index=False))
+        cat_means.to_csv("evaluation_correctness_by_category.csv", index=False)
+    except Exception as e:
+        print(f"WARNING: Could not compute category breakdown: {e}")
+
+    # NEW: confusion-style matrix
+    try:
+        confusion = pd.crosstab(df.get("category", []), df.get("error_class", []),
+                                rownames=["Category"], colnames=["Error Class"], dropna=False)
+        print("\n📊 Error Class Distribution by Category:")
+        print(confusion.to_string())
+        confusion.to_csv("evaluation_confusion_matrix.csv")
+    except Exception as e:
+        print(f"WARNING: Could not build confusion matrix: {e}")
+
 
     pct = df["route_correct"].value_counts(normalize=True).get("✅", 0) * 100
     to_f = lambda s: pd.to_numeric(s, errors="coerce")
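
For reference, a minimal sketch of how the two helpers added in this commit label their inputs. It assumes evaluate.py can be imported without heavy side effects (otherwise copy the two functions out); the sample strings are made up and only illustrate the rules in the code above.

# Minimal sketch: exercising the helpers added in this commit.
# Assumption: importing evaluate has no heavy side effects; the strings below are hypothetical.
from evaluate import _categorize_test, _classify_error

# Test IDs are bucketed by substring match; anything unmatched falls back to "baseline".
print(_categorize_test("synonym_03"))        # -> "synonym"
print(_categorize_test("multi-hop_recall"))  # -> "multi_fact"
print(_categorize_test("misc_case"))         # -> "baseline"

# Error classes come from containment and token overlap against the ground truth.
gt = "the meeting is on friday"
print(_classify_error(gt, "the meeting is on friday at noon"))  # -> "paraphrase" (gt contained in gen)
print(_classify_error(gt, "the meeting was moved"))             # -> "omission" (overlap 2/5 >= 0.3)
print(_classify_error(gt, "no idea"))                           # -> "contradiction" (no token overlap)
print(_classify_error(gt, ""))                                  # -> "empty"

The 0.3 token-overlap threshold is what the commit uses to separate "omission" from "contradiction"; the resulting error_class and category columns feed the per-category averages and the confusion-style matrix written at the end of run_comprehensive_evaluation.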