Update evaluate.py

evaluate.py  CHANGED  (+96 -0)
@@ -101,6 +101,35 @@ def _parse_judge_json(raw_str: str) -> dict | None:
     except (json.JSONDecodeError, AttributeError):
         return None
 
+# --- NEW: helpers for categorisation and error-class labelling ---
+def _categorize_test(test_id: str) -> str:
+    tid = (test_id or "").lower()
+    if "synonym" in tid: return "synonym"
+    if "multi_fact" in tid or "multi-hop" in tid or "multihop" in tid: return "multi_fact"
+    if "omission" in tid: return "omission"
+    if "hallucination" in tid: return "hallucination"
+    if "time" in tid or "temporal" in tid: return "temporal"
+    if "context" in tid: return "context_disambig"
+    return "baseline"
+
+def _classify_error(gt: str, gen: str) -> str:
+    import re
+    gt = (gt or "").strip().lower()
+    gen = (gen or "").strip().lower()
+    if not gen:
+        return "empty"
+    if not gt:
+        return "hallucination" if gen else "empty"
+    if gt in gen:
+        return "paraphrase"
+    gt_tokens = set([t for t in re.split(r'\W+', gt) if t])
+    gen_tokens = set([t for t in re.split(r'\W+', gen) if t])
+    overlap = len(gt_tokens & gen_tokens) / max(1, len(gt_tokens))
+    if overlap >= 0.3:
+        return "omission"
+    return "contradiction"
+
+
 def run_comprehensive_evaluation(
     vs_general: FAISS,
     vs_personal: FAISS,
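For reference, a minimal sanity-check sketch of how the two new helpers behave on representative inputs. The expected values follow directly from the rules in the hunk above; the import assumes evaluate.py can be loaded as a module without running the evaluation itself.

# Illustrative check of the new helpers (not part of the commit).
from evaluate import _categorize_test, _classify_error

# _categorize_test keys off substrings of the test_id
assert _categorize_test("synonym_query_03") == "synonym"
assert _categorize_test("multi_fact_chain_1") == "multi_fact"
assert _categorize_test(None) == "baseline"  # missing ids fall back to "baseline"

# _classify_error compares the ground truth against the generated answer
assert _classify_error("", "some answer") == "hallucination"                 # no ground truth
assert _classify_error("Paris", "The capital is Paris.") == "paraphrase"     # substring match
assert _classify_error("blue bicycle", "she owns a blue car") == "omission"  # >= 30% token overlap
assert _classify_error("blue", "red") == "contradiction"                     # little or no overlap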
@@ -212,16 +241,23 @@ run_comprehensive_evaluation(
 
         answer_correctness_score = None
         ground_truth_answer = ground_truth.get("ground_truth_answer")
+        error_class = None  # initialise #NEW
+
         if ground_truth_answer and "ERROR" not in answer_text:
             try:
                 judge_msg = ANSWER_CORRECTNESS_JUDGE_PROMPT.format(ground_truth_answer=ground_truth_answer, generated_answer=answer_text)
                 raw_correctness = call_llm([{"role": "user", "content": judge_msg}], temperature=0.0)
                 correctness_data = _parse_judge_json(raw_correctness)
+
                 if correctness_data and "correctness_score" in correctness_data:
                     answer_correctness_score = float(correctness_data["correctness_score"])
+
             except Exception as e:
                 print(f"ERROR during answer correctness judging: {e}")
 
+            # --- NEW: derive error class for diagnostics ---
+            error_class = _classify_error(ground_truth_answer, answer_text)
+
         faithfulness = None
         source_docs = response.get("source_documents", [])
         if source_docs and "ERROR" not in answer_text:
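To make the judge path above concrete, here is a minimal sketch of the parsing step with the LLM call stubbed out. The reply fields other than correctness_score are an assumption; the code above only relies on that one key, and _parse_judge_json wraps the json.loads call with error handling.

# Illustrative only: fake_call_llm stands in for call_llm() and ANSWER_CORRECTNESS_JUDGE_PROMPT.
import json

def fake_call_llm(messages, temperature=0.0):
    # Hypothetical judge reply; only "correctness_score" is required by the code above.
    return '{"correctness_score": 0.75, "reasoning": "covers most of the ground truth"}'

raw_correctness = fake_call_llm([{"role": "user", "content": "..."}], temperature=0.0)
correctness_data = json.loads(raw_correctness)
if correctness_data and "correctness_score" in correctness_data:
    answer_correctness_score = float(correctness_data["correctness_score"])
    print(answer_correctness_score)  # 0.75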
@@ -241,6 +277,9 @@ run_comprehensive_evaluation(
         sources_pretty = ", ".join(sorted(s)) if (s:=actual_sources_set) else ""
         results.append({
             "test_id": fx.get("test_id", "N/A"), "title": fx.get("title", "N/A"),
+            # NEW for debugging
+            "category": _categorize_test(test_id), "error_class": error_class,
+            # END
             "route_correct": "✅" if route_correct else "❌", "expected_route": expected_route, "actual_route": actual_route,
             "behavior_f1": f"{behavior_metrics['f1_score']:.2f}", "emotion_f1": f"{emotion_metrics['f1_score']:.2f}",
             "topic_f1": f"{topic_metrics['f1_score']:.2f}", "context_f1": f"{context_metrics['f1_score']:.2f}",
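For orientation, an illustrative shape of one appended results row after this change; the values are made up, and the metric columns beyond those shown are unchanged from the existing script.

# Hypothetical row, showing where the two new keys sit in the results schema.
example_row = {
    "test_id": "synonym_query_03", "title": "Synonym phrasing",  # made-up fixture
    "category": "synonym",        # from _categorize_test(test_id)
    "error_class": "paraphrase",  # from _classify_error(...); stays None without a ground-truth answer
    "route_correct": "✅", "expected_route": "personal", "actual_route": "personal",
    "behavior_f1": "1.00", "emotion_f1": "0.67",
    "topic_f1": "0.50", "context_f1": "1.00",
}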
@@ -262,6 +301,63 @@ run_comprehensive_evaluation(
     df = df[[c for c in cols if c in df.columns]]
     df.to_csv(output_path, index=False, encoding="utf-8")
     print(f"Evaluation results saved to {output_path}")
+
+
+    # --- NEW: write detailed results to a log file instead of CSV ---
+    log_path = Path(__file__).parent / "evaluation_log.txt"
+    with open(log_path, "a", encoding="utf-8") as logf:
+        logf.write("\n===== Detailed Evaluation Run =====\n")
+        logf.write(df.to_string(index=False))
+        logf.write("\n\n")
+
+    # --- NEW: per-category averages ---
+    try:
+        cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
+        print("\n📊 Correctness by Category:")
+        print(cat_means.to_string(index=False))
+        with open("evaluation_log.txt", "a", encoding="utf-8") as logf:
+            logf.write("\n📊 Correctness by Category:\n")
+            logf.write(cat_means.to_string(index=False))
+            logf.write("\n")
+    except Exception as e:
+        print(f"WARNING: Could not compute category breakdown: {e}")
+
+    # --- NEW: confusion-style matrix ---
+    try:
+        confusion = pd.crosstab(df.get("category", []), df.get("error_class", []),
+                                rownames=["Category"], colnames=["Error Class"], dropna=False)
+        print("\n📊 Error Class Distribution by Category:")
+        print(confusion.to_string())
+        with open("evaluation_log.txt", "a", encoding="utf-8") as logf:
+            logf.write("\n📊 Error Class Distribution by Category:\n")
+            logf.write(confusion.to_string())
+            logf.write("\n")
+    except Exception as e:
+        print(f"WARNING: Could not build confusion matrix: {e}")
+
+
+    # NEW: save detailed results
+    df.to_csv("evaluation_results_detailed.csv", index=False, encoding="utf-8")
+
+    # NEW: per-category averages
+    try:
+        cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
+        print("\n📊 Correctness by Category:")
+        print(cat_means.to_string(index=False))
+        cat_means.to_csv("evaluation_correctness_by_category.csv", index=False)
+    except Exception as e:
+        print(f"WARNING: Could not compute category breakdown: {e}")
+
+    # NEW: confusion-style matrix
+    try:
+        confusion = pd.crosstab(df.get("category", []), df.get("error_class", []),
+                                rownames=["Category"], colnames=["Error Class"], dropna=False)
+        print("\n📊 Error Class Distribution by Category:")
+        print(confusion.to_string())
+        confusion.to_csv("evaluation_confusion_matrix.csv")
+    except Exception as e:
+        print(f"WARNING: Could not build confusion matrix: {e}")
+
 
     pct = df["route_correct"].value_counts(normalize=True).get("✅", 0) * 100
     to_f = lambda s: pd.to_numeric(s, errors="coerce")
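A minimal sketch of consuming the artifacts the new code writes; file names are taken from the hunk above, and the paths assume the evaluation was run from the script's directory.

import pandas as pd

# Detailed per-test rows, including the new "category" and "error_class" columns.
detailed = pd.read_csv("evaluation_results_detailed.csv")

# Aggregates written by the new code.
by_category = pd.read_csv("evaluation_correctness_by_category.csv")
confusion = pd.read_csv("evaluation_confusion_matrix.csv", index_col=0)

print(by_category.to_string(index=False))
print(confusion.to_string())

# Roughly the same matrix can be rebuilt from the detailed rows:
rebuilt = pd.crosstab(detailed["category"], detailed["error_class"],
                      rownames=["Category"], colnames=["Error Class"], dropna=False)
print(rebuilt.to_string())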