KeenWoo committed
Commit 1d3f707 · verified · 1 Parent(s): ba67831

Update evaluate.py

Files changed (1)
  1. evaluate.py +28 -8
evaluate.py CHANGED
@@ -284,10 +284,11 @@ def run_comprehensive_evaluation(
     elif not expected_sources_set and not actual_sources_set:
         context_precision, context_recall = 1.0, 1.0

-    print("\n" + "-"*20 + " ANSWER & CORRECTNESS EVALUATION " + "-"*20)
-    print(f" - Ground Truth Answer: {ground_truth_answer}")
-    print(f" - Generated Answer: {answer_text}")
-    print("-" * 59)
+    # TURN DEBUG on Answer Correctness
+    # print("\n" + "-"*20 + " ANSWER & CORRECTNESS EVALUATION " + "-"*20)
+    # print(f" - Ground Truth Answer: {ground_truth_answer}")
+    # print(f" - Generated Answer: {answer_text}")
+    # print("-" * 59)

     answer_correctness_score = None
     if ground_truth_answer and "ERROR" not in answer_text:
@@ -304,6 +305,7 @@ def run_comprehensive_evaluation(
         print(f"ERROR during answer correctness judging: {e}")

     faithfulness = None
+    hallucination_rate = None
     source_docs = response.get("source_documents", [])
     if source_docs and "ERROR" not in answer_text:
         context_blob = "\n---\n".join([doc.page_content for doc in source_docs])
@@ -314,8 +316,13 @@ def run_comprehensive_evaluation(
             data = _parse_judge_json(raw)
             if data:
                 denom = data.get("supported", 0) + data.get("contradicted", 0) + data.get("not_enough_info", 0)
-                if denom > 0: faithfulness = round(data.get("supported", 0) / denom, 3)
-                elif data.get("ignored", 0) > 0: faithfulness = 1.0
+                if denom > 0:
+                    faithfulness = round(data.get("supported", 0) / denom, 3)
+                    hallucination_rate = 1.0 - faithfulness
+                elif data.get("ignored", 0) > 0:
+                    faithfulness = 1.0
+                    hallucination_rate = 0.0
+
         except Exception as e:
             print(f"ERROR during faithfulness judging: {e}")

@@ -326,9 +333,10 @@ def run_comprehensive_evaluation(
         "behavior_f1": f"{behavior_metrics['f1_score']:.2f}", "emotion_f1": f"{emotion_metrics['f1_score']:.2f}",
         "topic_f1": f"{topic_metrics['f1_score']:.2f}", "context_f1": f"{context_metrics['f1_score']:.2f}",
         "generated_answer": answer_text, "sources": sources_pretty, "source_count": len(actual_sources_set),
-        "latency_ms": latency_ms, "faithfulness": faithfulness,
+        "faithfulness": faithfulness, "hallucination_rate": hallucination_rate,
         "context_precision": context_precision, "context_recall": context_recall,
         "answer_correctness": answer_correctness_score,
+        "latency_ms": latency_ms,
         "category": category,
         "error_class": error_class
     })
@@ -350,7 +358,15 @@ def run_comprehensive_evaluation(
     tf1_mean = to_f(df["topic_f1"]).mean() * 100
     cf1_mean = to_f(df["context_f1"]).mean() * 100

+    # Calculate the mean for Faithfulness
+    faith_mean = to_f(df["faithfulness"]).mean() * 100
+    # --- CHANGE 6: Calculate the mean for the new metric ---
+    halluc_mean = to_f(df["hallucination_rate"]).mean() * 100
+
+    rag_with_sources_pct = (df["source_count"] > 0).mean() * 100 if "source_count" in df else 0
+
     # Add the NLU metrics to the summary f-string
+    # Choose to use Hallucination - **RAG: Faithfulness**: {faith_mean:.1f}%
     summary_text = f"""## Evaluation Summary
 - **Routing Accuracy**: {pct:.2f}%
 - **Behaviour F1 (avg)**: {bf1_mean:.2f}%
@@ -359,7 +375,11 @@ def run_comprehensive_evaluation(
 - **Context F1 (avg)**: {cf1_mean:.2f}%
 - **RAG: Context Precision**: {(to_f(df["context_precision"]).mean() * 100):.1f}%
 - **RAG: Context Recall**: {(to_f(df["context_recall"]).mean() * 100):.1f}%
-- **RAG: Answer Correctness (LLM-judge)**: {(to_f(df["answer_correctness"]).mean() * 100):.1f}%"""
+- **RAG Answers w/ Sources**: {rag_with_sources_pct:.1f}%
+- **RAG: Hallucination Rate**: {halluc_mean:.1f}% (Lower is better)
+- **RAG: Answer Correctness (LLM-judge)**: {(to_f(df["answer_correctness"]).mean() * 100):.1f}%
+- **RAG: Avg Latency (ms)**: {to_f(df["latency_ms"]).mean():.1f}
+"""
     # --- END OF MODIFICATION ---

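For readers skimming the diff, here is a minimal, runnable sketch of the metric arithmetic these hunks introduce: faithfulness is the supported share of judged claims, and hallucination rate is its complement. The judge-output keys (`supported`, `contradicted`, `not_enough_info`, `ignored`) are taken from the diff above, but the standalone `score_faithfulness` helper and the example counts are illustrative, not code from evaluate.py.

```python
# Illustrative sketch of the faithfulness / hallucination-rate math added in this commit.
# The helper name and example counts are hypothetical; only the formula mirrors the diff.
def score_faithfulness(judge_data: dict):
    """Return (faithfulness, hallucination_rate) from LLM-judge claim counts."""
    denom = (
        judge_data.get("supported", 0)
        + judge_data.get("contradicted", 0)
        + judge_data.get("not_enough_info", 0)
    )
    if denom > 0:
        faithfulness = round(judge_data.get("supported", 0) / denom, 3)
        return faithfulness, round(1.0 - faithfulness, 3)
    if judge_data.get("ignored", 0) > 0:
        # Only "ignored" claims: the commit treats this case as fully faithful.
        return 1.0, 0.0
    return None, None  # no usable counts from the judge


# Example: 7 of 10 judged claims supported -> faithfulness 0.7, hallucination rate 0.3
print(score_faithfulness({"supported": 7, "contradicted": 1, "not_enough_info": 2}))
```

In the summary, these per-row values are averaged like the other RAG metrics (`halluc_mean = to_f(df["hallucination_rate"]).mean() * 100`), so rows where the judge returns no usable counts presumably stay `None`/NaN and drop out of the mean rather than being counted as zero.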