Update evaluate.py
evaluate.py  CHANGED  +28 -8
@@ -284,10 +284,11 @@ def run_comprehensive_evaluation(
         elif not expected_sources_set and not actual_sources_set:
             context_precision, context_recall = 1.0, 1.0
 
-        print("\n" + "-"*20 + " ANSWER & CORRECTNESS EVALUATION " + "-"*20)
-        print(f" - Ground Truth Answer: {ground_truth_answer}")
-        print(f" - Generated Answer: {answer_text}")
-        print("-" * 59)
+        # TURN DEBUG on Answer Correctness
+        # print("\n" + "-"*20 + " ANSWER & CORRECTNESS EVALUATION " + "-"*20)
+        # print(f" - Ground Truth Answer: {ground_truth_answer}")
+        # print(f" - Generated Answer: {answer_text}")
+        # print("-" * 59)
 
         answer_correctness_score = None
         if ground_truth_answer and "ERROR" not in answer_text:
@@ -304,6 +305,7 @@ def run_comprehensive_evaluation(
                 print(f"ERROR during answer correctness judging: {e}")
 
         faithfulness = None
+        hallucination_rate = None
         source_docs = response.get("source_documents", [])
         if source_docs and "ERROR" not in answer_text:
             context_blob = "\n---\n".join([doc.page_content for doc in source_docs])
@@ -314,8 +316,13 @@ def run_comprehensive_evaluation(
                 data = _parse_judge_json(raw)
                 if data:
                     denom = data.get("supported", 0) + data.get("contradicted", 0) + data.get("not_enough_info", 0)
-                    if denom > 0:
-                        faithfulness = round(data.get("supported", 0) / denom, 3)
+                    if denom > 0:
+                        faithfulness = round(data.get("supported", 0) / denom, 3)
+                        hallucination_rate = 1.0 - faithfulness
+                    elif data.get("ignored", 0) > 0:
+                        faithfulness = 1.0
+                        hallucination_rate = 0.0
+
             except Exception as e:
                 print(f"ERROR during faithfulness judging: {e}")
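For context, a minimal sketch of how the judge counts above turn into the two scores. The JSON keys (`supported`, `contradicted`, `not_enough_info`, `ignored`) are taken from the `data.get(...)` calls in the hunk; `parse_judge_json` below is a hypothetical stand-in for `_parse_judge_json`, whose real implementation is not part of this diff, and the sample judge output is invented.

```python
import json
import re

def parse_judge_json(raw: str):
    """Hypothetical stand-in for _parse_judge_json: grab the first {...} block from the judge's reply."""
    match = re.search(r"\{.*\}", raw, re.DOTALL)
    if not match:
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        return None

# Invented judge output: 6 claims were checked against the retrieved context.
raw = '{"supported": 4, "contradicted": 1, "not_enough_info": 1, "ignored": 0}'
data = parse_judge_json(raw)

denom = data.get("supported", 0) + data.get("contradicted", 0) + data.get("not_enough_info", 0)
if denom > 0:
    faithfulness = round(data.get("supported", 0) / denom, 3)  # 4 / 6 -> 0.667
    hallucination_rate = 1.0 - faithfulness                    # ~0.333
elif data.get("ignored", 0) > 0:
    # Nothing verifiable in the answer (e.g. a refusal): treat it as fully faithful.
    faithfulness, hallucination_rate = 1.0, 0.0
```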
@@ -326,9 +333,10 @@ def run_comprehensive_evaluation(
             "behavior_f1": f"{behavior_metrics['f1_score']:.2f}", "emotion_f1": f"{emotion_metrics['f1_score']:.2f}",
             "topic_f1": f"{topic_metrics['f1_score']:.2f}", "context_f1": f"{context_metrics['f1_score']:.2f}",
             "generated_answer": answer_text, "sources": sources_pretty, "source_count": len(actual_sources_set),
-            "faithfulness": faithfulness,
+            "faithfulness": faithfulness, "hallucination_rate": hallucination_rate,
             "context_precision": context_precision, "context_recall": context_recall,
             "answer_correctness": answer_correctness_score,
+            "latency_ms": latency_ms,
             "category": category,
             "error_class": error_class
         })
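The new `latency_ms` field is only recorded here; the timing itself happens earlier, where each test question is sent through the pipeline, and that code is not shown in this diff. A minimal sketch of one way to capture it, assuming the chain is invoked once per question; `rag_chain` and `question` are illustrative names, not taken from this file:

```python
import time

start = time.perf_counter()
response = rag_chain.invoke({"question": question})  # hypothetical pipeline call
latency_ms = (time.perf_counter() - start) * 1000.0  # wall-clock duration in milliseconds
```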
@@ -350,7 +358,15 @@ def run_comprehensive_evaluation(
     tf1_mean = to_f(df["topic_f1"]).mean() * 100
     cf1_mean = to_f(df["context_f1"]).mean() * 100
 
+    # Calculate the mean for Faithfulness
+    faith_mean = to_f(df["faithfulness"]).mean() * 100
+    # --- CHANGE 6: Calculate the mean for the new metric ---
+    halluc_mean = to_f(df["hallucination_rate"]).mean() * 100
+
+    rag_with_sources_pct = (df["source_count"] > 0).mean() * 100 if "source_count" in df else 0
+
     # Add the NLU metrics to the summary f-string
+    # Choose to use Hallucination - **RAG: Faithfulness**: {faith_mean:.1f}%
     summary_text = f"""## Evaluation Summary
 - **Routing Accuracy**: {pct:.2f}%
 - **Behaviour F1 (avg)**: {bf1_mean:.2f}%
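`to_f` is defined elsewhere in evaluate.py. The sketch below assumes it simply coerces a mixed column (floats, formatted strings, `None`) to numeric so that `.mean()` skips missing RAG metrics instead of failing; `rag_with_sources_pct` is then just the fraction of rows that retrieved at least one source. The toy DataFrame is invented for illustration.

```python
import pandas as pd

def to_f(series: pd.Series) -> pd.Series:
    """Assumed behaviour of to_f: non-numeric entries become NaN and drop out of .mean()."""
    return pd.to_numeric(series, errors="coerce")

df = pd.DataFrame({
    "faithfulness": [1.0, 0.8, None],    # None: no sources were retrieved for that row
    "source_count": [3, 2, 0],
})
faith_mean = to_f(df["faithfulness"]).mean() * 100            # (1.0 + 0.8) / 2 -> 90.0
rag_with_sources_pct = (df["source_count"] > 0).mean() * 100  # 2 of 3 rows -> ~66.7
```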
@@ -359,7 +375,11 @@ def run_comprehensive_evaluation(
 - **Context F1 (avg)**: {cf1_mean:.2f}%
 - **RAG: Context Precision**: {(to_f(df["context_precision"]).mean() * 100):.1f}%
 - **RAG: Context Recall**: {(to_f(df["context_recall"]).mean() * 100):.1f}%
-- **RAG: Answer Correctness (LLM-judge)**: {(to_f(df["answer_correctness"]).mean() * 100):.1f}%"""
+- **RAG Answers w/ Sources**: {rag_with_sources_pct:.1f}%
+- **RAG: Hallucination Rate**: {halluc_mean:.1f}% (Lower is better)
+- **RAG: Answer Correctness (LLM-judge)**: {(to_f(df["answer_correctness"]).mean() * 100):.1f}%
+- **RAG: Avg Latency (ms)**: {to_f(df["latency_ms"]).mean():.1f}
+"""
     # --- END OF MODIFICATION ---
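How `summary_text` and the per-question DataFrame are persisted is outside this diff. A typical output step for a Space like this one might look as follows; the file names are assumptions, not taken from this commit:

```python
# Hypothetical output step -- paths and format are not specified in this commit.
df.to_csv("evaluation_results.csv", index=False)
with open("evaluation_summary.md", "w", encoding="utf-8") as fh:
    fh.write(summary_text)
```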