Update evaluate.py
evaluate.py (+70 -34)
CHANGED
@@ -530,6 +530,11 @@ def run_comprehensive_evaluation(
             "recall_at_5": recall_at_5, # <-- ADD THIS LINE
             "latency_ms": latency_ms
         })
+
+    # ####################################################################
+    # THIS IS YOUR ORIGINAL RESULTS PRINTOUT SECTION, NOW MODIFIED.
+    # IT IS OUTSIDE THE LOOP AND WILL ALWAYS RUN.
+    # ####################################################################
 
     df = pd.DataFrame(results)
     summary_text, table_rows, headers = "No valid test fixtures found to evaluate.", [], []

@@ -557,55 +562,86 @@
         tf1_mean = to_f(df["topic_f1"]).mean() * 100
         cf1_mean = to_f(df["context_f1"]).mean() * 100
 
-        # Calculate the mean for Faithfulness
-        faith_mean = to_f(df["faithfulness"]).mean() * 100
-        # --- CHANGE 6: Calculate the mean for the new metric ---
-        halluc_mean = to_f(df["hallucination_rate"]).mean() * 100
 
-        rag_with_sources_pct = (df["source_count"] > 0).mean() * 100 if "source_count" in df else 0
 
-        #
-
-
+        # Choose to use Hallucination instead of: - **RAG: Faithfulness**: {faith_mean:.1f}%
+
+        # --- START: CORRECTED SUMMARY LOGIC ---
+        # 1. Start building the summary_text string with the common parts
+        summary_text = f"""## Evaluation Summary (Mode: {'NLU-Only' if NLU_ONLY_TEST else 'Full RAG'})
 - **Routing Accuracy**: {pct:.2f}%
 - **Behaviour F1 (avg)**: {bf1_mean:.2f}%
 - **Emotion F1 (avg)**: {ef1_mean:.2f}%
 - **Topic F1 (avg)**: {tf1_mean:.2f}%
+"""
+        # END of summary_text
+
+        # 2. Conditionally append the RAG-specific part to the same string
+        if not NLU_ONLY_TEST:
+            # Calculate RAG-specific metrics from the DataFrame first
+            context_precision_mean = to_f(df["context_precision"]).mean()
+            context_recall_mean = to_f(df["context_recall"]).mean()
+
+            # Calculate F1 score safely, handling potential division by zero
+            if (context_precision_mean + context_recall_mean) > 0:
+                cf1_mean = (2 * context_precision_mean * context_recall_mean) / (context_precision_mean + context_recall_mean) * 100
+            else:
+                cf1_mean = 0.0
+
+            rag_with_sources_pct = (df["source_count"] > 0).mean() * 100 if "source_count" in df else 0
+            # Calculate the mean for Faithfulness
+            faith_mean = to_f(df["faithfulness"]).mean() * 100
+            # halluc_mean = (1 - to_f(df["faithfulness_score"])).mean() * 100
+            halluc_mean = to_f(df["hallucination_rate"]).mean() * 100
+            answer_correctness_mean = to_f(df["answer_correctness"]).mean() * 100
+            latency_mean = to_f(df["latency_ms"]).mean()
+            recall_at_5_mean = to_f(df["recall_at_5"]).mean() * 100
+
+            rag_summary = f"""
 - **Context F1 (avg)**: {cf1_mean:.2f}%
-- **RAG: Context Precision**: {
-- **RAG: Context Recall**: {
-- **RAG: Recall@5**: {
-- **RAG
+- **RAG: Context Precision**: {context_precision_mean * 100:.1f}%
+- **RAG: Context Recall**: {context_recall_mean * 100:.1f}%
+- **RAG: Recall@5**: {recall_at_5_mean:.1f}%
+- **RAG Answers w/ Sources**: {rag_with_sources_pct:.1f}%
 - **RAG: Hallucination Rate**: {halluc_mean:.1f}% (Lower is better)
-- **RAG: Answer Correctness (LLM-judge)**: {
-- **RAG: Avg Latency (ms)**: {
+- **RAG: Answer Correctness (LLM-judge)**: {answer_correctness_mean:.1f}%
+- **RAG: Avg Latency (ms)**: {latency_mean:.1f}
 """
+            # END rag_summary
+
+            # Append the RAG summary to the main summary_text string
+            summary_text += rag_summary
+        # END RAG component (if not NLU_ONLY_TEST)
+
+        # 3. Print the final summary text to the console
         print(summary_text)
 
+        # --- START: CORRECTED CONDITIONAL PRINTOUTS ---
+        # 4. Only print these detailed breakdowns if in Full RAG mode
+        if not NLU_ONLY_TEST:
+            try:
+                cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
+                print("\nπ Correctness by Category:")
+                print(cat_means.to_string(index=False))
+            except Exception as e:
+                print(f"WARNING: Could not compute category breakdown: {e}")
+
+            try:
+                confusion = pd.crosstab(df.get("category", []), df.get("error_class", []),
+                                        rownames=["Category"], colnames=["Error Class"], dropna=False)
+                print("\nπ Error Class Distribution by Category:")
+                print(confusion.to_string())
+            except Exception as e:
+                print(f"WARNING: Could not build confusion matrix: {e}")
+        # --- END: CORRECTED CONDITIONAL PRINTOUTS ---
+
+        # 5. Prepare the other return values as usual
         df_display = df.rename(columns={"context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall"})
         table_rows = df_display.values.tolist()
        headers = df_display.columns.tolist()
-
-        # --- NEW: per-category averages ---
-        try:
-            cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
-            print("\nπ Correctness by Category:")
-            print(cat_means.to_string(index=False))
-        except Exception as e:
-            print(f"WARNING: Could not compute category breakdown: {e}")
-
-        # --- NEW: confusion-style matrix ---
-        try:
-            confusion = pd.crosstab(df.get("category", []), df.get("error_class", []),
-                                    rownames=["Category"], colnames=["Error Class"], dropna=False)
-            print("\nπ Error Class Distribution by Category:")
-            print(confusion.to_string())
-        except Exception as e:
-            print(f"WARNING: Could not build confusion matrix: {e}")
-        # END
-
 
     else:
+        # Fallback return
         summary_text = "No valid test fixtures found to evaluate."
         table_rows, headers = [], []
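A note on the corrected summary logic: inside `if not NLU_ONLY_TEST:`, the change recomputes `cf1_mean` as the harmonic mean of the mean context precision and mean context recall, overwriting the `cf1_mean` derived from per-row `context_f1` values a few lines earlier; the two quantities are not generally equal, which is worth keeping in mind when comparing runs. A standalone sketch of the guarded computation (names mirror the diff; the example numbers are illustrative):

```python
# Harmonic-mean F1 from mean precision and mean recall, with the same
# division-by-zero guard as the diff. Inputs are fractions in [0, 1];
# the return value is a percentage.
def context_f1(precision_mean: float, recall_mean: float) -> float:
    denom = precision_mean + recall_mean
    if denom > 0:
        return (2 * precision_mean * recall_mean) / denom * 100
    return 0.0

print(context_f1(0.8, 0.6))  # ~68.57
print(context_f1(0.0, 0.0))  # 0.0 (guard path)
```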
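The metric lines all funnel through a `to_f` helper that this hunk uses but does not define. A minimal stand-in with the behaviour the call sites imply (coerce a column to floats so `.mean()` skips bad values) might look like the sketch below; this is an assumption for readability, not the actual definition in evaluate.py:

```python
import pandas as pd

def to_f(series: pd.Series) -> pd.Series:
    # Hypothetical stand-in for evaluate.py's to_f helper: coerce to
    # numeric, mapping unparseable entries to NaN so .mean() ignores them.
    return pd.to_numeric(series, errors="coerce")
```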
"""
|
| 577 |
+
# END of summary_text
|
| 578 |
+
|
| 579 |
+
# 2. Conditionally append the RAG-specific part to the same string
|
| 580 |
+
if not NLU_ONLY_TEST:
|
| 581 |
+
# Calculate RAG-specific metrics from the DataFrame first
|
| 582 |
+
context_precision_mean = to_f(df["context_precision"]).mean()
|
| 583 |
+
context_recall_mean = to_f(df["context_recall"]).mean()
|
| 584 |
+
|
| 585 |
+
# Calculate F1 score safely, handling potential division by zero
|
| 586 |
+
if (context_precision_mean + context_recall_mean) > 0:
|
| 587 |
+
cf1_mean = (2 * context_precision_mean * context_recall_mean) / (context_precision_mean + context_recall_mean) * 100
|
| 588 |
+
else:
|
| 589 |
+
cf1_mean = 0.0
|
| 590 |
+
|
| 591 |
+
rag_with_sources_pct = (df["source_count"] > 0).mean() * 100 if "source_count" in df else 0
|
| 592 |
+
# Calculate the mean for Faithfulness
|
| 593 |
+
faith_mean = to_f(df["faithfulness"]).mean() * 100
|
| 594 |
+
# halluc_mean = (1 - to_f(df["faithfulness_score"])).mean() * 100
|
| 595 |
+
halluc_mean = to_f(df["hallucination_rate"]).mean() * 100
|
| 596 |
+
answer_correctness_mean = to_f(df["answer_correctness"]).mean() * 100
|
| 597 |
+
latency_mean = to_f(df["latency_ms"]).mean()
|
| 598 |
+
recall_at_5_mean = to_f(df["recall_at_5"]).mean() * 100
|
| 599 |
+
|
| 600 |
+
rag_summary = f"""
|
| 601 |
- **Context F1 (avg)**: {cf1_mean:.2f}%
|
| 602 |
+
- **RAG: Context Precision**: {context_precision_mean * 100:.1f}%
|
| 603 |
+
- **RAG: Context Recall**: {context_recall_mean * 100:.1f}%
|
| 604 |
+
- **RAG: Recall@5**: {recall_at_5_mean:.1f}%
|
| 605 |
+
- **RAG Answers w/ Sources**: {rag_with_sources_pct:.1f}%
|
| 606 |
- **RAG: Hallucination Rate**: {halluc_mean:.1f}% (Lower is better)
|
| 607 |
+
- **RAG: Answer Correctness (LLM-judge)**: {answer_correctness_mean:.1f}%
|
| 608 |
+
- **RAG: Avg Latency (ms)**: {latency_mean:.1f}
|
| 609 |
"""
|
| 610 |
+
# END rag_summary
|
| 611 |
+
|
| 612 |
+
# Append the RAG summary to the main summary_text string
|
| 613 |
+
summary_text += rag_summary
|
| 614 |
+
# END RAG component if not NLU_ONLY_TEST:
|
| 615 |
+
|
| 616 |
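The confusion-style matrix in step 4 is a plain `pandas.crosstab` over two label columns; the `df.get(..., [])` fallback avoids a `KeyError` when a column is missing, and anything else is absorbed by the surrounding `try/except`. A toy run (made-up categories, purely illustrative) shows the shape of the printout:

```python
import pandas as pd

# Illustrative data only; real categories and error classes come from the fixtures.
toy = pd.DataFrame({
    "category":    ["faq", "faq", "smalltalk", "smalltalk"],
    "error_class": ["none", "hallucination", "none", "none"],
})
confusion = pd.crosstab(toy["category"], toy["error_class"],
                        rownames=["Category"], colnames=["Error Class"], dropna=False)
print(confusion.to_string())
# Error Class  hallucination  none
# Category
# faq                      1     1
# smalltalk                0     2
```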
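Finally, `recall_at_5` is only averaged here; the per-case value is computed earlier in the file, outside this diff. For reference, the usual retrieval definition of recall@k is sketched below; whether the upstream code matches it exactly is an assumption:

```python
def recall_at_k(retrieved_ids: list[str], relevant_ids: set[str], k: int = 5) -> float:
    # Fraction of the relevant documents that appear in the top-k results.
    if not relevant_ids:
        return 0.0
    hits = sum(1 for doc_id in retrieved_ids[:k] if doc_id in relevant_ids)
    return hits / len(relevant_ids)

print(recall_at_k(["a", "b", "c"], {"a", "z"}))  # 0.5
```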