KeenWoo committed · verified
Commit 8cc1d0f · Parent: 10823ad

Update evaluate.py

Files changed (1):
  evaluate.py  +70 -34
evaluate.py CHANGED
@@ -530,6 +530,11 @@ def run_comprehensive_evaluation(
             "recall_at_5": recall_at_5, # <-- ADD THIS LINE
             "latency_ms": latency_ms
         })
+
+    # ####################################################################
+    # THIS IS YOUR ORIGINAL RESULTS PRINTOUT SECTION, NOW MODIFIED.
+    # IT IS OUTSIDE THE LOOP AND WILL ALWAYS RUN.
+    # ####################################################################
 
     df = pd.DataFrame(results)
     summary_text, table_rows, headers = "No valid test fixtures found to evaluate.", [], []
@@ -557,55 +562,86 @@
         tf1_mean = to_f(df["topic_f1"]).mean() * 100
         cf1_mean = to_f(df["context_f1"]).mean() * 100
 
-        # Calculate the mean for Faithfulness
-        faith_mean = to_f(df["faithfulness"]).mean() * 100
-        # --- CHANGE 6: Calculate the mean for the new metric ---
-        halluc_mean = to_f(df["hallucination_rate"]).mean() * 100
 
-        rag_with_sources_pct = (df["source_count"] > 0).mean() * 100 if "source_count" in df else 0
 
-        # Add the NLU metrics to the summary f-string
-        # Choose to use Hallucination - **RAG: Faithfulness**: {faith_mean:.1f}%
-        summary_text = f"""## Evaluation Summary
+        # Choose to use Hallucination instead of - **RAG: Faithfulness**: {faith_mean:.1f}%
+
+        # --- START: CORRECTED SUMMARY LOGIC ---
+        # 1. Start building the summary_text string with the common parts
+        summary_text = f"""## Evaluation Summary (Mode: {'NLU-Only' if NLU_ONLY_TEST else 'Full RAG'})
 - **Routing Accuracy**: {pct:.2f}%
 - **Behaviour F1 (avg)**: {bf1_mean:.2f}%
 - **Emotion F1 (avg)**: {ef1_mean:.2f}%
 - **Topic F1 (avg)**: {tf1_mean:.2f}%
+"""
+        # END of summary_text
+
+        # 2. Conditionally append the RAG-specific part to the same string
+        if not NLU_ONLY_TEST:
+            # Calculate RAG-specific metrics from the DataFrame first
+            context_precision_mean = to_f(df["context_precision"]).mean()
+            context_recall_mean = to_f(df["context_recall"]).mean()
+
+            # Calculate F1 score safely, handling potential division by zero
+            if (context_precision_mean + context_recall_mean) > 0:
+                cf1_mean = (2 * context_precision_mean * context_recall_mean) / (context_precision_mean + context_recall_mean) * 100
+            else:
+                cf1_mean = 0.0
+
+            rag_with_sources_pct = (df["source_count"] > 0).mean() * 100 if "source_count" in df else 0
+            # Calculate the mean for Faithfulness
+            faith_mean = to_f(df["faithfulness"]).mean() * 100
+            # halluc_mean = (1 - to_f(df["faithfulness_score"])).mean() * 100
+            halluc_mean = to_f(df["hallucination_rate"]).mean() * 100
+            answer_correctness_mean = to_f(df["answer_correctness"]).mean() * 100
+            latency_mean = to_f(df["latency_ms"]).mean()
+            recall_at_5_mean = to_f(df["recall_at_5"]).mean() * 100
+
+            rag_summary = f"""
 - **Context F1 (avg)**: {cf1_mean:.2f}%
-- **RAG: Context Precision**: {(to_f(df["context_precision"]).mean() * 100):.1f}%
-- **RAG: Context Recall**: {(to_f(df["context_recall"]).mean() * 100):.1f}%
-- **RAG: Recall@5**: {(to_f(df["recall_at_5"]).mean() * 100):.1f}%
-- **RAG Answers w/ Sources**: {rag_with_sources_pct:.1f}%
+- **RAG: Context Precision**: {context_precision_mean * 100:.1f}%
+- **RAG: Context Recall**: {context_recall_mean * 100:.1f}%
+- **RAG: Recall@5**: {recall_at_5_mean:.1f}%
+- **RAG Answers w/ Sources**: {rag_with_sources_pct:.1f}%
 - **RAG: Hallucination Rate**: {halluc_mean:.1f}% (Lower is better)
-- **RAG: Answer Correctness (LLM-judge)**: {(to_f(df["answer_correctness"]).mean() * 100):.1f}%
-- **RAG: Avg Latency (ms)**: {to_f(df["latency_ms"]).mean():.1f}
+- **RAG: Answer Correctness (LLM-judge)**: {answer_correctness_mean:.1f}%
+- **RAG: Avg Latency (ms)**: {latency_mean:.1f}
 """
-        # --- END OF MODIFICATION ---
+            # END rag_summary
+
+            # Append the RAG summary to the main summary_text string
+            summary_text += rag_summary
+        # END RAG component (if not NLU_ONLY_TEST)
+
+        # 3. Print the final summary text to the console
         print(summary_text)
-
+
+        # --- START: CORRECTED CONDITIONAL PRINTOUTS ---
+        # 4. Only print these detailed breakdowns if in Full RAG mode
+        if not NLU_ONLY_TEST:
+            try:
+                cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
+                print("\n📊 Correctness by Category:")
+                print(cat_means.to_string(index=False))
+            except Exception as e:
+                print(f"WARNING: Could not compute category breakdown: {e}")
+
+            try:
+                confusion = pd.crosstab(df.get("category", []), df.get("error_class", []),
+                                        rownames=["Category"], colnames=["Error Class"], dropna=False)
+                print("\n📊 Error Class Distribution by Category:")
+                print(confusion.to_string())
+            except Exception as e:
+                print(f"WARNING: Could not build confusion matrix: {e}")
+        # --- END: CORRECTED CONDITIONAL PRINTOUTS ---
+
+        # 5. Prepare the other return values as usual
        df_display = df.rename(columns={"context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall"})
         table_rows = df_display.values.tolist()
         headers = df_display.columns.tolist()
 
-        # --- NEW: per-category averages ---
-        try:
-            cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
-            print("\n📊 Correctness by Category:")
-            print(cat_means.to_string(index=False))
-        except Exception as e:
-            print(f"WARNING: Could not compute category breakdown: {e}")
-
-        # --- NEW: confusion-style matrix ---
-        try:
-            confusion = pd.crosstab(df.get("category", []), df.get("error_class", []),
-                                    rownames=["Category"], colnames=["Error Class"], dropna=False)
-            print("\n📊 Error Class Distribution by Category:")
-            print(confusion.to_string())
-        except Exception as e:
-            print(f"WARNING: Could not build confusion matrix: {e}")
-        # END
-
     else:
+        # Fallback return
         summary_text = "No valid test fixtures found to evaluate."
         table_rows, headers = [], []
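Note: the hunks above lean on a `to_f(...)` helper that is defined elsewhere in evaluate.py and is not part of this diff. A minimal sketch of what such a coercion helper usually looks like (an assumption, not the committed implementation):

```python
import pandas as pd

def to_f(series: pd.Series) -> pd.Series:
    # Force mixed string/None metric columns to floats so .mean() never raises;
    # unparseable entries fall back to 0.0. The real helper in evaluate.py may differ.
    return pd.to_numeric(series, errors="coerce").fillna(0.0)
```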
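The new "Error Class Distribution by Category" printout is a plain pandas crosstab. A toy run with hypothetical category / error_class values (not taken from evaluate.py) shows the shape of the table it prints:

```python
import pandas as pd

# Hypothetical rows, only to illustrate the crosstab layout used in the commit.
toy = pd.DataFrame({
    "category":    ["rag", "rag", "chitchat", "rag"],
    "error_class": ["none", "hallucination", "none", "retrieval_miss"],
})
confusion = pd.crosstab(toy["category"], toy["error_class"],
                        rownames=["Category"], colnames=["Error Class"], dropna=False)
print(confusion.to_string())
# Prints roughly:
# Error Class  hallucination  none  retrieval_miss
# Category
# chitchat                 0     1               0
# rag                      1     1               1
```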