KeenWoo committed · verified
Commit 8cc1d0f · Parent: 10823ad

Update evaluate.py

Files changed (1):
  evaluate.py  +70 -34
evaluate.py CHANGED
@@ -530,6 +530,11 @@ def run_comprehensive_evaluation(
             "recall_at_5": recall_at_5, # <-- ADD THIS LINE
             "latency_ms": latency_ms
         })
+
+    # ####################################################################
+    # THIS IS YOUR ORIGINAL RESULTS PRINTOUT SECTION, NOW MODIFIED.
+    # IT IS OUTSIDE THE LOOP AND WILL ALWAYS RUN.
+    # ####################################################################
 
     df = pd.DataFrame(results)
     summary_text, table_rows, headers = "No valid test fixtures found to evaluate.", [], []
@@ -557,55 +562,86 @@
         tf1_mean = to_f(df["topic_f1"]).mean() * 100
         cf1_mean = to_f(df["context_f1"]).mean() * 100
 
-        # Calculate the mean for Faithfulness
-        faith_mean = to_f(df["faithfulness"]).mean() * 100
-        # --- CHANGE 6: Calculate the mean for the new metric ---
-        halluc_mean = to_f(df["hallucination_rate"]).mean() * 100
 
-        rag_with_sources_pct = (df["source_count"] > 0).mean() * 100 if "source_count" in df else 0
 
-        # Add the NLU metrics to the summary f-string
-        # Choose to use Hallucination - **RAG: Faithfulness**: {faith_mean:.1f}%
-        summary_text = f"""## Evaluation Summary
+        # Choose to use Hallucination instead of - **RAG: Faithfulness**: {faith_mean:.1f}%
+
+        # --- START: CORRECTED SUMMARY LOGIC ---
+        # 1. Start building the summary_text string with the common parts
+        summary_text = f"""## Evaluation Summary (Mode: {'NLU-Only' if NLU_ONLY_TEST else 'Full RAG'})
 - **Routing Accuracy**: {pct:.2f}%
 - **Behaviour F1 (avg)**: {bf1_mean:.2f}%
 - **Emotion F1 (avg)**: {ef1_mean:.2f}%
 - **Topic F1 (avg)**: {tf1_mean:.2f}%
+"""
+        # END of summary_text
+
+        # 2. Conditionally append the RAG-specific part to the same string
+        if not NLU_ONLY_TEST:
+            # Calculate RAG-specific metrics from the DataFrame first
+            context_precision_mean = to_f(df["context_precision"]).mean()
+            context_recall_mean = to_f(df["context_recall"]).mean()
+
+            # Calculate F1 score safely, handling potential division by zero
+            if (context_precision_mean + context_recall_mean) > 0:
+                cf1_mean = (2 * context_precision_mean * context_recall_mean) / (context_precision_mean + context_recall_mean) * 100
+            else:
+                cf1_mean = 0.0
+
+            rag_with_sources_pct = (df["source_count"] > 0).mean() * 100 if "source_count" in df else 0
+            # Calculate the mean for Faithfulness
+            faith_mean = to_f(df["faithfulness"]).mean() * 100
+            # halluc_mean = (1 - to_f(df["faithfulness_score"])).mean() * 100
+            halluc_mean = to_f(df["hallucination_rate"]).mean() * 100
+            answer_correctness_mean = to_f(df["answer_correctness"]).mean() * 100
+            latency_mean = to_f(df["latency_ms"]).mean()
+            recall_at_5_mean = to_f(df["recall_at_5"]).mean() * 100
+
+            rag_summary = f"""
 - **Context F1 (avg)**: {cf1_mean:.2f}%
-- **RAG: Context Precision**: {(to_f(df["context_precision"]).mean() * 100):.1f}%
-- **RAG: Context Recall**: {(to_f(df["context_recall"]).mean() * 100):.1f}%
-- **RAG: Recall@5**: {(to_f(df["recall_at_5"]).mean() * 100):.1f}%
-- **RAG Answers w/ Sources**: {rag_with_sources_pct:.1f}%
+- **RAG: Context Precision**: {context_precision_mean * 100:.1f}%
+- **RAG: Context Recall**: {context_recall_mean * 100:.1f}%
+- **RAG: Recall@5**: {recall_at_5_mean:.1f}%
+- **RAG Answers w/ Sources**: {rag_with_sources_pct:.1f}%
 - **RAG: Hallucination Rate**: {halluc_mean:.1f}% (Lower is better)
-- **RAG: Answer Correctness (LLM-judge)**: {(to_f(df["answer_correctness"]).mean() * 100):.1f}%
-- **RAG: Avg Latency (ms)**: {to_f(df["latency_ms"]).mean():.1f}
+- **RAG: Answer Correctness (LLM-judge)**: {answer_correctness_mean:.1f}%
+- **RAG: Avg Latency (ms)**: {latency_mean:.1f}
 """
-        # --- END OF MODIFICATION ---
+            # END rag_summary
+
+            # Append the RAG summary to the main summary_text string
+            summary_text += rag_summary
+        # END RAG component (if not NLU_ONLY_TEST)
+
+        # 3. Print the final summary text to the console
         print(summary_text)
-
+
+        # --- START: CORRECTED CONDITIONAL PRINTOUTS ---
+        # 4. Only print these detailed breakdowns if in Full RAG mode
+        if not NLU_ONLY_TEST:
+            try:
+                cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
+                print("\n📊 Correctness by Category:")
+                print(cat_means.to_string(index=False))
+            except Exception as e:
+                print(f"WARNING: Could not compute category breakdown: {e}")
+
+            try:
+                confusion = pd.crosstab(df.get("category", []), df.get("error_class", []),
+                                        rownames=["Category"], colnames=["Error Class"], dropna=False)
+                print("\n📊 Error Class Distribution by Category:")
+                print(confusion.to_string())
+            except Exception as e:
+                print(f"WARNING: Could not build confusion matrix: {e}")
+        # --- END: CORRECTED CONDITIONAL PRINTOUTS ---
+
+        # 5. Prepare the other return values as usual
        df_display = df.rename(columns={"context_precision": "Ctx. Precision", "context_recall": "Ctx. Recall"})
         table_rows = df_display.values.tolist()
         headers = df_display.columns.tolist()
 
-        # --- NEW: per-category averages ---
-        try:
-            cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
-            print("\n📊 Correctness by Category:")
-            print(cat_means.to_string(index=False))
-        except Exception as e:
-            print(f"WARNING: Could not compute category breakdown: {e}")
-
-        # --- NEW: confusion-style matrix ---
-        try:
-            confusion = pd.crosstab(df.get("category", []), df.get("error_class", []),
-                                    rownames=["Category"], colnames=["Error Class"], dropna=False)
-            print("\n📊 Error Class Distribution by Category:")
-            print(confusion.to_string())
-        except Exception as e:
-            print(f"WARNING: Could not build confusion matrix: {e}")
-        # END
-
     else:
+        # Fallback return
         summary_text = "No valid test fixtures found to evaluate."
         table_rows, headers = [], []
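Note: the hunks above lean on a `to_f(...)` helper that is defined elsewhere in evaluate.py and is not part of this diff. A minimal sketch of what such a coercion helper usually looks like (an assumption, not the committed implementation):

```python
import pandas as pd

def to_f(series: pd.Series) -> pd.Series:
    # Force mixed string/None metric columns to floats so .mean() never raises;
    # unparseable entries fall back to 0.0. The real helper in evaluate.py may differ.
    return pd.to_numeric(series, errors="coerce").fillna(0.0)
```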
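The new "Error Class Distribution by Category" printout is a plain pandas crosstab. A toy run with hypothetical category / error_class values (not taken from evaluate.py) shows the shape of the table it prints:

```python
import pandas as pd

# Hypothetical rows, only to illustrate the crosstab layout used in the commit.
toy = pd.DataFrame({
    "category":    ["rag", "rag", "chitchat", "rag"],
    "error_class": ["none", "hallucination", "none", "retrieval_miss"],
})
confusion = pd.crosstab(toy["category"], toy["error_class"],
                        rownames=["Category"], colnames=["Error Class"], dropna=False)
print(confusion.to_string())
# Prints roughly:
# Error Class  hallucination  none  retrieval_miss
# Category
# chitchat                 0     1               0
# rag                      1     1               1
```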