Update evaluate.py
evaluate.py  CHANGED  +28 -8
@@ -284,10 +284,11 @@ def run_comprehensive_evaluation(
         elif not expected_sources_set and not actual_sources_set:
             context_precision, context_recall = 1.0, 1.0
 
-        print("\n" + "-"*20 + " ANSWER & CORRECTNESS EVALUATION " + "-"*20)
-        print(f" - Ground Truth Answer: {ground_truth_answer}")
-        print(f" - Generated Answer: {answer_text}")
-        print("-" * 59)
+        # TURN DEBUG on Answer Correctness
+        # print("\n" + "-"*20 + " ANSWER & CORRECTNESS EVALUATION " + "-"*20)
+        # print(f" - Ground Truth Answer: {ground_truth_answer}")
+        # print(f" - Generated Answer: {answer_text}")
+        # print("-" * 59)
 
         answer_correctness_score = None
         if ground_truth_answer and "ERROR" not in answer_text:
@@ -304,6 +305,7 @@ def run_comprehensive_evaluation(
                 print(f"ERROR during answer correctness judging: {e}")
 
         faithfulness = None
+        hallucination_rate = None
         source_docs = response.get("source_documents", [])
         if source_docs and "ERROR" not in answer_text:
             context_blob = "\n---\n".join([doc.page_content for doc in source_docs])
@@ -314,8 +316,13 @@ def run_comprehensive_evaluation(
                 data = _parse_judge_json(raw)
                 if data:
                     denom = data.get("supported", 0) + data.get("contradicted", 0) + data.get("not_enough_info", 0)
-                    if denom > 0:
-                        faithfulness = round(data.get("supported", 0) / denom, 3)
+                    if denom > 0:
+                        faithfulness = round(data.get("supported", 0) / denom, 3)
+                        hallucination_rate = 1.0 - faithfulness
+                    elif data.get("ignored", 0) > 0:
+                        faithfulness = 1.0
+                        hallucination_rate = 0.0
+
             except Exception as e:
                 print(f"ERROR during faithfulness judging: {e}")
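For context, a minimal sketch of how the judge counts above turn into the two scores. The JSON keys (`supported`, `contradicted`, `not_enough_info`, `ignored`) are taken from the `data.get(...)` calls in the hunk; `parse_judge_json` below is a hypothetical stand-in for `_parse_judge_json`, whose real implementation is not part of this diff, and the sample judge output is invented.

```python
import json
import re

def parse_judge_json(raw: str):
    """Hypothetical stand-in for _parse_judge_json: grab the first {...} block from the judge's reply."""
    match = re.search(r"\{.*\}", raw, re.DOTALL)
    if not match:
        return None
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError:
        return None

# Invented judge output: 6 claims were checked against the retrieved context.
raw = '{"supported": 4, "contradicted": 1, "not_enough_info": 1, "ignored": 0}'
data = parse_judge_json(raw)

denom = data.get("supported", 0) + data.get("contradicted", 0) + data.get("not_enough_info", 0)
if denom > 0:
    faithfulness = round(data.get("supported", 0) / denom, 3)  # 4 / 6 -> 0.667
    hallucination_rate = 1.0 - faithfulness                    # ~0.333
elif data.get("ignored", 0) > 0:
    # Nothing verifiable in the answer (e.g. a refusal): treat it as fully faithful.
    faithfulness, hallucination_rate = 1.0, 0.0
```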
@@ -326,9 +333,10 @@ def run_comprehensive_evaluation(
             "behavior_f1": f"{behavior_metrics['f1_score']:.2f}", "emotion_f1": f"{emotion_metrics['f1_score']:.2f}",
             "topic_f1": f"{topic_metrics['f1_score']:.2f}", "context_f1": f"{context_metrics['f1_score']:.2f}",
             "generated_answer": answer_text, "sources": sources_pretty, "source_count": len(actual_sources_set),
-            "faithfulness": faithfulness,
+            "faithfulness": faithfulness, "hallucination_rate": hallucination_rate,
             "context_precision": context_precision, "context_recall": context_recall,
             "answer_correctness": answer_correctness_score,
+            "latency_ms": latency_ms,
             "category": category,
             "error_class": error_class
         })
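The new `latency_ms` field is only recorded here; the timing itself happens earlier, where each test question is sent through the pipeline, and that code is not shown in this diff. A minimal sketch of one way to capture it, assuming the chain is invoked once per question; `rag_chain` and `question` are illustrative names, not taken from this file:

```python
import time

start = time.perf_counter()
response = rag_chain.invoke({"question": question})  # hypothetical pipeline call
latency_ms = (time.perf_counter() - start) * 1000.0  # wall-clock duration in milliseconds
```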
@@ -350,7 +358,15 @@ def run_comprehensive_evaluation(
     tf1_mean = to_f(df["topic_f1"]).mean() * 100
     cf1_mean = to_f(df["context_f1"]).mean() * 100
 
+    # Calculate the mean for Faithfulness
+    faith_mean = to_f(df["faithfulness"]).mean() * 100
+    # --- CHANGE 6: Calculate the mean for the new metric ---
+    halluc_mean = to_f(df["hallucination_rate"]).mean() * 100
+
+    rag_with_sources_pct = (df["source_count"] > 0).mean() * 100 if "source_count" in df else 0
+
     # Add the NLU metrics to the summary f-string
+    # Choose to use Hallucination - **RAG: Faithfulness**: {faith_mean:.1f}%
     summary_text = f"""## Evaluation Summary
 - **Routing Accuracy**: {pct:.2f}%
 - **Behaviour F1 (avg)**: {bf1_mean:.2f}%
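`to_f` is defined elsewhere in evaluate.py. The sketch below assumes it simply coerces a mixed column (floats, formatted strings, `None`) to numeric so that `.mean()` skips missing RAG metrics instead of failing; `rag_with_sources_pct` is then just the fraction of rows that retrieved at least one source. The toy DataFrame is invented for illustration.

```python
import pandas as pd

def to_f(series: pd.Series) -> pd.Series:
    """Assumed behaviour of to_f: non-numeric entries become NaN and drop out of .mean()."""
    return pd.to_numeric(series, errors="coerce")

df = pd.DataFrame({
    "faithfulness": [1.0, 0.8, None],    # None: no sources were retrieved for that row
    "source_count": [3, 2, 0],
})
faith_mean = to_f(df["faithfulness"]).mean() * 100            # (1.0 + 0.8) / 2 -> 90.0
rag_with_sources_pct = (df["source_count"] > 0).mean() * 100  # 2 of 3 rows -> ~66.7
```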
@@ -359,7 +375,11 @@ def run_comprehensive_evaluation(
 - **Context F1 (avg)**: {cf1_mean:.2f}%
 - **RAG: Context Precision**: {(to_f(df["context_precision"]).mean() * 100):.1f}%
 - **RAG: Context Recall**: {(to_f(df["context_recall"]).mean() * 100):.1f}%
-- **RAG: Answer Correctness (LLM-judge)**: {(to_f(df["answer_correctness"]).mean() * 100):.1f}%"""
+- **RAG Answers w/ Sources**: {rag_with_sources_pct:.1f}%
+- **RAG: Hallucination Rate**: {halluc_mean:.1f}% (Lower is better)
+- **RAG: Answer Correctness (LLM-judge)**: {(to_f(df["answer_correctness"]).mean() * 100):.1f}%
+- **RAG: Avg Latency (ms)**: {to_f(df["latency_ms"]).mean():.1f}
+"""
     # --- END OF MODIFICATION ---
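How `summary_text` and the per-question DataFrame are persisted is outside this diff. A typical output step for a Space like this one might look as follows; the file names are assumptions, not taken from this commit:

```python
# Hypothetical output step -- paths and format are not specified in this commit.
df.to_csv("evaluation_results.csv", index=False)
with open("evaluation_summary.md", "w", encoding="utf-8") as fh:
    fh.write(summary_text)
```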