Update evaluate.py

evaluate.py  CHANGED  (+96 -0)
@@ -101,6 +101,35 @@ def _parse_judge_json(raw_str: str) -> dict | None:
     except (json.JSONDecodeError, AttributeError):
         return None
 
+# --- NEW: helpers for categorisation and error-class labelling ---
+def _categorize_test(test_id: str) -> str:
+    tid = (test_id or "").lower()
+    if "synonym" in tid: return "synonym"
+    if "multi_fact" in tid or "multi-hop" in tid or "multihop" in tid: return "multi_fact"
+    if "omission" in tid: return "omission"
+    if "hallucination" in tid: return "hallucination"
+    if "time" in tid or "temporal" in tid: return "temporal"
+    if "context" in tid: return "context_disambig"
+    return "baseline"
+
+def _classify_error(gt: str, gen: str) -> str:
+    import re
+    gt = (gt or "").strip().lower()
+    gen = (gen or "").strip().lower()
+    if not gen:
+        return "empty"
+    if not gt:
+        return "hallucination" if gen else "empty"
+    if gt in gen:
+        return "paraphrase"
+    gt_tokens = set([t for t in re.split(r'\W+', gt) if t])
+    gen_tokens = set([t for t in re.split(r'\W+', gen) if t])
+    overlap = len(gt_tokens & gen_tokens) / max(1, len(gt_tokens))
+    if overlap >= 0.3:
+        return "omission"
+    return "contradiction"
+
+
 def run_comprehensive_evaluation(
     vs_general: FAISS,
     vs_personal: FAISS,
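For reference, a minimal sanity-check sketch of how the two new helpers behave on representative inputs. The expected values follow directly from the rules in the hunk above; the import assumes evaluate.py can be loaded as a module without running the evaluation itself.

# Illustrative check of the new helpers (not part of the commit).
from evaluate import _categorize_test, _classify_error

# _categorize_test keys off substrings of the test_id
assert _categorize_test("synonym_query_03") == "synonym"
assert _categorize_test("multi_fact_chain_1") == "multi_fact"
assert _categorize_test(None) == "baseline"  # missing ids fall back to "baseline"

# _classify_error compares the ground truth against the generated answer
assert _classify_error("", "some answer") == "hallucination"                 # no ground truth
assert _classify_error("Paris", "The capital is Paris.") == "paraphrase"     # substring match
assert _classify_error("blue bicycle", "she owns a blue car") == "omission"  # >= 30% token overlap
assert _classify_error("blue", "red") == "contradiction"                     # little or no overlap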
@@ -212,16 +241,23 @@ run_comprehensive_evaluation(
 
         answer_correctness_score = None
         ground_truth_answer = ground_truth.get("ground_truth_answer")
+        error_class = None  # initialise #NEW
+
         if ground_truth_answer and "ERROR" not in answer_text:
             try:
                 judge_msg = ANSWER_CORRECTNESS_JUDGE_PROMPT.format(ground_truth_answer=ground_truth_answer, generated_answer=answer_text)
                 raw_correctness = call_llm([{"role": "user", "content": judge_msg}], temperature=0.0)
                 correctness_data = _parse_judge_json(raw_correctness)
+
                 if correctness_data and "correctness_score" in correctness_data:
                     answer_correctness_score = float(correctness_data["correctness_score"])
+
             except Exception as e:
                 print(f"ERROR during answer correctness judging: {e}")
 
+            # --- NEW: derive error class for diagnostics ---
+            error_class = _classify_error(ground_truth_answer, answer_text)
+
         faithfulness = None
         source_docs = response.get("source_documents", [])
         if source_docs and "ERROR" not in answer_text:
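To make the judge path above concrete, here is a minimal sketch of the parsing step with the LLM call stubbed out. The reply fields other than correctness_score are an assumption; the code above only relies on that one key, and _parse_judge_json wraps the json.loads call with error handling.

# Illustrative only: fake_call_llm stands in for call_llm() and ANSWER_CORRECTNESS_JUDGE_PROMPT.
import json

def fake_call_llm(messages, temperature=0.0):
    # Hypothetical judge reply; only "correctness_score" is required by the code above.
    return '{"correctness_score": 0.75, "reasoning": "covers most of the ground truth"}'

raw_correctness = fake_call_llm([{"role": "user", "content": "..."}], temperature=0.0)
correctness_data = json.loads(raw_correctness)
if correctness_data and "correctness_score" in correctness_data:
    answer_correctness_score = float(correctness_data["correctness_score"])
    print(answer_correctness_score)  # 0.75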
@@ -241,6 +277,9 @@ run_comprehensive_evaluation(
         sources_pretty = ", ".join(sorted(s)) if (s:=actual_sources_set) else ""
         results.append({
             "test_id": fx.get("test_id", "N/A"), "title": fx.get("title", "N/A"),
+            # NEW for debugging
+            "category": _categorize_test(test_id), "error_class": error_class,
+            # END
             "route_correct": "✅" if route_correct else "❌", "expected_route": expected_route, "actual_route": actual_route,
             "behavior_f1": f"{behavior_metrics['f1_score']:.2f}", "emotion_f1": f"{emotion_metrics['f1_score']:.2f}",
             "topic_f1": f"{topic_metrics['f1_score']:.2f}", "context_f1": f"{context_metrics['f1_score']:.2f}",
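For orientation, an illustrative shape of one appended results row after this change; the values are made up, and the metric columns beyond those shown are unchanged from the existing script.

# Hypothetical row, showing where the two new keys sit in the results schema.
example_row = {
    "test_id": "synonym_query_03", "title": "Synonym phrasing",  # made-up fixture
    "category": "synonym",        # from _categorize_test(test_id)
    "error_class": "paraphrase",  # from _classify_error(...); stays None without a ground-truth answer
    "route_correct": "✅", "expected_route": "personal", "actual_route": "personal",
    "behavior_f1": "1.00", "emotion_f1": "0.67",
    "topic_f1": "0.50", "context_f1": "1.00",
}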
@@ -262,6 +301,63 @@ run_comprehensive_evaluation(
     df = df[[c for c in cols if c in df.columns]]
     df.to_csv(output_path, index=False, encoding="utf-8")
     print(f"Evaluation results saved to {output_path}")
+
+
+    # --- NEW: write detailed results to a log file instead of CSV ---
+    log_path = Path(__file__).parent / "evaluation_log.txt"
+    with open(log_path, "a", encoding="utf-8") as logf:
+        logf.write("\n===== Detailed Evaluation Run =====\n")
+        logf.write(df.to_string(index=False))
+        logf.write("\n\n")
+
+    # --- NEW: per-category averages ---
+    try:
+        cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
+        print("\n📊 Correctness by Category:")
+        print(cat_means.to_string(index=False))
+        with open("evaluation_log.txt", "a", encoding="utf-8") as logf:
+            logf.write("\n📊 Correctness by Category:\n")
+            logf.write(cat_means.to_string(index=False))
+            logf.write("\n")
+    except Exception as e:
+        print(f"WARNING: Could not compute category breakdown: {e}")
+
+    # --- NEW: confusion-style matrix ---
+    try:
+        confusion = pd.crosstab(df.get("category", []), df.get("error_class", []),
+                                rownames=["Category"], colnames=["Error Class"], dropna=False)
+        print("\n📊 Error Class Distribution by Category:")
+        print(confusion.to_string())
+        with open("evaluation_log.txt", "a", encoding="utf-8") as logf:
+            logf.write("\n📊 Error Class Distribution by Category:\n")
+            logf.write(confusion.to_string())
+            logf.write("\n")
+    except Exception as e:
+        print(f"WARNING: Could not build confusion matrix: {e}")
+
+
+    # NEW: save detailed results
+    df.to_csv("evaluation_results_detailed.csv", index=False, encoding="utf-8")
+
+    # NEW: per-category averages
+    try:
+        cat_means = df.groupby("category")["answer_correctness"].mean().reset_index()
+        print("\n📊 Correctness by Category:")
+        print(cat_means.to_string(index=False))
+        cat_means.to_csv("evaluation_correctness_by_category.csv", index=False)
+    except Exception as e:
+        print(f"WARNING: Could not compute category breakdown: {e}")
+
+    # NEW: confusion-style matrix
+    try:
+        confusion = pd.crosstab(df.get("category", []), df.get("error_class", []),
+                                rownames=["Category"], colnames=["Error Class"], dropna=False)
+        print("\n📊 Error Class Distribution by Category:")
+        print(confusion.to_string())
+        confusion.to_csv("evaluation_confusion_matrix.csv")
+    except Exception as e:
+        print(f"WARNING: Could not build confusion matrix: {e}")
+
 
     pct = df["route_correct"].value_counts(normalize=True).get("✅", 0) * 100
     to_f = lambda s: pd.to_numeric(s, errors="coerce")
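A minimal sketch of consuming the artifacts the new code writes; file names are taken from the hunk above, and the paths assume the evaluation was run from the script's directory.

import pandas as pd

# Detailed per-test rows, including the new "category" and "error_class" columns.
detailed = pd.read_csv("evaluation_results_detailed.csv")

# Aggregates written by the new code.
by_category = pd.read_csv("evaluation_correctness_by_category.csv")
confusion = pd.read_csv("evaluation_confusion_matrix.csv", index_col=0)

print(by_category.to_string(index=False))
print(confusion.to_string())

# Roughly the same matrix can be rebuilt from the detailed rows:
rebuilt = pd.crosstab(detailed["category"], detailed["error_class"],
                      rownames=["Category"], colnames=["Error Class"], dropna=False)
print(rebuilt.to_string())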