Spaces:

Inframat-x
/

ML-Chatbot

Sleeping

Inframat-x committed on Nov 3, 2025

Commit

4a81d08

verified ·

1 Parent(s): 5310e9a

Create rag_eval_metrics.py

Files changed (1) hide show

rag_eval_metrics.py ADDED Viewed

+# rag_eval_metrics.py
+import json
+import pandas as pd
+from pathlib import Path
+from sklearn.metrics import recall_score
+LOGS_PATH = Path("rag_artifacts/logs.jsonl")  # JSON Lines log file read by load_logs()
+GOLD_PATH = Path("gold.csv")  # gold question set (CSV) read by load_gold()
def load_logs(path=None):
    """Load the JSON Lines interaction log into a DataFrame.

    Every non-blank line of the file is parsed as one JSON document. Blank
    lines are ignored, and lines that are not valid JSON (for example a
    truncated final line) are reported and skipped instead of aborting the
    whole load.

    Args:
        path: Optional location of the log file. When omitted, the
            module-level ``LOGS_PATH`` is used.

    Returns:
        A DataFrame with one row per parsed record, or an empty DataFrame
        when the file does not exist.
    """
    log_path = LOGS_PATH if path is None else Path(path)
    if not log_path.exists():
        print("❌ logs.jsonl not found in rag_artifacts/")
        return pd.DataFrame()

    records = []
    with open(log_path, "r", encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # json.loads("") raises; blank/trailing lines carry no record.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as exc:
                print(f"⚠️ Skipping malformed JSON on line {line_no} of {log_path}: {exc}")
    return pd.DataFrame(records)
def load_gold(path=None):
    """Load the gold question set from CSV into a DataFrame.

    Args:
        path: Optional location of the CSV file. When omitted, the
            module-level ``GOLD_PATH`` is used.

    Returns:
        The parsed DataFrame, or an empty DataFrame when the file does not
        exist.
    """
    gold_path = GOLD_PATH if path is None else Path(path)
    if not gold_path.exists():
        print("❌ gold.csv not found in repo")
        return pd.DataFrame()
    # pandas' default NA parsing turns the literal text "null" (and "NA",
    # "None", ...) into NaN, which would erase the "null" question type that
    # evaluate() compares against. Treat only empty cells as missing.
    return pd.read_csv(gold_path, keep_default_na=False, na_values=[""])
def evaluate():
    """Score the logged chatbot answers against the gold question set.

    Left-joins the gold questions with the logged answers on ``question``
    and prints:

    * the number of gold questions evaluated,
    * recall on gold questions whose ``type`` is ``"null"``, where a
      prediction counts as "null" when the logged answer contains one of
      the refusal phrases matched below,
    * the number of gold questions with no logged answer.

    Prints a diagnostic and returns early when either input is missing or
    empty, or when a required column is absent. Returns None.
    """
    logs = load_logs()
    gold = load_gold()
    if logs.empty or gold.empty:
        print("⚠️ Missing logs or gold — evaluation aborted.")
        return

    # Fail with a readable message instead of a bare KeyError further down.
    missing = [f"gold.{col}" for col in ("question", "type") if col not in gold.columns]
    missing += [f"logs.{col}" for col in ("question", "answer") if col not in logs.columns]
    if missing:
        print(f"⚠️ Required column(s) missing: {', '.join(missing)} — evaluation aborted.")
        return

    # Only the question/answer pair from the logs is scored. Keeping one row
    # per question (the last one in file order) prevents repeated questions
    # from duplicating gold rows in the join and inflating the counts.
    answers = logs[["question", "answer"]].drop_duplicates(subset="question", keep="last")

    # If gold has its own "answer" column it is renamed to "answer_gold", so
    # df['answer'] always refers to the logged answer.
    df = gold.merge(answers, on="question", how="left", suffixes=("_gold", ""))

    df['is_null'] = df['type'].eq("null").astype(int)
    # astype(str) guards the .str accessor against non-string logged answers.
    df['pred_null'] = df['answer'].fillna("").astype(str).str.contains(
        "not found|insufficient evidence|No indexed PDFs",
        case=False,
    ).astype(int)

    print("=== ✅ Evaluation Results ===")
    print(f"Questions Evaluated: {len(df)}")
    if df['is_null'].any():
        null_recall = recall_score(df['is_null'], df['pred_null'])
        print(f"Null-Question Recall (hallucination control): {null_recall:.2f}")
    else:
        # Recall is undefined without positive samples; reporting 0.00 would
        # read as a total failure rather than "nothing to measure".
        print("Null-Question Recall (hallucination control): n/a (no null-type questions in gold)")
    print(f"Missing answers in logs: {(df['answer'].isna()).sum()}")
+if __name__ == "__main__":  # run the evaluation only when executed as a script, not on import
+    evaluate()