"""
=============================================================
BAL Chatbot — Step 3: Retrieval Quality Evaluation

Usage:
    python scripts/03_eval_retrieval.py
=============================================================
This script evaluates the RAG system's retrieval quality:
  - Runs a set of pre-defined test questions
  - Evaluates retrieved chunks for each question
  - Writes a report to logs/eval_report.txt
=============================================================
"""

import json
import time
from pathlib import Path

import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# ── Defaults ──────────────────────────────────────────────────────────────────
DEFAULT_INDEX_PATH = "data/bal_faiss.index"
DEFAULT_CHUNKS_PATH = "data/bal_chunks.json"
DEFAULT_MODEL_NAME = "intfloat/multilingual-e5-small"
DEFAULT_REPORT_PATH = "logs/eval_report.txt"

# Recall thresholds used for the per-question status icon and the summary.
PASS_RECALL = 0.7
WARN_RECALL = 0.4

# ── Test Questions ─────────────────────────────────────────────────────────────
# Each question includes expected keywords for evaluation
TEST_QUESTIONS = [
    {
        "question": "BAL'ın kuruluş tarihi nedir?",
        "expected_keywords": ["1953", "Ege Koleji", "Giraud"],
    },
    {
        "question": "Almanca bölümünün LGS taban puanı kaçtır?",
        "expected_keywords": ["484", "Almanca", "taban"],
    },
    {
        "question": "Ayran Günü nedir?",
        "expected_keywords": ["Mayıs", "şenlik", "geleneksel", "müzik"],
    },
    {
        "question": "BALEV bursuna nasıl başvurabilirim?",
        "expected_keywords": ["balev.org.tr", "burs", "başvuru"],
    },
    {
        "question": "Okula metro ile nasıl gidebilirim?",
        "expected_keywords": ["Bornova Metro", "otobüs", "267", "268"],
    },
    {
        "question": "Ultimate Frizbi takımı var mı?",
        "expected_keywords": ["Ultimate Frizbi", "tek lise", "BALspor"],
    },
    {
        "question": "Hazırlık sınıfında ne öğretilir?",
        "expected_keywords": ["yabancı dil", "yoğunlaştırılmış", "hazırlık"],
    },
    {
        "question": "DSD diploması nedir?",
        "expected_keywords": ["Deutsches Sprachdiplom", "Almanca", "diploma"],
    },
    {
        "question": "Okulun vizyon cümlesi nedir?",
        "expected_keywords": ["Geleceğin Aydınlık Sesi", "vizyon"],
    },
    {
        "question": "Pansiyon ücreti ne kadar?",
        "expected_keywords": ["pansiyon", "güncel veri"],
    },
]


def load_artifacts(index_path: str, chunks_path: str, model_name: str):
    """Load the FAISS index, chunk metadata, and embedding model.

    Args:
        index_path: Path to the serialized FAISS index.
        chunks_path: Path to the JSON file holding the chunk metadata.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        A ``(index, chunks, model)`` tuple.

    Raises:
        FileNotFoundError: If the index file or the chunks file is missing.
    """
    # faiss.read_index reports a missing file as RuntimeError, so callers that
    # catch FileNotFoundError would never see it. Check both paths up front.
    missing = [str(p) for p in (index_path, chunks_path) if not Path(p).is_file()]
    if missing:
        raise FileNotFoundError(
            f"Missing retrieval artifact(s): {', '.join(missing)}"
        )

    index = faiss.read_index(str(index_path))
    with open(chunks_path, "r", encoding="utf-8") as f:
        chunks = json.load(f)
    model = SentenceTransformer(model_name)
    return index, chunks, model


def retrieve(query: str, index, chunks, model, top_k: int = 5):
    """Retrieve the top-k most relevant chunks for a query.

    Args:
        query: The user question.
        index: Index object exposing ``search(embeddings, k)``.
        chunks: Chunk records, indexed by the ids the index returns.
        model: Embedding model exposing ``encode``.
        top_k: Number of neighbours requested from the index.

    Returns:
        A list of chunk copies, each with an added float ``"score"`` key, in
        the order returned by the index.
    """
    # "query: " prefix follows the E5 model-family convention — TODO confirm
    # it matches how the index passages were embedded.
    query_text = f"query: {query}"
    embedding = model.encode(
        [query_text], normalize_embeddings=True, convert_to_numpy=True
    ).astype("float32")
    scores, indices = index.search(embedding, top_k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        # FAISS pads the result with -1 when fewer than top_k hits exist.
        if idx < 0:
            continue
        # Copy so adding "score" does not mutate the shared chunk records;
        # int() turns the numpy integer into a plain Python index.
        chunk = chunks[int(idx)].copy()
        chunk["score"] = float(score)
        results.append(chunk)
    return results


def evaluate_retrieval(results, expected_keywords: list) -> dict:
    """Check whether expected keywords are found in the retrieved chunks.

    Matching is a case-insensitive substring test against the concatenated
    ``"text"`` of all results.

    Args:
        results: Chunk dicts as returned by ``retrieve`` (each has ``"score"``).
        expected_keywords: Keywords that should appear in the retrieved text.

    Returns:
        A dict with ``found_keywords``, ``missing_keywords``, ``recall``
        (1.0 when no keywords are expected), ``top_score`` and ``avg_score``
        (both 0.0 when there are no results).
    """
    combined_text = " ".join(r.get("text", "").lower() for r in results)
    found = [kw for kw in expected_keywords if kw.lower() in combined_text]
    recall = len(found) / len(expected_keywords) if expected_keywords else 1.0
    return {
        "found_keywords": found,
        "missing_keywords": [kw for kw in expected_keywords if kw not in found],
        "recall": recall,
        "top_score": results[0]["score"] if results else 0.0,
        "avg_score": (
            sum(r["score"] for r in results) / len(results) if results else 0.0
        ),
    }


def _status_icon(recall: float) -> str:
    """Map a recall value to the pass / warning / fail icon."""
    if recall >= PASS_RECALL:
        return "✅"
    if recall >= WARN_RECALL:
        return "⚠️"
    return "❌"


def run_evaluation(
    index_path: str = DEFAULT_INDEX_PATH,
    chunks_path: str = DEFAULT_CHUNKS_PATH,
    model_name: str = DEFAULT_MODEL_NAME,
    report_path: str = DEFAULT_REPORT_PATH,
    top_k: int = 5,
) -> None:
    """Run the full retrieval evaluation pipeline.

    Loads the artifacts, runs every entry of ``TEST_QUESTIONS`` through
    ``retrieve`` and ``evaluate_retrieval``, prints per-question results and a
    summary, and writes the same information to ``report_path``.

    Args:
        index_path: Path to the FAISS index.
        chunks_path: Path to the chunk metadata JSON.
        model_name: Embedding model name.
        report_path: Destination of the text report; parent directories are
            created if needed.
        top_k: Number of chunks retrieved per question.
    """
    # Load artifacts
    print("Loading artifacts...")
    try:
        index, chunks, model = load_artifacts(index_path, chunks_path, model_name)
    except FileNotFoundError:
        print("❌ Vector database not found. Run 01_build_vectorstore.py first.")
        return
    print(f" ✓ {index.ntotal} chunks loaded\n")

    report_lines = [
        "BAL Chatbot — Retrieval Quality Report",
        "=" * 60,
        f"Date: {time.strftime('%Y-%m-%d %H:%M:%S')}",
        f"Total test questions: {len(TEST_QUESTIONS)}",
        "",
    ]

    recalls = []
    top_scores = []

    for i, test in enumerate(TEST_QUESTIONS, 1):
        question = test["question"]
        expected = test["expected_keywords"]

        results = retrieve(question, index, chunks, model, top_k=top_k)
        eval_result = evaluate_retrieval(results, expected)
        recalls.append(eval_result["recall"])
        top_scores.append(eval_result["top_score"])

        # Terminal output
        status = _status_icon(eval_result["recall"])
        print(f"{status} [{i:02d}] {question}")
        print(
            f" Recall: {eval_result['recall']:.0%} | "
            f"Top Score: {eval_result['top_score']:.3f}"
        )
        if eval_result["missing_keywords"]:
            print(f" Missing: {eval_result['missing_keywords']}")
        print()

        # Report content
        report_lines += [
            f"── Question {i}: {question}",
            f" Recall : {eval_result['recall']:.0%}",
            f" Top Score : {eval_result['top_score']:.3f}",
            f" Avg Score : {eval_result['avg_score']:.3f}",
            f" Found kw : {eval_result['found_keywords']}",
            f" Missing kw : {eval_result['missing_keywords']}",
            "",
            " Chunk titles:",
        ]
        # Only the three best-ranked chunks are listed to keep the report short.
        for r in results[:3]:
            report_lines.append(
                f" [{r['score']:.3f}] {r.get('breadcrumb', '')} — "
                f"{r.get('text', '')[:80]}..."
            )
        report_lines.append("")

    # Summary (guarded so an empty question list does not divide by zero)
    avg_recall = sum(recalls) / len(recalls) if recalls else 0.0
    avg_top = sum(top_scores) / len(top_scores) if top_scores else 0.0
    successful = sum(1 for r in recalls if r >= PASS_RECALL)

    summary = [
        "=" * 60,
        "SUMMARY",
        f" Average Recall : {avg_recall:.0%}",
        f" Average Top Score: {avg_top:.3f}",
        f" Successful (≥{PASS_RECALL:.0%}): {successful}/{len(recalls)}",
    ]
    print("\n".join(summary))
    report_lines += [""] + summary

    # Save
    Path(report_path).parent.mkdir(parents=True, exist_ok=True)
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("\n".join(report_lines))
    print(f"\n📄 Report saved: {report_path}")


if __name__ == "__main__":
    run_evaluation()