Spaces:

brk9999
/

bal-chatbot

Running

App Files Files Community

bal-chatbot / scripts /03_eval_retrieval.py

brk9999

Upload folder using huggingface_hub

bd323cc verified 6 days ago

Raw

History Blame Contribute Delete

6.83 kB

	"""
	=============================================================
	BAL Chatbot — Step 3: Retrieval Quality Evaluation
	Usage: python scripts/03_eval_retrieval.py
	=============================================================
	This script evaluates the RAG system's retrieval quality:
	- Runs a set of pre-defined test questions
	- Evaluates retrieved chunks for each question
	- Writes a report to logs/eval_report.txt
	=============================================================
	"""

	import json
	import time
	from pathlib import Path

	import numpy as np
	import faiss
	from sentence_transformers import SentenceTransformer

	# ── Test Questions ─────────────────────────────────────────────────────────────
	# Evaluation set. Every entry pairs a question with the keywords that the
	# retrieved chunks are expected to contain; evaluate_retrieval() matches them
	# case-insensitively as substrings of the combined chunk text.
	TEST_QUESTIONS = [
	    {
	        "question": "BAL'ın kuruluş tarihi nedir?",
	        "expected_keywords": ["1953", "Ege Koleji", "Giraud"],
	    },
	    {
	        "question": "Almanca bölümünün LGS taban puanı kaçtır?",
	        "expected_keywords": ["484", "Almanca", "taban"],
	    },
	    {
	        "question": "Ayran Günü nedir?",
	        "expected_keywords": ["Mayıs", "şenlik", "geleneksel", "müzik"],
	    },
	    {
	        "question": "BALEV bursuna nasıl başvurabilirim?",
	        "expected_keywords": ["balev.org.tr", "burs", "başvuru"],
	    },
	    {
	        "question": "Okula metro ile nasıl gidebilirim?",
	        "expected_keywords": ["Bornova Metro", "otobüs", "267", "268"],
	    },
	    {
	        "question": "Ultimate Frizbi takımı var mı?",
	        "expected_keywords": ["Ultimate Frizbi", "tek lise", "BALspor"],
	    },
	    {
	        "question": "Hazırlık sınıfında ne öğretilir?",
	        "expected_keywords": ["yabancı dil", "yoğunlaştırılmış", "hazırlık"],
	    },
	    {
	        "question": "DSD diploması nedir?",
	        "expected_keywords": ["Deutsches Sprachdiplom", "Almanca", "diploma"],
	    },
	    {
	        "question": "Okulun vizyon cümlesi nedir?",
	        "expected_keywords": ["Geleceğin Aydınlık Sesi", "vizyon"],
	    },
	    {
	        "question": "Pansiyon ücreti ne kadar?",
	        "expected_keywords": ["pansiyon", "güncel veri"],
	    },
	]


	def load_artifacts(index_path: str, chunks_path: str, model_name: str):
	    """Load the FAISS index, the chunk metadata, and the embedding model.

	    Args:
	        index_path: Path of the serialized FAISS index file.
	        chunks_path: Path of the UTF-8 JSON file holding the chunk metadata.
	        model_name: Name or path handed to ``SentenceTransformer``.

	    Returns:
	        A ``(index, chunks, model)`` tuple.

	    Raises:
	        FileNotFoundError: If ``index_path`` or ``chunks_path`` does not
	            point to an existing file.
	    """
	    # faiss.read_index reports a missing file as RuntimeError, which a caller
	    # catching FileNotFoundError would miss. Check both files up front so a
	    # missing artifact always surfaces as FileNotFoundError.
	    for artifact in (index_path, chunks_path):
	        if not Path(artifact).is_file():
	            raise FileNotFoundError(f"Artifact not found: {artifact}")

	    index = faiss.read_index(index_path)
	    with open(chunks_path, "r", encoding="utf-8") as f:
	        chunks = json.load(f)
	    model = SentenceTransformer(model_name)
	    return index, chunks, model


	def retrieve(query: str, index, chunks, model, top_k: int = 5):
	    """Return up to ``top_k`` chunks that best match ``query``.

	    The query is prefixed with ``"query: "``, embedded with ``model`` as a
	    normalized float32 vector, and searched in ``index``. Each hit is a
	    shallow copy of the matching entry of ``chunks`` with an added
	    ``"score"`` key, so the entries in ``chunks`` are not modified.
	    """
	    prefixed = f"query: {query}"
	    vector = model.encode(
	        [prefixed],
	        normalize_embeddings=True,
	        convert_to_numpy=True,
	    )
	    vector = vector.astype("float32")

	    similarities, positions = index.search(vector, top_k)

	    hits = []
	    # Only one query was submitted, so row 0 holds all of its results.
	    for similarity, position in zip(similarities[0], positions[0]):
	        # A position of -1 is treated as an empty result slot and skipped.
	        if position != -1:
	            hit = chunks[position].copy()
	            hit["score"] = float(similarity)
	            hits.append(hit)
	    return hits


	def evaluate_retrieval(results, expected_keywords: list) -> dict:
	    """Score retrieved chunks against the keywords they should contain.

	    A keyword counts as found when its lowercase form is a substring of the
	    lowercased ``"text"`` fields of ``results`` joined by spaces.

	    Returns a dict with:
	        found_keywords: Expected keywords present in the combined text.
	        missing_keywords: Expected keywords absent from it.
	        recall: Found / expected, or 1.0 when nothing is expected.
	        top_score: ``"score"`` of the first result, or 0.0 without results.
	        avg_score: Mean ``"score"`` over all results, or 0.0 without results.
	    """
	    haystack = " ".join(hit.get("text", "").lower() for hit in results)

	    # Partition the expected keywords in one pass, preserving their order.
	    present, absent = [], []
	    for keyword in expected_keywords:
	        bucket = present if keyword.lower() in haystack else absent
	        bucket.append(keyword)

	    if expected_keywords:
	        recall = len(present) / len(expected_keywords)
	    else:
	        recall = 1.0

	    if results:
	        scores = [hit["score"] for hit in results]
	        best = scores[0]
	        mean = sum(scores) / len(scores)
	    else:
	        best = 0.0
	        mean = 0.0

	    return {
	        "found_keywords": present,
	        "missing_keywords": absent,
	        "recall": recall,
	        "top_score": best,
	        "avg_score": mean,
	    }


	def run_evaluation():
	    """Run the full retrieval evaluation pipeline.

	    Loads the index, chunk metadata, and embedding model, retrieves the top 5
	    chunks for every entry of ``TEST_QUESTIONS``, and scores each result with
	    ``evaluate_retrieval``. Per-question results are printed with a status
	    icon (recall >= 0.7 is a success, >= 0.4 a warning, anything lower a
	    failure), followed by a summary. The full report is written to
	    ``logs/eval_report.txt``.

	    Returns:
	        None. If loading raises ``FileNotFoundError`` a hint is printed and
	        the function returns without writing a report.
	    """
	    # Load artifacts
	    print("Loading artifacts...")
	    try:
	        index, chunks, model = load_artifacts(
	            "data/bal_faiss.index",
	            "data/bal_chunks.json",
	            "intfloat/multilingual-e5-small",
	        )
	    except FileNotFoundError:
	        print("❌ Vector database not found. Run 01_build_vectorstore.py first.")
	        return

	    print(f" ✓ {index.ntotal} chunks loaded\n")

	    report_lines = [
	        "BAL Chatbot — Retrieval Quality Report",
	        "=" * 60,
	        f"Date: {time.strftime('%Y-%m-%d %H:%M:%S')}",
	        f"Total test questions: {len(TEST_QUESTIONS)}",
	        "",
	    ]

	    recalls = []
	    top_scores = []

	    for i, test in enumerate(TEST_QUESTIONS, 1):
	        question = test["question"]
	        expected = test["expected_keywords"]

	        results = retrieve(question, index, chunks, model, top_k=5)
	        eval_result = evaluate_retrieval(results, expected)

	        recalls.append(eval_result["recall"])
	        top_scores.append(eval_result["top_score"])

	        # Terminal output
	        if eval_result["recall"] >= 0.7:
	            status = "✅"
	        elif eval_result["recall"] >= 0.4:
	            status = "⚠️"
	        else:
	            status = "❌"
	        print(f"{status} [{i:02d}] {question}")
	        # A plain "|" separator: "\|" is an invalid escape sequence and would
	        # print a stray backslash.
	        print(f" Recall: {eval_result['recall']:.0%} | Top Score: {eval_result['top_score']:.3f}")
	        if eval_result["missing_keywords"]:
	            print(f" Missing: {eval_result['missing_keywords']}")
	        print()

	        # Report content
	        report_lines += [
	            f"── Question {i}: {question}",
	            f" Recall : {eval_result['recall']:.0%}",
	            f" Top Score : {eval_result['top_score']:.3f}",
	            f" Avg Score : {eval_result['avg_score']:.3f}",
	            f" Found kw : {eval_result['found_keywords']}",
	            f" Missing kw : {eval_result['missing_keywords']}",
	            "",
	            " Chunk titles:",
	        ]
	        # Only the three best-ranked chunks are listed, each cut to 80 chars.
	        for r in results[:3]:
	            report_lines.append(
	                f" [{r['score']:.3f}] {r.get('breadcrumb', '')} — {r.get('text', '')[:80]}..."
	            )
	        report_lines.append("")

	    # Summary (an empty question set yields 0.0 instead of dividing by zero)
	    avg_recall = sum(recalls) / len(recalls) if recalls else 0.0
	    avg_top = sum(top_scores) / len(top_scores) if top_scores else 0.0

	    summary = [
	        "=" * 60,
	        "SUMMARY",
	        f" Average Recall : {avg_recall:.0%}",
	        f" Average Top Score: {avg_top:.3f}",
	        f" Successful (≥70%): {sum(1 for r in recalls if r >= 0.7)}/{len(recalls)}",
	    ]

	    print("\n".join(summary))
	    report_lines += [""] + summary

	    # Save
	    Path("logs").mkdir(exist_ok=True)
	    report_path = "logs/eval_report.txt"
	    with open(report_path, "w", encoding="utf-8") as f:
	        f.write("\n".join(report_lines))
	    print(f"\n📄 Report saved: {report_path}")


	# Script entry point: run the evaluation only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    run_evaluation()