rag-chatbot / scripts /evaluate.py
Mobiworks's picture
Initial commit: RAG chatbot
d456104
Raw
History Blame Contribute Delete
2.7 kB
"""
evaluate.py
-----------
Basic evaluation script for the RAG pipeline.
Metrics
-------
* Retrieval accuracy — whether the correct source appears in the top-K results
* Answer non-emptiness — whether the LLM produced a non-trivial response
* Latency — end-to-end response time per query
Usage
-----
python scripts/evaluate.py
"""
import sys
import time
from pathlib import Path
# Put the directory two levels above this file (the parent of this script's
# directory) at the front of sys.path; presumably this is the project root, so
# that the `app` and `components` imports below resolve when the script is run
# directly as `python scripts/evaluate.py`.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from app.chatbot import Chatbot
from app.config import VECTOR_DB_PATH
from components.embedder import HuggingFaceEmbedder
from components.vector_store import VectorStore
# ── Sample eval set (query, expected_keyword_in_answer) ──────────────────────
# Edit these to match the documents you have ingested.
EVAL_SET = [
    # Each entry is (query sent to the chatbot, keyword expected in its answer).
    # The keyword match performed by the evaluation loop is case-insensitive.
    ("What is RAG?", "retrieval"),
    ("How does embedding work?", "vector"),
    ("What is a vector store?", "similarity"),
    ("Explain chunking", "overlap"),
    ("What models are used in this stack?", "mistral"),
]
def main() -> None:
    """Run every query in ``EVAL_SET`` through the chatbot and print a report.

    For each ``(query, keyword)`` pair the chatbot's answer is checked for two
    things:

    * the keyword occurs in the answer (case-insensitive), and
    * the stripped answer is longer than 20 characters.

    A query passes only when both checks hold.  Per-query results (including
    the sources returned with the response and the measured latency) are
    printed as they complete, followed by an overall score and the average
    latency.

    Exits with status 1 if ``store.load()`` returns a falsy value, i.e. no
    vector index could be loaded.
    """
    banner = "=" * 60
    # An answer at or below this many characters (after stripping) is
    # treated as trivial and fails the non-emptiness check.
    min_answer_chars = 20

    print("\n" + banner)
    print(" RAG Chatbot — Evaluation")
    print(banner + "\n")

    embedder = HuggingFaceEmbedder()
    store = VectorStore(embedder=embedder, index_path=VECTOR_DB_PATH)
    if not store.load():
        print("❌ No vector index found. Run scripts/ingest.py first.")
        sys.exit(1)

    chatbot = Chatbot(vector_store=store)
    passed = 0
    results = []

    for i, (query, keyword) in enumerate(EVAL_SET, start=1):
        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be skewed by system clock adjustments.
        t0 = time.perf_counter()
        response = chatbot.chat(query)
        latency = time.perf_counter() - t0

        hit = keyword.lower() in response.answer.lower()
        non_empty = len(response.answer.strip()) > min_answer_chars
        ok = hit and non_empty
        passed += int(ok)

        results.append({
            "q": query,
            "keyword": keyword,
            "hit": hit,
            "non_empty": non_empty,
            "latency": latency,
            "ok": ok,
        })

        status = "✅" if ok else "❌"
        print(f" [{i}] {status} '{query}'")
        print(f" keyword={keyword!r} found={hit} "
              f"non_empty={non_empty} latency={latency:.2f}s")
        print(f" Sources: {response.sources}")
        print()

    total = len(EVAL_SET)
    # Guard the divisions: an empty EVAL_SET should yield a 0/0 summary
    # rather than a ZeroDivisionError.
    pct = 100 * passed / total if total else 0.0
    avg_lat = sum(r["latency"] for r in results) / total if total else 0.0

    print(banner)
    print(f" Score : {passed}/{total} ({pct:.0f}%)")
    print(f" Avg latency: {avg_lat:.2f}s")
    print(banner + "\n")


# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()