Spaces:

Mobiworks
/

rag-chatbot

Running

File size: 2,698 Bytes

d456104

"""
evaluate.py
-----------
Basic evaluation script for the RAG pipeline.

Metrics
-------
* Keyword hit         — whether the expected keyword appears in the LLM's answer
* Answer non-emptiness — whether the LLM produced a non-trivial response
* Latency             — end-to-end response time per query

Usage
-----
    python scripts/evaluate.py
"""

import sys
import time
from pathlib import Path

# Put this file's grandparent directory at the front of sys.path so the
# `app` and `components` imports below resolve when the script is run
# directly (e.g. `python scripts/evaluate.py`).
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from app.chatbot import Chatbot
from app.config import VECTOR_DB_PATH
from components.embedder import HuggingFaceEmbedder
from components.vector_store import VectorStore

# Evaluation cases: each entry pairs a query with a keyword that is expected
# to appear in the chatbot's answer.  Adjust them to the ingested documents.
EVAL_SET = [
    ("What is RAG?", "retrieval"),
    ("How does embedding work?", "vector"),
    ("What is a vector store?", "similarity"),
    ("Explain chunking", "overlap"),
    ("What models are used in this stack?", "mistral"),
]


def main() -> None:
    """Run every query in ``EVAL_SET`` through the chatbot and print a report.

    For each ``(query, keyword)`` pair the answer passes when it contains
    *keyword* (case-insensitively) and is longer than 20 characters after
    stripping whitespace.  Per-query results, the overall pass rate and the
    mean latency are printed to stdout.

    Exits with status 1 when ``EVAL_SET`` is empty or when ``store.load()``
    reports that no vector index is available.
    """
    print("\n" + "=" * 60)
    print("  RAG Chatbot — Evaluation")
    print("=" * 60 + "\n")

    # Bail out before building the embedder and store: with no queries the
    # percentage and average below would divide by zero.
    if not EVAL_SET:
        print("❌  EVAL_SET is empty. Add at least one (query, keyword) pair.")
        sys.exit(1)

    embedder = HuggingFaceEmbedder()
    store    = VectorStore(embedder=embedder, index_path=VECTOR_DB_PATH)

    if not store.load():
        print("❌  No vector index found. Run scripts/ingest.py first.")
        sys.exit(1)

    chatbot = Chatbot(vector_store=store)

    # Answers at or below this many characters (after stripping) count as empty.
    min_answer_chars = 20

    passed = 0
    results = []

    for i, (query, keyword) in enumerate(EVAL_SET, start=1):
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # system clock adjustments the way time.time() can.
        t0       = time.perf_counter()
        response = chatbot.chat(query)
        latency  = time.perf_counter() - t0

        answer = response.answer
        hit = keyword.lower() in answer.lower()
        non_empty = len(answer.strip()) > min_answer_chars

        ok = hit and non_empty
        passed += int(ok)

        results.append({
            "q":        query,
            "keyword":  keyword,
            "hit":      hit,
            "non_empty": non_empty,
            "latency":  latency,
            "ok":       ok,
        })

        status = "✅" if ok else "❌"
        print(f"  [{i}] {status}  '{query}'")
        print(f"       keyword={keyword!r} found={hit}  "
              f"non_empty={non_empty}  latency={latency:.2f}s")
        print(f"       Sources: {response.sources}")
        print()

    # total is non-zero here thanks to the empty-set guard above.
    total  = len(results)
    pct    = 100 * passed / total
    avg_lat = sum(r["latency"] for r in results) / total

    print("=" * 60)
    print(f"  Score     : {passed}/{total}  ({pct:.0f}%)")
    print(f"  Avg latency: {avg_lat:.2f}s")
    print("=" * 60 + "\n")


# Run the evaluation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()