# Source: Hugging Face Hub upload by huzaifa-dangote
# Commit message: "Upload folder using huggingface_hub"
# Commit: 0876544 (verified)
import sys
import math
from pydantic import BaseModel, Field
from dotenv import load_dotenv
from implementation.answer import answer_question, fetch_context
from evaluation.test import TestQuestion, load_tests
# Load variables from a .env file into the process environment at import time;
# override=True lets values from the file replace variables that are already set.
load_dotenv(override=True)
# Model identifier. Not referenced in the visible part of this module --
# presumably consumed by other evaluation code (TODO confirm).
MODEL = "gpt-4.1-mini"
# Vector store name. Also not referenced in the visible part of this module --
# presumably kept in sync with the retrieval implementation (TODO confirm).
db_name = "vector_db"
class RetrievalEval(BaseModel):
    """Evaluation metrics for retrieval performance."""

    # Ranking-quality scores, each averaged over the keywords of one test question.
    mrr: float = Field(
        description="Mean Reciprocal Rank - average across all keywords",
    )
    ndcg: float = Field(
        description="Normalized Discounted Cumulative Gain (binary relevance)",
    )

    # Raw keyword counts behind the coverage figure.
    keywords_found: int = Field(
        description="Number of keywords found in top-k results",
    )
    total_keywords: int = Field(
        description="Total number of keywords to find",
    )

    # keywords_found / total_keywords expressed on a 0-100 scale.
    keyword_coverage: float = Field(
        description="Percentage of keywords found",
    )
def calculate_mrr(keyword: str, retrieved_docs: list) -> float:
    """Return the reciprocal rank of the first document that mentions ``keyword``.

    Matching is a case-insensitive substring test against each document's
    ``page_content``. Ranks are 1-based, so a hit in the first document
    scores 1.0, in the second 0.5, and so on. Returns 0.0 when no document
    contains the keyword (including when ``retrieved_docs`` is empty).
    """
    reciprocal_ranks = (
        1.0 / position
        for position, doc in enumerate(retrieved_docs, start=1)
        if keyword.lower() in doc.page_content.lower()
    )
    # Only the earliest hit counts; fall back to 0.0 when there is none.
    return next(reciprocal_ranks, 0.0)
def calculate_dcg(relevances: list[int], k: int) -> float:
    """Return the Discounted Cumulative Gain of the first ``k`` relevance scores.

    Each score at 0-based position ``p`` is divided by ``log2(p + 2)``, i.e.
    the first item is undiscounted and later items count progressively less.
    Only ``min(k, len(relevances))`` items contribute; a non-positive ``k`` or
    an empty list yields 0.0.
    """
    total = 0.0
    # Pairing with range(k) caps the walk at k items (and at zero items for k <= 0).
    for position, gain in zip(range(k), relevances):
        total += gain / math.log2(position + 2)
    return total
def calculate_ndcg(keyword: str, retrieved_docs: list, k: int = 10) -> float:
    """Return nDCG@k for one keyword using binary, case-insensitive relevance.

    A document among the first ``k`` retrieved counts as relevant (gain 1)
    when its ``page_content`` contains ``keyword`` ignoring case, otherwise
    its gain is 0. The DCG of that ranking is divided by the DCG of the same
    gains sorted best-first. Returns 0.0 when no document in the top ``k``
    contains the keyword.
    """
    needle = keyword.lower()
    gains = [int(needle in doc.page_content.lower()) for doc in retrieved_docs[:k]]

    # Score of the ranking as retrieved.
    actual = calculate_dcg(gains, k)
    # Score of the best reordering of the same documents (all hits first).
    best_possible = calculate_dcg(sorted(gains, reverse=True), k)

    if best_possible > 0:
        return actual / best_possible
    return 0.0
def evaluate_retrieval(test: TestQuestion, k: int = 10) -> RetrievalEval:
    """
    Evaluate retrieval performance for a test question.

    Args:
        test: TestQuestion object containing question and keywords
        k: Rank cutoff applied to the retrieved documents (default 10);
            MRR, nDCG and keyword coverage are all computed on the top k.

    Returns:
        RetrievalEval object with MRR and nDCG averaged over the keywords,
        the number of keywords found, the total number of keywords, and the
        keyword coverage as a percentage. All averages and the coverage are
        0.0 when the test has no keywords.
    """
    # Retrieve documents using the shared answer module. fetch_context is not
    # told about k, so truncate here: otherwise MRR and coverage would look at
    # documents beyond the cutoff that nDCG (and the "top-k" field
    # descriptions) use.
    top_docs = fetch_context(test.question)[:k]

    # Per-keyword scores over the same top-k list.
    mrr_scores = [calculate_mrr(keyword, top_docs) for keyword in test.keywords]
    ndcg_scores = [calculate_ndcg(keyword, top_docs, k) for keyword in test.keywords]

    avg_mrr = sum(mrr_scores) / len(mrr_scores) if mrr_scores else 0.0
    avg_ndcg = sum(ndcg_scores) / len(ndcg_scores) if ndcg_scores else 0.0

    # A keyword counts as found when it has a non-zero reciprocal rank,
    # i.e. it appears in at least one of the top-k documents.
    keywords_found = sum(1 for score in mrr_scores if score > 0)
    total_keywords = len(test.keywords)
    keyword_coverage = (keywords_found / total_keywords * 100) if total_keywords > 0 else 0.0

    return RetrievalEval(
        mrr=avg_mrr,
        ndcg=avg_ndcg,
        keywords_found=keywords_found,
        total_keywords=total_keywords,
        keyword_coverage=keyword_coverage,
    )
def evaluate_all_retrieval():
    """Lazily evaluate retrieval for every loaded test question.

    Yields one ``(test, result, progress)`` tuple per test, where ``result``
    is the value returned by ``evaluate_retrieval`` for that test and
    ``progress`` is the fraction of tests completed so far (reaching 1.0 on
    the last one). Nothing is yielded when there are no tests.
    """
    cases = load_tests()
    case_count = len(cases)
    completed = 0
    for case in cases:
        completed += 1
        yield case, evaluate_retrieval(case), completed / case_count