| import sys |
| import math |
| from pydantic import BaseModel, Field |
| from dotenv import load_dotenv |
| from implementation.answer import answer_question, fetch_context |
| from evaluation.test import TestQuestion, load_tests |
|
|
# Module-level configuration constants. Neither is referenced in the part of
# the file visible here -- presumably consumed elsewhere; confirm before removing.
MODEL = "gpt-4.1-mini"
db_name = "vector_db"

# Load settings from a .env file; override=True lets values from the file
# replace variables that are already set in the process environment.
load_dotenv(override=True)
|
|
class RetrievalEval(BaseModel):
    """Retrieval quality scores for one evaluated question.

    Holds the keyword-averaged ranking metrics (MRR, nDCG) together with
    the raw keyword hit counts and the derived coverage percentage.
    """

    mrr: float = Field(
        description="Mean Reciprocal Rank - average across all keywords",
    )
    ndcg: float = Field(
        description="Normalized Discounted Cumulative Gain (binary relevance)",
    )
    keywords_found: int = Field(
        description="Number of keywords found in top-k results",
    )
    total_keywords: int = Field(
        description="Total number of keywords to find",
    )
    keyword_coverage: float = Field(
        description="Percentage of keywords found",
    )
|
|
def calculate_mrr(keyword: str, retrieved_docs: list) -> float:
    """Return the reciprocal rank of the first document containing *keyword*.

    Matching is a case-insensitive substring test against each document's
    ``page_content``. Ranks are 1-based, so a hit in the first document
    scores 1.0, in the second 0.5, and so on. Returns 0.0 when no document
    contains the keyword. (The mean over keywords is taken by the caller.)
    """
    first_hit_rank = next(
        (
            position
            for position, document in enumerate(retrieved_docs, start=1)
            if keyword.lower() in document.page_content.lower()
        ),
        None,
    )
    if first_hit_rank is None:
        return 0.0
    return 1.0 / first_hit_rank
|
|
def calculate_dcg(relevances: list[int], k: int) -> float:
    """Return the Discounted Cumulative Gain of the first *k* relevance scores.

    Each score at 0-based position ``p`` is divided by ``log2(p + 2)``, so
    the first result is undiscounted (``log2(2) == 1``) and later results
    contribute progressively less. Positions beyond ``k`` or beyond the end
    of ``relevances`` are ignored; a non-positive ``k`` yields 0.0.
    """
    cutoff = min(k, len(relevances))
    total = 0.0
    # zip against range(cutoff) stops at the cutoff (and yields nothing when
    # the cutoff is zero or negative) without indexing into the list.
    for position, gain in zip(range(cutoff), relevances):
        total += gain / math.log2(position + 2)
    return total
|
|
def calculate_ndcg(keyword: str, retrieved_docs: list, k: int = 10) -> float:
    """Return nDCG@k for one keyword using binary, case-insensitive relevance.

    A document in the first ``k`` results is relevant (1) when its
    ``page_content`` contains the keyword, otherwise irrelevant (0). The
    ideal ordering is the same relevance list sorted with hits first, so
    the score measures how well the retrieved hits are ranked rather than
    how many relevant documents exist overall. Returns 0.0 when nothing in
    the top ``k`` matches.
    """
    needle = keyword.lower()
    hits = [int(needle in document.page_content.lower()) for document in retrieved_docs[:k]]

    # Best achievable DCG for this set of hits: all relevant documents first.
    best_dcg = calculate_dcg(sorted(hits, reverse=True), k)
    if best_dcg > 0:
        return calculate_dcg(hits, k) / best_dcg
    return 0.0
|
|
def evaluate_retrieval(test: TestQuestion, k: int = 10) -> RetrievalEval:
    """
    Evaluate retrieval performance for a single test question.

    Documents are fetched with ``fetch_context`` and truncated to the first
    ``k`` results, so MRR, nDCG and keyword coverage are all computed over
    the same top-k window.

    Args:
        test: TestQuestion object providing ``question`` and ``keywords``.
        k: Number of top-ranked documents to score (default 10).

    Returns:
        RetrievalEval with the keyword-averaged MRR and nDCG, the number of
        keywords found in the top-k documents, the total number of keywords,
        and the coverage as a percentage (0-100). All metrics are 0 when the
        test has no keywords.
    """
    # fetch_context is not told about k, so cap the results here. Without
    # this, MRR and coverage would count hits beyond the top-k window while
    # nDCG (which slices internally) would not.
    top_docs = fetch_context(test.question)[:k]

    keywords = test.keywords
    total_keywords = len(keywords)

    # Per-keyword scores; a reciprocal rank above zero means the keyword
    # appeared somewhere in the top-k documents.
    mrr_scores = [calculate_mrr(keyword, top_docs) for keyword in keywords]
    ndcg_scores = [calculate_ndcg(keyword, top_docs, k) for keyword in keywords]
    keywords_found = sum(1 for score in mrr_scores if score > 0)

    if total_keywords > 0:
        avg_mrr = sum(mrr_scores) / total_keywords
        avg_ndcg = sum(ndcg_scores) / total_keywords
        keyword_coverage = keywords_found / total_keywords * 100
    else:
        # No keywords to look for: report zeros rather than dividing by zero.
        avg_mrr = avg_ndcg = keyword_coverage = 0.0

    return RetrievalEval(
        mrr=avg_mrr,
        ndcg=avg_ndcg,
        keywords_found=keywords_found,
        total_keywords=total_keywords,
        keyword_coverage=keyword_coverage,
    )
|
|
def evaluate_all_retrieval():
    """Lazily evaluate retrieval for every test returned by ``load_tests``.

    Yields:
        ``(test, result, progress)`` tuples, where ``result`` is the
        ``evaluate_retrieval`` output for ``test`` and ``progress`` is the
        fraction of tests completed so far (reaching 1.0 on the last one).
    """
    test_cases = load_tests()
    case_count = len(test_cases)
    for completed, case in enumerate(test_cases, start=1):
        # Each evaluation runs only when the consumer asks for the next item.
        yield case, evaluate_retrieval(case), completed / case_count