"""
metrics.py — Retrieval quality metrics.
Implements:
- Recall@K : fraction of the top-K retrieved docs judged relevant (no ground-truth pool of relevant docs is available)
- MRR : Mean Reciprocal Rank of the first relevant doc
- nDCG : Normalized Discounted Cumulative Gain
Relevance is determined by keyword matching between the query and chunk content.
This is a proxy measure used when ground-truth labels are unavailable.
"""
import math
import re
import logging
from typing import List
from langchain_core.documents import Document
from pydantic import BaseModel, Field
logger = logging.getLogger(__name__)
# ─── Pydantic output model ────────────────────────────────────────────────────
class RetrievalMetrics(BaseModel):
    """Validated bundle of the scores produced by one retrieval evaluation.

    The three quality scores are constrained to the closed interval
    [0.0, 1.0]; the two counters are plain integers without a range
    constraint. Every field is required.
    """

    # Quality scores, each validated to lie in [0.0, 1.0].
    recall_at_k: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="Recall@K",
    )
    mrr: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="Mean Reciprocal Rank",
    )
    ndcg: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="nDCG@K",
    )

    # Bookkeeping counters describing the evaluated result list.
    top_k: int = Field(
        ...,
        description="K used for evaluation",
    )
    num_relevant: int = Field(
        ...,
        description="Number of docs judged relevant",
    )
# ─── Relevance oracle ─────────────────────────────────────────────────────────
def _extract_keywords(text: str) -> set:
"""Extract lowercase alphabetic tokens (length β₯ 3) from text."""
return set(re.findall(r"\b[a-zA-Z]{3,}\b", text.lower()))
def _is_relevant(query: str, doc: Document, threshold: int = 2) -> bool:
    """
    Judge whether a document is relevant to the query via keyword overlap.

    Keywords are extracted from both the query and ``doc.page_content`` with
    ``_extract_keywords`` and the number of shared keywords is compared
    against the required overlap.

    Args:
        query: User question.
        doc: Retrieved document; only its ``page_content`` is read.
        threshold: Minimum number of shared keywords to count as relevant.
            When the query yields fewer distinct keywords than this, the
            requirement is lowered to the number of query keywords so that
            short queries can still match.
    Returns:
        True if the overlap reaches the effective threshold.
    """
    q_keywords = _extract_keywords(query)
    d_keywords = _extract_keywords(doc.page_content)
    overlap = len(q_keywords & d_keywords)

    # A query with fewer distinct keywords than `threshold` could never reach
    # it, which would mark every document irrelevant. Cap the requirement at
    # what the query can actually supply. A query with no keywords at all
    # keeps the caller's threshold unchanged.
    required = min(threshold, len(q_keywords)) if q_keywords else threshold
    return overlap >= required
# ─── Metric functions ─────────────────────────────────────────────────────────
def _compute_relevance_flags(query: str, docs: List[Document]) -> List[int]:
    """Map each document, in order, to 1 if judged relevant to the query, else 0."""
    # Verdicts are produced lazily, one per document, in retrieval order.
    verdicts = (_is_relevant(query, doc) for doc in docs)
    return [1 if verdict else 0 for verdict in verdicts]
def recall_at_k(relevance: List[int]) -> float:
    """
    Share of the retrieved documents that were flagged relevant.

    No pool of all relevant documents is available, so the number of relevant
    hits is divided by the number of retrieved documents (the list length).

    Args:
        relevance: Binary relevance flags, one per retrieved document.
    Returns:
        Relevant hits divided by the list length, or 0.0 when there are no
        hits (which also covers an empty list).
    """
    hits = sum(relevance)
    # Checking the hit count first also avoids dividing by zero on an empty list.
    return hits / len(relevance) if hits else 0.0
def mean_reciprocal_rank(relevance: List[int]) -> float:
    """
    Reciprocal rank of the earliest relevant entry in a single result list.

    Args:
        relevance: Binary relevance flags ordered by retrieval rank.
    Returns:
        ``1 / rank`` (rank counted from 1) of the first flag equal to 1, or
        0.0 when no flag equals 1.
    """
    reciprocal_ranks = (
        1.0 / position
        for position, flag in enumerate(relevance, start=1)
        if flag == 1
    )
    # Only the first hit matters; the default covers "no relevant document".
    return next(reciprocal_ranks, 0.0)
def ndcg_at_k(relevance: List[int]) -> float:
    """
    Normalized discounted cumulative gain over a binary relevance list.

    The ideal ordering is obtained by sorting the given flags in descending
    order, so the score is normalized against the best arrangement of the
    retrieved items themselves.

    Args:
        relevance: Binary relevance list ordered by retrieval rank.
    Returns:
        Actual DCG divided by ideal DCG, or 0.0 when the ideal DCG is zero
        (no relevant items, or an empty list).
    """

    def _discounted_gain(gains: List[int]) -> float:
        # The item at 1-based position p is discounted by log2(p + 1).
        return sum(
            gain / math.log2(position + 1)
            for position, gain in enumerate(gains, start=1)
        )

    best_possible = _discounted_gain(sorted(relevance, reverse=True))
    if best_possible == 0:
        return 0.0
    return _discounted_gain(relevance) / best_possible
def compute_retrieval_metrics(query: str, docs: List[Document]) -> RetrievalMetrics:
    """
    Score one query–result pair with every metric in this module.

    Args:
        query: User's natural language question.
        docs: Retrieved documents in retrieval rank order.
    Returns:
        RetrievalMetrics with the three scores rounded to four decimal
        places, ``top_k`` set to the number of documents passed in, and
        ``num_relevant`` set to the count of documents flagged relevant.
    """
    flags = _compute_relevance_flags(query, docs)

    recall_score = round(recall_at_k(flags), 4)
    reciprocal_rank = round(mean_reciprocal_rank(flags), 4)
    ndcg_score = round(ndcg_at_k(flags), 4)

    return RetrievalMetrics(
        recall_at_k=recall_score,
        mrr=reciprocal_rank,
        ndcg=ndcg_score,
        top_k=len(docs),
        num_relevant=sum(flags),
    )