feat: integrate A/B testing framework and enhance RAG diversity in recommendation system
b4bfa19
#!/usr/bin/env python3
"""
Evaluate RAG retrieval on a Golden Test Set.
Quantitative metrics: Accuracy@K, Recall@K, MRR@K, NDCG@K.
Uses human-annotated query-book pairs for data-driven evaluation.
Usage:
python scripts/model/evaluate_rag.py
python scripts/model/evaluate_rag.py --golden data/rag_golden.csv --top_k 10
Golden set format (CSV): query, isbn, relevance
- query: user search string
- isbn: ISBN of an expected relevant book
- relevance: 1 = relevant (rows with other values are ignored)
- Multiple rows per query = multiple relevant books
"""
import math
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent))
import pandas as pd
import logging
from collections import defaultdict
from src.core.recommendation_orchestrator import RecommendationOrchestrator
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
def _dcg_at_k(relevances: list[float], k: int) -> float:
    """DCG@K: sum(rel_i / log2(rank_i + 1)), where relevances[i] is the relevance at rank i+1."""
    return sum(rel / math.log2(i + 2) for i, rel in enumerate(relevances[:k]))


def _ndcg_at_k(relevances: list[float], k: int, n_relevant: int) -> float:
    """NDCG@K for binary (0/1) relevances per rank. IDCG is the ideal DCG with all n_relevant items ranked first."""
    dcg = _dcg_at_k(relevances, k)
    n_at_top = min(n_relevant, k)
    idcg = sum(1.0 / math.log2(i + 2) for i in range(n_at_top))
    return dcg / idcg if idcg > 0 else 0.0
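# Worked example for the two helpers above (a sanity check, not part of the pipeline):
# with top-3 relevances [1, 0, 1] and 2 relevant books in the golden set,
#   DCG@3  = 1/log2(2) + 0/log2(3) + 1/log2(4) = 1.0 + 0.0 + 0.5 = 1.5
#   IDCG@3 = 1/log2(2) + 1/log2(3)             = 1.0 + 0.631... ≈ 1.631
#   NDCG@3 = 1.5 / 1.631 ≈ 0.92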
def load_golden(path: Path) -> dict[str, set[str]]:
    """Load golden set: {query -> set of relevant isbns}."""
    df = pd.read_csv(path, comment="#")
    if "relevance" in df.columns:
        df = df[df["relevance"] == 1]  # keep only pairs marked relevant
    golden = defaultdict(set)
    for _, row in df.iterrows():
        q = str(row["query"]).strip()
        # Strip a trailing ".0" left over when pandas parses ISBNs as floats
        isbn = str(row["isbn"]).strip().removesuffix(".0")
        if q and isbn:
            golden[q].add(isbn)
    return dict(golden)
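# For the illustrative CSV rows above, load_golden would return:
#   {"space opera with political intrigue": {"9780000000001", "9780000000002"},
#    "cozy small-town mystery": {"9780000000003"}}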
def evaluate_rag(
    golden_path: Path | str = "data/rag_golden.csv",
    top_k: int = 10,
    use_title_match: bool = True,
) -> dict:
    """
    Run RAG retrieval on the golden set and compute metrics.
    Returns: dict with accuracy_at_k, recall_at_k, mrr_at_k, ndcg_at_k, n_queries, top_k
    """
    golden_path = Path(golden_path)
    if not golden_path.exists():
        # Fall back to the bundled example set
        alt = Path("data/rag_golden.example.csv")
        if alt.exists():
            logger.warning("Golden set not found at %s, using %s", golden_path, alt)
            golden_path = alt
        else:
            raise FileNotFoundError(
                f"Golden set not found. Create {golden_path} with columns: query,isbn,relevance. "
                "See data/rag_golden.example.csv for format."
            )

    golden = load_golden(golden_path)
    if not golden:
        raise ValueError("Golden set is empty")
    logger.info("Evaluating RAG on %d queries from %s", len(golden), golden_path)

    recommender = RecommendationOrchestrator()

    # Optional ISBN -> title map, used for relaxed matching by title when ISBNs differ
    isbn_to_title = {}
    if use_title_match:
        try:
            bp = Path("data/books_processed.csv")
            if not bp.exists():
                bp = Path(__file__).resolve().parent.parent.parent / "data" / "books_processed.csv"
            books = pd.read_csv(bp, usecols=["isbn13", "title"])
            books["isbn13"] = books["isbn13"].astype(str).str.replace(r"\.0$", "", regex=True)
            isbn_to_title = books.set_index("isbn13")["title"].to_dict()
        except Exception as e:
            logger.warning("Could not load title map: %s", e)
            use_title_match = False
    hits_acc = 0
    recall_sum = 0.0
    mrr_sum = 0.0
    ndcg_sum = 0.0

    for query, relevant_isbns in golden.items():
        try:
            recs = recommender.get_recommendations_sync(query, category="All")
            rec_isbns = [r.get("isbn") or r.get("isbn13") for r in recs if r]
            # Strip the trailing ".0" float artifact so ISBNs compare as plain strings
            rec_isbns = [str(x).removesuffix(".0") for x in rec_isbns if pd.notna(x)]
            rec_top = rec_isbns[:top_k]

            def _match(target: str, cand_list: list) -> int:
                """Rank index of target in cand_list (exact ISBN or relaxed title match), or -1."""
                for i, c in enumerate(cand_list):
                    if str(c).strip() == str(target).strip():
                        return i
                    if use_title_match:
                        t_title = isbn_to_title.get(str(target), "").lower().strip()
                        c_title = isbn_to_title.get(str(c), "").lower().strip()
                        if t_title and c_title and t_title == c_title:
                            return i
                return -1

            # Accuracy@K: at least one relevant book in the top-K
            found_any = False
            first_rank = top_k + 1
            count_in_top = 0
            for rel in relevant_isbns:
                rk = _match(rel, rec_top)
                if rk >= 0:
                    found_any = True
                    count_in_top += 1
                    first_rank = min(first_rank, rk + 1)
            if found_any:
                hits_acc += 1
            recall_sum += (count_in_top / len(relevant_isbns)) if relevant_isbns else 0.0
            if first_rank <= top_k:
                mrr_sum += 1.0 / first_rank

            # NDCG@K: binary relevance per retrieved rank, using the same matching rule
            relevances = [
                1.0 if any(_match(rel, [c]) >= 0 for rel in relevant_isbns) else 0.0
                for c in rec_top
            ]
            ndcg_sum += _ndcg_at_k(relevances, top_k, len(relevant_isbns))
        except Exception as e:
            logger.warning("Query %r failed: %s", query[:50], e)

    n = len(golden)
    return {
        "accuracy_at_k": hits_acc / n,
        "recall_at_k": recall_sum / n,
        "mrr_at_k": mrr_sum / n,
        "ndcg_at_k": ndcg_sum / n,
        "n_queries": n,
        "top_k": top_k,
    }
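# Programmatic usage sketch (metric values are placeholders, not real results):
#   m = evaluate_rag(golden_path="data/rag_golden.csv", top_k=5)
#   # m -> {"accuracy_at_k": 0.85, "recall_at_k": 0.62, "mrr_at_k": 0.71,
#   #       "ndcg_at_k": 0.68, "n_queries": 40, "top_k": 5}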
def main():
    import argparse

    parser = argparse.ArgumentParser(description="Evaluate RAG on Golden Test Set")
    parser.add_argument("--golden", default="data/rag_golden.csv", help="Path to golden CSV")
    parser.add_argument("--top_k", type=int, default=10)
    parser.add_argument("--no-title-match", action="store_true", help="Disable relaxed title matching")
    args = parser.parse_args()

    m = evaluate_rag(
        golden_path=args.golden,
        top_k=args.top_k,
        use_title_match=not args.no_title_match,
    )

    print("\n" + "=" * 50)
    print(" RAG Golden Test Set Evaluation")
    print("=" * 50)
    print(f" Queries: {m['n_queries']}")
    print(f" Top-K: {m['top_k']}")
    print(f" Accuracy@{m['top_k']}: {m['accuracy_at_k']:.4f} (any relevant in top-K)")
    print(f" Recall@{m['top_k']}: {m['recall_at_k']:.4f} (fraction of relevant in top-K)")
    print(f" MRR@{m['top_k']}: {m['mrr_at_k']:.4f} (mean reciprocal rank)")
    print(f" NDCG@{m['top_k']}: {m['ndcg_at_k']:.4f} (normalized discounted cumulative gain)")
    print("=" * 50)


if __name__ == "__main__":
    main()