import math
import time

from vec import search
from rerank import rerank_candidates
from kg import compute_path_proximity, compute_degree_norm
from utils import compute_freshness

# A toy dataset: each item maps a question to the substrings a retrieved
# chunk must contain (case-insensitive) to count as relevant.
EVAL_SET = [
    {"q": "Who founded OpenAI?", "answers": ["Elon Musk", "Sam Altman"]},
    {"q": "What did OpenAI release?", "answers": ["GPT-4o", "Whisper", "SORA"]},
    {"q": "What did Google acquire?", "answers": ["YouTube", "Instagram"]},
]


def _is_relevant(candidate, golds):
    """Return True if the candidate's chunk contains any gold substring (case-insensitive)."""
    chunk = candidate["chunk"].lower()
    return any(g.lower() in chunk for g in golds)


def _hit_at_k(candidates, golds, k=10):
    """Return 1 if any of the top-k candidates is relevant, else 0."""
    return 1 if any(_is_relevant(c, golds) for c in candidates[:k]) else 0


def _ndcg_at_k(candidates, golds, k=10):
    """Binary-relevance nDCG@k for a ranked candidate list.

    DCG uses the standard logarithmic discount, rel_i / log2(i + 1) with
    1-based ranks; IDCG assumes the ideal ranking places every relevant
    candidate first. Returns 0.0 when no top-k candidate is relevant.

    NOTE: the original code summed 1/rank over relevant items (a reciprocal-
    rank sum) while labeling it nDCG@10, and did not truncate the hybrid
    ranking to k — both fixed here.
    """
    rels = [1 if _is_relevant(c, golds) else 0 for c in candidates[:k]]
    n_relevant = sum(rels)
    if n_relevant == 0:
        return 0.0
    dcg = sum(rel / math.log2(rank + 1) for rank, rel in enumerate(rels, 1))
    idcg = sum(1 / math.log2(rank + 1) for rank in range(1, n_relevant + 1))
    return dcg / idcg


def evaluate():
    """Compare cosine-only retrieval against hybrid reranking on EVAL_SET.

    For each question: run the baseline vector search (top 10), score
    hit@10 and nDCG@10, then enrich candidates with knowledge-graph and
    freshness signals, rerank, and score the hybrid ranking the same way.

    Returns:
        dict with "baseline" and "hybrid" sub-dicts (keys "hit@10",
        "nDCG@10"), a "citation_correctness" placeholder (currently
        always 1.0 — see TODO), and "avg_latency_sec" for the mean
        baseline search latency in seconds.
    """
    baseline_hits, hybrid_hits = [], []
    baseline_ndcg, hybrid_ndcg = [], []
    citation_correctness = []
    latencies = []

    for item in EVAL_SET:
        q, golds = item["q"], item["answers"]

        # Baseline (cosine similarity only). perf_counter() is monotonic
        # and intended for interval timing, unlike time.time().
        start = time.perf_counter()
        baseline = search(q, top_k=10)
        latencies.append(time.perf_counter() - start)

        baseline_hits.append(_hit_at_k(baseline, golds))
        baseline_ndcg.append(_ndcg_at_k(baseline, golds))

        # Hybrid: annotate each candidate in place with path/degree/freshness
        # signals, then rerank (rerank_candidates also returns a second value
        # we do not need here).
        for c in baseline:
            c["path_proximity"] = compute_path_proximity(q, c["chunk"])
            c["degree_norm"] = compute_degree_norm(c["chunk"])
            c["freshness_decay"] = compute_freshness(c.get("timestamp"))
        reranked, _ = rerank_candidates(baseline)

        hybrid_hits.append(_hit_at_k(reranked, golds))
        hybrid_ndcg.append(_ndcg_at_k(reranked, golds))

        # TODO: verify that cited sources actually support the generated
        # answer; until then this metric is a constant-1 placeholder.
        citation_correctness.append(1)

    def _mean(xs):
        # Local helper; every list above has one entry per EVAL_SET item,
        # so it is never empty here.
        return sum(xs) / len(xs)

    return {
        "baseline": {
            "hit@10": _mean(baseline_hits),
            "nDCG@10": _mean(baseline_ndcg),
        },
        "hybrid": {
            "hit@10": _mean(hybrid_hits),
            "nDCG@10": _mean(hybrid_ndcg),
        },
        "citation_correctness": _mean(citation_correctness),
        "avg_latency_sec": _mean(latencies),
    }