Spaces:
Runtime error
Runtime error
| import time | |
| from vec import search | |
| from rerank import rerank_candidates | |
| from kg import compute_path_proximity, compute_degree_norm | |
| from utils import compute_freshness | |
# Toy evaluation set. Each entry pairs a question ("q") with the substrings
# ("answers") that mark a retrieved chunk as relevant to that question.
EVAL_SET = [
    {"q": question, "answers": list(expected)}
    for question, expected in (
        ("Who founded OpenAI?", ("Elon Musk", "Sam Altman")),
        ("What did OpenAI release?", ("GPT-4o", "Whisper", "SORA")),
        ("What did Google acquire?", ("YouTube", "Instagram")),
    )
]
def _is_relevant(chunk_text, golds_lower):
    """Return True when any already lower-cased gold substring is in *chunk_text*."""
    text = chunk_text.lower()
    return any(gold in text for gold in golds_lower)


def _ndcg_at_k(relevances, k=10):
    """Return nDCG@k for a ranked list of binary relevance labels.

    DCG uses the standard ``rel / log2(rank + 1)`` discount over the first
    *k* positions. It is normalised by the DCG of the same labels sorted
    best-first, so the result always lies in ``[0, 1]``. Returns ``0.0``
    when no label is relevant (the ideal DCG would be zero).
    """
    import math  # local import: the module header does not import math

    def dcg(labels):
        return sum(rel / math.log2(rank + 1) for rank, rel in enumerate(labels, 1))

    ideal = dcg(sorted(relevances, reverse=True)[:k])
    return dcg(relevances[:k]) / ideal if ideal else 0.0


def _mean(values):
    """Return the arithmetic mean of *values*, or 0.0 for an empty sequence."""
    return sum(values) / len(values) if values else 0.0


def evaluate() -> dict:
    """Compare cosine-only retrieval with hybrid reranking over ``EVAL_SET``.

    For each question the baseline ``search`` results are scored, then the
    same candidates are annotated with ``path_proximity``, ``degree_norm``
    and ``freshness_decay`` and passed to ``rerank_candidates``; the reranked
    order is scored the same way. A candidate counts as relevant when its
    ``"chunk"`` text contains any gold answer, case-insensitively.

    Returns:
        A dict with ``"baseline"`` and ``"hybrid"`` sub-dicts (each holding
        mean ``"hit@10"`` and ``"nDCG@10"``), plus ``"citation_correctness"``
        and ``"avg_latency_sec"``. All averages are ``0.0`` when ``EVAL_SET``
        is empty.
    """
    k = 10  # cut-off shared by the search call and both metrics

    baseline_hits, hybrid_hits = [], []
    baseline_ndcg, hybrid_ndcg = [], []
    citation_correctness = []
    latencies = []

    for item in EVAL_SET:
        q = item["q"]
        # Lower-case the gold answers once instead of once per candidate.
        golds = [g.lower() for g in item["answers"]]

        # Baseline (cosine only). Only the search call itself is timed;
        # perf_counter is monotonic, so clock adjustments cannot skew it.
        start = time.perf_counter()
        baseline = search(q, top_k=k)
        latencies.append(time.perf_counter() - start)

        # Score the baseline before reranking, in case the reranker
        # reorders the candidate list in place.
        baseline_rels = [int(_is_relevant(c["chunk"], golds)) for c in baseline]
        baseline_hits.append(int(any(baseline_rels[:k])))
        baseline_ndcg.append(_ndcg_at_k(baseline_rels, k))

        # Hybrid (cosine + path + freshness + degree): the feature values are
        # written onto the candidate dicts that rerank_candidates receives.
        for c in baseline:
            c["path_proximity"] = compute_path_proximity(q, c["chunk"])
            c["degree_norm"] = compute_degree_norm(c["chunk"])
            c["freshness_decay"] = compute_freshness(c.get("timestamp"))
        reranked, _ = rerank_candidates(baseline)

        hybrid_rels = [int(_is_relevant(c["chunk"], golds)) for c in reranked]
        hybrid_hits.append(int(any(hybrid_rels[:k])))
        hybrid_ndcg.append(_ndcg_at_k(hybrid_rels, k))

        # TODO -> Citation correctness (placeholder: every item counts as correct)
        citation_correctness.append(1)

    return {
        "baseline": {
            "hit@10": _mean(baseline_hits),
            "nDCG@10": _mean(baseline_ndcg),
        },
        "hybrid": {
            "hit@10": _mean(hybrid_hits),
            "nDCG@10": _mean(hybrid_ndcg),
        },
        "citation_correctness": _mean(citation_correctness),
        "avg_latency_sec": _mean(latencies),
    }