GraphRAG-Live / eval.py
aayush226's picture
Upload 10 files
4d9fcca verified
import math
import time

from vec import search
from rerank import rerank_candidates
from kg import compute_path_proximity, compute_degree_norm
from utils import compute_freshness
# Tiny hand-written evaluation set.
# Each entry pairs a question ("q") with a list of expected substrings
# ("answers"); evaluate() treats a retrieved chunk as relevant when it contains
# any one of those substrings, compared case-insensitively.
EVAL_SET = [
    {
        "q": "Who founded OpenAI?",
        "answers": ["Elon Musk", "Sam Altman"],
    },
    {
        "q": "What did OpenAI release?",
        "answers": ["GPT-4o", "Whisper", "SORA"],
    },
    {
        "q": "What did Google acquire?",
        "answers": ["YouTube", "Instagram"],
    },
]
def _is_relevant(candidate, golds):
    """Return True if any gold substring occurs in the candidate's chunk text.

    The comparison is case-insensitive; ``candidate`` must have a "chunk" key.
    """
    text = candidate["chunk"].lower()
    return any(gold.lower() in text for gold in golds)


def _ndcg_at_k(relevances, k=10):
    """Return nDCG@k for a ranked list of binary relevance labels.

    DCG uses the usual ``rel / log2(rank + 1)`` discount. The ideal DCG is
    computed from the best possible ordering of the *retrieved* labels, because
    the total number of relevant chunks in the corpus is not known here.
    Returns 0.0 when nothing relevant was retrieved.
    """
    def dcg(labels):
        return sum(
            rel / math.log2(rank + 1) for rank, rel in enumerate(labels[:k], 1)
        )

    ideal = dcg(sorted(relevances, reverse=True))
    return dcg(relevances) / ideal if ideal else 0.0


def _mean(values):
    """Return the arithmetic mean of ``values``, or 0.0 for an empty list."""
    return sum(values) / len(values) if values else 0.0


def evaluate():
    """Compare cosine-only retrieval against hybrid reranking on EVAL_SET.

    For every question the top-10 candidates from ``search`` form the baseline.
    The same candidates are then annotated in place with "path_proximity",
    "degree_norm" and "freshness_decay" and passed to ``rerank_candidates``
    to produce the hybrid ranking.

    Returns:
        A dict with:
          * "baseline" / "hybrid": each a dict holding "hit@10" (fraction of
            questions with at least one relevant chunk in the top 10) and
            "nDCG@10" (mean normalised DCG, always within [0, 1]).
          * "citation_correctness": placeholder, currently always 1 per
            question (not implemented yet).
          * "avg_latency_sec": mean time of the baseline ``search`` call only;
            graph features and reranking are not included in the timing.
        All averages are 0.0 when EVAL_SET is empty.
    """
    top_k = 10

    baseline_hits, hybrid_hits = [], []
    baseline_ndcg, hybrid_ndcg = [], []
    citation_correctness = []
    latencies = []

    for item in EVAL_SET:
        q, golds = item["q"], item["answers"]

        # Baseline (cosine only). perf_counter is monotonic, so the measured
        # interval cannot be distorted by system clock adjustments.
        start = time.perf_counter()
        baseline = search(q, top_k=top_k)
        latencies.append(time.perf_counter() - start)

        # Label the baseline ranking before the candidates are handed to the
        # reranker, which receives (and may reorder) the same objects.
        baseline_rels = [int(_is_relevant(c, golds)) for c in baseline]
        baseline_hits.append(int(any(baseline_rels[:top_k])))
        baseline_ndcg.append(_ndcg_at_k(baseline_rels, top_k))

        # Hybrid (cosine + path + freshness + degree): attach the extra
        # features the reranker is given alongside each candidate.
        for c in baseline:
            c["path_proximity"] = compute_path_proximity(q, c["chunk"])
            c["degree_norm"] = compute_degree_norm(c["chunk"])
            c["freshness_decay"] = compute_freshness(c.get("timestamp"))
        reranked, _ = rerank_candidates(baseline)

        hybrid_rels = [int(_is_relevant(c, golds)) for c in reranked]
        hybrid_hits.append(int(any(hybrid_rels[:top_k])))
        hybrid_ndcg.append(_ndcg_at_k(hybrid_rels, top_k))

        # TODO -> Citation correctness (placeholder: every question scores 1)
        citation_correctness.append(1)

    return {
        "baseline": {
            "hit@10": _mean(baseline_hits),
            "nDCG@10": _mean(baseline_ndcg),
        },
        "hybrid": {
            "hit@10": _mean(hybrid_hits),
            "nDCG@10": _mean(hybrid_ndcg),
        },
        "citation_correctness": _mean(citation_correctness),
        "avg_latency_sec": _mean(latencies),
    }