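"""Toy evaluation harness: compares baseline cosine-similarity retrieval
against a hybrid reranker (cosine + KG path proximity + freshness + degree)
on hit@10, nDCG@10, citation correctness, and latency."""
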
import math
import time
from vec import search
from rerank import rerank_candidates
from kg import compute_path_proximity, compute_degree_norm
from utils import compute_freshness

# A toy eval set: each item pairs a question with gold answer substrings.
EVAL_SET = [
    {"q": "Who founded OpenAI?", "answers": ["Elon Musk", "Sam Altman"]},
    {"q": "What did OpenAI release?", "answers": ["GPT-4o", "Whisper", "SORA"]},
    {"q": "What did Google acquire?", "answers": ["YouTube", "Instagram"]},
]
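
# Binary-relevance nDCG@k. Assumption: a chunk counts as relevant when it
# contains any gold answer substring, and the ideal ranking places every
# relevant retrieved chunk at the top.
def ndcg_at_k(ranked, golds, k=10):
    rels = [
        1 if any(g.lower() in c["chunk"].lower() for g in golds) else 0
        for c in ranked[:k]
    ]
    dcg = sum(rel / math.log2(rank + 1) for rank, rel in enumerate(rels, 1))
    ideal = sum(1.0 / math.log2(rank + 1) for rank in range(1, sum(rels) + 1))
    return dcg / ideal if ideal > 0 else 0.0
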

def evaluate():
    baseline_hits, hybrid_hits = [], []
    baseline_ndcg, hybrid_ndcg = [], []
    citation_correctness = []
    latencies = []

    for item in EVAL_SET:
        q, golds = item["q"], item["answers"]

        # Baseline: cosine-similarity retrieval only. Latency is measured
        # for the vector search alone, not the reranking step.
        start = time.time()
        baseline = search(q, top_k=10)
        latencies.append(time.time() - start)

        # Did we hit a gold in top-10?
        hit = any(any(g.lower() in c["chunk"].lower() for g in golds) for c in baseline)
        baseline_hits.append(1 if hit else 0)

        # nDCG@10 over the baseline ranking
        baseline_ndcg.append(ndcg_at_k(baseline, golds, k=10))

        # Hybrid: augment each candidate with graph and recency signals
        # (path proximity, node degree, freshness decay), then rerank.
        for c in baseline:
            c["path_proximity"] = compute_path_proximity(q, c["chunk"])
            c["degree_norm"] = compute_degree_norm(c["chunk"])
            c["freshness_decay"] = compute_freshness(c.get("timestamp"))

        # rerank_candidates returns (reranked, extra); extra is unused here.
        reranked, _ = rerank_candidates(baseline)
        hit = any(any(g.lower() in c["chunk"].lower() for g in golds) for c in reranked[:10])
        hybrid_hits.append(1 if hit else 0)

        # nDCG@10 over the reranked list
        hybrid_ndcg.append(ndcg_at_k(reranked, golds, k=10))

        # TODO: implement citation correctness; the placeholder marks
        # every answer's citation as correct.
        citation_correctness.append(1)

    return {
        "baseline": {
            "hit@10": sum(baseline_hits)/len(baseline_hits),
            "nDCG@10": sum(baseline_ndcg)/len(baseline_ndcg),
        },
        "hybrid": {
            "hit@10": sum(hybrid_hits)/len(hybrid_hits),
            "nDCG@10": sum(hybrid_ndcg)/len(hybrid_ndcg),
        },
        "citation_correctness": sum(citation_correctness)/len(citation_correctness),
        "avg_latency_sec": sum(latencies)/len(latencies),
    }
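

if __name__ == "__main__":
    # Minimal usage sketch; assumes the vec/rerank/kg/utils modules are
    # importable and the underlying vector index has already been built.
    import json

    print(json.dumps(evaluate(), indent=2))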