File size: 3,119 Bytes
8e72e1f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""
Evaluation harness for the codebase agent.

For each test question we run the real pipeline (hybrid search -> rerank ->
grounded answer) and measure:

  - file accuracy      : did a chunk from the expected file reach the top-N?
  - function accuracy  : did the expected function/class reach the top-N?
                         (only scored on questions that name an expected function)
  - citation accuracy  : did the generated answer actually cite the expected file?
  - latency            : per-question wall time

This is the honest, code-appropriate version of StudyMate's eval. NLI-style
faithfulness is a poor fit for code, so citation accuracy is the faithfulness
proxy here (does the answer ground itself in the right file?). LLM-as-judge
grading is the natural upgrade (see README roadmap).
"""
import time


class CodeEvaluator:
    """Score the retrieval + answer pipeline against a labelled test set.

    The evaluator drives four collaborators, used exactly as follows:

      - ``embedder.create_embeddings([question])`` -> sequence; element 0 is
        taken as the query embedding
      - ``hybrid.search(question, query_emb, k=retrieve_k)`` -> candidates
      - ``reranker.rerank(question, candidates)`` -> ordered, sliceable results
      - ``answerer.answer(question, top_results)`` -> object whose ``"answer"``
        entry holds the generated text

    Each reranked result is expected to expose ``result["document"]`` with
    ``"file"`` and ``"name"`` entries.
    """

    def __init__(self, embedder, hybrid, reranker, answerer, retrieve_k=10, top_n=5):
        """Store the pipeline components and the retrieval cut-offs.

        Args:
            embedder: Object providing ``create_embeddings(list_of_texts)``.
            hybrid: Object providing ``search(query, embedding, k=...)``.
            reranker: Object providing ``rerank(query, results)``.
            answerer: Object providing ``answer(query, results)``.
            retrieve_k: Number of candidates requested from the hybrid search.
            top_n: Number of reranked results kept for scoring and answering.
        """
        self.embedder = embedder
        self.hybrid = hybrid
        self.reranker = reranker
        self.answerer = answerer
        self.retrieve_k = retrieve_k
        self.top_n = top_n

    @staticmethod
    def _median(ordered):
        """Return the median of an already-sorted list, or 0 when it is empty."""
        count = len(ordered)
        if not count:
            return 0
        mid = count // 2
        if count % 2:
            return ordered[mid]
        # Even count: average the two middle values instead of reporting the
        # upper one. Rounded so the mean of two 2-decimal timings stays tidy.
        return round((ordered[mid - 1] + ordered[mid]) / 2, 3)

    def evaluate(self, testset):
        """Run every test question through the pipeline and score the results.

        Args:
            testset: Iterable of dicts. Each must contain ``"question"`` and
                may contain ``"expected_file"`` and ``"expected_name"``.
                Matching is case-insensitive substring matching.

        Returns:
            A ``(rows, summary)`` tuple. ``rows`` holds one dict per question
            with the keys ``question``, ``expected_file``, ``expected_name``,
            ``file_hit``, ``name_hit`` (``None`` when the question names no
            expected function), ``citation_ok``, ``seconds`` and ``top_files``.
            ``summary`` holds ``questions``, ``file_accuracy``,
            ``function_accuracy`` (``None`` when no question names a
            function), ``citation_accuracy`` (all percentages) and
            ``median_latency`` in seconds.
        """
        rows = []
        for item in testset:
            q = item["question"]
            exp_file = (item.get("expected_file") or "").lower()
            exp_name = (item.get("expected_name") or "").lower()

            # perf_counter is monotonic, so the measured interval cannot be
            # distorted by system clock adjustments during the run.
            t0 = time.perf_counter()
            query_emb = self.embedder.create_embeddings([q])[0]
            candidates = self.hybrid.search(q, query_emb, k=self.retrieve_k)
            results = self.reranker.rerank(q, candidates)
            top = results[:self.top_n]
            answer = self.answerer.answer(q, top)
            elapsed = round(time.perf_counter() - t0, 2)

            # Chunks without a file/name (missing key or None) are treated as
            # empty strings so one unnamed chunk cannot abort the whole run.
            docs = [r["document"] for r in top]
            files = [(doc.get("file") or "").lower() for doc in docs]
            names = [(doc.get("name") or "").lower() for doc in docs]
            answer_text = (answer["answer"] or "").lower()

            file_hit = bool(exp_file) and any(exp_file in f for f in files)
            # None (not False) marks "not applicable" so these questions are
            # excluded from function accuracy rather than counted as misses.
            name_hit = any(exp_name in n for n in names) if exp_name else None
            citation_ok = bool(exp_file) and exp_file in answer_text

            rows.append({
                "question": q,
                "expected_file": item.get("expected_file", ""),
                "expected_name": item.get("expected_name", ""),
                "file_hit": file_hit,
                "name_hit": name_hit,
                "citation_ok": citation_ok,
                "seconds": elapsed,
                "top_files": files,
            })

        n = len(rows)
        named = [r for r in rows if r["name_hit"] is not None]
        latencies = sorted(r["seconds"] for r in rows)
        summary = {
            "questions": n,
            "file_accuracy": round(100 * sum(r["file_hit"] for r in rows) / n, 1) if n else 0,
            "function_accuracy": (round(100 * sum(r["name_hit"] for r in named) / len(named), 1)
                                  if named else None),
            "citation_accuracy": round(100 * sum(r["citation_ok"] for r in rows) / n, 1) if n else 0,
            "median_latency": self._median(latencies),
        }
        return rows, summary