# File snapshot: revision 8e72e1f, file size 3,119 bytes.
"""
Evaluation harness for the codebase agent.
For each test question we run the real pipeline (hybrid search -> rerank ->
grounded answer) and measure:
- file accuracy : did a chunk from the expected file reach the top-N?
- function accuracy : did the expected function/class reach the top-N?
(only scored on questions that name an expected function)
- citation accuracy : did the generated answer actually cite the expected file?
- latency : per-question wall time
This is the honest, code-appropriate version of StudyMate's eval. NLI-style
faithfulness is a poor fit for code, so citation accuracy is the faithfulness
proxy here (does the answer ground itself in the right file?). LLM-as-judge
grading is the natural upgrade (see README roadmap).
"""
import time
class CodeEvaluator:
    """Run the search -> rerank -> answer pipeline over a test set and score it.

    The evaluator only relies on the following calls of its collaborators:

    - ``embedder.create_embeddings([question])`` -- the first element of the
      returned sequence is used as the query embedding.
    - ``hybrid.search(question, query_emb, k=retrieve_k)`` -- candidate results.
    - ``reranker.rerank(question, candidates)`` -- ordered, sliceable results.
    - ``answerer.answer(question, top_results)`` -- mapping with an
      ``"answer"`` entry holding the generated text.

    Every result is read as ``result["document"]`` with ``"file"`` and
    ``"name"`` entries (assumed to be a dict-like object supporting ``.get``).
    """

    def __init__(self, embedder, hybrid, reranker, answerer,
                 retrieve_k: int = 10, top_n: int = 5):
        """Store the pipeline stages and the retrieval cut-offs.

        Args:
            embedder: object providing ``create_embeddings(list_of_texts)``.
            hybrid: object providing ``search(question, embedding, k=...)``.
            reranker: object providing ``rerank(question, results)``.
            answerer: object providing ``answer(question, results)``.
            retrieve_k: value passed as ``k`` to ``hybrid.search``.
            top_n: number of reranked results that are scored and handed to
                the answerer.
        """
        self.embedder = embedder
        self.hybrid = hybrid
        self.reranker = reranker
        self.answerer = answerer
        self.retrieve_k = retrieve_k
        self.top_n = top_n

    def evaluate(self, testset):
        """Score every item of ``testset`` and aggregate the results.

        Args:
            testset: iterable of mappings with a ``"question"`` entry and
                optional ``"expected_file"`` / ``"expected_name"`` entries.

        Returns:
            A ``(rows, summary)`` tuple. ``rows`` holds one dict per question
            (``question``, ``expected_file``, ``expected_name``, ``file_hit``,
            ``name_hit``, ``citation_ok``, ``seconds``, ``top_files``).
            ``summary`` holds ``questions``, ``file_accuracy``,
            ``function_accuracy``, ``citation_accuracy`` (percentages rounded
            to one decimal) and ``median_latency``.
        """
        rows = [self._score_item(item) for item in testset]
        return rows, self._summarize(rows)

    def _score_item(self, item):
        """Run the pipeline for one test item and return its result row."""
        question = item["question"]
        exp_file = (item.get("expected_file") or "").lower()
        exp_name = (item.get("expected_name") or "").lower()

        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments the way time.time() can.
        started = time.perf_counter()
        query_emb = self.embedder.create_embeddings([question])[0]
        candidates = self.hybrid.search(question, query_emb, k=self.retrieve_k)
        top = self.reranker.rerank(question, candidates)[:self.top_n]
        answer = self.answerer.answer(question, top)
        elapsed = round(time.perf_counter() - started, 2)

        files = [self._lowered(r["document"], "file") for r in top]
        names = [self._lowered(r["document"], "name") for r in top]
        # A missing answer text counts as "no citation" instead of crashing.
        answer_text = (answer["answer"] or "").lower()

        # All matches are case-insensitive substring checks.
        file_hit = bool(exp_file) and any(exp_file in f for f in files)
        # None (not False) marks questions that name no expected function,
        # so they are left out of the function-accuracy denominator.
        name_hit = any(exp_name in n for n in names) if exp_name else None
        citation_ok = bool(exp_file) and exp_file in answer_text

        return {
            "question": question,
            "expected_file": item.get("expected_file", ""),
            "expected_name": item.get("expected_name", ""),
            "file_hit": file_hit,
            "name_hit": name_hit,
            "citation_ok": citation_ok,
            "seconds": elapsed,
            "top_files": files,
        }

    def _summarize(self, rows):
        """Aggregate per-question rows into the summary dict."""
        total = len(rows)
        named = [r for r in rows if r["name_hit"] is not None]
        return {
            "questions": total,
            "file_accuracy": (
                self._percent(sum(r["file_hit"] for r in rows), total) if total else 0
            ),
            "function_accuracy": (
                self._percent(sum(r["name_hit"] for r in named), len(named))
                if named else None
            ),
            "citation_accuracy": (
                self._percent(sum(r["citation_ok"] for r in rows), total) if total else 0
            ),
            "median_latency": self._median(r["seconds"] for r in rows),
        }

    @staticmethod
    def _lowered(document, key):
        """Return ``document[key]`` lower-cased; a missing or None value gives ''."""
        return (document.get(key) or "").lower()

    @staticmethod
    def _percent(hits, total):
        """Return ``hits / total`` as a percentage rounded to one decimal."""
        return round(100 * hits / total, 1)

    @staticmethod
    def _median(values):
        """Return the median of ``values`` (0 when there are none).

        For an even count the two middle values are averaged; picking the
        upper one would bias the reported latency upwards.
        """
        ordered = sorted(values)
        if not ordered:
            return 0
        mid = len(ordered) // 2
        if len(ordered) % 2:
            return ordered[mid]
        return round((ordered[mid - 1] + ordered[mid]) / 2, 3)