# codebase-agent/src/evaluation/evaluator.py
# Commit 8e72e1f (AishaSurve): Codebase Intelligence Agent: code-aware RAG + test-gen agent + eval
# File size: 3.12 kB
"""
Evaluation harness for the codebase agent.

For each test question we run the real pipeline (hybrid search -> rerank ->
grounded answer) and measure:

  - file accuracy     : did a chunk from the expected file reach the top-N?
  - function accuracy : did the expected function/class reach the top-N?
                        (only scored on questions that name an expected function)
  - citation accuracy : did the generated answer actually cite the expected file?
  - latency           : per-question wall time

This is the honest, code-appropriate version of StudyMate's eval. NLI-style
faithfulness is a poor fit for code, so citation accuracy is the faithfulness
proxy here (does the answer ground itself in the right file?). LLM-as-judge
grading is the natural upgrade (see README roadmap).
"""
import time
class CodeEvaluator:
    """Score the retrieval + answer pipeline against a labelled test set.

    The four collaborators are duck-typed; this class only relies on:

    - ``embedder.create_embeddings([question])`` returning an indexable whose
      first element is the query embedding,
    - ``hybrid.search(question, query_emb, k=...)`` returning candidates,
    - ``reranker.rerank(question, candidates)`` returning a sliceable sequence
      of results, each shaped like ``{"document": {"file": ..., "name": ...}}``,
    - ``answerer.answer(question, top_results)`` returning a mapping whose
      ``"answer"`` entry is the generated text.
    """

    def __init__(self, embedder, hybrid, reranker, answerer, retrieve_k=10, top_n=5):
        """Store the pipeline components and the retrieval/scoring cut-offs.

        Args:
            embedder: object providing ``create_embeddings``.
            hybrid: object providing ``search``.
            reranker: object providing ``rerank``.
            answerer: object providing ``answer``.
            retrieve_k: value passed as ``k`` to ``hybrid.search``.
            top_n: number of reranked results that are scored and handed to
                the answerer.
        """
        self.embedder = embedder
        self.hybrid = hybrid
        self.reranker = reranker
        self.answerer = answerer
        self.retrieve_k = retrieve_k
        self.top_n = top_n

    def evaluate(self, testset):
        """Run every test item through the pipeline and aggregate the scores.

        Args:
            testset: iterable of dicts with a ``"question"`` key and optional
                ``"expected_file"`` / ``"expected_name"`` keys.

        Returns:
            ``(rows, summary)``: ``rows`` holds one result dict per question;
            ``summary`` holds the question count, the three accuracy
            percentages and the median latency in seconds.
        """
        rows = [self._score_item(item) for item in testset]
        return rows, self._summarize(rows)

    def _score_item(self, item):
        """Run one question through the pipeline and return its result row."""
        question = item["question"]
        # Matching is case-insensitive substring matching; a missing or None
        # expectation collapses to "" and is treated as "not specified".
        exp_file = (item.get("expected_file") or "").lower()
        exp_name = (item.get("expected_name") or "").lower()

        # perf_counter is monotonic, so the measurement cannot be skewed by
        # system clock adjustments the way time.time() can.
        started = time.perf_counter()
        query_emb = self.embedder.create_embeddings([question])[0]
        candidates = self.hybrid.search(question, query_emb, k=self.retrieve_k)
        top = self.reranker.rerank(question, candidates)[:self.top_n]
        answer = self.answerer.answer(question, top)
        elapsed = round(time.perf_counter() - started, 2)

        files = [hit["document"]["file"].lower() for hit in top]
        # A retrieved chunk may carry no symbol name (missing key or None);
        # treat that as an empty name instead of crashing the whole run.
        names = [(hit["document"].get("name") or "").lower() for hit in top]

        file_hit = bool(exp_file) and any(exp_file in f for f in files)
        # None (rather than False) marks questions that name no function, so
        # they can be left out of the function-accuracy denominator.
        name_hit = any(exp_name in n for n in names) if exp_name else None
        citation_ok = bool(exp_file) and exp_file in answer["answer"].lower()

        return {
            "question": question,
            "expected_file": item.get("expected_file", ""),
            "expected_name": item.get("expected_name", ""),
            "file_hit": file_hit,
            "name_hit": name_hit,
            "citation_ok": citation_ok,
            "seconds": elapsed,
            "top_files": files,
        }

    @staticmethod
    def _summarize(rows):
        """Aggregate per-question rows into the summary dict."""
        import statistics

        total = len(rows)
        # Function accuracy is only scored on questions that named a function.
        named = [row for row in rows if row["name_hit"] is not None]
        latencies = [row["seconds"] for row in rows]

        def percent(hits, count):
            return round(100 * hits / count, 1)

        return {
            "questions": total,
            "file_accuracy": percent(sum(r["file_hit"] for r in rows), total) if total else 0,
            "function_accuracy": (percent(sum(r["name_hit"] for r in named), len(named))
                                  if named else None),
            "citation_accuracy": percent(sum(r["citation_ok"] for r in rows), total) if total else 0,
            # True median: an even count averages the two middle values
            # instead of reporting the upper one.
            "median_latency": round(statistics.median(latencies), 3) if latencies else 0,
        }