Spaces:

AishaSurve
/

codebase-agent

Running

App Files Files Community

codebase-agent / src /evaluation /evaluator.py

AishaSurve

Codebase Intelligence Agent: code-aware RAG + test-gen agent + eval

8e72e1f 3 days ago

Raw

History Blame Contribute Delete

3.12 kB

	"""
	Evaluation harness for the codebase agent.

	For each test question we run the real pipeline (hybrid search -> rerank ->
	grounded answer) and measure:

	- file accuracy     : did a chunk from the expected file reach the top-N?
	- function accuracy : did the expected function/class reach the top-N?
	                      (only scored on questions that name an expected function)
	- citation accuracy : did the generated answer actually cite the expected file?
	- latency           : per-question wall time

	This is the honest, code-appropriate version of StudyMate's eval. NLI-style
	faithfulness is a poor fit for code, so citation accuracy is the faithfulness
	proxy here (does the answer ground itself in the right file?). LLM-as-judge
	grading is the natural upgrade (see README roadmap).
	"""
	import statistics
	import time


	class CodeEvaluator:
	    """Run the retrieval + answer pipeline over a test set and score it.

	    The collaborators are duck-typed; this class relies only on the calls it
	    makes:

	    - ``embedder.create_embeddings([question])`` -> sequence whose first item
	      is used as the query embedding
	    - ``hybrid.search(question, query_emb, k=retrieve_k)`` -> candidates
	    - ``reranker.rerank(question, candidates)`` -> sliceable results, each
	      shaped like ``{"document": {"file": ..., "name": ...}}``
	    - ``answerer.answer(question, top_results)`` -> mapping with an
	      ``"answer"`` entry holding the generated text
	    """

	    def __init__(self, embedder, hybrid, reranker, answerer, retrieve_k=10, top_n=5):
	        """Store the pipeline components and the retrieval sizes.

	        Args:
	            embedder: object providing ``create_embeddings``.
	            hybrid: object providing ``search``.
	            reranker: object providing ``rerank``.
	            answerer: object providing ``answer``.
	            retrieve_k: value passed as ``k`` to ``hybrid.search``.
	            top_n: number of reranked results kept for answering and scoring.
	        """
	        self.embedder = embedder
	        self.hybrid = hybrid
	        self.reranker = reranker
	        self.answerer = answerer
	        self.retrieve_k = retrieve_k
	        self.top_n = top_n

	    def evaluate(self, testset):
	        """Score every question in ``testset``.

	        Args:
	            testset: iterable of mappings with a ``"question"`` key and
	                optional ``"expected_file"`` / ``"expected_name"`` keys.

	        Returns:
	            A ``(rows, summary)`` tuple. ``rows`` holds one dict per question
	            with the keys ``question``, ``expected_file``, ``expected_name``,
	            ``file_hit``, ``name_hit`` (``None`` when no expected name was
	            given), ``citation_ok``, ``seconds`` and ``top_files``.
	            ``summary`` holds ``questions``, ``file_accuracy``,
	            ``function_accuracy`` (``None`` when no question names a
	            function), ``citation_accuracy`` and ``median_latency``.
	        """
	        rows = [self._evaluate_item(item) for item in testset]
	        return rows, self._summarize(rows)

	    def _evaluate_item(self, item):
	        """Run the pipeline for one test item and return its result row."""
	        question = item["question"]
	        exp_file = (item.get("expected_file") or "").lower()
	        exp_name = (item.get("expected_name") or "").lower()

	        # perf_counter is monotonic, so a system clock adjustment during the
	        # run cannot skew (or negate) the measured latency.
	        started = time.perf_counter()
	        query_emb = self.embedder.create_embeddings([question])[0]
	        candidates = self.hybrid.search(question, query_emb, k=self.retrieve_k)
	        top = self.reranker.rerank(question, candidates)[:self.top_n]
	        answer = self.answerer.answer(question, top)
	        elapsed = round(time.perf_counter() - started, 2)

	        docs = [result["document"] for result in top]
	        files = [doc["file"].lower() for doc in docs]
	        # A chunk with no symbol name (None or absent) must not abort the
	        # whole run; it simply cannot match an expected name.
	        names = [(doc.get("name") or "").lower() for doc in docs]
	        answer_text = (answer["answer"] or "").lower()

	        # An empty expected file never counts as a hit; an empty expected
	        # name means the question is not scored for function accuracy.
	        file_hit = bool(exp_file) and any(exp_file in f for f in files)
	        name_hit = any(exp_name in n for n in names) if exp_name else None
	        citation_ok = bool(exp_file) and exp_file in answer_text

	        return {
	            "question": question,
	            "expected_file": item.get("expected_file", ""),
	            "expected_name": item.get("expected_name", ""),
	            "file_hit": file_hit,
	            "name_hit": name_hit,
	            "citation_ok": citation_ok,
	            "seconds": elapsed,
	            "top_files": files,
	        }

	    @staticmethod
	    def _summarize(rows):
	        """Aggregate per-question rows into the summary dict."""
	        total = len(rows)
	        # Function accuracy is averaged only over questions that named one.
	        named = [row for row in rows if row["name_hit"] is not None]
	        latencies = [row["seconds"] for row in rows]

	        def percent(hits, count):
	            return round(100 * hits / count, 1)

	        return {
	            "questions": total,
	            "file_accuracy": (
	                percent(sum(row["file_hit"] for row in rows), total) if total else 0
	            ),
	            "function_accuracy": (
	                percent(sum(row["name_hit"] for row in named), len(named))
	                if named else None
	            ),
	            "citation_accuracy": (
	                percent(sum(row["citation_ok"] for row in rows), total) if total else 0
	            ),
	            # True median: an even count averages the two middle values
	            # instead of reporting the upper one.
	            "median_latency": (
	                round(statistics.median(latencies), 2) if latencies else 0
	            ),
	        }