""" Evaluation harness for the codebase agent. For each test question we run the real pipeline (hybrid search -> rerank -> grounded answer) and measure: - file accuracy : did a chunk from the expected file reach the top-N? - function accuracy : did the expected function/class reach the top-N? (only scored on questions that name an expected function) - citation accuracy : did the generated answer actually cite the expected file? - latency : per-question wall time This is the honest, code-appropriate version of StudyMate's eval. NLI-style faithfulness is a poor fit for code, so citation accuracy is the faithfulness proxy here (does the answer ground itself in the right file?). LLM-as-judge grading is the natural upgrade (see README roadmap). """ import time class CodeEvaluator: def __init__(self, embedder, hybrid, reranker, answerer, retrieve_k=10, top_n=5): self.embedder = embedder self.hybrid = hybrid self.reranker = reranker self.answerer = answerer self.retrieve_k = retrieve_k self.top_n = top_n def evaluate(self, testset): rows = [] for item in testset: q = item["question"] exp_file = (item.get("expected_file") or "").lower() exp_name = (item.get("expected_name") or "").lower() t0 = time.time() query_emb = self.embedder.create_embeddings([q])[0] results = self.reranker.rerank(q, self.hybrid.search(q, query_emb, k=self.retrieve_k)) top = results[:self.top_n] answer = self.answerer.answer(q, top) elapsed = round(time.time() - t0, 2) files = [r["document"]["file"].lower() for r in top] names = [r["document"]["name"].lower() for r in top] file_hit = bool(exp_file) and any(exp_file in f for f in files) name_hit = (any(exp_name in n for n in names)) if exp_name else None citation_ok = bool(exp_file) and exp_file in answer["answer"].lower() rows.append({ "question": q, "expected_file": item.get("expected_file", ""), "expected_name": item.get("expected_name", ""), "file_hit": file_hit, "name_hit": name_hit, "citation_ok": citation_ok, "seconds": elapsed, "top_files": files, }) n = len(rows) named = [r for r in rows if r["name_hit"] is not None] latencies = sorted(r["seconds"] for r in rows) summary = { "questions": n, "file_accuracy": round(100 * sum(r["file_hit"] for r in rows) / n, 1) if n else 0, "function_accuracy": (round(100 * sum(r["name_hit"] for r in named) / len(named), 1) if named else None), "citation_accuracy": round(100 * sum(r["citation_ok"] for r in rows) / n, 1) if n else 0, "median_latency": latencies[len(latencies) // 2] if latencies else 0, } return rows, summary