Spaces:
Running
Running
| """ | |
| Evaluation harness for the codebase agent. | |
| For each test question we run the real pipeline (hybrid search -> rerank -> | |
| grounded answer) and measure: | |
| - file accuracy : did a chunk from the expected file reach the top-N? | |
| - function accuracy : did the expected function/class reach the top-N? | |
| (only scored on questions that name an expected function) | |
| - citation accuracy : did the generated answer actually cite the expected file? | |
| - latency : per-question wall time | |
| This is the honest, code-appropriate version of StudyMate's eval. NLI-style | |
| faithfulness is a poor fit for code, so citation accuracy is the faithfulness | |
| proxy here (does the answer ground itself in the right file?). LLM-as-judge | |
| grading is the natural upgrade (see README roadmap). | |
| """ | |
| import time | |
class CodeEvaluator:
    """Run the search -> rerank -> answer pipeline over a test set and score it.

    The collaborators are duck-typed. Only the calls made in this class are
    relied upon:

    - ``embedder.create_embeddings([question])`` -> sequence whose first item
      is the query embedding
    - ``hybrid.search(question, query_emb, k=retrieve_k)`` -> candidate results
    - ``reranker.rerank(question, candidates)`` -> sliceable ranked results,
      each a mapping with a ``"document"`` mapping inside
    - ``answerer.answer(question, top)`` -> mapping with an ``"answer"`` string
    """

    def __init__(self, embedder, hybrid, reranker, answerer, retrieve_k=10, top_n=5):
        """Store the pipeline components and the retrieval cut-offs.

        Args:
            embedder: object used to embed the question.
            hybrid: object used to fetch ``retrieve_k`` candidates.
            reranker: object used to reorder the candidates.
            answerer: object used to generate the final answer.
            retrieve_k: number of candidates requested from ``hybrid.search``.
            top_n: number of reranked results kept for scoring and answering.
        """
        self.embedder = embedder
        self.hybrid = hybrid
        self.reranker = reranker
        self.answerer = answerer
        self.retrieve_k = retrieve_k
        self.top_n = top_n

    def evaluate(self, testset):
        """Score every item of ``testset`` and aggregate the results.

        Args:
            testset: iterable of mappings with a ``"question"`` key and
                optional ``"expected_file"`` / ``"expected_name"`` keys.

        Returns:
            ``(rows, summary)``. ``rows`` holds one dict per question with the
            keys ``question``, ``expected_file``, ``expected_name``,
            ``file_hit``, ``name_hit`` (``None`` when no expected name was
            given), ``citation_ok``, ``seconds`` and ``top_files``.
            ``summary`` holds ``questions``, ``file_accuracy``,
            ``function_accuracy`` (``None`` when no question names a
            function), ``citation_accuracy`` and ``median_latency``.
        """
        rows = [self._score_item(item) for item in testset]
        return rows, self._summarize(rows)

    def _score_item(self, item):
        """Run the pipeline for one test item and return its result row."""
        question = item["question"]
        exp_file = (item.get("expected_file") or "").lower()
        exp_name = (item.get("expected_name") or "").lower()

        # perf_counter is the clock intended for measuring intervals;
        # time.time() follows the system clock and may jump mid-measurement.
        started = time.perf_counter()
        query_emb = self.embedder.create_embeddings([question])[0]
        candidates = self.hybrid.search(question, query_emb, k=self.retrieve_k)
        top = self.reranker.rerank(question, candidates)[:self.top_n]
        answer = self.answerer.answer(question, top)
        elapsed = round(time.perf_counter() - started, 2)

        # A chunk with a missing or None file/name must not abort the whole
        # evaluation run; it simply cannot match anything.
        files = [(r["document"].get("file") or "").lower() for r in top]
        names = [(r["document"].get("name") or "").lower() for r in top]

        file_hit = bool(exp_file) and any(exp_file in f for f in files)
        # Function accuracy is only scored when the question names a function.
        name_hit = any(exp_name in n for n in names) if exp_name else None
        citation_ok = bool(exp_file) and exp_file in answer["answer"].lower()

        return {
            "question": question,
            "expected_file": item.get("expected_file", ""),
            "expected_name": item.get("expected_name", ""),
            "file_hit": file_hit,
            "name_hit": name_hit,
            "citation_ok": citation_ok,
            "seconds": elapsed,
            "top_files": files,
        }

    def _summarize(self, rows):
        """Aggregate per-question rows into the summary dict."""
        total = len(rows)
        named = [row for row in rows if row["name_hit"] is not None]
        return {
            "questions": total,
            "file_accuracy": self._percent(rows, "file_hit") if total else 0,
            "function_accuracy": self._percent(named, "name_hit") if named else None,
            "citation_accuracy": self._percent(rows, "citation_ok") if total else 0,
            "median_latency": self._median([row["seconds"] for row in rows]),
        }

    @staticmethod
    def _percent(rows, key):
        """Return the percentage of ``rows`` whose ``key`` is truthy (1 decimal)."""
        return round(100 * sum(row[key] for row in rows) / len(rows), 1)

    @staticmethod
    def _median(values):
        """Return the median of ``values``, or 0 when there are none.

        For an even count the two middle values are averaged instead of
        reporting the upper one.
        """
        ordered = sorted(values)
        if not ordered:
            return 0
        mid = len(ordered) // 2
        if len(ordered) % 2:
            return ordered[mid]
        # Inputs are rounded to 2 decimals, so 3 decimals keeps the midpoint
        # exact while trimming float noise.
        return round((ordered[mid - 1] + ordered[mid]) / 2, 3)