""" Run the codebase-agent evaluation over a repo + test set. Usage (from project root, venv active, OPENAI_API_KEY set): python evaluate.py --repo path/to/repo_dir --testset data/eval/testset.json Generation is deterministic (temperature=0 in answerer.py) so results are reproducible. Override retrieval depth with --retrieve / --top to compare configs. """ import argparse import json import os from src.ingestion.scanner import scan_python_files from src.ingestion.chunker import chunk_repo from src.rag.embedder import Embedder from src.rag.vector_store import VectorStore from src.rag.bm25_search import BM25Retriever from src.rag.hybrid_search import HybridRetriever from src.rag.reranker import Reranker from src.rag.answerer import Answerer from src.evaluation.evaluator import CodeEvaluator def build_pipeline(repo_dir, retrieve_k, top_n): chunks = chunk_repo(scan_python_files(repo_dir)) embedder = Embedder() embeddings = embedder.create_embeddings([c["chunk_text"] for c in chunks]) vs = VectorStore() vs.build(embeddings, chunks) hybrid = HybridRetriever(vs, BM25Retriever(chunks)) evaluator = CodeEvaluator(embedder, hybrid, Reranker(), Answerer(), retrieve_k, top_n) return evaluator, len(chunks) def main(): ap = argparse.ArgumentParser() ap.add_argument("--repo", required=True, help="path to a Python repo directory") ap.add_argument("--testset", required=True, help="JSON list of test questions") ap.add_argument("--retrieve", type=int, default=10) ap.add_argument("--top", type=int, default=5) ap.add_argument("--out", default="data/eval/results.json") args = ap.parse_args() with open(args.testset, encoding="utf-8") as f: testset = json.load(f) print(f"Building index for {args.repo} ...") evaluator, n_chunks = build_pipeline(args.repo, args.retrieve, args.top) print(f"Indexed {n_chunks} chunks. Running {len(testset)} questions " f"(retrieve={args.retrieve}, top={args.top})...\n") rows, summary = evaluator.evaluate(testset) for r in rows: fh = "Y" if r["file_hit"] else "." nh = ("Y" if r["name_hit"] else ".") if r["name_hit"] is not None else "-" ch = "Y" if r["citation_ok"] else "." print(f"[file {fh}][fn {nh}][cite {ch}] {r['seconds']:5.2f}s | {r['question'][:55]}") print("\n" + "=" * 50) print("SUMMARY") print("=" * 50) print(f"Questions ............ {summary['questions']}") print(f"File accuracy ........ {summary['file_accuracy']}%") if summary["function_accuracy"] is not None: print(f"Function accuracy .... {summary['function_accuracy']}%") print(f"Citation accuracy .... {summary['citation_accuracy']}%") print(f"Median latency ....... {summary['median_latency']}s") os.makedirs(os.path.dirname(args.out), exist_ok=True) with open(args.out, "w", encoding="utf-8") as f: json.dump({"settings": vars(args), "summary": summary, "rows": rows}, f, indent=2, ensure_ascii=False) print(f"\nFull results saved to {args.out}") if __name__ == "__main__": main()