Spaces:
Running
Running
| """ | |
| Run the codebase-agent evaluation over a repo + test set. | |
| Usage (from project root, venv active, OPENAI_API_KEY set): | |
| python evaluate.py --repo path/to/repo_dir --testset data/eval/testset.json | |
| Generation is deterministic (temperature=0 in answerer.py) so results are | |
| reproducible. Override retrieval depth with --retrieve / --top to compare configs. | |
| """ | |
| import argparse | |
| import json | |
| import os | |
| from src.ingestion.scanner import scan_python_files | |
| from src.ingestion.chunker import chunk_repo | |
| from src.rag.embedder import Embedder | |
| from src.rag.vector_store import VectorStore | |
| from src.rag.bm25_search import BM25Retriever | |
| from src.rag.hybrid_search import HybridRetriever | |
| from src.rag.reranker import Reranker | |
| from src.rag.answerer import Answerer | |
| from src.evaluation.evaluator import CodeEvaluator | |
def build_pipeline(repo_dir, retrieve_k, top_n):
    """Index a repository and assemble the evaluator that runs over it.

    The files found by ``scan_python_files(repo_dir)`` are split into chunks
    with ``chunk_repo``; each chunk's ``"chunk_text"`` is embedded and stored
    in a ``VectorStore``, which is paired with a ``BM25Retriever`` over the
    same chunks inside a ``HybridRetriever``.

    Args:
        repo_dir: Repository directory handed to ``scan_python_files``.
        retrieve_k: Forwarded unchanged to ``CodeEvaluator``.
        top_n: Forwarded unchanged to ``CodeEvaluator``.

    Returns:
        A ``(evaluator, chunk_count)`` tuple: the configured ``CodeEvaluator``
        and the number of chunks that were indexed.
    """
    source_files = scan_python_files(repo_dir)
    repo_chunks = chunk_repo(source_files)

    # The same embedder instance is reused by the evaluator below.
    embedder = Embedder()
    chunk_texts = [chunk["chunk_text"] for chunk in repo_chunks]
    vectors = embedder.create_embeddings(chunk_texts)

    store = VectorStore()
    store.build(vectors, repo_chunks)

    keyword_retriever = BM25Retriever(repo_chunks)
    retriever = HybridRetriever(store, keyword_retriever)

    reranker = Reranker()
    answerer = Answerer()
    evaluator = CodeEvaluator(
        embedder, retriever, reranker, answerer, retrieve_k, top_n
    )
    return evaluator, len(repo_chunks)
def main():
    """Run the evaluation from the command line and save the results.

    Steps:
      1. Parse ``--repo``, ``--testset``, ``--retrieve``, ``--top`` and
         ``--out`` from the command line.
      2. Load the test set (a JSON document) from ``--testset``.
      3. Build the index and evaluator with ``build_pipeline``.
      4. Call ``evaluator.evaluate(testset)`` and print one status line per
         result row followed by a summary.
      5. Write settings, summary and rows as JSON to ``--out``.

    Errors from reading the test set, building the pipeline or writing the
    output file are not caught here and propagate to the caller.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--repo", required=True, help="path to a Python repo directory")
    ap.add_argument("--testset", required=True, help="JSON list of test questions")
    ap.add_argument("--retrieve", type=int, default=10)
    ap.add_argument("--top", type=int, default=5)
    ap.add_argument("--out", default="data/eval/results.json")
    args = ap.parse_args()

    with open(args.testset, encoding="utf-8") as f:
        testset = json.load(f)

    print(f"Building index for {args.repo} ...")
    evaluator, n_chunks = build_pipeline(args.repo, args.retrieve, args.top)
    print(f"Indexed {n_chunks} chunks. Running {len(testset)} questions "
          f"(retrieve={args.retrieve}, top={args.top})...\n")

    rows, summary = evaluator.evaluate(testset)

    # One line per question: "Y" = hit, "." = miss, "-" = name_hit is None.
    for r in rows:
        fh = "Y" if r["file_hit"] else "."
        nh = ("Y" if r["name_hit"] else ".") if r["name_hit"] is not None else "-"
        ch = "Y" if r["citation_ok"] else "."
        print(f"[file {fh}][fn {nh}][cite {ch}] {r['seconds']:5.2f}s | {r['question'][:55]}")

    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f"Questions ............ {summary['questions']}")
    print(f"File accuracy ........ {summary['file_accuracy']}%")
    # The function-accuracy line is skipped when the summary reports None.
    if summary["function_accuracy"] is not None:
        print(f"Function accuracy .... {summary['function_accuracy']}%")
    print(f"Citation accuracy .... {summary['citation_accuracy']}%")
    print(f"Median latency ....... {summary['median_latency']}s")

    # os.path.dirname() returns "" for a bare filename (e.g. --out results.json)
    # and os.makedirs("") raises FileNotFoundError, which would discard the
    # finished evaluation. Only create the directory when there is one.
    out_dir = os.path.dirname(args.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump({"settings": vars(args), "summary": summary, "rows": rows},
                  f, indent=2, ensure_ascii=False)
    print(f"\nFull results saved to {args.out}")
# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()