codebase-agent / evaluate.py
AishaSurve's picture
Codebase Intelligence Agent: code-aware RAG + test-gen agent + eval
8e72e1f
Raw
History Blame Contribute Delete
3.13 kB
"""
Run the codebase-agent evaluation over a repo + test set.
Usage (from project root, venv active, OPENAI_API_KEY set):
python evaluate.py --repo path/to/repo_dir --testset data/eval/testset.json
Generation is deterministic (temperature=0 in answerer.py) so results are
reproducible. Override retrieval depth with --retrieve / --top to compare configs.
"""
import argparse
import json
import os
from src.ingestion.scanner import scan_python_files
from src.ingestion.chunker import chunk_repo
from src.rag.embedder import Embedder
from src.rag.vector_store import VectorStore
from src.rag.bm25_search import BM25Retriever
from src.rag.hybrid_search import HybridRetriever
from src.rag.reranker import Reranker
from src.rag.answerer import Answerer
from src.evaluation.evaluator import CodeEvaluator
def build_pipeline(repo_dir, retrieve_k, top_n):
    """Index ``repo_dir`` and assemble the evaluator that queries that index.

    The files returned by ``scan_python_files(repo_dir)`` are chunked with
    ``chunk_repo``; each chunk's ``"chunk_text"`` is embedded and stored in a
    ``VectorStore``, and the same chunks back a ``BM25Retriever``. Both are
    combined through ``HybridRetriever`` and handed to ``CodeEvaluator``
    together with a fresh ``Reranker`` and ``Answerer``.

    Args:
        repo_dir: Path of the repository directory to index.
        retrieve_k: Forwarded unchanged to ``CodeEvaluator`` (presumably the
            number of candidates to retrieve -- confirm against evaluator).
        top_n: Forwarded unchanged to ``CodeEvaluator`` (presumably the number
            of results kept after reranking -- confirm against evaluator).

    Returns:
        A ``(evaluator, chunk_count)`` tuple, where ``chunk_count`` is the
        number of chunks that were indexed.
    """
    source_files = scan_python_files(repo_dir)
    chunks = chunk_repo(source_files)

    # One embedder instance is shared by indexing and by the evaluator.
    embedder = Embedder()
    chunk_texts = [chunk["chunk_text"] for chunk in chunks]
    vectors = embedder.create_embeddings(chunk_texts)

    store = VectorStore()
    store.build(vectors, chunks)

    bm25 = BM25Retriever(chunks)
    retriever = HybridRetriever(store, bm25)

    evaluator = CodeEvaluator(
        embedder, retriever, Reranker(), Answerer(), retrieve_k, top_n
    )
    return evaluator, len(chunks)
def main():
    """Run the evaluation from the command line, print a report, save results.

    Command-line arguments:
        --repo      path to a Python repo directory (required)
        --testset   JSON list of test questions (required)
        --retrieve  integer passed to ``build_pipeline`` as ``retrieve_k``
                    (default 10)
        --top       integer passed to ``build_pipeline`` as ``top_n``
                    (default 5)
        --out       path of the JSON results file
                    (default ``data/eval/results.json``)

    One line is printed per question, followed by a summary, and the
    settings, summary, and per-question rows are written to ``--out`` as
    JSON. The parent directory of ``--out`` is created when missing.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--repo", required=True, help="path to a Python repo directory")
    ap.add_argument("--testset", required=True, help="JSON list of test questions")
    ap.add_argument("--retrieve", type=int, default=10)
    ap.add_argument("--top", type=int, default=5)
    ap.add_argument("--out", default="data/eval/results.json")
    args = ap.parse_args()

    with open(args.testset, encoding="utf-8") as f:
        testset = json.load(f)

    print(f"Building index for {args.repo} ...")
    evaluator, n_chunks = build_pipeline(args.repo, args.retrieve, args.top)
    print(f"Indexed {n_chunks} chunks. Running {len(testset)} questions "
          f"(retrieve={args.retrieve}, top={args.top})...\n")

    rows, summary = evaluator.evaluate(testset)

    # Per-question report: "Y" = hit, "." = miss, "-" = not applicable.
    for r in rows:
        fh = "Y" if r["file_hit"] else "."
        # name_hit is None when the check does not apply to the question;
        # show that as "-" rather than as a miss.
        if r["name_hit"] is None:
            nh = "-"
        else:
            nh = "Y" if r["name_hit"] else "."
        ch = "Y" if r["citation_ok"] else "."
        print(f"[file {fh}][fn {nh}][cite {ch}] {r['seconds']:5.2f}s | {r['question'][:55]}")

    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f"Questions ............ {summary['questions']}")
    print(f"File accuracy ........ {summary['file_accuracy']}%")
    # The function-accuracy line is omitted when the summary reports None.
    if summary["function_accuracy"] is not None:
        print(f"Function accuracy .... {summary['function_accuracy']}%")
    print(f"Citation accuracy .... {summary['citation_accuracy']}%")
    print(f"Median latency ....... {summary['median_latency']}s")

    # os.path.dirname() returns "" for a bare filename (e.g. --out results.json)
    # and os.makedirs("") raises FileNotFoundError, which would discard the
    # results of the whole run. Only create a directory when there is one.
    out_dir = os.path.dirname(args.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump({"settings": vars(args), "summary": summary, "rows": rows},
                  f, indent=2, ensure_ascii=False)
    print(f"\nFull results saved to {args.out}")
# Run the evaluation only when this file is executed as a script, so that
# importing the module (e.g. to reuse build_pipeline) has no side effects.
if __name__ == "__main__":
    main()