Spaces:

AishaSurve
/

codebase-agent

Running

App Files Files Community

codebase-agent / evaluate.py

AishaSurve

Codebase Intelligence Agent: code-aware RAG + test-gen agent + eval

8e72e1f 3 days ago

Raw

History Blame Contribute Delete

3.13 kB

	"""
	Run the codebase-agent evaluation over a repo + test set.

	Usage (from project root, venv active, OPENAI_API_KEY set):
	python evaluate.py --repo path/to/repo_dir --testset data/eval/testset.json

	Generation is deterministic (temperature=0 in answerer.py) so results are
	reproducible. Override retrieval depth with --retrieve / --top to compare configs.
	"""
	import argparse
	import json
	import os

	from src.ingestion.scanner import scan_python_files
	from src.ingestion.chunker import chunk_repo
	from src.rag.embedder import Embedder
	from src.rag.vector_store import VectorStore
	from src.rag.bm25_search import BM25Retriever
	from src.rag.hybrid_search import HybridRetriever
	from src.rag.reranker import Reranker
	from src.rag.answerer import Answerer
	from src.evaluation.evaluator import CodeEvaluator


	def build_pipeline(repo_dir, retrieve_k, top_n):
	    """Index ``repo_dir`` and assemble the evaluator that will query it.

	    The Python files found under ``repo_dir`` are chunked, each chunk's
	    ``"chunk_text"`` is embedded, and the embeddings are loaded into a
	    vector store. That store is paired with a BM25 retriever built over
	    the same chunks, and the resulting hybrid retriever is handed to a
	    ``CodeEvaluator`` together with a reranker and an answerer.

	    Args:
	        repo_dir: Directory passed to ``scan_python_files``.
	        retrieve_k: Forwarded unchanged to ``CodeEvaluator``.
	        top_n: Forwarded unchanged to ``CodeEvaluator``.

	    Returns:
	        A ``(evaluator, chunk_count)`` tuple, where ``chunk_count`` is the
	        number of chunks that were indexed.
	    """
	    source_files = scan_python_files(repo_dir)
	    chunks = chunk_repo(source_files)
	    chunk_texts = [chunk["chunk_text"] for chunk in chunks]

	    # The same embedder instance is reused by the evaluator below.
	    embedder = Embedder()
	    vectors = embedder.create_embeddings(chunk_texts)

	    store = VectorStore()
	    store.build(vectors, chunks)

	    keyword_retriever = BM25Retriever(chunks)
	    retriever = HybridRetriever(store, keyword_retriever)

	    evaluator = CodeEvaluator(
	        embedder, retriever, Reranker(), Answerer(), retrieve_k, top_n
	    )
	    return evaluator, len(chunks)


	def main():
	    """Parse CLI arguments, run the evaluation, print a report, save results.

	    Command-line options:
	        --repo      Path to a Python repo directory (required).
	        --testset   Path to a JSON file with the test questions (required).
	        --retrieve  Integer forwarded to ``build_pipeline`` as ``retrieve_k``
	                    (default 10).
	        --top       Integer forwarded to ``build_pipeline`` as ``top_n``
	                    (default 5).
	        --out       Path of the JSON results file
	                    (default ``data/eval/results.json``).

	    One status line is printed per evaluated question, followed by a summary
	    table. The settings, summary and per-question rows are then written to
	    ``--out`` as UTF-8 JSON.
	    """
	    ap = argparse.ArgumentParser()
	    ap.add_argument("--repo", required=True, help="path to a Python repo directory")
	    ap.add_argument("--testset", required=True, help="JSON list of test questions")
	    ap.add_argument("--retrieve", type=int, default=10)
	    ap.add_argument("--top", type=int, default=5)
	    ap.add_argument("--out", default="data/eval/results.json")
	    args = ap.parse_args()

	    with open(args.testset, encoding="utf-8") as f:
	        testset = json.load(f)

	    print(f"Building index for {args.repo} ...")
	    evaluator, n_chunks = build_pipeline(args.repo, args.retrieve, args.top)
	    print(f"Indexed {n_chunks} chunks. Running {len(testset)} questions "
	          f"(retrieve={args.retrieve}, top={args.top})...\n")

	    rows, summary = evaluator.evaluate(testset)

	    for r in rows:
	        fh = "Y" if r["file_hit"] else "."
	        # A name_hit of None is shown as "-" rather than as a miss
	        # (presumably the question has no expected function -- confirm
	        # against CodeEvaluator).
	        if r["name_hit"] is None:
	            nh = "-"
	        else:
	            nh = "Y" if r["name_hit"] else "."
	        ch = "Y" if r["citation_ok"] else "."
	        # "|" needs no escaping in a Python string; "\|" is an invalid escape
	        # sequence that warns at compile time and prints a stray backslash.
	        print(f"[file {fh}][fn {nh}][cite {ch}] {r['seconds']:5.2f}s | {r['question'][:55]}")

	    print("\n" + "=" * 50)
	    print("SUMMARY")
	    print("=" * 50)
	    print(f"Questions ............ {summary['questions']}")
	    print(f"File accuracy ........ {summary['file_accuracy']}%")
	    if summary["function_accuracy"] is not None:
	        print(f"Function accuracy .... {summary['function_accuracy']}%")
	    print(f"Citation accuracy .... {summary['citation_accuracy']}%")
	    print(f"Median latency ....... {summary['median_latency']}s")

	    # os.path.dirname() returns "" for a bare filename such as "results.json",
	    # and os.makedirs("") raises FileNotFoundError. Only create the directory
	    # when the output path actually names one, so a finished evaluation is
	    # never lost at the save step.
	    out_dir = os.path.dirname(args.out)
	    if out_dir:
	        os.makedirs(out_dir, exist_ok=True)
	    with open(args.out, "w", encoding="utf-8") as f:
	        json.dump({"settings": vars(args), "summary": summary, "rows": rows},
	                  f, indent=2, ensure_ascii=False)
	    print(f"\nFull results saved to {args.out}")


	# Script entry point: run the evaluation only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()