Spaces:

AishaSurve
/

codebase-agent

Running

App Files Files Community

codebase-agent / app.py

AishaSurve

Codebase Intelligence Agent: code-aware RAG + test-gen agent + eval

8e72e1f 3 days ago

Raw

History Blame Contribute Delete

4.3 kB

	import os
	import hashlib
	import streamlit as st

	# Page chrome must be configured before any other Streamlit call renders.
	st.set_page_config(
	    page_title="Codebase Intelligence Agent",
	    page_icon="🧭",
	    layout="wide",
	)

	# Copy the OpenAI key from Streamlit/HF secrets into the process environment
	# when the environment does not already provide a non-empty value.
	try:
	    if not os.getenv("OPENAI_API_KEY"):
	        if "OPENAI_API_KEY" in st.secrets:
	            os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]
	except Exception:
	    # Deliberately best-effort: touching st.secrets may raise when no
	    # secrets are configured, and the app should still start in that case.
	    pass

	# Static page header and one-line usage hint.
	st.title("🧭 Codebase Intelligence Agent")
	st.write("Upload a Python repository (ZIP) and ask questions — answers come with exact file/line citations.")


	# ---- models: built once and reused (st.cache_resource keeps a single
	# ---- shared instance across reruns and sessions of this server process)
	@st.cache_resource(show_spinner="Loading models (first run only)...")
	def load_models():
	    """Instantiate the embedder, reranker and answerer.

	    Imports are done lazily inside the function so the project modules are
	    only loaded when the models are first requested.

	    Returns:
	        dict with keys "embedder", "reranker" and "answerer", each mapped to
	        a freshly constructed instance of the corresponding class.
	    """
	    from src.rag.embedder import Embedder
	    from src.rag.reranker import Reranker
	    from src.rag.answerer import Answerer

	    # Construct in the same order as the keys appear in the result.
	    embedder = Embedder()
	    reranker = Reranker()
	    answerer = Answerer()
	    return {"embedder": embedder, "reranker": reranker, "answerer": answerer}


	# ---- ingest + chunk, cached per uploaded file ----
	@st.cache_data(show_spinner="Scanning and parsing repository...")
	def ingest(file_bytes, file_name):
	    """Write the uploaded archive to scratch space, scan it and chunk it.

	    Args:
	        file_bytes: Raw bytes of the uploaded ZIP archive.
	        file_name: Client-supplied name of the upload; only its final path
	            component is used for the scratch file.

	    Returns:
	        A ``(chunks, n_files)`` tuple: the result of ``chunk_repo`` and the
	        number of entries ``scan_repo`` reported.

	    NOTE(review): the scratch directory is deleted before returning. This
	    assumes the chunks carry their own text and nothing downstream re-reads
	    files from that directory — confirm against scan_repo/chunk_repo.
	    """
	    import tempfile
	    from src.ingestion.scanner import scan_repo
	    from src.ingestion.chunker import chunk_repo

	    # The name comes from the client: keep only the final component so it
	    # cannot point outside the scratch directory, and fall back to a fixed
	    # name if nothing usable is left.
	    safe_name = os.path.basename(file_name) or "upload.zip"

	    # TemporaryDirectory removes the scratch space on exit; the previous
	    # mkdtemp() call left one directory behind for every distinct upload.
	    with tempfile.TemporaryDirectory() as workdir:
	        tmp_zip = os.path.join(workdir, safe_name)
	        with open(tmp_zip, "wb") as f:
	            f.write(file_bytes)
	        files, _ = scan_repo(tmp_zip)
	        # Chunk while the scratch directory still exists, in case the
	        # chunker needs anything the scanner left on disk.
	        chunks = chunk_repo(files)
	        n_files = len(files)
	    return chunks, n_files


	# ---- search index, cached per file: only file_hash feeds the cache key,
	# ---- the underscore-prefixed arguments are not hashed by Streamlit
	@st.cache_resource(show_spinner="Building search index...")
	def build_index(file_hash, _chunks, _embeddings):
	    """Build the hybrid retriever for one uploaded archive.

	    Args:
	        file_hash: Digest of the upload; not used in the body, it exists so
	            the cache distinguishes one archive from another.
	        _chunks: Chunk records passed to both the vector store and the BM25
	            retriever.
	        _embeddings: Embeddings passed to the vector store alongside the
	            chunks.

	    Returns:
	        A HybridRetriever wrapping the built VectorStore and a BM25Retriever.
	    """
	    from src.rag.vector_store import VectorStore
	    from src.rag.bm25_search import BM25Retriever
	    from src.rag.hybrid_search import HybridRetriever

	    vector_store = VectorStore()
	    vector_store.build(_embeddings, _chunks)
	    bm25 = BM25Retriever(_chunks)
	    return HybridRetriever(vector_store, bm25)


	@st.cache_data(show_spinner="Embedding code...")
	def embed(file_hash, _texts, _embedder):
	    """Embed the chunk texts of one uploaded archive.

	    Args:
	        file_hash: Digest of the upload; unused in the body, it is the only
	            argument that takes part in the cache key.
	        _texts: Texts to embed (underscore prefix: not hashed).
	        _embedder: Object exposing ``create_embeddings`` (not hashed).

	    Returns:
	        Whatever ``_embedder.create_embeddings(_texts)`` returns.
	    """
	    vectors = _embedder.create_embeddings(_texts)
	    return vectors


	# ---- main flow: upload -> ingest -> embed -> index -> ask / generate tests ----
	uploaded = st.file_uploader("Upload repository ZIP", type=["zip"])

	if uploaded:
	    file_bytes = uploaded.getvalue()
	    # Content digest used as the cache key for embed() and build_index().
	    # Uploads are untrusted and MD5 collisions can be crafted, which would
	    # let one archive be served another archive's cached embeddings/index,
	    # so a collision-resistant hash is used instead.
	    file_hash = hashlib.sha256(file_bytes).hexdigest()

	    models = load_models()
	    chunks, n_files = ingest(file_bytes, uploaded.name)

	    # An archive that yields no chunks leaves nothing to embed, index or
	    # query; say so and end this run instead of building on empty input.
	    if not chunks:
	        st.warning("No Python definitions were found in this archive — nothing to index.")
	        st.stop()

	    texts = [c["chunk_text"] for c in chunks]
	    embeddings = embed(file_hash, texts, models["embedder"])
	    hybrid = build_index(file_hash, chunks, embeddings)

	    st.success(f"Indexed {len(chunks)} definitions from {n_files} files. Ask away.")

	    ask_tab, test_tab = st.tabs(["💬 Ask the codebase", "🧪 Generate tests"])

	    with ask_tab:
	        query = st.text_input("Ask about the codebase", placeholder="e.g. where are JWT tokens created?")
	        if query:
	            with st.spinner("Searching and answering..."):
	                # Retrieve 10 candidates, rerank them, answer from the top 5.
	                query_vector = models["embedder"].create_embeddings([query])[0]
	                results = hybrid.search(query, query_vector, k=10)
	                results = models["reranker"].rerank(query, results)
	                result = models["answerer"].answer(query, results[:5])

	            st.subheader("Answer")
	            st.write(result["answer"])

	            st.subheader("Sources")
	            for src in result["sources"]:
	                label = f"📄 {src['file']}:{src['start_line']}-{src['end_line']} · {src['type']} {src['name']}"
	                with st.expander(label):
	                    st.code(src["code"], language="python")

	    with test_tab:
	        st.write("Enter a function or class name and the agent will read its real source and write pytest tests.")
	        target = st.text_input("Function / class name", placeholder="e.g. create_access_token")
	        # The button is only rendered once a target name has been typed
	        # (short-circuit `and`), matching the original behaviour.
	        if target and st.button("Generate tests"):
	            from src.agent.tools import CodeTools
	            from src.agent.workflow import TestAgent
	            with st.spinner("Agent reading the code and writing tests..."):
	                tools = CodeTools(chunks, models["embedder"], hybrid, models["reranker"])
	                tests = TestAgent(tools).generate_tests(target)
	            # Drop Markdown code fences from the generated text before display.
	            st.code(tests.replace("```python", "").replace("```", "").strip(), language="python")

	    with st.expander("Repository overview"):
	        from collections import Counter

	        # Tally chunks per "type" value; Counter keeps first-seen order, so
	        # the rendered mapping lists types in the order they were found.
	        by_type = Counter(c["type"] for c in chunks)
	        st.write({"files": n_files, "definitions": len(chunks), **by_type})