codebase-agent / app.py
AishaSurve's picture
Codebase Intelligence Agent: code-aware RAG + test-gen agent + eval
8e72e1f
Raw
History Blame Contribute Delete
4.3 kB
import os
import hashlib
import streamlit as st
# Page configuration comes before any other Streamlit call in this script.
st.set_page_config(
    page_title="Codebase Intelligence Agent",
    page_icon="🧭",
    layout="wide",
)

# Copy the OpenAI key from Streamlit/HF secrets into the environment, but only
# when the environment does not already supply a non-empty value. This is a
# best-effort bridge: any failure (for example, no secrets being configured)
# is deliberately ignored so the app still starts.
try:
    env_key = os.getenv("OPENAI_API_KEY")
    if not env_key:
        if "OPENAI_API_KEY" in st.secrets:
            os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]
except Exception:
    pass

st.title("🧭 Codebase Intelligence Agent")
st.write("Upload a Python repository (ZIP) and ask questions — answers come with exact file/line citations.")
# ---- models: loaded once per server process (cache_resource is shared across sessions) ----
@st.cache_resource(show_spinner="Loading models (first run only)...")
def load_models():
    """Instantiate the embedder, reranker and answerer and return them.

    The project modules are imported lazily inside the function, and the
    returned dict is cached by ``st.cache_resource``.

    Returns:
        dict with the keys ``"embedder"``, ``"reranker"`` and ``"answerer"``,
        each holding a freshly constructed instance of the matching class.
    """
    from src.rag.embedder import Embedder
    from src.rag.reranker import Reranker
    from src.rag.answerer import Answerer

    # Construct in the same order the dict keys are listed.
    embedder = Embedder()
    reranker = Reranker()
    answerer = Answerer()
    return dict(embedder=embedder, reranker=reranker, answerer=answerer)
# ---- ingest + chunk, cached per uploaded file ----
@st.cache_data(show_spinner="Scanning and parsing repository...")
def ingest(file_bytes, file_name):
    """Write the uploaded ZIP to a temp directory, then scan and chunk it.

    Args:
        file_bytes: Raw bytes of the uploaded ZIP archive.
        file_name: Client-supplied name of the upload. Only its final path
            component is used for the file written to disk.

    Returns:
        Tuple ``(chunks, n_files)``: the value returned by ``chunk_repo`` and
        the number of files returned by ``scan_repo``.
    """
    import tempfile

    from src.ingestion.scanner import scan_repo
    from src.ingestion.chunker import chunk_repo

    # The name originates from the client, so treat it as untrusted: an
    # absolute path or "../" segments would make os.path.join point outside
    # the temp directory. Keep only the last path component.
    safe_name = os.path.basename(file_name.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        safe_name = "upload.zip"

    # NOTE(review): this temp directory is never removed. Clean-up is not
    # added here because it is not visible whether scan_repo extracts into
    # this directory or whether anything reads from it later — confirm.
    tmp_zip = os.path.join(tempfile.mkdtemp(), safe_name)
    with open(tmp_zip, "wb") as f:
        f.write(file_bytes)

    files, _ = scan_repo(tmp_zip)
    chunks = chunk_repo(files)
    return chunks, len(files)
# ---- build index, cached per file (underscore args skip hashing) ----
@st.cache_resource(show_spinner="Building search index...")
def build_index(file_hash, _chunks, _embeddings):
    """Build the hybrid retriever for one uploaded repository.

    Args:
        file_hash: Digest of the uploaded file. It is not used in the body;
            it is the only argument without a leading underscore, so it is
            what distinguishes cached results.
        _chunks: Chunks passed to both the vector store and the BM25 retriever.
        _embeddings: Embeddings passed to ``VectorStore.build`` with the chunks.

    Returns:
        A ``HybridRetriever`` wrapping the built vector store and a
        ``BM25Retriever`` over the same chunks.
    """
    from src.rag.vector_store import VectorStore
    from src.rag.bm25_search import BM25Retriever
    from src.rag.hybrid_search import HybridRetriever

    vector_store = VectorStore()
    vector_store.build(_embeddings, _chunks)
    keyword_retriever = BM25Retriever(_chunks)
    return HybridRetriever(vector_store, keyword_retriever)
@st.cache_data(show_spinner="Embedding code...")
def embed(file_hash, _texts, _embedder):
    """Return the embeddings of ``_texts`` computed by ``_embedder``.

    Args:
        file_hash: Digest of the uploaded file. It is not used in the body;
            it is the only argument without a leading underscore, so it is
            what distinguishes cached results.
        _texts: Texts handed to ``_embedder.create_embeddings``.
        _embedder: Object exposing ``create_embeddings``.

    Returns:
        Whatever ``_embedder.create_embeddings(_texts)`` returns.
    """
    vectors = _embedder.create_embeddings(_texts)
    return vectors
# ---- main flow: everything below the uploader runs only once a ZIP is provided ----
uploaded = st.file_uploader("Upload repository ZIP", type=["zip"])
if uploaded:
    file_bytes = uploaded.getvalue()
    # The MD5 digest is only the cache key for embed() and build_index().
    file_hash = hashlib.md5(file_bytes).hexdigest()

    models = load_models()
    chunks, n_files = ingest(file_bytes, uploaded.name)

    # Nothing was extracted from the archive: do not hand empty inputs to the
    # embedder and index builders, and do not invite the user to ask questions.
    if not chunks:
        st.warning(
            f"No definitions were found in the {n_files} scanned files. "
            "Upload a ZIP that contains Python source code."
        )
        st.stop()

    texts = [c["chunk_text"] for c in chunks]
    embeddings = embed(file_hash, texts, models["embedder"])
    hybrid = build_index(file_hash, chunks, embeddings)
    st.success(f"Indexed {len(chunks)} definitions from {n_files} files. Ask away.")

    ask_tab, test_tab = st.tabs(["💬 Ask the codebase", "🧪 Generate tests"])

    with ask_tab:
        query = st.text_input("Ask about the codebase", placeholder="e.g. where are JWT tokens created?")
        if query:
            with st.spinner("Searching and answering..."):
                # Retrieve 10 candidates, rerank them, answer from the top 5.
                query_vector = models["embedder"].create_embeddings([query])[0]
                results = hybrid.search(query, query_vector, k=10)
                results = models["reranker"].rerank(query, results)
                result = models["answerer"].answer(query, results[:5])

            st.subheader("Answer")
            st.write(result["answer"])

            st.subheader("Sources")
            for s in result["sources"]:
                label = f"📄 {s['file']}:{s['start_line']}-{s['end_line']} · {s['type']} {s['name']}"
                with st.expander(label):
                    st.code(s["code"], language="python")

    with test_tab:
        st.write("Enter a function or class name and the agent will read its real source and write pytest tests.")
        target = st.text_input("Function / class name", placeholder="e.g. create_access_token")
        # The button is only rendered once a target name has been typed.
        if target and st.button("Generate tests"):
            from src.agent.tools import CodeTools
            from src.agent.workflow import TestAgent

            with st.spinner("Agent reading the code and writing tests..."):
                tools = CodeTools(chunks, models["embedder"], hybrid, models["reranker"])
                tests = TestAgent(tools).generate_tests(target)
            # Strip Markdown code fences from the generated text before display.
            st.code(tests.replace("```python", "").replace("```", "").strip(), language="python")

    with st.expander("Repository overview"):
        # Count chunks per value of their "type" field.
        by_type = {}
        for c in chunks:
            by_type[c["type"]] = by_type.get(c["type"], 0) + 1
        st.write({"files": n_files, "definitions": len(chunks), **by_type})