"""
Index endpoint - Index a codebase from various sources
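
Example (a sketch, not a verified client: assumes the app is served locally on
port 8000 with this router mounted at the root, and that these enum values
match api.schemas):

    import requests

    resp = requests.post(
        "http://localhost:8000/index",
        json={
            "source": "https://github.com/user/repo",  # placeholder URL
            "provider": "gemini",
            "vector_db": "chroma",
        },
    )
    print(resp.json())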
"""
import os
import shutil
from fastapi import APIRouter, HTTPException
from api.schemas import IndexRequest, IndexResponse

router = APIRouter()


@router.post("/index", response_model=IndexResponse)
async def index_codebase(request: IndexRequest):
"""
Index a codebase from GitHub URL, local path, or ZIP file.
Args:
request: IndexRequest with source and settings
Returns:
IndexResponse with indexing status and statistics
"""
from api.state import app_state
try:
# Import required modules
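        # (Deferred to request time, presumably to keep app startup fast and
        # avoid loading these heavy dependencies until indexing is requested.)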
from code_chatbot.ingestion.universal_ingestor import process_source
from code_chatbot.analysis.ast_analysis import ASTGraphBuilder
from code_chatbot.ingestion.indexer import Indexer
from code_chatbot.retrieval.graph_rag import GraphEnhancedRetriever
from code_chatbot.retrieval.rag import ChatEngine
from code_chatbot.ingestion.chunker import StructuralChunker
from langchain_community.vectorstores import Chroma, FAISS
from langchain_community.vectorstores.utils import filter_complex_metadata
# Prepare extraction directory
extract_to = os.path.join("data", "extracted")
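        # Start from a clean slate: remove leftovers from any previous extraction.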
if os.path.exists(extract_to):
shutil.rmtree(extract_to)
# Stage 1: Extract & Ingest
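        # process_source loads documents from the given source and returns the
        # local directory the code was cloned/extracted to.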
documents, local_path = process_source(request.source, extract_to)
if not documents:
raise HTTPException(
status_code=400,
detail="No documents found in the source"
)
# Stage 2: AST Analysis
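        # Build an AST graph over all files and persist it as GraphML; the
        # graph-enhanced retriever below presumably reloads it from repo_dir.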
ast_builder = ASTGraphBuilder()
for doc in documents:
ast_builder.add_file(doc.metadata['file_path'], doc.page_content)
os.makedirs(local_path, exist_ok=True)
graph_path = os.path.join(local_path, "ast_graph.graphml")
ast_builder.save_graph(graph_path)
graph_nodes = ast_builder.graph.number_of_nodes()
        # Stage 3: Set up the indexer & chunk the documents
api_key = os.getenv("GOOGLE_API_KEY")
        if request.provider.value == "gemini" and not api_key:
raise HTTPException(
status_code=400,
detail="GOOGLE_API_KEY not set in environment"
)
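        # Non-Gemini providers are assumed to resolve their own credentials
        # (e.g. a GROQ_API_KEY) downstream; only the Gemini key is validated here.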
indexer = Indexer(
provider=request.provider.value,
api_key=api_key
)
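        # Reset the existing "codebase" collection so re-indexing does not mix
        # old and new chunks.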
indexer.clear_collection(collection_name="codebase")
chunker = StructuralChunker()
all_chunks = []
for doc in documents:
file_chunks = chunker.chunk(doc.page_content, doc.metadata["file_path"])
all_chunks.extend(file_chunks)
# Clean metadata
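        # Vector stores generally accept only primitive metadata values: drop
        # None entries, then let filter_complex_metadata strip anything non-scalar.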
for doc in all_chunks:
doc.metadata = {k: v for k, v in doc.metadata.items() if v is not None}
all_chunks = filter_complex_metadata(all_chunks)
# Stage 4: Index into vector store
vector_db_type = request.vector_db.value
if vector_db_type == "faiss":
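            # FAISS lives in memory, so the index must be saved to disk explicitly.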
vectordb = FAISS.from_documents(all_chunks, indexer.embedding_function)
vectordb.save_local(folder_path=indexer.persist_directory, index_name="codebase")
elif vector_db_type == "qdrant":
from langchain_qdrant import QdrantVectorStore
url = os.getenv("QDRANT_URL")
api_key_qdrant = os.getenv("QDRANT_API_KEY")
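            # Qdrant connection settings come from the environment; prefer_grpc
            # opts into the faster gRPC transport when available.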
vectordb = QdrantVectorStore.from_documents(
documents=all_chunks,
embedding=indexer.embedding_function,
url=url,
api_key=api_key_qdrant,
collection_name="codebase",
prefer_grpc=True
)
else: # Chroma
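            # Recent Chroma versions persist to persist_directory automatically
            # as documents are added, so no explicit save call is made here.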
vectordb = Chroma(
persist_directory=indexer.persist_directory,
embedding_function=indexer.embedding_function,
collection_name="codebase"
)
vectordb.add_documents(documents=all_chunks)
# Stage 5: Initialize Chat Engine
base_retriever = indexer.get_retriever(vector_db_type=vector_db_type)
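        # Layer graph-aware expansion over the plain vector retriever, pointing
        # it at the repo directory (where the AST graph was saved above).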
graph_retriever = GraphEnhancedRetriever(
base_retriever=base_retriever,
repo_dir=local_path
)
        repo_files = sorted({doc.metadata["file_path"] for doc in documents})
        model_name = (
            "gemini-2.5-flash"
            if request.provider.value == "gemini"
            else "llama-3.3-70b-versatile"
        )
        chat_engine = ChatEngine(
            retriever=graph_retriever,
            provider=request.provider.value,
            model_name=model_name,
            api_key=api_key,
            repo_files=repo_files,
            repo_name=os.path.basename(request.source),
            use_agent=True,
            repo_dir=local_path
        )
# Update app state
app_state.chat_engine = chat_engine
app_state.provider = request.provider.value
app_state.vector_db = vector_db_type
app_state.documents_count = len(all_chunks)
return IndexResponse(
status="success",
message=f"Successfully indexed {len(documents)} files",
files_indexed=len(documents),
chunks_created=len(all_chunks),
graph_nodes=graph_nodes
)
except HTTPException:
raise
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Indexing failed: {e}"
        ) from e