File size: 5,586 Bytes
5b89d45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a3bdcf1
 
 
 
 
 
5b89d45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""
Index endpoint - Index a codebase from various sources
"""
import os
import shutil
from fastapi import APIRouter, HTTPException, BackgroundTasks
from api.schemas import IndexRequest, IndexResponse

# Router for the /index endpoint; mounted by the main FastAPI app.
router = APIRouter()


@router.post("/index", response_model=IndexResponse)
async def index_codebase(request: IndexRequest):
    """
    Index a codebase from a GitHub URL, local path, or ZIP file.

    Pipeline: extract/ingest -> AST graph -> chunking -> vector store ->
    chat-engine initialization. On success the fully-initialized chat
    engine is published to the shared app state.

    Args:
        request: IndexRequest with the source location, LLM provider, and
            vector-database backend to use.

    Returns:
        IndexResponse with indexing status and statistics.

    Raises:
        HTTPException: 400 when no documents are found or a required API
            key is missing; 500 for any other failure during indexing.
    """
    from api.state import app_state

    try:
        # Import the whole pipeline up front so a missing dependency fails
        # fast, before any extraction work starts. (Helpers re-import the
        # pieces they need; Python caches modules, so this is free.)
        from code_chatbot.ingestion.universal_ingestor import process_source
        from code_chatbot.analysis.ast_analysis import ASTGraphBuilder
        from code_chatbot.ingestion.indexer import Indexer
        from code_chatbot.retrieval.graph_rag import GraphEnhancedRetriever
        from code_chatbot.retrieval.rag import ChatEngine
        from code_chatbot.ingestion.chunker import StructuralChunker
        from langchain_community.vectorstores import Chroma, FAISS
        from langchain_community.vectorstores.utils import filter_complex_metadata

        # Stage 1: Extract & Ingest
        documents, local_path = _ingest_source(request.source)

        if not documents:
            raise HTTPException(
                status_code=400,
                detail="No documents found in the source"
            )

        # Stage 2: AST Analysis
        graph_nodes = _build_ast_graph(documents, local_path)

        # Resolve credentials before doing any expensive embedding work.
        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key and request.provider.value == "gemini":
            raise HTTPException(
                status_code=400,
                detail="GOOGLE_API_KEY not set in environment"
            )

        # Stage 3: Chunking
        indexer, all_chunks = _chunk_documents(
            documents, request.provider.value, api_key
        )

        # Stage 4: Index into vector store
        vector_db_type = request.vector_db.value
        _build_vector_store(vector_db_type, all_chunks, indexer)

        # Stage 5: Initialize Chat Engine
        chat_engine = _create_chat_engine(
            request, indexer, vector_db_type, local_path, documents, api_key
        )

        # Publish to shared state so subsequent chat requests can use it.
        app_state.chat_engine = chat_engine
        app_state.provider = request.provider.value
        app_state.vector_db = vector_db_type
        app_state.documents_count = len(all_chunks)

        return IndexResponse(
            status="success",
            message=f"Successfully indexed {len(documents)} files",
            files_indexed=len(documents),
            chunks_created=len(all_chunks),
            graph_nodes=graph_nodes
        )

    except HTTPException:
        # Re-raise deliberate HTTP errors untouched (don't wrap them in a 500).
        raise
    except Exception as e:
        # Chain the cause so the original traceback survives into the logs.
        raise HTTPException(
            status_code=500,
            detail=f"Indexing failed: {str(e)}"
        ) from e


def _ingest_source(source: str):
    """Extract *source* into a clean directory and return (documents, local_path)."""
    from code_chatbot.ingestion.universal_ingestor import process_source

    extract_to = os.path.join("data", "extracted")
    # Remove stale artifacts from any previous indexing run.
    if os.path.exists(extract_to):
        shutil.rmtree(extract_to)
    return process_source(source, extract_to)


def _build_ast_graph(documents, local_path: str) -> int:
    """Build an AST graph over all documents, persist it, and return its node count."""
    from code_chatbot.analysis.ast_analysis import ASTGraphBuilder

    ast_builder = ASTGraphBuilder()
    for doc in documents:
        ast_builder.add_file(doc.metadata['file_path'], doc.page_content)

    os.makedirs(local_path, exist_ok=True)
    ast_builder.save_graph(os.path.join(local_path, "ast_graph.graphml"))
    return ast_builder.graph.number_of_nodes()


def _chunk_documents(documents, provider: str, api_key):
    """Create an Indexer, reset its collection, and structurally chunk every document.

    Returns (indexer, cleaned_chunks).
    """
    from code_chatbot.ingestion.indexer import Indexer
    from code_chatbot.ingestion.chunker import StructuralChunker
    from langchain_community.vectorstores.utils import filter_complex_metadata

    indexer = Indexer(provider=provider, api_key=api_key)
    indexer.clear_collection(collection_name="codebase")

    chunker = StructuralChunker()
    all_chunks = []
    for doc in documents:
        all_chunks.extend(chunker.chunk(doc.page_content, doc.metadata["file_path"]))

    # Vector stores reject None / complex metadata values, so scrub them first.
    for doc in all_chunks:
        doc.metadata = {k: v for k, v in doc.metadata.items() if v is not None}
    return indexer, filter_complex_metadata(all_chunks)


def _build_vector_store(vector_db_type: str, all_chunks, indexer):
    """Index the chunks into the selected backend: 'faiss', 'qdrant', or Chroma (default)."""
    from langchain_community.vectorstores import Chroma, FAISS

    if vector_db_type == "faiss":
        vectordb = FAISS.from_documents(all_chunks, indexer.embedding_function)
        vectordb.save_local(folder_path=indexer.persist_directory, index_name="codebase")
    elif vector_db_type == "qdrant":
        from langchain_qdrant import QdrantVectorStore
        # Qdrant is remote; credentials come from the environment.
        QdrantVectorStore.from_documents(
            documents=all_chunks,
            embedding=indexer.embedding_function,
            url=os.getenv("QDRANT_URL"),
            api_key=os.getenv("QDRANT_API_KEY"),
            collection_name="codebase",
            prefer_grpc=True
        )
    else:  # Chroma
        vectordb = Chroma(
            persist_directory=indexer.persist_directory,
            embedding_function=indexer.embedding_function,
            collection_name="codebase"
        )
        vectordb.add_documents(documents=all_chunks)


def _create_chat_engine(request, indexer, vector_db_type: str, local_path: str,
                        documents, api_key):
    """Wire the graph-enhanced retriever and chat engine for the indexed repository."""
    from code_chatbot.retrieval.graph_rag import GraphEnhancedRetriever
    from code_chatbot.retrieval.rag import ChatEngine

    base_retriever = indexer.get_retriever(vector_db_type=vector_db_type)
    graph_retriever = GraphEnhancedRetriever(
        base_retriever=base_retriever,
        repo_dir=local_path
    )

    # Deduplicated file list; ordering is not relied upon downstream here.
    repo_files = list({doc.metadata['file_path'] for doc in documents})

    return ChatEngine(
        retriever=graph_retriever,
        provider=request.provider.value,
        model_name="gemini-2.5-flash" if request.provider.value == "gemini"
        else "llama-3.3-70b-versatile",
        api_key=api_key,
        repo_files=repo_files,
        # NOTE(review): basename of a URL source yields the repo slug
        # (may retain a ".git" suffix) — presumably acceptable; verify.
        repo_name=os.path.basename(request.source),
        use_agent=True,
        repo_dir=local_path
    )