Spaces:

Krishp1
/

codebase-explainer

Running

App Files Files Community

Your Name committed on May 27

Commit

c1a2087

1 Parent(s): c080da9

Added main.py : core RAG pipeline for codebase explainer

Browse files

Files changed (1) hide show

main.py +211 -0

main.py ADDED Viewed

	@@ -0,0 +1,211 @@

+import os
+import shutil
+import time
+import git
+from dotenv import load_dotenv
+from langchain_groq import ChatGroq
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.vectorstores import Chroma
+from langchain_community.document_loaders import DirectoryLoader, TextLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain_core.output_parsers import StrOutputParser
+from langchain_community.chat_message_histories import ChatMessageHistory
+# Runs at import time, before the model clients below are constructed.
+# NOTE(review): presumably this loads API keys (e.g. for Groq) from a local
+# .env file into the environment — confirm which variables are required.
+load_dotenv()
+# ── Models ────────────────────────────────────────────────
+# Shared chat model used by ask_question(): deterministic sampling
+# (temperature=0) and replies capped at 500 tokens.
+llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0, max_tokens=500)
+# Shared embedding model handed to Chroma in store_in_chromadb().
+embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+# Module-level statement: printed on every import of this module, not only
+# when it is run as a script.
+print("Models ready!")
+# ── Core functions ────────────────────────────────────────
+def clone_repo(github_url):
+    """Clone a GitHub repo to local folder"""
+    repo_name  = github_url.rstrip("/").split("/")[-1]
+    clone_path = f"cloned_repos/{repo_name}"
+    if os.path.exists(clone_path):
+        shutil.rmtree(clone_path)
+    os.makedirs("cloned_repos", exist_ok=True)
+    print(f"Cloning {repo_name}...")
+    git.Repo.clone_from(github_url, clone_path)
+    print(f"Done! Saved to: {clone_path}")
+    return clone_path, repo_name
+def load_code_files(repo_path):
+    """Load all code files from the cloned repo"""
+    extensions = ["py", "js", "ts", "md", "txt", "json", "css", "html"]
+    all_docs   = []
+    for ext in extensions:
+        try:
+            loader = DirectoryLoader(
+                repo_path,
+                glob=f"**/*.{ext}",
+                loader_cls=TextLoader,
+                loader_kwargs={"encoding": "utf-8"},
+                silent_errors=True
+            )
+            docs = loader.load()
+            for doc in docs:
+                doc.metadata["file_name"] = os.path.basename(
+                    doc.metadata.get("source", "unknown")
+                )
+                doc.metadata["file_type"] = ext
+            all_docs.extend(docs)
+            print(f"Loaded {len(docs)} .{ext} files")
+        except Exception as e:
+            print(f"Skipped .{ext}: {e}")
+            continue
+    print(f"\nTotal files loaded: {len(all_docs)}")
+    return all_docs
+def split_code(all_docs):
+    """Split documents into chunks using language-aware splitters"""
+    EXTENSION_TO_LANGUAGE = {
+        "py":   Language.PYTHON,
+        "js":   Language.JS,
+        "ts":   Language.TS,
+        "jsx":  Language.JS,
+        "tsx":  Language.TS,
+        "java": Language.JAVA,
+        "cpp":  Language.CPP,
+        "c":    Language.CPP,
+        "go":   Language.GO,
+        "rb":   Language.RUBY,
+        "rs":   Language.RUST,
+        "md":   Language.MARKDOWN,
+    }
+    all_chunks = []
+    for doc in all_docs:
+        ext      = doc.metadata.get("file_type", "").lower()
+        language = EXTENSION_TO_LANGUAGE.get(ext)
+        if language:
+            splitter = RecursiveCharacterTextSplitter.from_language(
+                language=language,
+                chunk_size=2000,
+                chunk_overlap=300
+            )
+        else:
+            splitter = RecursiveCharacterTextSplitter(
+                chunk_size=1500,
+                chunk_overlap=200
+            )
+        all_chunks.extend(splitter.split_documents([doc]))
+    print(f"Original files : {len(all_docs)}")
+    print(f"After splitting: {len(all_chunks)} chunks")
+    return all_chunks
+def store_in_chromadb(chunks):
+    """Store code chunks in ChromaDB (in-memory)"""
+    print("Storing chunks in ChromaDB...")
+    time.sleep(1)  # ensure any previous instance is released
+    vectorstore = Chroma.from_documents(
+        documents=chunks,
+        embedding=embeddings
+    )
+    print(f"Stored {len(chunks)} chunks ✅")
+    return vectorstore
+def ask_question(question, vectorstore, history):
+    """Ask any question about the codebase"""
+    start_search = time.time()
+    # Step 1: Retrieve relevant chunks
+    retriever = vectorstore.as_retriever(
+        search_type="mmr",
+        search_kwargs={"k": 8, "fetch_k": 20, "lambda_mult": 0.7}
+    )
+    docs = retriever.invoke(question)
+    search_latency_ms = (time.time() - start_search) * 1000
+    print(f"🔍 Vector DB Query Latency: {search_latency_ms:.2f} ms")
+    # Step 2: Format context with file names
+    context = "\n\n".join([
+        f"# File: {d.metadata['file_name']}\n{d.page_content}"
+        for d in docs
+    ])
+    # Step 3: Build prompt
+    prompt = ChatPromptTemplate.from_messages([
+        ("system",
+         "You are an expert code analyst for a GitHub repository.\n"
+         "Answer questions using the retrieved code chunks below.\n\n"
+         "Rules:\n"
+         "- Always name the exact file where you found the answer\n"
+         "- Prioritize source code files (.py, .js, .ts) over documentation (README, conf.py, setup.py)\n"
+         "- If implementation is spread across files, piece it together\n"
+         "- If you see a method name or partial logic, explain what it does\n"
+         "- NEVER say 'not in codebase' if you found related code or methods\n"
+         "- Give specific details: method names, parameters, logic flow\n"
+         "- If truly nothing relevant exists, say what you DID find instead\n\n"
+         "Code context:\n{context}"),
+        MessagesPlaceholder(variable_name="history"),
+        ("human", "{question}")
+    ])
+    # Step 4: Run chain
+    parser   = StrOutputParser()
+    chain    = prompt | llm | parser
+    start_llm = time.time()
+    response = chain.invoke({
+        "context" : context,
+        "history" : history.messages,
+        "question": question
+    })
+    print(f"🤖 LLM Generation Time: {time.time() - start_llm:.2f} seconds")
+    # Step 5: Save to memory
+    history.add_user_message(question)
+    history.add_ai_message(response)
+    return response
+def build_codebase_explainer(github_url):
+    """Complete pipeline in one function"""
+    print(f"Building explainer for: {github_url}\n")
+    start_ingestion = time.time()
+    clone_path, repo_name = clone_repo(github_url)
+    all_docs              = load_code_files(clone_path)
+    chunks                = split_code(all_docs)
+    vectorstore           = store_in_chromadb(chunks)
+    history               = ChatMessageHistory()
+    elapsed_ingestion = time.time() - start_ingestion
+    print("\n" + "═" * 50)
+    print(f"✅ Ready! Indexed {len(all_docs)} files, {len(chunks)} chunks")
+    print(f"⏱  Total Ingestion Time: {elapsed_ingestion:.2f} seconds")
+    print(f"Repo: {repo_name}")
+    print("═" * 50 + "\n")
+    return vectorstore, history, repo_name
+# ── Run ───────────────────────────────────────────────────
+if __name__ == "__main__":
+    vectorstore, history, repo_name = build_codebase_explainer(
+        "https://github.com/psf/requests"
+    )
+    questions = [
+        "What does this project do?",
+        "What are the core source code files and what does each do?",
+        "What language is it written in?",
+        "How do I install this?",
+        "Are there any tests?",
+    ]
+    print(f"REPO: {repo_name}\n")
+    for i, q in enumerate(questions):
+        start    = time.time()
+        response = ask_question(q, vectorstore, history)
+        elapsed  = time.time() - start
+        print(f"Q{i+1}: {q}")
+        print(f"A  : {response}")
+        print(f"⏱  : {elapsed:.2f}s\n")