Your Name committed on
Commit
c1a2087
·
1 Parent(s): c080da9

Added main.py : core RAG pipeline for codebase explainer

Browse files
Files changed (1) hide show
  1. main.py +211 -0
main.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import time
4
+ import git
5
+ from dotenv import load_dotenv
6
+ from langchain_groq import ChatGroq
7
+ from langchain_huggingface import HuggingFaceEmbeddings
8
+ from langchain_community.vectorstores import Chroma
9
+ from langchain_community.document_loaders import DirectoryLoader, TextLoader
10
+ from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
11
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
12
+ from langchain_core.output_parsers import StrOutputParser
13
+ from langchain_community.chat_message_histories import ChatMessageHistory
14
+
15
# Read variables from a local .env file into the environment before the
# model clients are constructed (presumably the Groq API key lives there;
# confirm against deployment docs).
load_dotenv()

# ── Models ────────────────────────────────────────────────
# Chat model used for answering: temperature 0, at most 500 generated tokens.
llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0,
    max_tokens=500,
)
# Embedding model shared by indexing and retrieval.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
print("Models ready!")
21
+
22
+ # ── Core functions ────────────────────────────────────────
23
def _repo_name_from_url(github_url):
    """Return the repository name taken from the last path segment of a URL.

    A trailing ``/`` and a trailing ``.git`` are removed. Raises ``ValueError``
    when nothing usable is left, or when the segment is ``.`` / ``..``, since
    that name is later joined into a path that gets deleted recursively.
    """
    name = github_url.strip().rstrip("/").split("/")[-1]
    if name.endswith(".git"):
        name = name[: -len(".git")]
    if name in ("", ".", ".."):
        raise ValueError(
            f"Cannot derive a repository name from URL: {github_url!r}"
        )
    return name


def clone_repo(github_url):
    """Clone a Git repository into ``cloned_repos/<repo_name>``.

    Any existing directory at the target path is removed first, so every
    call starts from a fresh clone.

    Args:
        github_url: Clone URL; the repository name is its last path segment.

    Returns:
        A ``(clone_path, repo_name)`` tuple.

    Raises:
        ValueError: If no safe repository name can be derived from the URL.
    """
    # Validate before touching the filesystem: an empty or ".." name would
    # make the rmtree below target "cloned_repos/" itself or its parent.
    repo_name = _repo_name_from_url(github_url)
    clone_path = f"cloned_repos/{repo_name}"
    if os.path.exists(clone_path):
        shutil.rmtree(clone_path)
    os.makedirs("cloned_repos", exist_ok=True)
    print(f"Cloning {repo_name}...")
    git.Repo.clone_from(github_url, clone_path)
    print(f"Done! Saved to: {clone_path}")
    return clone_path, repo_name
34
+
35
+
36
def load_code_files(repo_path, extensions=None):
    """Load text files with the given extensions from a cloned repo.

    Args:
        repo_path: Root directory that is searched recursively.
        extensions: Iterable of file extensions (a leading dot is tolerated).
            ``None`` selects the default list below. Pass extra entries such
            as ``"jsx"`` or ``"java"`` to load file types that ``split_code``
            already has language-aware splitters for.

    Returns:
        A list of loaded documents. Each document's metadata gains
        ``file_name`` (basename of its ``source`` entry, or ``"unknown"``)
        and ``file_type`` (the extension it was matched by).
    """
    if extensions is None:
        extensions = ["py", "js", "ts", "md", "txt", "json", "css", "html"]

    all_docs = []
    for raw_ext in extensions:
        ext = raw_ext.lstrip(".")
        try:
            loader = DirectoryLoader(
                repo_path,
                glob=f"**/*.{ext}",
                loader_cls=TextLoader,
                loader_kwargs={"encoding": "utf-8"},
                silent_errors=True,
            )
            docs = loader.load()
        except Exception as e:
            # Best effort: one failing file type must not abort the others.
            print(f"Skipped .{ext}: {e}")
            continue

        for doc in docs:
            doc.metadata["file_name"] = os.path.basename(
                doc.metadata.get("source", "unknown")
            )
            doc.metadata["file_type"] = ext
        all_docs.extend(docs)
        print(f"Loaded {len(docs)} .{ext} files")

    print(f"\nTotal files loaded: {len(all_docs)}")
    return all_docs
62
+
63
+
64
def split_code(all_docs):
    """Split documents into chunks using language-aware splitters.

    The splitter is chosen from each document's ``file_type`` metadata:
    extensions listed in the table below use a language-specific splitter
    (2000-character chunks, 300 overlap); everything else uses the generic
    splitter (1500-character chunks, 200 overlap).

    Args:
        all_docs: Documents whose metadata may carry a ``file_type`` entry.

    Returns:
        A flat list of chunks, in the same order as the input documents.
    """
    EXTENSION_TO_LANGUAGE = {
        "py": Language.PYTHON,
        "js": Language.JS,
        "ts": Language.TS,
        "jsx": Language.JS,
        "tsx": Language.TS,
        "java": Language.JAVA,
        "cpp": Language.CPP,
        "c": Language.CPP,
        "go": Language.GO,
        "rb": Language.RUBY,
        "rs": Language.RUST,
        "md": Language.MARKDOWN,
    }

    # One splitter per language, built on first use and then reused; the key
    # None holds the generic fallback splitter.
    splitters = {}

    all_chunks = []
    for doc in all_docs:
        # `or ""` also covers a file_type that is present but set to None.
        ext = (doc.metadata.get("file_type") or "").lower()
        language = EXTENSION_TO_LANGUAGE.get(ext)

        splitter = splitters.get(language)
        if splitter is None:
            if language is not None:
                splitter = RecursiveCharacterTextSplitter.from_language(
                    language=language,
                    chunk_size=2000,
                    chunk_overlap=300,
                )
            else:
                splitter = RecursiveCharacterTextSplitter(
                    chunk_size=1500,
                    chunk_overlap=200,
                )
            splitters[language] = splitter

        all_chunks.extend(splitter.split_documents([doc]))

    print(f"Original files : {len(all_docs)}")
    print(f"After splitting: {len(all_chunks)} chunks")
    return all_chunks
101
+
102
+
103
def store_in_chromadb(chunks):
    """Embed code chunks and store them in an in-memory Chroma collection.

    Args:
        chunks: Non-empty list of document chunks to index.

    Returns:
        The Chroma vector store holding the embedded chunks.

    Raises:
        ValueError: If ``chunks`` is empty, since there is nothing to index.
    """
    import uuid  # local import: only this function needs it

    # Fail early with a clear message instead of an error from inside Chroma.
    if not chunks:
        raise ValueError("No chunks to store: nothing was loaded from the repo")

    print("Storing chunks in ChromaDB...")
    time.sleep(1)  # ensure any previous instance is released
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        # A fresh name per call keeps repeated builds in one process from
        # writing into the same default collection and mixing repositories.
        collection_name=f"codebase-{uuid.uuid4().hex}",
    )
    print(f"Stored {len(chunks)} chunks ✅")
    return vectorstore
113
+
114
+
115
def ask_question(question, vectorstore, history):
    """Answer a question about the indexed codebase.

    Retrieves chunks with MMR search (k=8, fetch_k=20, lambda_mult=0.7),
    passes them to the LLM together with the chat history, then appends the
    question and the answer to ``history``.

    Args:
        question: The user's question as plain text.
        vectorstore: Vector store exposing ``as_retriever``.
        history: Chat history exposing ``messages``, ``add_user_message``
            and ``add_ai_message``; it is mutated by this call.

    Returns:
        The model's answer as a string.
    """
    # Step 1: Retrieve relevant chunks
    # perf_counter is monotonic, so the latency cannot be skewed by
    # system clock adjustments.
    start_search = time.perf_counter()
    retriever = vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={"k": 8, "fetch_k": 20, "lambda_mult": 0.7},
    )
    docs = retriever.invoke(question)
    search_latency_ms = (time.perf_counter() - start_search) * 1000
    print(f"🔍 Vector DB Query Latency: {search_latency_ms:.2f} ms")

    # Step 2: Format context with file names
    # .get() keeps chunks usable even if they lack the file_name metadata.
    context = "\n\n".join(
        f"# File: {d.metadata.get('file_name', 'unknown')}\n{d.page_content}"
        for d in docs
    )

    # Step 3: Build prompt
    prompt = ChatPromptTemplate.from_messages([
        ("system",
         "You are an expert code analyst for a GitHub repository.\n"
         "Answer questions using the retrieved code chunks below.\n\n"
         "Rules:\n"
         "- Always name the exact file where you found the answer\n"
         "- Prioritize source code files (.py, .js, .ts) over documentation (README, conf.py, setup.py)\n"
         "- If implementation is spread across files, piece it together\n"
         "- If you see a method name or partial logic, explain what it does\n"
         "- NEVER say 'not in codebase' if you found related code or methods\n"
         "- Give specific details: method names, parameters, logic flow\n"
         "- If truly nothing relevant exists, say what you DID find instead\n\n"
         "Code context:\n{context}"),
        MessagesPlaceholder(variable_name="history"),
        ("human", "{question}"),
    ])

    # Step 4: Run chain
    parser = StrOutputParser()
    chain = prompt | llm | parser
    start_llm = time.perf_counter()
    response = chain.invoke({
        "context": context,
        "history": history.messages,
        "question": question,
    })
    print(f"🤖 LLM Generation Time: {time.perf_counter() - start_llm:.2f} seconds")

    # Step 5: Save to memory
    # Stored only after a successful call, so a failed request leaves the
    # history unchanged.
    history.add_user_message(question)
    history.add_ai_message(response)

    return response
167
+
168
+
169
def build_codebase_explainer(github_url):
    """Run the full ingestion pipeline for one repository.

    Clones the repository, loads its files, splits them into chunks, indexes
    the chunks in the vector store and creates an empty chat history.

    Args:
        github_url: Clone URL of the repository to index.

    Returns:
        A ``(vectorstore, history, repo_name)`` tuple.
    """
    print(f"Building explainer for: {github_url}\n")
    # perf_counter is monotonic, so the reported duration is unaffected by
    # system clock adjustments.
    start_ingestion = time.perf_counter()
    clone_path, repo_name = clone_repo(github_url)
    all_docs = load_code_files(clone_path)
    chunks = split_code(all_docs)
    vectorstore = store_in_chromadb(chunks)
    history = ChatMessageHistory()
    elapsed_ingestion = time.perf_counter() - start_ingestion

    print("\n" + "═" * 50)
    print(f"✅ Ready! Indexed {len(all_docs)} files, {len(chunks)} chunks")
    print(f"⏱ Total Ingestion Time: {elapsed_ingestion:.2f} seconds")
    print(f"Repo: {repo_name}")
    print("═" * 50 + "\n")

    return vectorstore, history, repo_name
187
+
188
+
189
+ # ── Run ───────────────────────────────────────────────────
190
if __name__ == "__main__":
    # Demo run: index one public repository, then ask a fixed set of
    # questions and print each answer with its end-to-end time.
    vectorstore, history, repo_name = build_codebase_explainer(
        "https://github.com/psf/requests"
    )

    questions = [
        "What does this project do?",
        "What are the core source code files and what does each do?",
        "What language is it written in?",
        "How do I install this?",
        "Are there any tests?",
    ]

    print(f"REPO: {repo_name}\n")

    for number, query in enumerate(questions, start=1):
        began = time.time()
        response = ask_question(query, vectorstore, history)
        elapsed = time.time() - began
        print(f"Q{number}: {query}")
        print(f"A : {response}")
        print(f"⏱ : {elapsed:.2f}s\n")