Your Name committed on
Commit
a3f4dff
·
0 Parent(s):

Initial commit: AI Codebase Explainer

Browse files
Files changed (3) hide show
  1. .gitignore +7 -0
  2. app.py +356 -0
  3. requirements.txt +13 -0
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ .env
2
+ cloned_repos/
3
+ codebase_db/
4
+ __pycache__/
5
+ *.pyc
6
+ .DS_Store
7
+ *.ipynb
app.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import time
4
+ from langchain_google_genai import ChatGoogleGenerativeAI
5
+ import git
6
+ import streamlit as st
7
+ from dotenv import load_dotenv
8
+ from langchain_groq import ChatGroq
9
+ from langchain_huggingface import HuggingFaceEmbeddings
10
+ from langchain_community.vectorstores import Chroma
11
+ from langchain_community.document_loaders import DirectoryLoader, TextLoader
12
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
13
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
14
+ from langchain_core.output_parsers import StrOutputParser
15
+ from langchain_community.chat_message_histories import ChatMessageHistory
16
+
17
+ load_dotenv()
18
+
19
+ # ── Page config ───────────────────────────────────────────
20
# Browser-tab title/icon and a full-width layout for the whole app.
st.set_page_config(page_title="AI Codebase Explainer", page_icon="πŸ”", layout="wide")

# ── Initialize session state ──────────────────────────────
# "vectorstore" doubles as the marker that the per-session slots exist:
# when it is missing, every slot below is created with its empty value.
if "vectorstore" not in st.session_state:
    st.session_state["vectorstore"] = None                 # index of the loaded repo
    st.session_state["history"] = ChatMessageHistory()     # history passed to the LLM
    st.session_state["messages"] = []                      # chat transcript shown in the UI
    st.session_state["repo_name"] = ""
    st.session_state["indexed"] = False                    # True once a repo is ready
    st.session_state["stats"] = {}                         # {"files": ..., "chunks": ...}
34
+
35
+ # ── Load models ───────────────────────────────────────────
36
@st.cache_resource
def load_models():
    """Build the chat model and the embedding model used by the app.

    Groq is tried first and probed with a one-word request. If constructing
    or probing it raises anything, the Gemini model is used instead.

    The function is wrapped in ``st.cache_resource`` exactly once; stacking
    the decorator twice only adds a redundant second cache layer.

    Returns:
        tuple: ``(llm, embeddings)`` - the chat model and a
        ``HuggingFaceEmbeddings`` instance for
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    # ChatGroq / ChatGoogleGenerativeAI are already imported at file level,
    # so no function-local imports are needed here.
    try:
        # Try Groq first - fastest
        llm = ChatGroq(
            model="llama-3.1-8b-instant",
            temperature=0,
            max_tokens=500,
        )
        # Probe request: surfaces a bad key / unreachable service now rather
        # than on the user's first question.
        llm.invoke("hi")
        print("Using Groq")
    except Exception:
        # Deliberate best-effort: any Groq failure falls back to Gemini.
        llm = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash",
            temperature=0,
            max_output_tokens=500,
        )
        print("Using Gemini fallback")

    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    return llm, embeddings
64
+
65
# Shared by the functions below: the chat model, the embedding model and
# the parser that turns the model's reply into a plain string.
(llm, embeddings) = load_models()
parser = StrOutputParser()
67
+
68
+ # ── Core functions ────────────────────────────────────────
69
def clone_repo(github_url):
    """Clone ``github_url`` into ``cloned_repos/<repo name>``.

    Any existing checkout with the same name is deleted first. Because the
    repo name comes from user-supplied URL text and is used in the path handed
    to ``shutil.rmtree``, it is validated before use: a last segment such as
    ``..`` or an empty string would otherwise delete a directory outside the
    intended checkout.

    Args:
        github_url: URL of the repository. Surrounding whitespace, a trailing
            ``/`` and a trailing ``.git`` are ignored when deriving the name.

    Returns:
        tuple: ``(clone_path, repo_name)``.

    Raises:
        ValueError: if no safe repository name can be derived from the URL.
    """
    url = github_url.strip()
    repo_name = url.rstrip("/").split("/")[-1]
    if repo_name.endswith(".git"):
        repo_name = repo_name[: -len(".git")]

    # Only plain name characters are accepted, and never the special
    # directory entries, so clone_path always stays inside cloned_repos/.
    name_is_safe = (
        bool(repo_name)
        and repo_name not in {".", ".."}
        and all(ch.isalnum() or ch in "._-" for ch in repo_name)
    )
    if not name_is_safe:
        raise ValueError(
            f"Cannot derive a repository name from URL: {github_url!r}"
        )

    clone_path = f"cloned_repos/{repo_name}"
    if os.path.exists(clone_path):
        # Start from a clean checkout every time.
        shutil.rmtree(clone_path)
    os.makedirs("cloned_repos", exist_ok=True)
    git.Repo.clone_from(url, clone_path)
    return clone_path, repo_name
77
+
78
def load_code_files(repo_path):
    """Load every supported text file under ``repo_path`` as a document.

    One ``DirectoryLoader`` pass is made per file extension. Each loaded
    document is tagged with ``file_name`` (basename of its ``source``
    metadata, or ``"unknown"``) and ``file_type`` (the extension it was
    matched by). An extension whose pass raises is skipped entirely.

    Args:
        repo_path: directory to scan recursively.

    Returns:
        list: all documents loaded across the extensions, possibly empty.
    """
    collected = []
    for suffix in ("py", "js", "ts", "md", "txt", "json", "css", "html"):
        try:
            found = DirectoryLoader(
                repo_path,
                glob=f"**/*.{suffix}",
                loader_cls=TextLoader,
                loader_kwargs={"encoding": "utf-8"},
                silent_errors=True,
            ).load()
            for item in found:
                source_path = item.metadata.get("source", "unknown")
                item.metadata["file_name"] = os.path.basename(source_path)
                item.metadata["file_type"] = suffix
            collected.extend(found)
        except Exception:
            # Best-effort: a failing extension must not abort the others.
            continue
    return collected
100
+
101
def split_and_index(all_docs):
    """Split documents into chunks and index them in a Chroma vector store.

    Documents whose ``file_type`` metadata maps to a known language are split
    with a language-aware splitter (2000-character chunks, 300 overlap); all
    others use a generic splitter (1500 / 200). Splitters are built once per
    language rather than once per document.

    Args:
        all_docs: documents carrying a ``file_type`` metadata entry.

    Returns:
        tuple: ``(vectorstore, number_of_documents, number_of_chunks)``.
    """
    import uuid

    from langchain_text_splitters import Language

    EXTENSION_TO_LANGUAGE = {
        "py": Language.PYTHON,
        "js": Language.JS,
        "ts": Language.TS,
        "jsx": Language.JS,
        "tsx": Language.TS,
        "java": Language.JAVA,
        "cpp": Language.CPP,
        "c": Language.CPP,
        "go": Language.GO,
        "rb": Language.RUBY,
        "rs": Language.RUST,
        "md": Language.MARKDOWN,
    }

    generic_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=200,
    )
    splitter_by_language = {}  # language -> splitter, filled on first use

    all_chunks = []
    for doc in all_docs:
        ext = doc.metadata.get("file_type", "").lower()
        language = EXTENSION_TO_LANGUAGE.get(ext)
        if language is None:
            splitter = generic_splitter
        else:
            splitter = splitter_by_language.get(language)
            if splitter is None:
                splitter = RecursiveCharacterTextSplitter.from_language(
                    language=language,
                    chunk_size=2000,
                    chunk_overlap=300,
                )
                splitter_by_language[language] = splitter
        all_chunks.extend(splitter.split_documents([doc]))

    # A fresh collection per call keeps chunks of a previously indexed repo
    # out of this index. With the default collection name, in-process Chroma
    # stores can end up sharing one collection across calls.
    # NOTE(review): confirm against the installed chromadb/langchain versions.
    vectorstore = Chroma.from_documents(
        documents=all_chunks,
        embedding=embeddings,
        collection_name=f"repo_{uuid.uuid4().hex}",
    )
    return vectorstore, len(all_docs), len(all_chunks)
141
+
142
+
143
def ask_question(question, vectorstore, history):
    """Answer ``question`` from retrieved code chunks, retrying on some errors.

    Chunks are fetched with an MMR retriever (``k=8``, ``fetch_k=20``,
    ``lambda_mult=0.7``), labelled with their ``file_name`` metadata and
    placed in the system prompt. The model is called up to three times:
    rate-limit and timeout/connection errors are retried after a pause,
    anything else ends the attempt immediately.

    Args:
        question: the user's question text.
        vectorstore: object providing ``as_retriever``.
        history: chat history; its ``messages`` are sent with the prompt and
            the question/answer pair is appended to it on success.

    Returns:
        str: the model's answer, or a warning message describing the failure.
    """
    hits = vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={"k": 8, "fetch_k": 20, "lambda_mult": 0.7},
    ).invoke(question)

    sections = []
    for hit in hits:
        sections.append(f"# File: {hit.metadata['file_name']}\n{hit.page_content}")
    context = "\n\n".join(sections)

    system_text = (
        "You are an expert code analyst for a GitHub repository.\n"
        "Answer questions using the retrieved code chunks below.\n\n"
        "Rules:\n"
        "- Always name the exact file where you found the answer\n"
        "- Prioritize source code files (.py, .js, .ts) over documentation (README, conf.py, setup.py)\n"
        "- If implementation is spread across files, piece it together\n"
        "- If you see a method name or partial logic, explain what it does\n"
        "- NEVER say 'not in codebase' if you found related code or methods\n"
        "- Give specific details: method names, parameters, logic flow\n"
        "- If truly nothing relevant exists, say what you DID find instead\n\n"
        "Code context:\n{context}"
    )
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_text),
        MessagesPlaceholder(variable_name="history"),
        ("human", "{question}"),
    ])
    chain = prompt | llm | parser

    max_attempts = 3
    for attempt in range(max_attempts):
        out_of_retries = attempt >= max_attempts - 1
        try:
            # Short pause before every request, including the first.
            time.sleep(0.5)
            response = chain.invoke({
                "context": context,
                "history": history.messages,
                "question": question,
            })
            # Record the exchange only once the model has answered.
            history.add_user_message(question)
            history.add_ai_message(response)
            return response
        except Exception as e:
            # Errors are classified by substrings of the lower-cased message;
            # the order of the checks below matters.
            err = str(e).lower()

            if "429" in err or "rate limit" in err:
                if out_of_retries:
                    return "⚠️ Rate limit hit. Resets midnight UTC."
                time.sleep(10 * (attempt + 1))  # 10 s, then 20 s
                continue

            if "401" in err or "invalid api key" in err:
                return "⚠️ Invalid API key. Update GROQ_API_KEY in .env"

            if "timeout" in err or "connection" in err:
                if out_of_retries:
                    return "⚠️ Connection timed out. Try again."
                time.sleep(5)
                continue

            return f"⚠️ Error: {e!s}"
198
+
199
+ # ── UI ────────────────────────────────────────────────────
200
# Page heading, one-line usage hint, then a rule above the main content.
st.title("πŸ” AI Codebase Explainer")
_intro_text = (
    "Paste any **public GitHub repo URL** β€” "
    "ask questions about the code in plain English"
)
st.markdown(_intro_text)
st.divider()
206
+
207
+ # ── Sidebar ───────────────────────────────────────────────
208
def _prefill_repo_url(url):
    """Button callback: store ``url`` as the value of the repo URL text box."""
    st.session_state["github_url_input"] = url


with st.sidebar:
    st.header("πŸ“¦ Load Repository")

    # The text box is keyed so the sample buttons below can fill it through
    # session state. The value then survives the rerun triggered by clicking
    # "Load & Index"; a one-shot local override would be gone by that run
    # and the load branch would never be reached.
    github_url = st.text_input(
        "GitHub Repository URL",
        placeholder="https://github.com/username/repo",
        key="github_url_input",
    )

    st.markdown("**Try these:**")
    # on_click callbacks run before the script reruns, which is when a
    # widget's session-state value may still be assigned.
    st.button(
        "πŸ“ Spoon-Knife (small)",
        use_container_width=True,
        on_click=_prefill_repo_url,
        args=("https://github.com/octocat/Spoon-Knife",),
    )
    st.button(
        "πŸ“ Flask (medium)",
        use_container_width=True,
        on_click=_prefill_repo_url,
        args=("https://github.com/pallets/flask",),
    )

    if github_url:
        if st.button(
            "πŸš€ Load & Index",
            use_container_width=True,
            type="primary",
        ):
            try:
                # Reset the conversation before loading a different repo.
                st.session_state.messages = []
                st.session_state.history = ChatMessageHistory()
                st.session_state.indexed = False

                with st.spinner("Cloning repository..."):
                    clone_path, repo_name = clone_repo(github_url)

                with st.spinner("Loading and indexing files..."):
                    all_docs = load_code_files(clone_path)
                    if not all_docs:
                        st.error("No readable files found!")
                        st.stop()
                    vectorstore, n_files, n_chunks = split_and_index(all_docs)

                st.session_state.vectorstore = vectorstore
                st.session_state.repo_name = repo_name
                st.session_state.indexed = True
                st.session_state.stats = {
                    "files": n_files,
                    "chunks": n_chunks,
                }
                st.success("βœ… Ready!")

            except Exception as e:
                # Show clone / load / index failures in the sidebar.
                st.error(f"Error: {str(e)}")

    if st.session_state.indexed:
        st.divider()
        st.metric("Files", st.session_state.stats["files"])
        st.metric("Chunks", st.session_state.stats["chunks"])
        st.markdown(f"**Repo:** {st.session_state.repo_name}")

        if st.button("πŸ”„ New Repo", use_container_width=True):
            # Drop the index and the conversation, then redraw the landing view.
            st.session_state.vectorstore = None
            st.session_state.indexed = False
            st.session_state.messages = []
            st.session_state.history = ChatMessageHistory()
            st.rerun()
274
+
275
+ # ── Main area ─────────────────────────────────────────────
276
if st.session_state.indexed:
    # ── Chat view: a repository has been indexed ──
    st.subheader(f"πŸ’¬ Ask about `{st.session_state.repo_name}`")

    # One-click questions, one button per column.
    st.markdown("**Quick questions:**")
    quick_prompts = (
        "What does this project do?",
        "What are the main files?",
        "What dependencies does it use?",
        "How is the code structured?",
    )
    for slot, (column, prompt_text) in enumerate(zip(st.columns(4), quick_prompts)):
        with column:
            clicked = st.button(prompt_text, use_container_width=True, key=f"quick{slot}")
            if not clicked:
                continue
            st.session_state.messages.append({"role": "user", "content": prompt_text})
            with st.spinner("Reading code..."):
                response = ask_question(
                    prompt_text,
                    st.session_state.vectorstore,
                    st.session_state.history,
                )
            st.session_state.messages.append({"role": "assistant", "content": response})
            # Rerun so the new exchange is drawn by the transcript loop below.
            st.rerun()

    st.divider()

    # Transcript of the conversation so far.
    for entry in st.session_state.messages:
        with st.chat_message(entry["role"]):
            st.markdown(entry["content"])

    # Free-form question box.
    question = st.chat_input("Ask anything about the code...")
    if question:
        st.session_state.messages.append({"role": "user", "content": question})
        with st.chat_message("user"):
            st.markdown(question)

        with st.chat_message("assistant"):
            with st.spinner("Reading code..."):
                response = ask_question(
                    question,
                    st.session_state.vectorstore,
                    st.session_state.history,
                )
            st.markdown(response)

        st.session_state.messages.append({"role": "assistant", "content": response})
        st.rerun()

else:
    # ── Landing view: nothing indexed yet ──
    step_texts = (
        "**Step 1**\nPaste GitHub URL",
        "**Step 2**\nClick Load & Index",
        "**Step 3**\nAsk questions",
    )
    for column, step_text in zip(st.columns(3), step_texts):
        with column:
            st.info(step_text)

    st.divider()
    st.markdown("### Example questions")
    sample_questions = (
        "What does this project do?",
        "What are the main files?",
        "How does authentication work?",
        "Where is the database code?",
        "How do I add a new feature?",
        "What dependencies does it use?",
    )
    left, right = st.columns(2)
    for position, sample in enumerate(sample_questions):
        # Even positions go to the left column, odd ones to the right.
        target = right if position % 2 else left
        with target:
            st.markdown(f"πŸ’¬ *{sample}*")
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ langchain-groq
3
+ langchain-google-genai
4
+ langchain-huggingface
5
+ langchain-community
6
+ langchain-core
7
+ langchain-text-splitters
8
+ chromadb
9
+ sentence-transformers
10
+ streamlit
11
+ python-dotenv
12
+ gitpython
13
+ google-generativeai