themehmi committed on
Commit
0e82851
·
verified ·
1 Parent(s): 67baef8

Upload 2 files

Browse files
Files changed (1) hide show
  1. app.py +138 -57
app.py CHANGED
@@ -4,8 +4,23 @@ import os
4
  import shutil
5
  import subprocess
6
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
7
- from langchain_community.document_loaders import DirectoryLoader
8
  from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
10
  from langchain_community.vectorstores import FAISS
11
  from langchain_core.runnables import RunnablePassthrough
@@ -42,23 +57,65 @@ def setup_vector_db():
42
  if not os.path.exists('./repo'):
43
  os.makedirs('./repo')
44
 
45
- loader = DirectoryLoader('./repo', glob="**/*.py", show_progress=True)
46
- docs = loader.load()
 
47
 
48
- if not docs:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  return None, 0
50
 
51
- python_splitter = RecursiveCharacterTextSplitter.from_language(
52
- language=Language.PYTHON,
53
- chunk_size=500,
54
- chunk_overlap=50
55
- )
56
- texts = python_splitter.split_documents(docs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
59
- db = FAISS.from_documents(texts, embeddings)
60
 
61
- return db, len(docs)
62
 
63
  # 3. GLOBAL INITIALIZATION
64
  print("Initializing models...")
@@ -102,106 +159,130 @@ qa_chain = build_qa_chain(vector_db)
102
  # 4. INGESTION FUNCTIONS
103
  def clone_and_index(repo_url):
104
  global vector_db, file_count, qa_chain
 
 
 
105
  if os.path.exists('./repo'):
106
  shutil.rmtree('./repo')
107
 
108
  try:
109
- subprocess.run(["git", "clone", repo_url, "./repo"], check=True)
 
 
110
  except Exception as e:
111
- return f"**Repo Status:** Failed to clone repo: {str(e)} ❌"
112
 
113
  vector_db, file_count = setup_vector_db()
114
  qa_chain = build_qa_chain(vector_db)
115
 
116
  if vector_db:
117
- return f"**Repo Status:** {file_count} files indexed from `{repo_url}` ✅"
118
  else:
119
- return f"**Repo Status:** No Python files found in `{repo_url}` ❌"
120
 
121
  def upload_and_index(files):
122
  global vector_db, file_count, qa_chain
 
 
 
123
  if os.path.exists('./repo'):
124
  shutil.rmtree('./repo')
125
  os.makedirs('./repo', exist_ok=True)
126
 
127
- if not files:
128
- return "**Repo Status:** No files uploaded ❌"
129
-
130
  for file in files:
131
- dest_path = os.path.join('./repo', os.path.basename(file.name))
132
- shutil.copy(file.name, dest_path)
 
 
133
 
134
  vector_db, file_count = setup_vector_db()
135
  qa_chain = build_qa_chain(vector_db)
136
 
137
  if vector_db:
138
- return f"**Repo Status:** {file_count} files indexed from local upload ✅"
139
  else:
140
- return "**Repo Status:** No Python files found in local upload ❌"
141
 
142
  # 5. CHAT LOGIC
143
  def respond(message, chat_history):
 
 
 
144
  if not vector_db:
145
- bot_message = "👋 Welcome! Please provide a repo link or upload Python files to start chatting."
146
  chat_history.append((message, bot_message))
147
  return "", chat_history
148
 
149
- # Fetch response from RAG
150
- response = qa_chain.invoke(message)
151
- answer = response["answer"]
152
- sources = response["context"]
153
-
154
- final_answer = answer
155
-
156
- if sources:
157
- final_answer += "\n\n<details><summary>🔍 View Source Code Referenced</summary>\n\n"
158
- for idx, doc in enumerate(sources):
159
- source_file = doc.metadata.get("source", "Unknown File")
160
- final_answer += f"**Snippet {idx + 1}** from `{source_file}`:\n"
161
- final_answer += f"```python\n{doc.page_content}\n```\n\n"
162
- final_answer += "</details>"
163
 
164
- chat_history.append((message, final_answer))
 
 
 
 
 
 
 
 
 
 
 
 
165
  return "", chat_history
166
 
167
  # 6. GRADIO UI
168
  custom_css = """
169
- .status-box { padding: 10px; border-radius: 8px; background-color: #f0f0f0; margin-bottom: 10px; }
170
- .dark .status-box { background-color: #1e293b; color: #cbd5e1; }
 
 
171
  """
172
 
173
  def get_initial_repo_status():
174
  if vector_db:
175
- return f"**Repo Status:** {file_count} files indexed ✅"
176
- return "**Repo Status:** Empty ❌\n\nProvide a repo link or upload files to begin analyzing."
177
 
178
- with gr.Blocks(title="Codebase Assistant", css=custom_css) as demo:
179
  with gr.Row():
180
  with gr.Column(scale=1):
181
- gr.Markdown("# DevAssist AI\nYour personal Qwen-powered codebase expert.")
182
  gr.Markdown("---")
183
 
184
  with gr.Column(elem_classes=["status-box"]):
185
- gr.Markdown("### System Status")
186
  gr.Markdown(f"**Hardware:** {device_status}")
187
  repo_status = gr.Markdown(get_initial_repo_status())
188
 
189
- gr.Markdown("### Add Codebase")
190
- with gr.Tab("GitHub Repo"):
191
- repo_url = gr.Textbox(placeholder="https://github.com/user/repo", show_label=False)
192
- clone_btn = gr.Button("Clone & Index")
193
- with gr.Tab("Local Upload"):
194
- local_files = gr.File(file_count="multiple", label="Upload Local Files", file_types=[".py"])
195
- upload_btn = gr.Button("Upload & Index")
 
 
 
 
 
196
 
197
  clone_btn.click(fn=clone_and_index, inputs=[repo_url], outputs=[repo_status])
198
  upload_btn.click(fn=upload_and_index, inputs=[local_files], outputs=[repo_status])
199
 
200
  with gr.Column(scale=3):
201
- gr.Markdown("### 💻 Chat with your Codebase\nAsk architecture questions, find bugs, or request code explanations.")
202
- chatbot = gr.Chatbot(height=500, show_label=False)
203
- msg = gr.Textbox(placeholder="E.g., What does the main function do?", show_label=False)
204
- clear = gr.Button("Clear Chat")
 
 
205
 
206
  msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
207
  clear.click(lambda: None, None, chatbot, queue=False)
 
4
  import shutil
5
  import subprocess
6
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
7
  from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
8
+ from langchain_core.documents import Document
9
+
# Maps a lower-cased file extension to the langchain ``Language`` whose
# separators are used to split files of that type. Extensions that are not
# listed here are split with the generic RecursiveCharacterTextSplitter.
EXTENSION_TO_LANGUAGE = {
    '.py': Language.PYTHON,
    '.js': Language.JS,
    # Prefer the dedicated TypeScript rules; fall back to the JavaScript
    # rules when the installed langchain release has no ``Language.TS``.
    '.ts': getattr(Language, 'TS', Language.JS),
    '.java': Language.JAVA,
    '.cpp': Language.CPP,
    '.c': Language.CPP,
    '.h': Language.CPP,
    '.go': Language.GO,
    '.rs': Language.RUST,
    '.rb': Language.RUBY,
    '.html': Language.HTML,
    '.md': Language.MARKDOWN,
}
24
  from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
25
  from langchain_community.vectorstores import FAISS
26
  from langchain_core.runnables import RunnablePassthrough
 
57
  if not os.path.exists('./repo'):
58
  os.makedirs('./repo')
59
 
60
+ docs_by_language = {}
61
+ generic_docs = []
62
+ file_count = 0
63
 
64
+ for root, _, files in os.walk('./repo'):
65
+ if '.git' in root:
66
+ continue
67
+ for file in files:
68
+ file_path = os.path.join(root, file)
69
+ ext = os.path.splitext(file)[1].lower()
70
+ try:
71
+ with open(file_path, 'r', encoding='utf-8') as f:
72
+ content = f.read()
73
+ doc = Document(page_content=content, metadata={"source": file_path})
74
+ file_count += 1
75
+
76
+ lang = EXTENSION_TO_LANGUAGE.get(ext)
77
+ if lang:
78
+ if lang not in docs_by_language:
79
+ docs_by_language[lang] = []
80
+ docs_by_language[lang].append(doc)
81
+ else:
82
+ generic_docs.append(doc)
83
+ except UnicodeDecodeError:
84
+ pass # Skip binary files
85
+
86
+ if file_count == 0:
87
  return None, 0
88
 
89
+ all_splits = []
90
+
91
+ # Split documents by specific language rules
92
+ for lang, docs in docs_by_language.items():
93
+ try:
94
+ splitter = RecursiveCharacterTextSplitter.from_language(
95
+ language=lang,
96
+ chunk_size=500,
97
+ chunk_overlap=50
98
+ )
99
+ all_splits.extend(splitter.split_documents(docs))
100
+ except Exception:
101
+ # Fallback if language is not supported by installed langchain version
102
+ generic_docs.extend(docs)
103
+
104
+ # Split generic documents
105
+ if generic_docs:
106
+ generic_splitter = RecursiveCharacterTextSplitter(
107
+ chunk_size=500,
108
+ chunk_overlap=50
109
+ )
110
+ all_splits.extend(generic_splitter.split_documents(generic_docs))
111
+
112
+ if not all_splits:
113
+ return None, 0
114
 
115
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
116
+ db = FAISS.from_documents(all_splits, embeddings)
117
 
118
+ return db, file_count
119
 
120
  # 3. GLOBAL INITIALIZATION
121
  print("Initializing models...")
 
# 4. INGESTION FUNCTIONS
def clone_and_index(repo_url):
    """Clone a Git repository into ``./repo`` and rebuild the search index.

    Any existing ``./repo`` directory is deleted first. On a successful
    clone the module-level ``vector_db``, ``file_count`` and ``qa_chain``
    are replaced with the results of indexing the fresh checkout.

    Args:
        repo_url: Repository URL as entered by the user. Surrounding
            whitespace is ignored; empty or non-string values are rejected.

    Returns:
        A Markdown status message describing the outcome.
    """
    global vector_db, file_count, qa_chain

    # Normalise once so validation, the clone command and the status
    # messages all see the same value.
    repo_url = repo_url.strip() if isinstance(repo_url, str) else ""
    if not repo_url:
        return "⚠️ Please enter a valid GitHub URL."

    if os.path.exists('./repo'):
        shutil.rmtree('./repo')

    try:
        # "--" ends option parsing, so user input that starts with "-" is
        # always treated as the repository argument, never as a git option.
        subprocess.run(
            ["git", "clone", "--", repo_url, "./repo"],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        return f"❌ Failed to clone repo. Error: {e.stderr}"
    except Exception as e:
        # UI boundary: report any other failure (e.g. git not installed)
        # as a status message instead of raising.
        return f"❌ Failed to clone repo: {str(e)}"

    vector_db, file_count = setup_vector_db()
    qa_chain = build_qa_chain(vector_db)

    if vector_db:
        return f"✅ Success! {file_count} files indexed from `{repo_url}`"
    return f"⚠️ Warning: No valid text files found in `{repo_url}`"

def upload_and_index(files):
    """Copy uploaded files into a fresh ``./repo`` and rebuild the search index.

    Any existing ``./repo`` directory is deleted first. The module-level
    ``vector_db``, ``file_count`` and ``qa_chain`` are then replaced with the
    results of indexing the copied files.

    Args:
        files: The uploaded files, each either a filesystem path or an
            object exposing the path of its temporary copy as ``.name``.

    Returns:
        A Markdown status message describing the outcome.
    """
    global vector_db, file_count, qa_chain

    if not files:
        return "⚠️ No files were uploaded."

    if os.path.exists('./repo'):
        shutil.rmtree('./repo')
    os.makedirs('./repo', exist_ok=True)

    for file in files:
        # Path-like values are used as-is; checking them first matters because
        # ``pathlib.Path.name`` is only the final component, not the full path.
        if isinstance(file, (str, os.PathLike)):
            file_path = os.fspath(file)
        else:
            file_path = getattr(file, "name", str(file))

        base_name = os.path.basename(file_path)
        dest_path = os.path.join('./repo', base_name)

        # Uploads are flattened into one directory, so two files sharing a
        # basename would overwrite each other; add a numeric suffix instead.
        stem, ext = os.path.splitext(base_name)
        suffix = 1
        while os.path.exists(dest_path):
            dest_path = os.path.join('./repo', f"{stem}_{suffix}{ext}")
            suffix += 1

        shutil.copy(file_path, dest_path)

    vector_db, file_count = setup_vector_db()
    qa_chain = build_qa_chain(vector_db)

    if vector_db:
        return f"✅ Success! {file_count} files indexed from local upload"
    return "⚠️ Warning: No valid text files found in the uploaded files"

# 5. CHAT LOGIC
# Markdown code-fence tag for each indexed file extension, so that quoted
# snippets are highlighted in their own language.
_FENCE_LANGUAGES = {
    '.py': 'python',
    '.js': 'javascript',
    '.ts': 'typescript',
    '.java': 'java',
    '.cpp': 'cpp',
    '.c': 'c',
    '.h': 'cpp',
    '.go': 'go',
    '.rs': 'rust',
    '.rb': 'ruby',
    '.html': 'html',
    '.md': 'markdown',
}


def _fence_language(source_file):
    """Return the code-fence tag for *source_file*, or "" when unknown."""
    ext = os.path.splitext(str(source_file))[1].lower()
    return _FENCE_LANGUAGES.get(ext, "")


def respond(message, chat_history):
    """Answer a chat message and append the exchange to the history.

    Args:
        message: The text submitted by the user. ``None`` and
            whitespace-only messages are ignored.
        chat_history: List of ``(user_message, bot_message)`` tuples; it is
            extended in place. ``None`` is treated as an empty history.

    Returns:
        A tuple ``("", chat_history)``: the empty string clears the input
        box and the history refreshes the chat display.
    """
    if chat_history is None:
        chat_history = []

    if not message or not message.strip():
        return "", chat_history

    if not vector_db:
        bot_message = "👋 Welcome! Please provide a repo link or upload your code files using the panel on the left to start chatting."
        chat_history.append((message, bot_message))
        return "", chat_history

    try:
        # Fetch response from RAG
        response = qa_chain.invoke(message)
        answer = response["answer"]
        sources = response["context"]

        parts = [answer]
        if sources:
            parts.append("\n\n<details><summary>🔍 View Source Code Referenced</summary>\n\n")
            for idx, doc in enumerate(sources, start=1):
                source_file = doc.metadata.get("source", "Unknown File")
                parts.append(f"**Snippet {idx}** from `{source_file}`:\n")
                # Tag the fence with the snippet's own language; the index
                # is no longer limited to Python files.
                parts.append(f"```{_fence_language(source_file)}\n{doc.page_content}\n```\n\n")
            parts.append("</details>")

        chat_history.append((message, "".join(parts)))
    except Exception as e:
        # UI boundary: show the failure in the chat instead of raising.
        bot_message = f"❌ An error occurred during processing: {str(e)}"
        chat_history.append((message, bot_message))

    return "", chat_history

239
  # 6. GRADIO UI
240
  custom_css = """
241
+ .status-box { padding: 15px; border-radius: 8px; background-color: #f0f0f0; margin-bottom: 20px; border-left: 4px solid #007bff;}
242
+ .dark .status-box { background-color: #1e293b; color: #cbd5e1; border-left: 4px solid #3b82f6;}
243
+ .instructions { font-size: 0.95em; color: #555; }
244
+ .dark .instructions { color: #aaa; }
245
  """
246
 
def get_initial_repo_status():
    """Return the Markdown status text for the current index state.

    Reads the module-level ``vector_db`` and ``file_count``.
    """
    if vector_db:
        return f"✅ **Ready!** {file_count} files indexed and loaded."
    return "❌ **Empty Database.** Provide a codebase below to begin."

252
+ with gr.Blocks(title="Codebase Assistant", css=custom_css, theme=gr.themes.Soft()) as demo:
253
  with gr.Row():
254
  with gr.Column(scale=1):
255
+ gr.Markdown("# 🦖 RepoRaptor\n**Your personal AI codebase expert.**")
256
  gr.Markdown("---")
257
 
258
  with gr.Column(elem_classes=["status-box"]):
259
+ gr.Markdown("### 📊 System Status")
260
  gr.Markdown(f"**Hardware:** {device_status}")
261
  repo_status = gr.Markdown(get_initial_repo_status())
262
 
263
+ gr.Markdown("### 📂 Ingest Codebase")
264
+ gr.Markdown("Choose a method to load your codebase into the Vector Database.", elem_classes=["instructions"])
265
+
266
+ with gr.Tabs():
267
+ with gr.Tab("GitHub Repo"):
268
+ gr.Markdown("Clone a public repository directly:")
269
+ repo_url = gr.Textbox(placeholder="https://github.com/user/repo", show_label=False)
270
+ clone_btn = gr.Button("⬇️ Clone & Index", variant="primary")
271
+ with gr.Tab("Local Upload"):
272
+ gr.Markdown("Upload local codebase files:")
273
+ local_files = gr.File(file_count="multiple", label="Upload Files")
274
+ upload_btn = gr.Button("📤 Upload & Index", variant="primary")
275
 
276
  clone_btn.click(fn=clone_and_index, inputs=[repo_url], outputs=[repo_status])
277
  upload_btn.click(fn=upload_and_index, inputs=[local_files], outputs=[repo_status])
278
 
279
  with gr.Column(scale=3):
280
+ gr.Markdown("### 💻 Chat Interface\nAsk architecture questions, find bugs, or request code explanations. I will **only** answer questions related to code.")
281
+ chatbot = gr.Chatbot(height=600, show_label=False, bubble_full_width=False)
282
+
283
+ with gr.Row():
284
+ msg = gr.Textbox(placeholder="E.g., What does the main function do? (Press Enter to send)", show_label=False, scale=4)
285
+ clear = gr.Button("🗑️ Clear Chat", scale=1)
286
 
287
  msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
288
  clear.click(lambda: None, None, chatbot, queue=False)