ahmadsanafarooq commited on
Commit
6cc092f
·
verified ·
1 Parent(s): 04a2087

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +445 -0
app.py ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain_community.vectorstores import Chroma
5
+ from langchain.chains import RetrievalQA
6
+ from langchain_groq import ChatGroq
7
+ from langchain_community.document_loaders import TextLoader, PyPDFLoader
8
+ from langchain.schema import Document
9
+ from pathlib import Path
10
+ from typing import List
11
+ import logging
12
+ import numpy as np
13
+ from sklearn.feature_extraction.text import TfidfVectorizer
14
+ from sklearn.metrics.pairwise import cosine_similarity
15
+ import pickle
16
+ from dotenv import load_dotenv
17
+
18
# Configure module-wide logging; `logger` is shared by the classes below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
21
+
22
class SimpleEmbeddings:
    """TF-IDF based embeddings used as a fallback when no neural model loads."""

    def __init__(self):
        # Cap the vocabulary at 384 terms so vector length mimics the
        # dimensionality of common sentence-transformer embeddings.
        self.vectorizer = TfidfVectorizer(max_features=384, stop_words='english')
        self.fitted = False

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one dense TF-IDF vector per input document.

        The vectorizer is fitted on the first batch it sees; later batches
        are projected onto that fixed vocabulary.
        """
        if not self.fitted:
            self.vectorizer.fit(texts)
            self.fitted = True
        dense = self.vectorizer.transform(texts).toarray()
        return dense.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Return a dense TF-IDF vector for a single query string."""
        if not self.fitted:
            # No vocabulary to project onto yet — hand back an all-zero vector.
            return [0.0] * 384
        row = self.vectorizer.transform([text]).toarray()[0]
        return row.tolist()
46
+
47
class RAGAssistant:
    """RAG backend with two independent Chroma stores: one for learning
    materials and one for code documentation, both queried via a Groq LLM."""

    def __init__(self, groq_api_key: str):
        """Initialize embeddings, text splitter, LLM, and both vector stores.

        Args:
            groq_api_key: API key for the Groq chat-completion service.

        Raises:
            Exception: re-raised from ChromaDB if a store cannot be created.
        """
        self.groq_api_key = groq_api_key

        # Embeddings: HuggingFace models if available, TF-IDF fallback otherwise.
        self.embeddings = self._init_embeddings()

        # Overlapping chunks preserve context across chunk boundaries.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )

        # Separate vector stores so the two assistants never mix corpora.
        self.learning_vectorstore = None
        self.code_vectorstore = None

        # Low temperature keeps answers grounded in the retrieved context.
        self.llm = ChatGroq(
            groq_api_key=groq_api_key,
            model_name="llama3-70b-8192",
            temperature=0.1
        )

        # On-disk persistence directories for ChromaDB.
        self.learning_persist_dir = "./chroma_learning_db"
        self.code_persist_dir = "./chroma_code_db"

        self._init_vector_stores()

    def _init_embeddings(self):
        """Initialize embeddings, trying HuggingFace models before falling
        back to the TF-IDF implementation.

        Returns:
            An embeddings object exposing embed_documents / embed_query.
        """
        try:
            from langchain_huggingface import HuggingFaceEmbeddings
            print("Trying HuggingFace embeddings...")

            # Smallest/most common models first; stop at the first that loads.
            models_to_try = [
                "all-MiniLM-L6-v2",
                "paraphrase-MiniLM-L3-v2",
                "all-mpnet-base-v2"
            ]

            for model_name in models_to_try:
                try:
                    embeddings = HuggingFaceEmbeddings(
                        model_name=model_name,
                        model_kwargs={'device': 'cpu'},
                        encode_kwargs={'normalize_embeddings': False}
                    )
                    print(f"Successfully loaded HuggingFace model: {model_name}")
                    return embeddings
                except Exception as e:
                    print(f"Failed to load {model_name}: {e}")
                    continue

        except ImportError:
            print("HuggingFace embeddings not available")

        # Last resort: local TF-IDF embeddings (no downloads required).
        print("Using TF-IDF embeddings as fallback...")
        return SimpleEmbeddings()

    def _init_vector_stores(self):
        """Create (or reopen) the two persistent ChromaDB collections.

        Raises:
            Exception: re-raised after logging if Chroma initialization fails.
        """
        try:
            # Learning Tutor vector store
            self.learning_vectorstore = Chroma(
                persist_directory=self.learning_persist_dir,
                embedding_function=self.embeddings,
                collection_name="learning_materials"
            )

            # Code Documentation vector store
            self.code_vectorstore = Chroma(
                persist_directory=self.code_persist_dir,
                embedding_function=self.embeddings,
                collection_name="code_documentation"
            )

        except Exception as e:
            logger.error(f"Error initializing vector stores: {str(e)}")
            raise

    def load_documents(self, files: List[str], assistant_type: str) -> str:
        """Load, chunk, and index documents into one assistant's store.

        Args:
            files: Paths to .pdf files (PyPDFLoader) or text files (TextLoader).
            assistant_type: "learning" or "code" — selects the target store.

        Returns:
            A human-readable status message (success or error).
        """
        # Resolve the target store up front. Previously an unknown type fell
        # through both branches and still reported success without indexing
        # anything — fail loudly instead.
        if assistant_type == "learning":
            vectorstore = self.learning_vectorstore
        elif assistant_type == "code":
            vectorstore = self.code_vectorstore
        else:
            return f"Unknown assistant type: {assistant_type!r}. Use 'learning' or 'code'."

        try:
            documents = []

            for file_path in files:
                try:
                    if file_path.endswith('.pdf'):
                        loader = PyPDFLoader(file_path)
                    else:
                        loader = TextLoader(file_path, encoding='utf-8')

                    documents.extend(loader.load())
                except Exception as e:
                    # Best-effort: skip unreadable files, keep loading the rest.
                    print(f"Error loading {file_path}: {e}")
                    continue

            if not documents:
                return "No documents could be loaded. Please check your files."

            # Split documents into overlapping chunks for retrieval.
            chunks = self.text_splitter.split_documents(documents)

            # Tag each chunk so its origin assistant is recoverable later.
            for chunk in chunks:
                chunk.metadata['assistant_type'] = assistant_type

            vectorstore.add_documents(chunks)
            vectorstore.persist()

            return f"Successfully loaded {len(chunks)} chunks from {len(documents)} documents into {assistant_type} assistant."

        except Exception as e:
            logger.error(f"Error loading documents: {str(e)}")
            return f"Error loading documents: {str(e)}"

    def _run_qa(self, vectorstore, prompt: str, source_heading: str) -> str:
        """Run a RetrievalQA chain over `vectorstore` and append source names.

        Shared by both assistants; previously this logic was duplicated in
        get_learning_tutor_response and get_code_helper_response.

        Args:
            vectorstore: Chroma store to retrieve context from.
            prompt: Full prompt (instructions + user question) for the LLM.
            source_heading: Markdown heading for the sources section.

        Returns:
            The LLM answer, optionally followed by up to 3 source file names.
        """
        qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=vectorstore.as_retriever(
                search_kwargs={"k": 3}
            ),
            return_source_documents=True
        )

        result = qa_chain({"query": prompt})
        response = result['result']

        if result.get('source_documents'):
            response += f"\n\n{source_heading}\n"
            for doc in result['source_documents'][:3]:
                source = doc.metadata.get('source', 'Unknown')
                response += f"- {Path(source).name}\n"

        return response

    def get_learning_tutor_response(self, question: str) -> str:
        """Answer a student question using the learning-materials store.

        Args:
            question: The student's free-text question.

        Returns:
            An educational answer with cited sources, or an error message.
        """
        try:
            if not self.learning_vectorstore:
                return "Please upload some learning materials first."

            learning_prompt = f"""
            You are an AI learning assistant that helps students understand academic concepts.
            Based on the provided course materials, answer the student's question clearly and educationally.

            Guidelines:
            - Provide clear, educational explanations
            - Use examples when helpful
            - Reference specific sources when possible
            - Adapt to the student's level of understanding
            - Offer additional practice questions or related concepts when relevant
            - Maintain an encouraging, supportive tone

            Student's question: {question}

            Please provide a helpful, educational response:
            """

            return self._run_qa(self.learning_vectorstore, learning_prompt, "**Sources:**")

        except Exception as e:
            logger.error(f"Error in learning tutor: {str(e)}")
            return f"Error generating response: {str(e)}"

    def get_code_helper_response(self, question: str) -> str:
        """Answer a developer question using the code-documentation store.

        Args:
            question: The developer's free-text question.

        Returns:
            A technical answer with cited sources, or an error message.
        """
        try:
            if not self.code_vectorstore:
                return "Please upload some code documentation first."

            code_prompt = f"""
            You are a technical assistant that helps developers understand codebases and APIs.
            Based on the provided documentation and code examples, answer the developer's question.

            Guidelines:
            - Provide practical, actionable guidance
            - Include relevant code snippets with explanations
            - Reference specific documentation sections when possible
            - Highlight important considerations (security, performance, errors)
            - Suggest related APIs or patterns that might be useful
            - Use clear, technical language appropriate for developers

            Developer's question: {question}

            Please provide a helpful technical response:
            """

            return self._run_qa(self.code_vectorstore, code_prompt, "**Documentation Sources:**")

        except Exception as e:
            logger.error(f"Error in code helper: {str(e)}")
            return f"Error generating response: {str(e)}"
277
+
278
def create_gradio_interface(assistant: RAGAssistant):
    """Build and return the two-tab Gradio UI wired to `assistant`."""

    def handle_learning_upload(files):
        # Index uploaded files into the learning-materials store.
        if not files:
            return "No files uploaded."
        paths = [item.name for item in files]
        return assistant.load_documents(paths, "learning")

    def handle_code_upload(files):
        # Index uploaded files into the code-documentation store.
        if not files:
            return "No files uploaded."
        paths = [item.name for item in files]
        return assistant.load_documents(paths, "code")

    def respond_learning(message, history):
        # Ignore blank submissions; otherwise append (question, answer).
        if not message.strip():
            return history, ""
        answer = assistant.get_learning_tutor_response(message)
        history.append((message, answer))
        return history, ""

    def respond_code(message, history):
        # Same flow as respond_learning, against the code helper.
        if not message.strip():
            return history, ""
        answer = assistant.get_code_helper_response(message)
        history.append((message, answer))
        return history, ""

    with gr.Blocks(title="RAG-Based Learning & Code Assistant", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🎓 RAG-Based Learning & Code Assistant")
        gr.Markdown("Upload your documents and ask questions to get intelligent responses!")

        with gr.Tabs():
            # --- Learning Tutor tab ---
            with gr.TabItem("📚 Learning Tutor"):
                gr.Markdown("### Personalized Learning Assistant")
                gr.Markdown("Upload textbooks, lecture notes, and study materials to get personalized learning assistance.")

                with gr.Row():
                    with gr.Column(scale=1):
                        learning_uploader = gr.File(
                            label="Upload Learning Materials (PDF, TXT)",
                            file_count="multiple",
                            file_types=[".pdf", ".txt", ".md"]
                        )
                        learning_upload_button = gr.Button("Upload Materials", variant="primary")
                        learning_status_box = gr.Textbox(label="Upload Status", interactive=False)

                    with gr.Column(scale=2):
                        learning_chat_window = gr.Chatbot(
                            label="Learning Tutor Chat",
                            height=400
                        )
                        learning_question_box = gr.Textbox(
                            label="Ask a question about your course materials",
                            placeholder="e.g., Can you explain the concept of machine learning?"
                        )
                        learning_ask_button = gr.Button("Ask Question", variant="primary")

                learning_upload_button.click(
                    handle_learning_upload,
                    inputs=[learning_uploader],
                    outputs=[learning_status_box]
                )

                # Button click and textbox Enter both submit the question.
                learning_ask_button.click(
                    respond_learning,
                    inputs=[learning_question_box, learning_chat_window],
                    outputs=[learning_chat_window, learning_question_box]
                )

                learning_question_box.submit(
                    respond_learning,
                    inputs=[learning_question_box, learning_chat_window],
                    outputs=[learning_chat_window, learning_question_box]
                )

            # --- Code Documentation Helper tab ---
            with gr.TabItem("💻 Code Documentation Helper"):
                gr.Markdown("### Developer Documentation Assistant")
                gr.Markdown("Upload API documentation, code examples, and technical guides to get development assistance.")

                with gr.Row():
                    with gr.Column(scale=1):
                        code_uploader = gr.File(
                            label="Upload Code Documentation (PDF, TXT, MD)",
                            file_count="multiple",
                            file_types=[".pdf", ".txt", ".md", ".py", ".js", ".json"]
                        )
                        code_upload_button = gr.Button("Upload Documentation", variant="primary")
                        code_status_box = gr.Textbox(label="Upload Status", interactive=False)

                    with gr.Column(scale=2):
                        code_chat_window = gr.Chatbot(
                            label="Code Helper Chat",
                            height=400
                        )
                        code_question_box = gr.Textbox(
                            label="Ask about APIs, code examples, or troubleshooting",
                            placeholder="e.g., How do I implement authentication in this API?"
                        )
                        code_ask_button = gr.Button("Ask Question", variant="primary")

                code_upload_button.click(
                    handle_code_upload,
                    inputs=[code_uploader],
                    outputs=[code_status_box]
                )

                code_ask_button.click(
                    respond_code,
                    inputs=[code_question_box, code_chat_window],
                    outputs=[code_chat_window, code_question_box]
                )

                code_question_box.submit(
                    respond_code,
                    inputs=[code_question_box, code_chat_window],
                    outputs=[code_chat_window, code_question_box]
                )

        # Footer
        gr.Markdown("---")
        gr.Markdown("*Powered by LangChain, ChromaDB, and Groq API*")

    return demo
410
+
411
def main():
    """Entry point: load configuration, build the app, and launch Gradio."""
    # Pull GROQ_API_KEY from the environment (or a local .env file).
    load_dotenv()
    api_key = os.getenv("GROQ_API_KEY")

    # Without a key the LLM cannot be constructed — bail out early.
    if not api_key:
        print("Please set your GROQ_API_KEY environment variable")
        print("You can get a free API key from: https://console.groq.com/")
        return

    try:
        print("Initializing RAG Assistant...")
        rag_backend = RAGAssistant(api_key)

        app = create_gradio_interface(rag_backend)

        print("Starting RAG-Based Learning & Code Assistant...")
        print("Access the application at: http://localhost:7860")

        # Bind on all interfaces so the app is reachable inside containers.
        app.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            debug=True
        )

    except Exception as e:
        logger.error(f"Error starting application: {str(e)}")
        print(f"Error: {str(e)}")

if __name__ == "__main__":
    main()