BetaGen committed on
Commit
1e3c704
·
verified ·
1 Parent(s): b5757f5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +413 -0
app.py ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import PyPDF2
4
+ import docx
5
+ import pandas as pd
6
+ from typing import List, Dict, Any
7
+ import numpy as np
8
+ from sentence_transformers import SentenceTransformer
9
+ import faiss
10
+ import re
11
+ from groq import Groq
12
+ import json
13
+ import tempfile
14
+ import io
15
+
16
class RAGApplication:
    """Retrieval-augmented question answering over uploaded documents.

    Uploaded files are converted to text, split into overlapping chunks,
    embedded with a SentenceTransformer model and stored in a FAISS
    inner-product index. A question is answered by retrieving the most
    similar chunks and sending them as context to a Groq chat model.
    """

    def __init__(self):
        """Initialize the RAG application with necessary components"""
        # Groq client; the API key is read from the GROQ_API_KEY env variable
        self.groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

        # Embedding model (using a lightweight, free model)
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

        # FAISS index
        self.dimension = 384  # Dimension of all-MiniLM-L6-v2 embeddings
        self.index = faiss.IndexFlatIP(self.dimension)  # Inner product for cosine similarity

        # Storage for chunks and metadata (parallel lists, one entry per chunk)
        self.chunks = []
        self.chunk_metadata = []
        self.is_indexed = False

    def _read_file_text(self, file_path: str, file_type: str) -> str:
        """Return the text of *file_path*, raising on any read/parse failure.

        Args:
            file_path: Path of the file on disk.
            file_type: Lower-case extension without the dot
                ("pdf", "docx", "txt", "csv" or "xlsx").

        Returns:
            The extracted text, or "" for an unsupported ``file_type``.
        """
        if file_type == "pdf":
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # Treat a page that yields no text (None) as empty instead of
                # failing the whole file on ``None + "\n"``.
                return "".join(
                    (page.extract_text() or "") + "\n"
                    for page in pdf_reader.pages
                )

        if file_type == "docx":
            doc = docx.Document(file_path)
            return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

        if file_type == "txt":
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()

        if file_type == "csv":
            # Convert DataFrame to text representation
            return pd.read_csv(file_path).to_string(index=False)

        if file_type == "xlsx":
            return pd.read_excel(file_path).to_string(index=False)

        return ""

    def extract_text_from_file(self, file_path: str, file_type: str) -> str:
        """Extract text from different file types.

        Args:
            file_path: Path of the file on disk.
            file_type: Lower-case extension without the dot.

        Returns:
            The extracted text ("" for unsupported types). On failure the
            string ``"Error reading file: <reason>"`` is returned instead of
            raising, which is the contract existing callers rely on.
        """
        try:
            return self._read_file_text(file_path, file_type)
        except Exception as e:
            return f"Error reading file: {str(e)}"

    def chunk_text(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
        """Split text into overlapping chunks built from whole sentences.

        Args:
            text: Raw text; runs of whitespace are collapsed to single spaces.
            chunk_size: Soft limit, in characters, at which a chunk is closed.
                A single sentence longer than this is kept whole.
            overlap: Number of trailing words of a closed chunk that are
                repeated at the start of the next one.

        Returns:
            The list of chunks, or [] when the text is empty or blank.
        """
        if not text or not text.strip():
            return []

        # Clean the text
        normalized = re.sub(r'\s+', ' ', text.strip())

        # Split *after* sentence-ending punctuation followed by whitespace so
        # the punctuation is kept and decimals such as "3.14" stay intact.
        sentences = re.split(r'(?<=[.!?])\s+', normalized)

        chunks = []
        current_chunk = ""

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            if current_chunk and len(current_chunk) + len(sentence) > chunk_size:
                chunks.append(current_chunk)

                # Carry over only a strict tail of the closed chunk. Carrying
                # the whole chunk (short chunk, or overlap == 0 where
                # ``words[-0:]`` is everything) would make chunks grow
                # cumulatively.
                words = current_chunk.split()
                carried = words[-overlap:] if 0 < overlap < len(words) else []
                current_chunk = " ".join(carried + [sentence])
            else:
                current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence

        # Add the last chunk
        if current_chunk:
            chunks.append(current_chunk)

        return chunks

    def create_embeddings(self, chunks: List[str]) -> np.ndarray:
        """Create embeddings for text chunks.

        Returns:
            A C-contiguous float32 array with one row per chunk, the layout
            the FAISS calls in this class are fed. Empty input gives an
            array of shape ``(0, self.dimension)``.
        """
        if not chunks:
            return np.empty((0, self.dimension), dtype=np.float32)

        embeddings = self.embedding_model.encode(chunks, convert_to_tensor=False)
        return np.ascontiguousarray(embeddings, dtype=np.float32)

    def build_index(self, files) -> str:
        """Process uploaded files and build the search index.

        Args:
            files: Uploaded files; each item is either a path string or an
                object exposing the path as ``.name``.

        Returns:
            A human-readable status report (per-file results plus a summary),
            or an error message. Never raises.
        """
        if not files:
            return "❌ No files uploaded. Please upload at least one file."

        # Invalidate the previous index first so a failed rebuild can never
        # leave ``is_indexed`` pointing at half-reset state.
        self.is_indexed = False

        try:
            # Reset previous data
            self.chunks = []
            self.chunk_metadata = []
            self.index = faiss.IndexFlatIP(self.dimension)

            all_chunks = []
            all_metadata = []
            processing_status = []

            for file in files:
                file_name = getattr(file, "name", file)
                file_extension = os.path.splitext(file_name)[1].lstrip('.').lower()

                # Extract text; failures are detected via the exception, not
                # by inspecting the text, so a document that merely starts
                # with the word "Error" is still indexed.
                try:
                    text = self._read_file_text(file_name, file_extension)
                except Exception as e:
                    processing_status.append(f"❌ {file_name}: Error reading file: {str(e)}")
                    continue

                # Create chunks
                file_chunks = self.chunk_text(text)

                if not file_chunks:
                    processing_status.append(f"❌ {file_name}: No text could be extracted")
                    continue

                # Add metadata for each chunk
                for i, chunk in enumerate(file_chunks):
                    all_metadata.append({
                        'file_name': file_name,
                        'chunk_id': i,
                        'chunk_text': chunk
                    })
                    all_chunks.append(chunk)

                processing_status.append(f"✅ {file_name}: {len(file_chunks)} chunks created")

            if not all_chunks:
                return "❌ No valid text chunks were created from the uploaded files."

            # Create embeddings
            embeddings = self.create_embeddings(all_chunks)

            # Normalize embeddings for cosine similarity
            faiss.normalize_L2(embeddings)

            # Add to FAISS index, then publish chunks and metadata together
            # so the two lists always stay aligned with the index rows.
            self.index.add(embeddings)
            self.chunks = all_chunks
            self.chunk_metadata = all_metadata
            self.is_indexed = True

            status_report = "\n".join(processing_status)
            summary = f"\n\n📊 **Summary:**\n- Total chunks created: {len(all_chunks)}\n- Index built successfully!\n- Ready to answer questions!"

            return f"**File Processing Results:**\n\n{status_report}{summary}"

        except Exception as e:
            return f"❌ Error during indexing: {str(e)}"

    def search_similar_chunks(self, query: str, top_k: int = 5) -> List[Dict]:
        """Search for similar chunks using vector similarity.

        Args:
            query: The question text.
            top_k: Maximum number of chunks to return.

        Returns:
            Up to ``top_k`` dicts with keys ``'chunk'``, ``'metadata'`` and
            ``'similarity_score'``; [] when nothing is indexed or the search
            fails.
        """
        if not self.is_indexed:
            return []

        try:
            # Never ask for more neighbours than there are indexed chunks
            limit = min(top_k, len(self.chunk_metadata))
            if limit <= 0:
                return []

            # Create query embedding
            query_embedding = np.ascontiguousarray(
                self.embedding_model.encode([query]), dtype=np.float32
            )
            faiss.normalize_L2(query_embedding)

            # Search in FAISS index
            scores, indices = self.index.search(query_embedding, limit)

            results = []
            for score, idx in zip(scores[0], indices[0]):
                idx = int(idx)
                # FAISS pads missing neighbours with -1; a bare upper-bound
                # check would let -1 through and return the last chunk.
                if 0 <= idx < len(self.chunk_metadata):
                    results.append({
                        'chunk': self.chunks[idx],
                        'metadata': self.chunk_metadata[idx],
                        'similarity_score': float(score)
                    })

            return results

        except Exception as e:
            print(f"Search error: {e}")
            return []

    def generate_response(self, query: str, context_chunks: List[str]) -> str:
        """Generate response using Groq API with context.

        Args:
            query: The user's question.
            context_chunks: Retrieved chunks, numbered in the prompt in the
                order given.

        Returns:
            The model's answer, or ``"Error generating response: <reason>"``
            if the API call fails.
        """
        try:
            # Prepare context
            context = "\n\n".join([f"Context {i+1}:\n{chunk}" for i, chunk in enumerate(context_chunks)])

            # Create prompt
            prompt = f"""Based on the following context information, please answer the user's question. If the answer cannot be found in the context, please say so clearly.

Context Information:
{context}

Question: {query}

Please provide a comprehensive and accurate answer based on the context provided above."""

            # Call Groq API
            chat_completion = self.groq_client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant that answers questions based on provided context. Always cite which part of the context supports your answer."
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model="llama-3.3-70b-versatile",
                temperature=0.3,
                max_tokens=1000
            )

            return chat_completion.choices[0].message.content

        except Exception as e:
            return f"Error generating response: {str(e)}"

    def query_documents(self, query: str, top_k: int = 5) -> tuple:
        """Main function to query the documents.

        Args:
            query: The user's question.
            top_k: Number of chunks to retrieve as context.

        Returns:
            A ``(response, sources)`` pair of strings. ``sources`` is "" when
            no answer was generated (blank question, nothing indexed, or no
            matching chunks).
        """
        if not query or not query.strip():
            return "Please enter a question.", ""

        if not self.is_indexed:
            return "Please upload and index some documents first.", ""

        # Search for relevant chunks
        similar_chunks = self.search_similar_chunks(query, top_k)

        if not similar_chunks:
            return "No relevant information found in the documents.", ""

        # Extract chunks and generate response
        context_chunks = [chunk_data['chunk'] for chunk_data in similar_chunks]
        response = self.generate_response(query, context_chunks)

        # Create source information
        source_lines = [
            f"- **Source {i+1}:** {chunk_data['metadata']['file_name']} "
            f"(Similarity: {chunk_data['similarity_score']:.3f})\n"
            for i, chunk_data in enumerate(similar_chunks)
        ]
        sources = "\n\n📚 **Sources:**\n" + "".join(source_lines)

        return response, sources
269
+
270
# Initialize the RAG application.
# One shared instance is created when the module is loaded; create_interface()
# below binds its build_index / query_documents methods as event handlers.
rag_app = RAGApplication()

# Custom CSS for attractive interface.
# Passed to gr.Blocks(css=...) in create_interface(). The class selectors
# (.main-header, .upload-area, .chat-container) only take effect on elements
# that carry those class names.
custom_css = """
.gradio-container {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}

.main-header {
text-align: center;
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 2rem;
border-radius: 10px;
margin-bottom: 2rem;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}

.upload-area {
border: 2px dashed #667eea;
border-radius: 10px;
padding: 2rem;
text-align: center;
background: #f8f9ff;
}

.chat-container {
background: #ffffff;
border-radius: 10px;
padding: 1rem;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}

#component-0 {
border-radius: 15px;
}
"""
308
+
309
# Create Gradio interface
def create_interface():
    """Build the Gradio Blocks UI and wire it to the shared ``rag_app``.

    The layout is a header, a left column for uploading and processing
    documents, and a wider right column for asking questions and showing the
    answer with its sources.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """
    with gr.Blocks(css=custom_css, title="🤖 RAG Document Assistant") as interface:

        # Header
        gr.HTML("""
        <div class="main-header">
            <h1>🤖 RAG Document Assistant</h1>
            <p>Upload your documents and ask questions - powered by AI!</p>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.HTML("<h3>📁 Document Upload</h3>")

                file_upload = gr.File(
                    label="Upload Documents",
                    file_types=[".pdf", ".docx", ".txt", ".csv", ".xlsx"],
                    file_count="multiple",
                    height=200
                )

                upload_btn = gr.Button(
                    "🚀 Process Documents",
                    variant="primary",
                    size="lg"
                )

                upload_status = gr.Textbox(
                    label="Processing Status",
                    lines=8,
                    interactive=False,
                    placeholder="Upload documents and click 'Process Documents' to begin..."
                )

            with gr.Column(scale=2):
                gr.HTML("<h3>💬 Ask Questions</h3>")

                with gr.Row():
                    query_input = gr.Textbox(
                        label="Your Question",
                        placeholder="Ask anything about your uploaded documents...",
                        lines=2,
                        scale=4
                    )
                    ask_btn = gr.Button("Ask", variant="primary", scale=1)

                response_output = gr.Textbox(
                    label="AI Response",
                    lines=10,
                    interactive=False,
                    placeholder="AI responses will appear here..."
                )

                sources_output = gr.Textbox(
                    label="Sources",
                    lines=5,
                    interactive=False,
                    placeholder="Source information will appear here..."
                )

        # Example questions
        gr.HTML("""
        <div style="margin-top: 2rem; padding: 1rem; background: #f0f0f0; border-radius: 8px;">
            <h4>💡 Example Questions:</h4>
            <ul>
                <li>"What are the main topics discussed in the document?"</li>
                <li>"Can you summarize the key findings?"</li>
                <li>"What recommendations are provided?"</li>
                <li>"Tell me about [specific topic] mentioned in the documents"</li>
            </ul>
        </div>
        """)

        # Event handlers
        upload_btn.click(
            fn=rag_app.build_index,
            inputs=[file_upload],
            outputs=[upload_status]
        )

        # The Ask button and the Enter key in the question box run the same
        # query, so both are registered with one shared set of arguments.
        query_wiring = dict(
            fn=rag_app.query_documents,
            inputs=[query_input],
            outputs=[response_output, sources_output]
        )
        ask_btn.click(**query_wiring)
        query_input.submit(**query_wiring)

    return interface
405
+
406
# Script entry point: build the UI and start serving it.
if __name__ == "__main__":
    interface = create_interface()
    # Listen on every network interface at port 7860, with sharing enabled.
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
    )