# File size: 14,446 Bytes
# 1e3c704
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
import os
import gradio as gr
import PyPDF2
import docx
import pandas as pd
from typing import List, Dict, Any
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import re
from groq import Groq
import json
import tempfile
import io

class RAGApplication:
    """Retrieval-augmented question answering over uploaded documents.

    Pipeline: extract text from each file, split it into overlapping chunks,
    embed the chunks with a SentenceTransformer model, store the L2-normalized
    vectors in a FAISS inner-product index, and answer questions by sending
    the best-matching chunks to a Groq chat model as context.

    Attributes:
        groq_client: Groq API client (key read from ``GROQ_API_KEY``).
        embedding_model: SentenceTransformer used for chunks and queries.
        dimension: Vector size the FAISS index is created with.
        index: FAISS ``IndexFlatIP`` holding the normalized chunk vectors.
        chunks: Chunk texts, positionally aligned with the index rows.
        chunk_metadata: One dict per chunk (``file_name``, ``chunk_id``,
            ``chunk_text``), aligned with ``chunks``.
        is_indexed: True only after a successful ``build_index`` call.
    """

    def __init__(self):
        """Initialize the RAG application with necessary components"""
        # Initialize Groq client
        self.groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

        # Initialize embedding model (using a lightweight, free model)
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Initialize FAISS index
        self.dimension = 384  # Dimension of all-MiniLM-L6-v2 embeddings
        self.index = faiss.IndexFlatIP(self.dimension)  # Inner product for cosine similarity

        # Storage for chunks and metadata
        self.chunks = []
        self.chunk_metadata = []
        self.is_indexed = False

    def _read_file_text(self, file_path: str, file_type: str) -> str:
        """Return the text of *file_path*, raising on any read/parse failure.

        Unsupported *file_type* values yield an empty string.
        """
        if file_type == "pdf":
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # extract_text() may yield None for pages without a text
                # layer; treat those pages as empty instead of crashing.
                return "".join(
                    (page.extract_text() or "") + "\n" for page in pdf_reader.pages
                )

        if file_type == "docx":
            doc = docx.Document(file_path)
            return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

        if file_type == "txt":
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()

        if file_type == "csv":
            # Convert DataFrame to text representation
            return pd.read_csv(file_path).to_string(index=False)

        if file_type == "xlsx":
            return pd.read_excel(file_path).to_string(index=False)

        return ""

    def extract_text_from_file(self, file_path: str, file_type: str) -> str:
        """Extract text from different file types.

        Args:
            file_path: Path of the file on disk.
            file_type: Lower-case extension without the dot
                (``pdf``, ``docx``, ``txt``, ``csv`` or ``xlsx``).

        Returns:
            The extracted text, an empty string for unsupported types, or a
            message starting with ``"Error reading file: "`` when reading
            fails (kept for backward compatibility with existing callers).
        """
        try:
            return self._read_file_text(file_path, file_type)
        except Exception as e:
            return f"Error reading file: {str(e)}"

    @staticmethod
    def _split_oversized(segment: str, limit: int) -> List[str]:
        """Break *segment* at word boundaries into pieces of at most *limit* chars."""
        if len(segment) <= limit:
            return [segment]

        pieces = []
        current = ""
        for word in segment.split():
            # A single token longer than the limit has to be cut mid-token.
            while len(word) > limit:
                if current:
                    pieces.append(current)
                    current = ""
                pieces.append(word[:limit])
                word = word[limit:]
            if not word:
                continue
            if current and len(current) + 1 + len(word) > limit:
                pieces.append(current)
                current = word
            else:
                current = f"{current} {word}" if current else word

        if current:
            pieces.append(current)
        return pieces

    def chunk_text(self, text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
        """Split text into overlapping chunks.

        Whitespace is collapsed, the text is split after sentence-ending
        punctuation (which is kept), and sentences are packed greedily into
        chunks. Segments longer than ``chunk_size`` (e.g. tabular text with no
        punctuation) are broken at word boundaries first.

        Args:
            text: Raw text to split.
            chunk_size: Target chunk length in characters. It is a soft
                limit: a chunk may exceed it by the overlap carried over
                from the previous chunk.
            overlap: Number of trailing *words* of a finished chunk that are
                repeated at the start of the next one; ``0`` (or a negative
                value) disables overlap.

        Returns:
            The list of chunks, empty when *text* is empty or whitespace.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not text or not text.strip():
            return []

        overlap = max(overlap, 0)

        # Clean the text
        text = re.sub(r'\s+', ' ', text.strip())

        # Split *after* sentence terminators so punctuation is retained and
        # tokens such as "3.14" or "file.txt" are not torn apart.
        segments = []
        for sentence in re.split(r'(?<=[.!?])\s+', text):
            if sentence:
                segments.extend(self._split_oversized(sentence, chunk_size))

        chunks = []
        current_chunk = ""

        for segment in segments:
            # If adding this segment would exceed chunk_size, save current chunk
            if current_chunk and len(current_chunk) + 1 + len(segment) > chunk_size:
                chunks.append(current_chunk)

                # Start new chunk with overlap. The explicit zero check
                # matters: words[-0:] would be the *entire* previous chunk.
                tail = ' '.join(current_chunk.split()[-overlap:]) if overlap else ''
                current_chunk = f"{tail} {segment}" if tail else segment
            else:
                current_chunk = f"{current_chunk} {segment}" if current_chunk else segment

        # Add the last chunk
        if current_chunk:
            chunks.append(current_chunk)

        return chunks

    def create_embeddings(self, chunks: List[str]) -> np.ndarray:
        """Create embeddings for text chunks.

        Returns an empty array when *chunks* is empty; otherwise whatever
        ``self.embedding_model.encode`` returns for the chunk list.
        """
        if not chunks:
            return np.array([])

        return self.embedding_model.encode(chunks, convert_to_tensor=False)

    def build_index(self, files) -> str:
        """Process uploaded files and build the search index.

        Any previously built index is discarded first. The new chunks and
        metadata are committed, and ``is_indexed`` is set, only after every
        step succeeded, so a failed run never leaves a half-built index
        marked as ready.

        Args:
            files: Iterable of uploaded files; each item is either a path
                string or an object whose ``name`` attribute is the path.

        Returns:
            A Markdown-formatted status report for display in the UI.
        """
        if not files:
            return "❌ No files uploaded. Please upload at least one file."

        try:
            # Reset previous data
            self.chunks = []
            self.chunk_metadata = []
            self.index = faiss.IndexFlatIP(self.dimension)
            self.is_indexed = False

            all_chunks = []
            all_metadata = []
            processing_status = []

            for file in files:
                file_path = getattr(file, "name", file)
                # Show only the base name: the full path is a server-side
                # temporary location that is meaningless to the user.
                file_name = os.path.basename(file_path)
                file_extension = os.path.splitext(file_name)[1].lstrip('.').lower()

                # Extract text from file. Failures are detected through the
                # exception, not by sniffing the text, so documents that
                # happen to begin with "Error" are indexed normally.
                try:
                    text = self._read_file_text(file_path, file_extension)
                except Exception as e:
                    processing_status.append(f"❌ {file_name}: Error reading file: {str(e)}")
                    continue

                # Create chunks
                file_chunks = self.chunk_text(text)

                if not file_chunks:
                    processing_status.append(f"❌ {file_name}: No text could be extracted")
                    continue

                # Add metadata for each chunk
                for i, chunk in enumerate(file_chunks):
                    all_metadata.append({
                        'file_name': file_name,
                        'chunk_id': i,
                        'chunk_text': chunk
                    })
                    all_chunks.append(chunk)

                processing_status.append(f"βœ… {file_name}: {len(file_chunks)} chunks created")

            status_report = "\n".join(processing_status)

            if not all_chunks:
                # Include the per-file reasons so the user can see what failed.
                return (
                    "❌ No valid text chunks were created from the uploaded files."
                    f"\n\n{status_report}"
                )

            # Create embeddings; FAISS requires a C-contiguous float32 matrix.
            embeddings = np.ascontiguousarray(
                self.create_embeddings(all_chunks), dtype=np.float32
            )

            # Normalize embeddings for cosine similarity
            faiss.normalize_L2(embeddings)

            # Add to FAISS index, then commit the aligned bookkeeping lists.
            self.index.add(embeddings)
            self.chunks = all_chunks
            self.chunk_metadata = all_metadata
            self.is_indexed = True

            summary = f"\n\nπŸ“Š **Summary:**\n- Total chunks created: {len(all_chunks)}\n- Index built successfully!\n- Ready to answer questions!"

            return f"**File Processing Results:**\n\n{status_report}{summary}"

        except Exception as e:
            return f"❌ Error during indexing: {str(e)}"

    def search_similar_chunks(self, query: str, top_k: int = 5) -> List[Dict]:
        """Search for similar chunks using vector similarity.

        Args:
            query: Natural-language query to embed and search with.
            top_k: Maximum number of hits to return.

        Returns:
            A list of dicts with keys ``chunk``, ``metadata`` and
            ``similarity_score``; empty when nothing is indexed or when the
            search fails (the error is printed).
        """
        if not self.is_indexed or self.index.ntotal == 0:
            return []

        # Asking FAISS for more neighbours than it stores makes it pad the
        # result with label -1, so never request more than ntotal.
        top_k = min(top_k, self.index.ntotal)
        if top_k <= 0:
            return []

        try:
            # Create query embedding
            query_embedding = np.ascontiguousarray(
                self.embedding_model.encode([query]), dtype=np.float32
            )
            faiss.normalize_L2(query_embedding)

            # Search in FAISS index
            scores, indices = self.index.search(query_embedding, top_k)

            results = []
            for score, idx in zip(scores[0], indices[0]):
                idx = int(idx)
                # Reject -1 padding explicitly: as a Python index it would
                # silently alias the last chunk.
                if 0 <= idx < len(self.chunk_metadata):
                    results.append({
                        'chunk': self.chunks[idx],
                        'metadata': self.chunk_metadata[idx],
                        'similarity_score': float(score)
                    })

            return results

        except Exception as e:
            print(f"Search error: {e}")
            return []

    def generate_response(self, query: str, context_chunks: List[str]) -> str:
        """Generate response using Groq API with context.

        Args:
            query: The user's question.
            context_chunks: Retrieved chunk texts, inserted into the prompt
                as numbered "Context N" sections.

        Returns:
            The model's answer, or a message starting with
            ``"Error generating response: "`` if the API call fails.
        """
        try:
            # Prepare context
            context = "\n\n".join(
                f"Context {i}:\n{chunk}" for i, chunk in enumerate(context_chunks, start=1)
            )

            # Create prompt
            prompt = f"""Based on the following context information, please answer the user's question. If the answer cannot be found in the context, please say so clearly.

Context Information:
{context}

Question: {query}

Please provide a comprehensive and accurate answer based on the context provided above."""

            # Call Groq API
            chat_completion = self.groq_client.chat.completions.create(
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant that answers questions based on provided context. Always cite which part of the context supports your answer."
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                model="llama-3.3-70b-versatile",
                temperature=0.3,
                max_tokens=1000
            )

            return chat_completion.choices[0].message.content

        except Exception as e:
            return f"Error generating response: {str(e)}"

    def query_documents(self, query: str, top_k: int = 5) -> tuple:
        """Main function to query the documents.

        Args:
            query: The user's question; ``None`` or blank input is rejected.
            top_k: Maximum number of chunks to retrieve as context.

        Returns:
            A ``(response, sources)`` pair of strings. ``sources`` is empty
            whenever ``response`` is a guidance message rather than an answer.
        """
        if not query or not query.strip():
            return "Please enter a question.", ""

        if not self.is_indexed:
            return "Please upload and index some documents first.", ""

        # Search for relevant chunks
        similar_chunks = self.search_similar_chunks(query, top_k)

        if not similar_chunks:
            return "No relevant information found in the documents.", ""

        # Extract chunks and generate response
        context_chunks = [chunk_data['chunk'] for chunk_data in similar_chunks]
        response = self.generate_response(query, context_chunks)

        # Create source information
        source_lines = [
            f"- **Source {i}:** {chunk_data['metadata']['file_name']} "
            f"(Similarity: {chunk_data['similarity_score']:.3f})\n"
            for i, chunk_data in enumerate(similar_chunks, start=1)
        ]
        sources = "\n\nπŸ“š **Sources:**\n" + "".join(source_lines)

        return response, sources

# Module-level RAGApplication instance whose bound methods are used as the
# Gradio event handlers in create_interface(). Constructing it here runs
# RAGApplication.__init__ when this module is loaded.
rag_app = RAGApplication()

# Stylesheet handed to gr.Blocks(css=...) in create_interface().
# NOTE(review): in the code visible here only `.main-header` is referenced by
# markup (the header HTML); `.upload-area` and `.chat-container` are not
# attached to any component — confirm whether they are still needed.
custom_css = """
.gradio-container {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}

.main-header {
    text-align: center;
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 2rem;
    border-radius: 10px;
    margin-bottom: 2rem;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}

.upload-area {
    border: 2px dashed #667eea;
    border-radius: 10px;
    padding: 2rem;
    text-align: center;
    background: #f8f9ff;
}

.chat-container {
    background: #ffffff;
    border-radius: 10px;
    padding: 1rem;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}

#component-0 {
    border-radius: 15px;
}
"""

# Create Gradio interface
def create_interface():
    """Assemble the Gradio Blocks UI and connect it to ``rag_app``.

    Layout: a banner, then one row with a narrow upload/processing column and
    a wider question/answer column, then a static list of example questions.
    The "Process Documents" button calls ``rag_app.build_index``; both the
    "Ask" button and pressing Enter in the question box call
    ``rag_app.query_documents``.

    Returns:
        The constructed ``gr.Blocks`` object (not launched).
    """
    banner_html = """
        <div class="main-header">
            <h1>πŸ€– RAG Document Assistant</h1>
            <p>Upload your documents and ask questions - powered by AI!</p>
        </div>
        """

    examples_html = """
        <div style="margin-top: 2rem; padding: 1rem; background: #f0f0f0; border-radius: 8px;">
            <h4>πŸ’‘ Example Questions:</h4>
            <ul>
                <li>"What are the main topics discussed in the document?"</li>
                <li>"Can you summarize the key findings?"</li>
                <li>"What recommendations are provided?"</li>
                <li>"Tell me about [specific topic] mentioned in the documents"</li>
            </ul>
        </div>
        """

    with gr.Blocks(css=custom_css, title="πŸ€– RAG Document Assistant") as demo:

        # Banner
        gr.HTML(banner_html)

        with gr.Row():
            # Left column: choose files, trigger indexing, show the report.
            with gr.Column(scale=1):
                gr.HTML("<h3>πŸ“ Document Upload</h3>")

                document_picker = gr.File(
                    label="Upload Documents",
                    file_types=[".pdf", ".docx", ".txt", ".csv", ".xlsx"],
                    file_count="multiple",
                    height=200
                )

                process_button = gr.Button(
                    "πŸš€ Process Documents",
                    variant="primary",
                    size="lg"
                )

                status_box = gr.Textbox(
                    label="Processing Status",
                    lines=8,
                    interactive=False,
                    placeholder="Upload documents and click 'Process Documents' to begin..."
                )

            # Right column: question entry plus answer and source panels.
            with gr.Column(scale=2):
                gr.HTML("<h3>πŸ’¬ Ask Questions</h3>")

                with gr.Row():
                    question_box = gr.Textbox(
                        label="Your Question",
                        placeholder="Ask anything about your uploaded documents...",
                        lines=2,
                        scale=4
                    )
                    ask_button = gr.Button("Ask", variant="primary", scale=1)

                answer_box = gr.Textbox(
                    label="AI Response",
                    lines=10,
                    interactive=False,
                    placeholder="AI responses will appear here..."
                )

                sources_box = gr.Textbox(
                    label="Sources",
                    lines=5,
                    interactive=False,
                    placeholder="Source information will appear here..."
                )

        # Static hints
        gr.HTML(examples_html)

        # Wiring: indexing first, then the two ways of submitting a question.
        process_button.click(
            fn=rag_app.build_index,
            inputs=[document_picker],
            outputs=[status_box]
        )

        for register in (ask_button.click, question_box.submit):
            register(
                fn=rag_app.query_documents,
                inputs=[question_box],
                outputs=[answer_box, sources_box]
            )

    return demo

# Launch the application
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all interfaces at
    # port 7860 with Gradio's share option switched on.
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    interface = create_interface()
    interface.launch(**launch_options)