File size: 9,856 Bytes
e9eb5ef
 
 
 
 
 
 
 
 
 
 
dfc8ae4
c4f41fc
714f7b6
 
dfc8ae4
c4f41fc
dfc8ae4
1a0b21d
714f7b6
 
e9eb5ef
dfc8ae4
714f7b6
dfc8ae4
 
e9eb5ef
dfc8ae4
e9eb5ef
 
 
 
 
 
 
 
 
 
1a0b21d
 
 
 
 
 
 
e9eb5ef
 
 
bc268dd
1a0b21d
 
 
 
 
e9eb5ef
 
1a0b21d
dfc8ae4
e9eb5ef
 
dfc8ae4
e9eb5ef
1a0b21d
 
 
 
 
 
 
dfc8ae4
 
e9eb5ef
1a0b21d
dfc8ae4
e9eb5ef
 
dfc8ae4
e9eb5ef
 
 
 
dfc8ae4
e9eb5ef
 
bc268dd
e9eb5ef
 
 
 
 
 
 
 
 
 
dfc8ae4
e9eb5ef
 
 
1a0b21d
e9eb5ef
1a0b21d
dfc8ae4
e9eb5ef
1a0b21d
 
 
 
dfc8ae4
 
1a0b21d
 
 
 
 
 
 
 
 
 
 
dfc8ae4
1a0b21d
dfc8ae4
e9eb5ef
1a0b21d
dfc8ae4
e9eb5ef
1a0b21d
 
dfc8ae4
1a0b21d
dfc8ae4
1a0b21d
 
 
 
 
 
 
 
 
 
 
 
dfc8ae4
e9eb5ef
 
dfc8ae4
e9eb5ef
 
 
1a0b21d
 
 
 
 
 
 
 
 
 
 
 
e9eb5ef
84d905b
9c493f4
78497d1
 
bc268dd
714f7b6
78497d1
 
 
c4f41fc
dfc8ae4
 
 
 
 
84d905b
714f7b6
e9eb5ef
84d905b
e9eb5ef
1a0b21d
dfc8ae4
bc268dd
1a0b21d
 
84d905b
1a0b21d
bc268dd
1a0b21d
 
 
e9eb5ef
1a0b21d
dfc8ae4
bc268dd
dfc8ae4
bc268dd
e34ca61
9c493f4
bc268dd
9c493f4
 
 
1a0b21d
dfc8ae4
e34ca61
 
dfc8ae4
e34ca61
1a0b21d
dfc8ae4
 
e34ca61
bc268dd
e9eb5ef
 
 
 
dfc8ae4
e9eb5ef
bc268dd
1a0b21d
dfc8ae4
e9eb5ef
 
 
 
dfc8ae4
e9eb5ef
 
bc268dd
dfc8ae4
e9eb5ef
 
84d905b
e9eb5ef
 
dfc8ae4
84d905b
e9eb5ef
dfc8ae4
 
e9eb5ef
 
 
1a0b21d
dfc8ae4
1a0b21d
dfc8ae4
 
1a0b21d
e9eb5ef
9c493f4
bc268dd
 
 
 
dfc8ae4
bc268dd
dfc8ae4
bc268dd
 
 
 
 
 
 
 
 
 
dfc8ae4
dfcf54f
dfc8ae4
 
 
bc268dd
 
ca5f03e
84d905b
ca5f03e
84d905b
bc268dd
dfc8ae4
 
bc268dd
dfc8ae4
 
bc268dd
 
 
dfc8ae4
 
 
 
bc268dd
e9eb5ef
 
bc268dd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
import gradio as gr
import os
from groq import Groq
import PyPDF2
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import json
from datetime import datetime
import docx

# Initialize Groq client
# Done eagerly at import time; failure is tolerated here because chat()
# retries initialization lazily on first use.
client = None
try:
    api_key = os.environ.get("GROQ_API_KEY")
    if api_key:
        import httpx
        # NOTE(review): an explicit httpx.Client is passed instead of letting
        # the SDK build one — presumably to work around an SDK/proxy issue;
        # confirm this is still required with the pinned groq version.
        client = Groq(api_key=api_key, http_client=httpx.Client())
        print("Groq client initialized successfully")
except Exception as e:
    # Best-effort: log and continue so the UI can still start without a key.
    print(f"Error initializing Groq client: {e}")

# Initialize sentence transformer model
# Loaded once at startup (downloads weights on first run); shared by both
# document indexing and query embedding.
print("Loading sentence transformer model...")
embedder = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded successfully!")

# Global storage
# In-memory, single-session store; replaced wholesale by process_files().
# 'chunks', 'embeddings', and 'metadata' are parallel (index-aligned) lists.
document_store = {
    'chunks': [],
    'embeddings': [],
    'metadata': [],
    'conversation_history': []
}

def extract_text_from_pdf(pdf_file):
    """Extract per-page text from a PDF file.

    Accepts either a filesystem path (str) or a file-like object exposing
    a ``.name`` attribute (e.g. a Gradio upload).

    Returns:
        A list of dicts with 'text', 'page' (1-based), and 'filename'
        keys — one entry per page that yielded non-blank text. An empty
        list is returned on any read/parse failure.
    """
    try:
        # Resolve both input forms to a single path string up front.
        path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        reader = PyPDF2.PdfReader(path)
        filename = os.path.basename(path)

        pages = []
        for index, page in enumerate(reader.pages, start=1):
            content = page.extract_text()
            # Skip pages where extraction produced nothing (e.g. scans).
            if content and content.strip():
                pages.append({'text': content, 'page': index, 'filename': filename})
        return pages
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return []

def extract_text_from_docx(docx_file):
    """Extract text from a DOCX file.

    Accepts a path string or a file-like object with a ``.name`` attribute.

    Returns:
        A single-entry list ``[{'text', 'page', 'filename'}]`` — DOCX has
        no page information, so 'page' is always 1. An empty list is
        returned on any read/parse failure.
    """
    try:
        # Normalize both accepted input forms to a path string.
        path = docx_file if isinstance(docx_file, str) else docx_file.name
        document = docx.Document(path)
        filename = os.path.basename(path)

        # Join non-blank paragraphs into one text body.
        body = '\n'.join(p.text for p in document.paragraphs if p.text.strip())
        return [{'text': body, 'page': 1, 'filename': filename}]
    except Exception as e:
        print(f"Error reading DOCX: {e}")
        return []

def chunk_text(text_data, chunk_size=500, overlap=50):
    """Split extracted text into overlapping word-based chunks.

    Args:
        text_data: List of dicts with 'text', 'page', and 'filename' keys
            (as produced by the extract_text_* helpers).
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        Tuple ``(chunks, metadata)``: parallel lists where ``metadata[i]``
        holds the page, filename, and 1-based chunk_id of ``chunks[i]``.

    Raises:
        ValueError: If ``overlap >= chunk_size`` — the window would never
            advance (previously this silently produced empty output or an
            opaque ``range`` error).
    """
    if chunk_size <= overlap:
        raise ValueError("chunk_size must be greater than overlap")

    chunks = []
    metadata = []
    step = chunk_size - overlap

    for data in text_data:
        words = data['text'].split()
        for i in range(0, len(words), step):
            chunk = ' '.join(words[i:i + chunk_size])
            # Drop tiny fragments (<= 50 chars) that carry little signal.
            if len(chunk.strip()) > 50:
                chunks.append(chunk)
                metadata.append({
                    'page': data['page'],
                    'filename': data['filename'],
                    'chunk_id': len(chunks)
                })

    return chunks, metadata

def process_files(files):
    """Index uploaded files into the global document store.

    Extracts text from each PDF/DOCX, chunks it, embeds the chunks, and
    replaces the contents of ``document_store`` (clearing any previous
    documents and conversation history).

    Args:
        files: List of Gradio upload objects (or path strings).

    Returns:
        A Markdown status string: a per-file summary on success, or an
        "[ERROR] ..." message on failure.
    """
    global document_store

    if not files:
        return "[ERROR] Please upload at least one file."

    try:
        # Reset the store so a re-upload fully replaces prior documents.
        document_store = {'chunks': [], 'embeddings': [], 'metadata': [], 'conversation_history': []}

        all_text_data = []
        file_summaries = []

        for file in files:
            file_path = file.name if hasattr(file, 'name') else file
            file_ext = os.path.splitext(file_path)[1].lower()

            print(f"Processing file: {file_path}")

            if file_ext == '.pdf':
                text_data = extract_text_from_pdf(file)
            elif file_ext == '.docx':
                text_data = extract_text_from_docx(file)
            else:
                # Unsupported extensions are skipped silently.
                continue

            all_text_data.extend(text_data)
            total_chars = sum(len(d['text']) for d in text_data)
            filename = os.path.basename(file_path)
            # BUG FIX: the summary previously printed the literal
            # "(unknown)" instead of the filename computed above.
            file_summaries.append(f"- **{filename}**: {len(text_data)} pages, {total_chars} characters")

        if not all_text_data:
            return "[ERROR] No valid text extracted."

        chunks, metadata = chunk_text(all_text_data)
        if not chunks:
            return "[ERROR] No text chunks created."

        embeddings = embedder.encode(chunks, show_progress_bar=False)

        document_store['chunks'] = chunks
        document_store['embeddings'] = embeddings
        document_store['metadata'] = metadata

        summary = f"**Successfully Processed {len(files)} file(s)**\n\n"
        summary += "\n".join(file_summaries)
        summary += f"\n\n**Created {len(chunks)} text chunks for retrieval.**"

        return summary
    except Exception as e:
        print(f"Error processing files: {e}")
        return f"[ERROR] {str(e)}"

def retrieve_relevant_chunks(query, top_k=3):
    """Return the ``top_k`` indexed chunks most similar to ``query``.

    Embeds the query, scores it against the stored embeddings by cosine
    similarity, and returns ``(chunks, metadata)`` as parallel lists
    ordered most-similar first. Returns ``([], [])`` when nothing is
    indexed or on any error.
    """
    if not document_store['chunks']:
        return [], []

    try:
        query_vec = embedder.encode([query], show_progress_bar=False)
        scores = cosine_similarity(query_vec, document_store['embeddings'])[0]
        # Indices of the top_k highest scores, best first.
        ranked = np.argsort(scores)[-top_k:][::-1]

        picked_chunks = []
        picked_meta = []
        for idx in ranked:
            picked_chunks.append(document_store['chunks'][idx])
            picked_meta.append(document_store['metadata'][idx])

        return picked_chunks, picked_meta
    except Exception as e:
        print(f"Error retrieving chunks: {e}")
        return [], []

def chat(message, history):
    """Answer ``message`` with RAG over the indexed documents.

    Retrieves the most relevant chunks, builds a context-grounded prompt
    that includes up to the last 3 history exchanges, calls the Groq chat
    API, appends source citations, and logs the exchange into
    ``document_store['conversation_history']``.

    Args:
        message: The user's question.
        history: Gradio chat history — either (user, assistant) tuples or
            OpenAI-style {"role", "content"} dicts depending on Gradio
            version/configuration.

    Returns:
        The response string for gr.ChatInterface; error conditions are
        reported as "[ERROR]"/"[WARNING]" strings rather than raised.
    """
    global client

    # Lazily (re)initialize the client in case GROQ_API_KEY was set after
    # startup.
    if client is None:
        try:
            api_key = os.environ.get("GROQ_API_KEY")
            if api_key:
                import httpx
                client = Groq(api_key=api_key, http_client=httpx.Client())
        except Exception as e:
            # FIX: was a bare `except: pass`, which silently swallowed
            # everything (including KeyboardInterrupt). Log and fall
            # through to the "[ERROR]" message below instead.
            print(f"Error reinitializing Groq client: {e}")

    if client is None:
        return "[ERROR] Groq API not initialized. Set GROQ_API_KEY in Settings."

    if not document_store['chunks']:
        return "[WARNING] Please upload and process documents first."

    try:
        # Retrieve context
        relevant_chunks, metadata = retrieve_relevant_chunks(message, top_k=3)

        if not relevant_chunks:
            return "[ERROR] No relevant information found."

        # Build the context block with inline source attribution.
        context = "\n\n".join([
            f"[Source: {meta['filename']}, Page {meta['page']}]\n{chunk}" 
            for chunk, meta in zip(relevant_chunks, metadata)
        ])

        # Build messages for Groq
        messages = [
            {"role": "system", "content": "You are a helpful assistant that answers questions based on provided context. Be concise and accurate."}
        ]

        # FIX: the original unconditionally unpacked history entries as
        # (user, assistant) tuples, which crashes under Gradio's
        # messages-format history. Handle both shapes; keep the last 3
        # exchanges either way (3 tuples == 6 role dicts).
        if history:
            recent = history[-3:] if isinstance(history[0], (list, tuple)) else history[-6:]
            for entry in recent:
                if isinstance(entry, dict):
                    messages.append({"role": entry["role"], "content": entry["content"]})
                else:
                    user_msg, assistant_msg = entry
                    messages.append({"role": "user", "content": user_msg})
                    messages.append({"role": "assistant", "content": assistant_msg})

        # Add current query
        messages.append({
            "role": "user",
            "content": f"Context:\n{context}\n\nQuestion: {message}"
        })

        # Call Groq
        response = client.chat.completions.create(
            messages=messages,
            model="llama-3.1-8b-instant",
            temperature=0.3,
            max_tokens=1024,
        )

        answer = response.choices[0].message.content

        # Append a citation footer listing each retrieved chunk's source.
        sources = "\n\n**Sources:**\n" + "\n".join([
            f"- {m['filename']} (Page {m['page']})" for m in metadata
        ])

        full_answer = answer + sources

        # Log the exchange (answer without the citation footer).
        document_store['conversation_history'].append({
            'timestamp': datetime.now().isoformat(),
            'query': message,
            'answer': answer
        })

        return full_answer

    except Exception as e:
        print(f"Error: {e}")
        return f"[ERROR] {str(e)}"

def download_history():
    """Dump the conversation history to ``chat_history.json``.

    Returns:
        The filename (for Gradio's File output) on success, or None when
        there is no history or the file could not be written.
    """
    if not document_store['conversation_history']:
        return None

    try:
        with open("chat_history.json", 'w', encoding='utf-8') as f:
            json.dump(document_store['conversation_history'], f, indent=2, ensure_ascii=False)
        return "chat_history.json"
    except OSError as e:
        # FIX: was a bare `except:` that silently swallowed everything;
        # narrow to filesystem errors and log them.
        print(f"Error writing chat history: {e}")
        return None

# Build interface
# Layout: left column for document upload/processing and history export;
# right column for the chat panel. All state lives in the module-level
# document_store, so this UI is effectively single-session.
with gr.Blocks(title="Enhanced RAG Chatbot") as demo:
    
    gr.Markdown("""
    # Enhanced RAG-Based Chatbot
    Upload PDF/DOCX files and ask questions!
    
    **Features:** Multiple files, Semantic search, Source references, Chat history
    """)
    
    with gr.Row():
        with gr.Column(scale=1):
            file_upload = gr.File(
                label="Upload Documents (PDF/DOCX)",
                file_count="multiple",
                file_types=[".pdf", ".docx"]
            )
            process_btn = gr.Button("Process Documents", variant="primary")
            # Markdown target for the status/summary string from process_files.
            process_output = gr.Markdown()
            
            gr.Markdown("### History")
            download_btn = gr.Button("Download (JSON)")
            download_file = gr.File(label="Download")
        
        with gr.Column(scale=2):
            # Minimal ChatInterface compatible with Gradio 4.44.1
            chat_interface = gr.ChatInterface(
                fn=chat
            )
    
    # Process files
    process_btn.click(process_files, [file_upload], [process_output])
    
    # Download
    # download_history returns a filename (or None), which the File
    # component renders as a downloadable link.
    download_btn.click(download_history, None, [download_file])
    
    gr.Markdown("""
    ---
    ### How It Works:
    1. Upload PDF/DOCX files and click "Process Documents"
    2. Ask questions - RAG finds relevant chunks and generates answers
    3. Sources are cited with page numbers
    """)

if __name__ == "__main__":
    demo.launch()