SyedZainAliShah committed on
Commit
bc268dd
·
verified ·
1 Parent(s): dfcf54f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -180
app.py CHANGED
@@ -16,7 +16,6 @@ try:
16
  if not api_key:
17
  print("WARNING: GROQ_API_KEY not found in environment variables")
18
  else:
19
- # Initialize without proxies parameter
20
  import httpx
21
  client = Groq(
22
  api_key=api_key,
@@ -47,7 +46,6 @@ document_store = {
47
  def extract_text_from_pdf(pdf_file):
48
  """Extract text from PDF file"""
49
  try:
50
- # Handle both file path (string) and file object
51
  if isinstance(pdf_file, str):
52
  pdf_reader = PyPDF2.PdfReader(pdf_file)
53
  filename = os.path.basename(pdf_file)
@@ -58,7 +56,7 @@ def extract_text_from_pdf(pdf_file):
58
  text_data = []
59
  for page_num, page in enumerate(pdf_reader.pages):
60
  text = page.extract_text()
61
- if text and text.strip(): # Only add non-empty pages
62
  text_data.append({
63
  'text': text,
64
  'page': page_num + 1,
@@ -73,7 +71,6 @@ def extract_text_from_pdf(pdf_file):
73
  def extract_text_from_docx(docx_file):
74
  """Extract text from DOCX file (Enhancement 5)"""
75
  try:
76
- # Handle both file path and file object
77
  if isinstance(docx_file, str):
78
  doc = docx.Document(docx_file)
79
  filename = os.path.basename(docx_file)
@@ -103,7 +100,7 @@ def chunk_text(text_data, chunk_size=500, overlap=50):
103
 
104
  for i in range(0, len(words), chunk_size - overlap):
105
  chunk = ' '.join(words[i:i + chunk_size])
106
- if len(chunk.strip()) > 50: # Only keep meaningful chunks
107
  chunks.append(chunk)
108
  metadata.append({
109
  'page': data['page'],
@@ -139,7 +136,6 @@ def process_files(files):
139
  file_summaries = []
140
 
141
  for file in files:
142
- # Get file extension
143
  if isinstance(file, str):
144
  file_path = file
145
  file_ext = os.path.splitext(file)[1].lower()
@@ -159,7 +155,6 @@ def process_files(files):
159
 
160
  all_text_data.extend(text_data)
161
 
162
- # Generate file summary (Enhancement 2)
163
  total_text = ' '.join([d['text'] for d in text_data if d['text']])
164
  filename = os.path.basename(file_path)
165
  file_summaries.append(f"- **{filename}**: {len(text_data)} pages, {len(total_text)} characters")
@@ -167,7 +162,6 @@ def process_files(files):
167
  if not all_text_data:
168
  return "[ERROR] No valid text extracted from uploaded files."
169
 
170
- # Chunk and embed
171
  chunks, metadata = chunk_text(all_text_data)
172
 
173
  if not chunks:
@@ -208,11 +202,11 @@ def retrieve_relevant_chunks(query, top_k=3):
208
  print(f"Error retrieving chunks: {e}")
209
  return [], []
210
 
211
- def generate_answer(query, history):
212
- """Generate answer using Groq LLM with RAG (Enhancement 3 - Conversational Memory)"""
213
  global client
214
 
215
- # Try to reinitialize client if it's None
216
  if client is None:
217
  try:
218
  api_key = os.environ.get("GROQ_API_KEY")
@@ -224,77 +218,57 @@ def generate_answer(query, history):
224
  )
225
  print("Groq client reinitialized successfully")
226
  else:
227
- return "[ERROR] Groq API client not initialized. Please set GROQ_API_KEY in your Space settings (Settings > Variables > Add 'GROQ_API_KEY')."
228
  except Exception as e:
229
  return f"[ERROR] Failed to initialize Groq client: {str(e)}"
230
 
231
  if not document_store['chunks']:
232
- return "[WARNING] Please upload and process documents first using the 'Process Documents' button."
233
 
234
  try:
235
  # Retrieve relevant context
236
- relevant_chunks, metadata = retrieve_relevant_chunks(query, top_k=3)
237
 
238
  if not relevant_chunks:
239
  return "[ERROR] No relevant information found in the documents."
240
 
241
- # Build context with source references (Enhancement 4)
242
  context = "\n\n".join([
243
  f"[Source: {meta['filename']}, Page {meta['page']}]\n{chunk}"
244
  for chunk, meta in zip(relevant_chunks, metadata)
245
  ])
246
 
247
- # Build messages array for Groq API
248
- messages = []
 
 
 
 
 
249
 
250
- # Add system message
251
- system_prompt = """You are a helpful assistant that answers questions based on the provided document context.
252
-
253
- Instructions:
254
- - Answer based strictly on the provided context
255
- - If the answer isn't in the context, say so clearly
256
- - Be concise and accurate
257
- - Reference specific sources when relevant"""
258
-
259
- messages.append({
260
- "role": "system",
261
- "content": system_prompt
262
- })
263
-
264
- # Add conversation history (last 3 exchanges for context)
265
- if history and len(history) > 0:
266
- # Get last 3 user messages (skip current one which isn't in history yet)
267
- recent_history = history[-3:] if len(history) > 3 else history
268
- for msg in recent_history:
269
- # History format from Gradio Chatbot with type="messages"
270
- if isinstance(msg, dict) and "role" in msg and "content" in msg:
271
- messages.append({
272
- "role": msg["role"],
273
- "content": msg["content"]
274
- })
275
 
276
  # Add current query with context
277
- user_message = f"""Context from documents:
278
- {context}
279
-
280
- Question: {query}"""
281
-
282
  messages.append({
283
  "role": "user",
284
- "content": user_message
285
  })
286
 
287
- # Call Groq API with updated model
288
  chat_completion = client.chat.completions.create(
289
  messages=messages,
290
- model="llama-3.1-8b-instant", # Updated model
291
  temperature=0.3,
292
  max_tokens=1024,
293
  )
294
 
295
  answer = chat_completion.choices[0].message.content
296
 
297
- # Add source references to answer (Enhancement 4)
298
  sources = "\n\n**Sources:**\n" + "\n".join([
299
  f"- {meta['filename']} (Page {meta['page']})"
300
  for meta in metadata
@@ -302,10 +276,10 @@ Question: {query}"""
302
 
303
  full_answer = answer + sources
304
 
305
- # Log query (Enhancement 8)
306
  document_store['conversation_history'].append({
307
  'timestamp': datetime.now().isoformat(),
308
- 'query': query,
309
  'answer': answer,
310
  'sources': [f"{m['filename']}_p{m['page']}" for m in metadata]
311
  })
@@ -315,12 +289,10 @@ Question: {query}"""
315
  except Exception as e:
316
  error_msg = str(e)
317
  print(f"Error generating answer: {error_msg}")
318
- if "api_key" in error_msg.lower() or "authentication" in error_msg.lower():
319
- return "[ERROR] Invalid or missing GROQ_API_KEY. Please set it in your Space settings (Settings > Variables)."
320
- return f"[ERROR] Failed to generate answer: {error_msg}"
321
 
322
  def download_chat_history():
323
- """Download conversation history as JSON (Enhancement 7)"""
324
  if not document_store['conversation_history']:
325
  return None
326
 
@@ -328,136 +300,89 @@ def download_chat_history():
328
  history_file = "chat_history.json"
329
  with open(history_file, 'w', encoding='utf-8') as f:
330
  json.dump(document_store['conversation_history'], f, indent=2)
331
-
332
  return history_file
333
  except Exception as e:
334
  print(f"Error downloading history: {e}")
335
  return None
336
 
337
- def clear_history():
338
- """Clear conversation history"""
339
- document_store['conversation_history'] = []
340
- return None, "History cleared successfully!"
341
-
342
  # Build Gradio Interface
343
- def create_demo():
344
- with gr.Blocks(title="Enhanced RAG Chatbot") as demo:
345
-
346
- gr.Markdown("""
347
- # Enhanced RAG-Based Chatbot
348
- Upload PDF/DOCX files and ask questions about their content!
349
-
350
- **Features:**
351
- - Multiple file support (PDF & DOCX)
352
- - Semantic embeddings with sentence-transformers
353
- - Document preview & summaries
354
- - Conversational memory
355
- - Source references with page numbers
356
- - Download chat history
357
- """)
358
-
359
- with gr.Row():
360
- with gr.Column(scale=1):
361
- file_upload = gr.File(
362
- label="Upload Documents (PDF/DOCX)",
363
- file_count="multiple",
364
- file_types=[".pdf", ".docx"]
365
- )
366
- process_btn = gr.Button("Process Documents", variant="primary")
367
- process_output = gr.Markdown(label="Processing Status")
368
-
369
- gr.Markdown("### Chat History Options")
370
- download_btn = gr.Button("Download History (JSON)")
371
- download_file = gr.File(label="Download", visible=True)
372
- clear_btn = gr.Button("Clear History")
373
- clear_msg = gr.Textbox(label="Status", interactive=False, visible=False)
374
-
375
- with gr.Column(scale=2):
376
- chatbot = gr.Chatbot(
377
- label="Conversation",
378
- height=500,
379
- type="messages"
380
- )
381
- query_input = gr.Textbox(
382
- label="Ask a question",
383
- placeholder="Type your question here and press Enter...",
384
- lines=2
385
- )
386
- submit_btn = gr.Button("Ask Question", variant="primary")
387
-
388
- # Event handlers
389
- process_btn.click(
390
- fn=process_files,
391
- inputs=[file_upload],
392
- outputs=[process_output]
393
- )
394
-
395
- def respond(message, chat_history):
396
- """Handle user message and generate response"""
397
- if not message or not message.strip():
398
- return chat_history
399
-
400
- # Ensure chat_history is a list
401
- if chat_history is None:
402
- chat_history = []
403
-
404
- # Generate answer
405
- bot_response = generate_answer(message, chat_history)
406
-
407
- # Append user message and bot response in Gradio messages format
408
- chat_history.append({"role": "user", "content": message})
409
- chat_history.append({"role": "assistant", "content": bot_response})
410
 
411
- return chat_history
412
-
413
- # Submit button and enter key
414
- submit_btn.click(
415
- fn=respond,
416
- inputs=[query_input, chatbot],
417
- outputs=[chatbot]
418
- ).then(
419
- lambda: "",
420
- outputs=[query_input]
421
- )
422
-
423
- query_input.submit(
424
- fn=respond,
425
- inputs=[query_input, chatbot],
426
- outputs=[chatbot]
427
- ).then(
428
- lambda: "",
429
- outputs=[query_input]
430
- )
431
-
432
- # Download history
433
- download_btn.click(
434
- fn=download_chat_history,
435
- outputs=[download_file]
436
- )
437
-
438
- # Clear history
439
- clear_btn.click(
440
- fn=clear_history,
441
- outputs=[chatbot, clear_msg]
442
- )
443
-
444
- gr.Markdown("""
445
- ---
446
- ### How RAG Works:
447
- 1. **Retrieval**: Finds relevant text chunks from uploaded documents using semantic similarity
448
- 2. **Augmentation**: Combines retrieved context with your question
449
- 3. **Generation**: Uses Groq LLM to generate accurate answers based on the context
450
-
451
- ### Usage Instructions:
452
- 1. Upload one or more PDF/DOCX files
453
- 2. Click "Process Documents" and wait for confirmation
454
- 3. Ask questions about the content
455
- 4. Download chat history anytime as JSON
456
- """)
457
 
458
- return demo
 
 
 
 
 
459
 
460
- # Launch the app
461
  if __name__ == "__main__":
462
- demo = create_demo()
463
- demo.launch(ssr_mode=False)
 
16
  if not api_key:
17
  print("WARNING: GROQ_API_KEY not found in environment variables")
18
  else:
 
19
  import httpx
20
  client = Groq(
21
  api_key=api_key,
 
46
  def extract_text_from_pdf(pdf_file):
47
  """Extract text from PDF file"""
48
  try:
 
49
  if isinstance(pdf_file, str):
50
  pdf_reader = PyPDF2.PdfReader(pdf_file)
51
  filename = os.path.basename(pdf_file)
 
56
  text_data = []
57
  for page_num, page in enumerate(pdf_reader.pages):
58
  text = page.extract_text()
59
+ if text and text.strip():
60
  text_data.append({
61
  'text': text,
62
  'page': page_num + 1,
 
71
  def extract_text_from_docx(docx_file):
72
  """Extract text from DOCX file (Enhancement 5)"""
73
  try:
 
74
  if isinstance(docx_file, str):
75
  doc = docx.Document(docx_file)
76
  filename = os.path.basename(docx_file)
 
100
 
101
  for i in range(0, len(words), chunk_size - overlap):
102
  chunk = ' '.join(words[i:i + chunk_size])
103
+ if len(chunk.strip()) > 50:
104
  chunks.append(chunk)
105
  metadata.append({
106
  'page': data['page'],
 
136
  file_summaries = []
137
 
138
  for file in files:
 
139
  if isinstance(file, str):
140
  file_path = file
141
  file_ext = os.path.splitext(file)[1].lower()
 
155
 
156
  all_text_data.extend(text_data)
157
 
 
158
  total_text = ' '.join([d['text'] for d in text_data if d['text']])
159
  filename = os.path.basename(file_path)
160
  file_summaries.append(f"- **{filename}**: {len(text_data)} pages, {len(total_text)} characters")
 
162
  if not all_text_data:
163
  return "[ERROR] No valid text extracted from uploaded files."
164
 
 
165
  chunks, metadata = chunk_text(all_text_data)
166
 
167
  if not chunks:
 
202
  print(f"Error retrieving chunks: {e}")
203
  return [], []
204
 
205
+ def chat(message, history):
206
+ """Main chat function that handles RAG pipeline"""
207
  global client
208
 
209
+ # Reinitialize client if needed
210
  if client is None:
211
  try:
212
  api_key = os.environ.get("GROQ_API_KEY")
 
218
  )
219
  print("Groq client reinitialized successfully")
220
  else:
221
+ return "[ERROR] Groq API client not initialized. Please set GROQ_API_KEY in your Space settings."
222
  except Exception as e:
223
  return f"[ERROR] Failed to initialize Groq client: {str(e)}"
224
 
225
  if not document_store['chunks']:
226
+ return "[WARNING] Please upload and process documents first."
227
 
228
  try:
229
  # Retrieve relevant context
230
+ relevant_chunks, metadata = retrieve_relevant_chunks(message, top_k=3)
231
 
232
  if not relevant_chunks:
233
  return "[ERROR] No relevant information found in the documents."
234
 
235
+ # Build context
236
  context = "\n\n".join([
237
  f"[Source: {meta['filename']}, Page {meta['page']}]\n{chunk}"
238
  for chunk, meta in zip(relevant_chunks, metadata)
239
  ])
240
 
241
+ # Build messages for Groq API
242
+ messages = [
243
+ {
244
+ "role": "system",
245
+ "content": "You are a helpful assistant that answers questions based on provided document context. Be concise and accurate."
246
+ }
247
+ ]
248
 
249
+ # Add conversation history
250
+ if history:
251
+ for user_msg, bot_msg in history[-3:]: # Last 3 exchanges
252
+ messages.append({"role": "user", "content": user_msg})
253
+ messages.append({"role": "assistant", "content": bot_msg})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
 
255
  # Add current query with context
 
 
 
 
 
256
  messages.append({
257
  "role": "user",
258
+ "content": f"Context from documents:\n{context}\n\nQuestion: {message}"
259
  })
260
 
261
+ # Call Groq API
262
  chat_completion = client.chat.completions.create(
263
  messages=messages,
264
+ model="llama-3.1-8b-instant",
265
  temperature=0.3,
266
  max_tokens=1024,
267
  )
268
 
269
  answer = chat_completion.choices[0].message.content
270
 
271
+ # Add sources
272
  sources = "\n\n**Sources:**\n" + "\n".join([
273
  f"- {meta['filename']} (Page {meta['page']})"
274
  for meta in metadata
 
276
 
277
  full_answer = answer + sources
278
 
279
+ # Log query
280
  document_store['conversation_history'].append({
281
  'timestamp': datetime.now().isoformat(),
282
+ 'query': message,
283
  'answer': answer,
284
  'sources': [f"{m['filename']}_p{m['page']}" for m in metadata]
285
  })
 
289
  except Exception as e:
290
  error_msg = str(e)
291
  print(f"Error generating answer: {error_msg}")
292
+ return f"[ERROR] {error_msg}"
 
 
293
 
294
  def download_chat_history():
295
+ """Download conversation history as JSON"""
296
  if not document_store['conversation_history']:
297
  return None
298
 
 
300
  history_file = "chat_history.json"
301
  with open(history_file, 'w', encoding='utf-8') as f:
302
  json.dump(document_store['conversation_history'], f, indent=2)
 
303
  return history_file
304
  except Exception as e:
305
  print(f"Error downloading history: {e}")
306
  return None
307
 
 
 
 
 
 
308
  # Build Gradio Interface
309
+ with gr.Blocks(title="Enhanced RAG Chatbot") as demo:
310
+
311
+ gr.Markdown("""
312
+ # Enhanced RAG-Based Chatbot
313
+ Upload PDF/DOCX files and ask questions about their content!
314
+
315
+ **Features:**
316
+ - Multiple file support (PDF & DOCX)
317
+ - Semantic embeddings with sentence-transformers
318
+ - Document preview & summaries
319
+ - Conversational memory
320
+ - Source references with page numbers
321
+ - Download chat history
322
+ """)
323
+
324
+ with gr.Row():
325
+ with gr.Column(scale=1):
326
+ file_upload = gr.File(
327
+ label="Upload Documents (PDF/DOCX)",
328
+ file_count="multiple",
329
+ file_types=[".pdf", ".docx"]
330
+ )
331
+ process_btn = gr.Button("Process Documents", variant="primary")
332
+ process_output = gr.Markdown(label="Processing Status")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
 
334
+ gr.Markdown("### Chat History Options")
335
+ download_btn = gr.Button("Download History (JSON)")
336
+ download_file = gr.File(label="Download", visible=True)
337
+ clear_btn = gr.Button("Clear Chat")
338
+
339
+ with gr.Column(scale=2):
340
+ chatbot = gr.Chatbot(label="Conversation", height=500)
341
+ msg = gr.Textbox(
342
+ label="Ask a question",
343
+ placeholder="Type your question here...",
344
+ lines=2
345
+ )
346
+ submit = gr.Button("Ask Question", variant="primary")
347
+
348
+ # Event handlers
349
+ process_btn.click(
350
+ fn=process_files,
351
+ inputs=[file_upload],
352
+ outputs=[process_output]
353
+ )
354
+
355
+ # Chat interactions
356
+ msg.submit(chat, [msg, chatbot], [chatbot]).then(
357
+ lambda: gr.update(value=""), None, [msg]
358
+ )
359
+
360
+ submit.click(chat, [msg, chatbot], [chatbot]).then(
361
+ lambda: gr.update(value=""), None, [msg]
362
+ )
363
+
364
+ # Clear chat
365
+ clear_btn.click(lambda: None, None, chatbot)
366
+
367
+ # Download history
368
+ download_btn.click(
369
+ fn=download_chat_history,
370
+ outputs=[download_file]
371
+ )
372
+
373
+ gr.Markdown("""
374
+ ---
375
+ ### How RAG Works:
376
+ 1. **Retrieval**: Finds relevant text chunks from uploaded documents using semantic similarity
377
+ 2. **Augmentation**: Combines retrieved context with your question
378
+ 3. **Generation**: Uses Groq LLM to generate accurate answers based on the context
 
379
 
380
+ ### Usage Instructions:
381
+ 1. Upload one or more PDF/DOCX files
382
+ 2. Click "Process Documents" and wait for confirmation
383
+ 3. Ask questions about the content
384
+ 4. Download chat history anytime as JSON
385
+ """)
386
 
 
387
  if __name__ == "__main__":
388
+ demo.launch()