msmaje committed on
Commit
6c0a884
·
verified ·
1 Parent(s): 30050c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +284 -77
app.py CHANGED
@@ -17,19 +17,22 @@ try:
17
  from langchain_community.vectorstores import FAISS
18
  from langchain.prompts import PromptTemplate
19
  from langchain.chains import RetrievalQA
20
- from langchain_community.llms import HuggingFaceHub
21
  LANGCHAIN_AVAILABLE = True
22
  except ImportError as e:
23
  logger.error(f"LangChain import error: {e}")
24
  LANGCHAIN_AVAILABLE = False
25
 
 
 
 
 
26
  # Global variables for the RAG system
27
  vectorstore = None
28
  retrieval_qa = None
29
  embedding_model = None
30
 
31
  # Check for pre-existing PDF folder
32
- PDF_FOLDER_PATH = "./pdfs" # Default folder for PDFs in the space
33
  PRELOADED_PDFS = os.path.exists(PDF_FOLDER_PATH) and len(os.listdir(PDF_FOLDER_PATH)) > 0
34
 
35
  def initialize_models():
@@ -48,19 +51,25 @@ def initialize_models():
48
  if not hf_token:
49
  return False, "❌ HuggingFace API token not found in environment variables"
50
 
51
- # Initialize LLM
52
- llm = HuggingFaceHub(
53
- repo_id="microsoft/DialoGPT-medium",
54
- model_kwargs={"temperature": 0.7, "max_new_tokens": 512},
55
- huggingfacehub_api_token=hf_token
56
- )
57
-
58
  return True, "βœ… Models initialized successfully"
59
 
60
  except Exception as e:
61
  logger.error(f"Model initialization error: {e}")
62
  return False, f"❌ Error initializing models: {str(e)}"
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  def load_preloaded_pdfs(chunk_size=1000, chunk_overlap=200):
65
  """Load PDFs from the pre-existing folder"""
66
  global vectorstore, retrieval_qa, embedding_model
@@ -112,13 +121,8 @@ Helpful Answer:
112
  template=prompt_template
113
  )
114
 
115
- # Initialize LLM
116
- hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
117
- llm = HuggingFaceHub(
118
- repo_id="google/flan-t5-base",
119
- model_kwargs={"temperature": 0.7, "max_new_tokens": 512},
120
- huggingfacehub_api_token=hf_token
121
- )
122
 
123
  # Create RetrievalQA chain
124
  retrieval_qa = RetrievalQA.from_chain_type(
@@ -175,6 +179,7 @@ def extract_zip_to_pdfs(zip_file):
175
 
176
  except Exception as e:
177
  return f"❌ Error extracting ZIP file: {str(e)}"
 
178
  def process_pdfs(pdf_files, chunk_size, chunk_overlap):
179
  """Process uploaded PDF files and create vector store"""
180
  global vectorstore, retrieval_qa, embedding_model
@@ -235,13 +240,8 @@ Helpful Answer:
235
  template=prompt_template
236
  )
237
 
238
- # Initialize LLM
239
- hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
240
- llm = HuggingFaceHub(
241
- repo_id="google/flan-t5-base",
242
- model_kwargs={"temperature": 0.7, "max_new_tokens": 512},
243
- huggingfacehub_api_token=hf_token
244
- )
245
 
246
  # Create RetrievalQA chain
247
  retrieval_qa = RetrievalQA.from_chain_type(
@@ -294,71 +294,256 @@ def answer_question(question):
294
  logger.error(f"Question answering error: {e}")
295
  return f"❌ Error answering question: {str(e)}", ""
296
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  def create_interface():
298
- """Create the Gradio interface"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
 
300
- with gr.Blocks(title="PDF RAG System", theme=gr.themes.Soft()) as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  gr.Markdown("""
302
  # πŸ“š PDF Question Answering System
303
 
304
  Upload your PDF documents and ask questions about their content!
305
 
306
- **Instructions:**
307
- 1. **Option A**: Upload individual PDF files and click "Process PDFs"
308
- 2. **Option B**: Upload a ZIP file containing PDFs and extract them
309
- 3. **Option C**: Use pre-loaded PDFs (if available in ./pdfs folder)
310
- 4. Ask questions about your documents
311
  """)
312
 
313
  # Check for pre-loaded PDFs
314
  if PRELOADED_PDFS:
315
- gr.Markdown("πŸŽ‰ **Pre-loaded PDFs detected!** You can use the 'Load Pre-existing PDFs' button.")
316
-
 
 
 
 
 
 
317
  with gr.Row():
318
- with gr.Column(scale=1):
319
- gr.Markdown("### πŸ“„ Upload & Settings")
 
320
 
321
  with gr.Tabs():
322
- with gr.TabItem("πŸ“ Individual PDFs"):
323
  pdf_files = gr.File(
324
- label="Upload PDF Files",
325
  file_count="multiple",
326
  file_types=[".pdf"],
327
- height=150
 
 
 
 
 
328
  )
329
- process_btn = gr.Button("πŸ”„ Process PDFs", variant="primary")
330
 
331
  with gr.TabItem("πŸ—‚οΈ ZIP Upload"):
332
  zip_file = gr.File(
333
- label="Upload ZIP File (containing PDFs)",
334
  file_count="single",
335
  file_types=[".zip"],
336
- height=100
 
 
 
 
 
 
 
 
 
 
337
  )
338
- extract_btn = gr.Button("πŸ“¦ Extract ZIP to PDFs Folder", variant="secondary")
339
- extract_output = gr.Textbox(label="Extraction Status", lines=2)
340
 
341
  with gr.TabItem("πŸ’Ύ Pre-loaded"):
342
  if PRELOADED_PDFS:
343
  pdf_list = [f for f in os.listdir(PDF_FOLDER_PATH) if f.endswith('.pdf')]
344
- gr.Markdown(f"**Found {len(pdf_list)} PDF files:**")
345
- for pdf in pdf_list[:10]: # Show first 10
346
- gr.Markdown(f"- {pdf}")
347
- if len(pdf_list) > 10:
348
- gr.Markdown(f"... and {len(pdf_list) - 10} more files")
 
 
 
 
 
349
  else:
350
- gr.Markdown("No pre-loaded PDFs found. Place PDF files in `./pdfs/` folder.")
351
 
352
- preload_btn = gr.Button("πŸ“š Load Pre-existing PDFs", variant="primary",
353
- interactive=PRELOADED_PDFS)
 
 
 
 
354
 
355
- with gr.Row():
 
356
  chunk_size = gr.Slider(
357
  minimum=200,
358
  maximum=2000,
359
  value=1000,
360
  step=100,
361
- label="Chunk Size"
 
362
  )
363
 
364
  chunk_overlap = gr.Slider(
@@ -366,38 +551,52 @@ def create_interface():
366
  maximum=500,
367
  value=200,
368
  step=50,
369
- label="Chunk Overlap"
 
370
  )
371
 
372
- process_output = gr.Textbox(label="Processing Status", lines=4)
 
 
 
 
 
 
373
 
374
- with gr.Column(scale=2):
 
375
  gr.Markdown("### ❓ Ask Questions")
376
 
377
  question_input = gr.Textbox(
378
  label="Your Question",
379
  placeholder="What would you like to know about your documents?",
380
- lines=2
 
381
  )
382
 
383
- ask_btn = gr.Button("πŸ€” Ask Question", variant="secondary")
 
 
 
 
384
 
 
385
  with gr.Row():
386
- with gr.Column():
387
- answer_output = gr.Textbox(
388
- label="Answer",
389
- lines=8,
390
- max_lines=15
391
- )
392
 
393
- with gr.Column():
394
- sources_output = gr.Textbox(
395
- label="Sources",
396
- lines=8,
397
- max_lines=15
398
- )
399
 
400
- # Event handlers
401
  process_btn.click(
402
  fn=process_pdfs,
403
  inputs=[pdf_files, chunk_size, chunk_overlap],
@@ -428,14 +627,22 @@ def create_interface():
428
  outputs=[answer_output, sources_output]
429
  )
430
 
431
- # Example questions
 
 
 
 
 
 
 
 
 
 
432
  gr.Markdown("""
433
- ### πŸ’‘ Example Questions:
434
- - What are the main topics covered in these documents?
435
- - Can you summarize the key findings?
436
- - What data is available for [specific topic]?
437
- - What are the differences between [X] and [Y]?
438
- - What are the differences in the uninsured rate by state in 2022?
439
  """)
440
 
441
  return demo
 
17
  from langchain_community.vectorstores import FAISS
18
  from langchain.prompts import PromptTemplate
19
  from langchain.chains import RetrievalQA
20
+ from langchain_community.llms import HuggingFaceEndpoint
21
  LANGCHAIN_AVAILABLE = True
22
  except ImportError as e:
23
  logger.error(f"LangChain import error: {e}")
24
  LANGCHAIN_AVAILABLE = False
25
 
26
# Folder that holds PDFs shipped with the app. It is created eagerly so the
# directory listing below cannot fail when the folder is missing.
PDF_FOLDER_PATH = "./pdfs"
os.makedirs(PDF_FOLDER_PATH, exist_ok=True)

# Global variables for the RAG system. They start as None and are declared
# ``global`` by load_preloaded_pdfs() and process_pdfs().
vectorstore = None
retrieval_qa = None
embedding_model = None

# True only when the folder contains at least one ``.pdf`` file.
# The previous check counted *any* directory entry, so a stray non-PDF file
# made the UI announce pre-loaded PDFs while its file list (which filters on
# the ``.pdf`` suffix) showed none. The ``os.path.exists`` test was dropped
# because the folder is guaranteed to exist after ``os.makedirs`` above.
PRELOADED_PDFS = any(
    entry.endswith('.pdf') for entry in os.listdir(PDF_FOLDER_PATH)
)
37
 
38
  def initialize_models():
 
51
  if not hf_token:
52
  return False, "❌ HuggingFace API token not found in environment variables"
53
 
 
 
 
 
 
 
 
54
  return True, "βœ… Models initialized successfully"
55
 
56
  except Exception as e:
57
  logger.error(f"Model initialization error: {e}")
58
  return False, f"❌ Error initializing models: {str(e)}"
59
 
60
def create_llm():
    """Build and return the HuggingFaceEndpoint LLM instance.

    The API token is read from the ``HUGGINGFACEHUB_API_TOKEN`` environment
    variable; ``os.getenv`` yields ``None`` when it is unset, and that value
    is passed through to the endpoint unchanged.

    Returns:
        A ``HuggingFaceEndpoint`` configured for ``google/flan-t5-base`` with
        temperature 0.7 and at most 512 new tokens.
    """
    api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

    # Keep the endpoint settings in one place so they are easy to scan.
    endpoint_settings = {
        "repo_id": "google/flan-t5-base",
        "temperature": 0.7,
        "max_new_tokens": 512,
        "huggingfacehub_api_token": api_token,
    }
    return HuggingFaceEndpoint(**endpoint_settings)
72
+
73
  def load_preloaded_pdfs(chunk_size=1000, chunk_overlap=200):
74
  """Load PDFs from the pre-existing folder"""
75
  global vectorstore, retrieval_qa, embedding_model
 
121
  template=prompt_template
122
  )
123
 
124
+ # Initialize LLM using the new function
125
+ llm = create_llm()
 
 
 
 
 
126
 
127
  # Create RetrievalQA chain
128
  retrieval_qa = RetrievalQA.from_chain_type(
 
179
 
180
  except Exception as e:
181
  return f"❌ Error extracting ZIP file: {str(e)}"
182
+
183
  def process_pdfs(pdf_files, chunk_size, chunk_overlap):
184
  """Process uploaded PDF files and create vector store"""
185
  global vectorstore, retrieval_qa, embedding_model
 
240
  template=prompt_template
241
  )
242
 
243
+ # Initialize LLM using the new function
244
+ llm = create_llm()
 
 
 
 
 
245
 
246
  # Create RetrievalQA chain
247
  retrieval_qa = RetrievalQA.from_chain_type(
 
294
  logger.error(f"Question answering error: {e}")
295
  return f"❌ Error answering question: {str(e)}", ""
296
 
297
def get_device_info():
    """Return an HTML ``<script>`` snippet for basic mobile detection.

    The script treats a viewport of 768px or narrower as mobile and records
    the result in the ``--mobile-mode`` CSS custom property on the document
    root ('1' for mobile, '0' otherwise). It runs once immediately and again
    on every window ``resize`` event.

    Returns:
        The snippet as a string; no detection happens on the Python side.
    """
    # NOTE(review): confirm that the component this string is rendered in
    # actually executes embedded <script> tags.
    device_script = """
    <script>
    function isMobile() {
        return window.innerWidth <= 768;
    }

    function adjustLayout() {
        const isMob = isMobile();
        const root = document.documentElement;
        if (isMob) {
            root.style.setProperty('--mobile-mode', '1');
        } else {
            root.style.setProperty('--mobile-mode', '0');
        }
    }

    window.addEventListener('resize', adjustLayout);
    adjustLayout();
    </script>
    """
    return device_script
319
+
320
  def create_interface():
321
+ """Create the fully responsive Gradio interface"""
322
+
323
+ # Custom CSS for better responsiveness
324
+ custom_css = """
325
+ /* Base responsive styles */
326
+ .gradio-container {
327
+ max-width: 100% !important;
328
+ margin: 0 auto;
329
+ padding: 10px;
330
+ }
331
+
332
+ /* Mobile-first responsive design */
333
+ @media (max-width: 768px) {
334
+ .gradio-container {
335
+ padding: 5px;
336
+ }
337
+
338
+ /* Stack elements vertically on mobile */
339
+ .gr-row {
340
+ flex-direction: column !important;
341
+ gap: 10px !important;
342
+ }
343
+
344
+ /* Full width on mobile */
345
+ .gr-column {
346
+ width: 100% !important;
347
+ min-width: 100% !important;
348
+ }
349
+
350
+ /* Adjust component spacing */
351
+ .gr-form > * {
352
+ margin-bottom: 8px !important;
353
+ }
354
+
355
+ /* Better button sizing */
356
+ .gr-button {
357
+ width: 100% !important;
358
+ min-height: 44px !important;
359
+ font-size: 14px !important;
360
+ }
361
+
362
+ /* Text input improvements */
363
+ .gr-textbox textarea {
364
+ min-height: 60px !important;
365
+ font-size: 16px !important; /* Prevents zoom on iOS */
366
+ }
367
+
368
+ /* File upload improvements */
369
+ .gr-file {
370
+ min-height: 100px !important;
371
+ }
372
+
373
+ /* Slider improvements */
374
+ .gr-slider {
375
+ margin: 10px 0 !important;
376
+ }
377
+
378
+ /* Tab improvements */
379
+ .gr-tab-nav {
380
+ flex-wrap: wrap !important;
381
+ }
382
+
383
+ .gr-tab-nav > button {
384
+ flex: 1 1 auto !important;
385
+ min-width: 80px !important;
386
+ font-size: 12px !important;
387
+ }
388
+ }
389
+
390
+ /* Tablet styles */
391
+ @media (min-width: 769px) and (max-width: 1024px) {
392
+ .gradio-container {
393
+ padding: 15px;
394
+ }
395
+
396
+ .gr-button {
397
+ min-height: 40px !important;
398
+ }
399
+ }
400
+
401
+ /* Desktop styles */
402
+ @media (min-width: 1025px) {
403
+ .gradio-container {
404
+ max-width: 1400px;
405
+ padding: 20px;
406
+ }
407
+ }
408
+
409
+ /* Improve readability */
410
+ .gr-markdown h1 {
411
+ font-size: clamp(1.5rem, 4vw, 2.5rem) !important;
412
+ line-height: 1.2 !important;
413
+ margin-bottom: 1rem !important;
414
+ }
415
 
416
+ .gr-markdown h3 {
417
+ font-size: clamp(1.1rem, 3vw, 1.4rem) !important;
418
+ margin: 1rem 0 0.5rem 0 !important;
419
+ }
420
+
421
+ .gr-markdown p, .gr-markdown li {
422
+ font-size: clamp(0.9rem, 2.5vw, 1rem) !important;
423
+ line-height: 1.5 !important;
424
+ }
425
+
426
+ /* Status text improvements */
427
+ .gr-textbox[data-testid="textbox"] {
428
+ font-family: monospace !important;
429
+ font-size: clamp(0.8rem, 2vw, 0.9rem) !important;
430
+ }
431
+
432
+ /* Accessibility improvements */
433
+ .gr-button:focus,
434
+ .gr-textbox:focus,
435
+ .gr-file:focus {
436
+ outline: 2px solid #2563eb !important;
437
+ outline-offset: 2px !important;
438
+ }
439
+
440
+ /* Dark mode considerations */
441
+ @media (prefers-color-scheme: dark) {
442
+ .gr-button {
443
+ border: 1px solid #374151 !important;
444
+ }
445
+ }
446
+ """
447
+
448
+ with gr.Blocks(
449
+ title="PDF RAG System",
450
+ theme=gr.themes.Soft(),
451
+ css=custom_css
452
+ ) as demo:
453
+
454
+ # Add device detection script
455
+ gr.HTML(get_device_info())
456
+
457
  gr.Markdown("""
458
  # πŸ“š PDF Question Answering System
459
 
460
  Upload your PDF documents and ask questions about their content!
461
 
462
+ **Quick Start:**
463
+ 1. Upload PDFs or use pre-loaded ones
464
+ 2. Click Process to prepare your documents
465
+ 3. Ask questions about the content
 
466
  """)
467
 
468
  # Check for pre-loaded PDFs
469
  if PRELOADED_PDFS:
470
+ gr.Markdown("""
471
+ <div style="background: linear-gradient(90deg, #10b981, #059669);
472
+ color: white; padding: 12px; border-radius: 8px; margin: 10px 0;">
473
+ πŸŽ‰ <strong>Pre-loaded PDFs detected!</strong> Use the 'Load Pre-existing PDFs' button to get started quickly.
474
+ </div>
475
+ """)
476
+
477
+ # Main layout - responsive columns
478
  with gr.Row():
479
+ # Left column - Upload & Settings (collapses to full width on mobile)
480
+ with gr.Column(scale=1, min_width=300):
481
+ gr.Markdown("### πŸ“„ Document Management")
482
 
483
  with gr.Tabs():
484
+ with gr.TabItem("πŸ“ Upload PDFs"):
485
  pdf_files = gr.File(
486
+ label="Select PDF Files",
487
  file_count="multiple",
488
  file_types=[".pdf"],
489
+ height=120
490
+ )
491
+ process_btn = gr.Button(
492
+ "πŸ”„ Process PDFs",
493
+ variant="primary",
494
+ size="lg"
495
  )
 
496
 
497
  with gr.TabItem("πŸ—‚οΈ ZIP Upload"):
498
  zip_file = gr.File(
499
+ label="Upload ZIP (with PDFs)",
500
  file_count="single",
501
  file_types=[".zip"],
502
+ height=80
503
+ )
504
+ extract_btn = gr.Button(
505
+ "πŸ“¦ Extract ZIP",
506
+ variant="secondary",
507
+ size="lg"
508
+ )
509
+ extract_output = gr.Textbox(
510
+ label="Extraction Status",
511
+ lines=2,
512
+ max_lines=3
513
  )
 
 
514
 
515
  with gr.TabItem("πŸ’Ύ Pre-loaded"):
516
  if PRELOADED_PDFS:
517
  pdf_list = [f for f in os.listdir(PDF_FOLDER_PATH) if f.endswith('.pdf')]
518
+ gr.Markdown(f"**Found {len(pdf_list)} PDF files**")
519
+
520
+ # Show files in a more mobile-friendly way
521
+ if len(pdf_list) <= 5:
522
+ for pdf in pdf_list:
523
+ gr.Markdown(f"πŸ“„ {pdf}")
524
+ else:
525
+ for pdf in pdf_list[:3]:
526
+ gr.Markdown(f"πŸ“„ {pdf}")
527
+ gr.Markdown(f"*... and {len(pdf_list) - 3} more files*")
528
  else:
529
+ gr.Markdown("No pre-loaded PDFs found.")
530
 
531
+ preload_btn = gr.Button(
532
+ "πŸ“š Load Pre-existing PDFs",
533
+ variant="primary",
534
+ size="lg",
535
+ interactive=PRELOADED_PDFS
536
+ )
537
 
538
+ # Settings section - collapsible on mobile
539
+ with gr.Accordion("βš™οΈ Advanced Settings", open=False):
540
  chunk_size = gr.Slider(
541
  minimum=200,
542
  maximum=2000,
543
  value=1000,
544
  step=100,
545
+ label="Chunk Size",
546
+ info="Larger chunks = more context, smaller = more precise"
547
  )
548
 
549
  chunk_overlap = gr.Slider(
 
551
  maximum=500,
552
  value=200,
553
  step=50,
554
+ label="Chunk Overlap",
555
+ info="Overlap between text chunks"
556
  )
557
 
558
+ # Status display
559
+ process_output = gr.Textbox(
560
+ label="πŸ“Š Processing Status",
561
+ lines=3,
562
+ max_lines=5,
563
+ placeholder="Status updates will appear here..."
564
+ )
565
 
566
+ # Right column - Q&A Section (collapses to full width on mobile)
567
+ with gr.Column(scale=2, min_width=400):
568
  gr.Markdown("### ❓ Ask Questions")
569
 
570
  question_input = gr.Textbox(
571
  label="Your Question",
572
  placeholder="What would you like to know about your documents?",
573
+ lines=2,
574
+ max_lines=4
575
  )
576
 
577
+ ask_btn = gr.Button(
578
+ "πŸ€” Ask Question",
579
+ variant="secondary",
580
+ size="lg"
581
+ )
582
 
583
+ # Results section - stack vertically on mobile
584
  with gr.Row():
585
+ answer_output = gr.Textbox(
586
+ label="πŸ’‘ Answer",
587
+ lines=6,
588
+ max_lines=12,
589
+ placeholder="Your answer will appear here..."
590
+ )
591
 
592
+ sources_output = gr.Textbox(
593
+ label="πŸ“š Sources",
594
+ lines=6,
595
+ max_lines=12,
596
+ placeholder="Source references will appear here..."
597
+ )
598
 
599
+ # Event handlers (unchanged)
600
  process_btn.click(
601
  fn=process_pdfs,
602
  inputs=[pdf_files, chunk_size, chunk_overlap],
 
627
  outputs=[answer_output, sources_output]
628
  )
629
 
630
+ # Example questions - more mobile-friendly
631
+ with gr.Accordion("πŸ’‘ Example Questions", open=False):
632
+ gr.Markdown("""
633
+ **Try asking:**
634
+ - What are the main topics in these documents?
635
+ - Can you summarize the key findings?
636
+ - What data is available for [specific topic]?
637
+ - What are the differences between X and Y?
638
+ """)
639
+
640
+ # Footer with helpful info
641
  gr.Markdown("""
642
+ ---
643
+ <div style="text-align: center; color: #666; font-size: 0.9em;">
644
+ πŸ’‘ <strong>Tip:</strong> For best results, ask specific questions about your documents
645
+ </div>
 
 
646
  """)
647
 
648
  return demo