shukdevdattaEX committed on
Commit
b7e74ea
·
verified ·
1 Parent(s): 2b5cbea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +289 -181
app.py CHANGED
@@ -7,6 +7,7 @@ from typing import List, Tuple, Optional
7
  import time
8
  from PIL import Image
9
  import io
 
10
 
11
  # Global client variable
12
  client = None
@@ -31,59 +32,132 @@ def encode_image(image_path: str) -> str:
31
  with open(image_path, "rb") as image_file:
32
  return base64.b64encode(image_file.read()).decode('utf-8')
33
 
34
- def pdf_to_images(pdf_path: str) -> List[Image.Image]:
35
- """Convert PDF to images using pdf2image"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  try:
37
  from pdf2image import convert_from_path
38
  images = convert_from_path(pdf_path, dpi=200)
39
  return images
40
- except ImportError:
41
- # If pdf2image is not available, try PyMuPDF (fitz)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  try:
43
- import fitz
44
- doc = fitz.open(pdf_path)
45
- images = []
46
- for page_num in range(len(doc)):
47
- page = doc[page_num]
48
- pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
49
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
50
- images.append(img)
51
- doc.close()
52
- return images
53
- except ImportError:
54
- raise Exception("Please install pdf2image or PyMuPDF: pip install pdf2image PyMuPDF")
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  def image_to_base64(image: Image.Image, format: str = "PNG") -> str:
57
  """Convert PIL Image to base64"""
58
  buffered = io.BytesIO()
59
- image.save(buffered, format=format)
 
 
 
 
 
 
 
 
 
60
  return base64.b64encode(buffered.getvalue()).decode('utf-8')
61
 
62
- def process_file(file_path: str) -> List[dict]:
63
- """Process a file and return content blocks for API"""
 
 
 
64
  file_extension = Path(file_path).suffix.lower()
 
65
  content_blocks = []
 
66
 
67
  try:
68
  if file_extension == '.pdf':
69
  # Convert PDF pages to images
70
- images = pdf_to_images(file_path)
71
- for img in images:
72
- base64_image = image_to_base64(img, format="PNG")
 
 
73
  content_blocks.append({
74
  "type": "image_url",
75
  "image_url": {
76
- "url": f"data:image/png;base64,{base64_image}"
77
  }
78
  })
 
79
  elif file_extension == '.txt':
80
  # Read text file
81
- with open(file_path, 'r', encoding='utf-8') as f:
82
- text_content = f.read()
 
 
 
 
 
 
 
83
  content_blocks.append({
84
  "type": "text",
85
- "text": f"[Text File Content]:\n{text_content}"
86
  })
 
87
  else:
88
  # Handle image files
89
  # Determine MIME type
@@ -99,20 +173,51 @@ def process_file(file_path: str) -> List[dict]:
99
  elif file_extension in ['.tiff', '.tif']:
100
  mime_type = "image/tiff"
101
 
102
- base64_image = encode_image(file_path)
103
- content_blocks.append({
104
- "type": "image_url",
105
- "image_url": {
106
- "url": f"data:{mime_type};base64,{base64_image}"
107
- }
108
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  except Exception as e:
 
110
  content_blocks.append({
111
  "type": "text",
112
- "text": f"[Error processing file {Path(file_path).name}: {str(e)}]"
113
  })
 
114
 
115
- return content_blocks
116
 
117
  def process_message(
118
  message: str,
@@ -121,15 +226,20 @@ def process_message(
121
  enable_reasoning: bool = True,
122
  temperature: float = 0.7,
123
  max_tokens: int = 2000
124
- ) -> Tuple[List[Tuple[str, str]], str]:
125
- """Process user message and generate response"""
 
 
 
126
  global client
127
 
128
  if client is None:
129
- return history + [(message, "❌ Please configure your API key first in the Settings tab.")], ""
130
 
131
  if not message.strip() and not files:
132
- return history + [(message, "⚠️ Please enter a message or upload files.")], ""
 
 
133
 
134
  try:
135
  # Build messages array
@@ -147,17 +257,28 @@ def process_message(
147
  # Process files if provided
148
  if files:
149
  file_count = 0
 
 
150
  for file in files:
151
  if file is not None:
152
- file_blocks = process_file(file)
153
  content.extend(file_blocks)
 
154
  file_count += 1
 
 
 
 
 
 
 
 
155
 
156
  if file_count > 0:
157
- content.insert(0, {
158
- "type": "text",
159
- "text": f"[{file_count} file(s) uploaded]"
160
- })
161
 
162
  # Add text message
163
  if message.strip():
@@ -187,20 +308,71 @@ def process_message(
187
  if enable_reasoning and hasattr(response.choices[0].message, 'reasoning_details'):
188
  reasoning_details = response.choices[0].message.reasoning_details
189
  if reasoning_details:
190
- reasoning_text = f"\n\n**🧠 Reasoning Process:**\n{json.dumps(reasoning_details, indent=2)}"
191
 
192
  # Update history
193
  new_history = history + [(message, assistant_message)]
194
 
195
- return new_history, reasoning_text
 
 
 
196
 
197
  except Exception as e:
198
  error_message = f"❌ Error: {str(e)}"
199
- return history + [(message, error_message)], ""
200
 
201
  def clear_conversation():
202
  """Clear conversation history"""
203
- return [], ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
  # Custom CSS for premium design
206
  custom_css = """
@@ -375,6 +547,13 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
375
  elem_classes=["chatbot"]
376
  )
377
 
 
 
 
 
 
 
 
378
  with gr.Row():
379
  msg = gr.Textbox(
380
  label="Your Message",
@@ -454,11 +633,31 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
454
  info="Maximum length of response"
455
  )
456
 
 
 
 
 
 
 
 
 
 
 
 
 
457
  gr.HTML("""
458
  <div class='info-box' style='margin-top: 20px;'>
459
- <strong>πŸ“¦ Required Dependencies for PDF Support:</strong><br>
460
- <code>pip install pdf2image PyMuPDF pillow</code><br><br>
461
- <strong>Note:</strong> pdf2image also requires poppler-utils installed on your system.
 
 
 
 
 
 
 
 
462
  </div>
463
  """)
464
 
@@ -512,7 +711,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
512
  <p style='margin: 10px 0 0 0; color: #666; line-height: 1.6;'>
513
  β€’ Multi-page support<br>
514
  β€’ Automatic conversion to images<br>
515
- β€’ Layout analysis<br>
516
  β€’ Scanned documents<br>
517
  β€’ Forms and tables
518
  </p>
@@ -523,138 +722,42 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
523
  β€’ Plain text documents<br>
524
  β€’ Code snippets<br>
525
  β€’ Notes and logs<br>
526
- β€’ Data files<br>
527
  β€’ Configuration files
528
  </p>
529
  </div>
530
  </div>
531
  </div>
532
 
533
- <div style='margin-top: 30px; text-align: center;'>
534
- <h2 style='color: #667eea; font-size: 2em; margin-bottom: 20px;'>🎯 What the Model Excels At</h2>
535
- </div>
536
-
537
- <div class='capability-card' style='background: linear-gradient(135deg, #e8f5e9 0%, #f1f8e9 100%);'>
538
- <h3 style='color: #2e7d32;'>πŸ“Š 1. Document Intelligence</h3>
539
- <div style='background: white; padding: 20px; border-radius: 10px; margin-top: 15px;'>
540
- <ul style='color: #666; line-height: 2; margin: 0; padding-left: 20px;'>
541
- <li><strong>Multi-page PDF analysis</strong> - Process entire documents at once</li>
542
- <li><strong>Scanned documents</strong> - Extract text from scans and photos of documents</li>
543
- <li><strong>Forms and tables</strong> - Understand structured data layouts</li>
544
- <li><strong>Financial reports</strong> - Parse complex financial documents</li>
545
- <li><strong>Receipts and invoices</strong> - Extract itemized information</li>
546
- <li><strong>Academic papers</strong> - Understand scientific content and citations</li>
547
- </ul>
548
- </div>
549
- </div>
550
-
551
- <div class='capability-card' style='background: linear-gradient(135deg, #fff9c4 0%, #fff3e0 100%);'>
552
- <h3 style='color: #f57f17;'>πŸ”€ 2. OCR Excellence (Optical Character Recognition)</h3>
553
- <div style='background: white; padding: 20px; border-radius: 10px; margin-top: 15px;'>
554
- <ul style='color: #666; line-height: 2; margin: 0; padding-left: 20px;'>
555
- <li><strong>Handwritten text</strong> - Recognize cursive and printed handwriting</li>
556
- <li><strong>Printed text</strong> - Extract text from any printed material</li>
557
- <li><strong>Text in images</strong> - Find and read text embedded in photos</li>
558
- <li><strong>Multi-language support</strong> - Handle various languages and scripts</li>
559
- <li><strong>Low-quality images</strong> - Work with blurry or low-resolution scans</li>
560
- <li><strong>Complex layouts</strong> - Handle multi-column and mixed layouts</li>
561
- </ul>
562
- </div>
563
- </div>
564
-
565
- <div class='capability-card' style='background: linear-gradient(135deg, #e1bee7 0%, #f3e5f5 100%);'>
566
- <h3 style='color: #6a1b9a;'>πŸ“ˆ 3. Chart & Graph Analysis</h3>
567
- <div style='background: white; padding: 20px; border-radius: 10px; margin-top: 15px;'>
568
- <ul style='color: #666; line-height: 2; margin: 0; padding-left: 20px;'>
569
- <li><strong>Bar charts</strong> - Interpret categorical data comparisons</li>
570
- <li><strong>Line graphs</strong> - Analyze trends over time</li>
571
- <li><strong>Pie charts</strong> - Understand proportional distributions</li>
572
- <li><strong>Scatter plots</strong> - Identify correlations and patterns</li>
573
- <li><strong>Complex visualizations</strong> - Parse multi-axis and combined charts</li>
574
- <li><strong>Infographics</strong> - Extract insights from visual data stories</li>
575
- </ul>
576
- </div>
577
- </div>
578
-
579
- <div class='capability-card' style='background: linear-gradient(135deg, #b3e5fc 0%, #e1f5fe 100%);'>
580
- <h3 style='color: #01579b;'>🎬 4. Video Understanding (Frame-by-Frame)</h3>
581
- <div style='background: white; padding: 20px; border-radius: 10px; margin-top: 15px;'>
582
- <ul style='color: #666; line-height: 2; margin: 0; padding-left: 20px;'>
583
- <li><strong>Sequential frames</strong> - Upload multiple frames from videos</li>
584
- <li><strong>Action recognition</strong> - Understand what's happening across frames</li>
585
- <li><strong>Temporal analysis</strong> - Track changes over time</li>
586
- <li><strong>Scene understanding</strong> - Comprehend context and setting</li>
587
- <li><strong>Object tracking</strong> - Follow objects across frames</li>
588
- <li><strong>Event detection</strong> - Identify key moments in sequences</li>
589
- </ul>
590
- </div>
591
- </div>
592
-
593
- <div class='capability-card' style='background: linear-gradient(135deg, #ffccbc 0%, #ffe0b2 100%);'>
594
- <h3 style='color: #bf360c;'>πŸ“‘ 5. Multi-Image Document Processing</h3>
595
- <div style='background: white; padding: 20px; border-radius: 10px; margin-top: 15px;'>
596
- <ul style='color: #666; line-height: 2; margin: 0; padding-left: 20px;'>
597
- <li><strong>Multiple pages at once</strong> - Upload and analyze entire documents</li>
598
- <li><strong>Cross-reference</strong> - Connect information across different images</li>
599
- <li><strong>Document comparison</strong> - Compare versions or similar documents</li>
600
- <li><strong>Batch processing</strong> - Handle multiple documents simultaneously</li>
601
- <li><strong>Presentation slides</strong> - Understand slide decks and flow</li>
602
- <li><strong>Comic books/Manga</strong> - Follow visual narratives</li>
603
- </ul>
604
- </div>
605
- </div>
606
-
607
- <div class='capability-card' style='background: linear-gradient(135deg, #c5e1a5 0%, #dcedc8 100%);'>
608
- <h3 style='color: #33691e;'>🧠 6. Advanced Reasoning</h3>
609
- <div style='background: white; padding: 20px; border-radius: 10px; margin-top: 15px;'>
610
- <ul style='color: #666; line-height: 2; margin: 0; padding-left: 20px;'>
611
- <li><strong>Step-by-step thinking</strong> - See the model's reasoning process</li>
612
- <li><strong>Mathematical problems</strong> - Solve complex math with visual elements</li>
613
- <li><strong>Logical deduction</strong> - Draw conclusions from visual evidence</li>
614
- <li><strong>Problem decomposition</strong> - Break down complex questions</li>
615
- <li><strong>Visual reasoning</strong> - Understand spatial and logical relationships</li>
616
- <li><strong>Transparent thinking</strong> - Explain how conclusions are reached</li>
617
- </ul>
618
- </div>
619
- </div>
620
-
621
- <div class='success-box' style='margin-top: 30px; font-size: 1.05em;'>
622
- <strong>πŸ’‘ Pro Tips for Best Results:</strong><br><br>
623
- βœ… <strong>High-quality images</strong> - Use clear, well-lit photos for better OCR<br>
624
- βœ… <strong>Multiple angles</strong> - Upload different views for complex objects<br>
625
- βœ… <strong>Specific questions</strong> - Ask targeted questions for precise answers<br>
626
- βœ… <strong>Enable reasoning</strong> - Turn on reasoning mode for complex analysis<br>
627
- βœ… <strong>Sequential order</strong> - Upload video frames in chronological order<br>
628
- βœ… <strong>Context matters</strong> - Provide background information for better understanding
629
- </div>
630
-
631
- <div style='background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 25px; border-radius: 16px; text-align: center; margin-top: 30px;'>
632
- <h3 style='margin: 0 0 10px 0; font-size: 1.5em;'>πŸš€ Ready to Get Started?</h3>
633
- <p style='margin: 0; font-size: 1.1em; opacity: 0.95;'>
634
- Upload your files in the Chat Interface tab and experience the power of Nemotron Nano 2 VL!
635
- </p>
636
  </div>
637
  """)
638
 
639
- # Examples Tab
640
- with gr.Tab("πŸ“š Examples & Capabilities", elem_classes=["tab-nav"]):
641
  gr.HTML("""
642
  <div class='capability-card'>
643
- <h3>πŸ“Š Document Intelligence</h3>
644
  <p><strong>Example:</strong> "Extract all the key metrics from this financial report"</p>
645
- <p>Nemotron excels at understanding complex documents, tables, and structured data.</p>
646
  </div>
647
 
648
  <div class='capability-card'>
649
- <h3>πŸ”€ OCR Excellence</h3>
650
- <p><strong>Example:</strong> "What text appears in this image?"</p>
651
- <p>State-of-the-art optical character recognition for any text in images.</p>
652
  </div>
653
 
654
  <div class='capability-card'>
655
- <h3>πŸ“ˆ Chart & Graph Analysis</h3>
656
- <p><strong>Example:</strong> "What trends do you see in this chart?"</p>
657
- <p>Analyze charts, graphs, and data visualizations with high accuracy.</p>
658
  </div>
659
 
660
  <div class='capability-card'>
@@ -664,26 +767,26 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
664
  </div>
665
 
666
  <div class='capability-card'>
667
- <h3>🎬 Video Understanding</h3>
668
- <p><strong>Example:</strong> Upload video frames and ask "What's happening in this sequence?"</p>
669
- <p>Process multiple frames to understand temporal sequences and events.</p>
670
  </div>
671
 
672
  <div class='capability-card'>
673
- <h3>πŸ“‘ Multi-Image Documents</h3>
674
- <p><strong>Example:</strong> Upload multiple pages and ask "Summarize this document"</p>
675
- <p>Handle multi-page documents and complex layouts with ease.</p>
676
  </div>
677
  """)
678
 
679
  gr.HTML("""
680
  <div class='success-box' style='margin-top: 30px;'>
681
  <strong>πŸ’‘ Pro Tips:</strong><br>
682
- β€’ Upload multiple images for document analysis<br>
683
- β€’ Enable reasoning mode for complex problems<br>
684
- β€’ Adjust temperature for creative vs precise outputs<br>
685
- β€’ Use specific questions for better OCR results<br>
686
- β€’ Try video frame sequences for temporal analysis
687
  </div>
688
  """)
689
 
@@ -744,10 +847,15 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
744
  outputs=[api_status]
745
  )
746
 
 
 
 
 
 
747
  submit_btn.click(
748
  fn=process_message,
749
  inputs=[msg, chatbot, files, enable_reasoning, temperature, max_tokens],
750
- outputs=[chatbot, reasoning_display]
751
  ).then(
752
  lambda: ("", None),
753
  outputs=[msg, files]
@@ -756,7 +864,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
756
  msg.submit(
757
  fn=process_message,
758
  inputs=[msg, chatbot, files, enable_reasoning, temperature, max_tokens],
759
- outputs=[chatbot, reasoning_display]
760
  ).then(
761
  lambda: ("", None),
762
  outputs=[msg, files]
@@ -764,11 +872,11 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
764
 
765
  clear_btn.click(
766
  fn=clear_conversation,
767
- outputs=[chatbot, reasoning_display]
768
  )
769
 
770
  # Launch the app
771
  if __name__ == "__main__":
772
  app.launch(
773
- share=True,
774
  )
 
7
  import time
8
  from PIL import Image
9
  import io
10
+ import sys
11
 
12
  # Global client variable
13
  client = None
 
32
  with open(image_path, "rb") as image_file:
33
  return base64.b64encode(image_file.read()).decode('utf-8')
34
 
35
def pdf_to_images_pymupdf(pdf_path: str) -> List[Image.Image]:
    """Convert every page of a PDF to a PIL image using PyMuPDF (primary method).

    Args:
        pdf_path: Path of the PDF file to render.

    Returns:
        One RGB ``PIL.Image.Image`` per page, in page order.

    Raises:
        Exception: If PyMuPDF cannot be imported, the file cannot be opened,
            or a page fails to render. The message is prefixed with
            ``"PyMuPDF error: "`` and the underlying exception is chained
            as ``__cause__`` so the original traceback is not lost.
    """
    try:
        import fitz  # PyMuPDF

        doc = fitz.open(pdf_path)
        try:
            # Render at 2x resolution for better quality
            mat = fitz.Matrix(2, 2)
            images = []
            for page in doc:
                # alpha=False keeps the sample buffer at 3 bytes per pixel,
                # which is what the "RGB" frombytes() call below expects.
                pix = page.get_pixmap(matrix=mat, alpha=False)
                images.append(
                    Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                )
            return images
        finally:
            # Release the document even when a page fails to render.
            doc.close()
    except Exception as e:
        raise Exception(f"PyMuPDF error: {str(e)}") from e
56
+
57
def pdf_to_images_pdf2image(pdf_path: str, dpi: int = 200) -> List[Image.Image]:
    """Convert a PDF to PIL images using pdf2image (requires poppler).

    Args:
        pdf_path: Path of the PDF file to render.
        dpi: Rendering resolution handed to ``convert_from_path``.
            Defaults to 200, the value this function always used.

    Returns:
        The list of page images produced by ``convert_from_path``.

    Raises:
        Exception: If pdf2image cannot be imported or the conversion fails.
            The message is prefixed with ``"pdf2image error: "`` and the
            underlying exception is chained as ``__cause__``.
    """
    try:
        from pdf2image import convert_from_path

        return convert_from_path(pdf_path, dpi=dpi)
    except Exception as e:
        raise Exception(f"pdf2image error: {str(e)}") from e
65
+
66
def pdf_to_images(pdf_path: str) -> Tuple[List[Image.Image], str]:
    """Render a PDF to images, falling back from PyMuPDF to pdf2image.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        ``(images, method)`` where ``method`` is ``"PyMuPDF"`` or
        ``"pdf2image"``, naming the converter that succeeded.

    Raises:
        Exception: When both converters fail. The message lists both
            failures followed by installation instructions.
    """
    # PyMuPDF is attempted first because it does not require poppler.
    try:
        return pdf_to_images_pymupdf(pdf_path), "PyMuPDF"
    except Exception as first_failure:
        pymupdf_error = str(first_failure)

    # Second choice: pdf2image.
    try:
        return pdf_to_images_pdf2image(pdf_path), "pdf2image"
    except Exception as second_failure:
        pdf2image_error = str(second_failure)

    # Neither converter worked: report both failures and how to fix them.
    report_lines = [
        "PDF conversion failed. Tried multiple methods:",
        "",
        f"1. PyMuPDF: {pymupdf_error}",
        f"2. pdf2image: {pdf2image_error}",
        "",
        "SOLUTION:",
        "Install PyMuPDF (recommended - no external dependencies):",
        "pip install PyMuPDF",
        "",
        "OR install pdf2image + poppler:",
        "pip install pdf2image",
        "",
        "Then install poppler:",
        "- Ubuntu/Debian: sudo apt-get install poppler-utils",
        "- macOS: brew install poppler",
        "- Windows: Download from https://github.com/oschwartz10612/poppler-windows/releases/",
    ]
    raise Exception("\n".join(report_lines) + "\n")
104
 
105
def image_to_base64(image: Image.Image, format: str = "PNG") -> str:
    """Encode a PIL image as a base64 string.

    The image is always flattened to RGB before saving. Images that carry
    transparency (``RGBA``, ``LA``, or palette images with a
    ``transparency`` entry in ``image.info``) are composited onto a white
    background; a plain ``convert('RGB')`` would simply discard the alpha
    channel instead of blending against white.

    Args:
        image: The image to encode.
        format: Format name passed to ``Image.save`` (default ``"PNG"``).

    Returns:
        The saved image bytes, base64-encoded and decoded as UTF-8 text.
    """
    has_transparency = image.mode in ("RGBA", "LA") or (
        image.mode == "P" and "transparency" in image.info
    )

    if has_transparency:
        # Normalise to RGBA so the alpha band is always the fourth band.
        rgba = image.convert("RGBA")
        background = Image.new("RGB", rgba.size, (255, 255, 255))
        background.paste(rgba, mask=rgba.split()[3])
        image = background
    elif image.mode != "RGB":
        image = image.convert("RGB")

    buffered = io.BytesIO()
    image.save(buffered, format=format, quality=95)
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
119
 
120
+ def process_file(file_path: str) -> Tuple[List[dict], str]:
121
+ """
122
+ Process a file and return content blocks for API
123
+ Returns: (content_blocks, status_message)
124
+ """
125
  file_extension = Path(file_path).suffix.lower()
126
+ file_name = Path(file_path).name
127
  content_blocks = []
128
+ status_message = ""
129
 
130
  try:
131
  if file_extension == '.pdf':
132
  # Convert PDF pages to images
133
+ images, method = pdf_to_images(file_path)
134
+ status_message = f"βœ… PDF '{file_name}' converted to {len(images)} page(s) using {method}"
135
+
136
+ for idx, img in enumerate(images, 1):
137
+ base64_image = image_to_base64(img, format="JPEG")
138
  content_blocks.append({
139
  "type": "image_url",
140
  "image_url": {
141
+ "url": f"data:image/jpeg;base64,{base64_image}"
142
  }
143
  })
144
+
145
  elif file_extension == '.txt':
146
  # Read text file
147
+ try:
148
+ with open(file_path, 'r', encoding='utf-8') as f:
149
+ text_content = f.read()
150
+ except UnicodeDecodeError:
151
+ # Try with different encoding
152
+ with open(file_path, 'r', encoding='latin-1') as f:
153
+ text_content = f.read()
154
+
155
+ status_message = f"βœ… Text file '{file_name}' loaded ({len(text_content)} characters)"
156
  content_blocks.append({
157
  "type": "text",
158
+ "text": f"πŸ“„ Content from '{file_name}':\n\n{text_content}"
159
  })
160
+
161
  else:
162
  # Handle image files
163
  # Determine MIME type
 
173
  elif file_extension in ['.tiff', '.tif']:
174
  mime_type = "image/tiff"
175
 
176
+ # Load and potentially convert the image
177
+ try:
178
+ img = Image.open(file_path)
179
+
180
+ # Convert to RGB if necessary
181
+ if img.mode in ('RGBA', 'LA', 'P'):
182
+ background = Image.new('RGB', img.size, (255, 255, 255))
183
+ if img.mode == 'P':
184
+ img = img.convert('RGBA')
185
+ if img.mode in ('RGBA', 'LA'):
186
+ background.paste(img, mask=img.split()[-1] if img.mode in ('RGBA', 'LA') else None)
187
+ img = background
188
+ elif img.mode != 'RGB':
189
+ img = img.convert('RGB')
190
+
191
+ # Convert to base64
192
+ base64_image = image_to_base64(img, format="JPEG")
193
+
194
+ status_message = f"βœ… Image '{file_name}' loaded ({img.width}x{img.height})"
195
+ content_blocks.append({
196
+ "type": "image_url",
197
+ "image_url": {
198
+ "url": f"data:image/jpeg;base64,{base64_image}"
199
+ }
200
+ })
201
+ except Exception as img_error:
202
+ # If image processing fails, try direct base64 encoding
203
+ base64_image = encode_image(file_path)
204
+ status_message = f"βœ… Image '{file_name}' loaded (direct encoding)"
205
+ content_blocks.append({
206
+ "type": "image_url",
207
+ "image_url": {
208
+ "url": f"data:{mime_type};base64,{base64_image}"
209
+ }
210
+ })
211
+
212
  except Exception as e:
213
+ error_msg = f"❌ Error processing '{file_name}': {str(e)}"
214
  content_blocks.append({
215
  "type": "text",
216
+ "text": error_msg
217
  })
218
+ status_message = error_msg
219
 
220
+ return content_blocks, status_message
221
 
222
  def process_message(
223
  message: str,
 
226
  enable_reasoning: bool = True,
227
  temperature: float = 0.7,
228
  max_tokens: int = 2000
229
+ ) -> Tuple[List[Tuple[str, str]], str, str]:
230
+ """
231
+ Process user message and generate response
232
+ Returns: (updated_history, reasoning_text, status_message)
233
+ """
234
  global client
235
 
236
  if client is None:
237
+ return history + [(message, "❌ Please configure your API key first in the Settings tab.")], "", ""
238
 
239
  if not message.strip() and not files:
240
+ return history + [(message, "⚠️ Please enter a message or upload files.")], "", ""
241
+
242
+ status_messages = []
243
 
244
  try:
245
  # Build messages array
 
257
  # Process files if provided
258
  if files:
259
  file_count = 0
260
+ total_pages = 0
261
+
262
  for file in files:
263
  if file is not None:
264
+ file_blocks, status = process_file(file)
265
  content.extend(file_blocks)
266
+ status_messages.append(status)
267
  file_count += 1
268
+
269
+ # Count pages for PDFs
270
+ if status.startswith("βœ…") and "page(s)" in status:
271
+ try:
272
+ pages = int(status.split("converted to ")[1].split(" page(s)")[0])
273
+ total_pages += pages
274
+ except:
275
+ pass
276
 
277
  if file_count > 0:
278
+ file_summary = f"πŸ“Ž {file_count} file(s) uploaded"
279
+ if total_pages > 0:
280
+ file_summary += f" ({total_pages} PDF pages)"
281
+ content.insert(0, {"type": "text", "text": file_summary})
282
 
283
  # Add text message
284
  if message.strip():
 
308
  if enable_reasoning and hasattr(response.choices[0].message, 'reasoning_details'):
309
  reasoning_details = response.choices[0].message.reasoning_details
310
  if reasoning_details:
311
+ reasoning_text = f"**🧠 Reasoning Process:**\n{json.dumps(reasoning_details, indent=2)}"
312
 
313
  # Update history
314
  new_history = history + [(message, assistant_message)]
315
 
316
+ # Combine status messages
317
+ combined_status = "\n".join(status_messages) if status_messages else "βœ… Message processed successfully"
318
+
319
+ return new_history, reasoning_text, combined_status
320
 
321
  except Exception as e:
322
  error_message = f"❌ Error: {str(e)}"
323
+ return history + [(message, error_message)], "", error_message
324
 
325
def clear_conversation():
    """Reset the conversation to its initial, empty state.

    Returns:
        A 3-tuple of an empty list followed by two empty strings; the click
        handler wires these to the chatbot, the reasoning display and the
        file status box, in that order.
    """
    fresh_history = []
    blank = ""
    return fresh_history, blank, blank
328
+
329
def check_dependencies() -> str:
    """Report which PDF processing libraries are available.

    Tries to import PyMuPDF (``fitz``), pdf2image and Pillow. When pdf2image
    is importable, also looks on PATH for the poppler command-line tools it
    depends on; a tool installed elsewhere and passed to pdf2image through an
    explicit path will not be detected by this probe.

    Returns:
        A Markdown-formatted, multi-line status report ending with an
        installation recommendation.
    """
    import shutil  # function-scope import: only used for the poppler PATH probe

    parts = ["**📦 PDF Processing Dependencies Status:**\n\n"]

    # Check PyMuPDF
    try:
        import fitz  # noqa: F401
    except ImportError:
        parts.append("❌ **PyMuPDF (fitz)**: Not installed\n")
        parts.append(" - Install: `pip install PyMuPDF`\n\n")
    else:
        parts.append("✅ **PyMuPDF (fitz)**: Installed and ready!\n")
        parts.append(" - No external dependencies needed\n")
        parts.append(" - This is the primary PDF processing method\n\n")

    # Check pdf2image
    try:
        import pdf2image  # noqa: F401
    except ImportError:
        parts.append("⚠️ **pdf2image**: Not installed (optional fallback)\n")
        parts.append(" - Install: `pip install pdf2image`\n\n")
    else:
        parts.append("✅ **pdf2image**: Installed\n")
        parts.append(" - Requires poppler-utils (external)\n")

        # Actually probe for poppler instead of only announcing a check:
        # pdfinfo plus one of the two renderers must be reachable on PATH.
        has_renderer = shutil.which("pdftoppm") or shutil.which("pdftocairo")
        if has_renderer and shutil.which("pdfinfo"):
            parts.append(" - poppler tools found on PATH\n")
        else:
            parts.append(" - ⚠️ poppler-utils may not be installed\n")

        parts.append("\n")

    # Check PIL/Pillow
    try:
        from PIL import Image  # noqa: F401
    except ImportError:
        parts.append("❌ **Pillow (PIL)**: Not installed\n")
        parts.append(" - Install: `pip install Pillow`\n\n")
    else:
        parts.append("✅ **Pillow (PIL)**: Installed and ready!\n\n")

    parts.append("**💡 Recommendation:**\n")
    parts.append("Install PyMuPDF for the best PDF support:\n")
    parts.append("`pip install PyMuPDF Pillow`")

    return "".join(parts)
376
 
377
  # Custom CSS for premium design
378
  custom_css = """
 
547
  elem_classes=["chatbot"]
548
  )
549
 
550
+ file_status = gr.Textbox(
551
+ label="πŸ“‹ File Processing Status",
552
+ lines=2,
553
+ interactive=False,
554
+ visible=True
555
+ )
556
+
557
  with gr.Row():
558
  msg = gr.Textbox(
559
  label="Your Message",
 
633
  info="Maximum length of response"
634
  )
635
 
636
+ gr.HTML("<hr style='margin: 30px 0; border: none; border-top: 2px solid #e0e7ff;'>")
637
+
638
+ gr.HTML("""
639
+ <div class='info-box'>
640
+ <strong>πŸ“¦ Check Dependencies</strong><br>
641
+ Verify that PDF processing libraries are installed
642
+ </div>
643
+ """)
644
+
645
+ check_deps_btn = gr.Button("πŸ” Check Dependencies", variant="secondary", elem_classes=["secondary"])
646
+ deps_status = gr.Markdown(label="Dependency Status")
647
+
648
  gr.HTML("""
649
  <div class='info-box' style='margin-top: 20px;'>
650
+ <strong>πŸ“¦ Installation Guide:</strong><br><br>
651
+ <strong>Recommended (PyMuPDF - No external dependencies):</strong><br>
652
+ <code>pip install PyMuPDF Pillow openai gradio</code><br><br>
653
+
654
+ <strong>Alternative (pdf2image - Requires poppler):</strong><br>
655
+ <code>pip install pdf2image Pillow openai gradio</code><br><br>
656
+
657
+ <strong>Poppler installation (for pdf2image):</strong><br>
658
+ β€’ Ubuntu/Debian: <code>sudo apt-get install poppler-utils</code><br>
659
+ β€’ macOS: <code>brew install poppler</code><br>
660
+ β€’ Windows: Download from <a href="https://github.com/oschwartz10612/poppler-windows/releases/" target="_blank">GitHub</a>
661
  </div>
662
  """)
663
 
 
711
  <p style='margin: 10px 0 0 0; color: #666; line-height: 1.6;'>
712
  β€’ Multi-page support<br>
713
  β€’ Automatic conversion to images<br>
714
+ β€’ PyMuPDF (recommended)<br>
715
  β€’ Scanned documents<br>
716
  β€’ Forms and tables
717
  </p>
 
722
  β€’ Plain text documents<br>
723
  β€’ Code snippets<br>
724
  β€’ Notes and logs<br>
725
+ β€’ UTF-8 encoding<br>
726
  β€’ Configuration files
727
  </p>
728
  </div>
729
  </div>
730
  </div>
731
 
732
+ <div class='success-box' style='margin-top: 20px;'>
733
+ <strong>πŸš€ PDF Processing:</strong><br>
734
+ This app uses <strong>PyMuPDF (fitz)</strong> as the primary method for PDF conversion.<br>
735
+ β€’ βœ… No external dependencies (no poppler needed)<br>
736
+ β€’ βœ… Fast and reliable<br>
737
+ β€’ βœ… Automatic fallback to pdf2image if needed<br>
738
+ β€’ βœ… Clear error messages with installation instructions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
739
  </div>
740
  """)
741
 
742
+ # Examples Tab
743
+ with gr.Tab("πŸ“š Use Cases", elem_classes=["tab-nav"]):
744
  gr.HTML("""
745
  <div class='capability-card'>
746
+ <h3>πŸ“Š Financial Report Analysis</h3>
747
  <p><strong>Example:</strong> "Extract all the key metrics from this financial report"</p>
748
+ <p><strong>What it extracts:</strong> Revenue, Net Profit, EBITDA, Cash Flow, Assets, Liabilities, Ratios, YoY Growth</p>
749
  </div>
750
 
751
  <div class='capability-card'>
752
+ <h3>πŸ”€ OCR & Text Extraction</h3>
753
+ <p><strong>Example:</strong> "What text appears in this scanned document?"</p>
754
+ <p>State-of-the-art optical character recognition for any text in images or PDFs.</p>
755
  </div>
756
 
757
  <div class='capability-card'>
758
+ <h3>πŸ“ˆ Chart & Data Visualization</h3>
759
+ <p><strong>Example:</strong> "Analyze the trends in these charts"</p>
760
+ <p>Understand bar charts, line graphs, pie charts, scatter plots, and complex visualizations.</p>
761
  </div>
762
 
763
  <div class='capability-card'>
 
767
  </div>
768
 
769
  <div class='capability-card'>
770
+ <h3>πŸ“‘ Multi-Page Documents</h3>
771
+ <p><strong>Example:</strong> Upload a PDF and ask "Summarize the key points from all pages"</p>
772
+ <p>Process entire documents with multiple pages simultaneously.</p>
773
  </div>
774
 
775
  <div class='capability-card'>
776
+ <h3>🏒 Business Document Processing</h3>
777
+ <p><strong>Example:</strong> "Extract information from this invoice/receipt/form"</p>
778
+ <p>Handle invoices, receipts, forms, contracts, and structured business documents.</p>
779
  </div>
780
  """)
781
 
782
  gr.HTML("""
783
  <div class='success-box' style='margin-top: 30px;'>
784
  <strong>πŸ’‘ Pro Tips:</strong><br>
785
+ β€’ Upload high-quality scans for best OCR results<br>
786
+ β€’ Enable reasoning mode for complex financial analysis<br>
787
+ β€’ Ask specific questions to get targeted information<br>
788
+ β€’ Upload multiple related documents for comparison<br>
789
+ β€’ Use clear, descriptive questions for better answers
790
  </div>
791
  """)
792
 
 
847
  outputs=[api_status]
848
  )
849
 
850
+ check_deps_btn.click(
851
+ fn=check_dependencies,
852
+ outputs=[deps_status]
853
+ )
854
+
855
  submit_btn.click(
856
  fn=process_message,
857
  inputs=[msg, chatbot, files, enable_reasoning, temperature, max_tokens],
858
+ outputs=[chatbot, reasoning_display, file_status]
859
  ).then(
860
  lambda: ("", None),
861
  outputs=[msg, files]
 
864
  msg.submit(
865
  fn=process_message,
866
  inputs=[msg, chatbot, files, enable_reasoning, temperature, max_tokens],
867
+ outputs=[chatbot, reasoning_display, file_status]
868
  ).then(
869
  lambda: ("", None),
870
  outputs=[msg, files]
 
872
 
873
  clear_btn.click(
874
  fn=clear_conversation,
875
+ outputs=[chatbot, reasoning_display, file_status]
876
  )
877
 
878
  # Launch the app
879
  if __name__ == "__main__":
880
  app.launch(
881
+ share=True
882
  )