shukdevdattaEX committed on
Commit
c130422
·
verified ·
1 Parent(s): 0475588

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -38
app.py CHANGED
@@ -5,6 +5,8 @@ from pathlib import Path
5
  import json
6
  from typing import List, Tuple, Optional
7
  import time
 
 
8
 
9
  # Global client variable
10
  client = None
@@ -29,20 +31,93 @@ def encode_image(image_path: str) -> str:
29
  with open(image_path, "rb") as image_file:
30
  return base64.b64encode(image_file.read()).decode('utf-8')
31
 
32
- def create_image_content(image_path: str, mime_type: str = "image/jpeg") -> dict:
33
- """Create image content for API"""
34
- base64_image = encode_image(image_path)
35
- return {
36
- "type": "image_url",
37
- "image_url": {
38
- "url": f"data:{mime_type};base64,{base64_image}"
39
- }
40
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  def process_message(
43
  message: str,
44
  history: List[Tuple[str, str]],
45
- images: Optional[List] = None,
46
  enable_reasoning: bool = True,
47
  temperature: float = 0.7,
48
  max_tokens: int = 2000
@@ -53,6 +128,9 @@ def process_message(
53
  if client is None:
54
  return history + [(message, "❌ Please configure your API key first in the Settings tab.")], ""
55
 
 
 
 
56
  try:
57
  # Build messages array
58
  messages = []
@@ -66,22 +144,24 @@ def process_message(
66
  # Build current message content
67
  content = []
68
 
69
- # Add images if provided
70
- if images:
71
- for img in images:
72
- if img is not None:
73
- # Determine MIME type
74
- img_path = Path(img)
75
- mime_type = "image/jpeg"
76
- if img_path.suffix.lower() in ['.png']:
77
- mime_type = "image/png"
78
- elif img_path.suffix.lower() in ['.webp']:
79
- mime_type = "image/webp"
80
-
81
- content.append(create_image_content(img, mime_type))
 
82
 
83
  # Add text message
84
- content.append({"type": "text", "text": message})
 
85
 
86
  messages.append({"role": "user", "content": content})
87
 
@@ -131,7 +211,7 @@ custom_css = """
131
  }
132
 
133
  .gradio-container {
134
- background: linear-gradient(135deg, rgb(102 225 234) 0%, rgb(118, 75, 162) 100%) !important;
135
  }
136
 
137
  #main-container {
@@ -143,7 +223,7 @@ custom_css = """
143
  }
144
 
145
  .header-title {
146
- background: linear-gradient(135deg, rgb(71, 35, 242) 0%, rgb(72 32 113) 100%) text;
147
  -webkit-background-clip: text;
148
  -webkit-text-fill-color: transparent;
149
  font-size: 3em;
@@ -280,10 +360,10 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
280
  <div class='info-box'>
281
  <strong>🎯 What can I do?</strong><br>
282
  β€’ Analyze images, documents, and charts<br>
283
- β€’ Perform OCR and text extraction<br>
284
  β€’ Reason through complex problems<br>
285
  β€’ Answer questions about visual content<br>
286
- β€’ Process multi-image documents
287
  </div>
288
  """)
289
 
@@ -298,13 +378,13 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
298
  with gr.Row():
299
  msg = gr.Textbox(
300
  label="Your Message",
301
- placeholder="Ask me anything about images, documents, or reasoning tasks...",
302
  lines=3,
303
  scale=4
304
  )
305
 
306
  with gr.Row():
307
- images = gr.File(
308
  label="πŸ“Ž Upload Files (Images, PDFs, Documents - Multi-file support)",
309
  file_count="multiple",
310
  file_types=[".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tiff", ".pdf", ".txt"],
@@ -373,6 +453,14 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
373
  label="πŸ“ Max Tokens",
374
  info="Maximum length of response"
375
  )
 
 
 
 
 
 
 
 
376
 
377
  # File Support Tab
378
  with gr.Tab("πŸ“ Supported Files", elem_classes=["tab-nav"]):
@@ -423,7 +511,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
423
  <strong style='color: #f57c00; font-size: 1.1em;'>πŸ“• PDF Documents</strong>
424
  <p style='margin: 10px 0 0 0; color: #666; line-height: 1.6;'>
425
  β€’ Multi-page support<br>
426
- β€’ Text extraction<br>
427
  β€’ Layout analysis<br>
428
  β€’ Scanned documents<br>
429
  β€’ Forms and tables
@@ -658,20 +746,20 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
658
 
659
  submit_btn.click(
660
  fn=process_message,
661
- inputs=[msg, chatbot, images, enable_reasoning, temperature, max_tokens],
662
  outputs=[chatbot, reasoning_display]
663
  ).then(
664
  lambda: ("", None),
665
- outputs=[msg, images]
666
  )
667
 
668
  msg.submit(
669
  fn=process_message,
670
- inputs=[msg, chatbot, images, enable_reasoning, temperature, max_tokens],
671
  outputs=[chatbot, reasoning_display]
672
  ).then(
673
  lambda: ("", None),
674
- outputs=[msg, images]
675
  )
676
 
677
  clear_btn.click(
@@ -683,7 +771,4 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
683
  if __name__ == "__main__":
684
  app.launch(
685
  share=True,
686
- # server_name="0.0.0.0",
687
- # server_port=7860,
688
- # show_error=True
689
  )
 
5
  import json
6
  from typing import List, Tuple, Optional
7
  import time
8
+ from PIL import Image
9
+ import io
10
 
11
  # Global client variable
12
  client = None
 
31
  with open(image_path, "rb") as image_file:
32
  return base64.b64encode(image_file.read()).decode('utf-8')
33
 
34
def pdf_to_images(pdf_path: str) -> List[Image.Image]:
    """Render every page of a PDF as a PIL image.

    pdf2image is tried first (at 200 dpi). If it is not installed, or it is
    installed but its poppler backend is missing, PyMuPDF (``fitz``) is used
    instead with a 2x scaling matrix.

    Args:
        pdf_path: Filesystem path of the PDF to convert.

    Returns:
        One PIL image per page, in page order.

    Raises:
        Exception: If neither pdf2image nor PyMuPDF can be imported. Errors
            raised while reading or rendering the PDF propagate unchanged.
    """
    try:
        from pdf2image import convert_from_path
        from pdf2image.exceptions import PDFInfoNotInstalledError
    except ImportError:
        convert_from_path = None

    if convert_from_path is not None:
        try:
            return convert_from_path(pdf_path, dpi=200)
        except PDFInfoNotInstalledError:
            # pdf2image imports fine without poppler but fails at call time;
            # treat that like a missing package and fall through to PyMuPDF.
            pass

    try:
        import fitz
    except ImportError as exc:
        raise Exception(
            "Please install pdf2image or PyMuPDF: pip install pdf2image PyMuPDF"
        ) from exc

    doc = fitz.open(pdf_path)
    try:
        scale = fitz.Matrix(2, 2)
        images = []
        for page in doc:
            # alpha=False keeps the sample buffer 3 bytes per pixel, which is
            # what the "RGB" mode passed to Image.frombytes expects.
            pix = page.get_pixmap(matrix=scale, alpha=False)
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
        return images
    finally:
        # Release the document handle even if a page fails to render.
        doc.close()
55
+
56
def image_to_base64(image: Image.Image, format: str = "PNG") -> str:
    """Serialize a PIL image and return it as a base64 string.

    Args:
        image: The PIL image to encode.
        format: Pillow format name passed to ``Image.save`` (default "PNG").

    Returns:
        The encoded image bytes as a base64 ASCII string, without a
        ``data:`` URI prefix.
    """
    # JPEG cannot store alpha or palette data, and Pillow raises on save for
    # these modes; flatten to RGB first so the call succeeds.
    if format.upper() == "JPEG" and image.mode in ("RGBA", "LA", "P", "PA"):
        image = image.convert("RGB")

    buffered = io.BytesIO()
    image.save(buffered, format=format)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')
61
+
62
def process_file(file_path: str) -> List[dict]:
    """Turn one uploaded file into a list of content blocks for the API.

    Dispatch is by file extension, case-insensitively:

    * ``.pdf``  - each page is rendered to PNG and emitted as an
      ``image_url`` block with a base64 data URI.
    * ``.txt``  - the file's text is emitted as a single ``text`` block.
    * anything else - treated as an image and emitted as one ``image_url``
      block; extensions not in the lookup table are labelled
      ``image/jpeg``.

    Args:
        file_path: Filesystem path of the uploaded file.

    Returns:
        The content blocks for this file. If processing fails, a single
        ``text`` block describing the error is returned instead of raising.
    """
    # Extension -> MIME type; anything not listed falls back to JPEG.
    mime_types = {
        '.png': "image/png",
        '.webp': "image/webp",
        '.gif': "image/gif",
        '.bmp': "image/bmp",
        '.tiff': "image/tiff",
        '.tif': "image/tiff",
    }

    file_extension = Path(file_path).suffix.lower()
    content_blocks = []

    try:
        if file_extension == '.pdf':
            # Convert PDF pages to images
            for page_image in pdf_to_images(file_path):
                base64_image = image_to_base64(page_image, format="PNG")
                content_blocks.append({
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{base64_image}"
                    }
                })
        elif file_extension == '.txt':
            # errors="replace" keeps a non-UTF-8 text file usable: bad bytes
            # become U+FFFD instead of failing the whole upload.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                text_content = f.read()
            content_blocks.append({
                "type": "text",
                "text": f"[Text File Content]:\n{text_content}"
            })
        else:
            # Handle image files
            mime_type = mime_types.get(file_extension, "image/jpeg")
            base64_image = encode_image(file_path)
            content_blocks.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{base64_image}"
                }
            })
    except Exception as e:
        # Best-effort boundary: report the failure in-band so one bad file
        # does not abort the whole message.
        content_blocks.append({
            "type": "text",
            "text": f"[Error processing file {Path(file_path).name}: {str(e)}]"
        })

    return content_blocks
116
 
117
  def process_message(
118
  message: str,
119
  history: List[Tuple[str, str]],
120
+ files: Optional[List] = None,
121
  enable_reasoning: bool = True,
122
  temperature: float = 0.7,
123
  max_tokens: int = 2000
 
128
  if client is None:
129
  return history + [(message, "❌ Please configure your API key first in the Settings tab.")], ""
130
 
131
+ if not message.strip() and not files:
132
+ return history + [(message, "⚠️ Please enter a message or upload files.")], ""
133
+
134
  try:
135
  # Build messages array
136
  messages = []
 
144
  # Build current message content
145
  content = []
146
 
147
+ # Process files if provided
148
+ if files:
149
+ file_count = 0
150
+ for file in files:
151
+ if file is not None:
152
+ file_blocks = process_file(file)
153
+ content.extend(file_blocks)
154
+ file_count += 1
155
+
156
+ if file_count > 0:
157
+ content.insert(0, {
158
+ "type": "text",
159
+ "text": f"[{file_count} file(s) uploaded]"
160
+ })
161
 
162
  # Add text message
163
+ if message.strip():
164
+ content.append({"type": "text", "text": message})
165
 
166
  messages.append({"role": "user", "content": content})
167
 
 
211
  }
212
 
213
  .gradio-container {
214
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
215
  }
216
 
217
  #main-container {
 
223
  }
224
 
225
  .header-title {
226
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
227
  -webkit-background-clip: text;
228
  -webkit-text-fill-color: transparent;
229
  font-size: 3em;
 
360
  <div class='info-box'>
361
  <strong>🎯 What can I do?</strong><br>
362
  β€’ Analyze images, documents, and charts<br>
363
+ β€’ Perform OCR and text extraction from PDFs<br>
364
  β€’ Reason through complex problems<br>
365
  β€’ Answer questions about visual content<br>
366
+ β€’ Process multi-image documents and PDFs
367
  </div>
368
  """)
369
 
 
378
  with gr.Row():
379
  msg = gr.Textbox(
380
  label="Your Message",
381
+ placeholder="Ask me anything about images, documents, PDFs, or reasoning tasks...",
382
  lines=3,
383
  scale=4
384
  )
385
 
386
  with gr.Row():
387
+ files = gr.File(
388
  label="πŸ“Ž Upload Files (Images, PDFs, Documents - Multi-file support)",
389
  file_count="multiple",
390
  file_types=[".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tiff", ".pdf", ".txt"],
 
453
  label="πŸ“ Max Tokens",
454
  info="Maximum length of response"
455
  )
456
+
457
+ gr.HTML("""
458
+ <div class='info-box' style='margin-top: 20px;'>
459
+ <strong>πŸ“¦ Required Dependencies for PDF Support:</strong><br>
460
+ <code>pip install pdf2image PyMuPDF pillow</code><br><br>
461
+ <strong>Note:</strong> pdf2image also requires poppler-utils installed on your system.
462
+ </div>
463
+ """)
464
 
465
  # File Support Tab
466
  with gr.Tab("πŸ“ Supported Files", elem_classes=["tab-nav"]):
 
511
  <strong style='color: #f57c00; font-size: 1.1em;'>πŸ“• PDF Documents</strong>
512
  <p style='margin: 10px 0 0 0; color: #666; line-height: 1.6;'>
513
  β€’ Multi-page support<br>
514
+ β€’ Automatic conversion to images<br>
515
  β€’ Layout analysis<br>
516
  β€’ Scanned documents<br>
517
  β€’ Forms and tables
 
746
 
747
  submit_btn.click(
748
  fn=process_message,
749
+ inputs=[msg, chatbot, files, enable_reasoning, temperature, max_tokens],
750
  outputs=[chatbot, reasoning_display]
751
  ).then(
752
  lambda: ("", None),
753
+ outputs=[msg, files]
754
  )
755
 
756
  msg.submit(
757
  fn=process_message,
758
+ inputs=[msg, chatbot, files, enable_reasoning, temperature, max_tokens],
759
  outputs=[chatbot, reasoning_display]
760
  ).then(
761
  lambda: ("", None),
762
+ outputs=[msg, files]
763
  )
764
 
765
  clear_btn.click(
 
771
  if __name__ == "__main__":
772
  app.launch(
773
  share=True,
 
 
 
774
  )