Spaces:

shukdevdattaEX
/

NemoVision

Paused

App Files Files Community

shukdevdattaEX committed on Dec 26, 2025

Commit

c130422

verified ·

1 Parent(s): 0475588

Update app.py

Browse files

Files changed (1) hide show

app.py +123 -38

app.py CHANGED Viewed

@@ -5,6 +5,8 @@ from pathlib import Path
 import json
 from typing import List, Tuple, Optional
 import time
 # Global client variable
 client = None
@@ -29,20 +31,93 @@ def encode_image(image_path: str) -> str:
     with open(image_path, "rb") as image_file:
         return base64.b64encode(image_file.read()).decode('utf-8')
-def create_image_content(image_path: str, mime_type: str = "image/jpeg") -> dict:
-    """Create image content for API"""
-    base64_image = encode_image(image_path)
-    return {
-        "type": "image_url",
-        "image_url": {
-            "url": f"data:{mime_type};base64,{base64_image}"
-        }
-    }
 def process_message(
     message: str,
     history: List[Tuple[str, str]],
-    images: Optional[List] = None,
     enable_reasoning: bool = True,
     temperature: float = 0.7,
     max_tokens: int = 2000
@@ -53,6 +128,9 @@ def process_message(
     if client is None:
         return history + [(message, "❌ Please configure your API key first in the Settings tab.")], ""
     try:
         # Build messages array
         messages = []
@@ -66,22 +144,24 @@ def process_message(
         # Build current message content
         content = []
-        # Add images if provided
-        if images:
-            for img in images:
-                if img is not None:
-                    # Determine MIME type
-                    img_path = Path(img)
-                    mime_type = "image/jpeg"
-                    if img_path.suffix.lower() in ['.png']:
-                        mime_type = "image/png"
-                    elif img_path.suffix.lower() in ['.webp']:
-                        mime_type = "image/webp"
-                    content.append(create_image_content(img, mime_type))
         # Add text message
-        content.append({"type": "text", "text": message})
         messages.append({"role": "user", "content": content})
@@ -131,7 +211,7 @@ custom_css = """
 }
 .gradio-container {
-    background: linear-gradient(135deg, rgb(102 225 234) 0%, rgb(118, 75, 162) 100%) !important;
 }
 #main-container {
@@ -143,7 +223,7 @@ custom_css = """
 }
 .header-title {
-    background: linear-gradient(135deg, rgb(71, 35, 242) 0%, rgb(72 32 113) 100%) text;
     -webkit-background-clip: text;
     -webkit-text-fill-color: transparent;
     font-size: 3em;
@@ -280,10 +360,10 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
                         <div class='info-box'>
                             <strong>🎯 What can I do?</strong><br>
                             • Analyze images, documents, and charts<br>
-                            • Perform OCR and text extraction<br>
                             • Reason through complex problems<br>
                             • Answer questions about visual content<br>
-                            • Process multi-image documents
                         </div>
                     """)
@@ -298,13 +378,13 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
                     with gr.Row():
                         msg = gr.Textbox(
                             label="Your Message",
-                            placeholder="Ask me anything about images, documents, or reasoning tasks...",
                             lines=3,
                             scale=4
                         )
                     with gr.Row():
-                        images = gr.File(
                             label="📎 Upload Files (Images, PDFs, Documents - Multi-file support)",
                             file_count="multiple",
                             file_types=[".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tiff", ".pdf", ".txt"],
@@ -373,6 +453,14 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
                         label="📏 Max Tokens",
                         info="Maximum length of response"
                     )
                 # File Support Tab
                 with gr.Tab("📁 Supported Files", elem_classes=["tab-nav"]):
@@ -423,7 +511,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
                                     <strong style='color: #f57c00; font-size: 1.1em;'>📕 PDF Documents</strong>
                                     <p style='margin: 10px 0 0 0; color: #666; line-height: 1.6;'>
                                         • Multi-page support<br>
-                                        • Text extraction<br>
                                         • Layout analysis<br>
                                         • Scanned documents<br>
                                         • Forms and tables
@@ -658,20 +746,20 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
     submit_btn.click(
         fn=process_message,
-        inputs=[msg, chatbot, images, enable_reasoning, temperature, max_tokens],
         outputs=[chatbot, reasoning_display]
     ).then(
         lambda: ("", None),
-        outputs=[msg, images]
     )
     msg.submit(
         fn=process_message,
-        inputs=[msg, chatbot, images, enable_reasoning, temperature, max_tokens],
         outputs=[chatbot, reasoning_display]
     ).then(
         lambda: ("", None),
-        outputs=[msg, images]
     )
     clear_btn.click(
@@ -683,7 +771,4 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as app:
 if __name__ == "__main__":
     app.launch(
         share=True,
-        # server_name="0.0.0.0",
-        # server_port=7860,
-        # show_error=True
     )

 import json
 from typing import List, Tuple, Optional
 import time
+from PIL import Image
+import io
 # Global client variable
 client = None
     with open(image_path, "rb") as image_file:
         return base64.b64encode(image_file.read()).decode('utf-8')
def pdf_to_images(pdf_path: str) -> List[Image.Image]:
    """Render every page of a PDF as a PIL image.

    pdf2image is tried first at 200 DPI. PyMuPDF (``fitz``) is used instead
    when pdf2image is not installed, or when pdf2image is installed but
    reports that its poppler backend is missing.

    Args:
        pdf_path: Path of the PDF file to render.

    Returns:
        A list with one PIL image per page.

    Raises:
        ImportError: If the PyMuPDF fallback is needed but PyMuPDF is not
            installed. (ImportError is still an ``Exception``, so callers
            that catch ``Exception`` keep working.)
    """
    try:
        from pdf2image import convert_from_path
        from pdf2image.exceptions import PDFInfoNotInstalledError
    except ImportError:
        convert_from_path = None

    if convert_from_path is not None:
        try:
            return convert_from_path(pdf_path, dpi=200)
        except PDFInfoNotInstalledError:
            # pdf2image is present but poppler is not; PyMuPDF needs no
            # external binaries, so fall through to it instead of failing.
            pass

    try:
        import fitz
    except ImportError as exc:
        raise ImportError(
            "Please install pdf2image or PyMuPDF: pip install pdf2image PyMuPDF"
        ) from exc

    doc = fitz.open(pdf_path)
    try:
        # A 2x zoom matrix renders above the default resolution for legibility.
        zoom = fitz.Matrix(2, 2)
        pages = []
        for page in doc:
            pix = page.get_pixmap(matrix=zoom)
            pages.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
        return pages
    finally:
        # Release the document even if rendering a page fails.
        doc.close()
def image_to_base64(image: Image.Image, format: str = "PNG") -> str:
    """Serialize a PIL image and return it as a base64 string.

    Args:
        image: The PIL image to encode.
        format: Image format name passed to ``Image.save`` (default "PNG").

    Returns:
        The encoded image bytes as base64 text.
    """
    # JPEG cannot store alpha or palette data; Pillow raises OSError for
    # these modes, so flatten them to RGB before saving.
    if format.upper() == "JPEG" and image.mode in ("RGBA", "LA", "P", "PA"):
        image = image.convert("RGB")

    buffered = io.BytesIO()
    image.save(buffered, format=format)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')
def process_file(file_path: str) -> List[dict]:
    """Turn an uploaded file into content blocks for the chat API.

    * ``.pdf``  -> one ``image_url`` block (PNG data URL) per page.
    * ``.txt``  -> one ``text`` block holding the file's text.
    * anything else is treated as an image and becomes one ``image_url``
      block; the MIME type is chosen from the extension and defaults to
      ``image/jpeg``.

    Any failure is reported as a ``text`` block describing the error rather
    than raised, so one bad file does not abort the whole message.

    Args:
        file_path: Path of the uploaded file.

    Returns:
        The list of content blocks produced for this file.
    """
    # NOTE(review): some Gradio versions may hand over an upload wrapper
    # object exposing the path as `.name` rather than a plain path; accept
    # both so the function never raises on the input type. Confirm against
    # the Gradio version in use.
    if not isinstance(file_path, (str, Path)):
        file_path = getattr(file_path, "name", file_path)
    file_path = str(file_path)

    file_extension = Path(file_path).suffix.lower()
    image_mime_types = {
        ".png": "image/png",
        ".webp": "image/webp",
        ".gif": "image/gif",
        ".bmp": "image/bmp",
        ".tiff": "image/tiff",
        ".tif": "image/tiff",
    }
    content_blocks = []

    try:
        if file_extension == '.pdf':
            # Each PDF page is rendered to an image and sent as a PNG.
            for page_image in pdf_to_images(file_path):
                base64_image = image_to_base64(page_image, format="PNG")
                content_blocks.append({
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{base64_image}"
                    }
                })
        elif file_extension == '.txt':
            # errors="replace" keeps a file readable when it is not valid
            # UTF-8 instead of discarding its whole content.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                text_content = f.read()
            content_blocks.append({
                "type": "text",
                "text": f"[Text File Content]:\n{text_content}"
            })
        else:
            mime_type = image_mime_types.get(file_extension, "image/jpeg")
            base64_image = encode_image(file_path)
            content_blocks.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{base64_image}"
                }
            })
    except Exception as e:
        # Deliberate best-effort: surface the failure to the model/user as
        # text instead of crashing the request.
        content_blocks.append({
            "type": "text",
            "text": f"[Error processing file {Path(file_path).name}: {str(e)}]"
        })

    return content_blocks
 def process_message(
     message: str,
     history: List[Tuple[str, str]],
+    files: Optional[List] = None,
     enable_reasoning: bool = True,
     temperature: float = 0.7,
     max_tokens: int = 2000
     if client is None:
         return history + [(message, "❌ Please configure your API key first in the Settings tab.")], ""
+    if not message.strip() and not files:
+        return history + [(message, "⚠️ Please enter a message or upload files.")], ""
     try:
         # Build messages array
         messages = []
         # Build current message content
         content = []
+        # Process files if provided
+        if files:
+            file_count = 0
+            for file in files:
+                if file is not None:
+                    file_blocks = process_file(file)
+                    content.extend(file_blocks)
+                    file_count += 1
+            if file_count > 0:
+                content.insert(0, {
+                    "type": "text",
+                    "text": f"[{file_count} file(s) uploaded]"
+                })
         # Add text message
+        if message.strip():
+            content.append({"type": "text", "text": message})
         messages.append({"role": "user", "content": content})
 }
 .gradio-container {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
 }
 #main-container {
 }
 .header-title {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
     -webkit-background-clip: text;
     -webkit-text-fill-color: transparent;
     font-size: 3em;
                         <div class='info-box'>
                             <strong>🎯 What can I do?</strong><br>
                             • Analyze images, documents, and charts<br>
+                            • Perform OCR and text extraction from PDFs<br>
                             • Reason through complex problems<br>
                             • Answer questions about visual content<br>
+                            • Process multi-image documents and PDFs
                         </div>
                     """)
                     with gr.Row():
                         msg = gr.Textbox(
                             label="Your Message",
+                            placeholder="Ask me anything about images, documents, PDFs, or reasoning tasks...",
                             lines=3,
                             scale=4
                         )
                     with gr.Row():
+                        files = gr.File(
                             label="📎 Upload Files (Images, PDFs, Documents - Multi-file support)",
                             file_count="multiple",
                             file_types=[".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".tiff", ".pdf", ".txt"],
                         label="📏 Max Tokens",
                         info="Maximum length of response"
                     )
+                    gr.HTML("""
+                        <div class='info-box' style='margin-top: 20px;'>
+                            <strong>📦 Required Dependencies for PDF Support:</strong><br>
+                            <code>pip install pdf2image PyMuPDF pillow</code><br><br>
+                            <strong>Note:</strong> pdf2image also requires poppler-utils installed on your system.
+                        </div>
+                    """)
                 # File Support Tab
                 with gr.Tab("📁 Supported Files", elem_classes=["tab-nav"]):
                                     <strong style='color: #f57c00; font-size: 1.1em;'>📕 PDF Documents</strong>
                                     <p style='margin: 10px 0 0 0; color: #666; line-height: 1.6;'>
                                         • Multi-page support<br>
+                                        • Automatic conversion to images<br>
                                         • Layout analysis<br>
                                         • Scanned documents<br>
                                         • Forms and tables
     submit_btn.click(
         fn=process_message,
+        inputs=[msg, chatbot, files, enable_reasoning, temperature, max_tokens],
         outputs=[chatbot, reasoning_display]
     ).then(
         lambda: ("", None),
+        outputs=[msg, files]
     )
     msg.submit(
         fn=process_message,
+        inputs=[msg, chatbot, files, enable_reasoning, temperature, max_tokens],
         outputs=[chatbot, reasoning_display]
     ).then(
         lambda: ("", None),
+        outputs=[msg, files]
     )
     clear_btn.click(
 if __name__ == "__main__":
     app.launch(
         share=True,
     )