Spaces:

samwell
/

medrax2

Sleeping

samwell Claude committed 20 days ago

Commit

9dc78d7

1 Parent(s): 11b0dac

fix: Enable Gemini vision support for image analysis

- Added PIL, base64, and BytesIO imports for image processing
- Updated chat function to encode images as base64 for Gemini
- Images are now passed as multimodal content to support Gemini 2.0 Flash vision
- Downscale images whose width or height exceeds 4096 px (aspect ratio preserved) to meet Gemini limits
- Include image path in message text for tool access
- This fixes the issue where tools couldn't access uploaded images

Now both Assistant and Socratic modes can properly analyze X-ray images
and invoke tools like grounding and segmentation.

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1) hide show

app.py +55 -5

app.py CHANGED Viewed

@@ -17,6 +17,9 @@ if hf_token:
 import gradio as gr
 from dotenv import load_dotenv
 import torch
 load_dotenv()
@@ -166,17 +169,64 @@ def chat(message, history, mode):
     # Get or create the appropriate agent
     agent = get_or_create_agent(mode)
-    # Handle multimodal input
     if isinstance(message, dict):
         text = message.get("text", "")
         files = message.get("files", [])
-        if files:
-            file_info = f"[Image uploaded: {files[0]}]\n\n"
-            text = file_info + text
         message = text
     response = agent.workflow.invoke(
-        {"messages": [("user", message)]},
         config=config
     )

 import gradio as gr
 from dotenv import load_dotenv
 import torch
+from PIL import Image
+import base64
+from io import BytesIO
 load_dotenv()
     # Get or create the appropriate agent
     agent = get_or_create_agent(mode)
+    # Handle multimodal input - Gemini 2.0 Flash supports vision
+    image_content = None
     if isinstance(message, dict):
         text = message.get("text", "")
         files = message.get("files", [])
+        if files and len(files) > 0:
+            image_path = files[0]
+            # Store image path for tools to use
+            # LangChain Google GenAI expects images as base64 or PIL
+            try:
+                # Open and encode image for Gemini
+                with Image.open(image_path) as img:
+                    # Convert to RGB if needed
+                    if img.mode != "RGB":
+                        img = img.convert("RGB")
+                    # Resize if too large (max 4096x4096 for Gemini)
+                    max_size = 4096
+                    if img.width > max_size or img.height > max_size:
+                        img.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
+                    # Store as bytes for LangChain
+                    buffered = BytesIO()
+                    img.save(buffered, format="PNG")
+                    img_bytes = buffered.getvalue()
+                    img_b64 = base64.b64encode(img_bytes).decode()
+                    # Create multimodal content for Gemini
+                    # Format: [{"type": "text", "text": "..."}, {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}]
+                    image_content = {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/png;base64,{img_b64}"
+                        }
+                    }
+                    # Include image path in text for tools to use
+                    text = f"[Image: {image_path}]\n\n{text}"
+            except Exception as e:
+                print(f"Error processing image: {e}")
+                text = f"[Failed to load image: {image_path}]\n\n{text}"
         message = text
+    # Create message content - multimodal if image exists
+    if image_content:
+        # For Gemini multimodal: pass list of content parts
+        user_message = [
+            {"type": "text", "text": message},
+            image_content
+        ]
+    else:
+        user_message = message
     response = agent.workflow.invoke(
+        {"messages": [("user", user_message)]},
         config=config
     )