samwell Claude committed on
Commit
9dc78d7
·
1 Parent(s): 11b0dac

fix: Enable Gemini vision support for image analysis

Browse files

- Added PIL, base64, and BytesIO imports for image processing
- Updated chat function to encode the first uploaded image as a base64 PNG data URL for Gemini
- Images are now passed as multimodal content to support Gemini 2.0 Flash vision
- Resize images whose width or height exceeds 4096 px (aspect ratio preserved) to meet Gemini limits
- Include image path in message text for tool access
- This fixes the issue where tools couldn't access uploaded images

Now both Assistant and Socratic modes can properly analyze X-ray images
and invoke tools like grounding and segmentation.

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +55 -5
app.py CHANGED
@@ -17,6 +17,9 @@ if hf_token:
17
  import gradio as gr
18
  from dotenv import load_dotenv
19
  import torch
 
 
 
20
 
21
  load_dotenv()
22
 
@@ -166,17 +169,64 @@ def chat(message, history, mode):
166
  # Get or create the appropriate agent
167
  agent = get_or_create_agent(mode)
168
 
169
- # Handle multimodal input
 
170
  if isinstance(message, dict):
171
  text = message.get("text", "")
172
  files = message.get("files", [])
173
- if files:
174
- file_info = f"[Image uploaded: {files[0]}]\n\n"
175
- text = file_info + text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  message = text
177
 
 
 
 
 
 
 
 
 
 
 
178
  response = agent.workflow.invoke(
179
- {"messages": [("user", message)]},
180
  config=config
181
  )
182
 
 
17
  import gradio as gr
18
  from dotenv import load_dotenv
19
  import torch
20
+ from PIL import Image
21
+ import base64
22
+ from io import BytesIO
23
 
24
  load_dotenv()
25
 
 
169
  # Get or create the appropriate agent
170
  agent = get_or_create_agent(mode)
171
 
172
+ # Handle multimodal input - Gemini 2.0 Flash supports vision
173
+ image_content = None
174
  if isinstance(message, dict):
175
  text = message.get("text", "")
176
  files = message.get("files", [])
177
+
178
+ if files and len(files) > 0:
179
+ image_path = files[0]
180
+ # Store image path for tools to use
181
+ # LangChain Google GenAI expects images as base64 or PIL
182
+ try:
183
+ # Open and encode image for Gemini
184
+ with Image.open(image_path) as img:
185
+ # Convert to RGB if needed
186
+ if img.mode != "RGB":
187
+ img = img.convert("RGB")
188
+
189
+ # Resize if too large (max 4096x4096 for Gemini)
190
+ max_size = 4096
191
+ if img.width > max_size or img.height > max_size:
192
+ img.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
193
+
194
+ # Store as bytes for LangChain
195
+ buffered = BytesIO()
196
+ img.save(buffered, format="PNG")
197
+ img_bytes = buffered.getvalue()
198
+ img_b64 = base64.b64encode(img_bytes).decode()
199
+
200
+ # Create multimodal content for Gemini
201
+ # Format: [{"type": "text", "text": "..."}, {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}]
202
+ image_content = {
203
+ "type": "image_url",
204
+ "image_url": {
205
+ "url": f"data:image/png;base64,{img_b64}"
206
+ }
207
+ }
208
+
209
+ # Include image path in text for tools to use
210
+ text = f"[Image: {image_path}]\n\n{text}"
211
+
212
+ except Exception as e:
213
+ print(f"Error processing image: {e}")
214
+ text = f"[Failed to load image: {image_path}]\n\n{text}"
215
+
216
  message = text
217
 
218
+ # Create message content - multimodal if image exists
219
+ if image_content:
220
+ # For Gemini multimodal: pass list of content parts
221
+ user_message = [
222
+ {"type": "text", "text": message},
223
+ image_content
224
+ ]
225
+ else:
226
+ user_message = message
227
+
228
  response = agent.workflow.invoke(
229
+ {"messages": [("user", user_message)]},
230
  config=config
231
  )
232