Spaces:
Sleeping
Sleeping
| import asyncio | |
| import base64 | |
| import io | |
| import random | |
| from PIL import Image, ImageDraw, ImageFont | |
| from browser_use.llm.google.chat import ChatGoogle | |
| from browser_use.llm.google.serializer import GoogleMessageSerializer | |
| from browser_use.llm.messages import ( | |
| BaseMessage, | |
| ContentPartImageParam, | |
| ContentPartTextParam, | |
| ImageURL, | |
| SystemMessage, | |
| UserMessage, | |
| ) | |
def create_random_text_image(text: str = 'hello world', width: int = 4000, height: int = 4000) -> str:
    """Render *text* centered on a randomly colored canvas and return it as a JPEG data URL.

    Args:
        text: The string drawn onto the image.
        width: Canvas width in pixels.
        height: Canvas height in pixels.

    Returns:
        A ``data:image/jpeg;base64,...`` URL containing the encoded image.
    """
    # Three independent draws, one per RGB channel.
    red, green, blue = (random.randint(0, 255) for _ in range(3))
    background = (red, green, blue)

    canvas = Image.new('RGB', (width, height), background)
    pen = ImageDraw.Draw(canvas)

    # Prefer Arial at 24pt; any failure to load it falls back to Pillow's built-in font.
    try:
        typeface = ImageFont.truetype('arial.ttf', 24)
    except Exception:
        typeface = ImageFont.load_default()

    # Measure the rendered text so it can be centered on the canvas.
    left, top, right, bottom = pen.textbbox((0, 0), text, font=typeface)
    origin = (
        (width - (right - left)) // 2,
        (height - (bottom - top)) // 2,
    )

    # Inverting each channel gives a foreground that contrasts with the background.
    foreground = tuple(255 - channel for channel in background)
    pen.text(origin, text, fill=foreground, font=typeface)

    # Encode in memory; nothing is written to disk.
    with io.BytesIO() as jpeg_bytes:
        canvas.save(jpeg_bytes, format='JPEG')
        img_data = base64.b64encode(jpeg_bytes.getvalue()).decode()

    return f'data:image/jpeg;base64,{img_data}'
async def test_gemini_image_vision():
    """Send a generated text image to Gemini and print the model's description.

    Builds a system + user conversation whose user turn carries a text
    prompt and a JPEG data URL, prints the system message produced by
    ``GoogleMessageSerializer``, then invokes the model. Any exception
    raised while calling the model or printing its response is reported on
    stdout instead of being propagated.
    """
    gemini = ChatGoogle(model='gemini-2.0-flash-exp')

    # Image containing the text the model is asked to read back.
    data_url = create_random_text_image('Hello Gemini! Can you see this text?')

    system_turn = SystemMessage(content='You are a helpful assistant that can see and describe images.')
    user_turn = UserMessage(
        content=[
            ContentPartTextParam(text='What do you see in this image? Please describe the text and any visual elements.'),
            ContentPartImageParam(image_url=ImageURL(url=data_url)),
        ]
    )
    messages: list[BaseMessage] = [system_turn, user_turn]

    # Only the system part of the serialized form is displayed; the model
    # call below is given the original message objects.
    _, system_message = GoogleMessageSerializer().serialize_messages(messages)

    print('Testing Gemini image vision...')
    print(f'System message: {system_message}')

    try:
        response = await gemini.ainvoke(messages)
        print('\n=== Gemini Response ===')
        print(response.completion)
        print(response.usage)
        print('=======================')
    except Exception as exc:
        print(f'Error calling Gemini: {exc}')
        print(f'Error type: {type(exc)}')
# Script entry point: run the async vision check to completion when executed directly.
if __name__ == '__main__':
    asyncio.run(test_gemini_image_vision())