nikhmr1235 committed on
Commit
480b629
·
verified ·
1 Parent(s): 824eaad

Update helper.py

Browse files
Files changed (1) hide show
  1. helper.py +0 -80
helper.py CHANGED
@@ -451,83 +451,3 @@ import os
451
  # Your existing tools (PythonREPL, TavilySearchResults, file_saver, audio_transcriber, Wikipedia, SerpAPI) go here...
452
  # ... (rest of your helper.py code for other tools) ...
453
 
454
def _select_gemini_model(google_api_key: str) -> str:
    """Return the Gemini model name to use for image questions.

    Prefers "gemini-pro-vision" when the client class exposes a model listing
    that contains it; otherwise returns "gemini-2.0-flash".
    """
    # NOTE(review): `get_available_models` does not appear to be part of the
    # documented ChatGoogleGenerativeAI API -- confirm. Probing with getattr
    # keeps a missing attribute from turning every tool call into an error.
    list_models = getattr(ChatGoogleGenerativeAI, "get_available_models", None)
    model_name = "gemini-2.0-flash"
    try:
        if callable(list_models) and "gemini-pro-vision" in list_models(google_api_key):
            model_name = "gemini-pro-vision"
    except Exception:
        # Model discovery is best-effort; keep the default model on any failure.
        model_name = "gemini-2.0-flash"
    return model_name


def analyze_image_with_gemini(args: dict) -> str:
    """
    Analyzes an image using Google's Gemini Multimodal LLM to answer a given question.
    This tool is designed for tasks requiring visual understanding, such as
    describing image content, identifying objects, or answering questions about
    information presented visually (e.g., charts, diagrams, chess boards).

    **Input Format (CRITICAL):**
    The input MUST be a JSON string with 'image_path' and 'question' keys.
    - 'image_path': The local file path to the image (e.g., 'path/to/my_image.png').
      This image MUST have been previously downloaded and saved locally using the 'file_saver' tool.
    - 'question': The question to answer based on the image content.

    Example: '{"image_path": "downloaded_image.png", "question": "What is depicted in this image?"}'
    Example: '{"image_path": "chess_board.jpg", "question": "What is the next best move in this chess position?"}'

    **DO NOT:**
    - Pass URLs directly to this tool; always use 'file_saver' first.
    - Ask questions unrelated to the image content.
    - Expect real-time actions or external website access.

    **Output:**
    The tool returns the answer generated by the Gemini Multimodal LLM based on the image and question.
    Returns an informative error message if the image file is not found,
    the API key is missing, or the LLM encounters an issue.
    """
    # The docstring above is also handed to the agent as the tool description,
    # so its wording is runtime text and is intentionally left unchanged.
    import base64
    import json
    import mimetypes

    try:
        # LLM agents commonly pass the arguments as a JSON string, not a dict.
        if isinstance(args, str):
            args = json.loads(args)

        if not isinstance(args, dict):
            return "Error: Input must be a JSON object with 'image_path' and 'question' keys."

        image_path = args.get("image_path")
        question = args.get("question")

        if not image_path or not question:
            return "Error: Both 'image_path' and 'question' must be provided."

        if not os.path.exists(image_path):
            return f"Error: Local image file not found at '{image_path}'. Did you save it with 'file_saver'?"

        google_api_key = os.getenv("GOOGLE_API_KEY")
        if not google_api_key:
            return "Error: GOOGLE_API_KEY not found in environment variables for multimodal tool."

        llm = ChatGoogleGenerativeAI(
            model=_select_gemini_model(google_api_key),
            google_api_key=google_api_key,
            temperature=0.0,  # deterministic, fact-oriented answers
        )

        # Label the payload with the file's real image type (e.g. image/jpeg for
        # .jpg) rather than always claiming PNG; keep PNG as the fallback.
        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/png"

        with open(image_path, "rb") as f:
            image_base64 = base64.b64encode(f.read()).decode("utf-8")

        # One multimodal message: the question text plus the image as a data URL.
        message = HumanMessage(
            content=[
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{image_base64}"}},
            ]
        )

        response = llm.invoke([message])
        return response.content

    except Exception as e:
        # Tool boundary: report failures to the agent as text instead of raising.
        return f"Error in gemini_multimodal_tool: {e}"
527
-
528
# Register the image-analysis function as a Tool for the agent. The function's
# docstring is reused verbatim as the tool description.
gemini_multimodal_tool = Tool(
    func=analyze_image_with_gemini,
    name="gemini_multimodal_tool",
    description=analyze_image_with_gemini.__doc__,
)
 
451
  # Your existing tools (PythonREPL, TavilySearchResults, file_saver, audio_transcriber, Wikipedia, SerpAPI) go here...
452
  # ... (rest of your helper.py code for other tools) ...
453