nikhmr1235 committed on
Commit
b8f75bc
·
verified ·
1 Parent(s): 169060d

+gemini_multimodal_tool

Browse files
Files changed (1) hide show
  1. helper.py +88 -0
helper.py CHANGED
@@ -451,3 +451,91 @@ import os
451
  # Your existing tools (PythonREPL, TavilySearchResults, file_saver, audio_transcriber, Wikipedia, SerpAPI) go here...
452
  # ... (rest of your helper.py code for other tools) ...
453
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451
  # Your existing tools (PythonREPL, TavilySearchResults, file_saver, audio_transcriber, Wikipedia, SerpAPI) go here...
452
  # ... (rest of your helper.py code for other tools) ...
453
 
454
+ import base64
455
+ from langchain.tools import Tool
456
+ from langchain_google_genai import ChatGoogleGenerativeAI
457
+ from langchain_core.messages import HumanMessage
458
+ import os
459
+
460
def analyze_image_with_gemini(args: dict) -> str:
    """
    Analyzes an image using Google's Gemini Multimodal LLM to answer a given question.
    This tool is designed for tasks requiring visual understanding, such as
    describing image content, identifying objects, or answering questions about
    information presented visually (e.g., charts, diagrams, chess boards).

    **Input Format (CRITICAL):**
    The input MUST be a JSON string (or dict) with 'image_path' and 'question' keys.
    - 'image_path': The local file path to the image (e.g., 'path/to/my_image.png').
      This image MUST have been previously downloaded and saved locally using the 'file_saver' tool.
    - 'question': The question to answer based on the image content.

    Example: '{"image_path": "downloaded_image.png", "question": "What is depicted in this image?"}'
    Example: '{"image_path": "chess_board.jpg", "question": "What is the next best move in this chess position?"}'

    **DO NOT:**
    - Pass URLs directly to this tool; always use 'file_saver' first.
    - Ask questions unrelated to the image content.
    - Expect real-time actions or external website access.

    **Output:**
    The tool returns the answer generated by the Gemini Multimodal LLM based
    on the image and question. Returns an informative error message if the
    image file is not found, the API key is missing, or the LLM call fails.
    """
    try:
        # LLM agents frequently deliver tool arguments as a JSON string
        # rather than a dict, so parse defensively.
        if isinstance(args, str):
            import json
            args = json.loads(args)

        image_path = args.get("image_path")
        question = args.get("question")

        if not image_path or not question:
            return "Error: Both 'image_path' and 'question' must be provided."

        if not os.path.exists(image_path):
            return f"Error: Local image file not found at '{image_path}'. Did you save it with 'file_saver'?"

        google_api_key = os.getenv("GOOGLE_API_KEY")
        if not google_api_key:
            return "Error: GOOGLE_API_KEY not found in environment variables for multimodal tool."

        # Multimodal-capable Gemini model; temperature 0 for more
        # factual/deterministic responses.
        llm = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash",
            google_api_key=google_api_key,
            temperature=0.0,
        )

        # Read the image and base64-encode it for inline transmission.
        with open(image_path, "rb") as f:
            image_base64 = base64.b64encode(f.read()).decode("utf-8")

        # Fix: derive the MIME type from the file extension instead of
        # hard-coding image/png — the docstring explicitly allows .jpg
        # input, which previously got mislabeled as PNG in the data URL.
        import mimetypes
        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/png"  # safe fallback for unknown extensions

        # Compose a single multimodal message: the question plus the image
        # embedded as a data URL.
        message = HumanMessage(
            content=[
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{image_base64}"}},
            ]
        )

        response = llm.invoke([message])
        return response.content

    except Exception as e:
        # Tool contract: never raise — report failures as a string so the
        # calling agent can observe the error and recover.
        return f"Error in gemini_multimodal_tool: {e}"
535
+
536
# Expose the analyzer to the agent as a LangChain Tool, reusing the
# function's docstring as the tool description shown to the LLM.
gemini_multimodal_tool = Tool(
    func=analyze_image_with_gemini,
    name="gemini_multimodal_tool",
    description=analyze_image_with_gemini.__doc__,
)