nikhmr1235 committed on
Commit
45f56a3
·
verified ·
1 Parent(s): f99529c

Update helper.py

Browse files
Files changed (1) hide show
  1. helper.py +93 -1
helper.py CHANGED
@@ -438,4 +438,96 @@ serpapi_Google_Search_tool = Tool(
438
  # tools = [travily_api_search_tool, python_repl, ..., serpapi_Google Search_tool]
439
  #
440
  # And you would need to update your prompt's "Available Tools" section
441
- # to describe `serpapi_Google Search` to the LLM.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
438
  # tools = [travily_api_search_tool, python_repl, ..., serpapi_Google Search_tool]
439
  #
440
  # And you would need to update your prompt's "Available Tools" section
441
+ # to describe `serpapi_Google Search` to the LLM.
442
+
443
+ # In helper.py
444
+
445
+ import base64
446
+ from langchain.tools import Tool
447
+ from langchain_google_genai import ChatGoogleGenerativeAI
448
+ from langchain_core.messages import HumanMessage
449
+ import os
450
+
451
+ # Your existing tools (PythonREPL, TavilySearchResults, file_saver, audio_transcriber, Wikipedia, SerpAPI) go here...
452
+ # ... (rest of your helper.py code for other tools) ...
453
+
454
def analyze_image_with_gemini(args: dict) -> str:
    """
    Analyzes an image using Google's Gemini Multimodal LLM to answer a given question.
    This tool is designed for tasks requiring visual understanding, such as
    describing image content, identifying objects, or answering questions about
    information presented visually (e.g., charts, diagrams, chess boards).

    **Input Format (CRITICAL):**
    The input MUST be a JSON string with 'image_path' and 'question' keys.
    - 'image_path': The local file path to the image (e.g., 'path/to/my_image.png').
      This image MUST have been previously downloaded and saved locally using the 'file_saver' tool.
    - 'question': The question to answer based on the image content.

    Example: '{"image_path": "downloaded_image.png", "question": "What is depicted in this image?"}'
    Example: '{"image_path": "chess_board.jpg", "question": "What is the next best move in this chess position?"}'

    **DO NOT:**
    - Pass URLs directly to this tool; always use 'file_saver' first.
    - Ask questions unrelated to the image content.
    - Expect real-time actions or external website access.

    **Output:**
    The tool returns the answer generated by the Gemini Multimodal LLM based on the image and question.
    Returns an informative error message if the image file is not found,
    the API key is missing, or the LLM encounters an issue.
    """
    try:
        # LLM tool-callers frequently pass arguments as a JSON string rather
        # than a dict, so accept both forms.
        if isinstance(args, str):
            import json
            args = json.loads(args)

        image_path = args.get("image_path")
        question = args.get("question")

        if not image_path or not question:
            return "Error: Both 'image_path' and 'question' must be provided."

        if not os.path.exists(image_path):
            return f"Error: Local image file not found at '{image_path}'. Did you save it with 'file_saver'?"

        google_api_key = os.getenv("GOOGLE_API_KEY")
        if not google_api_key:
            return "Error: GOOGLE_API_KEY not found in environment variables for multimodal tool."

        # BUG FIX: the original code called
        # ChatGoogleGenerativeAI.get_available_models(google_api_key), which is
        # not a classmethod of ChatGoogleGenerativeAI. That expression always
        # raised AttributeError, was swallowed by the except below, and the
        # tool could never actually invoke the LLM. Use a single
        # multimodal-capable model directly, overridable via environment.
        model_name = os.getenv("GEMINI_VISION_MODEL", "gemini-2.0-flash")
        llm = ChatGoogleGenerativeAI(
            model=model_name,
            google_api_key=google_api_key,
            temperature=0.0,  # deterministic, factual responses
        )

        # Load the image bytes and embed them as a base64 data URL, which is
        # the format HumanMessage expects for inline image content.
        with open(image_path, "rb") as f:
            image_bytes = f.read()
        image_base64 = base64.b64encode(image_bytes).decode('utf-8')

        message = HumanMessage(
            content=[
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ]
        )

        response = llm.invoke([message])
        return response.content

    except Exception as e:
        # Tool boundary: surface the failure as a string so the agent can
        # react to it instead of the whole run crashing.
        return f"Error in gemini_multimodal_tool: {e}"
527
+
528
# Expose the analyzer to the agent as a LangChain Tool. The function's
# docstring doubles as the tool description that the LLM reads.
gemini_multimodal_tool = Tool(
    func=analyze_image_with_gemini,
    name="gemini_multimodal_tool",
    description=analyze_image_with_gemini.__doc__,
)