nikhmr1235 committed on
Commit
b8f75bc
·
verified ·
1 Parent(s): 169060d

+gemini_multimodal_tool

Browse files
Files changed (1) hide show
  1. helper.py +88 -0
helper.py CHANGED
@@ -451,3 +451,91 @@ import os
451
  # Your existing tools (PythonREPL, TavilySearchResults, file_saver, audio_transcriber, Wikipedia, SerpAPI) go here...
452
  # ... (rest of your helper.py code for other tools) ...
453
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451
  # Your existing tools (PythonREPL, TavilySearchResults, file_saver, audio_transcriber, Wikipedia, SerpAPI) go here...
452
  # ... (rest of your helper.py code for other tools) ...
453
 
454
+ import base64
455
+ from langchain.tools import Tool
456
+ from langchain_google_genai import ChatGoogleGenerativeAI
457
+ from langchain_core.messages import HumanMessage
458
+ import os
459
+
460
def analyze_image_with_gemini(args: dict) -> str:
    """
    Analyzes an image using Google's Gemini Multimodal LLM to answer a given question.
    This tool is designed for tasks requiring visual understanding, such as
    describing image content, identifying objects, or answering questions about
    information presented visually (e.g., charts, diagrams, chess boards).

    **Input Format (CRITICAL):**
    The input MUST be a JSON string (or dict) with 'image_path' and 'question' keys.
    - 'image_path': The local file path to the image (e.g., 'path/to/my_image.png').
      This image MUST have been previously downloaded and saved locally using the 'file_saver' tool.
    - 'question': The question to answer based on the image content.

    Example: '{"image_path": "downloaded_image.png", "question": "What is depicted in this image?"}'
    Example: '{"image_path": "chess_board.jpg", "question": "What is the next best move in this chess position?"}'

    **DO NOT:**
    - Pass URLs directly to this tool; always use 'file_saver' first.
    - Ask questions unrelated to the image content.
    - Expect real-time actions or external website access.

    **Output:**
    The tool returns the answer generated by the Gemini Multimodal LLM based
    on the image and question. Returns an informative error message if the
    image file is not found, the API key is missing, or the LLM call fails.
    """
    try:
        # LLM agents frequently deliver tool arguments as a JSON string
        # rather than a dict, so parse defensively.
        if isinstance(args, str):
            import json
            args = json.loads(args)

        image_path = args.get("image_path")
        question = args.get("question")

        if not image_path or not question:
            return "Error: Both 'image_path' and 'question' must be provided."

        if not os.path.exists(image_path):
            return f"Error: Local image file not found at '{image_path}'. Did you save it with 'file_saver'?"

        google_api_key = os.getenv("GOOGLE_API_KEY")
        if not google_api_key:
            return "Error: GOOGLE_API_KEY not found in environment variables for multimodal tool."

        # Multimodal-capable Gemini model; temperature 0 for more
        # factual/deterministic responses.
        llm = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash",
            google_api_key=google_api_key,
            temperature=0.0,
        )

        # Read the image and base64-encode it for inline transmission.
        with open(image_path, "rb") as f:
            image_base64 = base64.b64encode(f.read()).decode("utf-8")

        # Fix: derive the MIME type from the file extension instead of
        # hard-coding image/png — the docstring explicitly allows .jpg
        # input, which previously got mislabeled as PNG in the data URL.
        import mimetypes
        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/png"  # safe fallback for unknown extensions

        # Compose a single multimodal message: the question plus the image
        # embedded as a data URL.
        message = HumanMessage(
            content=[
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{image_base64}"}},
            ]
        )

        response = llm.invoke([message])
        return response.content

    except Exception as e:
        # Tool contract: never raise — report failures as a string so the
        # calling agent can observe the error and recover.
        return f"Error in gemini_multimodal_tool: {e}"
535
+
536
# Expose the analyzer to the agent as a LangChain Tool, reusing the
# function's docstring as the tool description shown to the LLM.
gemini_multimodal_tool = Tool(
    func=analyze_image_with_gemini,
    name="gemini_multimodal_tool",
    description=analyze_image_with_gemini.__doc__,
)