nikhmr1235 committed on
Commit
45f56a3
·
verified ·
1 Parent(s): f99529c

Update helper.py

Browse files
Files changed (1) hide show
  1. helper.py +93 -1
helper.py CHANGED
@@ -438,4 +438,96 @@ serpapi_Google_Search_tool = Tool(
438
  # tools = [travily_api_search_tool, python_repl, ..., serpapi_Google Search_tool]
439
  #
440
  # And you would need to update your prompt's "Available Tools" section
441
- # to describe `serpapi_Google Search` to the LLM.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
438
  # tools = [travily_api_search_tool, python_repl, ..., serpapi_Google Search_tool]
439
  #
440
  # And you would need to update your prompt's "Available Tools" section
441
+ # to describe `serpapi_Google Search` to the LLM.
442
+
443
+ # In helper.py
444
+
445
+ import base64
446
+ from langchain.tools import Tool
447
+ from langchain_google_genai import ChatGoogleGenerativeAI
448
+ from langchain_core.messages import HumanMessage
449
+ import os
450
+
451
+ # Your existing tools (PythonREPL, TavilySearchResults, file_saver, audio_transcriber, Wikipedia, SerpAPI) go here...
452
+ # ... (rest of your helper.py code for other tools) ...
453
+
454
def analyze_image_with_gemini(args: dict) -> str:
    """
    Analyzes an image using Google's Gemini Multimodal LLM to answer a given question.
    This tool is designed for tasks requiring visual understanding, such as
    describing image content, identifying objects, or answering questions about
    information presented visually (e.g., charts, diagrams, chess boards).

    **Input Format (CRITICAL):**
    The input MUST be a JSON string with 'image_path' and 'question' keys.
    - 'image_path': The local file path to the image (e.g., 'path/to/my_image.png').
      This image MUST have been previously downloaded and saved locally using the 'file_saver' tool.
    - 'question': The question to answer based on the image content.

    Example: '{"image_path": "downloaded_image.png", "question": "What is depicted in this image?"}'
    Example: '{"image_path": "chess_board.jpg", "question": "What is the next best move in this chess position?"}'

    **DO NOT:**
    - Pass URLs directly to this tool; always use 'file_saver' first.
    - Ask questions unrelated to the image content.
    - Expect real-time actions or external website access.

    **Output:**
    The tool returns the answer generated by the Gemini Multimodal LLM based on the image and question.
    Returns an informative error message if the image file is not found,
    the API key is missing, or the LLM encounters an issue.
    """
    try:
        # LLM tool-callers frequently pass arguments as a JSON string rather
        # than a dict, so accept both forms.
        if isinstance(args, str):
            import json
            args = json.loads(args)

        image_path = args.get("image_path")
        question = args.get("question")

        if not image_path or not question:
            return "Error: Both 'image_path' and 'question' must be provided."

        if not os.path.exists(image_path):
            return f"Error: Local image file not found at '{image_path}'. Did you save it with 'file_saver'?"

        google_api_key = os.getenv("GOOGLE_API_KEY")
        if not google_api_key:
            return "Error: GOOGLE_API_KEY not found in environment variables for multimodal tool."

        # BUG FIX: the original code called
        # ChatGoogleGenerativeAI.get_available_models(google_api_key), which is
        # not a classmethod of ChatGoogleGenerativeAI. That expression always
        # raised AttributeError, was swallowed by the except below, and the
        # tool could never actually invoke the LLM. Use a single
        # multimodal-capable model directly, overridable via environment.
        model_name = os.getenv("GEMINI_VISION_MODEL", "gemini-2.0-flash")
        llm = ChatGoogleGenerativeAI(
            model=model_name,
            google_api_key=google_api_key,
            temperature=0.0,  # deterministic, factual responses
        )

        # Load the image bytes and embed them as a base64 data URL, which is
        # the format HumanMessage expects for inline image content.
        with open(image_path, "rb") as f:
            image_bytes = f.read()
        image_base64 = base64.b64encode(image_bytes).decode('utf-8')

        message = HumanMessage(
            content=[
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ]
        )

        response = llm.invoke([message])
        return response.content

    except Exception as e:
        # Tool boundary: surface the failure as a string so the agent can
        # react to it instead of the whole run crashing.
        return f"Error in gemini_multimodal_tool: {e}"
527
+
528
# Expose the analyzer to the agent as a LangChain Tool. The function's
# docstring doubles as the tool description that the LLM reads.
gemini_multimodal_tool = Tool(
    func=analyze_image_with_gemini,
    name="gemini_multimodal_tool",
    description=analyze_image_with_gemini.__doc__,
)