Spaces:
Runtime error
Runtime error
Update helper.py
Browse files
helper.py
CHANGED
|
@@ -438,4 +438,96 @@ serpapi_Google_Search_tool = Tool(
|
|
| 438 |
# tools = [travily_api_search_tool, python_repl, ..., serpapi_Google Search_tool]
|
| 439 |
#
|
| 440 |
# And you would need to update your prompt's "Available Tools" section
|
| 441 |
-
# to describe `serpapi_Google Search` to the LLM.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
# tools = [travily_api_search_tool, python_repl, ..., serpapi_Google Search_tool]
|
| 439 |
#
|
| 440 |
# And you would need to update your prompt's "Available Tools" section
|
| 441 |
+
# to describe `serpapi_Google Search` to the LLM.
|
| 442 |
+
|
| 443 |
+
# In helper.py
|
| 444 |
+
|
| 445 |
+
import base64
|
| 446 |
+
from langchain.tools import Tool
|
| 447 |
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
| 448 |
+
from langchain_core.messages import HumanMessage
|
| 449 |
+
import os
|
| 450 |
+
|
| 451 |
+
# Your existing tools (PythonREPL, TavilySearchResults, file_saver, audio_transcriber, Wikipedia, SerpAPI) go here...
|
| 452 |
+
# ... (rest of your helper.py code for other tools) ...
|
| 453 |
+
|
| 454 |
+
def analyze_image_with_gemini(args: dict) -> str:
    """
    Analyze a local image with Google's Gemini multimodal LLM and answer a question about it.

    Designed for tasks requiring visual understanding, such as describing image
    content, identifying objects, or answering questions about information
    presented visually (e.g., charts, diagrams, chess boards).

    **Input Format (CRITICAL):**
    The input MUST be a JSON string (or dict) with 'image_path' and 'question' keys.
    - 'image_path': The local file path to the image (e.g., 'path/to/my_image.png').
      This image MUST have been previously downloaded and saved locally using the
      'file_saver' tool.
    - 'question': The question to answer based on the image content.

    Example: '{"image_path": "downloaded_image.png", "question": "What is depicted in this image?"}'
    Example: '{"image_path": "chess_board.jpg", "question": "What is the next best move in this chess position?"}'

    **DO NOT:**
    - Pass URLs directly to this tool; always use 'file_saver' first.
    - Ask questions unrelated to the image content.
    - Expect real-time actions or external website access.

    **Output:**
    Returns the answer generated by the Gemini multimodal LLM based on the image
    and question. Returns an informative error message (never raises) if the
    image file is not found, the API key is missing, or the LLM call fails.
    """
    try:
        # Agent frameworks frequently pass tool input as a raw JSON string;
        # normalize it to a dict before reading keys.
        if isinstance(args, str):
            import json
            args = json.loads(args)

        image_path = args.get("image_path")
        question = args.get("question")

        if not image_path or not question:
            return "Error: Both 'image_path' and 'question' must be provided."

        if not os.path.exists(image_path):
            return f"Error: Local image file not found at '{image_path}'. Did you save it with 'file_saver'?"

        google_api_key = os.getenv("GOOGLE_API_KEY")
        if not google_api_key:
            return "Error: GOODLE_API_KEY not found in environment variables for multimodal tool." if False else "Error: GOOGLE_API_KEY not found in environment variables for multimodal tool."

        # BUG FIX: ChatGoogleGenerativeAI has no `get_available_models` classmethod,
        # so the previous model-selection expression raised AttributeError on every
        # call and the tool could never succeed. Use a current multimodal model
        # directly ('gemini-pro-vision' is retired; gemini-2.0-flash accepts images).
        llm = ChatGoogleGenerativeAI(
            model="gemini-2.0-flash",
            google_api_key=google_api_key,
            temperature=0.0,  # deterministic, factual answers for tool use
        )

        # Read the image and embed it as a base64 data URL for the multimodal message.
        with open(image_path, "rb") as f:
            image_bytes = f.read()
        image_base64 = base64.b64encode(image_bytes).decode("utf-8")

        # BUG FIX: the MIME type was hard-coded to image/png even for .jpg inputs
        # (the docstring's own second example). Infer it from the file extension,
        # falling back to PNG when the extension is unknown.
        import mimetypes
        mime_type = mimetypes.guess_type(image_path)[0] or "image/png"

        # Build a single multimodal HumanMessage: the question plus the inline image.
        message = HumanMessage(
            content=[
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{image_base64}"}},
            ]
        )

        response = llm.invoke([message])
        return response.content

    except Exception as e:
        # Surface any failure to the agent as a readable string rather than raising,
        # so the agent loop can recover and retry with corrected input.
        return f"Error in gemini_multimodal_tool: {e}"
|
| 527 |
+
|
| 528 |
+
# Expose the image-analysis function to the agent as a LangChain Tool.
# The function's own docstring doubles as the tool description the LLM reads.
gemini_multimodal_tool = Tool(
    func=analyze_image_with_gemini,
    name="gemini_multimodal_tool",
    description=analyze_image_with_gemini.__doc__,
)
|