Agent_GAIA_Benchmark / agent_tools /ImageReaderTool.py
DakshChaudhary's picture
Refactored + Added prompts + Added Tools (Calculator, FileDownloader, ImageReader, Pandas, WebSearch) + NebiusAI inference added
1b34f03
import base64
import mimetypes
import os

from llama_index.core.tools import FunctionTool
from llama_index.readers.file.image import ImageReader

from agent_models.models import get_vision_model_client
from agent_prompts.SystemPrompt import vision_model_system_prompt
def get_image_description(image_path: str) -> str:
    """Analyze a local image and return a text description of it.

    This tool is used to "see" what is in an image file. The file is read,
    base64-encoded into a ``data:`` URL and sent, together with
    ``vision_model_system_prompt``, to the client returned by
    ``get_vision_model_client()``.

    Args:
        image_path: The local file path of the image to analyze.

    Returns:
        The text content of the model's first choice. If anything fails
        (unreadable file, client or API error, empty reply), a string
        starting with ``"Error analyzing image:"`` is returned instead of
        raising.
    """
    try:
        print(f"Analyzing image at path: {image_path}")

        # Label the payload with the file's actual media type (JPEG, GIF,
        # WebP, ...) instead of always claiming PNG. PNG stays the fallback
        # when the extension is unknown or does not map to an image type.
        mime_type, _ = mimetypes.guess_type(image_path)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/png"

        # Read and encode the image
        with open(image_path, "rb") as img_file:
            b64_image = base64.b64encode(img_file.read()).decode("utf-8")
        b64_url = f"data:{mime_type};base64,{b64_image}"

        # Get Nebius client
        client = get_vision_model_client()

        # Call Nebius API
        response = client.chat.completions.create(
            model="Qwen/Qwen2-VL-72B-Instruct",
            messages=[
                {
                    "role": "system",
                    "content": vision_model_system_prompt,
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Here is an image."},
                        {"type": "image_url", "image_url": {"url": b64_url}},
                    ],
                },
            ],
        )

        description = response.choices[0].message.content
        if description is None:
            # Keep the declared ``str`` return type even when the reply
            # carries no text content.
            return "Error analyzing image: the vision model returned no text content."

        print(f"Vision model response: {description}")
        return description
    except Exception as e:
        # Tool boundary: failures are returned as text rather than raised.
        return f"Error analyzing image: {e}"
# Wrapper function to create the tool for our agent
def get_image_interpreter_tool() -> FunctionTool:
    """Build the ``image_interpreter`` tool around ``get_image_description``.

    Returns:
        A ``FunctionTool`` created with ``FunctionTool.from_defaults`` whose
        callable is ``get_image_description``.
    """
    tool_name = "image_interpreter"
    tool_description = "A tool to analyze an image from a local file path and return a detailed text description. Use this to 'see' what is in an image file that has already been downloaded."

    image_tool = FunctionTool.from_defaults(
        fn=get_image_description,
        name=tool_name,
        description=tool_description,
    )
    return image_tool