agents_gaia / tools /vision_tools.py
Isics's picture
initial commit
32844c7
import base64
import requests
from config import IP_WINDOWS
import ollama
from smolagents import tool
@tool
def analyze_image(image_path: str, question: str) -> str:
""" Analyze an image using a local vision model and answer a question about it.
Use this tool when you need to extract information from a jpg/png file.
Args:
image_path: The local path to the image file (e.g. 'images/grafico.png').
question: The specific question about what to look for in the image (e.g. 'What value is the red bar?').
Returns:
str: The answer to the question, based on the image.
Example:
>>> result = analise_image("images/grafico.png", "What value is the red bar?")
"""
# url = f"http://{IP_WINDOWS}:11434/api/generate"
try:
# Codificamos la imagen a base64 para enviarla por red
with open(image_path, "rb") as image_file:
img_str = base64.b64encode(image_file.read()).decode('utf-8')
# payload = {
# "model": "llava", # Asegúrate de tener este modelo en Windows
# "prompt": question,
# "images": [img_str],
# "stream": False
#}
# response = requests.post(url, json=payload)
response = ollama.chat(model='llava',
messages=[{'role': 'user',
'content': question,
'images': image_file}])
return response['messages']['content']
except Exception as e:
return f"Error conectando con Windows: {str(e)}"