# GAIA-Agent / tools / image_tools.py
# DenisRz's picture
# Initial upload: GAIA Agent
# 67d287e
"""
Image processing tools for the GAIA Agent.
Includes image analysis (GPT-4o vision), manipulation, annotation, and OCR.
"""
import os
import json
import tempfile
import base64
from typing import Optional
from langchain_core.tools import tool
import openai
from dotenv import load_dotenv
# Load environment variables (presumably from a local .env file, via
# python-dotenv) before the API key is read below.
load_dotenv()
# Module-level OpenAI client shared by the tools in this file; the key is
# taken from the OPENAI_API_KEY environment variable (None if it is unset).
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
@tool
def image_analyze(file_path: str, question: str) -> str:
    """Analyze an image (local path or URL) with GPT-4o vision.

    Use this to understand image contents, describe what's shown, read text,
    analyze diagrams, identify objects, or answer questions about images.

    Args:
        file_path: Path to the image file OR an http/https URL
        question: What you want to know about the image

    Returns:
        The model's answer as text, or a message starting with
        "Image analysis error:" if the request could not be completed.
    """
    # MIME types used when a local file is embedded as a base64 data URL.
    # Any other extension is sent as PNG.
    media_types = {
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "gif": "image/gif",
        "webp": "image/webp",
    }
    try:
        # Tool arguments are produced by a model and often carry stray
        # whitespace or newlines that would break the URL check and open().
        file_path = file_path.strip()

        if file_path.lower().startswith(("http://", "https://")):
            # Remote images are passed by reference.
            image_url = file_path
        else:
            # Local images are inlined as a base64 data URL.
            with open(file_path, "rb") as img_file:
                encoded = base64.b64encode(img_file.read()).decode("utf-8")
            # splitext only inspects the last path component, so dots in
            # directory names cannot be mistaken for an extension.
            ext = os.path.splitext(file_path)[1].lower().lstrip(".")
            media_type = media_types.get(ext, "image/png")
            image_url = f"data:{media_type};base64,{encoded}"

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        {"type": "image_url", "image_url": {"url": image_url}},
                    ],
                }
            ],
            max_tokens=800,
        )

        answer = response.choices[0].message.content
        # The message content is optional in the API response; never hand
        # None back to a caller that expects a string.
        if answer is None:
            return "Image analysis error: the model returned no text content"
        return answer
    except Exception as e:
        # Tools report failures as text instead of raising.
        return f"Image analysis error: {str(e)}"
@tool
def image_manipulate(
    file_path: str,
    operation: str,
    params: str = "{}"
) -> str:
    """Manipulate an image file using PIL/Pillow.

    Operations available:
    - crop: Crop image. Params: {"box": [left, top, right, bottom]}
    - rotate: Rotate image. Params: {"angle": 90} (degrees, counterclockwise)
    - resize: Resize image. Params: {"width": 800, "height": 600} or {"scale": 0.5}
    - flip: Flip image. Params: {"direction": "horizontal"} or {"direction": "vertical"}
    - grayscale: Convert to grayscale. No params needed.
    - brightness: Adjust brightness. Params: {"factor": 1.5} (1.0 = original)
    - contrast: Adjust contrast. Params: {"factor": 1.5} (1.0 = original)
    - sharpen: Sharpen image. Params: {"factor": 2.0} (1.0 = original)
    - blur: Apply Gaussian blur. Params: {"radius": 2}
    - thumbnail: Create thumbnail. Params: {"size": [128, 128]}

    Args:
        file_path: Path to the image file
        operation: One of the operations listed above
        params: JSON string with operation parameters

    Returns:
        A summary containing the operation, the original and new size and
        the path of the result (saved in the system temp directory as
        "manipulated_<original file name>"), or an error message.
    """
    try:
        from PIL import Image, ImageEnhance, ImageFilter

        # Parse parameters
        parse_error = f"Error parsing params: {params}. Use JSON format like {{\"angle\": 90}}"
        try:
            p = json.loads(params) if params else {}
        except json.JSONDecodeError:
            return parse_error
        if p is None:
            # JSON "null" means "no parameters".
            p = {}
        if not isinstance(p, dict):
            # Lists, numbers, etc. would otherwise fail later with an
            # unhelpful AttributeError on p.get().
            return parse_error

        # Open the image once, remember what is needed for the report and
        # detach a copy so the file handle is released right away.
        with Image.open(file_path) as source:
            original_format = source.format or "PNG"
            original_size = source.size
            img = source.copy()

        operation = operation.lower().strip()

        if operation == "crop":
            if "box" not in p:
                return "Error: crop requires 'box' param: {\"box\": [left, top, right, bottom]}"
            img = img.crop(tuple(p["box"]))
        elif operation == "rotate":
            angle = p.get("angle", 90)
            # expand=True grows the canvas so rotated corners are not cut off.
            expand = p.get("expand", True)
            img = img.rotate(angle, expand=expand)
        elif operation == "resize":
            if "scale" in p:
                new_width = int(img.width * p["scale"])
                new_height = int(img.height * p["scale"])
            elif "width" in p and "height" in p:
                new_width = p["width"]
                new_height = p["height"]
            elif "width" in p:
                # Only one side given: keep the aspect ratio.
                new_width = p["width"]
                new_height = int(img.height * (p["width"] / img.width))
            elif "height" in p:
                new_height = p["height"]
                new_width = int(img.width * (p["height"] / img.height))
            else:
                return "Error: resize requires 'width'/'height' or 'scale' param"
            img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
        elif operation == "flip":
            direction = p.get("direction", "horizontal")
            if direction == "horizontal":
                img = img.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
            elif direction == "vertical":
                img = img.transpose(Image.Transpose.FLIP_TOP_BOTTOM)
            else:
                return "Error: flip direction must be 'horizontal' or 'vertical'"
        elif operation == "grayscale":
            img = img.convert("L")
        elif operation == "brightness":
            img = ImageEnhance.Brightness(img).enhance(p.get("factor", 1.0))
        elif operation == "contrast":
            img = ImageEnhance.Contrast(img).enhance(p.get("factor", 1.0))
        elif operation == "sharpen":
            img = ImageEnhance.Sharpness(img).enhance(p.get("factor", 2.0))
        elif operation == "blur":
            img = img.filter(ImageFilter.GaussianBlur(radius=p.get("radius", 2)))
        elif operation == "thumbnail":
            size = tuple(p.get("size", [128, 128]))
            # thumbnail() shrinks in place and preserves the aspect ratio.
            img.thumbnail(size, Image.Resampling.LANCZOS)
        else:
            return f"Unknown operation: {operation}. Available: crop, rotate, resize, flip, grayscale, brightness, contrast, sharpen, blur, thumbnail"

        # Save to temp file, keeping the original file name and format.
        output_path = os.path.join(tempfile.gettempdir(), f"manipulated_{os.path.basename(file_path)}")

        # JPEG cannot store alpha or palette data. The check is based on the
        # format that is actually written, not on the file extension, so a
        # PNG that merely carries a .jpg name keeps its transparency.
        if original_format in ("JPEG", "MPO") and img.mode in ("RGBA", "LA", "P"):
            img = img.convert("RGB")

        img.save(output_path, format=original_format)
        return f"Image manipulated successfully.\nOperation: {operation}\nOriginal size: {original_size}\nNew size: {img.size}\nSaved to: {output_path}"
    except ImportError:
        return "Error: Pillow is not installed. Please install it with: pip install Pillow"
    except Exception as e:
        # Tools report failures as text instead of raising.
        return f"Image manipulation error: {str(e)}"
@tool
def image_annotate(
    file_path: str,
    annotations: str
) -> str:
    """Add annotations (text, rectangles, circles, lines, arrows) to an image.

    Annotations format (JSON array):
    [
    {"type": "text", "text": "Label", "position": [x, y], "color": "red", "size": 20},
    {"type": "rectangle", "box": [x1, y1, x2, y2], "color": "blue", "width": 2},
    {"type": "circle", "center": [x, y], "radius": 50, "color": "green", "width": 2},
    {"type": "line", "start": [x1, y1], "end": [x2, y2], "color": "yellow", "width": 2},
    {"type": "arrow", "start": [x1, y1], "end": [x2, y2], "color": "red", "width": 2}
    ]
    Rectangles and circles also accept an optional "fill" color.
    Colors can be: "red", "green", "blue", "yellow", "white", "black", "orange", "purple",
    "cyan", "magenta", a color string such as "#ff8800", or RGB tuple like [255, 0, 0]

    Args:
        file_path: Path to the image file
        annotations: JSON string with list of annotations

    Returns:
        A summary with the number of annotations drawn and the path of the
        result (saved in the system temp directory as
        "annotated_<original file name>"), or an error message.
    """
    try:
        from PIL import Image, ImageColor, ImageDraw, ImageFont
        import math

        # Parse annotations
        try:
            annots = json.loads(annotations)
        except json.JSONDecodeError:
            return f"Error parsing annotations: {annotations}. Use JSON array format."
        if not isinstance(annots, list):
            # A single annotation object is accepted as a one-element list.
            annots = [annots]

        # Open the image; convert() returns a detached copy, so the file
        # handle can be closed immediately.
        with Image.open(file_path) as source:
            img = source.convert('RGBA')
        draw = ImageDraw.Draw(img)

        # Color mapping (takes precedence over Pillow's own color names so
        # existing callers keep the exact same shades).
        color_map = {
            "red": (255, 0, 0),
            "green": (0, 255, 0),
            "blue": (0, 0, 255),
            "yellow": (255, 255, 0),
            "white": (255, 255, 255),
            "black": (0, 0, 0),
            "orange": (255, 165, 0),
            "purple": (128, 0, 128),
            "cyan": (0, 255, 255),
            "magenta": (255, 0, 255),
        }
        default_color = (255, 0, 0)

        def get_color(c):
            """Resolve a color spec to a tuple; unknown specs become red."""
            if isinstance(c, str):
                named = color_map.get(c.strip().lower())
                if named is not None:
                    return named
                try:
                    # Handles hex codes and the remaining named colors.
                    return ImageColor.getrgb(c.strip())
                except ValueError:
                    return default_color
            if isinstance(c, (list, tuple)):
                return tuple(c)
            return default_color

        # Try to load a font, fall back to default
        font_paths = [
            "arial.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
            "/System/Library/Fonts/Helvetica.ttc",
            "C:/Windows/Fonts/arial.ttf",
        ]

        def get_font(size):
            """Return the first loadable TrueType font, else Pillow's default."""
            for fp in font_paths:
                try:
                    return ImageFont.truetype(fp, size)
                except Exception:
                    # Best effort: a missing font file or bad size must not
                    # abort the annotation. Unlike a bare except this still
                    # lets KeyboardInterrupt/SystemExit through.
                    continue
            return ImageFont.load_default()

        def normalize_box(box):
            """Order a flat [x0, y0, x1, y1] box so that x0 <= x1 and y0 <= y1."""
            # Pillow rejects boxes whose second corner precedes the first.
            # Other box shapes are passed through untouched.
            if len(box) == 4 and all(isinstance(v, (int, float)) for v in box):
                x0, y0, x1, y1 = box
                return [min(x0, x1), min(y0, y1), max(x0, x1), max(y0, y1)]
            return box

        # Process each annotation
        drawn = 0
        skipped = []
        for index, annot in enumerate(annots, start=1):
            if not isinstance(annot, dict):
                skipped.append(f"#{index} (not a JSON object)")
                continue

            atype = str(annot.get("type", "")).lower()
            color = get_color(annot.get("color", "red"))
            width = annot.get("width", 2)

            if atype == "text":
                text = str(annot.get("text", ""))
                position = tuple(annot.get("position", [10, 10]))
                font = get_font(annot.get("size", 20))
                draw.text(position, text, fill=color, font=font)
            elif atype == "rectangle":
                box = normalize_box(annot.get("box", [0, 0, 100, 100]))
                fill = annot.get("fill")
                fill_color = get_color(fill) if fill else None
                draw.rectangle(box, outline=color, width=width, fill=fill_color)
            elif atype == "circle":
                center = annot.get("center", [50, 50])
                radius = annot.get("radius", 25)
                # Bounding box of the circle.
                box = normalize_box([
                    center[0] - radius,
                    center[1] - radius,
                    center[0] + radius,
                    center[1] + radius,
                ])
                fill = annot.get("fill")
                fill_color = get_color(fill) if fill else None
                draw.ellipse(box, outline=color, width=width, fill=fill_color)
            elif atype == "line":
                start = tuple(annot.get("start", [0, 0]))
                end = tuple(annot.get("end", [100, 100]))
                draw.line([start, end], fill=color, width=width)
            elif atype == "arrow":
                start = annot.get("start", [0, 0])
                end = annot.get("end", [100, 100])
                draw.line([tuple(start), tuple(end)], fill=color, width=width)
                # Draw arrowhead: a filled triangle at `end` whose two back
                # corners sit 30 degrees either side of the shaft direction.
                angle = math.atan2(end[1] - start[1], end[0] - start[0])
                arrow_length = 15
                arrow_angle = math.pi / 6  # 30 degrees
                p1 = (
                    end[0] - arrow_length * math.cos(angle - arrow_angle),
                    end[1] - arrow_length * math.sin(angle - arrow_angle),
                )
                p2 = (
                    end[0] - arrow_length * math.cos(angle + arrow_angle),
                    end[1] - arrow_length * math.sin(angle + arrow_angle),
                )
                draw.polygon([tuple(end), p1, p2], fill=color)
            else:
                skipped.append(f"#{index} (unknown type: {atype!r})")
                continue
            drawn += 1

        # Save to temp file. img.save() infers the format from the file
        # extension, so fall back to PNG when the extension is missing or
        # not known to Pillow.
        out_name = f"annotated_{os.path.basename(file_path)}"
        ext = os.path.splitext(out_name)[1].lower()
        if ext not in Image.registered_extensions():
            out_name += ".png"
            ext = ".png"
        output_path = os.path.join(tempfile.gettempdir(), out_name)

        # Convert back to RGB if saving as JPEG (no alpha channel support)
        if ext in ('.jpg', '.jpeg'):
            img = img.convert('RGB')
        img.save(output_path)

        summary = f"Image annotated successfully.\nAnnotations added: {drawn}"
        if skipped:
            summary += f"\nAnnotations skipped: {', '.join(skipped)}"
        return f"{summary}\nSaved to: {output_path}"
    except ImportError:
        return "Error: Pillow is not installed. Please install it with: pip install Pillow"
    except Exception as e:
        # Tools report failures as text instead of raising.
        return f"Image annotation error: {str(e)}"
@tool
def image_ocr(file_path: str, lang: str = "eng") -> str:
    """Extract text from an image using OCR (Optical Character Recognition).

    Uses Tesseract OCR engine. Requires tesseract to be installed on the system.

    Args:
        file_path: Path to the image file
        lang: Language code for OCR (default: "eng" for English).
            Common codes: eng, fra, deu, spa, ita, por, chi_sim, chi_tra, jpn, kor

    Returns:
        The recognized text, followed by the word count and average
        confidence when those statistics are available, or an error message.
    """
    try:
        import pytesseract
        from PIL import Image

        # Keep the file open only for the duration of the OCR calls.
        with Image.open(file_path) as source:
            # Convert to RGB if necessary
            img = source if source.mode in ('RGB', 'L') else source.convert('RGB')

            # Extract text
            text = pytesseract.image_to_string(img, lang=lang)
            separator = '-' * 40
            report = f"OCR Result:\n{separator}\n{text.strip()}\n{separator}"

            # Also get structured data with confidence
            try:
                data = pytesseract.image_to_data(img, lang=lang, output_type=pytesseract.Output.DICT)
                # Confidences may arrive as ints, floats or numeric strings
                # (including fractional ones), so parse them as float.
                confidences = []
                for raw_conf in data['conf']:
                    conf = float(raw_conf)
                    if conf > 0:
                        confidences.append(conf)
                avg_confidence = sum(confidences) / len(confidences) if confidences else 0
                word_count = sum(1 for w in data['text'] if str(w).strip())
            except Exception:
                # The statistics are an optional extra; the recognized text
                # is still returned when they cannot be computed.
                return report
            return f"{report}\nWords detected: {word_count}\nAverage confidence: {avg_confidence:.1f}%"
    except ImportError as e:
        if "pytesseract" in str(e):
            return "Error: pytesseract is not installed. Please install it with: pip install pytesseract\nAlso ensure Tesseract OCR is installed on your system."
        return f"Import error: {str(e)}"
    except Exception as e:
        error_msg = str(e)
        # A missing Tesseract binary is the most common failure; add
        # install hints in that case.
        if "tesseract" in error_msg.lower():
            return f"Tesseract OCR error: {error_msg}\n\nMake sure Tesseract is installed:\n- Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n- Mac: brew install tesseract\n- Linux: sudo apt install tesseract-ocr"
        return f"OCR error: {error_msg}"