Spaces:

DenisRz
/

GAIA-Agent

Sleeping

App Files Files Community

GAIA-Agent / tools /image_tools.py

DenisRz

Initial upload: GAIA Agent

67d287e about 2 months ago

raw

history blame contribute delete

15.4 kB

	"""
	Image processing tools for the GAIA Agent.
	Includes image analysis (GPT-4o vision), manipulation, annotation, and OCR.
	"""

	import os
	import json
	import tempfile
	import base64
	from typing import Optional
	from langchain_core.tools import tool
	import openai
	from dotenv import load_dotenv

	# Module initialisation (runs at import time).
	# load_dotenv() comes from python-dotenv; presumably it populates os.environ
	# from a local .env file so OPENAI_API_KEY can be supplied there -- confirm
	# against the deployment setup.
	load_dotenv()
	# Shared OpenAI client used by image_analyze. api_key is whatever
	# os.getenv returns, i.e. None when OPENAI_API_KEY is not set.
	client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


	@tool
	def image_analyze(file_path: str, question: str) -> str:
	    """Analyze an image (local path or URL) with GPT-4o vision.

	    Use this to understand image contents, describe what's shown, read text,
	    analyze diagrams, identify objects, or answer questions about images.

	    Args:
	        file_path: Path to the image file OR an http/https URL
	        question: What you want to know about the image

	    Returns:
	        The model's answer as text. On any failure a string starting with
	        "Image analysis error:" is returned instead of raising.
	    """
	    try:
	        # Remote images are handed to the API by reference; local files are
	        # inlined as a base64 data URL.
	        is_url = file_path.lower().startswith(("http://", "https://"))

	        if is_url:
	            image_url = file_path
	        else:
	            with open(file_path, "rb") as img_file:
	                image_data = base64.b64encode(img_file.read()).decode("utf-8")
	            # splitext only inspects the last path component, so a dot in a
	            # directory name is never mistaken for a file extension.
	            ext = os.path.splitext(file_path)[1].lower().lstrip(".")
	            media_type = {
	                "png": "image/png",
	                "jpg": "image/jpeg",
	                "jpeg": "image/jpeg",
	                "gif": "image/gif",
	                "webp": "image/webp",
	            }.get(ext, "image/png")  # unknown/missing extension -> PNG
	            image_url = f"data:{media_type};base64,{image_data}"

	        image_content = {"type": "image_url", "image_url": {"url": image_url}}

	        response = client.chat.completions.create(
	            model="gpt-4o",
	            messages=[
	                {
	                    "role": "user",
	                    "content": [
	                        {"type": "text", "text": question},
	                        image_content,
	                    ],
	                }
	            ],
	            max_tokens=800,
	        )

	        answer = response.choices[0].message.content
	        # content is optional in the API response (e.g. refusals); never
	        # hand None back to the agent where a string is promised.
	        if answer is None:
	            return "Image analysis error: the model returned no text content"
	        return answer
	    except Exception as e:
	        # Tool boundary: report the failure to the agent as text.
	        return f"Image analysis error: {str(e)}"


	@tool
	def image_manipulate(
	    file_path: str,
	    operation: str,
	    params: str = "{}"
	) -> str:
	    """Manipulate an image file using PIL/Pillow.

	    Operations available:
	    - crop: Crop image. Params: {"box": [left, top, right, bottom]}
	    - rotate: Rotate image. Params: {"angle": 90} (degrees, counterclockwise)
	    - resize: Resize image. Params: {"width": 800, "height": 600} or {"scale": 0.5}
	    - flip: Flip image. Params: {"direction": "horizontal"} or {"direction": "vertical"}
	    - grayscale: Convert to grayscale. No params needed.
	    - brightness: Adjust brightness. Params: {"factor": 1.5} (1.0 = original)
	    - contrast: Adjust contrast. Params: {"factor": 1.5} (1.0 = original)
	    - sharpen: Sharpen image. Params: {"factor": 2.0} (1.0 = original)
	    - blur: Apply Gaussian blur. Params: {"radius": 2}
	    - thumbnail: Create thumbnail. Params: {"size": [128, 128]}

	    Args:
	        file_path: Path to the image file
	        operation: One of the operations listed above
	        params: JSON string with operation parameters

	    Returns:
	        A summary containing the operation, original size, new size and the
	        output path (system temp directory, "manipulated_<original name>"),
	        or an error message. The tool never raises.
	    """
	    try:
	        from PIL import Image, ImageEnhance, ImageFilter

	        # Parse parameters. Anything that is not a JSON object (invalid JSON,
	        # a list, null, a number...) is rejected here so the per-operation
	        # code below can rely on dict access.
	        try:
	            p = json.loads(params) if params else {}
	        except json.JSONDecodeError:
	            p = None
	        if not isinstance(p, dict):
	            return f"Error parsing params: {params}. Use JSON format like {{\"angle\": 90}}"

	        operation = operation.lower().strip()

	        # Open the image once; the context manager releases the file handle
	        # on every exit path, including the early error returns below.
	        with Image.open(file_path) as source:
	            original_format = source.format or "PNG"
	            original_size = source.size
	            img = source

	            if operation == "crop":
	                if "box" not in p:
	                    return "Error: crop requires 'box' param: {\"box\": [left, top, right, bottom]}"
	                box = tuple(p["box"])
	                img = img.crop(box)

	            elif operation == "rotate":
	                angle = p.get("angle", 90)
	                # expand=True grows the canvas so rotated corners are kept.
	                expand = p.get("expand", True)
	                img = img.rotate(angle, expand=expand)

	            elif operation == "resize":
	                # A single dimension keeps the aspect ratio; "scale" wins
	                # over explicit dimensions when both are given.
	                if "scale" in p:
	                    new_width = int(img.width * p["scale"])
	                    new_height = int(img.height * p["scale"])
	                elif "width" in p and "height" in p:
	                    new_width = int(p["width"])
	                    new_height = int(p["height"])
	                elif "width" in p:
	                    new_width = int(p["width"])
	                    new_height = int(img.height * (p["width"] / img.width))
	                elif "height" in p:
	                    new_height = int(p["height"])
	                    new_width = int(img.width * (p["height"] / img.height))
	                else:
	                    return "Error: resize requires 'width'/'height' or 'scale' param"
	                if new_width < 1 or new_height < 1:
	                    return f"Error: resize would produce an empty image ({new_width}x{new_height})"
	                img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

	            elif operation == "flip":
	                direction = p.get("direction", "horizontal")
	                if direction == "horizontal":
	                    img = img.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
	                elif direction == "vertical":
	                    img = img.transpose(Image.Transpose.FLIP_TOP_BOTTOM)
	                else:
	                    return "Error: flip direction must be 'horizontal' or 'vertical'"

	            elif operation == "grayscale":
	                img = img.convert("L")

	            elif operation == "brightness":
	                factor = p.get("factor", 1.0)
	                img = ImageEnhance.Brightness(img).enhance(factor)

	            elif operation == "contrast":
	                factor = p.get("factor", 1.0)
	                img = ImageEnhance.Contrast(img).enhance(factor)

	            elif operation == "sharpen":
	                factor = p.get("factor", 2.0)
	                img = ImageEnhance.Sharpness(img).enhance(factor)

	            elif operation == "blur":
	                radius = p.get("radius", 2)
	                img = img.filter(ImageFilter.GaussianBlur(radius=radius))

	            elif operation == "thumbnail":
	                size = tuple(p.get("size", [128, 128]))
	                # thumbnail() resizes in place and preserves aspect ratio.
	                img.thumbnail(size, Image.Resampling.LANCZOS)

	            else:
	                return f"Unknown operation: {operation}. Available: crop, rotate, resize, flip, grayscale, brightness, contrast, sharpen, blur, thumbnail"

	            # Save to temp file
	            output_path = os.path.join(tempfile.gettempdir(), f"manipulated_{os.path.basename(file_path)}")

	            # JPEG cannot store alpha or palette data. Decide by the format
	            # that is actually written, not by the file name.
	            if original_format.upper() == "JPEG" and img.mode in ['RGBA', 'LA', 'P']:
	                img = img.convert('RGB')

	            img.save(output_path, format=original_format)
	            new_size = img.size

	        return f"Image manipulated successfully.\nOperation: {operation}\nOriginal size: {original_size}\nNew size: {new_size}\nSaved to: {output_path}"

	    except ImportError:
	        return "Error: Pillow is not installed. Please install it with: pip install Pillow"
	    except Exception as e:
	        # Tool boundary: report the failure to the agent as text.
	        return f"Image manipulation error: {str(e)}"


	@tool
	def image_annotate(
	    file_path: str,
	    annotations: str
	) -> str:
	    """Add annotations (text, rectangles, circles, lines) to an image.

	    Annotations format (JSON array):
	    [
	    {"type": "text", "text": "Label", "position": [x, y], "color": "red", "size": 20},
	    {"type": "rectangle", "box": [x1, y1, x2, y2], "color": "blue", "width": 2},
	    {"type": "circle", "center": [x, y], "radius": 50, "color": "green", "width": 2},
	    {"type": "line", "start": [x1, y1], "end": [x2, y2], "color": "yellow", "width": 2},
	    {"type": "arrow", "start": [x1, y1], "end": [x2, y2], "color": "red", "width": 2}
	    ]

	    Colors can be: "red", "green", "blue", "yellow", "white", "black", "orange", "purple", or RGB tuple like [255, 0, 0]

	    Args:
	        file_path: Path to the image file
	        annotations: JSON string with list of annotations

	    Returns:
	        A summary with the number of annotations drawn and the output path
	        (system temp directory, "annotated_<original name>"). Entries that
	        are not JSON objects or have an unknown type are skipped and listed.
	        On failure an error message is returned; the tool never raises.
	    """
	    try:
	        from PIL import Image, ImageDraw, ImageFont
	        import math

	        # Parse annotations
	        try:
	            annots = json.loads(annotations)
	        except json.JSONDecodeError:
	            return f"Error parsing annotations: {annotations}. Use JSON array format."

	        # A single annotation object is accepted as a one-element list.
	        if not isinstance(annots, list):
	            annots = [annots]

	        # convert() returns a loaded copy, so the source file can be closed
	        # before drawing starts.
	        with Image.open(file_path) as source:
	            img = source.convert('RGBA')

	        draw = ImageDraw.Draw(img)

	        # Color mapping
	        color_map = {
	            "red": (255, 0, 0),
	            "green": (0, 255, 0),
	            "blue": (0, 0, 255),
	            "yellow": (255, 255, 0),
	            "white": (255, 255, 255),
	            "black": (0, 0, 0),
	            "orange": (255, 165, 0),
	            "purple": (128, 0, 128),
	            "cyan": (0, 255, 255),
	            "magenta": (255, 0, 255),
	        }

	        def get_color(c):
	            """Resolve a color name or [r, g, b] list; fall back to red."""
	            if isinstance(c, str):
	                return color_map.get(c.lower(), (255, 0, 0))
	            elif isinstance(c, list):
	                return tuple(c)
	            return (255, 0, 0)

	        def get_font(size):
	            """Return the first loadable TrueType font, else Pillow's default."""
	            font_paths = [
	                "arial.ttf",
	                "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
	                "/System/Library/Fonts/Helvetica.ttc",
	                "C:/Windows/Fonts/arial.ttf",
	            ]
	            for fp in font_paths:
	                try:
	                    return ImageFont.truetype(fp, size)
	                except Exception:
	                    # Deliberately best-effort: a missing font or unusable
	                    # size just means we try the next candidate.
	                    continue
	            return ImageFont.load_default()

	        drawn = 0
	        skipped = []

	        # Process each annotation
	        for index, annot in enumerate(annots):
	            if not isinstance(annot, dict):
	                skipped.append(f"#{index} (not a JSON object)")
	                continue

	            atype = str(annot.get("type", "")).lower()
	            color = get_color(annot.get("color", "red"))
	            width = annot.get("width", 2)

	            if atype == "text":
	                text = annot.get("text", "")
	                position = tuple(annot.get("position", [10, 10]))
	                size = annot.get("size", 20)
	                font = get_font(size)
	                draw.text(position, text, fill=color, font=font)

	            elif atype == "rectangle":
	                box = annot.get("box", [0, 0, 100, 100])
	                fill = annot.get("fill")
	                fill_color = get_color(fill) if fill else None
	                draw.rectangle(box, outline=color, width=width, fill=fill_color)

	            elif atype == "circle":
	                center = annot.get("center", [50, 50])
	                radius = annot.get("radius", 25)
	                # Bounding box of the circle, as required by ellipse().
	                box = [center[0] - radius, center[1] - radius, center[0] + radius, center[1] + radius]
	                fill = annot.get("fill")
	                fill_color = get_color(fill) if fill else None
	                draw.ellipse(box, outline=color, width=width, fill=fill_color)

	            elif atype == "line":
	                start = tuple(annot.get("start", [0, 0]))
	                end = tuple(annot.get("end", [100, 100]))
	                draw.line([start, end], fill=color, width=width)

	            elif atype == "arrow":
	                start = annot.get("start", [0, 0])
	                end = annot.get("end", [100, 100])
	                draw.line([tuple(start), tuple(end)], fill=color, width=width)

	                # Arrowhead: a filled triangle at `end` whose two back
	                # corners sit 15 px behind the tip, 30 degrees either side
	                # of the shaft direction.
	                angle = math.atan2(end[1] - start[1], end[0] - start[0])
	                arrow_length = 15
	                arrow_angle = math.pi / 6  # 30 degrees

	                p1 = (
	                    end[0] - arrow_length * math.cos(angle - arrow_angle),
	                    end[1] - arrow_length * math.sin(angle - arrow_angle)
	                )
	                p2 = (
	                    end[0] - arrow_length * math.cos(angle + arrow_angle),
	                    end[1] - arrow_length * math.sin(angle + arrow_angle)
	                )
	                draw.polygon([tuple(end), p1, p2], fill=color)

	            else:
	                skipped.append(f"#{index} (unknown type {atype!r})")
	                continue

	            drawn += 1

	        # Save to temp file
	        output_path = os.path.join(tempfile.gettempdir(), f"annotated_{os.path.basename(file_path)}")

	        # Convert back to RGB if saving as JPEG (no alpha channel support)
	        ext = file_path.lower().split('.')[-1]
	        if ext in ['jpg', 'jpeg']:
	            img = img.convert('RGB')

	        img.save(output_path)

	        summary = f"Image annotated successfully.\nAnnotations added: {drawn}\nSaved to: {output_path}"
	        if skipped:
	            summary += "\nSkipped annotations: " + ", ".join(skipped)
	        return summary

	    except ImportError:
	        return "Error: Pillow is not installed. Please install it with: pip install Pillow"
	    except Exception as e:
	        # Tool boundary: report the failure to the agent as text.
	        return f"Image annotation error: {str(e)}"


	@tool
	def image_ocr(file_path: str, lang: str = "eng") -> str:
	    """Extract text from an image using OCR (Optical Character Recognition).

	    Uses Tesseract OCR engine. Requires tesseract to be installed on the system.

	    Args:
	        file_path: Path to the image file
	        lang: Language code for OCR (default: "eng" for English).
	            Common codes: eng, fra, deu, spa, ita, por, chi_sim, chi_tra, jpn, kor

	    Returns:
	        The recognised text framed by separator lines, followed by the word
	        count and average confidence when those statistics can be computed.
	        On failure an error message (with install hints for Tesseract
	        problems) is returned; the tool never raises.
	    """
	    try:
	        import pytesseract
	        from PIL import Image

	        separator = '-' * 40

	        # The context manager closes the image file once OCR has finished.
	        with Image.open(file_path) as source:
	            # Convert to RGB if necessary
	            if source.mode not in ['RGB', 'L']:
	                img = source.convert('RGB')
	            else:
	                img = source

	            # Extract text
	            text = pytesseract.image_to_string(img, lang=lang)
	            result = f"OCR Result:\n{separator}\n{text.strip()}\n{separator}"

	            # Also get structured data with confidence. This part is
	            # best-effort: if it fails the plain text is still returned.
	            try:
	                data = pytesseract.image_to_data(img, lang=lang, output_type=pytesseract.Output.DICT)

	                # Confidences may arrive as ints, floats or decimal strings
	                # depending on the pytesseract/Tesseract version, so parse
	                # via float. Only positive values are averaged.
	                confidences = [float(c) for c in data['conf'] if float(c) > 0]
	                avg_confidence = sum(confidences) / len(confidences) if confidences else 0
	                word_count = len([w for w in data['text'] if w.strip()])
	            except Exception:
	                return result

	        return f"{result}\nWords detected: {word_count}\nAverage confidence: {avg_confidence:.1f}%"

	    except ImportError as e:
	        if "pytesseract" in str(e):
	            return "Error: pytesseract is not installed. Please install it with: pip install pytesseract\nAlso ensure Tesseract OCR is installed on your system."
	        return f"Import error: {str(e)}"
	    except Exception as e:
	        error_msg = str(e)
	        # Errors mentioning tesseract get installation hints appended.
	        if "tesseract" in error_msg.lower():
	            return f"Tesseract OCR error: {error_msg}\n\nMake sure Tesseract is installed:\n- Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n- Mac: brew install tesseract\n- Linux: sudo apt install tesseract-ocr"
	        return f"OCR error: {error_msg}"