""" Image processing tools for the GAIA Agent. Includes image analysis (GPT-4o vision), manipulation, annotation, and OCR. """ import os import json import tempfile import base64 from typing import Optional from langchain_core.tools import tool import openai from dotenv import load_dotenv load_dotenv() client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY")) @tool def image_analyze(file_path: str, question: str) -> str: """Analyze an image (local path or URL) with GPT-4o vision. Use this to understand image contents, describe what's shown, read text, analyze diagrams, identify objects, or answer questions about images. Args: file_path: Path to the image file OR an http/https URL question: What you want to know about the image """ try: # Decide whether this is a URL or a local file is_url = file_path.lower().startswith(("http://", "https://")) if is_url: image_content = {"type": "image_url", "image_url": {"url": file_path}} else: with open(file_path, "rb") as img_file: image_data = base64.b64encode(img_file.read()).decode("utf-8") ext = file_path.lower().split('.')[-1] media_type = { "png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg", "gif": "image/gif", "webp": "image/webp", }.get(ext, "image/png") image_content = { "type": "image_url", "image_url": {"url": f"data:{media_type};base64,{image_data}"}, } response = client.chat.completions.create( model="gpt-4o", messages=[ { "role": "user", "content": [ {"type": "text", "text": question}, image_content, ], } ], max_tokens=800, ) return response.choices[0].message.content except Exception as e: return f"Image analysis error: {str(e)}" @tool def image_manipulate( file_path: str, operation: str, params: str = "{}" ) -> str: """Manipulate an image file using PIL/Pillow. Operations available: - crop: Crop image. Params: {"box": [left, top, right, bottom]} - rotate: Rotate image. Params: {"angle": 90} (degrees, counterclockwise) - resize: Resize image. Params: {"width": 800, "height": 600} or {"scale": 0.5} - flip: Flip image. Params: {"direction": "horizontal"} or {"direction": "vertical"} - grayscale: Convert to grayscale. No params needed. - brightness: Adjust brightness. Params: {"factor": 1.5} (1.0 = original) - contrast: Adjust contrast. Params: {"factor": 1.5} (1.0 = original) - sharpen: Sharpen image. Params: {"factor": 2.0} (1.0 = original) - blur: Apply Gaussian blur. Params: {"radius": 2} - thumbnail: Create thumbnail. Params: {"size": [128, 128]} Args: file_path: Path to the image file operation: One of the operations listed above params: JSON string with operation parameters """ try: from PIL import Image, ImageEnhance, ImageFilter # Parse parameters try: p = json.loads(params) if params else {} except json.JSONDecodeError: return f"Error parsing params: {params}. Use JSON format like {{\"angle\": 90}}" # Open the image img = Image.open(file_path) original_format = img.format or "PNG" operation = operation.lower().strip() if operation == "crop": if "box" not in p: return "Error: crop requires 'box' param: {\"box\": [left, top, right, bottom]}" box = tuple(p["box"]) img = img.crop(box) elif operation == "rotate": angle = p.get("angle", 90) expand = p.get("expand", True) img = img.rotate(angle, expand=expand) elif operation == "resize": if "scale" in p: new_width = int(img.width * p["scale"]) new_height = int(img.height * p["scale"]) elif "width" in p and "height" in p: new_width = p["width"] new_height = p["height"] elif "width" in p: new_width = p["width"] new_height = int(img.height * (p["width"] / img.width)) elif "height" in p: new_height = p["height"] new_width = int(img.width * (p["height"] / img.height)) else: return "Error: resize requires 'width'/'height' or 'scale' param" img = img.resize((new_width, new_height), Image.Resampling.LANCZOS) elif operation == "flip": direction = p.get("direction", "horizontal") if direction == "horizontal": img = img.transpose(Image.Transpose.FLIP_LEFT_RIGHT) elif direction == "vertical": img = img.transpose(Image.Transpose.FLIP_TOP_BOTTOM) else: return "Error: flip direction must be 'horizontal' or 'vertical'" elif operation == "grayscale": img = img.convert("L") elif operation == "brightness": factor = p.get("factor", 1.0) enhancer = ImageEnhance.Brightness(img) img = enhancer.enhance(factor) elif operation == "contrast": factor = p.get("factor", 1.0) enhancer = ImageEnhance.Contrast(img) img = enhancer.enhance(factor) elif operation == "sharpen": factor = p.get("factor", 2.0) enhancer = ImageEnhance.Sharpness(img) img = enhancer.enhance(factor) elif operation == "blur": radius = p.get("radius", 2) img = img.filter(ImageFilter.GaussianBlur(radius=radius)) elif operation == "thumbnail": size = tuple(p.get("size", [128, 128])) img.thumbnail(size, Image.Resampling.LANCZOS) else: return f"Unknown operation: {operation}. Available: crop, rotate, resize, flip, grayscale, brightness, contrast, sharpen, blur, thumbnail" # Save to temp file ext = file_path.lower().split('.')[-1] if ext not in ['jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp']: ext = 'png' output_path = os.path.join(tempfile.gettempdir(), f"manipulated_{os.path.basename(file_path)}") # Handle mode conversion for JPEG if ext in ['jpg', 'jpeg'] and img.mode in ['RGBA', 'LA', 'P']: img = img.convert('RGB') img.save(output_path, format=original_format if original_format else None) return f"Image manipulated successfully.\nOperation: {operation}\nOriginal size: {Image.open(file_path).size}\nNew size: {img.size}\nSaved to: {output_path}" except ImportError: return "Error: Pillow is not installed. Please install it with: pip install Pillow" except Exception as e: return f"Image manipulation error: {str(e)}" @tool def image_annotate( file_path: str, annotations: str ) -> str: """Add annotations (text, rectangles, circles, lines) to an image. Annotations format (JSON array): [ {"type": "text", "text": "Label", "position": [x, y], "color": "red", "size": 20}, {"type": "rectangle", "box": [x1, y1, x2, y2], "color": "blue", "width": 2}, {"type": "circle", "center": [x, y], "radius": 50, "color": "green", "width": 2}, {"type": "line", "start": [x1, y1], "end": [x2, y2], "color": "yellow", "width": 2}, {"type": "arrow", "start": [x1, y1], "end": [x2, y2], "color": "red", "width": 2} ] Colors can be: "red", "green", "blue", "yellow", "white", "black", "orange", "purple", or RGB tuple like [255, 0, 0] Args: file_path: Path to the image file annotations: JSON string with list of annotations """ try: from PIL import Image, ImageDraw, ImageFont import math # Parse annotations try: annots = json.loads(annotations) except json.JSONDecodeError: return f"Error parsing annotations: {annotations}. Use JSON array format." if not isinstance(annots, list): annots = [annots] # Open the image img = Image.open(file_path) if img.mode != 'RGBA': img = img.convert('RGBA') draw = ImageDraw.Draw(img) # Color mapping color_map = { "red": (255, 0, 0), "green": (0, 255, 0), "blue": (0, 0, 255), "yellow": (255, 255, 0), "white": (255, 255, 255), "black": (0, 0, 0), "orange": (255, 165, 0), "purple": (128, 0, 128), "cyan": (0, 255, 255), "magenta": (255, 0, 255), } def get_color(c): if isinstance(c, str): return color_map.get(c.lower(), (255, 0, 0)) elif isinstance(c, list): return tuple(c) return (255, 0, 0) # Try to load a font, fall back to default def get_font(size): try: # Try common font paths font_paths = [ "arial.ttf", "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", "/System/Library/Fonts/Helvetica.ttc", "C:/Windows/Fonts/arial.ttf", ] for fp in font_paths: try: return ImageFont.truetype(fp, size) except: continue return ImageFont.load_default() except: return ImageFont.load_default() # Process each annotation for annot in annots: atype = annot.get("type", "").lower() color = get_color(annot.get("color", "red")) width = annot.get("width", 2) if atype == "text": text = annot.get("text", "") position = tuple(annot.get("position", [10, 10])) size = annot.get("size", 20) font = get_font(size) draw.text(position, text, fill=color, font=font) elif atype == "rectangle": box = annot.get("box", [0, 0, 100, 100]) fill = annot.get("fill") fill_color = get_color(fill) if fill else None draw.rectangle(box, outline=color, width=width, fill=fill_color) elif atype == "circle": center = annot.get("center", [50, 50]) radius = annot.get("radius", 25) box = [center[0] - radius, center[1] - radius, center[0] + radius, center[1] + radius] fill = annot.get("fill") fill_color = get_color(fill) if fill else None draw.ellipse(box, outline=color, width=width, fill=fill_color) elif atype == "line": start = tuple(annot.get("start", [0, 0])) end = tuple(annot.get("end", [100, 100])) draw.line([start, end], fill=color, width=width) elif atype == "arrow": start = annot.get("start", [0, 0]) end = annot.get("end", [100, 100]) draw.line([tuple(start), tuple(end)], fill=color, width=width) # Draw arrowhead angle = math.atan2(end[1] - start[1], end[0] - start[0]) arrow_length = 15 arrow_angle = math.pi / 6 # 30 degrees p1 = ( end[0] - arrow_length * math.cos(angle - arrow_angle), end[1] - arrow_length * math.sin(angle - arrow_angle) ) p2 = ( end[0] - arrow_length * math.cos(angle + arrow_angle), end[1] - arrow_length * math.sin(angle + arrow_angle) ) draw.polygon([tuple(end), p1, p2], fill=color) # Save to temp file output_path = os.path.join(tempfile.gettempdir(), f"annotated_{os.path.basename(file_path)}") # Convert back to RGB if saving as JPEG ext = file_path.lower().split('.')[-1] if ext in ['jpg', 'jpeg']: img = img.convert('RGB') img.save(output_path) return f"Image annotated successfully.\nAnnotations added: {len(annots)}\nSaved to: {output_path}" except ImportError: return "Error: Pillow is not installed. Please install it with: pip install Pillow" except Exception as e: return f"Image annotation error: {str(e)}" @tool def image_ocr(file_path: str, lang: str = "eng") -> str: """Extract text from an image using OCR (Optical Character Recognition). Uses Tesseract OCR engine. Requires tesseract to be installed on the system. Args: file_path: Path to the image file lang: Language code for OCR (default: "eng" for English). Common codes: eng, fra, deu, spa, ita, por, chi_sim, chi_tra, jpn, kor """ try: import pytesseract from PIL import Image # Open and preprocess image img = Image.open(file_path) # Convert to RGB if necessary if img.mode not in ['RGB', 'L']: img = img.convert('RGB') # Extract text text = pytesseract.image_to_string(img, lang=lang) # Also get structured data with confidence try: data = pytesseract.image_to_data(img, lang=lang, output_type=pytesseract.Output.DICT) # Calculate average confidence for detected words confidences = [int(c) for c in data['conf'] if int(c) > 0] avg_confidence = sum(confidences) / len(confidences) if confidences else 0 word_count = len([w for w in data['text'] if w.strip()]) return f"OCR Result:\n{'-'*40}\n{text.strip()}\n{'-'*40}\nWords detected: {word_count}\nAverage confidence: {avg_confidence:.1f}%" except: return f"OCR Result:\n{'-'*40}\n{text.strip()}\n{'-'*40}" except ImportError as e: if "pytesseract" in str(e): return "Error: pytesseract is not installed. Please install it with: pip install pytesseract\nAlso ensure Tesseract OCR is installed on your system." return f"Import error: {str(e)}" except Exception as e: error_msg = str(e) if "tesseract" in error_msg.lower(): return f"Tesseract OCR error: {error_msg}\n\nMake sure Tesseract is installed:\n- Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n- Mac: brew install tesseract\n- Linux: sudo apt install tesseract-ocr" return f"OCR error: {error_msg}"