Spaces:
Sleeping
Sleeping
| """ | |
| Image processing tools for the GAIA Agent. | |
| Includes image analysis (GPT-4o vision), manipulation, annotation, and OCR. | |
| """ | |
| import os | |
| import json | |
| import tempfile | |
| import base64 | |
| from typing import Optional | |
| from langchain_core.tools import tool | |
| import openai | |
| from dotenv import load_dotenv | |
# Load environment variables via python-dotenv before the client below reads them
# (presumably from a local .env file -- confirm against deployment setup).
load_dotenv()
# Module-level OpenAI client used by image_analyze; the API key is read from
# the OPENAI_API_KEY environment variable at import time.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def image_analyze(file_path: str, question: str) -> str:
    """Analyze an image (local path or URL) with GPT-4o vision.

    Use this to understand image contents, describe what's shown, read text,
    analyze diagrams, identify objects, or answer questions about images.

    Args:
        file_path: Path to the image file OR an http/https URL.
        question: What you want to know about the image.

    Returns:
        The model's textual answer. On any failure (missing file, API error,
        empty model reply) a string starting with "Image analysis error:" is
        returned instead; this function does not raise.
    """
    # Extension -> MIME type used for the data: URL of local files.
    media_types = {
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "gif": "image/gif",
        "webp": "image/webp",
    }
    try:
        if file_path.lower().startswith(("http://", "https://")):
            # Remote images are passed through by URL, unchanged.
            image_url = file_path
        else:
            with open(file_path, "rb") as img_file:
                encoded = base64.b64encode(img_file.read()).decode("utf-8")
            # splitext yields "" for extension-less paths, which falls back
            # to PNG just like any other unknown extension.
            ext = os.path.splitext(file_path)[1].lower().lstrip(".")
            media_type = media_types.get(ext, "image/png")
            image_url = f"data:{media_type};base64,{encoded}"

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        {"type": "image_url", "image_url": {"url": image_url}},
                    ],
                }
            ],
            max_tokens=800,
        )
        answer = response.choices[0].message.content
        # The SDK types `content` as optional; never hand None back to a
        # caller that expects a string.
        if answer is None:
            return "Image analysis error: model returned no text content"
        return answer
    except Exception as e:
        return f"Image analysis error: {str(e)}"
def image_manipulate(
    file_path: str,
    operation: str,
    params: str = "{}"
) -> str:
    """Manipulate an image file using PIL/Pillow and save the result to a temp file.

    Operations available:
    - crop: Crop image. Params: {"box": [left, top, right, bottom]}
    - rotate: Rotate image. Params: {"angle": 90} (degrees, counterclockwise)
    - resize: Resize image. Params: {"width": 800, "height": 600} or {"scale": 0.5}
    - flip: Flip image. Params: {"direction": "horizontal"} or {"direction": "vertical"}
    - grayscale: Convert to grayscale. No params needed.
    - brightness: Adjust brightness. Params: {"factor": 1.5} (1.0 = original)
    - contrast: Adjust contrast. Params: {"factor": 1.5} (1.0 = original)
    - sharpen: Sharpen image. Params: {"factor": 2.0} (1.0 = original)
    - blur: Apply Gaussian blur. Params: {"radius": 2}
    - thumbnail: Create thumbnail. Params: {"size": [128, 128]}

    Args:
        file_path: Path to the image file.
        operation: One of the operations listed above (case-insensitive).
        params: JSON string holding an object with the operation parameters.

    Returns:
        A status message containing the operation, the original and new
        sizes and the output path (``manipulated_<basename>`` in the system
        temp directory), or an error message. This function does not raise.
    """
    supported = (
        "crop", "rotate", "resize", "flip", "grayscale",
        "brightness", "contrast", "sharpen", "blur", "thumbnail",
    )
    try:
        # Validate the request first so caller mistakes are reported precisely,
        # independent of whether Pillow or the file is available.
        try:
            p = json.loads(params) if params else {}
        except (json.JSONDecodeError, TypeError):
            return f"Error parsing params: {params}. Use JSON format like {{\"angle\": 90}}"
        if not isinstance(p, dict):
            return (
                "Error: params must be a JSON object like {\"angle\": 90}, "
                f"got {type(p).__name__}"
            )

        operation = operation.lower().strip()
        if operation not in supported:
            return f"Unknown operation: {operation}. Available: {', '.join(supported)}"

        from PIL import Image, ImageEnhance, ImageFilter

        # The context manager closes the source file handle; the size is
        # captured here so the file does not have to be reopened for the report.
        with Image.open(file_path) as source:
            original_format = source.format or "PNG"
            original_size = source.size
            img = source

            if operation == "crop":
                if "box" not in p:
                    return "Error: crop requires 'box' param: {\"box\": [left, top, right, bottom]}"
                img = img.crop(tuple(p["box"]))
            elif operation == "rotate":
                img = img.rotate(p.get("angle", 90), expand=p.get("expand", True))
            elif operation == "resize":
                if "scale" in p:
                    new_size = (int(img.width * p["scale"]), int(img.height * p["scale"]))
                elif "width" in p and "height" in p:
                    new_size = (int(p["width"]), int(p["height"]))
                elif "width" in p:
                    # Only width given: keep the aspect ratio.
                    new_size = (int(p["width"]), int(img.height * (p["width"] / img.width)))
                elif "height" in p:
                    # Only height given: keep the aspect ratio.
                    new_size = (int(img.width * (p["height"] / img.height)), int(p["height"]))
                else:
                    return "Error: resize requires 'width'/'height' or 'scale' param"
                img = img.resize(new_size, Image.Resampling.LANCZOS)
            elif operation == "flip":
                direction = p.get("direction", "horizontal")
                if direction == "horizontal":
                    img = img.transpose(Image.Transpose.FLIP_LEFT_RIGHT)
                elif direction == "vertical":
                    img = img.transpose(Image.Transpose.FLIP_TOP_BOTTOM)
                else:
                    return "Error: flip direction must be 'horizontal' or 'vertical'"
            elif operation == "grayscale":
                img = img.convert("L")
            elif operation == "brightness":
                img = ImageEnhance.Brightness(img).enhance(p.get("factor", 1.0))
            elif operation == "contrast":
                img = ImageEnhance.Contrast(img).enhance(p.get("factor", 1.0))
            elif operation == "sharpen":
                img = ImageEnhance.Sharpness(img).enhance(p.get("factor", 2.0))
            elif operation == "blur":
                img = img.filter(ImageFilter.GaussianBlur(radius=p.get("radius", 2)))
            elif operation == "thumbnail":
                # thumbnail() resizes in place and preserves the aspect ratio.
                img.thumbnail(tuple(p.get("size", [128, 128])), Image.Resampling.LANCZOS)

            output_path = os.path.join(
                tempfile.gettempdir(), f"manipulated_{os.path.basename(file_path)}"
            )

            # The file is written in the detected format, so decide on the RGB
            # flattening from that format rather than from the file extension;
            # otherwise a non-JPEG file named *.jpg would lose its alpha channel.
            if original_format == "JPEG" and img.mode in ("RGBA", "LA", "P"):
                img = img.convert("RGB")

            img.save(output_path, format=original_format)
            return f"Image manipulated successfully.\nOperation: {operation}\nOriginal size: {original_size}\nNew size: {img.size}\nSaved to: {output_path}"
    except ImportError:
        return "Error: Pillow is not installed. Please install it with: pip install Pillow"
    except Exception as e:
        return f"Image manipulation error: {str(e)}"
def image_annotate(
    file_path: str,
    annotations: str
) -> str:
    """Add annotations (text, rectangles, circles, lines, arrows) to an image.

    Annotations format (JSON array):
    [
        {"type": "text", "text": "Label", "position": [x, y], "color": "red", "size": 20},
        {"type": "rectangle", "box": [x1, y1, x2, y2], "color": "blue", "width": 2},
        {"type": "circle", "center": [x, y], "radius": 50, "color": "green", "width": 2},
        {"type": "line", "start": [x1, y1], "end": [x2, y2], "color": "yellow", "width": 2},
        {"type": "arrow", "start": [x1, y1], "end": [x2, y2], "color": "red", "width": 2}
    ]
    Rectangles and circles also accept an optional "fill" color.
    Colors can be: "red", "green", "blue", "yellow", "white", "black", "orange",
    "purple", "cyan", "magenta", or an RGB list like [255, 0, 0]. Unknown color
    names fall back to red.

    Args:
        file_path: Path to the image file.
        annotations: JSON string with a list of annotations (a single object
            is treated as a one-element list).

    Returns:
        A status message with the number of annotations drawn and the output
        path (``annotated_<basename>`` in the system temp directory), or an
        error message. This function does not raise.
    """
    import math

    color_map = {
        "red": (255, 0, 0),
        "green": (0, 255, 0),
        "blue": (0, 0, 255),
        "yellow": (255, 255, 0),
        "white": (255, 255, 255),
        "black": (0, 0, 0),
        "orange": (255, 165, 0),
        "purple": (128, 0, 128),
        "cyan": (0, 255, 255),
        "magenta": (255, 0, 255),
    }
    # Candidate fonts, tried in order.
    font_paths = (
        "arial.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/System/Library/Fonts/Helvetica.ttc",
        "C:/Windows/Fonts/arial.ttf",
    )

    def get_color(c):
        """Resolve a color name or RGB list to a tuple; default to red."""
        if isinstance(c, str):
            return color_map.get(c.lower(), (255, 0, 0))
        if isinstance(c, (list, tuple)):
            return tuple(c)
        return (255, 0, 0)

    try:
        # Validate the request before touching Pillow or the file system.
        try:
            annots = json.loads(annotations)
        except (json.JSONDecodeError, TypeError):
            return f"Error parsing annotations: {annotations}. Use JSON array format."
        if not isinstance(annots, list):
            annots = [annots]
        if not all(isinstance(a, dict) for a in annots):
            return "Error: each annotation must be a JSON object with a 'type' field."

        from PIL import Image, ImageDraw, ImageFont

        def get_font(size):
            """Return the first loadable TrueType font, else Pillow's default."""
            for fp in font_paths:
                try:
                    return ImageFont.truetype(fp, size)
                except Exception:
                    # Best effort: try the next candidate. `Exception` (not a
                    # bare except) keeps KeyboardInterrupt/SystemExit intact.
                    continue
            return ImageFont.load_default()

        drawn = 0
        skipped = 0
        # The context manager closes the source file handle when done.
        with Image.open(file_path) as source:
            img = source if source.mode == "RGBA" else source.convert("RGBA")
            draw = ImageDraw.Draw(img)

            for annot in annots:
                atype = str(annot.get("type", "")).lower()
                color = get_color(annot.get("color", "red"))
                width = annot.get("width", 2)

                if atype == "text":
                    position = tuple(annot.get("position", [10, 10]))
                    font = get_font(annot.get("size", 20))
                    draw.text(position, annot.get("text", ""), fill=color, font=font)
                elif atype == "rectangle":
                    fill = annot.get("fill")
                    draw.rectangle(
                        annot.get("box", [0, 0, 100, 100]),
                        outline=color,
                        width=width,
                        fill=get_color(fill) if fill else None,
                    )
                elif atype == "circle":
                    cx, cy = annot.get("center", [50, 50])
                    radius = annot.get("radius", 25)
                    fill = annot.get("fill")
                    draw.ellipse(
                        [cx - radius, cy - radius, cx + radius, cy + radius],
                        outline=color,
                        width=width,
                        fill=get_color(fill) if fill else None,
                    )
                elif atype == "line":
                    start = tuple(annot.get("start", [0, 0]))
                    end = tuple(annot.get("end", [100, 100]))
                    draw.line([start, end], fill=color, width=width)
                elif atype == "arrow":
                    start = annot.get("start", [0, 0])
                    end = annot.get("end", [100, 100])
                    draw.line([tuple(start), tuple(end)], fill=color, width=width)
                    # Arrowhead: a filled triangle at `end`, its two back
                    # corners 30 degrees either side of the shaft direction.
                    angle = math.atan2(end[1] - start[1], end[0] - start[0])
                    arrow_length = 15
                    arrow_angle = math.pi / 6
                    head = [tuple(end)]
                    for side in (angle - arrow_angle, angle + arrow_angle):
                        head.append((
                            end[0] - arrow_length * math.cos(side),
                            end[1] - arrow_length * math.sin(side),
                        ))
                    draw.polygon(head, fill=color)
                else:
                    # Unknown type: nothing is drawn, so do not count it.
                    skipped += 1
                    continue
                drawn += 1

            output_path = os.path.join(
                tempfile.gettempdir(), f"annotated_{os.path.basename(file_path)}"
            )
            ext = os.path.splitext(output_path)[1].lower()
            if ext in (".jpg", ".jpeg"):
                # JPEG cannot store an alpha channel.
                img = img.convert("RGB")
            elif ext not in Image.registered_extensions():
                # save() infers the format from the extension and fails on a
                # missing/unknown one; write a PNG in that case.
                output_path += ".png"
            img.save(output_path)

        summary = f"Image annotated successfully.\nAnnotations added: {drawn}\nSaved to: {output_path}"
        if skipped:
            summary += f"\nSkipped (unknown type): {skipped}"
        return summary
    except ImportError:
        return "Error: Pillow is not installed. Please install it with: pip install Pillow"
    except Exception as e:
        return f"Image annotation error: {str(e)}"
def image_ocr(file_path: str, lang: str = "eng") -> str:
    """Extract text from an image using OCR (Optical Character Recognition).

    Uses the Tesseract OCR engine through pytesseract. Requires tesseract to
    be installed on the system.

    Args:
        file_path: Path to the image file.
        lang: Language code for OCR (default: "eng" for English).
            Common codes: eng, fra, deu, spa, ita, por, chi_sim, chi_tra, jpn, kor

    Returns:
        The recognized text framed by separator lines, followed by the word
        count and average confidence when those statistics can be computed;
        otherwise an error/installation-hint message. This function does not
        raise.
    """
    try:
        import pytesseract
        from PIL import Image

        # The context manager closes the source file handle when done.
        with Image.open(file_path) as source:
            # Normalize any mode other than RGB or grayscale to RGB.
            img = source if source.mode in ("RGB", "L") else source.convert("RGB")

            text = pytesseract.image_to_string(img, lang=lang)
            plain_result = f"OCR Result:\n{'-'*40}\n{text.strip()}\n{'-'*40}"

            # Statistics are best-effort: if they cannot be computed the
            # plain text result is still returned.
            try:
                data = pytesseract.image_to_data(
                    img, lang=lang, output_type=pytesseract.Output.DICT
                )
                # Parse via float(): some pytesseract/Tesseract combinations
                # appear to report confidence as decimal strings, which int()
                # rejects. Non-positive values are left out of the average,
                # as before.
                confidences = [
                    value
                    for value in (float(c) for c in data["conf"])
                    if value > 0
                ]
                avg_confidence = sum(confidences) / len(confidences) if confidences else 0
                word_count = sum(1 for w in data["text"] if str(w).strip())
            except Exception:
                return plain_result

            return f"{plain_result}\nWords detected: {word_count}\nAverage confidence: {avg_confidence:.1f}%"
    except ImportError as e:
        if "pytesseract" in str(e):
            return "Error: pytesseract is not installed. Please install it with: pip install pytesseract\nAlso ensure Tesseract OCR is installed on your system."
        return f"Import error: {str(e)}"
    except Exception as e:
        error_msg = str(e)
        if "tesseract" in error_msg.lower():
            return f"Tesseract OCR error: {error_msg}\n\nMake sure Tesseract is installed:\n- Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki\n- Mac: brew install tesseract\n- Linux: sudo apt install tesseract-ocr"
        return f"OCR error: {error_msg}"