File size: 3,797 Bytes

5374a2d

from typing import Dict, Optional, List
from ...tool import Tool
from ...storage_handler import FileStorageHandler, LocalStorageHandler
from .openai_utils import create_openai_client


class OpenAIImageAnalysisTool(Tool):
    """Ask an OpenAI model a question about a single image.

    The image is supplied either as an HTTP(S) URL or as a path that is read
    through the configured storage handler and inlined as a base64 data URL.
    Calls never raise: the result is ``{"content": <text>}`` on success or
    ``{"error": <message>}`` on any failure.
    """

    name: str = "openai_image_analysis"
    description: str = "Simple image analysis via OpenAI Responses API (input_text + input_image)."

    inputs: Dict[str, Dict[str, str]] = {
        "prompt": {"type": "string", "description": "User question/instruction. Required."},
        "image_url": {"type": "string", "description": "HTTP(S) image URL. Optional if image_path provided."},
        "image_path": {"type": "string", "description": "Local image path; converted to data URL internally."},
        "model": {"type": "string", "description": "OpenAI model for responses.create (e.g., gpt-4o-mini, gpt-4.1, gpt-5). Optional."},
    }
    required: Optional[List[str]] = ["prompt"]

    def __init__(self, api_key: str, organization_id: Optional[str] = None, model: str = "gpt-4o-mini",
                 storage_handler: Optional[FileStorageHandler] = None):
        """Store credentials, the default model and the storage backend.

        Args:
            api_key: OpenAI API key passed to ``create_openai_client``.
            organization_id: Optional organization id passed alongside the key.
            model: Model used when a call does not specify one.
            storage_handler: Backend used to read ``image_path`` inputs; a
                ``LocalStorageHandler`` is created when omitted.
        """
        super().__init__()
        self.api_key = api_key
        self.organization_id = organization_id
        self.model = model
        self.storage_handler = storage_handler or LocalStorageHandler()

    def __call__(
        self,
        prompt: str,
        image_url: Optional[str] = None,
        image_path: Optional[str] = None,
        model: Optional[str] = None,
    ) -> Dict[str, str]:
        """Send ``prompt`` plus one image to the model and return its answer.

        Args:
            prompt: Question or instruction about the image.
            image_url: Image URL; takes precedence over ``image_path``.
            image_path: Path read via the storage handler when no URL is given.
            model: Per-call model override; falls back to ``self.model``.

        Returns:
            ``{"content": text}`` on success, otherwise ``{"error": message}``.
        """
        try:
            # Resolve image source: prefer URL, else local path to data URL
            final_image_url = image_url
            if not final_image_url and image_path:
                try:
                    final_image_url = self._image_path_to_data_url(image_path)
                except Exception as e:
                    return {"error": f"Could not read image {image_path}: {str(e)}"}

            # Fail fast with a clear message instead of sending image_url=None
            # to the API and surfacing an opaque remote validation error.
            if not final_image_url:
                return {"error": "No image provided: pass either image_url or image_path."}

            client = create_openai_client(self.api_key, self.organization_id)
            actual_model = model if model else self.model

            response = client.responses.create(
                model=actual_model,
                input=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "input_text", "text": prompt},
                            {"type": "input_image", "image_url": final_image_url},
                        ],
                    }
                ],
            )

            return {"content": self._extract_text(response) or ""}
        except Exception as e:
            return {"error": f"OpenAI image analysis failed: {e}"}

    def _image_path_to_data_url(self, image_path: str) -> str:
        """Read ``image_path`` via the storage handler and return a data URL."""
        import base64
        import mimetypes

        mime, _ = mimetypes.guess_type(image_path)
        mime = mime or "image/png"  # default when the extension is unknown

        # Read raw bytes directly rather than through the handler's high-level
        # read(), which processes images instead of returning their bytes.
        system_path = self.storage_handler.translate_in(image_path)
        content = self.storage_handler._read_raw(system_path)

        b64 = base64.b64encode(content).decode("utf-8")
        return f"data:{mime};base64,{b64}"

    @staticmethod
    def _get_field(obj, key, default=None):
        """Read ``key`` from a dict or an attribute-style object."""
        if isinstance(obj, dict):
            return obj.get(key, default)
        return getattr(obj, key, default)

    @classmethod
    def _extract_text(cls, response) -> str:
        """Pull the answer text out of a response, tolerating shape variations."""
        # Prefer unified output_text when present
        text = cls._get_field(response, "output_text")
        if text is not None:
            return text

        # Responses-style shape: output[i].content[j] with type "output_text".
        output = cls._get_field(response, "output")
        if isinstance(output, (list, tuple)) and output:
            pieces = []
            for item in output:
                content = cls._get_field(item, "content")
                if not isinstance(content, (list, tuple)):
                    continue
                for part in content:
                    part_text = cls._get_field(part, "text")
                    if cls._get_field(part, "type") == "output_text" and isinstance(part_text, str):
                        pieces.append(part_text)
            return "".join(pieces)

        # Chat-style shape: choices[0].message.content (choices are alternatives,
        # so only the first one is used).
        choices = cls._get_field(response, "choices")
        if isinstance(choices, (list, tuple)) and choices:
            message = cls._get_field(choices[0], "message")
            content = cls._get_field(message, "content")
            if isinstance(content, str):
                return content

        return ""