File size: 5,055 Bytes
2948ced
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""LLM provider wrappers (OpenAI + Gemini 3) with a unified analyze interface."""

from __future__ import annotations

import base64
import json
import logging
from typing import List, Sequence

from openai import OpenAI
from google import genai
from google.genai import types as genai_types
from google.genai import errors as genai_errors

from settings import Settings

LOGGER = logging.getLogger("llm")

# Model identifiers exposed to the UI
OPENAI_GPT5 = "gpt-5"
OPENAI_GPT5_MINI = "gpt-5-mini"
# Gemini 3 multimodal text-out model (supports image+text input, text output)
GEMINI_3_VISION = "gemini-3-pro-preview"


class LLMError(RuntimeError):
    """Raised when an LLM provider call cannot be made or fails."""


def _encode_image_to_data_url(image_bytes: bytes, mime: str = "image/png") -> str:
    b64 = base64.b64encode(image_bytes).decode("utf-8")
    return f"data:{mime};base64,{b64}"


def _collect_openai_messages(system_prompt: str, user_prompt: str, images: Sequence[bytes]):
    system = {"role": "system", "content": [{"type": "input_text", "text": system_prompt}]}
    user_content = [{"type": "input_text", "text": user_prompt}]
    for img in images:
        user_content.append({"type": "input_image", "image_url": _encode_image_to_data_url(img)})
    user = {"role": "user", "content": user_content}
    return [system, user]


def run_openai(
    images: Sequence[bytes],
    system_prompt: str,
    user_prompt: str,
    model: str,
    settings: Settings,
) -> str:
    """Send the prompts plus images to an OpenAI Responses-API model and return its text.

    Raises:
        LLMError: when no OpenAI API key is configured.
    """
    if not settings.openai_api_key:
        raise LLMError("OPENAI_API_KEY is missing")

    # Optional reasoning-effort knob, forwarded only when configured.
    extra_kwargs = (
        {"reasoning": {"effort": settings.openai_reasoning_effort}}
        if settings.openai_reasoning_effort
        else {}
    )

    LOGGER.info(
        "Calling OpenAI model=%s reasoning=%s images=%s total_bytes=%s",
        model,
        settings.openai_reasoning_effort,
        len(images),
        sum(len(i) for i in images),
    )
    client = OpenAI(api_key=settings.openai_api_key)
    resp = client.responses.create(
        model=model,
        input=_collect_openai_messages(system_prompt, user_prompt, images),
        **extra_kwargs,
    )
    # Prefer the SDK's convenience accessor; fall back to the raw repr.
    text = getattr(resp, "output_text", None) or str(resp)
    LOGGER.info("OpenAI response (truncated 500 chars): %s", text[:500])
    return text


def run_gemini(
    images: Sequence[bytes],
    system_prompt: str,
    user_prompt: str,
    model: str,
    settings: Settings,
) -> str:
    """Send the prompts plus PNG images to a Gemini model and return its text output.

    Two auth modes:
    - Vertex (preferred when GOOGLE_GENAI_USE_VERTEXAI=True): uses ADC / gcloud auth
    - API key (Studio): uses GEMINI_API_KEY

    Raises:
        LLMError: when no credentials are available or the request fails.
    """
    if settings.google_genai_use_vertexai:
        client = genai.Client(
            vertexai=True,
            project=settings.google_cloud_project,
            location=settings.google_cloud_location or "us-central1",
        )
    else:
        if not settings.gemini_api_key:
            raise LLMError("GEMINI_API_KEY is missing and vertex mode is disabled")
        client = genai.Client(api_key=settings.gemini_api_key)

    # Content order: system text first, then the images, then the user prompt.
    parts: List[genai_types.Part | str] = [system_prompt]
    for img in images:
        parts.append(genai_types.Part.from_bytes(data=img, mime_type="image/png"))
    parts.append(user_prompt)

    LOGGER.info(
        "Calling Gemini model=%s vertex=%s images=%s total_bytes=%s",
        model,
        settings.google_genai_use_vertexai,
        len(images),
        sum(len(i) for i in images),
    )
    try:
        response = client.models.generate_content(
            model=model,
            contents=parts,
            config=genai_types.GenerateContentConfig(response_modalities=["text"]),
        )
    except genai_errors.ClientError as exc:
        # Provide clearer guidance for common auth/model issues.
        raise LLMError(
            "Gemini request failed. "
            "If using Vertex, ensure the model exists in your project/location and ADC is active (`gcloud auth application-default login`). "
            "If using Studio/API key (e.g., on HuggingFace), set GOOGLE_GENAI_USE_VERTEXAI=false and provide GEMINI_API_KEY. "
            f"Details: {exc}"
        ) from exc

    # Prefer `.text`; fall back to concatenated text parts, then to the raw repr.
    # (The previous version unconditionally overwrote `.text` with the parts join
    # whenever any part carried text, and relied on a fragile `locals()` check.)
    text = getattr(response, "text", None)
    if not text:
        text_parts = [
            p.text
            for p in (getattr(response, "parts", None) or [])
            if getattr(p, "text", None)
        ]
        text = "\n".join(text_parts) if text_parts else str(response)

    LOGGER.info("Gemini response (truncated 500 chars): %s", text[:500])
    return text


def analyze(
    images: Sequence[bytes],
    system_prompt: str,
    user_prompt: str,
    model_choice: str,
    settings: Settings,
) -> str:
    """Dispatch to the correct provider based on model_choice."""
    # The two namespaces are disjoint: gemini-* models never collide with the
    # OpenAI identifiers, so the check order does not affect the outcome.
    if model_choice.startswith("gemini"):
        return run_gemini(images, system_prompt, user_prompt, model_choice, settings)
    if model_choice in (OPENAI_GPT5, OPENAI_GPT5_MINI):
        return run_openai(images, system_prompt, user_prompt, model_choice, settings)
    raise LLMError(f"Unsupported model choice: {model_choice}")