File size: 5,055 Bytes
2948ced
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""LLM provider wrappers (OpenAI + Gemini 3) with a unified analyze interface."""

from __future__ import annotations

import base64
import json
import logging
from typing import List, Sequence

from openai import OpenAI
from google import genai
from google.genai import types as genai_types
from google.genai import errors as genai_errors

from settings import Settings

LOGGER = logging.getLogger("llm")

# Model identifiers exposed to the UI
OPENAI_GPT5 = "gpt-5"
OPENAI_GPT5_MINI = "gpt-5-mini"
# Gemini 3 multimodal text-out model (supports image+text input, text output)
GEMINI_3_VISION = "gemini-3-pro-preview"


class LLMError(RuntimeError):
    """Raised when an LLM provider call cannot be made or fails."""


def _encode_image_to_data_url(image_bytes: bytes, mime: str = "image/png") -> str:
    b64 = base64.b64encode(image_bytes).decode("utf-8")
    return f"data:{mime};base64,{b64}"


def _collect_openai_messages(system_prompt: str, user_prompt: str, images: Sequence[bytes]):
    system = {"role": "system", "content": [{"type": "input_text", "text": system_prompt}]}
    user_content = [{"type": "input_text", "text": user_prompt}]
    for img in images:
        user_content.append({"type": "input_image", "image_url": _encode_image_to_data_url(img)})
    user = {"role": "user", "content": user_content}
    return [system, user]


def run_openai(
    images: Sequence[bytes],
    system_prompt: str,
    user_prompt: str,
    model: str,
    settings: Settings,
) -> str:
    """Send the prompts plus images to an OpenAI Responses-API model and return its text.

    Raises:
        LLMError: when no OpenAI API key is configured.
    """
    if not settings.openai_api_key:
        raise LLMError("OPENAI_API_KEY is missing")

    # Optional reasoning-effort knob, forwarded only when configured.
    extra_kwargs = (
        {"reasoning": {"effort": settings.openai_reasoning_effort}}
        if settings.openai_reasoning_effort
        else {}
    )

    LOGGER.info(
        "Calling OpenAI model=%s reasoning=%s images=%s total_bytes=%s",
        model,
        settings.openai_reasoning_effort,
        len(images),
        sum(len(i) for i in images),
    )
    client = OpenAI(api_key=settings.openai_api_key)
    resp = client.responses.create(
        model=model,
        input=_collect_openai_messages(system_prompt, user_prompt, images),
        **extra_kwargs,
    )
    # Prefer the SDK's convenience accessor; fall back to the raw repr.
    text = getattr(resp, "output_text", None) or str(resp)
    LOGGER.info("OpenAI response (truncated 500 chars): %s", text[:500])
    return text


def run_gemini(
    images: Sequence[bytes],
    system_prompt: str,
    user_prompt: str,
    model: str,
    settings: Settings,
) -> str:
    """Send the prompts plus PNG images to a Gemini model and return its text output.

    Two auth modes:
    - Vertex (preferred when GOOGLE_GENAI_USE_VERTEXAI=True): uses ADC / gcloud auth
    - API key (Studio): uses GEMINI_API_KEY

    Raises:
        LLMError: when no credentials are available or the request fails.
    """
    if settings.google_genai_use_vertexai:
        client = genai.Client(
            vertexai=True,
            project=settings.google_cloud_project,
            location=settings.google_cloud_location or "us-central1",
        )
    else:
        if not settings.gemini_api_key:
            raise LLMError("GEMINI_API_KEY is missing and vertex mode is disabled")
        client = genai.Client(api_key=settings.gemini_api_key)

    # Content order: system text first, then the images, then the user prompt.
    parts: List[genai_types.Part | str] = [system_prompt]
    for img in images:
        parts.append(genai_types.Part.from_bytes(data=img, mime_type="image/png"))
    parts.append(user_prompt)

    LOGGER.info(
        "Calling Gemini model=%s vertex=%s images=%s total_bytes=%s",
        model,
        settings.google_genai_use_vertexai,
        len(images),
        sum(len(i) for i in images),
    )
    try:
        response = client.models.generate_content(
            model=model,
            contents=parts,
            config=genai_types.GenerateContentConfig(response_modalities=["text"]),
        )
    except genai_errors.ClientError as exc:
        # Provide clearer guidance for common auth/model issues.
        raise LLMError(
            "Gemini request failed. "
            "If using Vertex, ensure the model exists in your project/location and ADC is active (`gcloud auth application-default login`). "
            "If using Studio/API key (e.g., on HuggingFace), set GOOGLE_GENAI_USE_VERTEXAI=false and provide GEMINI_API_KEY. "
            f"Details: {exc}"
        ) from exc

    # Prefer `.text`; fall back to concatenated text parts, then to the raw repr.
    # (The previous version unconditionally overwrote `.text` with the parts join
    # whenever any part carried text, and relied on a fragile `locals()` check.)
    text = getattr(response, "text", None)
    if not text:
        text_parts = [
            p.text
            for p in (getattr(response, "parts", None) or [])
            if getattr(p, "text", None)
        ]
        text = "\n".join(text_parts) if text_parts else str(response)

    LOGGER.info("Gemini response (truncated 500 chars): %s", text[:500])
    return text


def analyze(
    images: Sequence[bytes],
    system_prompt: str,
    user_prompt: str,
    model_choice: str,
    settings: Settings,
) -> str:
    """Dispatch to the correct provider based on model_choice."""
    # The two namespaces are disjoint: gemini-* models never collide with the
    # OpenAI identifiers, so the check order does not affect the outcome.
    if model_choice.startswith("gemini"):
        return run_gemini(images, system_prompt, user_prompt, model_choice, settings)
    if model_choice in (OPENAI_GPT5, OPENAI_GPT5_MINI):
        return run_openai(images, system_prompt, user_prompt, model_choice, settings)
    raise LLMError(f"Unsupported model choice: {model_choice}")