"""LLM provider wrappers (OpenAI + Gemini 3) with a unified analyze interface."""
from __future__ import annotations
import base64
import json
import logging
from typing import List, Sequence
from openai import OpenAI
from google import genai
from google.genai import types as genai_types
from google.genai import errors as genai_errors
from settings import Settings
LOGGER = logging.getLogger("llm")
# Model identifiers exposed to the UI
OPENAI_GPT5 = "gpt-5"
OPENAI_GPT5_MINI = "gpt-5-mini"
# Gemini 3 multimodal text-out model (supports image+text input, text output)
GEMINI_3_VISION = "gemini-3-pro-preview"
class LLMError(RuntimeError):
    """Raised when an LLM provider call is misconfigured or fails."""
def _encode_image_to_data_url(image_bytes: bytes, mime: str = "image/png") -> str:
b64 = base64.b64encode(image_bytes).decode("utf-8")
return f"data:{mime};base64,{b64}"
def _collect_openai_messages(system_prompt: str, user_prompt: str, images: Sequence[bytes]):
    """Build the two-message (system + user) input list for the OpenAI Responses API.

    Images are inlined into the user message as base64 data URLs.
    """
    user_content = [{"type": "input_text", "text": user_prompt}] + [
        {"type": "input_image", "image_url": _encode_image_to_data_url(img)}
        for img in images
    ]
    return [
        {"role": "system", "content": [{"type": "input_text", "text": system_prompt}]},
        {"role": "user", "content": user_content},
    ]
def run_openai(
    images: Sequence[bytes],
    system_prompt: str,
    user_prompt: str,
    model: str,
    settings: Settings,
) -> str:
    """Send *images* plus prompts to an OpenAI Responses-API model and return its text.

    Raises:
        LLMError: if no OpenAI API key is configured.
    """
    if not settings.openai_api_key:
        raise LLMError("OPENAI_API_KEY is missing")

    client = OpenAI(api_key=settings.openai_api_key)
    payload = _collect_openai_messages(system_prompt, user_prompt, images)

    effort = settings.openai_reasoning_effort
    extra = {}
    if effort:
        # Only forward the reasoning knob when explicitly configured.
        extra["reasoning"] = {"effort": effort}

    LOGGER.info(
        "Calling OpenAI model=%s reasoning=%s images=%s total_bytes=%s",
        model,
        effort,
        len(images),
        sum(len(i) for i in images),
    )
    resp = client.responses.create(model=model, input=payload, **extra)
    # `output_text` is the SDK convenience accessor; fall back to repr if absent.
    text = getattr(resp, "output_text", None) or str(resp)
    LOGGER.info("OpenAI response (truncated 500 chars): %s", text[:500])
    return text
def run_gemini(
    images: Sequence[bytes],
    system_prompt: str,
    user_prompt: str,
    model: str,
    settings: Settings,
) -> str:
    """Send *images* plus prompts to a Gemini model and return the text reply.

    Two auth modes:
      - Vertex (preferred when GOOGLE_GENAI_USE_VERTEXAI=True): uses ADC / gcloud auth
      - API key (Studio): uses GEMINI_API_KEY

    Raises:
        LLMError: when credentials are missing or the request is rejected.
    """
    if settings.google_genai_use_vertexai:
        client = genai.Client(
            vertexai=True,
            project=settings.google_cloud_project,
            location=settings.google_cloud_location or "us-central1",
        )
    else:
        if not settings.gemini_api_key:
            raise LLMError("GEMINI_API_KEY is missing and vertex mode is disabled")
        client = genai.Client(api_key=settings.gemini_api_key)

    # Content order: system prompt first, then each image as a PNG part, then the user prompt.
    parts: List[genai_types.Part | str] = [system_prompt]
    for img in images:
        parts.append(genai_types.Part.from_bytes(data=img, mime_type="image/png"))
    parts.append(user_prompt)

    LOGGER.info(
        "Calling Gemini model=%s vertex=%s images=%s total_bytes=%s",
        model,
        settings.google_genai_use_vertexai,
        len(images),
        sum(len(i) for i in images),
    )
    try:
        response = client.models.generate_content(
            model=model,
            contents=parts,
            config=genai_types.GenerateContentConfig(response_modalities=["text"]),
        )
    except genai_errors.ClientError as exc:
        # Provide clearer guidance for common auth/model issues.
        raise LLMError(
            "Gemini request failed. "
            "If using Vertex, ensure the model exists in your project/location and ADC is active (`gcloud auth application-default login`). "
            "If using Studio/API key (e.g., on HuggingFace), set GOOGLE_GENAI_USE_VERTEXAI=false and provide GEMINI_API_KEY. "
            f"Details: {exc}"
        ) from exc

    # Extraction order (same as before, minus the fragile `"text" not in locals()`
    # introspection): prefer `.text`, then any concatenated text parts, then str().
    text: str | None = response.text if getattr(response, "text", None) else None
    resp_parts = getattr(response, "parts", None)
    if resp_parts:
        text_parts = [p.text for p in resp_parts if getattr(p, "text", None)]
        if text_parts:
            text = "\n".join(text_parts)
    if text is None:
        text = str(response)

    LOGGER.info("Gemini response (truncated 500 chars): %s", text[:500])
    return text
def analyze(
    images: Sequence[bytes],
    system_prompt: str,
    user_prompt: str,
    model_choice: str,
    settings: Settings,
) -> str:
    """Dispatch to the correct provider based on model_choice."""
    openai_models = {OPENAI_GPT5, OPENAI_GPT5_MINI}
    if model_choice in openai_models:
        return run_openai(images, system_prompt, user_prompt, model_choice, settings)
    if model_choice.startswith("gemini"):
        return run_gemini(images, system_prompt, user_prompt, model_choice, settings)
    raise LLMError(f"Unsupported model choice: {model_choice}")