# image-understanding / llm_clients.py
# (HF Space commit 2948ced by shahkushan1:
#  "Add Gradio micro-trend app with LLM integrations and prompt loading")
"""LLM provider wrappers (OpenAI + Gemini 3) with a unified analyze interface."""
from __future__ import annotations
import base64
import json
import logging
from typing import List, Sequence
from openai import OpenAI
from google import genai
from google.genai import types as genai_types
from google.genai import errors as genai_errors
from settings import Settings
LOGGER = logging.getLogger("llm")
# Model identifiers exposed to the UI
OPENAI_GPT5 = "gpt-5"
OPENAI_GPT5_MINI = "gpt-5-mini"
# Gemini 3 multimodal text-out model (supports image+text input, text output)
GEMINI_3_VISION = "gemini-3-pro-preview"
class LLMError(RuntimeError):
    """Raised when an LLM provider call cannot be made (missing credentials) or fails."""
def _encode_image_to_data_url(image_bytes: bytes, mime: str = "image/png") -> str:
b64 = base64.b64encode(image_bytes).decode("utf-8")
return f"data:{mime};base64,{b64}"
def _collect_openai_messages(system_prompt: str, user_prompt: str, images: Sequence[bytes]):
system = {"role": "system", "content": [{"type": "input_text", "text": system_prompt}]}
user_content = [{"type": "input_text", "text": user_prompt}]
for img in images:
user_content.append({"type": "input_image", "image_url": _encode_image_to_data_url(img)})
user = {"role": "user", "content": user_content}
return [system, user]
def run_openai(
    images: Sequence[bytes],
    system_prompt: str,
    user_prompt: str,
    model: str,
    settings: Settings,
) -> str:
    """Send *images* plus both prompts to an OpenAI Responses-API model; return its text.

    Raises:
        LLMError: if no OpenAI API key is configured in *settings*.
    """
    if not settings.openai_api_key:
        raise LLMError("OPENAI_API_KEY is missing")

    client = OpenAI(api_key=settings.openai_api_key)
    payload = _collect_openai_messages(system_prompt, user_prompt, images)

    effort = settings.openai_reasoning_effort
    extra = {"reasoning": {"effort": effort}} if effort else {}

    LOGGER.info(
        "Calling OpenAI model=%s reasoning=%s images=%s total_bytes=%s",
        model,
        effort,
        len(images),
        sum(len(i) for i in images),
    )
    resp = client.responses.create(model=model, input=payload, **extra)
    # `output_text` is the SDK's convenience accessor; fall back to the raw repr.
    text = getattr(resp, "output_text", None) or str(resp)
    LOGGER.info("OpenAI response (truncated 500 chars): %s", text[:500])
    return text
def run_gemini(
    images: Sequence[bytes],
    system_prompt: str,
    user_prompt: str,
    model: str,
    settings: Settings,
) -> str:
    """Send *images* plus both prompts to a Gemini model; return its text output.

    Two auth modes:
      - Vertex (preferred when GOOGLE_GENAI_USE_VERTEXAI=True): uses ADC / gcloud auth
      - API key (Studio): uses GEMINI_API_KEY

    Raises:
        LLMError: if credentials are missing, or the Gemini request fails.
    """
    if settings.google_genai_use_vertexai:
        client = genai.Client(
            vertexai=True,
            project=settings.google_cloud_project,
            location=settings.google_cloud_location or "us-central1",
        )
    else:
        if not settings.gemini_api_key:
            raise LLMError("GEMINI_API_KEY is missing and vertex mode is disabled")
        client = genai.Client(api_key=settings.gemini_api_key)

    # Content order: system prompt first, then every image, then the user prompt.
    parts: List[genai_types.Part | str] = [system_prompt]
    for img in images:
        parts.append(genai_types.Part.from_bytes(data=img, mime_type="image/png"))
    parts.append(user_prompt)

    LOGGER.info(
        "Calling Gemini model=%s vertex=%s images=%s total_bytes=%s",
        model,
        settings.google_genai_use_vertexai,
        len(images),
        sum(len(i) for i in images),
    )
    try:
        response = client.models.generate_content(
            model=model,
            contents=parts,
            # NOTE(review): SDK docs use uppercase "TEXT" for response_modalities —
            # confirm the lowercase value is accepted by the installed google-genai version.
            config=genai_types.GenerateContentConfig(response_modalities=["text"]),
        )
    except genai_errors.ClientError as exc:
        # Provide clearer guidance for common auth/model issues.
        raise LLMError(
            "Gemini request failed. "
            "If using Vertex, ensure the model exists in your project/location and ADC is active (`gcloud auth application-default login`). "
            "If using Studio/API key (e.g., on HuggingFace), set GOOGLE_GENAI_USE_VERTEXAI=false and provide GEMINI_API_KEY. "
            f"Details: {exc}"
        ) from exc

    # Prefer `.text`; fall back to concatenated text parts, then to the raw repr.
    # (Bug fix: the original checked `.parts` *after* `.text` and let the join
    # overwrite a truthy `.text`, contradicting the stated preference; it also
    # relied on a fragile `"text" not in locals()` check.)
    text = getattr(response, "text", None)
    if not text:
        text_parts = [
            p.text
            for p in (getattr(response, "parts", None) or [])
            if getattr(p, "text", None)
        ]
        text = "\n".join(text_parts) if text_parts else str(response)
    LOGGER.info("Gemini response (truncated 500 chars): %s", text[:500])
    return text
def analyze(
    images: Sequence[bytes],
    system_prompt: str,
    user_prompt: str,
    model_choice: str,
    settings: Settings,
) -> str:
    """Dispatch to the correct provider based on model_choice."""
    args = (images, system_prompt, user_prompt, model_choice, settings)
    if model_choice in (OPENAI_GPT5, OPENAI_GPT5_MINI):
        return run_openai(*args)
    if model_choice.startswith("gemini"):
        return run_gemini(*args)
    raise LLMError(f"Unsupported model choice: {model_choice}")