# image-understanding / llm_clients.py
# (HF Space commit 2948ced by shahkushan1:
#  "Add Gradio micro-trend app with LLM integrations and prompt loading")
"""LLM provider wrappers (OpenAI + Gemini 3) with a unified analyze interface."""
from __future__ import annotations
import base64
import json
import logging
from typing import List, Sequence
from openai import OpenAI
from google import genai
from google.genai import types as genai_types
from google.genai import errors as genai_errors
from settings import Settings
LOGGER = logging.getLogger("llm")
# Model identifiers exposed to the UI
OPENAI_GPT5 = "gpt-5"
OPENAI_GPT5_MINI = "gpt-5-mini"
# Gemini 3 multimodal text-out model (supports image+text input, text output)
GEMINI_3_VISION = "gemini-3-pro-preview"
class LLMError(RuntimeError):
    """Raised when an LLM provider call cannot be made (missing credentials) or fails."""
def _encode_image_to_data_url(image_bytes: bytes, mime: str = "image/png") -> str:
b64 = base64.b64encode(image_bytes).decode("utf-8")
return f"data:{mime};base64,{b64}"
def _collect_openai_messages(system_prompt: str, user_prompt: str, images: Sequence[bytes]):
system = {"role": "system", "content": [{"type": "input_text", "text": system_prompt}]}
user_content = [{"type": "input_text", "text": user_prompt}]
for img in images:
user_content.append({"type": "input_image", "image_url": _encode_image_to_data_url(img)})
user = {"role": "user", "content": user_content}
return [system, user]
def run_openai(
    images: Sequence[bytes],
    system_prompt: str,
    user_prompt: str,
    model: str,
    settings: Settings,
) -> str:
    """Send *images* plus both prompts to an OpenAI Responses-API model; return its text.

    Raises:
        LLMError: if no OpenAI API key is configured in *settings*.
    """
    if not settings.openai_api_key:
        raise LLMError("OPENAI_API_KEY is missing")

    client = OpenAI(api_key=settings.openai_api_key)
    payload = _collect_openai_messages(system_prompt, user_prompt, images)

    effort = settings.openai_reasoning_effort
    extra = {"reasoning": {"effort": effort}} if effort else {}

    LOGGER.info(
        "Calling OpenAI model=%s reasoning=%s images=%s total_bytes=%s",
        model,
        effort,
        len(images),
        sum(len(i) for i in images),
    )
    resp = client.responses.create(model=model, input=payload, **extra)
    # `output_text` is the SDK's convenience accessor; fall back to the raw repr.
    text = getattr(resp, "output_text", None) or str(resp)
    LOGGER.info("OpenAI response (truncated 500 chars): %s", text[:500])
    return text
def run_gemini(
    images: Sequence[bytes],
    system_prompt: str,
    user_prompt: str,
    model: str,
    settings: Settings,
) -> str:
    """Send *images* plus both prompts to a Gemini model; return its text output.

    Two auth modes:
      - Vertex (preferred when GOOGLE_GENAI_USE_VERTEXAI=True): uses ADC / gcloud auth
      - API key (Studio): uses GEMINI_API_KEY

    Raises:
        LLMError: if credentials are missing, or the Gemini request fails.
    """
    if settings.google_genai_use_vertexai:
        client = genai.Client(
            vertexai=True,
            project=settings.google_cloud_project,
            location=settings.google_cloud_location or "us-central1",
        )
    else:
        if not settings.gemini_api_key:
            raise LLMError("GEMINI_API_KEY is missing and vertex mode is disabled")
        client = genai.Client(api_key=settings.gemini_api_key)

    # Content order: system prompt first, then every image, then the user prompt.
    parts: List[genai_types.Part | str] = [system_prompt]
    for img in images:
        parts.append(genai_types.Part.from_bytes(data=img, mime_type="image/png"))
    parts.append(user_prompt)

    LOGGER.info(
        "Calling Gemini model=%s vertex=%s images=%s total_bytes=%s",
        model,
        settings.google_genai_use_vertexai,
        len(images),
        sum(len(i) for i in images),
    )
    try:
        response = client.models.generate_content(
            model=model,
            contents=parts,
            # NOTE(review): SDK docs use uppercase "TEXT" for response_modalities —
            # confirm the lowercase value is accepted by the installed google-genai version.
            config=genai_types.GenerateContentConfig(response_modalities=["text"]),
        )
    except genai_errors.ClientError as exc:
        # Provide clearer guidance for common auth/model issues.
        raise LLMError(
            "Gemini request failed. "
            "If using Vertex, ensure the model exists in your project/location and ADC is active (`gcloud auth application-default login`). "
            "If using Studio/API key (e.g., on HuggingFace), set GOOGLE_GENAI_USE_VERTEXAI=false and provide GEMINI_API_KEY. "
            f"Details: {exc}"
        ) from exc

    # Prefer `.text`; fall back to concatenated text parts, then to the raw repr.
    # (Bug fix: the original checked `.parts` *after* `.text` and let the join
    # overwrite a truthy `.text`, contradicting the stated preference; it also
    # relied on a fragile `"text" not in locals()` check.)
    text = getattr(response, "text", None)
    if not text:
        text_parts = [
            p.text
            for p in (getattr(response, "parts", None) or [])
            if getattr(p, "text", None)
        ]
        text = "\n".join(text_parts) if text_parts else str(response)
    LOGGER.info("Gemini response (truncated 500 chars): %s", text[:500])
    return text
def analyze(
    images: Sequence[bytes],
    system_prompt: str,
    user_prompt: str,
    model_choice: str,
    settings: Settings,
) -> str:
    """Dispatch to the correct provider based on model_choice."""
    args = (images, system_prompt, user_prompt, model_choice, settings)
    if model_choice in (OPENAI_GPT5, OPENAI_GPT5_MINI):
        return run_openai(*args)
    if model_choice.startswith("gemini"):
        return run_gemini(*args)
    raise LLMError(f"Unsupported model choice: {model_choice}")