Spaces:

ntairov
/

generativeai2

Sleeping

Nazim Tairov

initial commit

317adb5 5 months ago

9.06 kB

	import base64
	import io
	import logging
	import os
	from typing import Dict, Tuple

	from openai import OpenAI


	# Module-level logger, named after this module via __name__; used by the helpers below.
	logger = logging.getLogger(__name__)


	def _get_openai_client(api_key: str | None = None) -> OpenAI:
	    """
	    Create an OpenAI client from an explicit key or the environment.

	    Parameters
	    ----------
	    api_key:
	        Optional OpenAI API key. A non-empty value is used as-is.
	        If None or empty, the OPENAI_API_KEY environment variable is used.

	    Returns
	    -------
	    OpenAI
	        A client constructed with the resolved API key.

	    Raises
	    ------
	    RuntimeError
	        If neither the argument nor OPENAI_API_KEY provides a non-empty key.
	    """
	    # An explicit key wins; the environment is only consulted as a fallback.
	    resolved_key = api_key or os.getenv("OPENAI_API_KEY")
	    if not resolved_key:
	        raise RuntimeError(
	            "OpenAI API key is required. "
	            "Please provide it in the app settings or set OPENAI_API_KEY environment variable."
	        )

	    return OpenAI(api_key=resolved_key)


	def transcribe_audio(
	    audio_file: io.BufferedIOBase,
	    model: str = "whisper-1",
	    api_key: str | None = None,
	) -> str:
	    """
	    Transcribe an audio file to text using an OpenAI transcription model.

	    Parameters
	    ----------
	    audio_file:
	        A seekable binary file-like object with a valid `.name` attribute
	        that includes an audio file extension. It is rewound to the start
	        before being uploaded.
	    model:
	        The OpenAI audio transcription model to use.
	    api_key:
	        Optional OpenAI API key. If not provided, uses environment variable.

	    Returns
	    -------
	    str
	        The transcribed text with surrounding whitespace removed.

	    Raises
	    ------
	    RuntimeError
	        If no API key can be resolved.
	    ValueError
	        If the service returns no text (or only whitespace).
	    """
	    client = _get_openai_client(api_key=api_key)
	    logger.info("Starting transcription with model=%s", model)

	    # The caller may already have read from the stream; upload from the start.
	    audio_file.seek(0)

	    transcript = client.audio.transcriptions.create(
	        model=model,
	        file=audio_file,
	        response_format="json",
	        temperature=0,
	    )

	    # Treat a missing text field like an empty transcription so the caller
	    # gets the descriptive ValueError below rather than an AttributeError.
	    text = (transcript.text or "").strip()
	    logger.info("Transcription finished. Length=%d characters", len(text))

	    if not text:
	        raise ValueError(
	            "Transcription returned empty text. The audio file may be too quiet, "
	            "corrupted, or contain no speech. Please try with a different audio file."
	        )

	    return text


	def build_image_prompt(
	transcript: str,
	model: str = "gpt-4o-mini",
	api_key: str \| None = None,
	) -> str:
	"""
	Use an LLM to convert a raw user transcript into a detailed image prompt.

	The returned text is intended to be passed directly to the image
	generation model.

	Parameters
	----------
	transcript:
	The transcribed text from the audio file.
	model:
	The OpenAI chat model to use for prompt generation.
	api_key:
	Optional OpenAI API key. If not provided, uses environment variable.
	"""
	# Validate transcript is not empty
	if not transcript or not transcript.strip():
	raise ValueError(
	"Transcript is empty. Cannot generate image prompt from empty transcript."
	)

	client = _get_openai_client(api_key=api_key)
	logger.info("Starting prompt generation with model=%s", model)
	logger.info("Transcript received: %s", transcript[:100] + "..." if len(transcript) > 100 else transcript)

	# Enhanced system message with more explicit instructions
	system_message = (
	"You are a creative assistant that transforms voice requests into detailed, "
	"concrete image generation prompts. Your task is CRITICAL: you MUST always return "
	"a non-empty, descriptive image prompt.\n\n"
	"Guidelines:\n"
	"- Focus purely on describing a visual scene, not dialogue or conversation\n"
	"- Include specific details: subject, style, lighting, colors, composition, "
	"camera angle, and atmosphere\n"
	"- Keep it concise but vivid (1-2 sentences)\n"
	"- If the transcript is unclear, interpret it creatively and describe a reasonable visual scene\n"
	"- NEVER return empty text, whitespace only, or just punctuation\n"
	"- Your response must be a complete, usable image generation prompt\n\n"
	"CRITICAL: You must ALWAYS return a non-empty prompt. Even if the transcript is unclear, "
	"create a descriptive visual prompt based on your best interpretation."
	)

	user_message = (
	"User voice transcript:\n"
	f"\"{transcript}\"\n\n"
	"Transform this into a single, self-contained image generation prompt that "
	"captures the user's intent as a visual scene. Return ONLY the prompt text, "
	"nothing else. Make it detailed and visually descriptive."
	)

	# Try with the primary prompt first
	try:
	completion = client.chat.completions.create(
	model=model,
	messages=[
	{"role": "system", "content": system_message},
	{"role": "user", "content": user_message},
	],
	temperature=0.8, # Slightly lower for more consistency
	max_tokens=300,
	)

	prompt = completion.choices[0].message.content
	if prompt is None:
	logger.warning("LLM returned None, attempting retry with fallback approach")
	raise ValueError("LLM returned None")

	prompt = prompt.strip()

	# If empty, try a more direct approach
	if not prompt:
	logger.warning("LLM returned empty prompt, attempting retry with simpler approach")
	raise ValueError("Empty prompt returned")

	logger.info("Prompt generation finished. Length=%d characters", len(prompt))
	return prompt

	except (ValueError, Exception) as e:
	# Retry with a simpler, more direct approach
	logger.info("Retrying prompt generation with fallback approach")

	fallback_system = (
	"You are an image prompt generator. Transform the given text into a visual description. "
	"Always return a descriptive image prompt, even if the input is unclear. "
	"Make it creative and detailed."
	)

	fallback_user = (
	f"Create an image generation prompt from this text: \"{transcript}\"\n\n"
	"Return a detailed visual description suitable for image generation. "
	"Include subject, style, and visual details."
	)

	try:
	completion = client.chat.completions.create(
	model=model,
	messages=[
	{"role": "system", "content": fallback_system},
	{"role": "user", "content": fallback_user},
	],
	temperature=0.7,
	max_tokens=250,
	)

	prompt = completion.choices[0].message.content
	if prompt:
	prompt = prompt.strip()
	if prompt:
	logger.info("Fallback prompt generation succeeded. Length=%d characters", len(prompt))
	return prompt
	except Exception as retry_error:
	logger.warning("Fallback attempt also failed: %s", retry_error)

	# Final fallback: create a basic prompt from the transcript
	logger.info("Using final fallback: creating prompt directly from transcript")
	fallback_prompt = (
	f"A detailed, artistic visualization of: {transcript.strip()}. "
	"High quality, vivid colors, professional composition, cinematic lighting."
	)
	logger.info("Using fallback prompt. Length=%d characters", len(fallback_prompt))
	return fallback_prompt


	def generate_image(
	prompt: str,
	model: str = "gpt-image-1",
	size: str = "1024x1024",
	api_key: str \| None = None,
	) -> Tuple[bytes, Dict[str, str]]:
	"""
	Generate an image from a text prompt using OpenAI's image model.

	Parameters
	----------
	prompt:
	The text prompt describing the image to generate.
	model:
	The OpenAI image generation model to use.
	size:
	The size of the generated image (e.g., "1024x1024").
	api_key:
	Optional OpenAI API key. If not provided, uses environment variable.

	Returns
	-------
	image_bytes:
	The raw bytes of the generated image (PNG by default).
	metadata:
	A small dict with additional information (e.g., model, size).
	"""
	# Validate prompt is not empty
	if not prompt or not prompt.strip():
	raise ValueError(
	"Image prompt is empty. Cannot generate image from empty prompt. "
	"Please ensure the transcript was properly transcribed and the LLM generated a valid prompt."
	)

	client = _get_openai_client(api_key=api_key)
	logger.info(
	"Starting image generation with model=%s size=%s", model, size
	)

	result = client.images.generate(
	model=model,
	prompt=prompt,
	size=size,
	n=1,
	)

	image_b64 = result.data[0].b64_json
	image_bytes = base64.b64decode(image_b64)

	logger.info("Image generation finished. Bytes=%d", len(image_bytes))
	return image_bytes, {"model": model, "size": size}