Spaces:

ntairov
/

generativeai2

Sleeping

Nazim Tairov

initial commit

317adb5 4 months ago

16.9 kB

	import io
	import logging

	import streamlit as st
	from dotenv import load_dotenv

	from llm_pipeline import build_image_prompt, generate_image, transcribe_audio


	# Load environment variables from a .env file if present. The sidebar help
	# text refers to OPENAI_API_KEY as the environment fallback for the API key.
	load_dotenv()


	# -----------------------------------------------------------------------------
	# Logging setup
	# -----------------------------------------------------------------------------
	# Configure logging once at import time: INFO level, with timestamp, level
	# name and logger name prepended to every message.
	logging.basicConfig(
	level=logging.INFO,
	format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
	)
	# Named logger for this app's own messages.
	logger = logging.getLogger("voice_to_image_app")


	# -----------------------------------------------------------------------------
	# Streamlit page configuration
	# -----------------------------------------------------------------------------
	# NOTE(review): Streamlit documents that set_page_config should be the first
	# Streamlit command executed in a script - keep it ahead of any other st.* call.
	st.set_page_config(
	page_title="Voice to Image Agent",
	page_icon="🎨",
	layout="wide",
	initial_sidebar_state="expanded",
	)

	# -----------------------------------------------------------------------------
	# Custom CSS for modern styling
	# -----------------------------------------------------------------------------
	# The stylesheet is kept in a named constant so the injection call below stays
	# a one-liner; the CSS text itself is passed to Streamlit unchanged.
	_CUSTOM_CSS = """
	<style>
	/* Main container styling */
	.main .block-container {
	padding-top: 2rem;
	padding-bottom: 2rem;
	}

	/* Header styling */
	h1 {
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	font-size: 3rem;
	font-weight: 700;
	margin-bottom: 0.5rem;
	}

	/* Card-like containers */
	.result-card {
	background-color: #f8f9fa;
	border-radius: 10px;
	padding: 1.5rem;
	margin: 1rem 0;
	border-left: 4px solid #667eea;
	box-shadow: 0 2px 4px rgba(0,0,0,0.1);
	}

	/* Step indicators */
	.step-indicator {
	display: flex;
	align-items: center;
	margin: 1.5rem 0;
	padding: 1rem;
	background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
	border-radius: 10px;
	}

	/* Status badges */
	.status-badge {
	display: inline-block;
	padding: 0.25rem 0.75rem;
	border-radius: 20px;
	font-size: 0.85rem;
	font-weight: 600;
	margin-left: 0.5rem;
	}

	.status-success {
	background-color: #d4edda;
	color: #155724;
	}

	.status-pending {
	background-color: #fff3cd;
	color: #856404;
	}

	/* Sidebar styling */
	.sidebar .sidebar-content {
	background: linear-gradient(180deg, #f8f9fa 0%, #ffffff 100%);
	}

	/* Button styling */
	.stButton > button {
	width: 100%;
	border-radius: 8px;
	font-weight: 600;
	padding: 0.75rem;
	transition: all 0.3s ease;
	}

	.stButton > button:hover {
	transform: translateY(-2px);
	box-shadow: 0 4px 8px rgba(0,0,0,0.2);
	}

	/* Metric cards */
	[data-testid="stMetricValue"] {
	font-size: 1.5rem;
	}

	/* Divider styling */
	hr {
	margin: 2rem 0;
	border: none;
	border-top: 2px solid #e0e0e0;
	}
	</style>
	"""

	# unsafe_allow_html is needed so the <style> element reaches the page as HTML.
	st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)


	def init_session_state() -> None:
	    """Seed ``st.session_state`` with the keys the rest of the app reads.

	    A key that is already present keeps its current value; only missing
	    keys receive the initial value listed here.
	    """
	    state = st.session_state
	    initial_values = dict(
	        transcript="",
	        image_prompt="",
	        image_bytes=None,
	        error_message="",
	        openai_api_key="",
	        # Model identifiers, editable from the sidebar.
	        models_used=dict(
	            transcription_model="whisper-1",
	            llm_model="gpt-5-nano",
	            image_model="gpt-image-1",
	        ),
	        image_size="1024x1024",
	    )
	    for name in initial_values:
	        state.setdefault(name, initial_values[name])


	def render_sidebar() -> None:
	    """Draw the settings sidebar and copy widget values into session state.

	    Sections, in order: API key, model names, image resolution, an info tip
	    and, once a transcript or image exists, a three-line pipeline status.
	    """
	    sidebar = st.sidebar
	    state = st.session_state

	    sidebar.markdown(
	        """
	<div style='text-align: center; padding: 1rem 0;'>
	<h2 style='margin: 0; color: #667eea;'>⚙️ Settings</h2>
	</div>
	""",
	        unsafe_allow_html=True,
	    )
	    sidebar.markdown(
	        "Configure the models and parameters for the Voice → Image pipeline."
	    )
	    sidebar.markdown("---")

	    # --- API key -------------------------------------------------------------
	    sidebar.markdown("### 🔑 API Configuration")
	    state.openai_api_key = sidebar.text_input(
	        "OpenAI API Key",
	        value=state.openai_api_key,
	        type="password",
	        help="Enter your OpenAI API key. If left empty, will use OPENAI_API_KEY from environment variables.",
	        key="api_key_input",
	    )
	    if not state.openai_api_key:
	        sidebar.warning("⚠️ API key not set. Using environment variable if available.")
	    sidebar.markdown("---")

	    # --- Model names ---------------------------------------------------------
	    sidebar.markdown("### 🎯 Model Configuration")
	    # (models_used key, widget label, help text, widget key), in display order.
	    model_inputs = (
	        (
	            "transcription_model",
	            "🎤 Transcription Model",
	            "OpenAI audio transcription model (e.g., `whisper-1`).",
	            "transcription_input",
	        ),
	        (
	            "llm_model",
	            "🤖 LLM Model",
	            "OpenAI chat model for building the image description.",
	            "llm_input",
	        ),
	        (
	            "image_model",
	            "🎨 Image Model",
	            "OpenAI image generation model (e.g., `gpt-image-1`).",
	            "image_input",
	        ),
	    )
	    with sidebar.container():
	        for field, label, help_text, widget_key in model_inputs:
	            state.models_used[field] = st.text_input(
	                label,
	                value=state.models_used[field],
	                help=help_text,
	                key=widget_key,
	            )
	    sidebar.markdown("---")

	    # --- Image options -------------------------------------------------------
	    sidebar.markdown("### 🖼️ Image Options")
	    size_choices = ["512x512", "768x768", "1024x1024"]
	    if state.image_size in size_choices:
	        size_index = size_choices.index(state.image_size)
	    else:
	        # Unknown stored size: preselect the last entry (1024x1024).
	        size_index = 2
	    state.image_size = sidebar.selectbox(
	        "Image Resolution",
	        options=size_choices,
	        index=size_index,
	        help="Higher resolution = better quality but slower generation.",
	    )
	    sidebar.markdown("---")

	    # --- Info ----------------------------------------------------------------
	    sidebar.markdown("### ℹ️ Information")
	    sidebar.info(
	        "💡 Tip: Logs are printed to the terminal where you run `streamlit run app.py`."
	    )

	    # --- Pipeline status (only once something has been produced) -------------
	    if state.transcript or state.image_bytes:
	        sidebar.markdown("---")
	        sidebar.markdown("### 📊 Pipeline Status")
	        progress = (
	            (state.transcript, "Transcription"),
	            (state.image_prompt, "Prompt Generation"),
	            (state.image_bytes, "Image Generation"),
	        )
	        for produced, stage in progress:
	            marker = "✅" if produced else "⏳"
	            sidebar.markdown(f"{marker} {stage}")


	def run_pipeline(uploaded_audio) -> None:
	    """Run the voice → transcript → image prompt → image pipeline.

	    Args:
	        uploaded_audio: The file returned by ``st.file_uploader`` (or
	            ``None``). Any object exposing ``getvalue()`` or ``read()``
	            is accepted; ``name`` is used when present.

	    Side effects:
	        Resets and then fills these ``st.session_state`` keys:
	        ``error_message``, ``transcript``, ``image_prompt``,
	        ``image_bytes`` and ``models_used["image_model"]``. Nothing is
	        raised to the caller; any failure is logged and reported through
	        ``error_message``.
	    """
	    state = st.session_state

	    # Start from a clean slate so results of an earlier run never mix with
	    # the results (or failure) of this one.
	    state.error_message = ""
	    state.transcript = ""
	    state.image_prompt = ""
	    state.image_bytes = None

	    if uploaded_audio is None:
	        state.error_message = "Please upload an audio file first."
	        return

	    # Prefer getvalue(): it returns the whole buffer regardless of the current
	    # stream position, whereas read() yields nothing once the stream has
	    # already been consumed (for example by the audio preview).
	    read_all = getattr(uploaded_audio, "getvalue", None)
	    audio_bytes = read_all() if callable(read_all) else uploaded_audio.read()
	    if not audio_bytes:
	        state.error_message = "The uploaded audio file is empty."
	        return

	    # Wrap the bytes in a fresh file-like object for the transcription call.
	    audio_buffer = io.BytesIO(audio_bytes)
	    # OpenAI expects a name with an extension
	    audio_buffer.name = getattr(uploaded_audio, "name", None) or "voice_message.wav"

	    transcription_model = state.models_used["transcription_model"]
	    llm_model = state.models_used["llm_model"]
	    image_model = state.models_used["image_model"]

	    # An empty key is passed on as None so the helpers can fall back to the
	    # environment variable.
	    api_key = state.openai_api_key or None

	    try:
	        with st.spinner("Transcribing audio with Whisper..."):
	            logger.info("Step 1/3: Transcribing audio.")
	            transcript = transcribe_audio(
	                audio_buffer,
	                model=transcription_model,
	                api_key=api_key,
	            )
	            state.transcript = transcript

	        with st.spinner("Building image prompt with LLM..."):
	            logger.info("Step 2/3: Building image prompt from transcript.")
	            prompt = build_image_prompt(
	                transcript,
	                model=llm_model,
	                api_key=api_key,
	            )
	            state.image_prompt = prompt

	        with st.spinner("Generating image from prompt..."):
	            logger.info("Step 3/3: Generating image from prompt.")
	            image_bytes, metadata = generate_image(
	                prompt,
	                model=image_model,
	                size=state.image_size,
	                api_key=api_key,
	            )
	            state.image_bytes = image_bytes

	            # Record the model reported in the image metadata, falling back
	            # to the requested one when the metadata has no "model" entry.
	            state.models_used["image_model"] = metadata.get("model", image_model)

	        logger.info("Pipeline finished successfully.")

	    except Exception as exc:  # noqa: BLE001
	        # UI boundary: log the traceback and surface the failure to the user
	        # instead of crashing the script run.
	        logger.exception("Pipeline failed: %s", exc)
	        # Some exceptions stringify to "", which would hide the failure from
	        # the UI; fall back to the exception class name in that case.
	        state.error_message = str(exc) or type(exc).__name__


	def render_pipeline_steps() -> None:
	    """Show one card per pipeline stage, side by side in three columns.

	    A stage whose session-state value is truthy gets the highlighted
	    "Complete" card; otherwise the greyed-out "Pending" card is drawn.
	    """
	    state = st.session_state
	    stages = (
	        ("🎤", "Transcription", state.transcript),
	        ("✍️", "Prompt Building", state.image_prompt),
	        ("🎨", "Image Generation", state.image_bytes),
	    )

	    for column, (icon, label, done) in zip(st.columns(3), stages):
	        if done:
	            card_html = f"""
	<div style='text-align: center; padding: 1rem;
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	border-radius: 10px; color: white;'>
	<div style='font-size: 2rem; margin-bottom: 0.5rem;'>{icon}</div>
	<div style='font-weight: 600;'>{label}</div>
	<div style='font-size: 0.8rem; margin-top: 0.5rem;'>✅ Complete</div>
	</div>
	"""
	        else:
	            card_html = f"""
	<div style='text-align: center; padding: 1rem;
	background: #f0f0f0; border-radius: 10px;'>
	<div style='font-size: 2rem; margin-bottom: 0.5rem; opacity: 0.5;'>{icon}</div>
	<div style='font-weight: 600; opacity: 0.7;'>{label}</div>
	<div style='font-size: 0.8rem; margin-top: 0.5rem; opacity: 0.5;'>⏳ Pending</div>
	</div>
	"""
	        with column:
	            st.markdown(card_html, unsafe_allow_html=True)


	def main() -> None:
	    """Build the page: sidebar, header, step indicators and the two tabs.

	    Tab 1 handles the audio upload, preview and the "run" button; tab 2
	    shows the transcript, the generated prompt, the image and the model
	    names, all read from ``st.session_state``.
	    """
	    # Local import: only this function needs HTML escaping.
	    import html

	    init_session_state()
	    render_sidebar()

	    # Header section
	    st.markdown(
	        """
	<div style='text-align: center; margin-bottom: 2rem;'>
	<h1>🎙️ Voice to Image Agent</h1>
	<p style='font-size: 1.1rem; color: #666; margin-top: -1rem;'>
	Transform your voice into stunning AI-generated images
	</p>
	</div>
	""",
	        unsafe_allow_html=True,
	    )

	    # Pipeline steps visualization
	    render_pipeline_steps()

	    st.markdown("<br>", unsafe_allow_html=True)

	    # Main content area
	    tab1, tab2 = st.tabs(["🎤 Upload & Generate", "📊 Results & Details"])

	    with tab1:
	        st.markdown("### Upload Your Voice Message")
	        st.markdown(
	            "Upload a short audio file (`.wav`, `.mp3`, `.m4a`, `.ogg`, `.webm`). "
	            "The agent will transcribe it and transform it into a beautiful image."
	        )

	        st.markdown("<br>", unsafe_allow_html=True)

	        # Audio upload section
	        uploaded_audio = st.file_uploader(
	            "Choose an audio file",
	            type=["wav", "mp3", "m4a", "ogg", "webm"],
	            accept_multiple_files=False,
	            help="Supported formats: WAV, MP3, M4A, OGG, WebM",
	        )

	        if uploaded_audio is not None:
	            st.markdown("Audio Preview:")
	            # Use the MIME type reported for the upload rather than always
	            # claiming WAV; mp3/m4a/ogg/webm files are accepted too.
	            audio_mime = getattr(uploaded_audio, "type", None) or "audio/wav"
	            st.audio(uploaded_audio, format=audio_mime)

	            # File info
	            file_size_mb = len(uploaded_audio.getvalue()) / (1024 * 1024)
	            col1, col2 = st.columns(2)
	            with col1:
	                st.metric("File Name", uploaded_audio.name)
	            with col2:
	                st.metric("File Size", f"{file_size_mb:.2f} MB")

	        st.markdown("<br>", unsafe_allow_html=True)

	        # Run button, centred by two empty spacer columns
	        run_button_disabled = uploaded_audio is None
	        _, col_btn, _ = st.columns([1, 2, 1])
	        with col_btn:
	            if st.button(
	                "🚀 Run Voice → Image Pipeline",
	                type="primary",
	                disabled=run_button_disabled,
	                use_container_width=True,
	            ):
	                run_pipeline(uploaded_audio)
	                # The sidebar status and the step indicators above were drawn
	                # before the pipeline ran; rerun so they reflect the new
	                # session state. Results and errors live in session state and
	                # therefore survive the rerun.
	                st.rerun()

	        if st.session_state.error_message:
	            st.error(f"❌ Error: {st.session_state.error_message}")

	    with tab2:
	        # Results section with better formatting
	        if not st.session_state.transcript and not st.session_state.image_bytes:
	            st.info("👆 Upload an audio file and run the pipeline to see results here.")
	        else:
	            # Transcript section
	            st.markdown("### 📝 Transcribed Text")
	            if st.session_state.transcript:
	                # The transcript is model output derived from user audio;
	                # escape it before embedding it in raw HTML.
	                safe_transcript = html.escape(st.session_state.transcript)
	                st.markdown(
	                    f"""
	<div class='result-card'>
	<p style='font-size: 1.1rem; line-height: 1.6; color: #333;'>
	{safe_transcript}
	</p>
	</div>
	""",
	                    unsafe_allow_html=True,
	                )
	            else:
	                st.info("⏳ Transcript will appear here after transcription.")

	            st.markdown("<br>", unsafe_allow_html=True)

	            # Prompt section
	            st.markdown("### ✍️ Enhanced Image Prompt")
	            if st.session_state.image_prompt:
	                # Same reasoning as for the transcript: escape model output.
	                safe_prompt = html.escape(st.session_state.image_prompt)
	                st.markdown(
	                    f"""
	<div class='result-card'>
	<p style='font-size: 1rem; line-height: 1.6; color: #555; font-style: italic;'>
	"{safe_prompt}"
	</p>
	</div>
	""",
	                    unsafe_allow_html=True,
	                )
	            else:
	                st.info("⏳ The LLM-generated prompt will appear here after transcription.")

	            st.markdown("<br>", unsafe_allow_html=True)

	            # Image section
	            st.markdown("### 🎨 Generated Image")
	            if st.session_state.image_bytes:
	                # Centre the image using two empty spacer columns
	                _, col_img, _ = st.columns([1, 3, 1])
	                with col_img:
	                    st.image(
	                        st.session_state.image_bytes,
	                        caption="✨ Generated by the image model",
	                        use_container_width=True,
	                    )

	                # Image metadata
	                st.markdown("<br>", unsafe_allow_html=True)
	                col_meta1, col_meta2, col_meta3 = st.columns(3)
	                with col_meta1:
	                    st.metric("Model", st.session_state.models_used.get("image_model", "N/A"))
	                with col_meta2:
	                    st.metric("Size", st.session_state.image_size)
	                with col_meta3:
	                    img_size_kb = len(st.session_state.image_bytes) / 1024
	                    st.metric("File Size", f"{img_size_kb:.1f} KB")
	            else:
	                st.info("⏳ The generated image will appear here once ready.")

	            st.markdown("---")

	            # Models used section
	            st.markdown("### ⚙️ Models Used")
	            st.json(st.session_state.models_used)


	if __name__ == "__main__":
	    main()