"""Streamlit app that turns an uploaded voice message into a generated image.

Flow: audio upload -> transcription -> LLM-built image prompt -> image.
The three pipeline steps are delegated to the ``llm_pipeline`` module; this
file only handles UI, session state and error reporting.
"""

import html
import io
import logging

import streamlit as st
from dotenv import load_dotenv

from llm_pipeline import build_image_prompt, generate_image, transcribe_audio

# Load environment variables from a .env file if present
load_dotenv()

# -----------------------------------------------------------------------------
# Logging setup
# -----------------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger("voice_to_image_app")

# -----------------------------------------------------------------------------
# Streamlit page configuration
# -----------------------------------------------------------------------------
st.set_page_config(
    page_title="Voice to Image Agent",
    page_icon="🎨",
    layout="wide",
    initial_sidebar_state="expanded",
)

# -----------------------------------------------------------------------------
# Custom CSS for modern styling
# -----------------------------------------------------------------------------
st.markdown(
    """
    """,
    unsafe_allow_html=True,
)

# Resolutions offered in the sidebar. Kept in one place so the selectbox
# options and the index lookup cannot drift apart.
# NOTE(review): confirm every size here is accepted by the configured image
# model; that is decided inside llm_pipeline.generate_image, not in this file.
IMAGE_SIZES = ("512x512", "768x768", "1024x1024")
DEFAULT_IMAGE_SIZE = "1024x1024"

# (models_used key, widget label, help text, widget key) for each model input.
_MODEL_FIELDS = (
    (
        "transcription_model",
        "🎤 Transcription Model",
        "OpenAI audio transcription model (e.g., `whisper-1`).",
        "transcription_input",
    ),
    (
        "llm_model",
        "🤖 LLM Model",
        "OpenAI chat model for building the image description.",
        "llm_input",
    ),
    (
        "image_model",
        "🎨 Image Model",
        "OpenAI image generation model (e.g., `gpt-image-1`).",
        "image_input",
    ),
)


def init_session_state() -> None:
    """Initialize Streamlit session state keys used across the app.

    Existing keys are left untouched, so values survive script reruns.
    """
    defaults = {
        "transcript": "",
        "image_prompt": "",
        "image_bytes": None,
        "error_message": "",
        "openai_api_key": "",
        "models_used": {
            "transcription_model": "whisper-1",
            "llm_model": "gpt-5-nano",
            "image_model": "gpt-image-1",
        },
        "image_size": DEFAULT_IMAGE_SIZE,
    }
    for key, value in defaults.items():
        st.session_state.setdefault(key, value)


def render_sidebar() -> None:
    """Render the configuration sidebar.

    Writes the API key, the three model names and the image size chosen by
    the user back into ``st.session_state``, and shows a per-step status list
    once a transcript or an image exists.
    """
    st.sidebar.markdown(
        """

⚙️ Settings

""",
        unsafe_allow_html=True,
    )
    st.sidebar.markdown(
        "Configure the models and parameters for the **Voice → Image** pipeline."
    )
    st.sidebar.markdown("---")

    # API Key section
    st.sidebar.markdown("### 🔑 API Configuration")
    st.session_state.openai_api_key = st.sidebar.text_input(
        "OpenAI API Key",
        value=st.session_state.openai_api_key,
        type="password",
        help=(
            "Enter your OpenAI API key. If left empty, will use "
            "OPENAI_API_KEY from environment variables."
        ),
        key="api_key_input",
    )
    if not st.session_state.openai_api_key:
        st.sidebar.warning(
            "⚠️ API key not set. Using environment variable if available."
        )
    st.sidebar.markdown("---")

    # Models section: one text input per entry in _MODEL_FIELDS
    st.sidebar.markdown("### 🎯 Model Configuration")
    models = st.session_state.models_used
    with st.sidebar.container():
        for state_key, label, help_text, widget_key in _MODEL_FIELDS:
            models[state_key] = st.text_input(
                label,
                value=models[state_key],
                help=help_text,
                key=widget_key,
            )
    st.sidebar.markdown("---")

    # Image options
    st.sidebar.markdown("### 🖼️ Image Options")
    try:
        size_index = IMAGE_SIZES.index(st.session_state.image_size)
    except ValueError:
        # Unknown size stored in the session: fall back to the default entry.
        size_index = IMAGE_SIZES.index(DEFAULT_IMAGE_SIZE)
    st.session_state.image_size = st.sidebar.selectbox(
        "Image Resolution",
        options=IMAGE_SIZES,
        index=size_index,
        help="Higher resolution = better quality but slower generation.",
    )
    st.sidebar.markdown("---")

    # Info section
    st.sidebar.markdown("### ℹ️ Information")
    st.sidebar.info(
        "💡 **Tip**: Logs are printed to the \n"
        "terminal where you run `streamlit run app.py`."
    )

    # Pipeline status, only shown once there is something to report
    if st.session_state.transcript or st.session_state.image_bytes:
        st.sidebar.markdown("---")
        st.sidebar.markdown("### 📊 Pipeline Status")
        status_rows = (
            (st.session_state.transcript, "Transcription"),
            (st.session_state.image_prompt, "Prompt Generation"),
            (st.session_state.image_bytes, "Image Generation"),
        )
        for done, label in status_rows:
            marker = "✅" if done else "⏳"
            st.sidebar.markdown(f"{marker} {label}")


def run_pipeline(uploaded_audio) -> None:
    """Run the full voice → transcript → image prompt → image pipeline.

    Args:
        uploaded_audio: The object returned by ``st.file_uploader`` (a
            file-like upload exposing ``getvalue()`` and ``name``), or
            ``None`` when nothing was uploaded.

    Side effects:
        Resets and then updates the ``transcript``, ``image_prompt``,
        ``image_bytes``, ``error_message`` and ``models_used`` session-state
        keys. Any exception raised by a pipeline step is logged and stored in
        ``error_message`` instead of propagating; results of steps that
        already succeeded are kept.
    """
    state = st.session_state
    state.error_message = ""
    state.transcript = ""
    state.image_prompt = ""
    state.image_bytes = None

    if uploaded_audio is None:
        state.error_message = "Please upload an audio file first."
        return

    # getvalue() returns the full payload regardless of the current stream
    # position; read() would yield b"" if the upload had already been consumed.
    audio_bytes = uploaded_audio.getvalue()
    if not audio_bytes:
        state.error_message = "The uploaded audio file is empty."
        return

    # Convert the uploaded file into a file-like object compatible with OpenAI
    audio_buffer = io.BytesIO(audio_bytes)
    # OpenAI expects a name with an extension
    audio_buffer.name = uploaded_audio.name or "voice_message.wav"

    models = state.models_used
    transcription_model = models["transcription_model"]
    llm_model = models["llm_model"]
    image_model = models["image_model"]

    # An empty key is passed on as None.
    # NOTE(review): presumably llm_pipeline then falls back to the
    # OPENAI_API_KEY environment variable, as the sidebar help text states.
    api_key = state.openai_api_key or None

    try:
        with st.spinner("Transcribing audio with Whisper..."):
            logger.info("Step 1/3: Transcribing audio.")
            transcript = transcribe_audio(
                audio_buffer,
                model=transcription_model,
                api_key=api_key,
            )
            state.transcript = transcript

        with st.spinner("Building image prompt with LLM..."):
            logger.info("Step 2/3: Building image prompt from transcript.")
            prompt = build_image_prompt(
                transcript,
                model=llm_model,
                api_key=api_key,
            )
            state.image_prompt = prompt

        with st.spinner("Generating image from prompt..."):
            logger.info("Step 3/3: Generating image from prompt.")
            image_bytes, metadata = generate_image(
                prompt,
                model=image_model,
                size=state.image_size,
                api_key=api_key,
            )
            state.image_bytes = image_bytes
            # Record the model reported in the image metadata; tolerate a
            # missing metadata mapping so a finished image is not reported
            # as a failure.
            models["image_model"] = (metadata or {}).get("model", image_model)

        logger.info("Pipeline finished successfully.")
    except Exception as exc:  # noqa: BLE001 - top-level UI boundary
        logger.exception("Pipeline failed: %s", exc)
        state.error_message = str(exc)


def render_pipeline_steps() -> None:
    """Render one status card per pipeline step, side by side.

    A step counts as complete when its session-state value is truthy.
    """
    steps = (
        ("🎤", "Transcription", st.session_state.transcript),
        ("✏️", "Prompt Building", st.session_state.image_prompt),
        ("🎨", "Image Generation", st.session_state.image_bytes),
    )
    for column, (icon, label, done) in zip(st.columns(len(steps)), steps):
        state_text = "✅ Complete" if done else "⏳ Pending"
        with column:
            st.markdown(
                f"""
{icon}
{label}
{state_text}
""",
                unsafe_allow_html=True,
            )


def _spacer() -> None:
    """Emit the whitespace-only separator used between sections."""
    st.markdown("\n", unsafe_allow_html=True)


def _render_upload_tab() -> None:
    """Render the upload widget, audio preview, run button and error box."""
    st.markdown("### Upload Your Voice Message")
    st.markdown(
        "Upload a short audio file (`.wav`, `.mp3`, `.m4a`, `.ogg`, `.webm`). "
        "The agent will transcribe it and transform it into a beautiful image."
    )
    _spacer()

    # Audio upload section
    uploaded_audio = st.file_uploader(
        "Choose an audio file",
        type=["wav", "mp3", "m4a", "ogg", "webm"],
        accept_multiple_files=False,
        help="Supported formats: WAV, MP3, M4A, OGG, WebM",
    )

    if uploaded_audio is not None:
        st.markdown("**Audio Preview:**")
        # Use the upload's own MIME type: the uploader accepts more than WAV,
        # so a hard-coded "audio/wav" mislabels every other format.
        st.audio(uploaded_audio, format=uploaded_audio.type or "audio/wav")

        # File info
        file_size_mb = uploaded_audio.size / (1024 * 1024)
        name_col, size_col = st.columns(2)
        with name_col:
            st.metric("File Name", uploaded_audio.name)
        with size_col:
            st.metric("File Size", f"{file_size_mb:.2f} MB")

    _spacer()

    # Run button, centred by placing it in the wide middle column
    _, button_col, _ = st.columns([1, 2, 1])
    with button_col:
        if st.button(
            "🚀 Run Voice → Image Pipeline",
            type="primary",
            disabled=uploaded_audio is None,
            use_container_width=True,
        ):
            run_pipeline(uploaded_audio)
            # The step cards and the sidebar status were drawn before the
            # pipeline ran; rerun so they reflect the new session state.
            # Results and any error message live in session state and
            # therefore survive the rerun.
            st.rerun()

    if st.session_state.error_message:
        st.error(f"❌ **Error**: {st.session_state.error_message}")


def _render_results_tab() -> None:
    """Render the transcript, the image prompt, the image and the model list."""
    state = st.session_state
    if not state.transcript and not state.image_bytes:
        st.info("👆 Upload an audio file and run the pipeline to see results here.")
        return

    # Transcript section
    st.markdown("### 📝 Transcribed Text")
    if state.transcript:
        # The transcript is model output derived from user audio; escape it
        # because this markdown call renders raw HTML.
        st.markdown(
            f"""

{html.escape(state.transcript)}

""",
            unsafe_allow_html=True,
        )
    else:
        st.info("⏳ Transcript will appear here after transcription.")
    _spacer()

    # Prompt section
    st.markdown("### ✏️ Enhanced Image Prompt")
    if state.image_prompt:
        # Same reasoning as for the transcript: LLM output is untrusted text.
        st.markdown(
            f"""

"{html.escape(state.image_prompt)}"

""",
            unsafe_allow_html=True,
        )
    else:
        st.info("⏳ The LLM-generated prompt will appear here after transcription.")
    _spacer()

    # Image section
    st.markdown("### 🎨 Generated Image")
    if state.image_bytes:
        _, image_col, _ = st.columns([1, 3, 1])
        with image_col:
            st.image(
                state.image_bytes,
                caption="✨ Generated by the image model",
                use_container_width=True,
            )

        # Image metadata
        _spacer()
        model_col, size_col, weight_col = st.columns(3)
        with model_col:
            st.metric("Model", state.models_used.get("image_model", "N/A"))
        with size_col:
            st.metric("Size", state.image_size)
        with weight_col:
            img_size_kb = len(state.image_bytes) / 1024
            st.metric("File Size", f"{img_size_kb:.1f} KB")
    else:
        st.info("⏳ The generated image will appear here once ready.")

    st.markdown("---")

    # Models used section
    st.markdown("### ⚙️ Models Used")
    st.json(state.models_used)


def main() -> None:
    """Main entry point for the Streamlit app."""
    init_session_state()
    render_sidebar()

    # Header section
    st.markdown(
        """

🎙️ Voice to Image Agent

Transform your voice into stunning AI-generated images

""",
        unsafe_allow_html=True,
    )

    # Pipeline steps visualization
    render_pipeline_steps()
    _spacer()

    # Main content area
    upload_tab, results_tab = st.tabs(
        ["🎤 Upload & Generate", "📊 Results & Details"]
    )
    with upload_tab:
        _render_upload_tab()
    with results_tab:
        _render_results_tab()


if __name__ == "__main__":
    main()