# generativeai2 / app.py
# Nazim Tairov
# initial commit
# 317adb5
import html
import io
import logging

import streamlit as st
from dotenv import load_dotenv

from llm_pipeline import build_image_prompt, generate_image, transcribe_audio
# Pick up settings from a local .env file when one exists
load_dotenv()

# -----------------------------------------------------------------------------
# Logging setup
# -----------------------------------------------------------------------------
# Root configuration: INFO and above, timestamp / level / logger name / message.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Logger shared by the functions in this module.
logger = logging.getLogger("voice_to_image_app")
# -----------------------------------------------------------------------------
# Streamlit page configuration
# -----------------------------------------------------------------------------
# Browser-tab title and icon, full-width layout, sidebar open on first load.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="expanded",
    page_title="Voice to Image Agent",
    page_icon="🎨",
)
# -----------------------------------------------------------------------------
# Custom CSS for modern styling
# -----------------------------------------------------------------------------
# Injects a single global <style> element through st.markdown; the call passes
# unsafe_allow_html=True so the markup is emitted as HTML. The rules cover page
# padding, the gradient <h1>, the .result-card / .step-indicator / .status-*
# helper classes, buttons, metric values and <hr> dividers. Of the helper
# classes, only .result-card is referenced by markup elsewhere in this file.
# NOTE(review): `.main .block-container`, `.sidebar .sidebar-content`,
# `.stButton > button` and `[data-testid="stMetricValue"]` target Streamlit's
# own DOM, which is not visible from here -- confirm they still match the
# installed Streamlit version.
st.markdown(
"""
<style>
/* Main container styling */
.main .block-container {
padding-top: 2rem;
padding-bottom: 2rem;
}
/* Header styling */
h1 {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
font-size: 3rem;
font-weight: 700;
margin-bottom: 0.5rem;
}
/* Card-like containers */
.result-card {
background-color: #f8f9fa;
border-radius: 10px;
padding: 1.5rem;
margin: 1rem 0;
border-left: 4px solid #667eea;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
/* Step indicators */
.step-indicator {
display: flex;
align-items: center;
margin: 1.5rem 0;
padding: 1rem;
background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
border-radius: 10px;
}
/* Status badges */
.status-badge {
display: inline-block;
padding: 0.25rem 0.75rem;
border-radius: 20px;
font-size: 0.85rem;
font-weight: 600;
margin-left: 0.5rem;
}
.status-success {
background-color: #d4edda;
color: #155724;
}
.status-pending {
background-color: #fff3cd;
color: #856404;
}
/* Sidebar styling */
.sidebar .sidebar-content {
background: linear-gradient(180deg, #f8f9fa 0%, #ffffff 100%);
}
/* Button styling */
.stButton > button {
width: 100%;
border-radius: 8px;
font-weight: 600;
padding: 0.75rem;
transition: all 0.3s ease;
}
.stButton > button:hover {
transform: translateY(-2px);
box-shadow: 0 4px 8px rgba(0,0,0,0.2);
}
/* Metric cards */
[data-testid="stMetricValue"] {
font-size: 1.5rem;
}
/* Divider styling */
hr {
margin: 2rem 0;
border: none;
border-top: 2px solid #e0e0e0;
}
</style>
""",
unsafe_allow_html=True,
)
def init_session_state() -> None:
    """Seed ``st.session_state`` with a default for every key the app uses.

    Keys that are already present keep their current value, so calling this at
    the start of every script run does not reset user input or results.
    """
    model_defaults = {
        "transcription_model": "whisper-1",
        "llm_model": "gpt-5-nano",
        "image_model": "gpt-image-1",
    }
    initial_state = {
        "transcript": "",
        "image_prompt": "",
        "image_bytes": None,
        "error_message": "",
        "openai_api_key": "",
        "models_used": model_defaults,
        "image_size": "1024x1024",
    }

    state = st.session_state
    for name in initial_state:
        # setdefault only writes when the key is missing.
        state.setdefault(name, initial_state[name])
def render_sidebar() -> None:
    """Render the settings sidebar and copy its widget values into session state.

    Writes ``openai_api_key``, the three entries of ``models_used`` and
    ``image_size`` in ``st.session_state`` from the corresponding widgets. It
    then shows a static tip and, once a transcript or an image exists, a
    three-line pipeline status summary.
    """
    sidebar = st.sidebar
    state = st.session_state

    # Built as a single line so rendering does not depend on how the Markdown
    # layer treats indentation inside a multi-line string.
    sidebar.markdown(
        "<div style='text-align: center; padding: 1rem 0;'>"
        "<h2 style='margin: 0; color: #667eea;'>⚙️ Settings</h2>"
        "</div>",
        unsafe_allow_html=True,
    )
    sidebar.markdown(
        "Configure the models and parameters for the **Voice → Image** pipeline."
    )
    sidebar.markdown("---")

    # API Key section
    sidebar.markdown("### 🔑 API Configuration")
    # Strip surrounding whitespace: a key pasted with a trailing space or
    # newline would otherwise be stored and used verbatim.
    state.openai_api_key = sidebar.text_input(
        "OpenAI API Key",
        value=state.openai_api_key,
        type="password",
        help="Enter your OpenAI API key. If left empty, will use OPENAI_API_KEY from environment variables.",
        key="api_key_input",
    ).strip()
    if not state.openai_api_key:
        sidebar.warning("⚠️ API key not set. Using environment variable if available.")
    sidebar.markdown("---")

    # Models section
    sidebar.markdown("### 🎯 Model Configuration")
    models = state.models_used
    # Widgets created inside this ``with`` block are placed in the sidebar
    # container even though they are called on ``st`` directly.
    with sidebar.container():
        models["transcription_model"] = st.text_input(
            "🎤 Transcription Model",
            value=models["transcription_model"],
            help="OpenAI audio transcription model (e.g., `whisper-1`).",
            key="transcription_input",
        )
        models["llm_model"] = st.text_input(
            "🤖 LLM Model",
            value=models["llm_model"],
            help="OpenAI chat model for building the image description.",
            key="llm_input",
        )
        models["image_model"] = st.text_input(
            "🎨 Image Model",
            value=models["image_model"],
            help="OpenAI image generation model (e.g., `gpt-image-1`).",
            key="image_input",
        )
    sidebar.markdown("---")

    # Image options
    sidebar.markdown("### 🖼️ Image Options")
    # NOTE(review): confirm every size listed here is accepted by the
    # configured image model.
    size_options = ("512x512", "768x768", "1024x1024")
    try:
        size_index = size_options.index(state.image_size)
    except ValueError:
        # Unknown stored size: fall back to the last (largest) option.
        size_index = len(size_options) - 1
    state.image_size = sidebar.selectbox(
        "Image Resolution",
        options=size_options,
        index=size_index,
        help="Higher resolution = better quality but slower generation.",
    )
    sidebar.markdown("---")

    # Info section
    sidebar.markdown("### ℹ️ Information")
    sidebar.info(
        "💡 **Tip**: Logs are printed to the terminal where you run `streamlit run app.py`."
    )

    # Pipeline status: only shown once at least one stage has produced output.
    if state.transcript or state.image_bytes:
        sidebar.markdown("---")
        sidebar.markdown("### 📊 Pipeline Status")
        stages = (
            ("Transcription", state.transcript),
            ("Prompt Generation", state.image_prompt),
            ("Image Generation", state.image_bytes),
        )
        for label, result in stages:
            marker = "✅" if result else "⏳"
            sidebar.markdown(f"{marker} {label}")
def run_pipeline(uploaded_audio) -> None:
    """Run the full voice → transcript → image prompt → image pipeline.

    Clears the previous results in ``st.session_state`` and then calls
    ``transcribe_audio``, ``build_image_prompt`` and ``generate_image`` in that
    order. Each result (``transcript``, ``image_prompt``, ``image_bytes``) is
    stored as soon as it is available, so earlier stages stay visible if a
    later one fails.

    Args:
        uploaded_audio: The uploaded audio object, or ``None``. It must provide
            ``read()``; ``seek()`` and ``name`` are used when present.

    Returns:
        None. Failures are logged and reported through
        ``st.session_state.error_message`` instead of being raised.
    """
    state = st.session_state
    state.error_message = ""
    state.transcript = ""
    state.image_prompt = ""
    state.image_bytes = None

    if uploaded_audio is None:
        state.error_message = "Please upload an audio file first."
        return

    # The same upload object is handed to other consumers before this function
    # runs; rewind so a moved read position cannot truncate the audio.
    if hasattr(uploaded_audio, "seek"):
        uploaded_audio.seek(0)
    audio_bytes = uploaded_audio.read()
    if not audio_bytes:
        state.error_message = "The uploaded audio file is empty."
        return

    # Convert the uploaded file into a file-like object compatible with OpenAI
    audio_buffer = io.BytesIO(audio_bytes)
    # OpenAI expects a name with an extension
    audio_buffer.name = getattr(uploaded_audio, "name", None) or "voice_message.wav"

    models = state.models_used
    transcription_model = models["transcription_model"]
    llm_model = models["llm_model"]
    image_model = models["image_model"]

    # An empty key becomes None so the pipeline helpers can fall back to the
    # environment variable.
    api_key = state.openai_api_key or None

    try:
        with st.spinner("Transcribing audio with Whisper..."):
            logger.info("Step 1/3: Transcribing audio.")
            transcript = transcribe_audio(
                audio_buffer,
                model=transcription_model,
                api_key=api_key,
            )
        state.transcript = transcript

        with st.spinner("Building image prompt with LLM..."):
            logger.info("Step 2/3: Building image prompt from transcript.")
            prompt = build_image_prompt(
                transcript,
                model=llm_model,
                api_key=api_key,
            )
        state.image_prompt = prompt

        with st.spinner("Generating image from prompt..."):
            logger.info("Step 3/3: Generating image from prompt.")
            image_bytes, metadata = generate_image(
                prompt,
                model=image_model,
                size=state.image_size,
                api_key=api_key,
            )
        state.image_bytes = image_bytes

        # Record the model reported in the image metadata, if any; a missing
        # metadata object must not turn a successful run into an error.
        models["image_model"] = (metadata or {}).get("model", image_model)
        logger.info("Pipeline finished successfully.")
    except Exception as exc:  # noqa: BLE001
        # Top-level boundary for the UI: log the traceback and surface a message.
        logger.exception("Pipeline failed: %s", exc)
        # Some exceptions stringify to "", which would hide the failure from
        # the user; fall back to the exception class name.
        state.error_message = str(exc) or type(exc).__name__
def render_pipeline_steps() -> None:
    """Render one status card per pipeline stage in a row of equal columns.

    A stage counts as complete when its value in ``st.session_state``
    (``transcript``, ``image_prompt``, ``image_bytes``) is truthy. Complete
    stages get the gradient card, the others a greyed-out "Pending" card.
    """
    state = st.session_state
    stages = [
        ("🎤", "Transcription", state.transcript),
        ("✍️", "Prompt Building", state.image_prompt),
        ("🎨", "Image Generation", state.image_bytes),
    ]

    for column, (icon, label, result) in zip(st.columns(len(stages)), stages):
        if result:
            box_style = (
                "background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); "
                "border-radius: 10px; color: white;"
            )
            icon_fade = label_fade = note_fade = ""
            note = "✅ Complete"
        else:
            box_style = "background: #f0f0f0; border-radius: 10px;"
            icon_fade = " opacity: 0.5;"
            label_fade = " opacity: 0.7;"
            note_fade = " opacity: 0.5;"
            note = "⏳ Pending"

        # One shared template for both states; only the colours, opacity and
        # status text differ. icon and label are literals defined above.
        card_html = (
            f"<div style='text-align: center; padding: 1rem; {box_style}'>"
            f"<div style='font-size: 2rem; margin-bottom: 0.5rem;{icon_fade}'>{icon}</div>"
            f"<div style='font-weight: 600;{label_fade}'>{label}</div>"
            f"<div style='font-size: 0.8rem; margin-top: 0.5rem;{note_fade}'>{note}</div>"
            "</div>"
        )
        with column:
            st.markdown(card_html, unsafe_allow_html=True)
def _result_card_html(text, paragraph_style: str, quoted: bool = False) -> str:
    """Build the ``result-card`` HTML snippet for one piece of pipeline output.

    ``text`` is HTML-escaped before interpolation because it originates from
    the transcription / LLM steps and is rendered with
    ``unsafe_allow_html=True``. Newlines become ``<br>`` so the snippet stays
    on a single line. With ``quoted=True`` the text is wrapped in double quotes.
    """
    safe_text = html.escape(str(text)).replace("\n", "<br>")
    if quoted:
        safe_text = f'"{safe_text}"'
    return (
        "<div class='result-card'>"
        f"<p style='{paragraph_style}'>{safe_text}</p>"
        "</div>"
    )


def main() -> None:
    """Main entry point for the Streamlit app.

    Initialises session state, draws the sidebar, the page header and the step
    indicators, then renders two tabs: one for uploading audio and starting the
    pipeline, one for showing the transcript, prompt, image and models used.
    """
    init_session_state()
    render_sidebar()
    state = st.session_state

    # Header section
    st.markdown(
        "<div style='text-align: center; margin-bottom: 2rem;'>"
        "<h1>🎙️ Voice to Image Agent</h1>"
        "<p style='font-size: 1.1rem; color: #666; margin-top: -1rem;'>"
        "Transform your voice into stunning AI-generated images"
        "</p>"
        "</div>",
        unsafe_allow_html=True,
    )

    # Pipeline steps visualization
    render_pipeline_steps()
    st.markdown("<br>", unsafe_allow_html=True)

    # Main content area
    upload_tab, results_tab = st.tabs(["🎤 Upload & Generate", "📊 Results & Details"])

    with upload_tab:
        st.markdown("### Upload Your Voice Message")
        st.markdown(
            "Upload a short audio file (`.wav`, `.mp3`, `.m4a`, `.ogg`, `.webm`). "
            "The agent will transcribe it and transform it into a beautiful image."
        )
        st.markdown("<br>", unsafe_allow_html=True)

        # Audio upload section
        uploaded_audio = st.file_uploader(
            "Choose an audio file",
            type=["wav", "mp3", "m4a", "ogg", "webm"],
            accept_multiple_files=False,
            help="Supported formats: WAV, MP3, M4A, OGG, WebM",
        )

        if uploaded_audio is not None:
            st.markdown("**Audio Preview:**")
            # Use the upload's own MIME type; the uploader accepts more than
            # WAV, so a fixed "audio/wav" mislabels the other formats.
            st.audio(uploaded_audio, format=uploaded_audio.type or "audio/wav")

            # File info
            file_size_mb = len(uploaded_audio.getvalue()) / (1024 * 1024)
            name_col, size_col = st.columns(2)
            with name_col:
                st.metric("File Name", uploaded_audio.name)
            with size_col:
                st.metric("File Size", f"{file_size_mb:.2f} MB")

        st.markdown("<br>", unsafe_allow_html=True)

        # Run button, centred in the middle column
        _, button_col, _ = st.columns([1, 2, 1])
        with button_col:
            if st.button(
                "🚀 Run Voice → Image Pipeline",
                type="primary",
                disabled=uploaded_audio is None,
                use_container_width=True,
            ):
                run_pipeline(uploaded_audio)
                # The sidebar status and the step indicators were drawn before
                # the pipeline ran; rerun so they reflect the new state.
                # Results and any error message live in session state and
                # survive the rerun.
                st.rerun()

        if state.error_message:
            st.error(f"❌ **Error**: {state.error_message}")

    with results_tab:
        if not state.transcript and not state.image_bytes:
            st.info("👆 Upload an audio file and run the pipeline to see results here.")
        else:
            # Transcript section
            st.markdown("### 📝 Transcribed Text")
            if state.transcript:
                st.markdown(
                    _result_card_html(
                        state.transcript,
                        "font-size: 1.1rem; line-height: 1.6; color: #333;",
                    ),
                    unsafe_allow_html=True,
                )
            else:
                st.info("⏳ Transcript will appear here after transcription.")
            st.markdown("<br>", unsafe_allow_html=True)

            # Prompt section
            st.markdown("### ✍️ Enhanced Image Prompt")
            if state.image_prompt:
                st.markdown(
                    _result_card_html(
                        state.image_prompt,
                        "font-size: 1rem; line-height: 1.6; color: #555; font-style: italic;",
                        quoted=True,
                    ),
                    unsafe_allow_html=True,
                )
            else:
                st.info("⏳ The LLM-generated prompt will appear here after transcription.")
            st.markdown("<br>", unsafe_allow_html=True)

            # Image section
            st.markdown("### 🎨 Generated Image")
            if state.image_bytes:
                _, image_col, _ = st.columns([1, 3, 1])
                with image_col:
                    st.image(
                        state.image_bytes,
                        caption="✨ Generated by the image model",
                        use_container_width=True,
                    )

                # Image metadata
                st.markdown("<br>", unsafe_allow_html=True)
                model_col, size_col, weight_col = st.columns(3)
                with model_col:
                    st.metric("Model", state.models_used.get("image_model", "N/A"))
                with size_col:
                    st.metric("Size", state.image_size)
                with weight_col:
                    img_size_kb = len(state.image_bytes) / 1024
                    st.metric("File Size", f"{img_size_kb:.1f} KB")
            else:
                st.info("⏳ The generated image will appear here once ready.")

            st.markdown("---")

            # Models used section
            st.markdown("### ⚙️ Models Used")
            st.json(state.models_used)
# Script entry guard: build the page only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()