File size: 4,565 Bytes
b713a83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import streamlit as st
import tempfile
import os
import time
from agent import VoiceToImageAgent

# Page setup: browser-tab title, icon, and a wide layout. This is the first
# Streamlit call the script makes.
st.set_page_config(page_title="Voice into Imagination", page_icon="🎙️", layout="wide")

# Stylesheet injected into the page. It pins `.stChatInput` 3rem above the
# bottom edge, hides any element container holding `#button-after`, and makes
# the `stStatusWidget` element invisible.
_CUSTOM_CSS = """
<style>
    /* Fix input at bottom */
    .stChatInput {
        position: fixed;
        bottom: 3rem;
        z-index: 1000;
    }
    
    /* Hide some Streamlit elements for cleaner look */
    .element-container:has(#button-after) {
        display: none;
    }

    /* Status Container Styling */
    div[data-testid="stStatusWidget"] {
        visibility: hidden;
    }
</style>
"""
# unsafe_allow_html is required so the <style> tag is emitted as raw HTML.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

st.title("🎙️ Voice into Imagination")

# Session-state bootstrap. Each entry is created only when its key is missing,
# so values already stored in st.session_state are left alone. Factories are
# called lazily: the agent is constructed only when no agent is stored yet.
_SESSION_DEFAULTS = (
    ("agent", VoiceToImageAgent),  # voice -> prompt -> image pipeline object
    ("messages", list),            # chat history (list of message dicts)
    ("logs", list),                # log lines shown in the sidebar
    ("audio_key_count", int),      # counter used to reset the audio widget (starts at 0)
)

for _state_key, _make_default in _SESSION_DEFAULTS:
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()

# Short module-level handle used by the processing step below.
agent = st.session_state.agent

# Sidebar: system log panel
with st.sidebar:
    st.title("🛠️ System Logs")

    # Single-slot placeholder; log_message() later overwrites its content to
    # refresh the list while the script is still running.
    log_placeholder = st.empty()

    # Initial render of everything logged so far in this session.
    log_panel = log_placeholder.container()
    with log_panel:
        for entry in st.session_state.logs:
            st.caption(f"INFO: {entry}")

def log_message(message) -> None:
    """Append *message* to the session log and redraw the sidebar log panel.

    The message is stored in ``st.session_state.logs``; the sidebar
    placeholder is then refilled with one caption per stored entry, each
    prefixed with ``INFO:``.
    """
    history = st.session_state.logs
    history.append(message)

    # Writing a new container into the placeholder replaces what it showed
    # before, so the full list is re-emitted rather than just the new line.
    panel = log_placeholder.container()
    with panel:
        for line in history:
            st.caption(f"INFO: {line}")

# Replay the stored conversation. Non-user messages that carry an "image_url"
# are shown as an image only (no prompt caption, to keep the UI clean); every
# other message is rendered as markdown text.
for entry in st.session_state.messages:
    role = entry["role"]
    with st.chat_message(role):
        shows_image = role != "user" and "image_url" in entry
        if shows_image:
            st.image(entry["image_url"], width="stretch")
        else:
            st.markdown(entry["content"])

# Bottom input area: one container holding a status slot above the recorder.
bottom_container = st.container()

with bottom_container:
    # Dynamic status slot; the processing step writes progress messages here
    # and clears it when finished.
    status_placeholder = st.empty()

    # The widget key embeds a counter. Bumping the counter changes the key,
    # which resets/clears the recorder on the next run.
    audio_key = "audio_" + str(st.session_state.audio_key_count)
    audio_value = st.audio_input("Recorder", key=audio_key)

if audio_value:
    # Pipeline for a fresh recording: audio -> transcript -> prompt -> image.
    # On success the exchange is appended to the chat history, the recorder is
    # reset, and the script is rerun so the new messages are drawn.

    # Stays None until the temp file actually exists, so the cleanup step
    # knows whether there is anything to delete.
    audio_path = None
    # Set only after the whole pipeline has completed without an exception.
    processed = False

    with st.spinner("Processing..."):
        try:
            # The recording is written to disk because agent.transcribe is
            # given a file path. Creating the file inside the try block means
            # a failed write is reported like any other pipeline error, and
            # the path is captured before writing so the file is still removed.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
                audio_path = f.name
                f.write(audio_value.getvalue())

            # STATUS: Transcribing
            status_placeholder.info("🎙️ Transcribing voice...")
            log_message("Audio received. Transcribing...")
            transcript = agent.transcribe(audio_path)

            # STATUS: Show transcript next to the input
            status_placeholder.success(f"🗣️ You said: \"{transcript}\"")
            log_message(f"Transcript: {transcript}")

            # Deliberate pause so the user can read the transcript before it
            # is "sent" automatically.
            time.sleep(2)

            # STATUS: Generating
            status_placeholder.info("🎨 Generating image...")
            log_message("Generating image prompt...")
            prompt = agent.text_to_prompt(transcript)

            log_message(f"Prompt: {prompt}")
            log_message("Generating image...")
            image_url = agent.generate_image(prompt)
            log_message("Image generated successfully.")

            # Clear status
            status_placeholder.empty()

            # Update chat history
            st.session_state.messages.append({"role": "user", "content": transcript})
            st.session_state.messages.append(
                {"role": "assistant", "content": prompt, "image_url": image_url}
            )

            # Increment key to reset audio input
            st.session_state.audio_key_count += 1
            processed = True

        except Exception as e:
            # Remove the in-progress banner so a stale "Transcribing..." or
            # "Generating..." message is not left beside the error.
            status_placeholder.empty()
            st.error(f"An error occurred: {e}")
            log_message(f"ERROR: {e}")
        finally:
            # The temp file was created with delete=False, so it must be
            # removed explicitly whether or not the pipeline succeeded.
            if audio_path is not None and os.path.exists(audio_path):
                os.remove(audio_path)

    # The rerun is requested after cleanup and outside the try block so it
    # cannot interact with the error handler above.
    if processed:
        st.rerun()