File size: 5,900 Bytes
10095b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import streamlit as st
import google.generativeai as genai
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
import av
import queue
import time
import os
from utils import AudioProcessor, save_audio_to_bytes

# --- Configuration ---
# Must run before any other st.* call; wide layout fits the two-column UI.
st.set_page_config(
    layout="wide",
    page_icon="πŸŽ™οΈ",
    page_title="Gemini Live Audio Chat",
)

# --- Custom CSS ---
# Styles the pinned chat input plus the three voice-status banners
# (recording / processing / waiting) rendered in the audio column.
_STATUS_CSS = """
    <style>
    .stChatInput {bottom: 20px;}
    .status-box {
        padding: 10px;
        border-radius: 5px;
        margin-bottom: 10px;
        text-align: center;
        font-weight: bold;
    }
    .recording { background-color: #ffcccc; color: #cc0000; border: 1px solid #cc0000; }
    .processing { background-color: #cce5ff; color: #004085; border: 1px solid #b8daff; }
    .waiting { background-color: #d4edda; color: #155724; border: 1px solid #c3e6cb; }
    </style>
"""
st.markdown(_STATUS_CSS, unsafe_allow_html=True)

# --- Header ---
# Attribution link, page title, and a one-line usage hint shown above both columns.
st.markdown("[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)", unsafe_allow_html=True)
st.title("πŸŽ™οΈ Gemini Live Audio Chat")
st.caption("Speak naturally. The model will detect when you finish a sentence and respond.")

# --- Sidebar & Setup ---
# Collects the API key and the two voice-activity-detection tuning knobs.
# `api_key`, `energy_threshold`, and `silence_duration` are read by the
# audio-interface column further down, so their names must not change.
with st.sidebar:
    st.header("Settings")
    api_key = st.text_input("Gemini API Key", type="password", help="Get one at aistudio.google.com")
    
    st.markdown("### Audio Settings")
    # RMS energy level above which a frame counts as speech (see AudioProcessor).
    energy_threshold = st.slider("Voice Sensitivity", 500, 5000, 2000, help="Lower = more sensitive to quiet sounds")
    # Seconds of sub-threshold audio after speech before the clip is sent to Gemini.
    silence_duration = st.slider("Silence to Trigger (sec)", 1.0, 3.0, 1.5, help="How long to wait after speech to send")
    
    st.info("Note: This app uses WebRTC for real-time audio capture and Gemini 1.5 Flash for multimodal processing.")

# --- Session State ---
if "messages" not in st.session_state:
    st.session_state.messages = []
if "audio_queue" not in st.session_state:
    st.session_state.audio_queue = queue.Queue()

# --- Gemini Logic ---
def get_gemini_response(audio_bytes, model_name="gemini-1.5-flash"):
    """Send a recorded WAV clip to Gemini and return its conversational reply.

    Args:
        audio_bytes: Raw WAV file contents captured from the microphone.
        model_name: Gemini model to query. The default, 1.5 Flash, is
            optimized for speed and multimodal (audio) input.

    Returns:
        The model's text response, or a human-readable error string if the
        API call fails — the chat UI renders either one the same way.
    """
    try:
        # NOTE(review): reads the module-level `api_key` bound by the sidebar
        # widget; configure() is lightweight, so re-running it per call is fine.
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name=model_name)

        # Multimodal payload: a text instruction plus the inline audio blob.
        response = model.generate_content([
            "Listen to this audio and respond conversationally to the user.",
            {"mime_type": "audio/wav", "data": audio_bytes},
        ])
        return response.text
    except Exception as e:
        # Best-effort: surface the failure in the chat transcript rather than
        # crashing the Streamlit rerun loop.
        return f"Error communicating with Gemini: {str(e)}"

# --- Main UI Layout ---
col1, col2 = st.columns([2, 1])

with col1:
    st.subheader("Chat History")

    # Fixed-height scrollable transcript, replayed in full on every rerun.
    with st.container(height=500):
        if not st.session_state.messages:
            st.info("Start the audio stream and say 'Hello'!")

        for msg in st.session_state.messages:
            with st.chat_message(msg["role"]):
                st.markdown(msg["content"])

with col2:
    st.subheader("Audio Interface")
    
    if not api_key:
        st.warning("Please enter your Gemini API Key in the sidebar.")
    else:
        # WebRTC Configuration: Google's public STUN server for NAT traversal.
        rtc_configuration = RTCConfiguration(
            {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
        )

        # We use a key based on settings to force reload if settings change.
        # FIX: webrtc_streamer expects `audio_processor_factory`, not
        # `processor_factory`. With the wrong keyword the processor was never
        # attached, so `ctx.audio_processor` below stayed None and no audio
        # was ever captured or sent.
        ctx = webrtc_streamer(
            key=f"gemini-voice-{energy_threshold}-{silence_duration}",
            mode=WebRtcMode.SENDONLY,
            audio_receiver_size=1024,
            rtc_configuration=rtc_configuration,
            media_stream_constraints={"video": False, "audio": True},
            audio_processor_factory=AudioProcessor,  # Defined in utils.py
        )

        # Inject settings into the processor singleton pattern (Streamlit specific hack for webrtc)
        if ctx.state.playing:
            if ctx.audio_processor:
                # Push the current sidebar thresholds into the running processor.
                ctx.audio_processor.set_thresholds(energy_threshold, silence_duration)
                
                # Status Indicators
                status_placeholder = st.empty()
                
                # Poll the processor for status
                if ctx.audio_processor.is_speaking:
                    status_placeholder.markdown('<div class="status-box recording">πŸ”΄ Listening...</div>', unsafe_allow_html=True)
                else:
                    status_placeholder.markdown('<div class="status-box waiting">🟒 Ready / Waiting for speech</div>', unsafe_allow_html=True)

                # Check if audio is ready to be sent (silence threshold reached)
                if ctx.audio_processor.has_audio_frame():
                    status_placeholder.markdown('<div class="status-box processing">βš™οΈ Processing Audio...</div>', unsafe_allow_html=True)
                    
                    # Drain the buffered frames from the processor.
                    audio_frames = ctx.audio_processor.get_audio_frames()
                    
                    if audio_frames:
                        # Convert raw frames to WAV bytes for the Gemini API.
                        wav_bytes = save_audio_to_bytes(audio_frames)
                        
                        # Add user placeholder (audio icon) — the clip itself isn't stored.
                        st.session_state.messages.append({"role": "user", "content": "🎀 *Sent Audio Clip*"})
                        
                        # Get Gemini Response
                        with st.spinner("Gemini is thinking..."):
                            response_text = get_gemini_response(wav_bytes)
                        
                        st.session_state.messages.append({"role": "assistant", "content": response_text})
                        # Rerun so the chat column re-renders with both new messages.
                        st.rerun()

# --- Footer ---
# Troubleshooting tips that map back to the two sidebar sliders.
st.markdown("---")
st.caption("Tips: If the model interrupts you, increase the 'Silence to Trigger' duration in the sidebar. If it doesn't hear you, lower the 'Voice Sensitivity'.")