"""Gemini Live Audio Chat.

A Streamlit app that captures microphone audio via WebRTC, lets a custom
``AudioProcessor`` (from utils.py) detect end-of-speech, then ships the
recorded WAV clip to Gemini 1.5 Flash and shows the conversational reply
in a chat transcript.
"""

import streamlit as st
import google.generativeai as genai
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
import av
import queue
import time
import os

from utils import AudioProcessor, save_audio_to_bytes

# --- Configuration ---
# set_page_config must be the first Streamlit command executed in the script.
st.set_page_config(page_title="Gemini Live Audio Chat", page_icon="🎙️", layout="wide")

# --- Custom CSS ---
# NOTE(review): the CSS payload was empty in the original source; the hook is
# kept so styles can be dropped in without restructuring the page.
st.markdown("""
""", unsafe_allow_html=True)

# --- Header ---
st.markdown("[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)", unsafe_allow_html=True)
st.title("🎙️ Gemini Live Audio Chat")
st.caption("Speak naturally. The model will detect when you finish a sentence and respond.")

# --- Sidebar & Setup ---
with st.sidebar:
    st.header("Settings")
    api_key = st.text_input("Gemini API Key", type="password", help="Get one at aistudio.google.com")
    st.markdown("### Audio Settings")
    # These two sliders are forwarded to the AudioProcessor each run (below).
    energy_threshold = st.slider("Voice Sensitivity", 500, 5000, 2000, help="Lower = more sensitive to quiet sounds")
    silence_duration = st.slider("Silence to Trigger (sec)", 1.0, 3.0, 1.5, help="How long to wait after speech to send")
    st.info("Note: This app uses WebRTC for real-time audio capture and Gemini 1.5 Flash for multimodal processing.")

# --- Session State ---
# Chat transcript and a spare audio queue survive Streamlit's script reruns.
if "messages" not in st.session_state:
    st.session_state.messages = []
if "audio_queue" not in st.session_state:
    st.session_state.audio_queue = queue.Queue()


# --- Gemini Logic ---
def get_gemini_response(audio_bytes):
    """Send a WAV clip to Gemini 1.5 Flash and return its text reply.

    Parameters
    ----------
    audio_bytes : bytes
        A complete WAV file as produced by ``save_audio_to_bytes``.

    Returns
    -------
    str
        The model's reply, or a human-readable error string — failures are
        returned (not raised) so they render inline in the chat history.
    """
    try:
        # Configure lazily with the sidebar key so the user can change it
        # between runs without restarting the app.
        genai.configure(api_key=api_key)
        # Gemini 1.5 Flash is optimized for speed and multimodal input.
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")
        response = model.generate_content([
            "Listen to this audio and respond conversationally to the user.",
            {"mime_type": "audio/wav", "data": audio_bytes},
        ])
        return response.text
    except Exception as e:
        # Broad catch is deliberate: any SDK/network error becomes a chat
        # message instead of crashing the Streamlit script run.
        return f"Error communicating with Gemini: {str(e)}"


# --- Main UI Layout ---
col1, col2 = st.columns([2, 1])

with col1:
    st.subheader("Chat History")
    chat_container = st.container(height=500)
    with chat_container:
        if not st.session_state.messages:
            st.info("Start the audio stream and say 'Hello'!")
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

with col2:
    st.subheader("Audio Interface")
    if not api_key:
        st.warning("Please enter your Gemini API Key in the sidebar.")
    else:
        # WebRTC configuration: Google's public STUN server for NAT traversal.
        rtc_configuration = RTCConfiguration(
            {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
        )
        # The key embeds the slider values so changing a setting tears down
        # and recreates the streamer with the new thresholds.
        ctx = webrtc_streamer(
            key=f"gemini-voice-{energy_threshold}-{silence_duration}",
            mode=WebRtcMode.SENDONLY,
            audio_receiver_size=1024,
            rtc_configuration=rtc_configuration,
            media_stream_constraints={"video": False, "audio": True},
            # BUG FIX: streamlit_webrtc's keyword is `audio_processor_factory`;
            # the original `processor_factory=` would raise a TypeError.
            audio_processor_factory=AudioProcessor,
        )

        # BUG FIX: the processor is created asynchronously, so every attribute
        # access must sit inside the `ctx.audio_processor` guard — the original
        # read `is_speaking` / `has_audio_frame()` unguarded and could hit
        # AttributeError on None during stream startup.
        if ctx.state.playing and ctx.audio_processor:
            ctx.audio_processor.set_thresholds(energy_threshold, silence_duration)

            # Status indicator, polled once per script run.
            status_placeholder = st.empty()
            if ctx.audio_processor.is_speaking:
                status_placeholder.markdown("🔴 Listening...", unsafe_allow_html=True)
            else:
                status_placeholder.markdown("🟢 Ready / Waiting for speech", unsafe_allow_html=True)

            # When the processor reports a finished utterance, ship it off.
            if ctx.audio_processor.has_audio_frame():
                status_placeholder.markdown("⚙️ Processing Audio...", unsafe_allow_html=True)
                audio_frames = ctx.audio_processor.get_audio_frames()
                if audio_frames:
                    # Convert the raw frames to WAV bytes for the Gemini API.
                    wav_bytes = save_audio_to_bytes(audio_frames)
                    # Placeholder user turn (we don't transcribe locally).
                    st.session_state.messages.append({"role": "user", "content": "🎤 *Sent Audio Clip*"})
                    with st.spinner("Gemini is thinking..."):
                        response_text = get_gemini_response(wav_bytes)
                    st.session_state.messages.append({"role": "assistant", "content": response_text})
                    # Rerun so the new messages render in the chat container.
                    st.rerun()

# --- Footer ---
st.markdown("---")
st.caption("Tips: If the model interrupts you, increase the 'Silence to Trigger' duration in the sidebar. If it doesn't hear you, lower the 'Voice Sensitivity'.")