# Gemini Live Audio Chat — Streamlit application.
# Captures microphone audio via WebRTC and sends finished utterances to Gemini.
| import streamlit as st | |
| import google.generativeai as genai | |
| from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration | |
| import av | |
| import queue | |
| import time | |
| import os | |
| from utils import AudioProcessor, save_audio_to_bytes | |
| # --- Configuration --- | |
| st.set_page_config(page_title="Gemini Live Audio Chat", page_icon="ποΈ", layout="wide") | |
| # --- Custom CSS --- | |
| st.markdown(""" | |
| <style> | |
| .stChatInput {bottom: 20px;} | |
| .status-box { | |
| padding: 10px; | |
| border-radius: 5px; | |
| margin-bottom: 10px; | |
| text-align: center; | |
| font-weight: bold; | |
| } | |
| .recording { background-color: #ffcccc; color: #cc0000; border: 1px solid #cc0000; } | |
| .processing { background-color: #cce5ff; color: #004085; border: 1px solid #b8daff; } | |
| .waiting { background-color: #d4edda; color: #155724; border: 1px solid #c3e6cb; } | |
| </style> | |
| """, unsafe_allow_html=True) | |
| # --- Header --- | |
| st.markdown("[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)", unsafe_allow_html=True) | |
| st.title("ποΈ Gemini Live Audio Chat") | |
| st.caption("Speak naturally. The model will detect when you finish a sentence and respond.") | |
| # --- Sidebar & Setup --- | |
| with st.sidebar: | |
| st.header("Settings") | |
| api_key = st.text_input("Gemini API Key", type="password", help="Get one at aistudio.google.com") | |
| st.markdown("### Audio Settings") | |
| energy_threshold = st.slider("Voice Sensitivity", 500, 5000, 2000, help="Lower = more sensitive to quiet sounds") | |
| silence_duration = st.slider("Silence to Trigger (sec)", 1.0, 3.0, 1.5, help="How long to wait after speech to send") | |
| st.info("Note: This app uses WebRTC for real-time audio capture and Gemini 1.5 Flash for multimodal processing.") | |
| # --- Session State --- | |
| if "messages" not in st.session_state: | |
| st.session_state.messages = [] | |
| if "audio_queue" not in st.session_state: | |
| st.session_state.audio_queue = queue.Queue() | |
| # --- Gemini Logic --- | |
| def get_gemini_response(audio_bytes): | |
| try: | |
| genai.configure(api_key=api_key) | |
| # Gemini 1.5 Flash is optimized for speed and multimodal input | |
| model = genai.GenerativeModel(model_name="gemini-1.5-flash") | |
| # Create the content payload | |
| response = model.generate_content([ | |
| "Listen to this audio and respond conversationally to the user.", | |
| {"mime_type": "audio/wav", "data": audio_bytes} | |
| ]) | |
| return response.text | |
| except Exception as e: | |
| return f"Error communicating with Gemini: {str(e)}" | |
| # --- Main UI Layout --- | |
| col1, col2 = st.columns([2, 1]) | |
| with col1: | |
| st.subheader("Chat History") | |
| chat_container = st.container(height=500) | |
| with chat_container: | |
| if not st.session_state.messages: | |
| st.info("Start the audio stream and say 'Hello'!") | |
| for message in st.session_state.messages: | |
| with st.chat_message(message["role"]): | |
| st.markdown(message["content"]) | |
| with col2: | |
| st.subheader("Audio Interface") | |
| if not api_key: | |
| st.warning("Please enter your Gemini API Key in the sidebar.") | |
| else: | |
| # WebRTC Configuration | |
| rtc_configuration = RTCConfiguration( | |
| {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]} | |
| ) | |
| # We use a key based on settings to force reload if settings change | |
| ctx = webrtc_streamer( | |
| key=f"gemini-voice-{energy_threshold}-{silence_duration}", | |
| mode=WebRtcMode.SENDONLY, | |
| audio_receiver_size=1024, | |
| rtc_configuration=rtc_configuration, | |
| media_stream_constraints={"video": False, "audio": True}, | |
| processor_factory=AudioProcessor, # Defined in utils.py (imported, but we need to pass args) | |
| ) | |
| # Inject settings into the processor singleton pattern (Streamlit specific hack for webrtc) | |
| if ctx.state.playing: | |
| if ctx.audio_processor: | |
| ctx.audio_processor.set_thresholds(energy_threshold, silence_duration) | |
| # Status Indicators | |
| status_placeholder = st.empty() | |
| # Poll the processor for status | |
| if ctx.audio_processor.is_speaking: | |
| status_placeholder.markdown('<div class="status-box recording">π΄ Listening...</div>', unsafe_allow_html=True) | |
| else: | |
| status_placeholder.markdown('<div class="status-box waiting">π’ Ready / Waiting for speech</div>', unsafe_allow_html=True) | |
| # Check if audio is ready to be sent | |
| if ctx.audio_processor.has_audio_frame(): | |
| status_placeholder.markdown('<div class="status-box processing">βοΈ Processing Audio...</div>', unsafe_allow_html=True) | |
| # Get the audio data | |
| audio_frames = ctx.audio_processor.get_audio_frames() | |
| if audio_frames: | |
| # Convert to WAV bytes | |
| wav_bytes = save_audio_to_bytes(audio_frames) | |
| # Add user placeholder (audio icon) | |
| st.session_state.messages.append({"role": "user", "content": "π€ *Sent Audio Clip*"}) | |
| # Get Gemini Response | |
| with st.spinner("Gemini is thinking..."): | |
| response_text = get_gemini_response(wav_bytes) | |
| st.session_state.messages.append({"role": "assistant", "content": response_text}) | |
| st.rerun() | |
| # --- Footer --- | |
| st.markdown("---") | |
| st.caption("Tips: If the model interrupts you, increase the 'Silence to Trigger' duration in the sidebar. If it doesn't hear you, lower the 'Voice Sensitivity'.") |