# Gemini Live Audio Chat — Streamlit + WebRTC + Gemini 1.5 Flash demo app.
# (Hugging Face Spaces page chrome — "Spaces / Sleeping / File size" — removed
# during extraction; it was not part of the source file.)
10095b3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | import streamlit as st
import google.generativeai as genai
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
import av
import queue
import time
import os
from utils import AudioProcessor, save_audio_to_bytes
# --- Configuration ---
# Must run before any other Streamlit call in the script.
# NOTE: the emoji literals below were mojibake in the extracted source
# (UTF-8 misread as cp1253); restored to the intended characters.
st.set_page_config(page_title="Gemini Live Audio Chat", page_icon="🎙️", layout="wide")
# --- Custom CSS ---
# Pins the chat input near the bottom and styles the three status banners
# (.recording / .processing / .waiting) rendered in the audio column.
st.markdown("""
<style>
.stChatInput {bottom: 20px;}
.status-box {
padding: 10px;
border-radius: 5px;
margin-bottom: 10px;
text-align: center;
font-weight: bold;
}
.recording { background-color: #ffcccc; color: #cc0000; border: 1px solid #cc0000; }
.processing { background-color: #cce5ff; color: #004085; border: 1px solid #b8daff; }
.waiting { background-color: #d4edda; color: #155724; border: 1px solid #c3e6cb; }
</style>
""", unsafe_allow_html=True)
# --- Header ---
st.markdown("[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)", unsafe_allow_html=True)
st.title("🎙️ Gemini Live Audio Chat")
st.caption("Speak naturally. The model will detect when you finish a sentence and respond.")
# --- Sidebar & Setup ---
# Collect the API key and the two voice-activity-detection knobs.  These
# module-level names (api_key, energy_threshold, silence_duration) are read
# by the Gemini helper and the WebRTC column below.
with st.sidebar:
    st.header("Settings")
    api_key = st.text_input("Gemini API Key", type="password", help="Get one at aistudio.google.com")
    st.markdown("### Audio Settings")
    # Lower threshold => quieter sounds count as speech.
    energy_threshold = st.slider("Voice Sensitivity", 500, 5000, 2000, help="Lower = more sensitive to quiet sounds")
    # Seconds of silence after speech before the clip is sent to Gemini.
    silence_duration = st.slider("Silence to Trigger (sec)", 1.0, 3.0, 1.5, help="How long to wait after speech to send")
    st.info("Note: This app uses WebRTC for real-time audio capture and Gemini 1.5 Flash for multimodal processing.")
# --- Session State ---
# Streamlit reruns the whole script on every interaction; session_state
# persists the chat transcript and the audio frame queue across reruns.
if "messages" not in st.session_state:
    st.session_state["messages"] = []
if "audio_queue" not in st.session_state:
    st.session_state["audio_queue"] = queue.Queue()
# --- Gemini Logic ---
def get_gemini_response(audio_bytes, model_name="gemini-1.5-flash"):
    """Send a WAV clip to Gemini and return its text reply.

    Parameters
    ----------
    audio_bytes : bytes
        WAV-encoded audio captured from the user's microphone.
    model_name : str, optional
        Gemini model to query.  Defaults to "gemini-1.5-flash", which is
        optimized for speed and multimodal input.

    Returns
    -------
    str
        The model's conversational reply, or a human-readable error string —
        this app surfaces failures in the chat instead of crashing the
        Streamlit rerun loop.
    """
    try:
        # `api_key` is the module-level value entered in the sidebar.
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name=model_name)
        # Multimodal prompt: instruction text + inline WAV payload.
        response = model.generate_content([
            "Listen to this audio and respond conversationally to the user.",
            {"mime_type": "audio/wav", "data": audio_bytes}
        ])
        return response.text
    except Exception as e:
        # Broad catch is deliberate: any SDK/network/safety failure becomes
        # a chat message rather than an unhandled exception.
        return f"Error communicating with Gemini: {str(e)}"
# --- Main UI Layout ---
# Two columns: wide chat transcript on the left, audio controls on the right.
col1, col2 = st.columns([2, 1])
with col1:
    st.subheader("Chat History")
    # Fixed-height scrollable transcript.
    with st.container(height=500):
        if not st.session_state.messages:
            st.info("Start the audio stream and say 'Hello'!")
        for msg in st.session_state.messages:
            with st.chat_message(msg["role"]):
                st.markdown(msg["content"])
with col2:
    st.subheader("Audio Interface")
    if not api_key:
        st.warning("Please enter your Gemini API Key in the sidebar.")
    else:
        # WebRTC Configuration: public Google STUN server for NAT traversal.
        rtc_configuration = RTCConfiguration(
            {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
        )
        # Key the widget on the settings so changing them recreates the stream.
        ctx = webrtc_streamer(
            key=f"gemini-voice-{energy_threshold}-{silence_duration}",
            mode=WebRtcMode.SENDONLY,
            audio_receiver_size=1024,
            rtc_configuration=rtc_configuration,
            media_stream_constraints={"video": False, "audio": True},
            # FIX: the streamlit_webrtc kwarg is `audio_processor_factory`,
            # not `processor_factory` (which is not an accepted parameter).
            audio_processor_factory=AudioProcessor,  # defined in utils.py
        )
        # Only poll while the stream is live AND the processor exists —
        # ctx.audio_processor is None until the connection is established,
        # so the original unguarded accesses below could raise AttributeError.
        if ctx.state.playing and ctx.audio_processor:
            # Push the sidebar thresholds into the live processor instance.
            ctx.audio_processor.set_thresholds(energy_threshold, silence_duration)
            # Status Indicators (emoji restored from mojibake in the source).
            status_placeholder = st.empty()
            if ctx.audio_processor.is_speaking:
                status_placeholder.markdown('<div class="status-box recording">🔴 Listening...</div>', unsafe_allow_html=True)
            else:
                status_placeholder.markdown('<div class="status-box waiting">🟢 Ready / Waiting for speech</div>', unsafe_allow_html=True)
            # Check if a complete utterance is ready to be sent.
            if ctx.audio_processor.has_audio_frame():
                status_placeholder.markdown('<div class="status-box processing">⚙️ Processing Audio...</div>', unsafe_allow_html=True)
                audio_frames = ctx.audio_processor.get_audio_frames()
                if audio_frames:
                    # Convert raw frames to WAV bytes for the Gemini API.
                    wav_bytes = save_audio_to_bytes(audio_frames)
                    # Placeholder user message (the raw audio is not re-rendered).
                    st.session_state.messages.append({"role": "user", "content": "🎤 *Sent Audio Clip*"})
                    with st.spinner("Gemini is thinking..."):
                        response_text = get_gemini_response(wav_bytes)
                    st.session_state.messages.append({"role": "assistant", "content": response_text})
                    # Rerun so the new messages render in the chat column.
                    st.rerun()
# --- Footer ---
st.markdown("---")
# Usage tips tied to the two sidebar sliders.
st.caption(
    "Tips: If the model interrupts you, increase the 'Silence to Trigger' "
    "duration in the sidebar. If it doesn't hear you, lower the 'Voice "
    "Sensitivity'."
)