# Gemini Live Audio Chat — Streamlit + WebRTC + Gemini 1.5 Flash demo app.
# (Hugging Face Spaces page chrome — "Spaces / Sleeping / File size" — removed
# during extraction; it was not part of the source file.)
10095b3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | import streamlit as st
import google.generativeai as genai
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
import av
import queue
import time
import os
from utils import AudioProcessor, save_audio_to_bytes
# --- Configuration ---
# Must run before any other Streamlit call in the script.
# NOTE: the emoji literals below were mojibake in the extracted source
# (UTF-8 misread as cp1253); restored to the intended characters.
st.set_page_config(page_title="Gemini Live Audio Chat", page_icon="🎙️", layout="wide")
# --- Custom CSS ---
# Pins the chat input near the bottom and styles the three status banners
# (.recording / .processing / .waiting) rendered in the audio column.
st.markdown("""
<style>
.stChatInput {bottom: 20px;}
.status-box {
padding: 10px;
border-radius: 5px;
margin-bottom: 10px;
text-align: center;
font-weight: bold;
}
.recording { background-color: #ffcccc; color: #cc0000; border: 1px solid #cc0000; }
.processing { background-color: #cce5ff; color: #004085; border: 1px solid #b8daff; }
.waiting { background-color: #d4edda; color: #155724; border: 1px solid #c3e6cb; }
</style>
""", unsafe_allow_html=True)
# --- Header ---
st.markdown("[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)", unsafe_allow_html=True)
st.title("🎙️ Gemini Live Audio Chat")
st.caption("Speak naturally. The model will detect when you finish a sentence and respond.")
# --- Sidebar & Setup ---
# Collect the API key and the two voice-activity-detection knobs.  These
# module-level names (api_key, energy_threshold, silence_duration) are read
# by the Gemini helper and the WebRTC column below.
with st.sidebar:
    st.header("Settings")
    api_key = st.text_input("Gemini API Key", type="password", help="Get one at aistudio.google.com")
    st.markdown("### Audio Settings")
    # Lower threshold => quieter sounds count as speech.
    energy_threshold = st.slider("Voice Sensitivity", 500, 5000, 2000, help="Lower = more sensitive to quiet sounds")
    # Seconds of silence after speech before the clip is sent to Gemini.
    silence_duration = st.slider("Silence to Trigger (sec)", 1.0, 3.0, 1.5, help="How long to wait after speech to send")
    st.info("Note: This app uses WebRTC for real-time audio capture and Gemini 1.5 Flash for multimodal processing.")
# --- Session State ---
# Streamlit reruns the whole script on every interaction; session_state
# persists the chat transcript and the audio frame queue across reruns.
if "messages" not in st.session_state:
    st.session_state["messages"] = []
if "audio_queue" not in st.session_state:
    st.session_state["audio_queue"] = queue.Queue()
# --- Gemini Logic ---
def get_gemini_response(audio_bytes, model_name="gemini-1.5-flash"):
    """Send a WAV clip to Gemini and return its text reply.

    Parameters
    ----------
    audio_bytes : bytes
        WAV-encoded audio captured from the user's microphone.
    model_name : str, optional
        Gemini model to query.  Defaults to "gemini-1.5-flash", which is
        optimized for speed and multimodal input.

    Returns
    -------
    str
        The model's conversational reply, or a human-readable error string —
        this app surfaces failures in the chat instead of crashing the
        Streamlit rerun loop.
    """
    try:
        # `api_key` is the module-level value entered in the sidebar.
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name=model_name)
        # Multimodal prompt: instruction text + inline WAV payload.
        response = model.generate_content([
            "Listen to this audio and respond conversationally to the user.",
            {"mime_type": "audio/wav", "data": audio_bytes}
        ])
        return response.text
    except Exception as e:
        # Broad catch is deliberate: any SDK/network/safety failure becomes
        # a chat message rather than an unhandled exception.
        return f"Error communicating with Gemini: {str(e)}"
# --- Main UI Layout ---
# Two columns: wide chat transcript on the left, audio controls on the right.
col1, col2 = st.columns([2, 1])
with col1:
    st.subheader("Chat History")
    # Fixed-height scrollable transcript.
    with st.container(height=500):
        if not st.session_state.messages:
            st.info("Start the audio stream and say 'Hello'!")
        for msg in st.session_state.messages:
            with st.chat_message(msg["role"]):
                st.markdown(msg["content"])
with col2:
    st.subheader("Audio Interface")
    if not api_key:
        st.warning("Please enter your Gemini API Key in the sidebar.")
    else:
        # WebRTC Configuration: public Google STUN server for NAT traversal.
        rtc_configuration = RTCConfiguration(
            {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
        )
        # Key the widget on the settings so changing them recreates the stream.
        ctx = webrtc_streamer(
            key=f"gemini-voice-{energy_threshold}-{silence_duration}",
            mode=WebRtcMode.SENDONLY,
            audio_receiver_size=1024,
            rtc_configuration=rtc_configuration,
            media_stream_constraints={"video": False, "audio": True},
            # FIX: the streamlit_webrtc kwarg is `audio_processor_factory`,
            # not `processor_factory` (which is not an accepted parameter).
            audio_processor_factory=AudioProcessor,  # defined in utils.py
        )
        # Only poll while the stream is live AND the processor exists —
        # ctx.audio_processor is None until the connection is established,
        # so the original unguarded accesses below could raise AttributeError.
        if ctx.state.playing and ctx.audio_processor:
            # Push the sidebar thresholds into the live processor instance.
            ctx.audio_processor.set_thresholds(energy_threshold, silence_duration)
            # Status Indicators (emoji restored from mojibake in the source).
            status_placeholder = st.empty()
            if ctx.audio_processor.is_speaking:
                status_placeholder.markdown('<div class="status-box recording">🔴 Listening...</div>', unsafe_allow_html=True)
            else:
                status_placeholder.markdown('<div class="status-box waiting">🟢 Ready / Waiting for speech</div>', unsafe_allow_html=True)
            # Check if a complete utterance is ready to be sent.
            if ctx.audio_processor.has_audio_frame():
                status_placeholder.markdown('<div class="status-box processing">⚙️ Processing Audio...</div>', unsafe_allow_html=True)
                audio_frames = ctx.audio_processor.get_audio_frames()
                if audio_frames:
                    # Convert raw frames to WAV bytes for the Gemini API.
                    wav_bytes = save_audio_to_bytes(audio_frames)
                    # Placeholder user message (the raw audio is not re-rendered).
                    st.session_state.messages.append({"role": "user", "content": "🎤 *Sent Audio Clip*"})
                    with st.spinner("Gemini is thinking..."):
                        response_text = get_gemini_response(wav_bytes)
                    st.session_state.messages.append({"role": "assistant", "content": response_text})
                    # Rerun so the new messages render in the chat column.
                    st.rerun()
# --- Footer ---
st.markdown("---")
# Usage tips tied to the two sidebar sliders.
st.caption(
    "Tips: If the model interrupts you, increase the 'Silence to Trigger' "
    "duration in the sidebar. If it doesn't hear you, lower the 'Voice "
    "Sensitivity'."
)