# app.py (renamed from streamlit_app.py) — Hugging Face Space by kamcio1989, commit 8d1dc26
import streamlit as st
import google.generativeai as genai
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
import av
import queue
import time
import os
from utils import AudioProcessor, save_audio_to_bytes
# --- Configuration ---
st.set_page_config(page_title="Gemini Live Audio Chat", page_icon="πŸŽ™οΈ", layout="wide")

# --- Custom CSS ---
# Pins the chat input near the bottom and defines the three status-banner
# looks (recording / processing / waiting) rendered later in the page.
_CUSTOM_CSS = """
<style>
.stChatInput {bottom: 20px;}
.status-box {
padding: 10px;
border-radius: 5px;
margin-bottom: 10px;
text-align: center;
font-weight: bold;
}
.recording { background-color: #ffcccc; color: #cc0000; border: 1px solid #cc0000; }
.processing { background-color: #cce5ff; color: #004085; border: 1px solid #b8daff; }
.waiting { background-color: #d4edda; color: #155724; border: 1px solid #c3e6cb; }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# --- Header ---
st.markdown("[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)", unsafe_allow_html=True)
st.title("πŸŽ™οΈ Gemini Live Audio Chat")
st.caption("Speak naturally. The model will detect when you finish a sentence and respond.")
# --- Sidebar & Setup ---
with st.sidebar:
    st.header("Settings")
    api_key = st.text_input("Gemini API Key", type="password", help="Get one at aistudio.google.com")
    st.markdown("### Audio Settings")
    # Both tunables are forwarded to the live AudioProcessor further down
    # via set_thresholds().
    energy_threshold = st.slider("Voice Sensitivity", 500, 5000, 2000, help="Lower = more sensitive to quiet sounds")
    silence_duration = st.slider("Silence to Trigger (sec)", 1.0, 3.0, 1.5, help="How long to wait after speech to send")
    st.info("Note: This app uses WebRTC for real-time audio capture and Gemini 1.5 Flash for multimodal processing.")

# --- Session State ---
# Chat history and the frame queue must survive Streamlit reruns, so they
# live in st.session_state and are created only on the first run.
if "messages" not in st.session_state:
    st.session_state.messages = []
if "audio_queue" not in st.session_state:
    st.session_state.audio_queue = queue.Queue()
# --- Gemini Logic ---
# --- Gemini Logic ---
def get_gemini_response(audio_bytes):
    """Send a WAV clip to Gemini 1.5 Flash and return its text reply.

    Reads the module-level ``api_key`` entered in the sidebar. Any failure
    (bad key, network, API error) is returned as a human-readable string
    rather than raised, so the chat UI never crashes mid-conversation.
    """
    try:
        genai.configure(api_key=api_key)
        # Gemini 1.5 Flash: fast multimodal model that accepts inline audio.
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")
        payload = [
            "Listen to this audio and respond conversationally to the user.",
            {"mime_type": "audio/wav", "data": audio_bytes},
        ]
        result = model.generate_content(payload)
        return result.text
    except Exception as exc:
        return f"Error communicating with Gemini: {str(exc)}"
# --- Main UI Layout ---
col1, col2 = st.columns([2, 1])

with col1:
    st.subheader("Chat History")
    chat_container = st.container(height=500)
    with chat_container:
        if not st.session_state.messages:
            st.info("Start the audio stream and say 'Hello'!")
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

with col2:
    st.subheader("Audio Interface")
    if not api_key:
        st.warning("Please enter your Gemini API Key in the sidebar.")
    else:
        # WebRTC Configuration: a public STUN server suffices for a
        # send-only audio capture stream.
        rtc_configuration = RTCConfiguration(
            {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
        )
        # Key the widget on the slider values so changing a setting
        # recreates the stream (and its processor) from scratch.
        ctx = webrtc_streamer(
            key=f"gemini-voice-{energy_threshold}-{silence_duration}",
            mode=WebRtcMode.SENDONLY,
            audio_receiver_size=1024,
            rtc_configuration=rtc_configuration,
            media_stream_constraints={"video": False, "audio": True},
            # FIX: streamlit_webrtc's kwarg is `audio_processor_factory`;
            # the original passed `processor_factory`, which raises TypeError.
            audio_processor_factory=AudioProcessor,  # Defined in utils.py
        )

        # FIX: ctx.audio_processor is None while the WebRTC connection is
        # still negotiating. The original guarded set_thresholds() but then
        # dereferenced ctx.audio_processor unconditionally, crashing with
        # AttributeError — keep every access inside this guard.
        if ctx.state.playing and ctx.audio_processor:
            # Inject the sidebar settings into the live processor.
            ctx.audio_processor.set_thresholds(energy_threshold, silence_duration)

            # Status Indicators — poll the processor for its state.
            status_placeholder = st.empty()
            if ctx.audio_processor.is_speaking:
                status_placeholder.markdown('<div class="status-box recording">πŸ”΄ Listening...</div>', unsafe_allow_html=True)
            else:
                status_placeholder.markdown('<div class="status-box waiting">🟒 Ready / Waiting for speech</div>', unsafe_allow_html=True)

            # Check if a completed utterance is ready to be sent.
            if ctx.audio_processor.has_audio_frame():
                status_placeholder.markdown('<div class="status-box processing">βš™οΈ Processing Audio...</div>', unsafe_allow_html=True)
                audio_frames = ctx.audio_processor.get_audio_frames()
                if audio_frames:
                    # Convert the captured frames to WAV bytes for Gemini.
                    wav_bytes = save_audio_to_bytes(audio_frames)
                    # Placeholder user turn — the raw audio has no transcript.
                    st.session_state.messages.append({"role": "user", "content": "🎀 *Sent Audio Clip*"})
                    with st.spinner("Gemini is thinking..."):
                        response_text = get_gemini_response(wav_bytes)
                    st.session_state.messages.append({"role": "assistant", "content": response_text})
                    # Rerun so the chat history column shows the new turns.
                    st.rerun()
# --- Footer ---
# Usage hints for tuning the two sidebar sliders.
st.markdown("---")
st.caption("Tips: If the model interrupts you, increase the 'Silence to Trigger' duration in the sidebar. If it doesn't hear you, lower the 'Voice Sensitivity'.")