# app.py (renamed from streamlit_app.py) — Hugging Face Space by kamcio1989, commit 8d1dc26
import streamlit as st
import google.generativeai as genai
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
import av
import queue
import time
import os
from utils import AudioProcessor, save_audio_to_bytes
# --- Configuration ---
st.set_page_config(page_title="Gemini Live Audio Chat", page_icon="πŸŽ™οΈ", layout="wide")

# --- Custom CSS ---
# Pins the chat input near the bottom and defines the three status-banner
# looks (recording / processing / waiting) rendered later in the page.
_CUSTOM_CSS = """
<style>
.stChatInput {bottom: 20px;}
.status-box {
padding: 10px;
border-radius: 5px;
margin-bottom: 10px;
text-align: center;
font-weight: bold;
}
.recording { background-color: #ffcccc; color: #cc0000; border: 1px solid #cc0000; }
.processing { background-color: #cce5ff; color: #004085; border: 1px solid #b8daff; }
.waiting { background-color: #d4edda; color: #155724; border: 1px solid #c3e6cb; }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# --- Header ---
st.markdown("[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)", unsafe_allow_html=True)
st.title("πŸŽ™οΈ Gemini Live Audio Chat")
st.caption("Speak naturally. The model will detect when you finish a sentence and respond.")
# --- Sidebar & Setup ---
with st.sidebar:
    st.header("Settings")
    api_key = st.text_input("Gemini API Key", type="password", help="Get one at aistudio.google.com")
    st.markdown("### Audio Settings")
    # Both tunables are forwarded to the live AudioProcessor further down
    # via set_thresholds().
    energy_threshold = st.slider("Voice Sensitivity", 500, 5000, 2000, help="Lower = more sensitive to quiet sounds")
    silence_duration = st.slider("Silence to Trigger (sec)", 1.0, 3.0, 1.5, help="How long to wait after speech to send")
    st.info("Note: This app uses WebRTC for real-time audio capture and Gemini 1.5 Flash for multimodal processing.")

# --- Session State ---
# Chat history and the frame queue must survive Streamlit reruns, so they
# live in st.session_state and are created only on the first run.
if "messages" not in st.session_state:
    st.session_state.messages = []
if "audio_queue" not in st.session_state:
    st.session_state.audio_queue = queue.Queue()
# --- Gemini Logic ---
# --- Gemini Logic ---
def get_gemini_response(audio_bytes):
    """Send a WAV clip to Gemini 1.5 Flash and return its text reply.

    Reads the module-level ``api_key`` entered in the sidebar. Any failure
    (bad key, network, API error) is returned as a human-readable string
    rather than raised, so the chat UI never crashes mid-conversation.
    """
    try:
        genai.configure(api_key=api_key)
        # Gemini 1.5 Flash: fast multimodal model that accepts inline audio.
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")
        payload = [
            "Listen to this audio and respond conversationally to the user.",
            {"mime_type": "audio/wav", "data": audio_bytes},
        ]
        result = model.generate_content(payload)
        return result.text
    except Exception as exc:
        return f"Error communicating with Gemini: {str(exc)}"
# --- Main UI Layout ---
col1, col2 = st.columns([2, 1])

with col1:
    st.subheader("Chat History")
    chat_container = st.container(height=500)
    with chat_container:
        if not st.session_state.messages:
            st.info("Start the audio stream and say 'Hello'!")
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])

with col2:
    st.subheader("Audio Interface")
    if not api_key:
        st.warning("Please enter your Gemini API Key in the sidebar.")
    else:
        # WebRTC Configuration: a public STUN server suffices for a
        # send-only audio capture stream.
        rtc_configuration = RTCConfiguration(
            {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
        )
        # Key the widget on the slider values so changing a setting
        # recreates the stream (and its processor) from scratch.
        ctx = webrtc_streamer(
            key=f"gemini-voice-{energy_threshold}-{silence_duration}",
            mode=WebRtcMode.SENDONLY,
            audio_receiver_size=1024,
            rtc_configuration=rtc_configuration,
            media_stream_constraints={"video": False, "audio": True},
            # FIX: streamlit_webrtc's kwarg is `audio_processor_factory`;
            # the original passed `processor_factory`, which raises TypeError.
            audio_processor_factory=AudioProcessor,  # Defined in utils.py
        )

        # FIX: ctx.audio_processor is None while the WebRTC connection is
        # still negotiating. The original guarded set_thresholds() but then
        # dereferenced ctx.audio_processor unconditionally, crashing with
        # AttributeError — keep every access inside this guard.
        if ctx.state.playing and ctx.audio_processor:
            # Inject the sidebar settings into the live processor.
            ctx.audio_processor.set_thresholds(energy_threshold, silence_duration)

            # Status Indicators — poll the processor for its state.
            status_placeholder = st.empty()
            if ctx.audio_processor.is_speaking:
                status_placeholder.markdown('<div class="status-box recording">πŸ”΄ Listening...</div>', unsafe_allow_html=True)
            else:
                status_placeholder.markdown('<div class="status-box waiting">🟒 Ready / Waiting for speech</div>', unsafe_allow_html=True)

            # Check if a completed utterance is ready to be sent.
            if ctx.audio_processor.has_audio_frame():
                status_placeholder.markdown('<div class="status-box processing">βš™οΈ Processing Audio...</div>', unsafe_allow_html=True)
                audio_frames = ctx.audio_processor.get_audio_frames()
                if audio_frames:
                    # Convert the captured frames to WAV bytes for Gemini.
                    wav_bytes = save_audio_to_bytes(audio_frames)
                    # Placeholder user turn — the raw audio has no transcript.
                    st.session_state.messages.append({"role": "user", "content": "🎀 *Sent Audio Clip*"})
                    with st.spinner("Gemini is thinking..."):
                        response_text = get_gemini_response(wav_bytes)
                    st.session_state.messages.append({"role": "assistant", "content": response_text})
                    # Rerun so the chat history column shows the new turns.
                    st.rerun()
# --- Footer ---
# Usage hints for tuning the two sidebar sliders.
st.markdown("---")
st.caption("Tips: If the model interrupts you, increase the 'Silence to Trigger' duration in the sidebar. If it doesn't hear you, lower the 'Voice Sensitivity'.")