Spaces:
Build error
Build error
| import gradio as gr | |
| import asyncio | |
| import json | |
| import base64 | |
| import numpy as np | |
| from dataclasses import dataclass, field | |
| from typing import AsyncIterator, Callable | |
| import threading | |
| import queue | |
| # Mock WebRTC and Gemini integration - in production, use actual WebRTC libraries | |
| # and Google's Gemini Live API with proper authentication | |
@dataclass
class GeminiConfig:
    """Configuration for a Gemini Live API connection.

    Fix: the ``@dataclass`` decorator was missing. Without it, ``field(...)``
    is left as a bare ``Field`` object and ``GeminiConfig(api_key=..., ...)``
    (as called in ``GeminiLiveChat.start_session``) raises ``TypeError``
    because no ``__init__`` is generated.
    """

    model: str = "gemini-2.0-flash-exp"
    api_key: str = ""
    # Voice persona for audio responses: Puck, Charon, Kore, Fenrir, Aoede.
    voice: str = "Puck"
    # Modalities Gemini may respond with; default requests both audio and
    # text. default_factory gives each instance its own list.
    response_modalities: list = field(default_factory=lambda: ["AUDIO", "TEXT"])
| class WebRTCGeminiClient: | |
| """ | |
| WebRTC client for Gemini Live API. | |
| Handles real-time bidirectional streaming of audio and text. | |
| """ | |
| def __init__(self, config: GeminiConfig = None): | |
| self.config = config or GeminiConfig() | |
| self.is_connected = False | |
| self.audio_input_queue = queue.Queue() | |
| self.audio_output_queue = queue.Queue() | |
| self.text_output_queue = queue.Queue() | |
| self._running = False | |
| self._thread = None | |
| def connect(self) -> bool: | |
| """Establish WebRTC connection to Gemini Live API.""" | |
| # In production: Initialize WebRTC peer connection | |
| # Connect to Gemini Live API endpoint | |
| # Set up ICE servers, SDP exchange, etc. | |
| self.is_connected = True | |
| self._running = True | |
| self._thread = threading.Thread(target=self._simulate_gemini_loop) | |
| self._thread.start() | |
| return True | |
| def disconnect(self): | |
| """Close WebRTC connection.""" | |
| self._running = False | |
| self.is_connected = False | |
| if self._thread: | |
| self._thread.join(timeout=2) | |
| def _simulate_gemini_loop(self): | |
| """Simulate Gemini responses for demo purposes.""" | |
| # In production: This would handle actual WebRTC data channels | |
| # Receive audio/text from Gemini, send user audio/text to Gemini | |
| responses = [ | |
| "Hello! I'm your Gemini Live assistant. How can I help you today?", | |
| "I can see and hear you in real-time. Feel free to ask me anything!", | |
| "That's interesting! Tell me more about what you're working on.", | |
| "I understand. Let me think about that for a moment...", | |
| "Great question! Here's what I know about that topic...", | |
| "I'm processing your request. One moment please.", | |
| "I can help with that! Let me provide some guidance.", | |
| "Thanks for sharing that with me. Is there anything else you'd like to discuss?", | |
| ] | |
| import random | |
| import time | |
| idx = 0 | |
| while self._running: | |
| time.sleep(0.1) | |
| # Simulate receiving audio/text from Gemini | |
| if random.random() < 0.02 and self.audio_output_queue.qsize() < 5: | |
| # Simulate text response | |
| if idx < len(responses): | |
| self.text_output_queue.put({ | |
| "type": "text", | |
| "content": responses[idx] | |
| }) | |
| idx = (idx + 1) % len(responses) | |
| # Simulate audio response (would be actual PCM audio in production) | |
| sample_rate = 24000 | |
| duration = 2.0 # seconds | |
| t = np.linspace(0, duration, int(sample_rate * duration)) | |
| # Generate synthetic speech-like audio | |
| audio = np.sin(2 * np.pi * 200 * t) * np.exp(-t * 2) | |
| audio = (audio * 32767).astype(np.int16) | |
| self.audio_output_queue.put({ | |
| "type": "audio", | |
| "data": (sample_rate, audio) | |
| }) | |
| def send_audio(self, audio_data: tuple) -> bool: | |
| """Send audio chunk to Gemini.""" | |
| # audio_data: (sample_rate, numpy_array) | |
| if not self.is_connected: | |
| return False | |
| self.audio_input_queue.put(audio_data) | |
| return True | |
| def send_text(self, text: str) -> bool: | |
| """Send text message to Gemini.""" | |
| if not self.is_connected: | |
| return False | |
| # In production: Send via WebRTC data channel | |
| return True | |
| def get_audio_response(self) -> dict | None: | |
| """Get audio response from Gemini if available.""" | |
| try: | |
| return self.audio_output_queue.get_nowait() | |
| except queue.Empty: | |
| return None | |
| def get_text_response(self) -> dict | None: | |
| """Get text response from Gemini if available.""" | |
| try: | |
| return self.text_output_queue.get_nowait() | |
| except queue.Empty: | |
| return None | |
| class GeminiLiveChat: | |
| """Main application class for Gemini Live Chat.""" | |
| def __init__(self): | |
| self.client = WebRTCGeminiClient() | |
| self.chat_history = [] | |
| self.is_streaming = False | |
| self.audio_buffer = [] | |
| def start_session(self, api_key: str, voice: str, enable_audio: bool, enable_text: bool) -> str: | |
| """Initialize Gemini Live session.""" | |
| if not api_key or len(api_key) < 10: | |
| return "β Error: Please provide a valid Gemini API key" | |
| config = GeminiConfig( | |
| api_key=api_key, | |
| voice=voice, | |
| response_modalities=["AUDIO" if enable_audio else "", "TEXT" if enable_text else ""] | |
| ) | |
| config.response_modalities = [m for m in config.response_modalities if m] | |
| self.client = WebRTCGeminiClient(config) | |
| try: | |
| success = self.client.connect() | |
| if success: | |
| self.is_streaming = True | |
| return f"β Connected to Gemini Live!\nπ€ Voice: {voice}\nπ‘ Modalities: {', '.join(config.response_modalities)}" | |
| else: | |
| return "β Failed to connect. Please check your API key and try again." | |
| except Exception as e: | |
| return f"β Connection error: {str(e)}" | |
| def stop_session(self) -> str: | |
| """End Gemini Live session.""" | |
| self.is_streaming = False | |
| self.client.disconnect() | |
| return "βΉοΈ Session ended. Thanks for chatting!" | |
| def process_audio_stream(self, audio_data: tuple | None) -> tuple: | |
| """ | |
| Process incoming audio from microphone and return audio response. | |
| This is called continuously during streaming. | |
| """ | |
| if not self.is_streaming or audio_data is None: | |
| # Return silence when not streaming | |
| sample_rate = 24000 | |
| silence = np.zeros(sample_rate // 10, dtype=np.int16) # 100ms silence | |
| return (sample_rate, silence) | |
| # Send user audio to Gemini | |
| self.client.send_audio(audio_data) | |
| # Check for Gemini audio response | |
| response = self.client.get_audio_response() | |
| if response and response.get("type") == "audio": | |
| return response["data"] | |
| # Return silence if no response yet | |
| sample_rate = 24000 | |
| silence = np.zeros(sample_rate // 10, dtype=np.int16) | |
| return (sample_rate, silence) | |
| def get_text_updates(self) -> str: | |
| """Get text responses from Gemini.""" | |
| if not self.is_streaming: | |
| return self._format_chat_history() | |
| # Check for new text responses | |
| while True: | |
| response = self.client.get_text_response() | |
| if response is None: | |
| break | |
| if response.get("type") == "text": | |
| self.chat_history.append({ | |
| "role": "assistant", | |
| "content": response["content"] | |
| }) | |
| return self._format_chat_history() | |
| def _format_chat_history(self) -> str: | |
| """Format chat history for display.""" | |
| if not self.chat_history: | |
| return "No messages yet. Start speaking or type a message!" | |
| formatted = [] | |
| for msg in self.chat_history: | |
| role_emoji = "π€" if msg["role"] == "user" else "π€" | |
| formatted.append(f"{role_emoji} **{msg['role'].title()}**: {msg['content']}") | |
| return "\n\n".join(formatted) | |
| def send_text_message(self, message: str) -> str: | |
| """Send a text message to Gemini.""" | |
| if not self.is_streaming: | |
| return "β οΈ Please start a session first!" | |
| if not message.strip(): | |
| return self._format_chat_history() | |
| self.chat_history.append({ | |
| "role": "user", | |
| "content": message.strip() | |
| }) | |
| self.client.send_text(message.strip()) | |
| return self.get_text_updates() | |
| def clear_history(self) -> str: | |
| """Clear chat history.""" | |
| self.chat_history = [] | |
| return "History cleared." | |
# Global app instance: single shared session state, reachable by the event
# handlers defined inside create_gemini_live_chat() below.
app = GeminiLiveChat()
def create_gemini_live_chat():
    """Create the Gradio 6 WebRTC Gemini Live Chat interface.

    Builds the full Blocks layout (settings/controls column on the left,
    live audio plus transcript on the right), wires every event handler to
    the module-level ``app`` singleton, and returns the Blocks instance
    (not yet launched).
    """
    with gr.Blocks() as demo:
        # Header with anycoder link
        gr.Markdown("""
# ποΈ WebRTC Gemini Live Chat Agent
Real-time voice and text conversation with Google's Gemini AI using WebRTC for ultra-low latency.
<a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">Built with anycoder</a>
""")
        with gr.Row():
            # Left panel - Controls
            with gr.Column(scale=1):
                gr.Markdown("### βοΈ Session Settings")
                api_key_input = gr.Textbox(
                    label="Gemini API Key",
                    placeholder="Enter your Gemini API key...",
                    type="password",
                    info="Get your key at makersuite.google.com"
                )
                voice_select = gr.Dropdown(
                    choices=["Puck", "Charon", "Kore", "Fenrir", "Aoede"],
                    value="Puck",
                    label="Voice",
                    info="Select Gemini's voice"
                )
                with gr.Row():
                    enable_audio = gr.Checkbox(
                        label="Audio Output",
                        value=True,
                        info="Receive voice responses"
                    )
                    enable_text = gr.Checkbox(
                        label="Text Output",
                        value=True,
                        info="Receive text responses"
                    )
                with gr.Row():
                    start_btn = gr.Button("βΆοΈ Start Session", variant="primary")
                    stop_btn = gr.Button("βΉοΈ Stop", variant="stop")
                status_output = gr.Textbox(
                    label="Status",
                    value="Ready to connect. Enter your API key and click Start.",
                    lines=3
                )
                gr.Markdown("---")
                gr.Markdown("### π¬ Text Input")
                text_input = gr.Textbox(
                    label="Type a message",
                    placeholder="Or speak naturally...",
                    lines=2
                )
                with gr.Row():
                    send_btn = gr.Button("π€ Send", variant="secondary")
                    clear_btn = gr.Button("ποΈ Clear")
                gr.Markdown("---")
                gr.Markdown("""
### π Instructions
1. Enter your Gemini API key
2. Click **Start Session**
3. Allow microphone access
4. Speak naturally or type messages
5. Gemini responds in real-time with voice and text
**Note**: This demo simulates WebRTC streaming.
For production, use actual WebRTC libraries with
Google's Gemini Live API.
""")
            # Right panel - Chat and Audio
            with gr.Column(scale=2):
                gr.Markdown("### π Live Audio Stream")
                # Audio streaming component - input from mic, output to speakers.
                # The same component is both source and sink of the stream event below.
                audio_stream = gr.Audio(
                    label="Live Conversation",
                    sources=["microphone"],
                    streaming=True,
                    autoplay=True,
                    waveform_options=gr.WaveformOptions(
                        waveform_color="#4285f4",
                        waveform_progress_color="#ea4335",
                        show_recording_waveform=True
                    )
                )
                gr.Markdown("### π¬ Conversation Transcript")
                chat_display = gr.Markdown(
                    value="No messages yet. Start a session to begin chatting!",
                    label="Chat History"
                )
        # Event handlers — thin wrappers delegating to the module-level `app`.
        def on_start(api_key, voice, audio_enabled, text_enabled):
            status = app.start_session(api_key, voice, audio_enabled, text_enabled)
            return status
        def on_stop():
            status = app.stop_session()
            return status
        def on_send(message):
            return app.send_text_message(message)
        def on_clear():
            return app.clear_history()
        def audio_callback(audio_data):
            # Process audio bidirectionally: mic chunk in, Gemini audio out.
            return app.process_audio_stream(audio_data)
        def update_chat():
            # Poll for text updates (driven by the 0.5 s Timer below).
            return app.get_text_updates()
        # Connect events with Gradio 6 syntax.
        # NOTE(review): the `api_visibility=` kwarg and gr.Timer assume a
        # recent Gradio (the file targets Gradio 6) — verify against the
        # installed version; the Space header shows "Build error".
        start_btn.click(
            fn=on_start,
            inputs=[api_key_input, voice_select, enable_audio, enable_text],
            outputs=status_output,
            api_visibility="public"
        )
        stop_btn.click(
            fn=on_stop,
            inputs=None,
            outputs=status_output,
            api_visibility="public"
        )
        send_btn.click(
            fn=on_send,
            inputs=text_input,
            outputs=chat_display,
            api_visibility="public"
        ).then(
            fn=lambda: "",  # Clear text input after sending
            outputs=text_input
        )
        clear_btn.click(
            fn=on_clear,
            outputs=chat_display,
            api_visibility="public"
        )
        # Audio streaming - Gradio 6 stream event
        audio_stream.stream(
            fn=audio_callback,
            inputs=audio_stream,
            outputs=audio_stream,
            time_limit=300,  # 5 minutes max per session
            stream_every=0.1,  # 100ms chunks
            concurrency_limit=1,
            api_visibility="private"
        )
        # Timer polls the transcript twice a second while the page is open.
        timer = gr.Timer(0.5, active=True)
        timer.tick(
            fn=update_chat,
            outputs=chat_display,
            api_visibility="private"
        )
        # Also refresh the transcript immediately on start/stop.
        start_btn.click(fn=update_chat, outputs=chat_display)
        stop_btn.click(fn=update_chat, outputs=chat_display)
    return demo
# Create and launch the app.
# Fix: the original wrapped create_gemini_live_chat() inside a second, empty
# `with gr.Blocks() as demo:` context and launched that outer demo. The inner
# Blocks built by create_gemini_live_chat() is a separate app that was never
# rendered into the wrapper, so the launched page would be blank. Use the
# returned Blocks instance directly.
demo = create_gemini_live_chat()

if __name__ == "__main__":
    # NOTE(review): passing theme/footer_links/title to launch() is Gradio 6
    # style — confirm the deployed Gradio version accepts these kwargs (the
    # Space header shows "Build error").
    demo.launch(
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="indigo",
            neutral_hue="slate",
            font=gr.themes.GoogleFont("Inter"),
            text_size="md",
            spacing_size="md",
            radius_size="lg"
        ).set(
            button_primary_background_fill="*primary_600",
            button_primary_background_fill_hover="*primary_700",
            block_title_text_weight="600",
            block_background_fill="*neutral_50"
        ),
        footer_links=[
            {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"},
            {"label": "Gradio", "url": "https://gradio.app"},
            {"label": "API", "url": "/docs"}
        ],
        pwa=True,
        title="WebRTC Gemini Live Chat"
    )