#!/usr/bin/env python3
"""
ChatCal Voice-Enabled AI Assistant - Hugging Face Gradio Implementation

A voice-enabled calendar booking assistant with real-time speech-to-text,
text-to-speech responses, and Google Calendar integration.
"""
import gradio as gr
import os
import asyncio
import json
from typing import Dict, List, Tuple, Optional
from datetime import datetime

# Core functionality imports
from core.chat_agent import ChatCalAgent
from core.session_manager import SessionManager
from core.mcp_audio_handler import MCPAudioHandler
from core.config import config
from version import get_version_info

# WebRTC imports - re-enabled for WebRTC-first approach
from webrtc.server.fastapi_integration import create_fastapi_app


class ChatCalVoiceApp:
    """Main application class for voice-enabled ChatCal."""

    def __init__(self):
        # All real work is delegated to these project-local services:
        # session store, LLM chat agent, and STT/TTS handler.
        self.session_manager = SessionManager()
        self.chat_agent = ChatCalAgent()
        self.audio_handler = MCPAudioHandler()

    async def process_message(
        self,
        message: str,
        history: List[Tuple[str, str]],
        session_id: str
    ) -> Tuple[List[Tuple[str, str]], str]:
        """Process a chat message and return (updated history, "").

        The second element clears the text input on success. Agent errors
        are rendered into the chat history instead of raised, so a failed
        turn never crashes the UI.
        """
        try:
            # Get or create session, then run the turn through the agent.
            session = await self.session_manager.get_session(session_id)
            response = await self.chat_agent.process_message(message, session)
            history.append((message, response))
            return history, ""
        except Exception as e:
            error_msg = f"Sorry, I encountered an error: {str(e)}"
            history.append((message, error_msg))
            return history, ""

    async def process_audio(
        self,
        audio_data: bytes,
        history: List[Tuple[str, str]],
        session_id: str
    ) -> Tuple[List[Tuple[str, str]], str, Optional[bytes]]:
        """Transcribe audio, answer it, and synthesize the spoken reply.

        Returns (updated history, transcription, response audio). The audio
        slot is None when no reply was produced or on error — the original
        annotation claimed plain ``bytes``, which was wrong.
        """
        try:
            # STT, then reuse the normal text pipeline for the reply.
            transcription = await self.audio_handler.speech_to_text(audio_data)
            history, _ = await self.process_message(transcription, history, session_id)

            # Speak the latest assistant response, if there is one.
            if history:
                latest_response = history[-1][1]
                response_audio = await self.audio_handler.text_to_speech(latest_response)
                return history, transcription, response_audio
            return history, transcription, None
        except Exception as e:
            error_msg = f"Audio processing error: {str(e)}"
            history.append(("(Audio input)", error_msg))
            return history, "", None

    def create_interface(self) -> gr.Blocks:
        """Build and return the Gradio Blocks UI.

        Sync Gradio callbacks run on worker threads with no event loop, so
        each handler drives the async agent with ``asyncio.run`` (which
        creates and correctly tears down a private loop — the previous
        hand-rolled new_event_loop/close sequence left a closed loop set
        on the thread).

        Note: the original annotation said ``gr.Interface``; this method
        has always returned a ``gr.Blocks``.
        """
        with gr.Blocks(
            theme=gr.themes.Soft(),
            title="ChatCal Voice Assistant",
            css="""
            .chat-container { max-height: 500px; overflow-y: auto; }
            .voice-controls {
                background: linear-gradient(45deg, #667eea 0%, #764ba2 100%);
                padding: 10px; border-radius: 10px; margin: 10px 0;
            }
            .status-indicator {
                display: inline-block; width: 12px; height: 12px;
                border-radius: 50%; margin-right: 8px;
            }
            .recording { background-color: #ff4444; }
            .idle { background-color: #44ff44; }
            """
        ) as demo:
            # Title and description
            gr.Markdown("""
            # 🎤📅 ChatCal Voice Assistant

            **Book your Google Calendar appointments with voice or text!**

            - 🗣️ **Voice Input**: Click record, speak naturally
            - 💬 **Text Input**: Type your message
            - 📅 **Smart Booking**: AI understands dates, times, and preferences
            - 🎥 **Google Meet**: Automatic video conference setup
            """)

            # Session state — callable default gives each page load a
            # fresh, timestamp-based session id.
            session_id = gr.State(value=lambda: f"session_{datetime.now().timestamp()}")

            with gr.Row():
                with gr.Column(scale=3):
                    # Chat history display
                    chatbot = gr.Chatbot(
                        label="Chat History",
                        height=400,
                        elem_classes=["chat-container"]
                    )

                    with gr.Row(elem_classes=["voice-controls"]):
                        # Traditional Voice input section
                        with gr.Column(scale=2):
                            audio_input = gr.Audio(
                                type="numpy",
                                label="🎤 Voice Input (Gradio)",
                                interactive=True
                            )
                            voice_status = gr.HTML(
                                value='Ready for voice input'
                            )

                        with gr.Column(scale=1):
                            # Audio output
                            audio_output = gr.Audio(
                                label="🔊 AI Response",
                                type="numpy",
                                interactive=False
                            )

                    # WebRTC Real-time Voice Section
                    # NOTE(review): the source for this HTML string was
                    # garbled — surrounding markup tags may have been lost;
                    # the visible text content is preserved verbatim.
                    with gr.Row():
                        gr.HTML("""
                        🚀 WebRTC Real-time Voice (Beta)
                        Enhanced real-time voice interaction with streaming transcription
                        📡 WebSocket endpoints: /ws/webrtc/{client_id} | 🧪 Test page: WebRTC Demo | ⚡ API Status: Test Endpoint
                        """)

                    # Text input section
                    with gr.Row():
                        text_input = gr.Textbox(
                            label="💬 Type your message or see voice transcription",
                            placeholder="Hi! I'm [Your Name]. Book a 30-minute meeting tomorrow at 2 PM...",
                            lines=2,
                            scale=4
                        )
                        send_btn = gr.Button("Send", variant="primary", scale=1)

                with gr.Column(scale=1):
                    # Quick action buttons
                    gr.Markdown("### 🚀 Quick Actions")
                    quick_meet = gr.Button(
                        "🎥 Google Meet (30m)",
                        variant="secondary"
                    )
                    quick_availability = gr.Button(
                        "📅 Check Availability",
                        variant="secondary"
                    )
                    quick_cancel = gr.Button(
                        "❌ Cancel Meeting",
                        variant="secondary"
                    )

                    # Version info
                    version_btn = gr.Button(
                        "ℹ️ Version Info",
                        variant="secondary"
                    )
                    version_display = gr.Textbox(
                        label="Version Information",
                        interactive=False,
                        visible=False
                    )

                    # Voice settings (not yet wired to any handler)
                    gr.Markdown("### 🎭 Voice Settings")
                    voice_enabled = gr.Checkbox(
                        label="Enable voice responses",
                        value=True
                    )
                    voice_selection = gr.Dropdown(
                        choices=[
                            "v2/en_speaker_0",
                            "v2/en_speaker_1",
                            "v2/en_speaker_2",
                            "v2/en_speaker_6",
                            "v2/en_speaker_9"
                        ],
                        value="v2/en_speaker_6",
                        label="AI Voice"
                    )

            # ---- Event handlers ------------------------------------------
            # Handlers close over `self` (previously they reached for the
            # module-global `app`, which is the same object but breaks if a
            # second instance is ever created).

            def handle_text_submit(message, history, session):
                """Send a typed message through the agent; clear the box."""
                if message.strip():
                    return asyncio.run(
                        self.process_message(message, history, session)
                    )
                # Nothing to send: leave history and typed text untouched.
                return history, message

            def handle_audio_submit(audio, history, session):
                """Transcribe a recorded clip and run it through the agent.

                Gradio delivers ``type="numpy"`` audio as a
                (sample_rate, ndarray) tuple, or None when cleared.
                """
                print(f"🎤 AUDIO DEBUG: Received audio input: {type(audio)}")
                print(f"🎤 AUDIO DEBUG: Audio data: {audio}")

                if audio is None:
                    print("🎤 AUDIO DEBUG: No audio received")
                    return history, "No audio received", None

                print("🎤 AUDIO DEBUG: Processing audio...")
                try:
                    if isinstance(audio, tuple) and len(audio) >= 2:
                        sample_rate, audio_array = audio
                        print(f"🎤 AUDIO DEBUG: Sample rate: {sample_rate}")
                        print(f"🎤 AUDIO DEBUG: Audio array type: {type(audio_array)}")
                        print(f"🎤 AUDIO DEBUG: Audio array shape: {audio_array.shape if hasattr(audio_array, 'shape') else 'No shape'}")

                        # Synchronous STT via the audio handler.
                        transcription = self.audio_handler.process_audio_input(audio)
                        print(f"🎤 AUDIO DEBUG: Transcription result: {transcription}")

                        if transcription and transcription != "No audio received":
                            result = asyncio.run(
                                self.process_message(transcription, history, session)
                            )
                            # Updated history, transcription shown in the
                            # text box, no TTS audio output for now.
                            return result[0], transcription, None

                        print("🎤 AUDIO DEBUG: No valid transcription received")
                        return history, "No audio transcription available", None

                    print("🎤 AUDIO DEBUG: Invalid audio format")
                    return history, "Invalid audio format", None
                except Exception as e:
                    print(f"🎤 AUDIO ERROR: {str(e)}")
                    import traceback
                    traceback.print_exc()
                    return history, f"Audio processing error: {str(e)}", None

            def handle_quick_action(action_text, history, session):
                """Handle quick action button clicks."""
                result = asyncio.run(
                    self.process_message(action_text, history, session)
                )
                # Updated history; clear the text input.
                return result[0], ""

            # Wire up the event handlers
            send_btn.click(
                fn=handle_text_submit,
                inputs=[text_input, chatbot, session_id],
                outputs=[chatbot, text_input]
            )
            text_input.submit(
                fn=handle_text_submit,
                inputs=[text_input, chatbot, session_id],
                outputs=[chatbot, text_input]
            )
            audio_input.change(
                fn=handle_audio_submit,
                inputs=[audio_input, chatbot, session_id],
                outputs=[chatbot, text_input, audio_output]
            )

            # Quick action handlers
            quick_meet.click(
                fn=lambda hist, sess: handle_quick_action(
                    "Book a 30-minute Google Meet with Peter for next available time",
                    hist, sess
                ),
                inputs=[chatbot, session_id],
                outputs=[chatbot, text_input]
            )
            quick_availability.click(
                fn=lambda hist, sess: handle_quick_action(
                    "What is Peter's availability this week?",
                    hist, sess
                ),
                inputs=[chatbot, session_id],
                outputs=[chatbot, text_input]
            )
            quick_cancel.click(
                fn=lambda hist, sess: handle_quick_action(
                    "Cancel my upcoming meeting with Peter",
                    hist, sess
                ),
                inputs=[chatbot, session_id],
                outputs=[chatbot, text_input]
            )

            # Version info handler
            def show_version():
                info = get_version_info()
                version_text = (
                    f"Version: {info['version']}\n"
                    f"Build: {info['build_date']}\n"
                    f"Description: {info['description']}\n"
                    f"Status: {info['status']}"
                )
                # Fix: the original listed version_display twice in
                # `outputs` and returned (text, update) — duplicate output
                # components are not reliably supported.  Set value and
                # visibility with one gr.update to a single output.
                return gr.update(value=version_text, visible=True)

            version_btn.click(
                fn=show_version,
                outputs=[version_display]
            )

        return demo


# Global app instance
app = ChatCalVoiceApp()

# Create and launch the interface
if __name__ == "__main__":
    import uvicorn

    try:
        # Create WebRTC-enabled FastAPI app as main app
        webrtc_app = create_fastapi_app()

        # Create Gradio interface (for future integration)
        demo = app.create_interface()

        # WebRTC-first approach: Launch FastAPI with WebSocket endpoints
        print("🚀 ChatCal WebRTC-First Deployment v0.4.3")
        print("📡 WebSocket endpoint: /ws/webrtc/{client_id}")
        print("🧪 WebRTC demo page: /webrtc/demo")
        print("⚡ API status: /webrtc/test")
        print("⚠️ Gradio interface development - WebRTC priority")

        # Launch WebRTC FastAPI app directly
        uvicorn.run(webrtc_app, host="0.0.0.0", port=7860)

    except Exception as e:
        print(f"❌ WebRTC integration error: {e}")
        print("📋 Falling back to Gradio-only deployment")
        import traceback
        traceback.print_exc()

        # Create stable Gradio interface fallback
        demo = app.create_interface()

        print("🚀 ChatCal Voice-Enabled Assistant v0.4.3")
        print("📱 Traditional voice input available via Gradio Audio component")
        print("⚙️ WebRTC real-time streaming: Debugging in progress")

        # Launch configuration for HF Spaces (stable fallback)
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,  # HF handles sharing
            show_error=True
        )