# NOTE: The three lines below ("Spaces:" / "Paused" / "Paused") are Hugging Face
# Spaces page-scrape residue (Space status listing), not part of the program.
| #!/usr/bin/env python3 | |
| """ | |
| ChatCal Voice-Enabled AI Assistant - Hugging Face Gradio Implementation | |
| A voice-enabled calendar booking assistant with real-time speech-to-text, | |
| text-to-speech responses, and Google Calendar integration. | |
| """ | |
| import gradio as gr | |
| import os | |
| import asyncio | |
| import json | |
| from typing import Dict, List, Tuple, Optional | |
| from datetime import datetime | |
| # Core functionality imports | |
| from core.chat_agent import ChatCalAgent | |
| from core.session_manager import SessionManager | |
| from core.mcp_audio_handler import MCPAudioHandler | |
| from core.config import config | |
| from version import get_version_info | |
| # WebRTC imports - re-enabled for WebRTC-first approach | |
| from webrtc.server.fastapi_integration import create_fastapi_app | |
class ChatCalVoiceApp:
    """Main application class for voice-enabled ChatCal.

    Wires together the session manager, the calendar chat agent, and the
    speech handler (STT/TTS), and builds the Gradio UI around them.
    """

    def __init__(self):
        self.session_manager = SessionManager()  # per-user conversation sessions
        self.chat_agent = ChatCalAgent()         # calendar booking agent
        self.audio_handler = MCPAudioHandler()   # speech-to-text / text-to-speech

    async def process_message(
        self,
        message: str,
        history: List[Tuple[str, str]],
        session_id: str
    ) -> Tuple[List[Tuple[str, str]], str]:
        """Process a chat message and return (updated history, cleared textbox).

        Errors are appended to the chat history as an assistant reply instead
        of being raised, so a backend failure never crashes the UI.
        """
        try:
            # Get or create the session for this browser tab.
            session = await self.session_manager.get_session(session_id)
            # Process message through the ChatCal agent.
            response = await self.chat_agent.process_message(message, session)
            history.append((message, response))
            return history, ""
        except Exception as e:
            # Surface the failure inline rather than swallowing it.
            error_msg = f"Sorry, I encountered an error: {str(e)}"
            history.append((message, error_msg))
            return history, ""

    async def process_audio(
        self,
        audio_data: bytes,
        history: List[Tuple[str, str]],
        session_id: str
    ) -> Tuple[List[Tuple[str, str]], str, Optional[bytes]]:
        """Process audio input and return (history, transcription, response audio).

        The response audio is None when no assistant reply is available to
        speak, or when audio processing fails.
        """
        try:
            # Convert audio to text via the STT service.
            transcription = await self.audio_handler.speech_to_text(audio_data)
            # Route the transcription through the normal text pipeline.
            history, _ = await self.process_message(transcription, history, session_id)
            if history:
                latest_response = history[-1][1]
                # Speak the assistant's latest reply back to the user.
                response_audio = await self.audio_handler.text_to_speech(latest_response)
                return history, transcription, response_audio
            return history, transcription, None
        except Exception as e:
            error_msg = f"Audio processing error: {str(e)}"
            history.append(("(Audio input)", error_msg))
            return history, "", None

    def create_interface(self) -> gr.Blocks:
        """Create the main Gradio interface.

        Returns the gr.Blocks demo (annotation fixed: this builds a Blocks
        layout, not a gr.Interface).
        """
        with gr.Blocks(
            theme=gr.themes.Soft(),
            title="ChatCal Voice Assistant",
            css="""
            .chat-container {
                max-height: 500px;
                overflow-y: auto;
            }
            .voice-controls {
                background: linear-gradient(45deg, #667eea 0%, #764ba2 100%);
                padding: 10px;
                border-radius: 10px;
                margin: 10px 0;
            }
            .status-indicator {
                display: inline-block;
                width: 12px;
                height: 12px;
                border-radius: 50%;
                margin-right: 8px;
            }
            .recording { background-color: #ff4444; }
            .idle { background-color: #44ff44; }
            """
        ) as demo:
            # Title and description
            gr.Markdown("""
            # π€π ChatCal Voice Assistant
            **Book your Google Calendar appointments with voice or text!**
            - π£οΈ **Voice Input**: Click record, speak naturally
            - π¬ **Text Input**: Type your message
            - π **Smart Booking**: AI understands dates, times, and preferences
            - π₯ **Google Meet**: Automatic video conference setup
            """)

            # Session state: the callable is evaluated per page load so every
            # visitor gets a fresh session id.
            session_id = gr.State(value=lambda: f"session_{datetime.now().timestamp()}")

            with gr.Row():
                with gr.Column(scale=3):
                    # Chat history display
                    chatbot = gr.Chatbot(
                        label="Chat History",
                        height=400,
                        elem_classes=["chat-container"]
                    )

                    with gr.Row(elem_classes=["voice-controls"]):
                        # Traditional voice input section
                        with gr.Column(scale=2):
                            audio_input = gr.Audio(
                                type="numpy",
                                label="π€ Voice Input (Gradio)",
                                interactive=True
                            )
                            voice_status = gr.HTML(
                                value='<span class="status-indicator idle"></span>Ready for voice input'
                            )
                        with gr.Column(scale=1):
                            # Audio output
                            audio_output = gr.Audio(
                                label="π AI Response",
                                type="numpy",
                                interactive=False
                            )

                    # WebRTC real-time voice section (informational banner only)
                    with gr.Row():
                        gr.HTML("""
                        <div style="background: linear-gradient(45deg, #28a745 0%, #20c997 100%);
                                    padding: 15px; border-radius: 10px; margin: 10px 0;">
                            <h3 style="color: white; margin: 0;">π WebRTC Real-time Voice (Beta)</h3>
                            <p style="color: white; margin: 5px 0;">
                                Enhanced real-time voice interaction with streaming transcription
                            </p>
                            <p style="color: white; margin: 5px 0; font-size: 0.9em;">
                                π‘ <strong>WebSocket endpoints:</strong> /ws/webrtc/{client_id} |
                                π§ͺ <strong>Test page:</strong> <a href="/webrtc/demo" style="color: #fff; text-decoration: underline;">WebRTC Demo</a> |
                                β‘ <strong>API Status:</strong> <a href="/webrtc/test" style="color: #fff; text-decoration: underline;">Test Endpoint</a>
                            </p>
                        </div>
                        """)

                    # Text input section
                    with gr.Row():
                        text_input = gr.Textbox(
                            label="π¬ Type your message or see voice transcription",
                            placeholder="Hi! I'm [Your Name]. Book a 30-minute meeting tomorrow at 2 PM...",
                            lines=2,
                            scale=4
                        )
                        send_btn = gr.Button("Send", variant="primary", scale=1)

                with gr.Column(scale=1):
                    # Quick action buttons
                    gr.Markdown("### π Quick Actions")
                    quick_meet = gr.Button(
                        "π₯ Google Meet (30m)",
                        variant="secondary"
                    )
                    quick_availability = gr.Button(
                        "π Check Availability",
                        variant="secondary"
                    )
                    quick_cancel = gr.Button(
                        "β Cancel Meeting",
                        variant="secondary"
                    )

                    # Version info
                    version_btn = gr.Button(
                        "βΉοΈ Version Info",
                        variant="secondary"
                    )
                    version_display = gr.Textbox(
                        label="Version Information",
                        interactive=False,
                        visible=False
                    )

                    # Voice settings
                    gr.Markdown("### π Voice Settings")
                    voice_enabled = gr.Checkbox(
                        label="Enable voice responses",
                        value=True
                    )
                    voice_selection = gr.Dropdown(
                        choices=[
                            "v2/en_speaker_0",
                            "v2/en_speaker_1",
                            "v2/en_speaker_2",
                            "v2/en_speaker_6",
                            "v2/en_speaker_9"
                        ],
                        value="v2/en_speaker_6",
                        label="AI Voice"
                    )

            # ---- Event handlers -------------------------------------------
            # Gradio callbacks are synchronous here; asyncio.run() spins up a
            # private event loop per call (replaces the manual
            # new_event_loop/set_event_loop/close boilerplate).

            def handle_text_submit(message, history, session):
                """Run the async agent for a typed message; clear box on success."""
                if message.strip():
                    return asyncio.run(self.process_message(message, history, session))
                # Empty input: leave history and textbox untouched.
                return history, message

            def handle_audit_traceback():
                """Print the current exception traceback (debug aid)."""
                import traceback
                traceback.print_exc()

            def handle_audio_submit(audio, history, session):
                """Transcribe recorded audio and feed it through the chat pipeline.

                Returns (history, textbox content, audio output). Audio output
                is currently always None on this path.
                """
                print(f"π€ AUDIO DEBUG: Received audio input: {type(audio)}")
                print(f"π€ AUDIO DEBUG: Audio data: {audio}")
                if audio is None:
                    print("π€ AUDIO DEBUG: No audio received")
                    return history, "No audio received", None
                print("π€ AUDIO DEBUG: Processing audio...")
                try:
                    # Gradio's numpy audio arrives as (sample_rate, ndarray).
                    if isinstance(audio, tuple) and len(audio) >= 2:
                        sample_rate, audio_array = audio
                        print(f"π€ AUDIO DEBUG: Sample rate: {sample_rate}")
                        print(f"π€ AUDIO DEBUG: Audio array type: {type(audio_array)}")
                        print(f"π€ AUDIO DEBUG: Audio array shape: {audio_array.shape if hasattr(audio_array, 'shape') else 'No shape'}")
                        # Synchronous STT path via the audio handler.
                        transcription = self.audio_handler.process_audio_input(audio)
                        print(f"π€ AUDIO DEBUG: Transcription result: {transcription}")
                        if transcription and transcription != "No audio received":
                            result = asyncio.run(
                                self.process_message(transcription, history, session)
                            )
                            # Show transcription in the text box; no TTS output for now.
                            return result[0], transcription, None
                        print("π€ AUDIO DEBUG: No valid transcription received")
                        return history, "No audio transcription available", None
                    print("π€ AUDIO DEBUG: Invalid audio format")
                    return history, "Invalid audio format", None
                except Exception as e:
                    print(f"π€ AUDIO ERROR: {str(e)}")
                    handle_audit_traceback()
                    return history, f"Audio processing error: {str(e)}", None

            def handle_quick_action(action_text, history, session):
                """Handle quick action button clicks (canned messages)."""
                result = asyncio.run(self.process_message(action_text, history, session))
                return result[0], ""  # updated history; clear the text input

            # ---- Wire up the event handlers -------------------------------
            send_btn.click(
                fn=handle_text_submit,
                inputs=[text_input, chatbot, session_id],
                outputs=[chatbot, text_input]
            )
            text_input.submit(
                fn=handle_text_submit,
                inputs=[text_input, chatbot, session_id],
                outputs=[chatbot, text_input]
            )
            audio_input.change(
                fn=handle_audio_submit,
                inputs=[audio_input, chatbot, session_id],
                outputs=[chatbot, text_input, audio_output]
            )

            # Quick action handlers
            quick_meet.click(
                fn=lambda hist, sess: handle_quick_action(
                    "Book a 30-minute Google Meet with Peter for next available time",
                    hist, sess
                ),
                inputs=[chatbot, session_id],
                outputs=[chatbot, text_input]
            )
            quick_availability.click(
                fn=lambda hist, sess: handle_quick_action(
                    "What is Peter's availability this week?",
                    hist, sess
                ),
                inputs=[chatbot, session_id],
                outputs=[chatbot, text_input]
            )
            quick_cancel.click(
                fn=lambda hist, sess: handle_quick_action(
                    "Cancel my upcoming meeting with Peter",
                    hist, sess
                ),
                inputs=[chatbot, session_id],
                outputs=[chatbot, text_input]
            )

            # Version info handler.
            # FIX: the original listed version_display twice in `outputs`
            # (once for the text, once for gr.update(visible=True)); Gradio
            # rejects duplicate output components. Set value and visibility
            # in a single gr.update targeting one output instead.
            def show_version():
                """Populate and reveal the version information textbox."""
                info = get_version_info()
                version_text = f"Version: {info['version']}\nBuild: {info['build_date']}\nDescription: {info['description']}\nStatus: {info['status']}"
                return gr.update(value=version_text, visible=True)

            version_btn.click(
                fn=show_version,
                outputs=[version_display]
            )

        return demo
# Module-level application instance shared by the UI callbacks.
app = ChatCalVoiceApp()

if __name__ == "__main__":
    import uvicorn

    try:
        # Preferred path: WebRTC-enabled FastAPI app exposing the
        # WebSocket endpoints; the Gradio demo is built for future
        # integration but not mounted here.
        webrtc_app = create_fastapi_app()
        demo = app.create_interface()

        startup_banner = (
            "π ChatCal WebRTC-First Deployment v0.4.3",
            "π‘ WebSocket endpoint: /ws/webrtc/{client_id}",
            "π§ͺ WebRTC demo page: /webrtc/demo",
            "β‘ API status: /webrtc/test",
            "β οΈ Gradio interface development - WebRTC priority",
        )
        for banner_line in startup_banner:
            print(banner_line)

        # Serve the FastAPI app directly on the HF Spaces port.
        uvicorn.run(webrtc_app, host="0.0.0.0", port=7860)
    except Exception as e:
        # Any failure during WebRTC setup degrades to Gradio-only mode.
        print(f"β WebRTC integration error: {e}")
        print("π Falling back to Gradio-only deployment")
        import traceback
        traceback.print_exc()

        demo = app.create_interface()
        print("π ChatCal Voice-Enabled Assistant v0.4.3")
        print("π± Traditional voice input available via Gradio Audio component")
        print("βοΈ WebRTC real-time streaming: Debugging in progress")
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,  # HF handles sharing
            show_error=True,
        )