# ChatCal.ai-1 / app.py
# Author: Peter Michael Gits
# feat: WebRTC-first implementation with demo audio processing v0.4.4
# Commit: 65f90da
#!/usr/bin/env python3
"""
ChatCal Voice-Enabled AI Assistant - Hugging Face Gradio Implementation
A voice-enabled calendar booking assistant with real-time speech-to-text,
text-to-speech responses, and Google Calendar integration.
"""
import gradio as gr
import os
import asyncio
import json
from typing import Dict, List, Tuple, Optional
from datetime import datetime
# Core functionality imports
from core.chat_agent import ChatCalAgent
from core.session_manager import SessionManager
from core.mcp_audio_handler import MCPAudioHandler
from core.config import config
from version import get_version_info
# WebRTC imports - re-enabled for WebRTC-first approach
from webrtc.server.fastapi_integration import create_fastapi_app
class ChatCalVoiceApp:
    """Main application class for voice-enabled ChatCal.

    Wires three project-local services together:
      - SessionManager:  per-conversation state keyed by a session id
      - ChatCalAgent:    the agent that interprets booking requests
      - MCPAudioHandler: speech-to-text / text-to-speech conversion

    NOTE(review): the file's original indentation was lost in transit; the
    Gradio layout nesting below was reconstructed from the context managers
    and should be verified against the deployed UI.
    """

    def __init__(self) -> None:
        # Instantiate the project services; any credentials/configuration they
        # need presumably comes from core.config (imported at module level) —
        # TODO confirm.
        self.session_manager = SessionManager()
        self.chat_agent = ChatCalAgent()
        self.audio_handler = MCPAudioHandler()

    async def process_message(
        self,
        message: str,
        history: List[Tuple[str, str]],
        session_id: str
    ) -> Tuple[List[Tuple[str, str]], str]:
        """Process a chat message and return updated history.

        Args:
            message: Raw user text.
            history: Gradio chat history of (user, assistant) pairs; the list
                is mutated in place as well as returned.
            session_id: Key used by the session manager to fetch or create
                the conversation session.

        Returns:
            ``(updated history, "")`` — callers use the empty second element
            to clear the text input box. Errors are appended to the history
            as an assistant message instead of being raised.
        """
        try:
            # Get or create session
            session = await self.session_manager.get_session(session_id)
            # Process message through ChatCal agent
            response = await self.chat_agent.process_message(message, session)
            # Update conversation history
            history.append((message, response))
            return history, ""
        except Exception as e:
            # Surface the failure to the user in-chat rather than crash the UI.
            error_msg = f"Sorry, I encountered an error: {str(e)}"
            history.append((message, error_msg))
            return history, ""

    async def process_audio(
        self,
        audio_data: bytes,
        history: List[Tuple[str, str]],
        session_id: str
    ) -> Tuple[List[Tuple[str, str]], str, Optional[bytes]]:
        """Process audio input and return transcription + response audio.

        NOTE(review): the Audio component in create_interface() uses
        ``type="numpy"`` while this signature says ``bytes``, and the UI
        currently routes audio through ``handle_audio_submit`` /
        ``audio_handler.process_audio_input`` instead of this coroutine —
        confirm the expected payload type before wiring this method up.

        Args:
            audio_data: Encoded audio captured from the client.
            history: Chat history, mutated in place.
            session_id: Conversation session key.

        Returns:
            ``(updated history, transcription, tts_audio_or_None)``. On error
            the history gains an "(Audio input)" entry and the transcription
            comes back as "".
        """
        try:
            # Convert audio to text via STT service
            transcription = await self.audio_handler.speech_to_text(audio_data)
            # Process the transcribed message
            history, _ = await self.process_message(transcription, history, session_id)
            # Get the latest response for TTS
            if history:
                latest_response = history[-1][1]
                # Convert response to speech
                response_audio = await self.audio_handler.text_to_speech(latest_response)
                return history, transcription, response_audio
            return history, transcription, None
        except Exception as e:
            error_msg = f"Audio processing error: {str(e)}"
            history.append(("(Audio input)", error_msg))
            return history, "", None

    def create_interface(self) -> gr.Blocks:
        """Create the main Gradio interface.

        Builds the Blocks layout (chat pane, voice input/output, WebRTC info
        banner, quick-action buttons, voice settings) and wires the event
        handlers. Returns the Blocks object; the caller launches it.
        """
        with gr.Blocks(
            theme=gr.themes.Soft(),
            title="ChatCal Voice Assistant",
            css="""
            .chat-container {
                max-height: 500px;
                overflow-y: auto;
            }
            .voice-controls {
                background: linear-gradient(45deg, #667eea 0%, #764ba2 100%);
                padding: 10px;
                border-radius: 10px;
                margin: 10px 0;
            }
            .status-indicator {
                display: inline-block;
                width: 12px;
                height: 12px;
                border-radius: 50%;
                margin-right: 8px;
            }
            .recording { background-color: #ff4444; }
            .idle { background-color: #44ff44; }
            """
        ) as demo:
            # Title and description
            gr.Markdown("""
            # πŸŽ€πŸ“… ChatCal Voice Assistant
            **Book your Google Calendar appointments with voice or text!**
            - πŸ—£οΈ **Voice Input**: Click record, speak naturally
            - πŸ’¬ **Text Input**: Type your message
            - πŸ“… **Smart Booking**: AI understands dates, times, and preferences
            - πŸŽ₯ **Google Meet**: Automatic video conference setup
            """)

            # Session state. The callable default presumably makes Gradio
            # generate a fresh id per browser session rather than one shared
            # id at build time — TODO confirm against the installed Gradio
            # version's gr.State semantics.
            session_id = gr.State(value=lambda: f"session_{datetime.now().timestamp()}")

            with gr.Row():
                with gr.Column(scale=3):
                    # Chat history display
                    chatbot = gr.Chatbot(
                        label="Chat History",
                        height=400,
                        elem_classes=["chat-container"]
                    )

                    with gr.Row(elem_classes=["voice-controls"]):
                        # Traditional Voice input section
                        with gr.Column(scale=2):
                            audio_input = gr.Audio(
                                type="numpy",
                                label="🎀 Voice Input (Gradio)",
                                interactive=True
                            )
                            # Static status badge; not currently updated by
                            # any event handler.
                            voice_status = gr.HTML(
                                value='<span class="status-indicator idle"></span>Ready for voice input'
                            )
                        with gr.Column(scale=1):
                            # Audio output
                            audio_output = gr.Audio(
                                label="πŸ”Š AI Response",
                                type="numpy",
                                interactive=False
                            )

                    # WebRTC Real-time Voice Section (informational banner only;
                    # the actual endpoints live in the FastAPI app)
                    with gr.Row():
                        gr.HTML("""
                        <div style="background: linear-gradient(45deg, #28a745 0%, #20c997 100%);
                             padding: 15px; border-radius: 10px; margin: 10px 0;">
                            <h3 style="color: white; margin: 0;">πŸš€ WebRTC Real-time Voice (Beta)</h3>
                            <p style="color: white; margin: 5px 0;">
                                Enhanced real-time voice interaction with streaming transcription
                            </p>
                            <p style="color: white; margin: 5px 0; font-size: 0.9em;">
                                πŸ“‘ <strong>WebSocket endpoints:</strong> /ws/webrtc/{client_id} |
                                πŸ§ͺ <strong>Test page:</strong> <a href="/webrtc/demo" style="color: #fff; text-decoration: underline;">WebRTC Demo</a> |
                                ⚑ <strong>API Status:</strong> <a href="/webrtc/test" style="color: #fff; text-decoration: underline;">Test Endpoint</a>
                            </p>
                        </div>
                        """)

                    # Text input section
                    with gr.Row():
                        text_input = gr.Textbox(
                            label="πŸ’¬ Type your message or see voice transcription",
                            placeholder="Hi! I'm [Your Name]. Book a 30-minute meeting tomorrow at 2 PM...",
                            lines=2,
                            scale=4
                        )
                        send_btn = gr.Button("Send", variant="primary", scale=1)

                with gr.Column(scale=1):
                    # Quick action buttons
                    gr.Markdown("### πŸš€ Quick Actions")
                    quick_meet = gr.Button(
                        "πŸŽ₯ Google Meet (30m)",
                        variant="secondary"
                    )
                    quick_availability = gr.Button(
                        "πŸ“… Check Availability",
                        variant="secondary"
                    )
                    quick_cancel = gr.Button(
                        "❌ Cancel Meeting",
                        variant="secondary"
                    )

                    # Version info
                    version_btn = gr.Button(
                        "ℹ️ Version Info",
                        variant="secondary"
                    )
                    version_display = gr.Textbox(
                        label="Version Information",
                        interactive=False,
                        visible=False
                    )

                    # Voice settings.
                    # NOTE(review): voice_enabled / voice_selection are not
                    # referenced by any handler below — currently decorative.
                    gr.Markdown("### 🎭 Voice Settings")
                    voice_enabled = gr.Checkbox(
                        label="Enable voice responses",
                        value=True
                    )
                    voice_selection = gr.Dropdown(
                        choices=[
                            "v2/en_speaker_0",
                            "v2/en_speaker_1",
                            "v2/en_speaker_2",
                            "v2/en_speaker_6",
                            "v2/en_speaker_9"
                        ],
                        value="v2/en_speaker_6",
                        label="AI Voice"
                    )

            # Event handlers.
            # These closures reference the module-level `app` instance (not
            # `self`), and each spins up a fresh asyncio event loop because
            # Gradio invokes them synchronously.
            def handle_text_submit(message, history, session):
                """Send one text message through the agent; returns (history, textbox)."""
                if message.strip():
                    # Use asyncio to handle the async function
                    loop = asyncio.new_event_loop()
                    asyncio.set_event_loop(loop)
                    try:
                        result = loop.run_until_complete(
                            app.process_message(message, history, session)
                        )
                        return result
                    finally:
                        loop.close()
                # Empty/whitespace input: leave history and textbox unchanged.
                return history, message

            def handle_audio_submit(audio, history, session):
                """Transcribe a recorded clip and feed it through the agent.

                Returns (history, textbox_text, response_audio); response
                audio is always None here (TTS output not wired yet).
                """
                print(f"🎀 AUDIO DEBUG: Received audio input: {type(audio)}")
                print(f"🎀 AUDIO DEBUG: Audio data: {audio}")
                if audio is not None:
                    print(f"🎀 AUDIO DEBUG: Processing audio...")
                    # Convert audio data and process
                    loop = asyncio.new_event_loop()
                    asyncio.set_event_loop(loop)
                    try:
                        # Debug audio format: gr.Audio(type="numpy") delivers
                        # a (sample_rate, ndarray) tuple.
                        if isinstance(audio, tuple) and len(audio) >= 2:
                            sample_rate, audio_array = audio
                            print(f"🎀 AUDIO DEBUG: Sample rate: {sample_rate}")
                            print(f"🎀 AUDIO DEBUG: Audio array type: {type(audio_array)}")
                            print(f"🎀 AUDIO DEBUG: Audio array shape: {audio_array.shape if hasattr(audio_array, 'shape') else 'No shape'}")
                            # Use the audio handler's (synchronous) process
                            # method instead of the async process_audio().
                            transcription = app.audio_handler.process_audio_input(audio)
                            print(f"🎀 AUDIO DEBUG: Transcription result: {transcription}")
                            if transcription and transcription != "No audio received":
                                # Process the transcription as a message
                                result = loop.run_until_complete(
                                    app.process_message(transcription, history, session)
                                )
                                # Return updated history, transcription in text box, and no audio output for now
                                return result[0], transcription, None
                            else:
                                print(f"🎀 AUDIO DEBUG: No valid transcription received")
                                return history, "No audio transcription available", None
                        else:
                            print(f"🎀 AUDIO DEBUG: Invalid audio format")
                            return history, "Invalid audio format", None
                    except Exception as e:
                        print(f"🎀 AUDIO ERROR: {str(e)}")
                        import traceback
                        traceback.print_exc()
                        return history, f"Audio processing error: {str(e)}", None
                    finally:
                        loop.close()
                else:
                    print(f"🎀 AUDIO DEBUG: No audio received")
                    return history, "No audio received", None

            def handle_quick_action(action_text, history, session):
                """Handle quick action button clicks."""
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)
                try:
                    result = loop.run_until_complete(
                        app.process_message(action_text, history, session)
                    )
                    return result[0], ""  # Return updated history and clear text input
                finally:
                    loop.close()

            # Wire up the event handlers
            send_btn.click(
                fn=handle_text_submit,
                inputs=[text_input, chatbot, session_id],
                outputs=[chatbot, text_input]
            )
            text_input.submit(
                fn=handle_text_submit,
                inputs=[text_input, chatbot, session_id],
                outputs=[chatbot, text_input]
            )
            audio_input.change(
                fn=handle_audio_submit,
                inputs=[audio_input, chatbot, session_id],
                outputs=[chatbot, text_input, audio_output]
            )

            # Quick action handlers
            quick_meet.click(
                fn=lambda hist, sess: handle_quick_action(
                    "Book a 30-minute Google Meet with Peter for next available time",
                    hist, sess
                ),
                inputs=[chatbot, session_id],
                outputs=[chatbot, text_input]
            )
            quick_availability.click(
                fn=lambda hist, sess: handle_quick_action(
                    "What is Peter's availability this week?",
                    hist, sess
                ),
                inputs=[chatbot, session_id],
                outputs=[chatbot, text_input]
            )
            quick_cancel.click(
                fn=lambda hist, sess: handle_quick_action(
                    "Cancel my upcoming meeting with Peter",
                    hist, sess
                ),
                inputs=[chatbot, session_id],
                outputs=[chatbot, text_input]
            )

            # Version info handler
            def show_version():
                """Format version.py metadata and reveal the display box."""
                info = get_version_info()
                version_text = f"Version: {info['version']}\nBuild: {info['build_date']}\nDescription: {info['description']}\nStatus: {info['status']}"
                return version_text, gr.update(visible=True)

            version_btn.click(
                fn=show_version,
                # NOTE(review): the same component is listed twice so the two
                # return values map to (value, visibility update) — confirm
                # this behaves as intended on the deployed Gradio version.
                outputs=[version_display, version_display]
            )

            return demo
# Global app instance (referenced by the handler closures built in
# create_interface()).
app = ChatCalVoiceApp()

# Create and launch the interface
if __name__ == "__main__":
    import uvicorn

    # Single source of truth for the version banner. The banners previously
    # hard-coded "v0.4.3" and had drifted out of sync with version.py (the
    # commit header says v0.4.4); derive the string from get_version_info(),
    # which the UI already uses for its Version Info button.
    _version = get_version_info().get("version", "unknown")
    try:
        # Create WebRTC-enabled FastAPI app as main app
        webrtc_app = create_fastapi_app()
        # Create Gradio interface (for future integration)
        demo = app.create_interface()
        # WebRTC-first approach: Launch FastAPI with WebSocket endpoints
        print(f"πŸš€ ChatCal WebRTC-First Deployment v{_version}")
        print("πŸ“‘ WebSocket endpoint: /ws/webrtc/{client_id}")
        print("πŸ§ͺ WebRTC demo page: /webrtc/demo")
        print("⚑ API status: /webrtc/test")
        print("⚠️ Gradio interface development - WebRTC priority")
        # Launch WebRTC FastAPI app directly (blocks until shutdown)
        uvicorn.run(webrtc_app, host="0.0.0.0", port=7860)
    except Exception as e:
        # Any failure building or serving the WebRTC app falls back to the
        # plain Gradio deployment so the Space still comes up.
        print(f"❌ WebRTC integration error: {e}")
        print("πŸ“‹ Falling back to Gradio-only deployment")
        import traceback
        traceback.print_exc()
        # Create stable Gradio interface fallback
        demo = app.create_interface()
        print(f"πŸš€ ChatCal Voice-Enabled Assistant v{_version}")
        print("πŸ“± Traditional voice input available via Gradio Audio component")
        print("βš™οΈ WebRTC real-time streaming: Debugging in progress")
        # Launch configuration for HF Spaces (stable fallback)
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,  # HF handles sharing
            show_error=True
        )