# ChatCal.ai-1 / app.py
# Author: Peter Michael Gits
# feat: WebRTC-first implementation with demo audio processing v0.4.4
# Commit: 65f90da
#!/usr/bin/env python3
"""
ChatCal Voice-Enabled AI Assistant - Hugging Face Gradio Implementation
A voice-enabled calendar booking assistant with real-time speech-to-text,
text-to-speech responses, and Google Calendar integration.
"""
import gradio as gr
import os
import asyncio
import json
from typing import Dict, List, Tuple, Optional
from datetime import datetime
# Core functionality imports
from core.chat_agent import ChatCalAgent
from core.session_manager import SessionManager
from core.mcp_audio_handler import MCPAudioHandler
from core.config import config
from version import get_version_info
# WebRTC imports - re-enabled for WebRTC-first approach
from webrtc.server.fastapi_integration import create_fastapi_app
class ChatCalVoiceApp:
    """Main application class for voice-enabled ChatCal.

    Wires three project-local services together:
      - SessionManager:  per-conversation state keyed by a session id
      - ChatCalAgent:    the agent that interprets booking requests
      - MCPAudioHandler: speech-to-text / text-to-speech conversion

    NOTE(review): the file's original indentation was lost in transit; the
    Gradio layout nesting below was reconstructed from the context managers
    and should be verified against the deployed UI.
    """

    def __init__(self) -> None:
        # Instantiate the project services; any credentials/configuration they
        # need presumably comes from core.config (imported at module level) —
        # TODO confirm.
        self.session_manager = SessionManager()
        self.chat_agent = ChatCalAgent()
        self.audio_handler = MCPAudioHandler()

    async def process_message(
        self,
        message: str,
        history: List[Tuple[str, str]],
        session_id: str
    ) -> Tuple[List[Tuple[str, str]], str]:
        """Process a chat message and return updated history.

        Args:
            message: Raw user text.
            history: Gradio chat history of (user, assistant) pairs; the list
                is mutated in place as well as returned.
            session_id: Key used by the session manager to fetch or create
                the conversation session.

        Returns:
            ``(updated history, "")`` — callers use the empty second element
            to clear the text input box. Errors are appended to the history
            as an assistant message instead of being raised.
        """
        try:
            # Get or create session
            session = await self.session_manager.get_session(session_id)
            # Process message through ChatCal agent
            response = await self.chat_agent.process_message(message, session)
            # Update conversation history
            history.append((message, response))
            return history, ""
        except Exception as e:
            # Surface the failure to the user in-chat rather than crash the UI.
            error_msg = f"Sorry, I encountered an error: {str(e)}"
            history.append((message, error_msg))
            return history, ""

    async def process_audio(
        self,
        audio_data: bytes,
        history: List[Tuple[str, str]],
        session_id: str
    ) -> Tuple[List[Tuple[str, str]], str, Optional[bytes]]:
        """Process audio input and return transcription + response audio.

        NOTE(review): the Audio component in create_interface() uses
        ``type="numpy"`` while this signature says ``bytes``, and the UI
        currently routes audio through ``handle_audio_submit`` /
        ``audio_handler.process_audio_input`` instead of this coroutine —
        confirm the expected payload type before wiring this method up.

        Args:
            audio_data: Encoded audio captured from the client.
            history: Chat history, mutated in place.
            session_id: Conversation session key.

        Returns:
            ``(updated history, transcription, tts_audio_or_None)``. On error
            the history gains an "(Audio input)" entry and the transcription
            comes back as "".
        """
        try:
            # Convert audio to text via STT service
            transcription = await self.audio_handler.speech_to_text(audio_data)
            # Process the transcribed message
            history, _ = await self.process_message(transcription, history, session_id)
            # Get the latest response for TTS
            if history:
                latest_response = history[-1][1]
                # Convert response to speech
                response_audio = await self.audio_handler.text_to_speech(latest_response)
                return history, transcription, response_audio
            return history, transcription, None
        except Exception as e:
            error_msg = f"Audio processing error: {str(e)}"
            history.append(("(Audio input)", error_msg))
            return history, "", None

    def create_interface(self) -> gr.Blocks:
        """Create the main Gradio interface.

        Builds the Blocks layout (chat pane, voice input/output, WebRTC info
        banner, quick-action buttons, voice settings) and wires the event
        handlers. Returns the Blocks object; the caller launches it.
        """
        with gr.Blocks(
            theme=gr.themes.Soft(),
            title="ChatCal Voice Assistant",
            css="""
            .chat-container {
                max-height: 500px;
                overflow-y: auto;
            }
            .voice-controls {
                background: linear-gradient(45deg, #667eea 0%, #764ba2 100%);
                padding: 10px;
                border-radius: 10px;
                margin: 10px 0;
            }
            .status-indicator {
                display: inline-block;
                width: 12px;
                height: 12px;
                border-radius: 50%;
                margin-right: 8px;
            }
            .recording { background-color: #ff4444; }
            .idle { background-color: #44ff44; }
            """
        ) as demo:
            # Title and description
            gr.Markdown("""
            # πŸŽ€πŸ“… ChatCal Voice Assistant
            **Book your Google Calendar appointments with voice or text!**
            - πŸ—£οΈ **Voice Input**: Click record, speak naturally
            - πŸ’¬ **Text Input**: Type your message
            - πŸ“… **Smart Booking**: AI understands dates, times, and preferences
            - πŸŽ₯ **Google Meet**: Automatic video conference setup
            """)

            # Session state. The callable default presumably makes Gradio
            # generate a fresh id per browser session rather than one shared
            # id at build time — TODO confirm against the installed Gradio
            # version's gr.State semantics.
            session_id = gr.State(value=lambda: f"session_{datetime.now().timestamp()}")

            with gr.Row():
                with gr.Column(scale=3):
                    # Chat history display
                    chatbot = gr.Chatbot(
                        label="Chat History",
                        height=400,
                        elem_classes=["chat-container"]
                    )

                    with gr.Row(elem_classes=["voice-controls"]):
                        # Traditional Voice input section
                        with gr.Column(scale=2):
                            audio_input = gr.Audio(
                                type="numpy",
                                label="🎀 Voice Input (Gradio)",
                                interactive=True
                            )
                            # Static status badge; not currently updated by
                            # any event handler.
                            voice_status = gr.HTML(
                                value='<span class="status-indicator idle"></span>Ready for voice input'
                            )
                        with gr.Column(scale=1):
                            # Audio output
                            audio_output = gr.Audio(
                                label="πŸ”Š AI Response",
                                type="numpy",
                                interactive=False
                            )

                    # WebRTC Real-time Voice Section (informational banner only;
                    # the actual endpoints live in the FastAPI app)
                    with gr.Row():
                        gr.HTML("""
                        <div style="background: linear-gradient(45deg, #28a745 0%, #20c997 100%);
                             padding: 15px; border-radius: 10px; margin: 10px 0;">
                            <h3 style="color: white; margin: 0;">πŸš€ WebRTC Real-time Voice (Beta)</h3>
                            <p style="color: white; margin: 5px 0;">
                                Enhanced real-time voice interaction with streaming transcription
                            </p>
                            <p style="color: white; margin: 5px 0; font-size: 0.9em;">
                                πŸ“‘ <strong>WebSocket endpoints:</strong> /ws/webrtc/{client_id} |
                                πŸ§ͺ <strong>Test page:</strong> <a href="/webrtc/demo" style="color: #fff; text-decoration: underline;">WebRTC Demo</a> |
                                ⚑ <strong>API Status:</strong> <a href="/webrtc/test" style="color: #fff; text-decoration: underline;">Test Endpoint</a>
                            </p>
                        </div>
                        """)

                    # Text input section
                    with gr.Row():
                        text_input = gr.Textbox(
                            label="πŸ’¬ Type your message or see voice transcription",
                            placeholder="Hi! I'm [Your Name]. Book a 30-minute meeting tomorrow at 2 PM...",
                            lines=2,
                            scale=4
                        )
                        send_btn = gr.Button("Send", variant="primary", scale=1)

                with gr.Column(scale=1):
                    # Quick action buttons
                    gr.Markdown("### πŸš€ Quick Actions")
                    quick_meet = gr.Button(
                        "πŸŽ₯ Google Meet (30m)",
                        variant="secondary"
                    )
                    quick_availability = gr.Button(
                        "πŸ“… Check Availability",
                        variant="secondary"
                    )
                    quick_cancel = gr.Button(
                        "❌ Cancel Meeting",
                        variant="secondary"
                    )

                    # Version info
                    version_btn = gr.Button(
                        "ℹ️ Version Info",
                        variant="secondary"
                    )
                    version_display = gr.Textbox(
                        label="Version Information",
                        interactive=False,
                        visible=False
                    )

                    # Voice settings.
                    # NOTE(review): voice_enabled / voice_selection are not
                    # referenced by any handler below — currently decorative.
                    gr.Markdown("### 🎭 Voice Settings")
                    voice_enabled = gr.Checkbox(
                        label="Enable voice responses",
                        value=True
                    )
                    voice_selection = gr.Dropdown(
                        choices=[
                            "v2/en_speaker_0",
                            "v2/en_speaker_1",
                            "v2/en_speaker_2",
                            "v2/en_speaker_6",
                            "v2/en_speaker_9"
                        ],
                        value="v2/en_speaker_6",
                        label="AI Voice"
                    )

            # Event handlers.
            # These closures reference the module-level `app` instance (not
            # `self`), and each spins up a fresh asyncio event loop because
            # Gradio invokes them synchronously.
            def handle_text_submit(message, history, session):
                """Send one text message through the agent; returns (history, textbox)."""
                if message.strip():
                    # Use asyncio to handle the async function
                    loop = asyncio.new_event_loop()
                    asyncio.set_event_loop(loop)
                    try:
                        result = loop.run_until_complete(
                            app.process_message(message, history, session)
                        )
                        return result
                    finally:
                        loop.close()
                # Empty/whitespace input: leave history and textbox unchanged.
                return history, message

            def handle_audio_submit(audio, history, session):
                """Transcribe a recorded clip and feed it through the agent.

                Returns (history, textbox_text, response_audio); response
                audio is always None here (TTS output not wired yet).
                """
                print(f"🎀 AUDIO DEBUG: Received audio input: {type(audio)}")
                print(f"🎀 AUDIO DEBUG: Audio data: {audio}")
                if audio is not None:
                    print(f"🎀 AUDIO DEBUG: Processing audio...")
                    # Convert audio data and process
                    loop = asyncio.new_event_loop()
                    asyncio.set_event_loop(loop)
                    try:
                        # Debug audio format: gr.Audio(type="numpy") delivers
                        # a (sample_rate, ndarray) tuple.
                        if isinstance(audio, tuple) and len(audio) >= 2:
                            sample_rate, audio_array = audio
                            print(f"🎀 AUDIO DEBUG: Sample rate: {sample_rate}")
                            print(f"🎀 AUDIO DEBUG: Audio array type: {type(audio_array)}")
                            print(f"🎀 AUDIO DEBUG: Audio array shape: {audio_array.shape if hasattr(audio_array, 'shape') else 'No shape'}")
                            # Use the audio handler's (synchronous) process
                            # method instead of the async process_audio().
                            transcription = app.audio_handler.process_audio_input(audio)
                            print(f"🎀 AUDIO DEBUG: Transcription result: {transcription}")
                            if transcription and transcription != "No audio received":
                                # Process the transcription as a message
                                result = loop.run_until_complete(
                                    app.process_message(transcription, history, session)
                                )
                                # Return updated history, transcription in text box, and no audio output for now
                                return result[0], transcription, None
                            else:
                                print(f"🎀 AUDIO DEBUG: No valid transcription received")
                                return history, "No audio transcription available", None
                        else:
                            print(f"🎀 AUDIO DEBUG: Invalid audio format")
                            return history, "Invalid audio format", None
                    except Exception as e:
                        print(f"🎀 AUDIO ERROR: {str(e)}")
                        import traceback
                        traceback.print_exc()
                        return history, f"Audio processing error: {str(e)}", None
                    finally:
                        loop.close()
                else:
                    print(f"🎀 AUDIO DEBUG: No audio received")
                    return history, "No audio received", None

            def handle_quick_action(action_text, history, session):
                """Handle quick action button clicks."""
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)
                try:
                    result = loop.run_until_complete(
                        app.process_message(action_text, history, session)
                    )
                    return result[0], ""  # Return updated history and clear text input
                finally:
                    loop.close()

            # Wire up the event handlers
            send_btn.click(
                fn=handle_text_submit,
                inputs=[text_input, chatbot, session_id],
                outputs=[chatbot, text_input]
            )
            text_input.submit(
                fn=handle_text_submit,
                inputs=[text_input, chatbot, session_id],
                outputs=[chatbot, text_input]
            )
            audio_input.change(
                fn=handle_audio_submit,
                inputs=[audio_input, chatbot, session_id],
                outputs=[chatbot, text_input, audio_output]
            )

            # Quick action handlers
            quick_meet.click(
                fn=lambda hist, sess: handle_quick_action(
                    "Book a 30-minute Google Meet with Peter for next available time",
                    hist, sess
                ),
                inputs=[chatbot, session_id],
                outputs=[chatbot, text_input]
            )
            quick_availability.click(
                fn=lambda hist, sess: handle_quick_action(
                    "What is Peter's availability this week?",
                    hist, sess
                ),
                inputs=[chatbot, session_id],
                outputs=[chatbot, text_input]
            )
            quick_cancel.click(
                fn=lambda hist, sess: handle_quick_action(
                    "Cancel my upcoming meeting with Peter",
                    hist, sess
                ),
                inputs=[chatbot, session_id],
                outputs=[chatbot, text_input]
            )

            # Version info handler
            def show_version():
                """Format version.py metadata and reveal the display box."""
                info = get_version_info()
                version_text = f"Version: {info['version']}\nBuild: {info['build_date']}\nDescription: {info['description']}\nStatus: {info['status']}"
                return version_text, gr.update(visible=True)

            version_btn.click(
                fn=show_version,
                # NOTE(review): the same component is listed twice so the two
                # return values map to (value, visibility update) — confirm
                # this behaves as intended on the deployed Gradio version.
                outputs=[version_display, version_display]
            )

            return demo
# Global app instance (referenced by the handler closures built in
# create_interface()).
app = ChatCalVoiceApp()

# Create and launch the interface
if __name__ == "__main__":
    import uvicorn

    # Single source of truth for the version banner. The banners previously
    # hard-coded "v0.4.3" and had drifted out of sync with version.py (the
    # commit header says v0.4.4); derive the string from get_version_info(),
    # which the UI already uses for its Version Info button.
    _version = get_version_info().get("version", "unknown")
    try:
        # Create WebRTC-enabled FastAPI app as main app
        webrtc_app = create_fastapi_app()
        # Create Gradio interface (for future integration)
        demo = app.create_interface()
        # WebRTC-first approach: Launch FastAPI with WebSocket endpoints
        print(f"πŸš€ ChatCal WebRTC-First Deployment v{_version}")
        print("πŸ“‘ WebSocket endpoint: /ws/webrtc/{client_id}")
        print("πŸ§ͺ WebRTC demo page: /webrtc/demo")
        print("⚑ API status: /webrtc/test")
        print("⚠️ Gradio interface development - WebRTC priority")
        # Launch WebRTC FastAPI app directly (blocks until shutdown)
        uvicorn.run(webrtc_app, host="0.0.0.0", port=7860)
    except Exception as e:
        # Any failure building or serving the WebRTC app falls back to the
        # plain Gradio deployment so the Space still comes up.
        print(f"❌ WebRTC integration error: {e}")
        print("πŸ“‹ Falling back to Gradio-only deployment")
        import traceback
        traceback.print_exc()
        # Create stable Gradio interface fallback
        demo = app.create_interface()
        print(f"πŸš€ ChatCal Voice-Enabled Assistant v{_version}")
        print("πŸ“± Traditional voice input available via Gradio Audio component")
        print("βš™οΈ WebRTC real-time streaming: Debugging in progress")
        # Launch configuration for HF Spaces (stable fallback)
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,  # HF handles sharing
            show_error=True
        )