| """ | |
| Voice Worker for Modal Deployment | |
| Handles voice processing tasks on Modal infrastructure | |
| """ | |
| import asyncio | |
| import json | |
| import logging | |
| import base64 | |
| from typing import Dict, List, Any, Optional | |
| from datetime import datetime | |
| # Modal imports | |
| import modal | |
| # Set up logging | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| # Modal app setup | |
| app = modal.App("voice-worker") | |


class VoiceWorker:
    """Voice processing worker for Modal deployment."""

    def __init__(self):
        self.config = {
            "whisper_model": "whisper-1",
            "voice_id": "pNInz6obpgDQGcFmaJgB",  # Adam voice
            "language": "en",
            "response_format": "json",
        }

    async def process_whisper_transcription(self, audio_data: str, language: str = "auto") -> Dict[str, Any]:
        """Process audio with Whisper for transcription."""
        try:
            # In production, this would use the actual OpenAI Whisper API;
            # for the demo, simulate the processing.
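            # A sketch of what the real call might look like, left as comments
            # only (assumes the official `openai` Python SDK v1+ and an
            # OPENAI_API_KEY in the environment; untested here):
            #
            #   client = openai.AsyncOpenAI()
            #   resp = await client.audio.transcriptions.create(
            #       model=self.config["whisper_model"],
            #       file=("audio.mp3", base64.b64decode(audio_data)),
            #       language=None if language == "auto" else language,
            #       response_format="verbose_json",
            #   )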
            await asyncio.sleep(0.1)  # Simulate processing time
            mock_transcription = {
                "text": "Hello, this is a test of the voice transcription system.",
                "language": language,
                "duration": 4.2,
                "confidence": 0.97,
                "words": [
                    {"word": "Hello", "start": 0.0, "end": 0.5, "confidence": 0.99},
                    {"word": "this", "start": 0.6, "end": 0.8, "confidence": 0.95},
                    {"word": "is", "start": 0.9, "end": 1.1, "confidence": 0.98},
                    {"word": "a", "start": 1.2, "end": 1.3, "confidence": 0.94},
                    {"word": "test", "start": 1.4, "end": 1.8, "confidence": 0.99},
                    {"word": "of", "start": 1.9, "end": 2.1, "confidence": 0.96},
                    {"word": "the", "start": 2.2, "end": 2.4, "confidence": 0.98},
                    {"word": "voice", "start": 2.5, "end": 2.9, "confidence": 0.97},
                    {"word": "transcription", "start": 3.0, "end": 3.8, "confidence": 0.99},
                    {"word": "system", "start": 3.9, "end": 4.2, "confidence": 0.98},
                ],
            }
            logger.info(f"Whisper transcription completed: {len(mock_transcription['text'])} characters")
            return mock_transcription
        except Exception as e:
            logger.error(f"Whisper transcription error: {e}")
            return {"error": str(e), "text": None}

    async def process_elevenlabs_synthesis(self, text: str, voice_id: str, stability: float = 0.5) -> Dict[str, Any]:
        """Process text with ElevenLabs for voice synthesis."""
        try:
            # In production, this would use the actual ElevenLabs API;
            # for the demo, simulate the processing.
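            # A sketch of what the real call might look like, left as comments
            # only (assumes `httpx` is installed and ELEVENLABS_API_KEY is set
            # in the environment; untested here):
            #
            #   async with httpx.AsyncClient() as client:
            #       resp = await client.post(
            #           f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}",
            #           headers={"xi-api-key": os.environ["ELEVENLABS_API_KEY"]},
            #           json={
            #               "text": text,
            #               "model_id": "eleven_monolingual_v1",
            #               "voice_settings": {"stability": stability, "similarity_boost": 0.5},
            #           },
            #       )
            #       audio_bytes = resp.content  # raw MP3 bytes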
            await asyncio.sleep(0.2)  # Simulate processing time
            # Generate mock audio data
            audio_duration = len(text) * 0.1  # Rough estimate in seconds
            audio_size = len(text) * 0.5  # Rough estimate in KB
            mock_audio_data = base64.b64encode(b"mock_audio_data").decode()
            voice_names = {
                "pNInz6obpgDQGcFmaJgB": "Adam (Male, Professional)",
                "21m00Tcm4TlvDq8ikWAM": "Rachel (Female, Warm)",
                "29vD33N1CtxCmqQRPOHJ": "Clyde (Male, Deep)",
            }
            mock_synthesis = {
                "audio_data": mock_audio_data,
                "duration": audio_duration,
                "voice_name": voice_names.get(voice_id, "Custom Voice"),
                "voice_id": voice_id,
                "model_id": "eleven_monolingual_v1",
                "settings": {
                    "stability": stability,
                    "similarity_boost": 0.5,
                    "style": 0.0,
                    "use_speaker_boost": True,
                },
                "file_size_kb": audio_size,
                "format": "mp3",
                "sample_rate": 44100,
            }
            logger.info(f"ElevenLabs synthesis completed: {audio_duration:.1f}s audio")
            return mock_synthesis
        except Exception as e:
            logger.error(f"ElevenLabs synthesis error: {e}")
            return {"error": str(e), "audio_data": None}

    async def process_gpt4o_conversation(self, user_input: str, context: Optional[List[Dict]] = None) -> Dict[str, Any]:
        """Process conversation with GPT-4o."""
        try:
            # In production, this would use the actual OpenAI GPT-4o API;
            # for the demo, simulate intelligent responses.
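            # A sketch of what the real call might look like, left as comments
            # only (assumes the official `openai` Python SDK v1+ and
            # OPENAI_API_KEY; `context` would carry prior chat turns;
            # untested here):
            #
            #   client = openai.AsyncOpenAI()
            #   resp = await client.chat.completions.create(
            #       model="gpt-4o",
            #       messages=(context or []) + [{"role": "user", "content": user_input}],
            #   )
            #   response = resp.choices[0].message.content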
            await asyncio.sleep(0.15)  # Simulate API latency
            # Simple context-aware responses
            if any(word in user_input.lower() for word in ["hello", "hi", "hey"]):
                response = "Hello! I'm your voice AI assistant. How can I help you today? I can transcribe audio, generate speech, or have a conversation with you."
            elif any(word in user_input.lower() for word in ["transcribe", "speech to text"]):
                response = "I can transcribe your audio using Whisper AI. Please upload your audio file or record directly, and I'll convert it to text with high accuracy."
            elif any(word in user_input.lower() for word in ["speak", "say", "voice"]):
                response = "I can generate natural-sounding speech using ElevenLabs. What would you like me to say? I have multiple voice options available."
            elif any(word in user_input.lower() for word in ["translate", "language"]):
                response = "I support multiple languages including English, Spanish, French, and Nepali. I can automatically detect the language and provide appropriate responses."
            else:
                response = f"I understand you're asking about: '{user_input}'. As your voice AI, I can help with transcription, speech synthesis, multilingual processing, and intelligent conversations. What specific voice task would you like me to help with?"
            mock_conversation = {
                "response": response,
                "model": "gpt-4o",
                "tokens_used": len(user_input.split()) + len(response.split()),
                "confidence": 0.95,
                "processing_time": 0.15,
                "context_aware": True,
                "timestamp": datetime.utcnow().isoformat(),
            }
            logger.info(f"GPT-4o conversation processed: {len(response)} character response")
            return mock_conversation
        except Exception as e:
            logger.error(f"GPT-4o conversation error: {e}")
            return {"error": str(e), "response": None}

    async def process_multilingual_detection(self, audio_data: str) -> Dict[str, Any]:
        """Detect language and process multilingual audio."""
        try:
            # In production, this would use a language detection API;
            # for the demo, simulate language detection.
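            # A sketch of one real option, left as comments only: Whisper
            # reports the detected language when called with
            # response_format="verbose_json" (assumes the official `openai`
            # SDK and OPENAI_API_KEY; untested here):
            #
            #   resp = await client.audio.transcriptions.create(
            #       model="whisper-1",
            #       file=("audio.mp3", base64.b64decode(audio_data)),
            #       response_format="verbose_json",
            #   )
            #   detected = resp.language  # e.g. "en"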
            await asyncio.sleep(0.1)
            # Mock language detection results
            mock_detection = {
                "detected_language": "en",
                "language_name": "English",
                "confidence": 0.94,
                "alternative_languages": [
                    {"language": "es", "confidence": 0.12},
                    {"language": "fr", "confidence": 0.08},
                    {"language": "ne", "confidence": 0.05},
                ],
                "auto_switch": True,
                "cultural_context": "Western business communication",
                "phonetic_features": {
                    "accent": "neutral",
                    "clarity": "high",
                    "speech_rate": "normal",
                },
            }
            logger.info(f"Language detection completed: {mock_detection['language_name']}")
            return mock_detection
        except Exception as e:
            logger.error(f"Language detection error: {e}")
            return {"error": str(e), "detected_language": None}


# Modal endpoints
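# A hedged usage note (assumes the Modal CLI is installed and authenticated):
# deploy with `modal deploy voice_worker.py`, then invoke a function from
# other Python code via its deployed handle, e.g.
#
#   fn = modal.Function.from_name("voice-worker", "whisper_transcribe")
#   print(fn.remote(audio_b64, "en"))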
@app.function()
async def whisper_transcribe(audio_data: str, language: str = "auto") -> str:
    """Modal endpoint for Whisper transcription."""
    worker = VoiceWorker()
    result = await worker.process_whisper_transcription(audio_data, language)
    return json.dumps(result)


@app.function()
async def elevenlabs_synthesize(text: str, voice_id: str = "pNInz6obpgDQGcFmaJgB", stability: float = 0.5) -> str:
    """Modal endpoint for ElevenLabs voice synthesis."""
    worker = VoiceWorker()
    result = await worker.process_elevenlabs_synthesis(text, voice_id, stability)
    return json.dumps(result)


@app.function()
async def gpt4o_converse(user_input: str, context: str = "[]") -> str:
    """Modal endpoint for GPT-4o conversation."""
    worker = VoiceWorker()
    context_list = json.loads(context) or None  # empty list -> None
    result = await worker.process_gpt4o_conversation(user_input, context_list)
    return json.dumps(result)


@app.function()
async def detect_language(audio_data: str) -> str:
    """Modal endpoint for language detection."""
    worker = VoiceWorker()
    result = await worker.process_multilingual_detection(audio_data)
    return json.dumps(result)


@app.function()
async def voice_pipeline(audio_data: str, operation: str = "full", language: str = "auto") -> str:
    """Modal endpoint for the complete voice processing pipeline."""
    worker = VoiceWorker()
    try:
        if operation == "transcribe":
            result = await worker.process_whisper_transcription(audio_data, language)
        elif operation == "synthesize":
            # Synthesis needs text input; since this endpoint only receives
            # audio, the demo synthesizes a fixed sample sentence.
            text = "Hello, this is a test of the voice synthesis system."
            result = await worker.process_elevenlabs_synthesis(text, worker.config["voice_id"])
        elif operation == "detect":
            result = await worker.process_multilingual_detection(audio_data)
        elif operation == "full":
            # Full pipeline: detect language, transcribe, and respond
            detection = await worker.process_multilingual_detection(audio_data)
            transcription = await worker.process_whisper_transcription(audio_data, detection.get("detected_language", "en"))
            conversation = await worker.process_gpt4o_conversation(transcription.get("text", ""))
            result = {
                "pipeline": "complete",
                "language_detection": detection,
                "transcription": transcription,
                "conversation": conversation,
                "timestamp": datetime.utcnow().isoformat(),
            }
        else:
            result = {"error": f"Unknown operation: {operation}"}
        return json.dumps(result)
    except Exception as e:
        logger.error(f"Voice pipeline error: {e}")
        return json.dumps({"error": str(e), "operation": operation})


@app.function()
async def health_check() -> str:
    """Modal endpoint for health check."""
    health_status = {
        "status": "healthy",
        "timestamp": datetime.utcnow().isoformat(),
        "services": {
            "whisper": "available",
            "elevenlabs": "available",
            "gpt4o": "available",
            "language_detection": "available",
        },
        "version": "1.0.0",
        "uptime": "100%",
    }
    return json.dumps(health_status)


if __name__ == "__main__":
    # Local testing
    async def test_voice_worker():
        worker = VoiceWorker()
        print("🎤 Testing Voice Worker...")

        # Test transcription
        print("\n1. Testing Whisper Transcription:")
        audio_data = base64.b64encode(b"mock_audio_data").decode()
        result = await worker.process_whisper_transcription(audio_data)
        print(f"   Result: {result.get('text', 'No text')}")

        # Test synthesis (voice_id is required by the method, so pass the
        # configured default)
        print("\n2. Testing ElevenLabs Synthesis:")
        result = await worker.process_elevenlabs_synthesis("Hello, this is a test", worker.config["voice_id"])
        print(f"   Voice: {result.get('voice_name', 'Unknown')}")
        print(f"   Duration: {result.get('duration', 0):.1f}s")

        # Test conversation
        print("\n3. Testing GPT-4o Conversation:")
        result = await worker.process_gpt4o_conversation("Hello, how can you help me?")
        print(f"   Response: {result.get('response', 'No response')[:100]}...")

        # Test language detection
        print("\n4. Testing Language Detection:")
        result = await worker.process_multilingual_detection(audio_data)
        print(f"   Language: {result.get('language_name', 'Unknown')} ({result.get('confidence', 0):.1%})")

        print("\n✅ Voice Worker tests completed!")

    # Run tests
    asyncio.run(test_voice_worker())