""" Real-time Audio Processing Utilities for WebRTC Handles STT service integration without demo modes """ import aiohttp import asyncio import logging from typing import Optional import json logger = logging.getLogger(__name__) class RealTimeSTTProcessor: """Real-time STT processor - connects only to real services""" def __init__(self, stt_service_url: str): self.stt_service_url = stt_service_url.rstrip('/') async def transcribe_audio_file(self, audio_file_path: str) -> Optional[str]: """Transcribe audio file using real STT service - NO DEMO MODE""" try: logger.info(f"🎤 Real-time STT: Processing {audio_file_path}") # Try multiple API endpoint patterns systematically api_patterns = [ f"{self.stt_service_url}/api/predict", f"{self.stt_service_url}/call/predict", f"{self.stt_service_url}/api/transcribe_audio", f"{self.stt_service_url}/call/transcribe_audio" ] async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=30)) as session: for api_url in api_patterns: try: logger.info(f"🎤 Trying STT API: {api_url}") # Prepare file upload with open(audio_file_path, 'rb') as audio_file: # Try different payload formats payload_formats = [ # Format 1: Standard Gradio API { 'data': aiohttp.FormData() }, # Format 2: Direct form data { 'data': aiohttp.FormData() } ] # Format 1: Gradio API style payload_formats[0]['data'].add_field('data', audio_file, filename='audio.wav') payload_formats[0]['data'].add_field('data', json.dumps(["auto", "base", True])) # Format 2: Direct style audio_file.seek(0) payload_formats[1]['data'].add_field('audio', audio_file, filename='audio.wav') payload_formats[1]['data'].add_field('language', 'auto') payload_formats[1]['data'].add_field('model', 'base') payload_formats[1]['data'].add_field('timestamps', 'true') for i, payload in enumerate(payload_formats): try: logger.info(f"🎤 Trying payload format {i+1}") async with session.post(api_url, data=payload['data']) as response: logger.info(f"🎤 Response status: {response.status}") if response.status == 200: result = await response.json() logger.info(f"🎤 Response JSON: {result}") # Extract transcription from various response formats transcription = self._extract_transcription(result) if transcription and transcription.strip(): logger.info(f"🎤 SUCCESS: {transcription}") return transcription.strip() else: error_text = await response.text() logger.warning(f"🎤 API failed: {response.status} - {error_text[:200]}") except Exception as payload_error: logger.error(f"🎤 Payload {i+1} failed: {payload_error}") continue except Exception as url_error: logger.error(f"🎤 URL {api_url} failed: {url_error}") continue logger.error("🎤 All STT API attempts failed") return None except Exception as e: logger.error(f"🎤 STT processing error: {e}") return None def _extract_transcription(self, result) -> Optional[str]: """Extract transcription from different API response formats""" try: # Try different response formats transcription = None if isinstance(result, dict): # Gradio API format: {"data": [status, transcription, timestamps]} if 'data' in result and isinstance(result['data'], list) and len(result['data']) > 1: transcription = result['data'][1] # Direct API formats elif 'transcription' in result: transcription = result['transcription'] elif 'text' in result: transcription = result['text'] elif 'result' in result: transcription = result['result'] elif isinstance(result, list) and len(result) > 1: # Direct array format: [status, transcription, timestamps] transcription = result[1] return transcription except Exception as e: logger.error(f"Failed to extract transcription: {e}") return None class RealTimeTTSProcessor: """Real-time TTS processor for voice responses""" def __init__(self, tts_service_url: str): self.tts_service_url = tts_service_url.rstrip('/') async def synthesize_text(self, text: str, voice_preset: str = "v2/en_speaker_6") -> Optional[bytes]: """Synthesize text to speech using real TTS service""" try: logger.info(f"🔊 Real-time TTS: Synthesizing '{text[:50]}...'") # Implementation for TTS service calls # This will be implemented in Phase 4 (TTS integration) logger.info("🔊 TTS synthesis placeholder - Phase 4 implementation") return None except Exception as e: logger.error(f"🔊 TTS synthesis error: {e}") return None