Spaces:
Running
Running
File size: 4,917 Bytes
e568430 8e9e85e e568430 71ca2eb e568430 71ca2eb e568430 8e9e85e e568430 8e9e85e b4f9ff5 8e9e85e e568430 8e9e85e e568430 71ca2eb e568430 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 |
"""Unified audio processing service for STT and TTS integration."""
from functools import lru_cache
from typing import Any
import numpy as np
import structlog
from src.agents.audio_refiner import audio_refiner
from src.services.stt_gradio import STTService, get_stt_service
from src.utils.config import settings
logger = structlog.get_logger(__name__)
# Optional TTS backend: try to import it and record whether that worked.
# When the import fails, the two names are still bound (to None) so the rest
# of the module can reference them unconditionally.
try:
    from src.services.tts_modal import TTSService, get_tts_service
except ImportError:
    _TTS_AVAILABLE = False
    TTSService = None  # type: ignore[assignment, misc]
    get_tts_service = None  # type: ignore[assignment, misc]
else:
    _TTS_AVAILABLE = True
class AudioService:
    """Unified audio processing service.

    Holds a speech-to-text service (``self.stt``) and an optional
    text-to-speech service (``self.tts``, which may be ``None``). Both
    coroutine entry points log failures and return ``None`` instead of
    raising.
    """

    def __init__(
        self,
        stt_service: STTService | None = None,
        tts_service: Any | None = None,
    ) -> None:
        """Initialize audio service with STT and TTS.

        Args:
            stt_service: STT service instance (default: get_stt_service())
            tts_service: TTS service instance (default: get_tts_service() if available)
        """
        # Compare against None rather than relying on truthiness, so an
        # explicitly injected service is always honoured. This matches how
        # tts_service is handled just below.
        self.stt = stt_service if stt_service is not None else get_stt_service()

        # TTS is optional (requires Modal)
        if tts_service is not None:
            self.tts = tts_service
        elif _TTS_AVAILABLE and settings.modal_available:
            try:
                self.tts = get_tts_service()  # type: ignore[misc]
            except Exception as e:
                # Construction failure is not fatal: run without TTS.
                logger.warning("tts_service_unavailable", error=str(e))
                self.tts = None
        else:
            self.tts = None

    async def process_audio_input(
        self,
        audio_input: tuple[int, np.ndarray[Any, Any]] | None,  # type: ignore[type-arg]
        hf_token: str | None = None,
    ) -> str | None:
        """Process audio input and return transcribed text.

        Args:
            audio_input: Tuple of (sample_rate, audio_array) or None
            hf_token: HuggingFace token for authenticated Gradio Spaces

        Returns:
            Transcribed text string, or None if there is no audio input or
            transcription fails
        """
        if audio_input is None:
            return None

        try:
            transcribed_text = await self.stt.transcribe_audio(audio_input, hf_token=hf_token)
            logger.info("audio_input_processed", text_length=len(transcribed_text))
            return transcribed_text
        except Exception as e:
            logger.error("audio_input_processing_failed", error=str(e))
            # Return None on failure (graceful degradation)
            return None

    async def generate_audio_output(
        self,
        text: str,
        voice: str | None = None,
        speed: float | None = None,
    ) -> tuple[int, np.ndarray[Any, Any]] | None:  # type: ignore[type-arg]
        """Generate audio output from text.

        Args:
            text: Text to synthesize (markdown will be cleaned for audio)
            voice: Voice ID (default: settings.tts_voice)
            speed: Speech speed (default: settings.tts_speed)

        Returns:
            Tuple of (sample_rate, audio_array), or None if TTS is unavailable,
            the text is empty (before or after refinement), or synthesis fails
        """
        if self.tts is None:
            logger.warning("tts_unavailable", message="TTS service not available")
            return None

        if not text or not text.strip():
            logger.warning("empty_text_for_tts")
            return None

        try:
            # Refine text for audio (remove markdown, citations, etc.)
            # Use LLM polish if enabled in settings
            refined_text = await audio_refiner.refine_for_audio(
                text, use_llm_polish=settings.tts_use_llm_polish
            )
            logger.info(
                "text_refined_for_audio",
                original_length=len(text),
                refined_length=len(refined_text),
                llm_polish_enabled=settings.tts_use_llm_polish,
            )

            # Refinement removes content, so a non-empty input can still end
            # up with nothing left to speak. Apply the same empty-text guard
            # here instead of sending a blank request to the TTS backend.
            if not refined_text or not refined_text.strip():
                logger.warning("empty_text_for_tts")
                return None

            # Use provided voice/speed or fallback to settings defaults
            voice = voice if voice else settings.tts_voice
            speed = speed if speed is not None else settings.tts_speed

            audio_output = await self.tts.synthesize_async(refined_text, voice, speed)  # type: ignore[misc]

            if audio_output:
                logger.info(
                    "audio_output_generated",
                    text_length=len(text),
                    sample_rate=audio_output[0],
                )

            return audio_output  # type: ignore[no-any-return]
        except Exception as e:
            logger.error("audio_output_generation_failed", error=str(e))
            # Return None on failure (graceful degradation)
            return None
@lru_cache(maxsize=1)
def get_audio_service() -> AudioService:
    """Return the shared AudioService, building it on the first call.

    The call takes no arguments and is memoized by the single-slot
    ``lru_cache``, so later calls hand back the same object.

    Returns:
        The cached AudioService instance
    """
    service = AudioService()
    return service
|