""" Utility functions for Multi-Language TTS application """ import os import tempfile import logging from typing import Optional, Tuple, List import numpy as np import torch import librosa from pathlib import Path logger = logging.getLogger(__name__) def get_device() -> str: """Get the best available device for inference""" if torch.cuda.is_available(): return "cuda" elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(): return "mps" # Apple Silicon else: return "cpu" def validate_text(text: str, max_length: int = 1000) -> str: """Validate and clean input text""" if not text or not text.strip(): raise ValueError("Text cannot be empty") text = text.strip() if len(text) > max_length: logger.warning(f"Text truncated from {len(text)} to {max_length} characters") text = text[:max_length] return text def validate_audio_file(file_path: str) -> bool: """Validate audio file format and accessibility""" if not file_path or not os.path.exists(file_path): return False supported_formats = ['.wav', '.mp3', '.flac', '.ogg', '.m4a'] file_ext = Path(file_path).suffix.lower() return file_ext in supported_formats def create_temp_audio_file(suffix: str = ".wav") -> str: """Create a temporary audio file""" temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False) temp_file.close() return temp_file.name def cleanup_temp_file(file_path: str) -> None: """Safely remove temporary file""" try: if file_path and os.path.exists(file_path): os.unlink(file_path) except Exception as e: logger.warning(f"Failed to cleanup temp file {file_path}: {e}") def load_audio(file_path: str, target_sr: int = 22050) -> Tuple[np.ndarray, int]: """Load audio file with proper error handling""" try: audio, sr = librosa.load(file_path, sr=target_sr) return audio, sr except Exception as e: logger.error(f"Failed to load audio from {file_path}: {e}") raise ValueError(f"Could not load audio file: {e}") def normalize_audio(audio: np.ndarray) -> np.ndarray: """Normalize audio to prevent clipping""" if len(audio) == 0: return audio # Normalize to [-1, 1] range max_val = np.max(np.abs(audio)) if max_val > 0: audio = audio / max_val return audio def get_supported_languages() -> List[str]: """Get list of supported languages""" from config import LANGUAGE_MODELS return list(LANGUAGE_MODELS.keys()) def format_duration(seconds: float) -> str: """Format duration in seconds to human readable format""" if seconds < 1: return f"{seconds*1000:.0f}ms" elif seconds < 60: return f"{seconds:.1f}s" else: minutes = int(seconds // 60) seconds = seconds % 60 return f"{minutes}m {seconds:.1f}s" def estimate_synthesis_time(text_length: int, language: str = "English") -> float: """Estimate synthesis time based on text length and language""" # Base time estimates (seconds per character) base_times = { "English": 0.05, "Korean": 0.08, "German": 0.06, "Spanish": 0.05 } base_time = base_times.get(language, 0.06) return text_length * base_time + 2.0 # Add 2s overhead def log_system_info(): """Log system information for debugging""" logger.info(f"Device: {get_device()}") logger.info(f"PyTorch version: {torch.__version__}") logger.info(f"CUDA available: {torch.cuda.is_available()}") if torch.cuda.is_available(): logger.info(f"CUDA device: {torch.cuda.get_device_name()}") logger.info(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB") class AudioProcessor: """Audio processing utilities""" @staticmethod def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray: """Resample audio to target sample rate""" if orig_sr == target_sr: return audio return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr) @staticmethod def trim_silence(audio: np.ndarray, sr: int, threshold: float = 0.01) -> np.ndarray: """Trim silence from beginning and end of audio""" return librosa.effects.trim(audio, top_db=20)[0] @staticmethod def apply_fade(audio: np.ndarray, sr: int, fade_duration: float = 0.1) -> np.ndarray: """Apply fade in/out to audio""" fade_samples = int(fade_duration * sr) if len(audio) <= 2 * fade_samples: return audio # Fade in audio[:fade_samples] *= np.linspace(0, 1, fade_samples) # Fade out audio[-fade_samples:] *= np.linspace(1, 0, fade_samples) return audio