| | """ |
| | Utility functions for Multi-Language TTS application |
| | """ |
| |
|
| | import os |
| | import tempfile |
| | import logging |
| | from typing import Optional, Tuple, List |
| | import numpy as np |
| | import torch |
| | import librosa |
| | from pathlib import Path |
| |
|
| | logger = logging.getLogger(__name__) |
| |
|
def get_device() -> str:
    """Pick the best available compute backend for inference.

    Preference order: CUDA GPU, then Apple MPS, then plain CPU.

    Returns:
        One of "cuda", "mps", or "cpu".
    """
    if torch.cuda.is_available():
        return "cuda"
    mps_backend = getattr(torch.backends, 'mps', None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cpu"
| |
|
def validate_text(text: str, max_length: int = 1000) -> str:
    """Strip and length-limit input text before synthesis.

    Args:
        text: Raw user-supplied text.
        max_length: Maximum number of characters kept (excess is cut off
            with a warning logged).

    Returns:
        The cleaned text, at most ``max_length`` characters long.

    Raises:
        ValueError: If the text is empty or whitespace-only.
    """
    cleaned = text.strip() if text else ""
    if not cleaned:
        raise ValueError("Text cannot be empty")

    if len(cleaned) > max_length:
        logger.warning(f"Text truncated from {len(cleaned)} to {max_length} characters")
        cleaned = cleaned[:max_length]

    return cleaned
| |
|
def validate_audio_file(file_path: str) -> bool:
    """Check that a path points at an existing file with a supported audio extension.

    Extension matching is case-insensitive; content is not inspected.
    """
    if not file_path:
        return False
    if not os.path.exists(file_path):
        return False

    extension = Path(file_path).suffix.lower()
    return extension in ('.wav', '.mp3', '.flac', '.ogg', '.m4a')
| |
|
def create_temp_audio_file(suffix: str = ".wav") -> str:
    """Allocate an empty temporary file on disk and return its path.

    The file is NOT auto-deleted; the caller is responsible for removing
    it (see cleanup_temp_file).
    """
    fd, path = tempfile.mkstemp(suffix=suffix)
    os.close(fd)
    return path
| |
|
def cleanup_temp_file(file_path: str) -> None:
    """Best-effort removal of a temporary file; never raises.

    Missing or empty paths are silently ignored; any other failure is
    logged as a warning rather than propagated.
    """
    if not file_path:
        return
    try:
        os.unlink(file_path)
    except FileNotFoundError:
        pass
    except Exception as e:
        logger.warning(f"Failed to cleanup temp file {file_path}: {e}")
| |
|
def load_audio(file_path: str, target_sr: int = 22050) -> Tuple[np.ndarray, int]:
    """Decode an audio file, resampled to ``target_sr`` by librosa.

    Args:
        file_path: Path to the audio file to decode.
        target_sr: Sample rate the audio is resampled to on load.

    Returns:
        ``(samples, sample_rate)`` — mono float samples and the rate used.

    Raises:
        ValueError: If librosa cannot read or decode the file.
    """
    try:
        # librosa.load already returns the (audio, sr) pair we promise.
        return librosa.load(file_path, sr=target_sr)
    except Exception as e:
        logger.error(f"Failed to load audio from {file_path}: {e}")
        raise ValueError(f"Could not load audio file: {e}")
| |
|
def normalize_audio(audio: np.ndarray) -> np.ndarray:
    """Scale audio so its absolute peak is 1.0.

    Empty and all-zero input is returned unchanged; the input array is
    never mutated (a scaled copy is returned otherwise).
    """
    if len(audio) == 0:
        return audio

    peak = np.abs(audio).max()
    return audio / peak if peak > 0 else audio
| |
|
def get_supported_languages() -> List[str]:
    """Return the names of all languages declared in config.LANGUAGE_MODELS.

    Imported lazily to avoid a circular import at module load time.
    """
    from config import LANGUAGE_MODELS

    return [language for language in LANGUAGE_MODELS]
| |
|
def format_duration(seconds: float) -> str:
    """Render a duration for display: milliseconds, seconds, or minutes.

    Examples: 0.5 -> "500ms", 12.34 -> "12.3s", 90 -> "1m 30.0s".
    """
    if seconds >= 60:
        whole_minutes, remainder = divmod(seconds, 60)
        return f"{int(whole_minutes)}m {remainder:.1f}s"
    if seconds >= 1:
        return f"{seconds:.1f}s"
    return f"{seconds*1000:.0f}ms"
| |
|
def estimate_synthesis_time(text_length: int, language: str = "English") -> float:
    """Rough wall-clock estimate for synthesizing a text of given length.

    Args:
        text_length: Number of characters to synthesize.
        language: Language name; unknown languages fall back to a
            middle-of-the-road per-character rate.

    Returns:
        Estimated seconds: a per-character rate times the length, plus a
        fixed 2-second model/setup overhead.
    """
    per_char = {
        "English": 0.05,
        "Korean": 0.08,
        "German": 0.06,
        "Spanish": 0.05,
    }.get(language, 0.06)
    return 2.0 + per_char * text_length
| |
|
def log_system_info():
    """Emit device and PyTorch diagnostics to the module logger (debug aid)."""
    logger.info(f"Device: {get_device()}")
    logger.info(f"PyTorch version: {torch.__version__}")

    cuda_ok = torch.cuda.is_available()
    logger.info(f"CUDA available: {cuda_ok}")
    if not cuda_ok:
        return

    logger.info(f"CUDA device: {torch.cuda.get_device_name()}")
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    logger.info(f"CUDA memory: {total_gb:.1f}GB")
| |
|
class AudioProcessor:
    """Stateless audio processing utilities (resampling, trimming, fades)."""

    @staticmethod
    def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
        """Resample ``audio`` from ``orig_sr`` to ``target_sr`` Hz.

        Returns the input array unchanged when the rates already match.
        """
        if orig_sr == target_sr:
            return audio
        return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)

    @staticmethod
    def trim_silence(audio: np.ndarray, sr: int, threshold: float = 0.01) -> np.ndarray:
        """Trim leading/trailing silence quieter than ``threshold``.

        Args:
            audio: Input samples.
            sr: Sample rate (unused by librosa's trim, kept for interface
                compatibility).
            threshold: Linear amplitude below which audio counts as silence.

        Bug fix: ``threshold`` was previously ignored in favor of a
        hard-coded ``top_db=20``. It is now converted to the equivalent
        dB cutoff (e.g. 0.01 -> 40 dB below peak).
        """
        # Clamp to avoid log10(0); -20*log10(amplitude) gives the dB cutoff.
        top_db = -20.0 * np.log10(max(threshold, 1e-10))
        return librosa.effects.trim(audio, top_db=top_db)[0]

    @staticmethod
    def apply_fade(audio: np.ndarray, sr: int, fade_duration: float = 0.1) -> np.ndarray:
        """Return a copy of ``audio`` with linear fade-in/out ramps applied.

        Args:
            audio: Input samples (any numeric dtype; integer input is
                promoted to float64 so the ramp multiply works).
            sr: Sample rate in Hz, used to size the ramps.
            fade_duration: Length of each ramp in seconds.

        Audio too short for two full ramps (or a zero-length ramp) is
        returned unchanged.

        Bug fixes vs. the previous version: the input array is no longer
        mutated in place, and ``fade_samples == 0`` no longer triggers a
        broadcast error via the ``audio[-0:]`` whole-array slice.
        """
        fade_samples = int(fade_duration * sr)
        if fade_samples <= 0 or len(audio) <= 2 * fade_samples:
            return audio

        if np.issubdtype(audio.dtype, np.floating):
            faded = audio.copy()
        else:
            # In-place float multiplication would fail on integer PCM.
            faded = audio.astype(np.float64)
        faded[:fade_samples] *= np.linspace(0.0, 1.0, fade_samples)
        faded[-fade_samples:] *= np.linspace(1.0, 0.0, fade_samples)
        return faded
| |
|