# coqui-tss / utils.py
# Provenance (Hugging Face Space page residue): uploaded by R-TA,
# "Create utils.py", commit 4d0887b (verified).
"""
Utility functions for Multi-Language TTS application
"""
import os
import tempfile
import logging
from typing import Optional, Tuple, List
import numpy as np
import torch
import librosa
from pathlib import Path
logger = logging.getLogger(__name__)
def get_device() -> str:
    """Pick the fastest available torch device: CUDA, then Apple MPS, then CPU."""
    if torch.cuda.is_available():
        return "cuda"
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"  # Apple Silicon
    return "cpu"
def validate_text(text: str, max_length: int = 1000) -> str:
    """Strip surrounding whitespace and cap the text at ``max_length`` characters.

    Raises:
        ValueError: if ``text`` is empty or whitespace-only.
    """
    cleaned = text.strip() if text else ""
    if not cleaned:
        raise ValueError("Text cannot be empty")
    if len(cleaned) > max_length:
        logger.warning(f"Text truncated from {len(cleaned)} to {max_length} characters")
        cleaned = cleaned[:max_length]
    return cleaned
def validate_audio_file(file_path: str) -> bool:
    """Return True when ``file_path`` exists and carries a supported audio extension."""
    if not file_path:
        return False
    if not os.path.exists(file_path):
        return False
    extension = Path(file_path).suffix.lower()
    return extension in {".wav", ".mp3", ".flac", ".ogg", ".m4a"}
def create_temp_audio_file(suffix: str = ".wav") -> str:
    """Create an empty temporary file (kept on disk, not auto-deleted) and return its path."""
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        return handle.name
def cleanup_temp_file(file_path: str) -> None:
    """Best-effort removal of a temporary file; failures are logged, never raised."""
    try:
        removable = bool(file_path) and os.path.exists(file_path)
        if removable:
            os.unlink(file_path)
    except Exception as e:
        logger.warning(f"Failed to cleanup temp file {file_path}: {e}")
def load_audio(file_path: str, target_sr: int = 22050) -> Tuple[np.ndarray, int]:
    """Load an audio file resampled to ``target_sr``.

    Raises:
        ValueError: if the file cannot be decoded (original error is logged).
    """
    try:
        samples, sample_rate = librosa.load(file_path, sr=target_sr)
    except Exception as e:
        logger.error(f"Failed to load audio from {file_path}: {e}")
        raise ValueError(f"Could not load audio file: {e}")
    return samples, sample_rate
def normalize_audio(audio: np.ndarray) -> np.ndarray:
    """Scale ``audio`` so its peak magnitude is 1.0; empty or silent input is returned as-is."""
    if len(audio) == 0:
        return audio
    peak = np.abs(audio).max()
    # Guard against dividing by zero on an all-silent signal.
    return audio / peak if peak > 0 else audio
def get_supported_languages() -> List[str]:
    """Return the language names declared in ``config.LANGUAGE_MODELS`` (insertion order)."""
    # Local import keeps module import light and avoids a circular dependency.
    from config import LANGUAGE_MODELS
    return [language for language in LANGUAGE_MODELS]
def format_duration(seconds: float) -> str:
    """Render a duration as ``###ms``, ``#.#s``, or ``#m #.#s`` depending on magnitude."""
    if seconds < 1:
        return f"{seconds*1000:.0f}ms"
    if seconds < 60:
        return f"{seconds:.1f}s"
    whole_minutes, remainder = divmod(seconds, 60)
    return f"{int(whole_minutes)}m {remainder:.1f}s"
def estimate_synthesis_time(text_length: int, language: str = "English") -> float:
    """Rough synthesis-time estimate: a per-character cost by language plus a flat 2s overhead."""
    # Seconds of synthesis per input character, per language.
    per_char_seconds = {
        "English": 0.05,
        "Korean": 0.08,
        "German": 0.06,
        "Spanish": 0.05,
    }
    rate = per_char_seconds.get(language, 0.06)  # 0.06 s/char for unknown languages
    return text_length * rate + 2.0  # flat 2s model/startup overhead
def log_system_info():
    """Log device choice, PyTorch version, and CUDA details to aid debugging."""
    logger.info(f"Device: {get_device()}")
    logger.info(f"PyTorch version: {torch.__version__}")
    cuda_ok = torch.cuda.is_available()
    logger.info(f"CUDA available: {cuda_ok}")
    if cuda_ok:
        logger.info(f"CUDA device: {torch.cuda.get_device_name()}")
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        logger.info(f"CUDA memory: {total_gb:.1f}GB")
class AudioProcessor:
    """Stateless audio post-processing helpers (resampling, silence trimming, fades)."""

    @staticmethod
    def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
        """Resample ``audio`` from ``orig_sr`` to ``target_sr`` Hz.

        Returns the input unchanged when the rates already match.
        """
        if orig_sr == target_sr:
            return audio
        return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)

    @staticmethod
    def trim_silence(audio: np.ndarray, sr: int, threshold: float = 0.1) -> np.ndarray:
        """Trim leading/trailing silence quieter than ``threshold`` (linear amplitude, 0-1).

        Bug fix: ``threshold`` was previously ignored (top_db hard-coded to 20).
        It is now converted to librosa's dB threshold via
        ``top_db = -20*log10(threshold)``; the new default of 0.1 maps to the
        old top_db=20, so the no-argument behavior is unchanged.
        ``sr`` is not needed by librosa's trimmer; kept for interface
        compatibility.
        """
        top_db = -20.0 * np.log10(threshold)
        return librosa.effects.trim(audio, top_db=top_db)[0]

    @staticmethod
    def apply_fade(audio: np.ndarray, sr: int, fade_duration: float = 0.1) -> np.ndarray:
        """Apply a linear fade-in/fade-out of ``fade_duration`` seconds each.

        Bug fix: the input array was previously modified in place via ``*=``;
        the fade is now applied to a copy so the caller's buffer is untouched.
        Audio shorter than two fade windows (or a zero-length fade) is
        returned unchanged.
        """
        fade_samples = int(fade_duration * sr)
        if fade_samples <= 0 or len(audio) <= 2 * fade_samples:
            return audio
        faded = audio.copy()
        faded[:fade_samples] *= np.linspace(0, 1, fade_samples)
        faded[-fade_samples:] *= np.linspace(1, 0, fade_samples)
        return faded