Spaces:

R-TA
/

coqui-tss

Runtime error

App Files Files Community

R-TA committed on Aug 16, 2025

Commit

4d0887b

verified ·

1 Parent(s): 91cdcbf

Create utils.py

Browse files

Files changed (1) hide show

utils.py +148 -0

utils.py ADDED Viewed

	@@ -0,0 +1,148 @@

+"""
+Utility functions for Multi-Language TTS application
+"""
+import os
+import tempfile
+import logging
+from typing import Optional, Tuple, List
+import numpy as np
+import torch
+import librosa
+from pathlib import Path
+logger = logging.getLogger(__name__)
+def get_device() -> str:
+    """Get the best available device for inference"""
+    if torch.cuda.is_available():
+        return "cuda"
+    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+        return "mps"  # Apple Silicon
+    else:
+        return "cpu"
+def validate_text(text: str, max_length: int = 1000) -> str:
+    """Validate and clean input text"""
+    if not text or not text.strip():
+        raise ValueError("Text cannot be empty")
+    text = text.strip()
+    if len(text) > max_length:
+        logger.warning(f"Text truncated from {len(text)} to {max_length} characters")
+        text = text[:max_length]
+    return text
+def validate_audio_file(file_path: str) -> bool:
+    """Validate audio file format and accessibility"""
+    if not file_path or not os.path.exists(file_path):
+        return False
+    supported_formats = ['.wav', '.mp3', '.flac', '.ogg', '.m4a']
+    file_ext = Path(file_path).suffix.lower()
+    return file_ext in supported_formats
+def create_temp_audio_file(suffix: str = ".wav") -> str:
+    """Create a temporary audio file"""
+    temp_file = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
+    temp_file.close()
+    return temp_file.name
+def cleanup_temp_file(file_path: str) -> None:
+    """Safely remove temporary file"""
+    try:
+        if file_path and os.path.exists(file_path):
+            os.unlink(file_path)
+    except Exception as e:
+        logger.warning(f"Failed to cleanup temp file {file_path}: {e}")
+def load_audio(file_path: str, target_sr: int = 22050) -> Tuple[np.ndarray, int]:
+    """Load audio file with proper error handling"""
+    try:
+        audio, sr = librosa.load(file_path, sr=target_sr)
+        return audio, sr
+    except Exception as e:
+        logger.error(f"Failed to load audio from {file_path}: {e}")
+        raise ValueError(f"Could not load audio file: {e}")
+def normalize_audio(audio: np.ndarray) -> np.ndarray:
+    """Normalize audio to prevent clipping"""
+    if len(audio) == 0:
+        return audio
+    # Normalize to [-1, 1] range
+    max_val = np.max(np.abs(audio))
+    if max_val > 0:
+        audio = audio / max_val
+    return audio
+def get_supported_languages() -> List[str]:
+    """Get list of supported languages"""
+    from config import LANGUAGE_MODELS
+    return list(LANGUAGE_MODELS.keys())
+def format_duration(seconds: float) -> str:
+    """Format duration in seconds to human readable format"""
+    if seconds < 1:
+        return f"{seconds*1000:.0f}ms"
+    elif seconds < 60:
+        return f"{seconds:.1f}s"
+    else:
+        minutes = int(seconds // 60)
+        seconds = seconds % 60
+        return f"{minutes}m {seconds:.1f}s"
+def estimate_synthesis_time(text_length: int, language: str = "English") -> float:
+    """Estimate synthesis time based on text length and language"""
+    # Base time estimates (seconds per character)
+    base_times = {
+        "English": 0.05,
+        "Korean": 0.08,
+        "German": 0.06,
+        "Spanish": 0.05
+    }
+    base_time = base_times.get(language, 0.06)
+    return text_length * base_time + 2.0  # Add 2s overhead
+def log_system_info():
+    """Log system information for debugging"""
+    logger.info(f"Device: {get_device()}")
+    logger.info(f"PyTorch version: {torch.__version__}")
+    logger.info(f"CUDA available: {torch.cuda.is_available()}")
+    if torch.cuda.is_available():
+        logger.info(f"CUDA device: {torch.cuda.get_device_name()}")
+        logger.info(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")
+class AudioProcessor:
+    """Audio processing utilities"""
+    @staticmethod
+    def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
+        """Resample audio to target sample rate"""
+        if orig_sr == target_sr:
+            return audio
+        return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
+    @staticmethod
+    def trim_silence(audio: np.ndarray, sr: int, threshold: float = 0.01) -> np.ndarray:
+        """Trim silence from beginning and end of audio"""
+        return librosa.effects.trim(audio, top_db=20)[0]
+    @staticmethod
+    def apply_fade(audio: np.ndarray, sr: int, fade_duration: float = 0.1) -> np.ndarray:
+        """Apply fade in/out to audio"""
+        fade_samples = int(fade_duration * sr)
+        if len(audio) <= 2 * fade_samples:
+            return audio
+        # Fade in
+        audio[:fade_samples] *= np.linspace(0, 1, fade_samples)
+        # Fade out
+        audio[-fade_samples:] *= np.linspace(1, 0, fade_samples)
+        return audio