# coqui-tss / utils.py
# Provenance (Hugging Face Space page residue): uploaded by R-TA,
# "Create utils.py", commit 4d0887b (verified).
"""
Utility functions for Multi-Language TTS application
"""
import os
import tempfile
import logging
from typing import Optional, Tuple, List
import numpy as np
import torch
import librosa
from pathlib import Path
logger = logging.getLogger(__name__)
def get_device() -> str:
    """Pick the fastest available torch device: CUDA, then Apple MPS, then CPU."""
    if torch.cuda.is_available():
        return "cuda"
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"  # Apple Silicon
    return "cpu"
def validate_text(text: str, max_length: int = 1000) -> str:
    """Strip surrounding whitespace and cap the text at ``max_length`` characters.

    Raises:
        ValueError: if ``text`` is empty or whitespace-only.
    """
    cleaned = text.strip() if text else ""
    if not cleaned:
        raise ValueError("Text cannot be empty")
    if len(cleaned) > max_length:
        logger.warning(f"Text truncated from {len(cleaned)} to {max_length} characters")
        cleaned = cleaned[:max_length]
    return cleaned
def validate_audio_file(file_path: str) -> bool:
    """Return True when ``file_path`` exists and carries a supported audio extension."""
    if not file_path:
        return False
    if not os.path.exists(file_path):
        return False
    extension = Path(file_path).suffix.lower()
    return extension in {".wav", ".mp3", ".flac", ".ogg", ".m4a"}
def create_temp_audio_file(suffix: str = ".wav") -> str:
    """Create an empty temporary file (kept on disk, not auto-deleted) and return its path."""
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as handle:
        return handle.name
def cleanup_temp_file(file_path: str) -> None:
    """Best-effort removal of a temporary file; failures are logged, never raised."""
    try:
        removable = bool(file_path) and os.path.exists(file_path)
        if removable:
            os.unlink(file_path)
    except Exception as e:
        logger.warning(f"Failed to cleanup temp file {file_path}: {e}")
def load_audio(file_path: str, target_sr: int = 22050) -> Tuple[np.ndarray, int]:
    """Load an audio file resampled to ``target_sr``.

    Raises:
        ValueError: if the file cannot be decoded (original error is logged).
    """
    try:
        samples, sample_rate = librosa.load(file_path, sr=target_sr)
    except Exception as e:
        logger.error(f"Failed to load audio from {file_path}: {e}")
        raise ValueError(f"Could not load audio file: {e}")
    return samples, sample_rate
def normalize_audio(audio: np.ndarray) -> np.ndarray:
    """Scale ``audio`` so its peak magnitude is 1.0; empty or silent input is returned as-is."""
    if len(audio) == 0:
        return audio
    peak = np.abs(audio).max()
    # Guard against dividing by zero on an all-silent signal.
    return audio / peak if peak > 0 else audio
def get_supported_languages() -> List[str]:
    """Return the language names declared in ``config.LANGUAGE_MODELS`` (insertion order)."""
    # Local import keeps module import light and avoids a circular dependency.
    from config import LANGUAGE_MODELS
    return [language for language in LANGUAGE_MODELS]
def format_duration(seconds: float) -> str:
    """Render a duration as ``###ms``, ``#.#s``, or ``#m #.#s`` depending on magnitude."""
    if seconds < 1:
        return f"{seconds*1000:.0f}ms"
    if seconds < 60:
        return f"{seconds:.1f}s"
    whole_minutes, remainder = divmod(seconds, 60)
    return f"{int(whole_minutes)}m {remainder:.1f}s"
def estimate_synthesis_time(text_length: int, language: str = "English") -> float:
    """Rough synthesis-time estimate: a per-character cost by language plus a flat 2s overhead."""
    # Seconds of synthesis per input character, per language.
    per_char_seconds = {
        "English": 0.05,
        "Korean": 0.08,
        "German": 0.06,
        "Spanish": 0.05,
    }
    rate = per_char_seconds.get(language, 0.06)  # 0.06 s/char for unknown languages
    return text_length * rate + 2.0  # flat 2s model/startup overhead
def log_system_info():
    """Log device choice, PyTorch version, and CUDA details to aid debugging."""
    logger.info(f"Device: {get_device()}")
    logger.info(f"PyTorch version: {torch.__version__}")
    cuda_ok = torch.cuda.is_available()
    logger.info(f"CUDA available: {cuda_ok}")
    if cuda_ok:
        logger.info(f"CUDA device: {torch.cuda.get_device_name()}")
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        logger.info(f"CUDA memory: {total_gb:.1f}GB")
class AudioProcessor:
    """Stateless audio post-processing helpers (resampling, silence trimming, fades)."""

    @staticmethod
    def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
        """Resample ``audio`` from ``orig_sr`` to ``target_sr`` Hz.

        Returns the input unchanged when the rates already match.
        """
        if orig_sr == target_sr:
            return audio
        return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)

    @staticmethod
    def trim_silence(audio: np.ndarray, sr: int, threshold: float = 0.1) -> np.ndarray:
        """Trim leading/trailing silence quieter than ``threshold`` (linear amplitude, 0-1).

        Bug fix: ``threshold`` was previously ignored (top_db hard-coded to 20).
        It is now converted to librosa's dB threshold via
        ``top_db = -20*log10(threshold)``; the new default of 0.1 maps to the
        old top_db=20, so the no-argument behavior is unchanged.
        ``sr`` is not needed by librosa's trimmer; kept for interface
        compatibility.
        """
        top_db = -20.0 * np.log10(threshold)
        return librosa.effects.trim(audio, top_db=top_db)[0]

    @staticmethod
    def apply_fade(audio: np.ndarray, sr: int, fade_duration: float = 0.1) -> np.ndarray:
        """Apply a linear fade-in/fade-out of ``fade_duration`` seconds each.

        Bug fix: the input array was previously modified in place via ``*=``;
        the fade is now applied to a copy so the caller's buffer is untouched.
        Audio shorter than two fade windows (or a zero-length fade) is
        returned unchanged.
        """
        fade_samples = int(fade_duration * sr)
        if fade_samples <= 0 or len(audio) <= 2 * fade_samples:
            return audio
        faded = audio.copy()
        faded[:fade_samples] *= np.linspace(0, 1, fade_samples)
        faded[-fade_samples:] *= np.linspace(1, 0, fade_samples)
        return faded