"""
Audio utilities for processing and file I/O.

Handles loading, saving, and processing audio files.
"""

import logging
import os
import tempfile
from typing import Tuple, Optional

import librosa
import numpy as np
import soundfile as sf

logger = logging.getLogger(__name__)


class AudioProcessor:
    """Handles audio file operations and processing.

    All methods are static; the class only serves as a namespace.
    """

    # File extensions (lower-case, without the dot) accepted by
    # validate_audio_file().
    SUPPORTED_FORMATS = ['wav', 'mp3', 'm4a', 'flac', 'ogg']
    DEFAULT_SAMPLE_RATE = 24000  # For XTTS-v2

    @staticmethod
    def load_audio(
        file_path: str,
        sr: Optional[int] = None,
        mono: bool = True
    ) -> Tuple[np.ndarray, int]:
        """
        Load an audio file with librosa.

        Args:
            file_path: Path to audio file
            sr: Target sample rate (None = keep the file's original rate)
            mono: Convert to mono if True

        Returns:
            Tuple of (audio_waveform, sample_rate)

        Raises:
            Exception: Whatever librosa.load raises is logged and re-raised
                unchanged.
        """
        logger.info("Loading audio from: %s", file_path)

        try:
            # Load with librosa for flexibility
            audio, sample_rate = librosa.load(file_path, sr=sr, mono=mono)
        except Exception as e:
            logger.error("Error loading audio: %s", e)
            raise

        logger.info("Audio loaded. Shape: %s, SR: %s", audio.shape, sample_rate)
        return audio, sample_rate

    @staticmethod
    def save_audio(
        audio_waveform: np.ndarray,
        sample_rate: int,
        output_path: str,
        subtype: str = 'PCM_16'
    ) -> str:
        """
        Save audio to a file with soundfile.

        The parent directory of ``output_path`` is created if it does not
        exist yet. A bare filename (no directory component) is written to the
        current working directory.

        NOTE: soundfile expects multichannel data shaped (frames, channels),
        whereas librosa.load(mono=False) returns (channels, samples);
        transpose such arrays before saving.

        Args:
            audio_waveform: Audio waveform array
            sample_rate: Sample rate
            output_path: Output file path
            subtype: Audio subtype (PCM_16, PCM_24, PCM_32, FLOAT)

        Returns:
            Path to saved file

        Raises:
            Exception: Any error from directory creation or writing is logged
                and re-raised unchanged.
        """
        logger.info("Saving audio to: %s", output_path)

        try:
            # Ensure output directory exists. os.makedirs('') raises
            # FileNotFoundError, so skip it when the path has no directory
            # component.
            output_dir = os.path.dirname(output_path)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            # Save audio
            sf.write(output_path, audio_waveform, sample_rate, subtype=subtype)

            logger.info(
                "Audio saved successfully. Size: %s bytes",
                os.path.getsize(output_path),
            )
            return output_path
        except Exception as e:
            logger.error("Error saving audio: %s", e)
            raise

    @staticmethod
    def resample_audio(
        audio: np.ndarray,
        orig_sr: int,
        target_sr: int
    ) -> np.ndarray:
        """
        Resample audio to target sample rate.

        Args:
            audio: Audio waveform
            orig_sr: Original sample rate
            target_sr: Target sample rate

        Returns:
            Resampled audio; the input array itself (not a copy) when the
            two rates are already equal.
        """
        if orig_sr == target_sr:
            return audio

        logger.info("Resampling from %s to %s", orig_sr, target_sr)
        return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)

    @staticmethod
    def concatenate_audio(*audio_arrays) -> np.ndarray:
        """
        Concatenate multiple audio arrays along the first axis.

        Args:
            *audio_arrays: Variable number of audio arrays

        Returns:
            Concatenated audio array

        Raises:
            ValueError: Raised by numpy when no arrays are given or their
                shapes are incompatible.
        """
        logger.info("Concatenating %s audio segments", len(audio_arrays))
        return np.concatenate(audio_arrays)

    @staticmethod
    def get_audio_duration(audio: np.ndarray, sr: int) -> float:
        """
        Get duration of audio in seconds.

        The sample count is taken from the length of the first axis, so this
        is correct for 1-D (mono) waveforms. NOTE(review): for multichannel
        arrays shaped (channels, samples) the first axis is the channel
        count -- confirm callers only pass mono audio.

        Args:
            audio: Audio waveform
            sr: Sample rate in Hz

        Returns:
            Duration in seconds
        """
        return len(audio) / sr

    @staticmethod
    def validate_audio_file(file_path: str) -> bool:
        """
        Validate if file has a supported audio extension.

        Only the file name is inspected; the file is not opened. The
        comparison is case-insensitive.

        Args:
            file_path: Path to audio file

        Returns:
            True if the extension is in SUPPORTED_FORMATS, False otherwise
        """
        # splitext only looks at the final path component, so dots in
        # directory names and extensionless files are handled correctly.
        ext = os.path.splitext(file_path)[1].lstrip('.').lower()
        is_valid = ext in AudioProcessor.SUPPORTED_FORMATS

        if not is_valid:
            logger.warning("Unsupported format: %s", ext)

        return is_valid

    @staticmethod
    def create_temp_audio_file(suffix: str = '.wav') -> str:
        """
        Create an empty temporary audio file.

        The file is left on disk (delete=False); the caller is responsible
        for removing it, e.g. with cleanup_temp_file().

        Args:
            suffix: File name suffix, including the leading dot

        Returns:
            Path to temporary file
        """
        # Close the handle right away: only the path is needed, and an open
        # handle would leak a descriptor (and block reopening on Windows).
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
            temp_path = temp_file.name

        logger.info("Created temporary file: %s", temp_path)
        return temp_path

    @staticmethod
    def cleanup_temp_file(file_path: str):
        """
        Delete temporary file safely.

        Best-effort: a missing file is silently ignored and any other
        failure is logged as a warning instead of being raised.

        Args:
            file_path: Path to file to delete
        """
        try:
            os.remove(file_path)
        except FileNotFoundError:
            # Nothing to delete; same outcome as the file having been removed.
            return
        except Exception as e:
            logger.warning("Could not delete file %s: %s", file_path, e)
            return

        logger.info("Deleted temporary file: %s", file_path)

    @staticmethod
    def normalize_audio(audio: np.ndarray, target_db: float = -20.0) -> np.ndarray:
        """
        Normalize audio to a target RMS level.

        The waveform is scaled so that its RMS equals ``10 ** (target_db / 20)``
        and then hard-limited to [-1.0, 1.0].

        Args:
            audio: Audio waveform
            target_db: Target RMS level in dB

        Returns:
            Normalized audio; the input array itself when it is empty or
            completely silent (RMS of 0).
        """
        # An empty array has no defined RMS (np.mean would yield NaN and
        # emit a RuntimeWarning), so there is nothing to scale.
        if audio.size == 0:
            return audio

        # Calculate RMS
        rms = np.sqrt(np.mean(audio ** 2))

        # Silence cannot be scaled to a target level (division by zero).
        if rms == 0:
            return audio

        # Convert target dB to linear scale
        target_linear = 10 ** (target_db / 20.0)

        # Scale audio
        normalized = audio * (target_linear / rms)

        # Limit samples to the valid [-1, 1] range so peaks pushed past
        # full scale by the gain do not overflow on export.
        normalized = np.clip(normalized, -1.0, 1.0)

        logger.info("Audio normalized to %s dB", target_db)
        return normalized