Spaces:
Sleeping
Sleeping
| """Audio utility functions for format conversion and processing.""" | |
| import io | |
| import numpy as np | |
| import soundfile as sf | |
def to_mono(audio: np.ndarray) -> np.ndarray:
    """
    Collapse two-dimensional (multi-channel) audio into a single channel.

    Args:
        audio: Audio array, shape (samples,) for mono or
            (samples, channels) for stereo.

    Returns:
        For 2-D input, the per-sample mean across axis 1, shape (samples,).
        Input of any other dimensionality is returned unchanged (same object).
    """
    has_channel_axis = audio.ndim == 2
    if not has_channel_axis:
        # Nothing to mix down; hand the caller's array straight back.
        return audio
    return np.mean(audio, axis=1)
def to_float32(audio: np.ndarray) -> np.ndarray:
    """
    Ensure audio is float32 in [-1, 1] range.

    The input is cast to float32. If its absolute peak exceeds 1.0, the whole
    signal is divided by that peak; data whose peak is already at or below
    1.0 is only cast, not rescaled.

    Args:
        audio: Audio array in any numeric format

    Returns:
        Audio array as float32 normalized to [-1, 1]. Empty input yields an
        empty float32 array of the same shape.
    """
    audio = audio.astype(np.float32)

    # np.max raises ValueError on a zero-size array, so there is nothing to
    # measure or scale for empty audio.
    if audio.size == 0:
        return audio

    # Only rescale when the signal is not already within [-1, 1].
    max_val = np.max(np.abs(audio))
    if max_val > 1.0:
        audio = audio / max_val
    return audio
def normalize(audio: np.ndarray, peak: float = 0.95) -> np.ndarray:
    """
    Normalize audio so peak amplitude equals given value.

    Args:
        audio: Audio array
        peak: Target peak amplitude (default 0.95 to avoid clipping)

    Returns:
        Normalized audio array. Empty input and all-zero (silent) input are
        returned unchanged, since there is no peak to scale from.
    """
    # np.max raises ValueError on a zero-size array; treat empty audio like
    # silence and return it as-is.
    if np.size(audio) == 0:
        return audio

    max_val = np.max(np.abs(audio))
    # Skip the division for silent input to avoid dividing by zero.
    if max_val > 0:
        audio = audio / max_val * peak
    return audio
def pad_or_trim(audio: np.ndarray, target_length: int) -> np.ndarray:
    """
    Pad with zeros or trim audio to target length.

    Length is measured along the first axis, so both mono (samples,) and
    multi-channel (samples, channels) arrays are supported.

    Args:
        audio: Audio array
        target_length: Desired length in samples (must be non-negative)

    Returns:
        Audio array with exactly target_length samples. The input object is
        returned as-is when it already has the requested length.

    Raises:
        ValueError: If target_length is negative.
    """
    # A negative length would make the slice below trim from the end instead
    # of producing the requested size.
    if target_length < 0:
        raise ValueError(
            f"target_length must be non-negative, got {target_length}"
        )

    current_length = len(audio)
    if current_length == target_length:
        return audio
    if current_length > target_length:
        return audio[:target_length]

    # Pad with zeros; mirror any trailing dimensions (e.g. channels) so the
    # concatenation along axis 0 is shape-compatible.
    pad_shape = (target_length - current_length, *audio.shape[1:])
    padding = np.zeros(pad_shape, dtype=audio.dtype)
    return np.concatenate([audio, padding])
def encode_wav_to_bytes(audio: np.ndarray, sr: int) -> bytes:
    """
    Serialize an audio array into an in-memory WAV file.

    Args:
        audio: Audio array
        sr: Sample rate

    Returns:
        The complete WAV file contents as bytes
    """
    with io.BytesIO() as wav_buffer:
        sf.write(wav_buffer, audio, sr, format='WAV')
        # getvalue() returns the whole buffer regardless of the current
        # stream position, so no rewind is required.
        return wav_buffer.getvalue()
def encode_flac_to_bytes(audio: np.ndarray, sr: int) -> bytes:
    """
    Encode numpy array to FLAC bytes (lossless, ~50% smaller than WAV).

    float32/float64 input is converted to int16 before encoding. Samples
    outside [-1, 1] are clipped first so they saturate at full scale rather
    than overflowing the int16 cast. Arrays of any other dtype are passed to
    the encoder unchanged.

    Args:
        audio: Audio array
        sr: Sample rate

    Returns:
        FLAC file as bytes
    """
    # Convert float32 to int16 for FLAC (better compression)
    if audio.dtype == np.float32 or audio.dtype == np.float64:
        # Clip before scaling: an out-of-range product such as
        # 1.5 * 32767 does not fit in int16 and would not survive the cast.
        audio_int = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    else:
        audio_int = audio

    buf = io.BytesIO()
    sf.write(buf, audio_int, sr, format='FLAC')
    buf.seek(0)
    return buf.read()