Spaces:
Sleeping
Sleeping
File size: 2,833 Bytes
# a0fcd39 (identifier carried over from the file-viewer header)
"""Audio utility functions for format conversion and processing."""
import io
import numpy as np
import soundfile as sf
def to_mono(audio: np.ndarray) -> np.ndarray:
    """
    Collapse multi-channel audio to a single channel by averaging.

    Args:
        audio: Audio array, shape (samples,) for mono or
            (samples, channels) for multi-channel input

    Returns:
        For 2-D input, the per-sample mean across channels with shape
        (samples,). Any other input is returned unchanged (same object).
    """
    # Anything that is not 2-D is passed through untouched.
    if audio.ndim != 2:
        return audio
    return audio.mean(axis=1)
def to_float32(audio: np.ndarray) -> np.ndarray:
    """
    Ensure audio is float32 in [-1, 1] range.

    The input is cast to float32. If its absolute peak exceeds 1.0 the
    whole signal is divided by that peak; otherwise the values are left
    as they are. Note that this is peak scaling, not full-scale scaling:
    integer PCM input ends up with a peak of exactly 1.0 regardless of
    how loud it was relative to the integer type's range.

    Args:
        audio: Audio array in any numeric format

    Returns:
        A new float32 array whose values lie in [-1, 1]. Empty input
        yields an empty float32 array of the same shape.
    """
    converted = audio.astype(np.float32)

    # np.max has no identity for zero-size arrays and would raise
    # ValueError, so empty audio is returned right after the cast.
    if converted.size == 0:
        return converted

    # Only rescale when the signal is not already within [-1, 1].
    max_val = np.max(np.abs(converted))
    if max_val > 1.0:
        converted = converted / max_val
    return converted
def normalize(audio: np.ndarray, peak: float = 0.95) -> np.ndarray:
    """
    Normalize audio so peak amplitude equals given value.

    Args:
        audio: Audio array
        peak: Target peak amplitude (default 0.95 to avoid clipping)

    Returns:
        Audio scaled so that its largest absolute sample equals ``peak``.
        Empty input and all-zero (silent) input are returned unchanged,
        since there is no peak to scale from.
    """
    # np.max raises ValueError on a zero-size array; nothing to scale anyway.
    if np.size(audio) == 0:
        return audio

    max_val = np.max(np.abs(audio))
    # Silence has no peak; skipping avoids a division by zero.
    if max_val > 0:
        audio = audio / max_val * peak
    return audio
def pad_or_trim(audio: np.ndarray, target_length: int) -> np.ndarray:
    """
    Pad with zeros or trim audio to target length.

    Length is measured along the first axis, so both mono (samples,) and
    multi-channel (samples, channels) arrays are supported; padding
    matches the trailing dimensions and dtype of the input.

    Args:
        audio: Audio array
        target_length: Desired length in samples (must be >= 0)

    Returns:
        Audio array with exactly target_length samples along axis 0.
        When no change is needed the input array itself is returned;
        trimming returns a slice of the input.

    Raises:
        ValueError: If target_length is negative.
    """
    # A negative length would make the slice below count from the end and
    # silently return the wrong number of samples.
    if target_length < 0:
        raise ValueError(
            f"target_length must be non-negative, got {target_length}"
        )

    current_length = len(audio)
    if current_length == target_length:
        return audio
    if current_length > target_length:
        return audio[:target_length]

    # Keep any trailing dimensions (e.g. channels) so that concatenation
    # also works for multi-channel input, not just 1-D audio.
    pad_shape = (target_length - current_length,) + tuple(audio.shape[1:])
    padding = np.zeros(pad_shape, dtype=audio.dtype)
    return np.concatenate([audio, padding], axis=0)
def encode_wav_to_bytes(audio: np.ndarray, sr: int) -> bytes:
    """
    Serialize an audio array into an in-memory WAV file.

    Args:
        audio: Audio array
        sr: Sample rate

    Returns:
        The complete WAV file contents as bytes
    """
    # Write into an in-memory buffer so nothing touches the filesystem.
    with io.BytesIO() as wav_buffer:
        sf.write(wav_buffer, audio, sr, format='WAV')
        return wav_buffer.getvalue()
def encode_flac_to_bytes(audio: np.ndarray, sr: int) -> bytes:
    """
    Encode numpy array to FLAC bytes (lossless, typically smaller than WAV).

    float32/float64 input is treated as normalized audio: samples are
    clipped to [-1, 1], scaled by 32767 and converted to int16 before
    encoding. Arrays of any other dtype are handed to the encoder as-is.

    Args:
        audio: Audio array
        sr: Sample rate

    Returns:
        FLAC file as bytes
    """
    buf = io.BytesIO()
    # Convert float32/float64 to int16 for FLAC (better compression)
    if audio.dtype == np.float32 or audio.dtype == np.float64:
        # Clip first: a sample outside [-1, 1] would exceed the int16 range
        # after scaling and wrap around on the cast, producing loud glitches.
        clipped = np.clip(audio, -1.0, 1.0)
        audio_int = (clipped * 32767).astype(np.int16)
    else:
        audio_int = audio
    sf.write(buf, audio_int, sr, format='FLAC')
    return buf.getvalue()
|