jam-tracks / backend /utils /audio_utils.py
Mina Emadi
updated the MVP-Initial upload
a0fcd39
"""Audio utility functions for format conversion and processing."""
import io
import numpy as np
import soundfile as sf
def to_mono(audio: np.ndarray) -> np.ndarray:
    """
    Collapse multi-channel audio into a single channel.

    Two-dimensional input is averaged across its second axis. Input with any
    other number of dimensions is handed back untouched (the same object).

    Args:
        audio: Audio array, shape (samples,) for mono or (samples, channels)
            for stereo

    Returns:
        Mono audio array, shape (samples,)
    """
    # Guard clause: only 2-D input carries a channel axis to average over.
    if audio.ndim != 2:
        return audio
    return audio.mean(axis=1)
def to_float32(audio: np.ndarray) -> np.ndarray:
    """
    Ensure audio is float32 in [-1, 1] range.

    The input is always copied into a new float32 array. If the absolute
    peak of that copy exceeds 1.0, the whole signal is divided by the peak;
    otherwise the values are left as they are. Integer PCM input is
    therefore scaled by its own peak, not by the full-scale value of its
    dtype.

    Args:
        audio: Audio array in any numeric format

    Returns:
        Audio array as float32 normalized to [-1, 1]. An empty input yields
        an empty float32 array.
    """
    audio = audio.astype(np.float32)
    # An empty array has no peak; np.max would raise ValueError on it.
    if audio.size == 0:
        return audio
    # Check if already normalized
    max_val = np.max(np.abs(audio))
    if max_val > 1.0:
        audio = audio / max_val
    return audio
def normalize(audio: np.ndarray, peak: float = 0.95) -> np.ndarray:
    """
    Normalize audio so peak amplitude equals given value.

    Empty input and all-zero (silent) input are returned without scaling,
    since there is no non-zero peak to scale against.

    Args:
        audio: Audio array
        peak: Target peak amplitude (default 0.95 to avoid clipping)

    Returns:
        Normalized audio array
    """
    audio = np.asarray(audio)
    # An empty array has no peak; a min/max reduction would raise ValueError.
    if audio.size == 0:
        return audio
    # Take the peak from min/max converted to Python floats rather than from
    # np.abs(audio): for signed integer input the most negative value
    # (e.g. int16 -32768) has no positive counterpart and np.abs wraps it
    # back to a negative number, which would under-report the peak.
    max_val = max(abs(float(audio.min())), abs(float(audio.max())))
    if max_val > 0:
        audio = audio / max_val * peak
    return audio
def pad_or_trim(audio: np.ndarray, target_length: int) -> np.ndarray:
    """
    Pad with zeros or trim audio to target length.

    Length is measured along the first axis. Trailing dimensions (for
    example the channel axis of a (samples, channels) array) are kept, and
    zero padding is created with the same trailing shape and dtype.

    Args:
        audio: Audio array
        target_length: Desired length in samples

    Returns:
        Audio array with exactly target_length samples. When the length
        already matches, the input object itself is returned; when trimming,
        a slice of the input is returned.

    Raises:
        ValueError: If target_length is negative.
    """
    # A negative length would otherwise be taken as "drop samples from the
    # end" by the slice below and return an array of the wrong length.
    if target_length < 0:
        raise ValueError(
            f"target_length must be non-negative, got {target_length}"
        )

    current_length = len(audio)
    if current_length == target_length:
        return audio
    if current_length > target_length:
        return audio[:target_length]

    # Pad with zeros; match trailing dimensions so multi-channel input can be
    # concatenated along the sample axis.
    pad_shape = (target_length - current_length,) + audio.shape[1:]
    padding = np.zeros(pad_shape, dtype=audio.dtype)
    return np.concatenate([audio, padding])
def encode_wav_to_bytes(audio: np.ndarray, sr: int) -> bytes:
    """
    Serialize an audio array into an in-memory WAV file.

    The data is written through soundfile into a BytesIO buffer and the
    buffer's full contents are returned.

    Args:
        audio: Audio array
        sr: Sample rate

    Returns:
        WAV file as bytes
    """
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, audio, sr, format='WAV')
    # getvalue() returns the whole buffer regardless of the stream position.
    return wav_buffer.getvalue()
def encode_flac_to_bytes(audio: np.ndarray, sr: int) -> bytes:
    """
    Encode numpy array to FLAC bytes (lossless, ~50% smaller than WAV).

    float32/float64 input is quantized to 16-bit integers before encoding,
    so for float input the result is lossless only with respect to those
    16-bit samples. Float samples outside [-1, 1] are clipped to full scale.
    Arrays of any other dtype are passed to the encoder unchanged.

    Args:
        audio: Audio array
        sr: Sample rate

    Returns:
        FLAC file as bytes
    """
    buf = io.BytesIO()
    # Convert float32 to int16 for FLAC (better compression)
    if audio.dtype == np.float32 or audio.dtype == np.float64:
        # Clip first: a sample beyond +/-1.0 scales past the int16 range and
        # the integer cast would wrap it around (e.g. 1.5 -> large negative),
        # producing loud clicks instead of ordinary clipping.
        audio_int = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    else:
        audio_int = audio
    sf.write(buf, audio_int, sr, format='FLAC')
    # getvalue() returns the whole buffer regardless of the stream position.
    return buf.getvalue()