ffasr / backends /_audio_utils.py
whojavumusic's picture
cohere fix
a6beab2
Raw
History Blame Contribute Delete
1.89 kB
"""
Small helpers shared across backends (no heavy imports).
"""
from __future__ import annotations
from pathlib import Path
import numpy as np
def safe_pad_audio(audio: np.ndarray, multiple: int = 1600) -> np.ndarray:
    """
    Zero-pad the tail of a waveform so its sample count is a multiple of `multiple`.

    The input is converted to float32 and flattened to 1-D first. Some model
    preprocessors (e.g. Moonshine's `view(B, -1, 80)` step) need the waveform
    length to be divisible by a small chunk size; appending silence avoids
    errors such as ``shape '[1, -1, 80]' is invalid for input of size N``.

    The default `multiple=1600` corresponds to 100 ms at 16 kHz and is
    divisible by 80, 160, 320, 400 and 800.

    Args:
        audio: Waveform samples; any array-like convertible to float32.
        multiple: Required length granularity in samples. Values <= 1
            disable padding.

    Returns:
        A 1-D float32 array. When no padding is needed this is the flattened
        input itself; otherwise a new array with trailing zeros.
    """
    samples = np.asarray(audio, dtype=np.float32).reshape(-1)

    # Zeros still missing to reach the next multiple; 0 when the length is
    # already aligned or when padding is disabled.
    shortfall = (-samples.size) % multiple if multiple > 1 else 0
    if not shortfall:
        return samples

    # Allocate the final buffer once (already zero-filled) and copy the
    # samples into its head, leaving the tail as silence.
    padded = np.zeros(samples.size + shortfall, dtype=np.float32)
    padded[: samples.size] = samples
    return padded
def load_wav_mono(path: str | Path, sampling_rate: int = 16000) -> np.ndarray:
    """
    Load a WAV file as a 1-D float32 mono waveform at ``sampling_rate`` Hz.

    Decoding uses ``soundfile`` only (no torchcodec / FFmpeg). Multi-channel
    audio is down-mixed by averaging the channels. ``librosa`` is imported
    lazily, and only when the file's sample rate differs from the requested
    one.

    Args:
        path: Location of the audio file.
        sampling_rate: Desired output sample rate in Hz.

    Returns:
        A flat float32 array of samples at ``sampling_rate`` Hz.

    Raises:
        RuntimeError: If resampling is required and ``librosa`` cannot be
            imported, or if the resampling call itself fails. The underlying
            exception is chained as ``__cause__``.
    """
    import soundfile as sf

    # always_2d=True keeps a channel axis even for mono files, so the channel
    # average below works for every input.
    audio, sr = sf.read(str(path), dtype="float32", always_2d=True)
    audio = audio.mean(axis=1)

    source_rate = int(sr)
    target_rate = int(sampling_rate)
    if source_rate != target_rate:
        # Keep the import and the resample call in separate try blocks so a
        # failure inside librosa is not misreported as "librosa not installed".
        try:
            import librosa
        except Exception as exc:
            # Caught broadly so any import failure still surfaces as
            # RuntimeError, as callers of the previous version would expect.
            raise RuntimeError(
                f"Audio is {sr} Hz but {sampling_rate} Hz was requested; "
                "install librosa for resampling."
            ) from exc

        try:
            audio = librosa.resample(
                audio, orig_sr=source_rate, target_sr=target_rate
            )
        except Exception as exc:
            raise RuntimeError(
                f"Resampling audio from {sr} Hz to {sampling_rate} Hz "
                f"with librosa failed: {exc}"
            ) from exc

    return np.asarray(audio, dtype=np.float32).reshape(-1)