Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| """ | |
| Small helpers shared across backends (no heavy imports). | |
| """ | |
| from __future__ import annotations | |
| from pathlib import Path | |
| import numpy as np | |
def safe_pad_audio(audio: np.ndarray, multiple: int = 1600) -> np.ndarray:
    """
    Flatten ``audio`` to a 1-D float32 waveform and zero-pad its tail so the
    length becomes a multiple of ``multiple``.

    Some model preprocessors (Moonshine's ``view(B, -1, 80)`` step, for example)
    need the waveform length to be divisible by a small chunk size; otherwise
    they fail with errors such as
    ``shape '[1, -1, 80]' is invalid for input of size N``. Appending silence
    is harmless for ASR and sidesteps that failure.

    The default of 1600 samples is 100 ms at 16 kHz and is divisible by the
    common ASR strides 80, 160, 320, 400 and 800.

    When ``multiple`` is 1 or less, or the length is already aligned, the
    flattened float32 array is returned without any padding.
    """
    samples = np.asarray(audio, dtype=np.float32).reshape(-1)
    # A chunk size of 1 or less gives nothing to align to.
    if multiple > 1:
        overhang = samples.size % multiple
        if overhang:
            # Append exactly enough silence to reach the next multiple.
            silence = np.zeros(multiple - overhang, dtype=np.float32)
            return np.concatenate((samples, silence))
    return samples
def load_wav_mono(path: str | Path, sampling_rate: int = 16000) -> np.ndarray:
    """
    Load a WAV file as a 1-D float32 mono waveform at ``sampling_rate`` Hz.

    Uses ``soundfile`` only for decoding (no torchcodec / FFmpeg). Eval samples
    and the custom ``evaluate(Path)`` hook are written as 16 kHz PCM WAVs, so
    the resampling branch is normally not taken. Multi-channel files are
    downmixed by averaging the channels. ``librosa`` is imported only when the
    file's sample rate differs from ``sampling_rate``.

    Raises:
        RuntimeError: if resampling is required and ``librosa`` cannot be
            imported, or if the ``librosa.resample`` call itself fails. The
            underlying exception is chained as ``__cause__``.
    """
    # Imported here rather than at module level so importing this module
    # does not pull in soundfile.
    import soundfile as sf

    # always_2d=True yields a 2-D array even for mono input, so the
    # axis=1 mean below works for any channel count.
    frames, sr = sf.read(str(path), dtype="float32", always_2d=True)
    audio = frames.mean(axis=1)

    if int(sr) != int(sampling_rate):
        # Keep the import and the resample call in separate try blocks so a
        # resampling failure is not misreported as "librosa is not installed".
        try:
            import librosa
        except Exception as exc:  # broad on purpose: any import failure stays a RuntimeError, as before
            raise RuntimeError(
                f"Audio is {sr} Hz but {sampling_rate} Hz was requested; "
                "install librosa for resampling."
            ) from exc
        try:
            audio = librosa.resample(
                audio, orig_sr=int(sr), target_sr=int(sampling_rate)
            )
        except Exception as exc:
            raise RuntimeError(
                f"Resampling audio from {sr} Hz to {sampling_rate} Hz failed: {exc}"
            ) from exc

    return np.asarray(audio, dtype=np.float32).reshape(-1)