Spaces:

treble-technologies
/

ffasr

Running on CPU Upgrade

App Files Files Community

ffasr / backends /_audio_utils.py

whojavumusic

cohere fix

a6beab2 about 2 months ago

Raw

History Blame Contribute Delete

1.89 kB

	"""
	Small helpers shared across backends (no heavy imports).
	"""

	from __future__ import annotations

	from pathlib import Path

	import numpy as np


	def safe_pad_audio(audio: np.ndarray, multiple: int = 1600) -> np.ndarray:
	    """
	    Flatten a waveform to 1-D float32 and zero-extend its tail to a multiple of `multiple`.

	    Some model preprocessors (e.g. Moonshine's `view(B, -1, 80)` step) only accept
	    inputs whose sample count divides evenly into a small chunk size; otherwise they
	    fail with errors such as ``shape '[1, -1, 80]' is invalid for input of size N``.
	    Appending silence avoids that failure and is harmless for ASR.

	    Args:
	        audio: Waveform samples; converted to float32 and flattened to one dimension.
	        multiple: Required divisor of the output length. The default of 1600 is
	            100 ms at 16 kHz and is divisible by the common ASR strides
	            (80, 160, 320, 400, 800). Values <= 1 disable padding.

	    Returns:
	        The flattened float32 waveform, unchanged when no padding is needed,
	        otherwise a new array with trailing zeros appended.
	    """
	    samples = np.asarray(audio, dtype=np.float32).reshape(-1)

	    # Zeros still needed to reach the next multiple; 0 when padding is
	    # disabled (multiple <= 1) or the length is already aligned.
	    missing = 0 if multiple <= 1 else (-samples.size) % multiple
	    if missing == 0:
	        return samples

	    # Fresh zero-filled buffer: copy the signal in, leave the tail silent.
	    padded = np.zeros(samples.size + missing, dtype=np.float32)
	    padded[: samples.size] = samples
	    return padded


	def load_wav_mono(path: str | Path, sampling_rate: int = 16000) -> np.ndarray:
	    """
	    Load a WAV file as a 1-D float32 mono waveform at ``sampling_rate`` Hz.

	    Uses ``soundfile`` only (no torchcodec / FFmpeg). Eval samples and the
	    custom ``evaluate(Path)`` hook are written as 16 kHz PCM WAVs, so the
	    resampling branch (which needs ``librosa``) is only taken for files at a
	    different rate.

	    Args:
	        path: Location of the audio file to read.
	        sampling_rate: Desired output sample rate in Hz.

	    Returns:
	        The channel-averaged waveform as a flat float32 array.

	    Raises:
	        RuntimeError: If the file's sample rate differs from ``sampling_rate``
	            and either an ImportError occurs while importing or invoking
	            librosa, or the resampling call itself fails.
	    """
	    # Imported lazily so merely importing this module stays lightweight.
	    import soundfile as sf

	    # always_2d=True gives a 2-D array even for mono files, so averaging over
	    # axis 1 (channels) works for any channel count.
	    audio, sr = sf.read(str(path), dtype="float32", always_2d=True)
	    audio = audio.mean(axis=1)

	    source_rate = int(sr)
	    target_rate = int(sampling_rate)
	    if source_rate != target_rate:
	        try:
	            import librosa

	            audio = librosa.resample(
	                audio, orig_sr=source_rate, target_sr=target_rate
	            )
	        except ImportError as exc:
	            # A missing dependency is the only case where "install librosa"
	            # is the right advice.
	            raise RuntimeError(
	                f"Audio is {sr} Hz but {sampling_rate} Hz was requested; "
	                "install librosa for resampling."
	            ) from exc
	        except Exception as exc:
	            # Still a RuntimeError (callers may rely on that), but report the
	            # real failure instead of blaming a missing install.
	            raise RuntimeError(
	                f"Resampling {path} from {sr} Hz to {sampling_rate} Hz "
	                f"failed: {exc}"
	            ) from exc
	    return np.asarray(audio, dtype=np.float32).reshape(-1)