import subprocess

import numpy as np
# Default sample rate, passed to ffmpeg's ``-ar`` option by ``load_audio``.
SAMPLE_RATE = 16000


def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
    """
    Open an audio file and read it as a mono waveform, resampling as necessary.

    Parameters
    ----------
    file: str
        The audio file to open
    sr: int
        The sample rate to resample the audio to, if necessary

    Returns
    -------
    A 1-D NumPy array containing the audio waveform, in float32 dtype.
    Samples are decoded as signed 16-bit PCM and divided by 32768, so the
    values lie in [-1.0, 1.0).

    Raises
    ------
    RuntimeError
        If ffmpeg exits with a non-zero status; the message carries ffmpeg's
        stderr output.
    OSError
        Propagated unchanged from ``subprocess.run`` if the ``ffmpeg``
        executable cannot be started (e.g. it is not installed).
    """
    # Launches a subprocess to decode audio while down-mixing and resampling
    # as necessary. Requires the ffmpeg CLI to be installed.
    cmd = [
        "ffmpeg",
        "-nostdin",
        "-threads",
        "0",
        "-i",
        file,
        "-f",
        "s16le",
        "-ac",
        "1",
        "-acodec",
        "pcm_s16le",
        "-ar",
        str(sr),
        "-",
    ]

    try:
        out = subprocess.run(cmd, capture_output=True, check=True).stdout
    except subprocess.CalledProcessError as e:
        # ffmpeg's stderr is not guaranteed to be valid UTF-8 (it may echo
        # file names in another encoding). Decode leniently so the handler
        # itself cannot fail with UnicodeDecodeError and mask the real error.
        detail = e.stderr.decode(errors="replace")
        raise RuntimeError(f"Failed to load audio: {detail}") from e

    # np.frombuffer already yields a 1-D (read-only) view; astype makes the
    # single writable float32 copy, so no extra flatten() copy is needed.
    return np.frombuffer(out, np.int16).astype(np.float32) / 32768.0