Spaces:
Sleeping
Sleeping
| import torch | |
| import torchaudio | |
| from pathlib import Path | |
| import soundfile as sf | |
| from typing import Any | |
| from config import TARGET_SR, SUPPORTED_EXTS | |
def transcribe_file(path: str | Path, pipe: Any) -> str:
    """
    Run speech recognition on an audio file and return the recognised text.

    Args:
        path: Location of the audio file, given as a string or Path.
        pipe: Callable ASR pipeline (for example a Hugging Face transformers
            automatic-speech-recognition pipeline). It is called with a
            numpy array of samples and its result is indexed with 'text'.

    Returns:
        The value stored under the 'text' key of the pipeline result.

    Raises:
        ValueError: Propagated from load_resample when the file type is
            unsupported or the audio cannot be decoded.
    """
    # load_resample yields a torch.Tensor; the pipeline is fed a numpy array.
    waveform = load_resample(path)
    samples = waveform.numpy()
    result = pipe(samples)
    return result["text"]  # type: ignore[index]
def load_resample(path: str | Path, target_sr: int = TARGET_SR) -> torch.Tensor:
    """
    Load an audio file and resample it to the target sample rate, returning
    a mono torch.Tensor.

    Args:
        path: Path or string pointing to an audio file.
        target_sr: Desired sample rate (in Hz). Defaults to TARGET_SR from config.

    Returns:
        A 1-D torch.Tensor of dtype float32 sampled at target_sr.

    Raises:
        ValueError: If the file extension is not in SUPPORTED_EXTS.
        ValueError: If the audio file cannot be decoded.
    """
    # Reject unsupported extensions up front so the user gets a clear message
    # instead of a low-level decoder error.
    ext = Path(path).suffix.lower()
    if ext not in SUPPORTED_EXTS:
        raise ValueError(
            f"Unsupported file-type '{ext or 'unknown'}'. "
            "Please upload WAV, FLAC, MP3, OGG/Opus or M4A."
        )

    try:
        # Decode straight to float32: the default is float64, which would be
        # allocated only to be down-cast and copied again afterwards.
        samples, sr = sf.read(str(path), dtype="float32")
    except RuntimeError as exc:
        raise ValueError(
            "Couldn't decode the audio file - maybe it's corrupted or in an uncommon codec."
        ) from exc

    # The array returned by sf.read is freshly allocated, so sharing its
    # memory with the tensor (no copy) is safe.
    speech = torch.from_numpy(samples)
    if speech.ndim == 2:  # multi-channel audio: average the channels to mono
        speech = speech.mean(dim=1)

    # Resample only when the file's rate differs from the requested one.
    if sr != target_sr:
        speech = torchaudio.functional.resample(
            speech, orig_freq=sr, new_freq=target_sr
        )
    return speech