Spaces:

botario
/

wave2vec_luxembourgish

Sleeping

App Files Files Community

wave2vec_luxembourgish / utils.py

botarioAcc

Initial Commit

6a91da6 verified 7 months ago

raw

history blame contribute delete

2.08 kB

	import torch
	import torchaudio
	from pathlib import Path
	import soundfile as sf
	from typing import Any


	from config import TARGET_SR, SUPPORTED_EXTS


	def transcribe_file(path: str | Path, pipe: Any) -> str:
	    """
	    Run speech recognition on a single audio file and return its text.

	    The file is first loaded via ``load_resample`` (called with its default
	    target sample rate), converted to a numpy array and then handed to
	    ``pipe``.

	    Args:
	        path: Path or string pointing to an audio file.
	        pipe: Callable ASR pipeline (e.g. a Hugging Face transformers
	            automatic-speech-recognition pipeline). It is called with a
	            numpy array and its result is indexed with the key 'text'.

	    Returns:
	        The value stored under 'text' in the pipeline's result.

	    Raises:
	        ValueError: Propagated from ``load_resample`` when the file type is
	            unsupported or the audio cannot be decoded.
	    """
	    waveform = load_resample(path)
	    samples = waveform.numpy()
	    result = pipe(samples)
	    return result["text"]  # type: ignore[index]


	def load_resample(path: str | Path, target_sr: int = TARGET_SR) -> torch.Tensor:
	    """
	    Read an audio file and return it as a float32 waveform at ``target_sr``.

	    Steps: validate the file extension, decode the file with soundfile,
	    convert the samples to a float32 tensor, average a 2-D (multi-channel)
	    signal down to one channel, and resample if the file's rate differs
	    from ``target_sr``.

	    Args:
	        path: Path or string pointing to an audio file.
	        target_sr: Desired sample rate (in Hz). Defaults to TARGET_SR from config.

	    Returns:
	        A float32 torch.Tensor sampled at target_sr; 2-D input is reduced
	        to 1-D by averaging over the second axis.

	    Raises:
	        ValueError: If the file extension is not in SUPPORTED_EXTS.
	        ValueError: If soundfile raises a RuntimeError while reading the file.
	    """
	    ext = Path(path).suffix.lower()
	    if ext not in SUPPORTED_EXTS:
	        unsupported_msg = f"Unsupported file-type “{ext or 'unknown'}”. Please upload WAV, FLAC, MP3, OGG/Opus or M4A."
	        raise ValueError(unsupported_msg)

	    try:
	        samples, source_sr = sf.read(str(path))
	    except RuntimeError as exc:
	        decode_msg = "Couldn't decode the audio file - maybe it's corrupted or in an uncommon codec."
	        raise ValueError(decode_msg) from exc

	    waveform = torch.tensor(samples).to(torch.float32)

	    # A 2-D array means several channels (soundfile lays these out as
	    # frames x channels); collapse them to mono by averaging.
	    if waveform.ndim == 2:
	        waveform = torch.mean(waveform, dim=1)

	    # Already at the requested rate: nothing left to do.
	    if source_sr == target_sr:
	        return waveform

	    return torchaudio.functional.resample(
	        waveform, orig_freq=source_sr, new_freq=target_sr
	    )