Spaces:
Runtime error
Runtime error
| """ | |
| Audio Processor Module | |
| ======================= | |
| This module provides the AudioProcessor class, which decodes audio files through the ffmpeg CLI and handles them as PyTorch tensors. | |
| It includes functionalities to load, cut, and manage audio waveforms, offering efficient and | |
| flexible audio processing. | |
| Available Classes: | |
| - AudioProcessor: Processes audio waveforms and provides methods for loading, | |
| cutting, and handling audio. | |
| Usage: | |
| from .audio_import import AudioProcessor | |
| processor = AudioProcessor.from_file("path/to/audiofile.wav") | |
| cut_waveform = processor.cut(start=1.0, end=5.0) | |
| Constants: | |
| - SAMPLE_RATE (int): Default sample rate for processing. | |
| - NORMALIZATION_FACTOR (float): Normalization factor for audio waveform. | |
| """ | |
| from subprocess import CalledProcessError, run | |
| import numpy as np | |
| import torch | |
# Sample rate (Hz) that audio is resampled to unless the caller requests another.
SAMPLE_RATE = 16000
# Divisor applied to signed 16-bit PCM samples so they land in [-1.0, 1.0).
NORMALIZATION_FACTOR = 32768.0


class AudioProcessor:
    """
    Hold an audio waveform and its sample rate, and cut segments out of it.

    Audio is decoded by the ffmpeg CLI (see ``load_audio``) and stored as a
    ``torch.Tensor``.

    Attributes:
        waveform: torch.Tensor
            The audio waveform tensor. ``from_file`` produces a 1-D mono
            tensor; ``cut`` slices along the first dimension.
        sr: int
            The sample rate of the audio.
    """

    def __init__(self, waveform: torch.Tensor,
                 sr: int = SAMPLE_RATE) -> None:
        """
        Initialize the AudioProcessor object.

        Args:
            waveform (torch.Tensor): The audio waveform tensor.
            sr (int, optional): The sample rate of the audio.
                Defaults to SAMPLE_RATE.

        Raises:
            ValueError: If the provided sample rate is not of type int.
        """
        # Validate without calling len(): unsized values such as floats would
        # otherwise raise TypeError instead of the documented ValueError.
        if not isinstance(sr, int):
            raise ValueError(
                "Sample rate should be a single value of type int, "
                f"not {sr!r} of type {type(sr)}")

        self.waveform = waveform
        self.sr = sr

    @classmethod
    def from_file(cls, file: str, *args, **kwargs) -> 'AudioProcessor':
        """
        Create an AudioProcessor instance from an audio file.

        Args:
            file (str): The audio file path.
            *args, **kwargs: Forwarded unchanged to ``load_audio``
                (for example ``sr`` to pick the target sample rate).

        Returns:
            AudioProcessor: An instance of the calling class containing the
            loaded audio.

        Raises:
            RuntimeError: If ffmpeg fails to decode the file.
        """
        samples, rate = cls.load_audio(file, *args, **kwargs)
        return cls(torch.from_numpy(samples), rate)

    def cut(self, start: float, end: float) -> torch.Tensor:
        """
        Cut a segment from the audio waveform between the given times.

        The start index is truncated and the end index is rounded up, so the
        returned segment always covers the requested interval.

        Args:
            start (float): Start time in seconds.
            end (float): End time in seconds. Python numbers, NumPy scalars
                and single-element tensors are accepted.

        Returns:
            torch.Tensor: The cut waveform segment.
        """
        first = int(start * self.sr)
        # Only tensors need torch.ceil; np.ceil handles Python numbers and
        # NumPy scalars alike (torch.ceil rejects both).
        if isinstance(end, torch.Tensor):
            last = int(torch.ceil(end * self.sr))
        else:
            last = int(np.ceil(end * self.sr))
        return self.waveform[first:last]

    @staticmethod
    def load_audio(file: str, sr: int = SAMPLE_RATE):
        """
        Open an audio file and read it as a mono waveform, resampling if necessary.

        Requires the ffmpeg CLI in PATH. The original author notes that the
        resulting format is meant to be compatible with pyannote.audio.

        Args:
            file (str): The audio file to open.
            sr (int, optional): The desired sample rate. Defaults to SAMPLE_RATE.

        Returns:
            tuple: A NumPy array containing the audio waveform in float32 dtype
            and the sample rate.

        Raises:
            RuntimeError: If failed to load audio.
        """
        # Launch ffmpeg to decode the file to raw signed 16-bit little-endian
        # PCM on stdout, down-mixing to one channel and resampling to `sr`.
        # fmt: off
        cmd = [
            "ffmpeg",
            "-nostdin",
            "-threads", "0",
            "-i", file,
            "-f", "s16le",
            "-ac", "1",
            "-acodec", "pcm_s16le",
            "-ar", str(sr),
            "-"
        ]
        # fmt: on
        try:
            raw = run(cmd, capture_output=True, check=True).stdout
        except CalledProcessError as e:
            # errors="replace" keeps undecodable bytes in ffmpeg's output from
            # masking the real failure with a UnicodeDecodeError.
            raise RuntimeError(
                f"Failed to load audio: {e.stderr.decode(errors='replace')}") from e

        # frombuffer already yields a 1-D array; astype makes the writable
        # float32 copy, which is then scaled into [-1.0, 1.0).
        samples = np.frombuffer(raw, np.int16).astype(np.float32) / NORMALIZATION_FACTOR
        return samples, sr

    def __repr__(self) -> str:
        # NOTE(review): the text says "TorchAudioProcessor" although the class
        # is AudioProcessor; kept as-is because callers may rely on it.
        return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'