| import soundfile | |
| import io | |
| from typing import Any, Tuple, Union, Optional | |
| import numpy as np | |
| import torch | |
def preprocess_wav(data: Any, incoming_sample_rate) -> Tuple[np.ndarray, int]:
    """Decode headerless 16-bit PCM mono audio bytes into a float32 array.

    Args:
        data: bytes-like payload of raw PCM samples; it is wrapped in an
            ``io.BytesIO`` before being handed to ``soundfile.read``.
        incoming_sample_rate: sample rate declared for the raw stream.

    Returns:
        The ``(samples, sample_rate)`` pair produced by ``soundfile.read``.
        ``always_2d=True`` is requested, so ``samples`` is a 2D array.
    """
    raw_stream = io.BytesIO(data)
    # The stream is read as RAW, so the sample encoding, rate and channel
    # count are all passed explicitly instead of being read from a header.
    read_options = dict(
        format="RAW",
        subtype="PCM_16",
        channels=1,
        samplerate=incoming_sample_rate,
        dtype="float32",
        always_2d=True,
        start=0,
        frames=-1,
    )
    return soundfile.read(raw_stream, **read_options)
def convert_waveform(
    waveform: Union[np.ndarray, torch.Tensor],
    sample_rate: int,
    normalize_volume: bool = False,
    to_mono: bool = False,
    to_sample_rate: Optional[int] = None,
) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
    """Convert a waveform by applying sox effects through torchaudio.

    Supported conversions, applied in this order when requested:
        - volume normalization (``gain -n``)
        - resampling to a target sample rate (``rate``)
        - down-mixing from multi-channel to mono (``channels 1``)

    When none of the conversions is needed, the input is returned untouched
    and torchaudio is not imported at all.

    Args:
        waveform (numpy.ndarray or torch.Tensor): 2D original waveform
            (channels x length)
        sample_rate (int): original sample rate
        normalize_volume (bool): perform volume normalization
        to_mono (bool): convert to mono channel if having multiple channels
        to_sample_rate (Optional[int]): target sample rate; ``None`` or a
            value equal to ``sample_rate`` means no resampling

    Returns:
        waveform (numpy.ndarray or torch.Tensor): converted 2D waveform
            (channels x length), in the same container type as the input
        sample_rate (int): sample rate of the returned waveform

    Raises:
        ImportError: if a conversion is required but torchaudio is not
            installed.
    """
    effects = []
    if normalize_volume:
        effects.append(["gain", "-n"])
    if to_sample_rate is not None and to_sample_rate != sample_rate:
        effects.append(["rate", f"{to_sample_rate}"])
    if to_mono and waveform.shape[0] > 1:
        effects.append(["channels", "1"])

    # Nothing to do: hand the input back as-is, without requiring torchaudio.
    if not effects:
        return waveform, sample_rate

    # Imported lazily so that torchaudio is only needed when an effect is
    # actually going to be applied.
    try:
        import torchaudio.sox_effects as ta_sox
    except ImportError as err:
        raise ImportError(
            "Please install torchaudio: pip install torchaudio"
        ) from err

    # The effects are applied to a tensor; numpy input is converted on the
    # way in and converted back on the way out.
    is_np_input = isinstance(waveform, np.ndarray)
    _waveform = torch.from_numpy(waveform) if is_np_input else waveform
    converted, converted_sample_rate = ta_sox.apply_effects_tensor(
        _waveform, sample_rate, effects
    )
    if is_np_input:
        converted = converted.numpy()
    return converted, converted_sample_rate