# NOTE(review): "Spaces: Running on Zero" banner text is scrape residue from the
# hosting page, not part of the module; replaced with this comment.
| import numpy as np | |
| import os | |
| import soundfile as sf | |
| import matplotlib.pyplot as plt | |
| from typing import Dict, Tuple, Optional | |
| import torch.distributed as dist | |
def read_audio_transposed(path: str, instr: Optional[str] = None, skip_err: bool = False) -> Tuple[Optional[np.ndarray], Optional[int]]:
    """
    Read an audio file and return transposed waveform data with channels first.

    Loads the audio file from `path`, converts mono signals to 2D format, and
    transposes the array so that its shape is (channels, length). In case of
    errors, either raises an exception or skips gracefully depending on
    `skip_err`.

    Args:
        path (str): Path to the audio file to load.
        instr (Optional[str], optional): Instrument name, used for informative
            messages when `skip_err` is True. Defaults to None.
        skip_err (bool, optional): If True, skip files with read errors and
            return `(None, None)` instead of raising. Defaults to False.

    Returns:
        Tuple[Optional[np.ndarray], Optional[int]]: A tuple containing:
            - NumPy array of shape (channels, length), or None if skipped.
            - Sampling rate as an integer, or None if skipped.

    Raises:
        RuntimeError: If the file cannot be read and `skip_err` is False.
    """
    # Only rank 0 prints when running under torch.distributed, so multi-process
    # jobs do not flood the log with duplicate skip messages.
    should_print = not dist.is_initialized() or dist.get_rank() == 0
    try:
        mix, sr = sf.read(path)
    except Exception as e:
        if skip_err:
            if should_print:
                print(f"No stem {instr}: skip!")
            return None, None
        # Chain the original exception (`from e`) so the root cause is
        # preserved in the traceback instead of being flattened into a string.
        raise RuntimeError(f"Error reading the file at {path}: {e}") from e
    if len(mix.shape) == 1:  # For mono audio: promote (length,) -> (length, 1)
        mix = np.expand_dims(mix, axis=-1)
    return mix.T, sr
def normalize_audio(audio: np.ndarray) -> Tuple[np.ndarray, Dict[str, float]]:
    """
    Normalize an audio signal using mean and standard deviation.

    The statistics are taken from the mono mix-down (mean across the channel
    axis) and then applied uniformly to every channel.

    Args:
        audio (np.ndarray): Input audio array of shape (channels, time) or (time,).

    Returns:
        Tuple[np.ndarray, Dict[str, float]]: A tuple containing:
            - Normalized audio with the same shape as the input.
            - A dictionary with keys "mean" and "std" from the original audio.
    """
    mono_mix = np.mean(audio, axis=0)
    mu = np.mean(mono_mix)
    sigma = np.std(mono_mix)
    normalized = (audio - mu) / sigma
    return normalized, {"mean": mu, "std": sigma}
def denormalize_audio(audio: np.ndarray, norm_params: Dict[str, float]) -> np.ndarray:
    """
    Reverse normalization on an audio signal.

    Undoes `normalize_audio` by rescaling with the stored standard deviation
    and shifting back by the stored mean.

    Args:
        audio (np.ndarray): Normalized audio array to be denormalized.
        norm_params (Dict[str, float]): Dictionary containing the keys
            "mean" and "std" used during normalization.

    Returns:
        np.ndarray: Denormalized audio with the same shape as the input.
    """
    mean_val = norm_params["mean"]
    std_val = norm_params["std"]
    return audio * std_val + mean_val
def draw_spectrogram(waveform: np.ndarray, sample_rate: int, length: float, output_file: str) -> None:
    """
    Generate and save a spectrogram image from an audio waveform.

    Converts the provided waveform into a mono signal, computes its Short-Time
    Fourier Transform (STFT), converts the amplitude spectrogram to dB scale,
    and plots it using a plasma colormap.

    Args:
        waveform (np.ndarray): Input audio waveform array of shape (time, channels).
        sample_rate (int): Sampling rate of the waveform in Hz.
        length (float): Duration (in seconds) of the waveform to include in the
            spectrogram.
        output_file (str): Path to save the resulting spectrogram image.
            NOTE(review): the title line below uses `output_file` before the
            `is not None` guard, so passing None would raise — callers appear
            to always pass a path; confirm before relying on None.

    Returns:
        None
    """
    import librosa.display

    # Cut only the required part of the spectrogram.
    x = waveform[:int(length * sample_rate), :]
    # Short-time Fourier transform of the mono mix-down.
    X = librosa.stft(x.mean(axis=-1))
    # Convert the amplitude spectrogram to a dB-scaled spectrogram.
    Xdb = librosa.amplitude_to_db(np.abs(X), ref=np.max)
    fig, ax = plt.subplots()
    img = librosa.display.specshow(
        Xdb,
        cmap='plasma',
        sr=sample_rate,
        x_axis='time',
        y_axis='linear',
        ax=ax,
    )
    ax.set(title='File: ' + os.path.basename(output_file))
    fig.colorbar(img, ax=ax, format="%+2.f dB")
    if output_file is not None:
        # Save via the figure object rather than the implicit pyplot current
        # figure, so concurrent figure creation elsewhere cannot misdirect it.
        fig.savefig(output_file)
    # Close the figure explicitly: the original leaked one figure per call,
    # which grows memory when rendering many spectrograms in a loop.
    plt.close(fig)