# Provenance: Xinsheng-Wang — uploaded via huggingface_hub (commit c7f3ffb, verified)
import numpy as np
import os
import soundfile as sf
import matplotlib.pyplot as plt
from typing import Dict, Tuple, Optional
import torch.distributed as dist
def read_audio_transposed(path: str, instr: Optional[str] = None, skip_err: bool = False) -> Tuple[Optional[np.ndarray], Optional[int]]:
    """
    Read an audio file and return the waveform with channels first.

    Loads the audio at `path` via soundfile, promotes mono signals to a 2D
    array, and transposes so the result has shape (channels, length).

    Args:
        path (str): Path to the audio file to load.
        instr (Optional[str], optional): Instrument name, used only in the
            informative message printed when `skip_err` is True.
            Defaults to None.
        skip_err (bool, optional): If True, return ``(None, None)`` on read
            errors instead of raising. Defaults to False.

    Returns:
        Tuple[Optional[np.ndarray], Optional[int]]: A tuple containing:
            - NumPy array of shape (channels, length), or None if skipped.
            - Sampling rate as an integer, or None if skipped.

    Raises:
        RuntimeError: If the file cannot be read and `skip_err` is False.
    """
    # Only rank 0 prints in a distributed run, to avoid duplicated log lines.
    should_print = not dist.is_initialized() or dist.get_rank() == 0
    try:
        mix, sr = sf.read(path)
    except Exception as e:
        if skip_err:
            if should_print:
                print(f"No stem {instr}: skip!")
            return None, None
        # Chain the original exception so the root cause stays visible.
        raise RuntimeError(f"Error reading the file at {path}: {e}") from e
    if mix.ndim == 1:  # mono: add a channel axis so the transpose is uniform
        mix = np.expand_dims(mix, axis=-1)
    return mix.T, sr
def normalize_audio(audio: np.ndarray) -> Tuple[np.ndarray, Dict[str, float]]:
    """
    Standardize an audio signal to zero mean and unit variance.

    Statistics are computed once on the mono downmix (average across the
    channel axis) and then applied uniformly to every channel, so relative
    channel balance is preserved.

    Args:
        audio (np.ndarray): Input audio of shape (channels, time) or (time,).

    Returns:
        Tuple[np.ndarray, Dict[str, float]]: A tuple containing:
            - The standardized audio, same shape as the input.
            - A dict with the original "mean" and "std", as needed by
              `denormalize_audio` to undo the transform.
    """
    downmix = np.mean(audio, axis=0)
    stats = {"mean": downmix.mean(), "std": downmix.std()}
    return (audio - stats["mean"]) / stats["std"], stats
def denormalize_audio(audio: np.ndarray, norm_params: Dict[str, float]) -> np.ndarray:
    """
    Undo mean/std normalization on an audio signal.

    Rescales a previously normalized signal back to its original range using
    the statistics recorded at normalization time.

    Args:
        audio (np.ndarray): Normalized audio array to be denormalized.
        norm_params (Dict[str, float]): Must contain the "mean" and "std"
            values produced by `normalize_audio`.

    Returns:
        np.ndarray: Denormalized audio with the same shape as the input.
    """
    scale = norm_params["std"]
    offset = norm_params["mean"]
    return audio * scale + offset
def draw_spectrogram(waveform: np.ndarray, sample_rate: int, length: float, output_file: str) -> None:
    """
    Generate and save a spectrogram image from an audio waveform.

    Downmixes the waveform to mono, computes its Short-Time Fourier Transform
    (STFT), converts the magnitude spectrogram to dB scale, and renders it
    with a plasma colormap.

    Args:
        waveform (np.ndarray): Input audio waveform array of shape
            (time, channels) or (time,).
        sample_rate (int): Sampling rate of the waveform in Hz.
        length (float): Duration (in seconds) from the start of the waveform
            to include in the spectrogram.
        output_file (str): Path to save the resulting spectrogram image.
            If None, the figure is rendered but not saved.

    Returns:
        None
    """
    import librosa.display

    # Cut only the required leading part of the signal.
    x = waveform[:int(length * sample_rate), :]
    # STFT of the mono downmix, then amplitude -> dB relative to the peak.
    X = librosa.stft(x.mean(axis=-1))
    Xdb = librosa.amplitude_to_db(np.abs(X), ref=np.max)

    fig, ax = plt.subplots()
    img = librosa.display.specshow(
        Xdb,
        cmap='plasma',
        sr=sample_rate,
        x_axis='time',
        y_axis='linear',
        ax=ax
    )
    # The original titled the plot with basename(output_file) BEFORE checking
    # for None, crashing before the save guard; title only when a path exists.
    if output_file is not None:
        ax.set(title='File: ' + os.path.basename(output_file))
    fig.colorbar(img, ax=ax, format="%+2.f dB")
    if output_file is not None:
        fig.savefig(output_file)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)