File size: 4,607 Bytes
c7f3ffb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129

import numpy as np
import os
import soundfile as sf
import matplotlib.pyplot as plt
from typing import Dict, Tuple, Optional

import torch.distributed as dist


def read_audio_transposed(path: str, instr: Optional[str] = None, skip_err: bool = False) -> Tuple[Optional[np.ndarray], Optional[int]]:
    """
    Read an audio file and return transposed waveform data with channels first.

    Loads the audio file from `path`, converts mono signals to 2D format, and
    transposes the array so that its shape is (channels, length). In case of
    errors, either raises an exception or skips gracefully depending on
    `skip_err`.

    Args:
        path (str): Path to the audio file to load.
        instr (Optional[str], optional): Instrument name, used for informative
            messages when `skip_err` is True. Defaults to None.
        skip_err (bool, optional): If True, skip files with read errors and
            return `(None, None)` instead of raising. Defaults to False.

    Returns:
        Tuple[Optional[np.ndarray], Optional[int]]: A tuple containing:
            - NumPy array of shape (channels, length), or None if skipped.
            - Sampling rate as an integer, or None if skipped.

    Raises:
        RuntimeError: If the file cannot be read and `skip_err` is False.
    """

    # Only rank 0 prints in a distributed run. Check is_available() first:
    # on torch builds without distributed support, is_initialized() may not
    # be callable.
    should_print = not (dist.is_available() and dist.is_initialized()) or dist.get_rank() == 0

    try:
        mix, sr = sf.read(path)
    except Exception as e:
        if skip_err:
            if should_print:
                print(f"No stem {instr}: skip!")
            return None, None
        # Chain the original exception so the root cause stays visible.
        raise RuntimeError(f"Error reading the file at {path}: {e}") from e

    if mix.ndim == 1:  # For mono audio: (length,) -> (length, 1)
        mix = np.expand_dims(mix, axis=-1)
    return mix.T, sr


def normalize_audio(audio: np.ndarray) -> Tuple[np.ndarray, Dict[str, float]]:
    """
    Normalize an audio signal using mean and standard deviation.

    Computes the mean and standard deviation from the mono mix of the input
    signal, then applies normalization to each channel. If the signal is
    constant (e.g. digital silence), the standard deviation is zero; a std of
    1.0 is used instead so the result stays finite, and that safe value is
    stored in the returned parameters so `denormalize_audio` still restores
    the original signal exactly.

    Args:
        audio (np.ndarray): Input audio array of shape (channels, time).

    Returns:
        Tuple[np.ndarray, Dict[str, float]]: A tuple containing:
            - Normalized audio with the same shape as the input.
            - A dictionary with keys "mean" and "std" used for normalization.
    """

    mono = audio.mean(0)
    mean, std = mono.mean(), mono.std()
    if std == 0:
        # Silent/constant input: avoid division by zero (NaN/inf output).
        std = 1.0
    return (audio - mean) / std, {"mean": mean, "std": std}


def denormalize_audio(audio: np.ndarray, norm_params: Dict[str, float]) -> np.ndarray:
    """
    Undo mean/std normalization of an audio signal.

    Scales the normalized signal back by the stored standard deviation and
    shifts it by the stored mean, restoring the original amplitude range.

    Args:
        audio (np.ndarray): Normalized audio array to be denormalized.
        norm_params (Dict[str, float]): Dictionary with keys "mean" and
            "std" as produced during normalization.

    Returns:
        np.ndarray: Denormalized audio with the same shape as the input.
    """

    std = norm_params["std"]
    mean = norm_params["mean"]
    return audio * std + mean


def draw_spectrogram(waveform: np.ndarray, sample_rate: int, length: float, output_file: str) -> None:
    """
    Generate and save a spectrogram image from an audio waveform.

    Converts the provided waveform into a mono signal, computes its Short-Time
    Fourier Transform (STFT), converts the amplitude spectrogram to dB scale,
    and plots it using a plasma colormap.

    Args:
        waveform (np.ndarray): Input audio waveform array of shape (time, channels)
            or (time,). A mono 1-D input is promoted to (time, 1).
        sample_rate (int): Sampling rate of the waveform in Hz.
        length (float): Duration (in seconds) of the waveform to include in the
            spectrogram.
        output_file (str): Path to save the resulting spectrogram image.

    Returns:
        None
    """

    import librosa.display

    # Promote a 1-D mono signal to (time, 1) so the slicing below works for
    # both documented input shapes.
    if waveform.ndim == 1:
        waveform = waveform[:, None]

    # Cut only the required part of the spectrogram
    x = waveform[:int(length * sample_rate), :]
    X = librosa.stft(x.mean(axis=-1))  # perform short-term fourier transform on mono signal
    Xdb = librosa.amplitude_to_db(np.abs(X), ref=np.max)  # convert an amplitude spectrogram to dB-scaled spectrogram.
    fig, ax = plt.subplots()
    img = librosa.display.specshow(
        Xdb,
        cmap='plasma',
        sr=sample_rate,
        x_axis='time',
        y_axis='linear',
        ax=ax
    )
    ax.set(title='File: ' + os.path.basename(output_file))
    fig.colorbar(img, ax=ax, format="%+2.f dB")
    if output_file is not None:
        fig.savefig(output_file)
    # Close the figure explicitly: repeated calls would otherwise accumulate
    # open matplotlib figures and leak memory.
    plt.close(fig)