# Provenance: Xinsheng-Wang — uploaded via huggingface_hub (commit c7f3ffb, verified)
import numpy as np
import os
import soundfile as sf
import matplotlib.pyplot as plt
from typing import Dict, Tuple, Optional
import torch.distributed as dist
def read_audio_transposed(path: str, instr: Optional[str] = None, skip_err: bool = False) -> Tuple[Optional[np.ndarray], Optional[int]]:
    """
    Read an audio file and return the waveform with channels first.

    Loads the audio at `path` via soundfile, promotes mono signals to a 2D
    array, and transposes so the result has shape (channels, length).

    Args:
        path (str): Path to the audio file to load.
        instr (Optional[str], optional): Instrument name, used only in the
            informative message printed when `skip_err` is True.
            Defaults to None.
        skip_err (bool, optional): If True, return ``(None, None)`` on read
            errors instead of raising. Defaults to False.

    Returns:
        Tuple[Optional[np.ndarray], Optional[int]]: A tuple containing:
            - NumPy array of shape (channels, length), or None if skipped.
            - Sampling rate as an integer, or None if skipped.

    Raises:
        RuntimeError: If the file cannot be read and `skip_err` is False.
    """
    # Only rank 0 prints in a distributed run, to avoid duplicated log lines.
    should_print = not dist.is_initialized() or dist.get_rank() == 0
    try:
        mix, sr = sf.read(path)
    except Exception as e:
        if skip_err:
            if should_print:
                print(f"No stem {instr}: skip!")
            return None, None
        # Chain the original exception so the root cause stays visible.
        raise RuntimeError(f"Error reading the file at {path}: {e}") from e
    if mix.ndim == 1:  # mono: add a channel axis so the transpose is uniform
        mix = np.expand_dims(mix, axis=-1)
    return mix.T, sr
def normalize_audio(audio: np.ndarray) -> Tuple[np.ndarray, Dict[str, float]]:
    """
    Standardize an audio signal to zero mean and unit variance.

    Statistics are computed once on the mono downmix (average across the
    channel axis) and then applied uniformly to every channel, so relative
    channel balance is preserved.

    Args:
        audio (np.ndarray): Input audio of shape (channels, time) or (time,).

    Returns:
        Tuple[np.ndarray, Dict[str, float]]: A tuple containing:
            - The standardized audio, same shape as the input.
            - A dict with the original "mean" and "std", as needed by
              `denormalize_audio` to undo the transform.
    """
    downmix = np.mean(audio, axis=0)
    stats = {"mean": downmix.mean(), "std": downmix.std()}
    return (audio - stats["mean"]) / stats["std"], stats
def denormalize_audio(audio: np.ndarray, norm_params: Dict[str, float]) -> np.ndarray:
    """
    Undo mean/std normalization on an audio signal.

    Rescales a previously normalized signal back to its original range using
    the statistics recorded at normalization time.

    Args:
        audio (np.ndarray): Normalized audio array to be denormalized.
        norm_params (Dict[str, float]): Must contain the "mean" and "std"
            values produced by `normalize_audio`.

    Returns:
        np.ndarray: Denormalized audio with the same shape as the input.
    """
    scale = norm_params["std"]
    offset = norm_params["mean"]
    return audio * scale + offset
def draw_spectrogram(waveform: np.ndarray, sample_rate: int, length: float, output_file: str) -> None:
    """
    Generate and save a spectrogram image from an audio waveform.

    Downmixes the waveform to mono, computes its Short-Time Fourier Transform
    (STFT), converts the magnitude spectrogram to dB scale, and renders it
    with a plasma colormap.

    Args:
        waveform (np.ndarray): Input audio waveform array of shape
            (time, channels) or (time,).
        sample_rate (int): Sampling rate of the waveform in Hz.
        length (float): Duration (in seconds) from the start of the waveform
            to include in the spectrogram.
        output_file (str): Path to save the resulting spectrogram image.
            If None, the figure is rendered but not saved.

    Returns:
        None
    """
    import librosa.display

    # Cut only the required leading part of the signal.
    x = waveform[:int(length * sample_rate), :]
    # STFT of the mono downmix, then amplitude -> dB relative to the peak.
    X = librosa.stft(x.mean(axis=-1))
    Xdb = librosa.amplitude_to_db(np.abs(X), ref=np.max)

    fig, ax = plt.subplots()
    img = librosa.display.specshow(
        Xdb,
        cmap='plasma',
        sr=sample_rate,
        x_axis='time',
        y_axis='linear',
        ax=ax
    )
    # The original titled the plot with basename(output_file) BEFORE checking
    # for None, crashing before the save guard; title only when a path exists.
    if output_file is not None:
        ax.set(title='File: ' + os.path.basename(output_file))
    fig.colorbar(img, ax=ax, format="%+2.f dB")
    if output_file is not None:
        fig.savefig(output_file)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)