YingMusic-Singer-Plus

Running

YingMusic-Singer-Plus / src /third_party /MusicSourceSeparationTraining /utils /audio_utils.py

64ec292 4 months ago

12.5 kB

	import os
	from typing import Dict, Optional, Tuple

	import matplotlib.pyplot as plt
	import numpy as np
	import soundfile as sf
	import torch.distributed as dist


	def read_audio_transposed(
	    path: str, instr: Optional[str] = None, skip_err: bool = False
	) -> Tuple[Optional[np.ndarray], Optional[int]]:
	    """
	    Read an audio file and return its samples in channels-first layout.

	    The file at `path` is loaded with `sf.read`. A 1-D (mono) result is
	    expanded to 2-D, and the array is transposed before being returned, so
	    the result has shape (channels, length). Read failures are either
	    re-raised as `RuntimeError` or skipped, depending on `skip_err`.

	    Args:
	        path (str): Path to the audio file to load.
	        instr (Optional[str], optional): Instrument name, only used in the
	            message printed when a file is skipped. Defaults to None.
	        skip_err (bool, optional): If True, a file that cannot be read is
	            skipped and `(None, None)` is returned instead of raising.
	            Defaults to False.

	    Returns:
	        Tuple[Optional[np.ndarray], Optional[int]]: A tuple containing:
	            - NumPy array of shape (channels, length), or None if skipped.
	            - Sampling rate as returned by the reader, or None if skipped.

	    Raises:
	        RuntimeError: If the file cannot be read and `skip_err` is False.
	            The underlying exception is attached as `__cause__`.
	    """

	    # When torch.distributed is initialised, only rank 0 prints the skip
	    # message so that the log is not duplicated once per process.
	    should_print = not dist.is_initialized() or dist.get_rank() == 0

	    try:
	        mix, sr = sf.read(path)
	    except Exception as e:
	        if skip_err:
	            if should_print:
	                print(f"No stem {instr}: skip!")
	            return None, None
	        # Chain explicitly so the original read error stays in the traceback.
	        raise RuntimeError(f"Error reading the file at {path}: {e}") from e

	    if mix.ndim == 1:  # mono audio: add a channel axis before transposing
	        mix = np.expand_dims(mix, axis=-1)
	    return mix.T, sr


	def normalize_audio(audio: np.ndarray) -> Tuple[np.ndarray, Dict[str, float]]:
	    """
	    Normalize an audio signal using mean and standard deviation.

	    The statistics are computed on the mono mix of the input (the average
	    over axis 0 for multi-channel input, the signal itself for 1-D input)
	    and then applied identically to every channel.

	    Args:
	        audio (np.ndarray): Input audio array of shape (channels, time) or
	            (time,).

	    Returns:
	        Tuple[np.ndarray, Dict[str, float]]: A tuple containing:
	            - Normalized audio with the same shape as the input.
	            - A dictionary with keys "mean" and "std" holding the offset
	              and scale that were applied. "std" is 1 when the mono mix
	              has zero variance, so that `audio * std + mean` always
	              restores the input.
	    """

	    # A 1-D signal is already mono; averaging it over axis 0 would collapse
	    # it to a single scalar and make the standard deviation zero.
	    mono = audio.mean(0) if audio.ndim > 1 else audio
	    mean, std = mono.mean(), mono.std()

	    # A constant (e.g. silent) mono mix has std == 0; dividing by it would
	    # fill the output with NaN/inf, so fall back to a unit scale.
	    if std == 0:
	        std = type(std)(1.0)

	    return (audio - mean) / std, {"mean": mean, "std": std}


	def denormalize_audio(audio: np.ndarray, norm_params: Dict[str, float]) -> np.ndarray:
	    """
	    Undo a mean/std normalization.

	    Multiplies the signal by the stored standard deviation and then adds the
	    stored mean, i.e. the inverse of `(audio - mean) / std`.

	    Args:
	        audio (np.ndarray): Normalized audio array.
	        norm_params (Dict[str, float]): Mapping that provides the "std" and
	            "mean" values to apply.

	    Returns:
	        np.ndarray: Rescaled and shifted audio with the same shape as the
	        input.
	    """

	    scale = norm_params["std"]
	    offset = norm_params["mean"]
	    restored = audio * scale
	    return restored + offset


	def draw_spectrogram(
	    waveform: np.ndarray, sample_rate: int, length: float, output_file: str
	) -> None:
	    """
	    Generate and save a spectrogram image from an audio waveform.

	    Trims the waveform to its first `length` seconds, mixes it down to mono,
	    computes the Short-Time Fourier Transform (STFT), converts the amplitude
	    spectrogram to dB relative to its maximum, and plots it with a plasma
	    colormap on a linear frequency axis. The figure is always closed before
	    returning, including when plotting or saving fails.

	    Args:
	        waveform (np.ndarray): Input audio waveform array of shape
	            (time, channels) or (time,).
	        sample_rate (int): Sampling rate of the waveform in Hz.
	        length (float): Duration (in seconds) of the waveform to include in
	            the spectrogram.
	        output_file (str): Path to save the resulting spectrogram image. If
	            None, the figure is rendered without a title and not saved.

	    Returns:
	        None
	    """

	    import librosa.display

	    # Cut only the required part of the signal. Slicing the leading axis
	    # alone works for both (time, channels) and (time,) input.
	    x = waveform[: int(length * sample_rate)]
	    mono = x.mean(axis=-1) if x.ndim > 1 else x

	    X = librosa.stft(mono)  # short-time Fourier transform of the mono signal
	    Xdb = librosa.amplitude_to_db(np.abs(X), ref=np.max)  # amplitude -> dB

	    fig, ax = plt.subplots()
	    try:
	        img = librosa.display.specshow(
	            Xdb, cmap="plasma", sr=sample_rate, x_axis="time", y_axis="linear", ax=ax
	        )
	        fig.colorbar(img, ax=ax, format="%+2.f dB")
	        if output_file is not None:
	            # The title is derived from the file name, so it is only set
	            # when a file name exists.
	            ax.set(title="File: " + os.path.basename(output_file))
	            fig.savefig(output_file)
	    finally:
	        # Release the figure explicitly; pyplot keeps figures alive until
	        # they are closed.
	        plt.close(fig)


	def _mono_for_plot(waveform: np.ndarray, sample_rate: int) -> Tuple[np.ndarray, int]:
	    """Return a mono version of `waveform` and the sample rate to plot it at."""
	    mono_signal = waveform.mean(axis=-1) if waveform.ndim > 1 else waveform

	    # Signals longer than 60 seconds are thinned by keeping every second
	    # sample (plain slicing, no filtering), which halves the effective rate.
	    if len(mono_signal) > 60 * sample_rate:
	        return mono_signal[::2], sample_rate // 2
	    return mono_signal, sample_rate


	def _save_mel_comparison(processed_waveforms, titles, output_base: str) -> None:
	    """Plot one mel-spectrogram per signal, stacked, and save `<output_base>_spectrograms.jpg`."""
	    import librosa.display

	    fig_spec, axes_spec = plt.subplots(2, 1, figsize=(16, 10))
	    try:
	        for ax, (mono_signal, effective_sr), title in zip(
	            axes_spec, processed_waveforms, titles
	        ):
	            S = librosa.feature.melspectrogram(
	                y=mono_signal, sr=effective_sr, n_mels=128
	            )
	            S_db = librosa.power_to_db(S, ref=np.max)

	            # No colorbar is drawn, so the returned mesh is not kept.
	            librosa.display.specshow(
	                S_db,
	                cmap="plasma",
	                sr=effective_sr,
	                x_axis="time",
	                y_axis="mel",
	                ax=ax,
	            )
	            ax.set_title(f"Mel-spectrogram: {title}", fontsize=14, fontweight="bold")
	            ax.set_xlabel("Time (seconds)", fontsize=12)
	            ax.set_ylabel("Frequency (Mel)", fontsize=12)

	        fig_spec.suptitle(
	            f"Mel-spectrograms: {os.path.basename(output_base)}",
	            fontsize=16,
	            fontweight="bold",
	            y=0.98,
	        )
	        fig_spec.tight_layout()
	        fig_spec.subplots_adjust(top=0.94, hspace=0.4, right=0.88)
	        fig_spec.savefig(
	            f"{output_base}_spectrograms.jpg", dpi=150, bbox_inches="tight"
	        )
	    finally:
	        plt.close(fig_spec)


	def _save_waveform_comparison(processed_waveforms, titles, output_base: str) -> None:
	    """Plot one waveform per signal, stacked, and save `<output_base>_waveforms.jpg`."""
	    fig_wave, axes_wave = plt.subplots(2, 1, figsize=(16, 8))
	    try:
	        for ax, (mono_signal, effective_sr), title in zip(
	            axes_wave, processed_waveforms, titles
	        ):
	            time = np.linspace(0, len(mono_signal) / effective_sr, len(mono_signal))

	            if len(mono_signal) > 100000:
	                # Draw only every 10th sample of the line for long signals.
	                plot_indices = np.arange(0, len(mono_signal), 10)
	                ax.plot(
	                    time[plot_indices],
	                    mono_signal[plot_indices],
	                    color="#00ff88",
	                    alpha=0.9,
	                    linewidth=0.5,
	                )
	            else:
	                ax.plot(time, mono_signal, color="#00ff88", alpha=0.9, linewidth=0.8)

	            # The filled area always uses the full-resolution signal.
	            ax.fill_between(time, mono_signal, alpha=0.3, color="#00ff8833")
	            ax.set_xlabel("Time (seconds)", fontsize=12)
	            ax.set_ylabel("Amplitude", fontsize=12)
	            ax.set_title(f"Waveform: {title}", fontsize=14, fontweight="bold")
	            ax.grid(True, alpha=0.3, color="gray")
	            ax.set_xlim(0, time[-1])

	        fig_wave.suptitle(
	            f"Waveforms: {os.path.basename(output_base)}",
	            fontsize=16,
	            fontweight="bold",
	            y=0.98,
	        )
	        fig_wave.tight_layout()
	        fig_wave.subplots_adjust(top=0.94, hspace=0.4)
	        fig_wave.savefig(f"{output_base}_waveforms.jpg", dpi=150, bbox_inches="tight")
	    finally:
	        plt.close(fig_wave)


	def draw_2_mel_spectrogram(
	    estimates_waveform: np.ndarray,
	    track_waveform: np.ndarray,
	    sample_rate: int,
	    length: float,
	    output_base: str,
	) -> None:
	    """
	    Save comparison images (mel-spectrograms and waveforms) for an
	    estimated and an original audio signal.

	    Two JPEG files are written:
	        - `<output_base>_spectrograms.jpg`: mel-spectrograms, estimated on
	          top and original below.
	        - `<output_base>_waveforms.jpg`: waveforms in the same order.

	    Each input is mixed down to mono over its last axis when it has more
	    than one dimension. Signals longer than 60 seconds are thinned to every
	    second sample before plotting. Both figures are closed even if plotting
	    or saving raises.

	    Args:
	        estimates_waveform (np.ndarray): Estimated audio waveform.
	        track_waveform (np.ndarray): Original audio waveform.
	        sample_rate (int): Sampling rate in Hz.
	        length (float): Accepted for interface compatibility; the current
	            implementation does not use it and plots the full signals.
	        output_base (str): Base path for output files (without extension).

	    Returns:
	        None
	    """
	    titles = ["Estimated", "Original"]

	    # (mono signal, effective sample rate) for each input, in title order.
	    processed_waveforms = [
	        _mono_for_plot(waveform, sample_rate)
	        for waveform in (estimates_waveform, track_waveform)
	    ]

	    _save_mel_comparison(processed_waveforms, titles, output_base)
	    _save_waveform_comparison(processed_waveforms, titles, output_base)


	def draw_mel_spectrogram(
	    waveform: np.ndarray, sample_rate: int, length: float, output_file: str
	) -> None:
	    """
	    Generate and save a mel-spectrogram image and a waveform image.

	    Mixes the waveform down to mono, computes its mel-spectrogram, converts
	    the power values to dB relative to the maximum, and plots the result
	    with a plasma colormap on a mel frequency axis. Afterwards a waveform
	    plot is written next to it via `plot_waveform_basic`, using the same
	    file name with `_waveform` inserted before the extension.

	    Args:
	        waveform (np.ndarray): Input audio waveform array of shape
	            (time, channels) or (time,).
	        sample_rate (int): Sampling rate of the waveform in Hz.
	        length (float): Accepted for interface compatibility; the current
	            implementation does not use it and plots the full signal.
	        output_file (str): Path to save the resulting spectrogram image. If
	            None, nothing is written.

	    Returns:
	        None
	    """

	    import librosa.display

	    # A 1-D signal is already mono; averaging its only axis would reduce it
	    # to a scalar.
	    mono = waveform.mean(axis=-1) if waveform.ndim > 1 else waveform

	    S = librosa.feature.melspectrogram(y=mono, sr=sample_rate)
	    S_db = librosa.power_to_db(S, ref=np.max)  # power -> dB

	    fig, ax = plt.subplots()
	    try:
	        img = librosa.display.specshow(
	            S_db, cmap="plasma", sr=sample_rate, x_axis="time", y_axis="mel", ax=ax
	        )
	        fig.colorbar(img, ax=ax, format="%+2.f dB")
	        if output_file is not None:
	            ax.set(title="Mel-spectrogram: " + os.path.basename(output_file))
	            fig.savefig(output_file)
	    finally:
	        plt.close(fig)

	    if output_file is None:
	        return

	    # Split off the real extension instead of replacing the text ".jpg":
	    # a plain replace leaves non-.jpg names unchanged (the waveform image
	    # would then overwrite the spectrogram) and also rewrites ".jpg"
	    # wherever it occurs in directory names.
	    root, ext = os.path.splitext(output_file)
	    plot_waveform_basic(waveform, sample_rate, f"{root}_waveform{ext}")


	def plot_waveform_basic(
	    waveform: np.ndarray,
	    samplerate: int,
	    output_path: Optional[str] = None,
	    theme: str = "dark",
	) -> None:
	    """
	    Plot a waveform as a filled line chart and optionally save it.

	    Multi-dimensional input is averaged over axis 1 to obtain a single
	    signal, which is drawn against a time axis in seconds. The figure is
	    always closed before returning, including when plotting or saving
	    fails.

	    Args:
	        waveform (np.ndarray): Audio samples, 1-D or with channels on
	            axis 1.
	        samplerate (int): Sampling rate in Hz, used to build the time axis.
	        output_path (Optional[str], optional): Where to save the image.
	            Nothing is written when it is None or empty. Defaults to None.
	        theme (str, optional): One of "dark", "light" or "purple". Any other
	            value falls back to the "dark" theme. Defaults to "dark".

	    Returns:
	        None
	    """
	    # "text" is the tick/label colour that is readable on each background.
	    themes = {
	        "dark": {"bg": "#0f0f0f", "wave": "#00ff88", "fill": "#00ff8833", "text": "white"},
	        "light": {"bg": "white", "wave": "#2563eb", "fill": "#3b82f633", "text": "black"},
	        "purple": {"bg": "#1a1a2e", "wave": "#e94560", "fill": "#e9456033", "text": "white"},
	    }
	    # Resolve the palette once so that an unknown theme name gets a
	    # consistent dark palette, including its text colour.
	    colors = themes.get(theme, themes["dark"])

	    data = waveform
	    if data.ndim > 1:
	        data = np.mean(data, axis=1)

	    fig, ax = plt.subplots(figsize=(12, 3), facecolor=colors["bg"])
	    try:
	        time = np.linspace(0, len(data) / samplerate, len(data))

	        ax.plot(time, data, color=colors["wave"], alpha=0.9, linewidth=0.8)
	        ax.fill_between(time, data, alpha=0.3, color=colors["fill"])

	        ax.set_facecolor(colors["bg"])
	        ax.tick_params(colors=colors["text"], labelsize=8)
	        ax.set_xlabel("Time (seconds)", color=colors["text"], fontsize=10)
	        ax.set_ylabel("Amplitude", color=colors["text"], fontsize=10)

	        ax.grid(True, alpha=0.2, color="gray")
	        if time.size:  # an empty signal has no last time stamp to clamp to
	            ax.set_xlim(0, time[-1])

	        fig.tight_layout()

	        if output_path:
	            fig.savefig(
	                output_path,
	                dpi=200,
	                bbox_inches="tight",
	                facecolor=colors["bg"],
	                edgecolor="none",
	            )
	    finally:
	        # Close this specific figure rather than whichever one is current.
	        plt.close(fig)