Spaces:

WeixuanYuan
/

DiffuSynthV0.2

Sleeping

App Files Files Community

DiffuSynthV0.2 / webUI /natural_language_guided_4 /utils.py

WeixuanYuan

Upload 70 files

bd6e54b verified over 1 year ago

raw

history blame contribute delete

9.27 kB

	import librosa
	import numpy as np
	import torch
	from PIL import Image
	from tools import np_power_to_db, decode_stft, depad_STFT


	def spectrogram_to_Gradio_image(spc):
	    """Render a spectrogram as an RGB ``uint8`` image array for Gradio.

	    The last two axes of ``spc`` are taken as (frequency, time) and the input
	    is reshaped to exactly that 2-D shape, so any leading axes must have
	    size 1. The absolute value is passed through ``np_power_to_db``, flipped
	    vertically, and written to the red and green channels; the blue channel
	    is a constant.

	    Args:
	        spc (np.ndarray): spectrogram whose trailing axes are (frequency, time).

	    Returns:
	        np.ndarray: ``uint8`` array of shape (frequency, time, 3). A level of
	        -80 maps to 0 and a level of 0 maps to 255; levels outside that
	        range are clipped.
	    """
	    frequency_resolution, time_resolution = spc.shape[-2], spc.shape[-1]
	    spc = np.reshape(spc, (frequency_resolution, time_resolution))

	    # NOTE(review): np_power_to_db is applied to a magnitude, not a power;
	    # presumably its output lies in [-80, 0] dB -- confirm against tools.py.
	    log_spectrum = np_power_to_db(np.abs(spc))
	    flipped_log_spectrum = np.flipud(log_spectrum)

	    colorful_spc = np.empty((frequency_resolution, time_resolution, 3), dtype=np.float64)
	    colorful_spc[:, :, 0] = flipped_log_spectrum
	    colorful_spc[:, :, 1] = flipped_log_spectrum
	    colorful_spc[:, :, 2] = -60.0  # constant blue level

	    # Rescale [-80, 0] to [0, 1]. Clip before the uint8 cast: casting an
	    # out-of-range float to uint8 wraps around (or is platform dependent)
	    # instead of saturating.
	    rescaled = np.clip((colorful_spc + 80.0) / 80.0, 0.0, 1.0)
	    return (255.0 * rescaled).astype(np.uint8)


	def phase_to_Gradio_image(phase):
	    """Render a phase map as an RGB ``uint8`` image array for Gradio.

	    The last two axes of ``phase`` are taken as (frequency, time) and the
	    input is reshaped to exactly that 2-D shape. The map is flipped
	    vertically, rescaled from [-1, 1] to [0, 1], and written to the red and
	    green channels; the blue channel is the constant 0.2.

	    Args:
	        phase (np.ndarray): phase values whose trailing axes are
	            (frequency, time).

	    Returns:
	        np.ndarray: ``uint8`` array of shape (frequency, time, 3).
	    """
	    frequency_resolution, time_resolution = phase.shape[-2], phase.shape[-1]
	    phase = np.reshape(phase, (frequency_resolution, time_resolution))

	    normalized_phase = (np.flipud(phase) + 1.0) / 2.0
	    # Saturate instead of letting the uint8 cast wrap around for inputs
	    # outside [-1, 1].
	    # NOTE(review): callers in this file pass np.angle(...) output, which
	    # spans [-pi, pi]; confirm whether the scaling should use pi instead.
	    normalized_phase = np.clip(normalized_phase, 0.0, 1.0)

	    colorful_spc = np.zeros((frequency_resolution, time_resolution, 3))
	    colorful_spc[:, :, 0] = normalized_phase
	    colorful_spc[:, :, 1] = normalized_phase
	    colorful_spc[:, :, 2] = 0.2

	    # Rescale to 0-255 and convert to uint8.
	    return (255.0 * colorful_spc).astype(np.uint8)


	def latent_representation_to_Gradio_image(latent_representation):
	    """Render a (channels, height, width) latent as an enlarged ``uint8`` image.

	    Each channel is min-max normalised to [0, 255] independently. The result
	    is transposed to (height, width, channels), every pixel is repeated 8
	    times along both spatial axes, and the image is flipped vertically.

	    The input is never modified: normalisation runs on a private copy.

	    Args:
	        latent_representation: a 3-D ``np.ndarray``, or an object offering
	            ``.to("cpu").detach().numpy()`` (e.g. a ``torch.Tensor``).

	    Returns:
	        np.ndarray: ``uint8`` array of shape (8 * height, 8 * width, channels).
	        A channel whose values are all equal is rendered as zeros.
	    """
	    if not isinstance(latent_representation, np.ndarray):
	        latent_representation = latent_representation.to("cpu").detach().numpy()

	    # Copy first: for a CPU tensor, .numpy() shares memory with the tensor,
	    # so normalising in place would overwrite the caller's latent.
	    image = np.array(latent_representation, copy=True)
	    if not np.issubdtype(image.dtype, np.floating):
	        image = image.astype(np.float64)

	    for channel in image:  # each item is a writable view of one channel
	        min_val = channel.min()
	        value_range = channel.max() - min_val
	        if value_range > 0:
	            channel[...] = (channel - min_val) / value_range * 255
	        else:
	            # Constant (or NaN) channel: avoid dividing by zero.
	            channel[...] = 0

	    image_transposed = np.transpose(image, (1, 2, 0))
	    enlarged_image = np.repeat(image_transposed, 8, axis=0)
	    enlarged_image = np.repeat(enlarged_image, 8, axis=1)
	    return np.flipud(enlarged_image).astype(np.uint8)


	def InputBatch2Encode_STFT(encoder, STFT_batch, resolution=(512, 256), quantizer=None, squared=True):
	    """Encode a batch of STFT tensors and render every input item for display.

	    Args:
	        encoder: module exposing ``parameters()``; it is called on the batch
	            after the batch is moved to the device of its first parameter.
	            With a quantizer its return value is used directly as the latent
	            batch; without one it must return a 3-tuple whose last element
	            is the latent batch.
	        STFT_batch: tensor-like batch supporting ``.to(...)``, ``.detach()``
	            and ``.numpy()``.
	        resolution: 2-item sequence; unpacked but otherwise unused here.
	        quantizer: optional callable applied to the latent batch; it must
	            return ``(quantized, loss, (a, b, c))``.
	        squared: unused in this function.

	    Returns:
	        tuple: (spectrogram images, phase images, audio signals, latent
	        batch, quantized latent batch or ``None``). The first three are
	        lists with one entry per batch item.
	    """
	    # Todo: remove resolution hard-coding
	    _frequency_bins, _time_frames = resolution

	    encoder_device = next(encoder.parameters()).device
	    if quantizer is None:
	        _mu, _logvar, latents = encoder(STFT_batch.to(encoder_device))
	        quantized_latents = None
	    else:
	        latents = encoder(STFT_batch.to(encoder_device))
	        quantized_latents, _loss, (_, _, _) = quantizer(latents)

	    stft_arrays = STFT_batch.to("cpu").detach().numpy()

	    spectrogram_images = []
	    phase_images = []
	    signals = []
	    for stft_item in stft_arrays:
	        # decode_stft / depad_STFT come from the project's tools module;
	        # presumably they yield a complex STFT matrix -- TODO confirm.
	        complex_stft = depad_STFT(decode_stft(stft_item))
	        magnitude = np.abs(complex_stft)
	        angle = np.angle(complex_stft)

	        spectrogram_image = spectrogram_to_Gradio_image(magnitude)
	        phase_image = phase_to_Gradio_image(angle)

	        # Back to a time-domain signal.
	        signal = librosa.istft(complex_stft, hop_length=256, win_length=1024)

	        spectrogram_images.append(spectrogram_image)
	        phase_images.append(phase_image)
	        signals.append(signal)

	    return (
	        spectrogram_images,
	        phase_images,
	        signals,
	        latents,
	        quantized_latents,
	    )


	def _stft_to_gradio_outputs(STFT):
	    """Turn one encoded STFT array into (spectrogram image, phase image, audio signal)."""
	    # decode_stft / depad_STFT come from the project's tools module;
	    # presumably they yield a complex STFT matrix -- TODO confirm.
	    D_rec = depad_STFT(decode_stft(STFT))
	    spectrogram_image = spectrogram_to_Gradio_image(np.abs(D_rec))
	    phase_image = phase_to_Gradio_image(np.angle(D_rec))
	    rec_signal = librosa.istft(D_rec, hop_length=256, win_length=1024)
	    return spectrogram_image, phase_image, rec_signal


	def encodeBatch2GradioOutput_STFT(decoder, latent_vector_batch, resolution=(512, 256), original_STFT_batch=None):
	    """Decode a batch of latents and render each reconstruction for Gradio.

	    Every reconstruction is rendered twice: once as decoded, and once after
	    its channel 0 has been overwritten with channel 0 of the matching item of
	    ``original_STFT_batch``.

	    Args:
	        decoder: module exposing ``parameters()``; called on the latent batch.
	        latent_vector_batch: latent batch as a tensor, or as an
	            ``np.ndarray`` (converted and moved to the decoder's device).
	        resolution: kept for interface compatibility; unused.
	        original_STFT_batch: optional batch indexed as ``[item, 0, :, :]``;
	            may be an ``np.ndarray`` or a ``torch.Tensor`` on any device.

	    Returns:
	        tuple: six lists with one entry per batch item -- spectrogram images,
	        phase images and audio signals for the plain reconstruction, then
	        the same three for the variant using the original channel 0. When
	        ``original_STFT_batch`` is ``None`` the last three lists hold the
	        same entries as the first three.
	    """
	    if isinstance(latent_vector_batch, np.ndarray):
	        latent_vector_batch = torch.from_numpy(latent_vector_batch).to(next(decoder.parameters()).device)

	    # The output is detached and converted to NumPy straight away, so no
	    # autograd graph is needed.
	    with torch.no_grad():
	        reconstruction_batch = decoder(latent_vector_batch).to("cpu").detach().numpy()

	    # A tensor (possibly on an accelerator or requiring grad) cannot be
	    # assigned into a NumPy array reliably; convert it once up front.
	    if isinstance(original_STFT_batch, torch.Tensor):
	        original_STFT_batch = original_STFT_batch.to("cpu").detach().numpy()

	    flipped_log_spectrums, flipped_phases, rec_signals = [], [], []
	    flipped_log_spectrums_with_original_amp, flipped_phases_with_original_amp, rec_signals_with_original_amp = [], [], []

	    for index, STFT in enumerate(reconstruction_batch):
	        outputs = _stft_to_gradio_outputs(STFT)
	        flipped_log_spectrums.append(outputs[0])
	        flipped_phases.append(outputs[1])
	        rec_signals.append(outputs[2])

	        if original_STFT_batch is not None:
	            # Swap in channel 0 of the original, then render again.
	            STFT[0, :, :] = original_STFT_batch[index, 0, :, :]
	            outputs = _stft_to_gradio_outputs(STFT)
	        # Without an original batch the STFT is unchanged, so the first
	        # rendering is reused instead of being recomputed.

	        flipped_log_spectrums_with_original_amp.append(outputs[0])
	        flipped_phases_with_original_amp.append(outputs[1])
	        rec_signals_with_original_amp.append(outputs[2])

	    return flipped_log_spectrums, flipped_phases, rec_signals, \
	        flipped_log_spectrums_with_original_amp, flipped_phases_with_original_amp, rec_signals_with_original_amp



	def add_instrument(source_dict, virtual_instruments_dict, virtual_instrument_name, sample_index):
	    """Register one generated sample as a named virtual instrument.

	    Args:
	        source_dict: mapping holding the per-sample sequences and the shared
	            ``"sampler"`` entry read below.
	        virtual_instruments_dict: mapping whose ``"virtual_instruments"``
	            entry is the name -> instrument store; it is updated in place.
	        virtual_instrument_name: key under which the instrument is stored;
	            an existing entry with the same name is replaced.
	        sample_index: index of the sample to pick from each per-sample
	            sequence.

	    Returns:
	        The same ``virtual_instruments_dict`` object that was passed in.
	    """
	    instrument_store = virtual_instruments_dict["virtual_instruments"]

	    # (instrument key, source key, whether the source is indexed per sample)
	    field_sources = (
	        ("latent_representation", "latent_representations", True),
	        ("quantized_latent_representation", "quantized_latent_representations", True),
	        ("sampler", "sampler", False),
	        ("signal", "new_sound_rec_signals_gradio", True),
	        ("spectrogram_gradio_image", "new_sound_spectrogram_gradio_images", True),
	        ("phase_gradio_image", "new_sound_phase_gradio_images", True),
	    )

	    instrument = {}
	    for instrument_key, source_key, per_sample in field_sources:
	        source_value = source_dict[source_key]
	        instrument[instrument_key] = source_value[sample_index] if per_sample else source_value

	    instrument_store[virtual_instrument_name] = instrument
	    virtual_instruments_dict["virtual_instruments"] = instrument_store
	    return virtual_instruments_dict


	def resize_image_to_aspect_ratio(image_data, aspect_ratio_width, aspect_ratio_height):
	    """
	    Stretch an image to a given width:height ratio, taking and returning NumPy arrays.

	    One side keeps its size and the other is stretched so that the result
	    has the requested aspect ratio.

	    Args:
	        image_data (numpy array): input image data, shape (height, width, channels)
	        aspect_ratio_width (int): width part of the target ratio
	        aspect_ratio_height (int): height part of the target ratio

	    Returns:
	        numpy array: the resized image data
	    """
	    # Current size; the shape must have exactly three axes.
	    source_height, source_width, _channels = image_data.shape

	    # Aspect ratio of the input and of the requested output.
	    source_ratio = source_width / source_height
	    target_ratio = aspect_ratio_width / aspect_ratio_height

	    if source_ratio > target_ratio:
	        # The input is relatively too wide: keep the width, stretch the height.
	        output_size = (source_width, int(source_width / target_ratio))
	    else:
	        # The input is relatively too tall (or already matches): keep the
	        # height, stretch the width.
	        output_size = (int(source_height * target_ratio), source_height)

	    # PIL does the resampling; it takes the size as (width, height).
	    # Image.Resampling.LANCZOS replaces the older ANTIALIAS constant.
	    pil_image = Image.fromarray(image_data.astype('uint8'))
	    stretched = pil_image.resize(output_size, Image.Resampling.LANCZOS)

	    # Hand the result back as a NumPy array.
	    return np.array(stretched)


	def average_np_arrays(arr_list):
	    """Return the element-wise mean of a non-empty list of same-shaped arrays.

	    Args:
	        arr_list: list of arrays that ``np.stack`` can combine along a new
	            leading axis.

	    Returns:
	        np.ndarray: the mean taken over that leading axis.

	    Raises:
	        ValueError: if ``arr_list`` is empty.
	    """
	    if arr_list:
	        return np.stack(arr_list, axis=0).mean(axis=0)
	    raise ValueError("Input list cannot be empty")