Voice-Clone-Multilingual

Running

App Files Files Community

Voice-Clone-Multilingual / TTS /tts /utils /visual.py

Shadhil

voice-clone with single audio sample input

9b2107c over 2 years ago

raw

history blame

6.67 kB

	import librosa
	import matplotlib
	import matplotlib.pyplot as plt
	import numpy as np
	import torch
	from matplotlib.colors import LogNorm

	# Select matplotlib's "Agg" backend at import time so every figure built by
	# this module is rendered off-screen (Agg is a non-interactive raster backend).
	matplotlib.use("Agg")


	def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None, output_fig=False, plot_log=False):
	    """Render an alignment matrix as a heat map and return the figure.

	    Args:
	        alignment (torch.Tensor | np.ndarray): Alignment values. A tensor is
	            detached, moved to CPU, converted to numpy and squeezed; any
	            other input is used as given. The matrix is transposed before
	            display, so its first axis runs along the x-axis
	            ("Decoder timestep") and its second along the y-axis
	            ("Encoder timestep").
	        info (str, optional): Extra text appended under the x-axis label.
	        fig_size (tuple): Figure size handed to ``plt.subplots``.
	        title (str, optional): Title drawn above the plot.
	        output_fig (bool): When False the figure is closed (released from
	            pyplot's registry) before it is returned.
	        plot_log (bool): When True the colour scale uses ``LogNorm``.

	    Returns:
	        matplotlib.figure.Figure: The figure holding the heat map.
	    """
	    if isinstance(alignment, torch.Tensor):
	        alignment_ = alignment.detach().cpu().numpy().squeeze()
	    else:
	        alignment_ = alignment
	    # Half-precision input is promoted to float32 before plotting
	    # (presumably because the image pipeline does not handle float16 -- confirm).
	    if alignment_.dtype == np.float16:
	        alignment_ = alignment_.astype(np.float32)

	    fig, ax = plt.subplots(figsize=fig_size)
	    im = ax.imshow(
	        alignment_.T,
	        aspect="auto",
	        origin="lower",
	        interpolation="none",
	        norm=LogNorm() if plot_log else None,
	    )
	    fig.colorbar(im, ax=ax)

	    xlabel = "Decoder timestep"
	    if info is not None:
	        xlabel += "\n\n" + info
	    ax.set_xlabel(xlabel)
	    ax.set_ylabel("Encoder timestep")
	    # The title is set before tight_layout() so the layout pass can reserve
	    # room for it instead of computing margins without it.
	    if title is not None:
	        ax.set_title(title)
	    fig.tight_layout()

	    if not output_fig:
	        # Close this specific figure rather than whatever is "current".
	        plt.close(fig)
	    return fig


	def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10), output_fig=False):
	    """Draw a spectrogram as an image and return the figure.

	    Args:
	        spectrogram (torch.Tensor | np.ndarray): Values to plot. A tensor is
	            detached, moved to CPU, converted to numpy and squeezed first.
	            In both cases the array is transposed before display.
	        ap (optional): Object exposing ``denormalize``; when given, the
	            transposed array is passed through it before plotting.
	        fig_size (tuple): Figure size handed to ``plt.figure``.
	        output_fig (bool): When False the current figure is closed before
	            the figure object is returned.

	    Returns:
	        matplotlib.figure.Figure: The figure holding the image.
	    """
	    from_tensor = isinstance(spectrogram, torch.Tensor)
	    data = spectrogram.detach().cpu().numpy().squeeze() if from_tensor else spectrogram
	    data = data.T
	    # float16 input is promoted to float32 before any further processing.
	    if data.dtype == np.float16:
	        data = data.astype(np.float32)
	    if ap is not None:
	        data = ap.denormalize(data)

	    fig = plt.figure(figsize=fig_size)
	    plt.imshow(data, aspect="auto", origin="lower")
	    plt.colorbar()
	    plt.tight_layout()
	    if output_fig:
	        return fig
	    plt.close()
	    return fig


	def plot_pitch(pitch, spectrogram, ap=None, fig_size=(30, 10), output_fig=False):
	    """Plot pitch curves on top of the spectrogram.

	    Args:
	        pitch (np.array): Pitch values, drawn as a red line on a secondary
	            y-axis labelled "F0".
	        spectrogram (torch.Tensor | np.array): Spectrogram values. A tensor
	            is detached, moved to CPU, converted to numpy and squeezed. The
	            array is transposed before display, so its first axis runs along
	            the x-axis (labelled "time").
	        ap (optional): Object exposing ``denormalize``; when given, the
	            transposed spectrogram is passed through it before plotting.
	        fig_size (tuple, optional): Figure size. ``None`` falls back to
	            matplotlib's configured default.
	        output_fig (bool): When False the figure is closed before it is
	            returned.

	    Shapes:
	        pitch: :math:`(T,)`
	        spec: NOTE(review): previously documented as :math:`(C, T)`, but the
	            input is transposed and the x-axis is labelled "time", which
	            suggests :math:`(T, C)` -- confirm against callers.

	    Returns:
	        matplotlib.figure.Figure: The figure holding both plots.
	    """
	    if isinstance(spectrogram, torch.Tensor):
	        spectrogram_ = spectrogram.detach().cpu().numpy().squeeze().T
	    else:
	        spectrogram_ = spectrogram.T
	    if spectrogram_.dtype == np.float16:
	        spectrogram_ = spectrogram_.astype(np.float32)
	    if ap is not None:
	        spectrogram_ = ap.denormalize(spectrogram_)

	    # Pass the size straight to the figure instead of temporarily rewriting
	    # the global rcParams: an exception while plotting can then no longer
	    # leave the process-wide default figure size modified.
	    fig, ax = plt.subplots(figsize=fig_size)

	    ax.imshow(spectrogram_, aspect="auto", origin="lower")
	    ax.set_xlabel("time")
	    ax.set_ylabel("spec_freq")

	    # Second y-axis sharing the same x-axis, used for the pitch contour.
	    ax2 = ax.twinx()
	    ax2.plot(pitch, linewidth=5.0, color="red")
	    ax2.set_ylabel("F0")

	    if not output_fig:
	        plt.close(fig)
	    return fig


	def plot_avg_pitch(pitch, chars, fig_size=(30, 10), output_fig=False):
	    """Plot pitch curves on top of the input characters.

	    Args:
	        pitch (np.array): Pitch values, drawn as a red line on a secondary
	            y-axis labelled "F0".
	        chars (str): Characters to place to the x-axis; one tick per
	            character at positions ``0 .. len(chars) - 1``.
	        fig_size (tuple, optional): Figure size. ``None`` falls back to
	            matplotlib's configured default.
	        output_fig (bool): When False the figure is closed before it is
	            returned.

	    Shapes:
	        pitch: :math:`(T,)`

	    Returns:
	        matplotlib.figure.Figure: The figure holding the plot.
	    """
	    # Pass the size straight to the figure instead of temporarily rewriting
	    # the global rcParams: an exception while plotting can then no longer
	    # leave the process-wide default figure size modified.
	    fig, ax = plt.subplots(figsize=fig_size)

	    # One labelled tick per input character, set on the explicit axes
	    # rather than through pyplot's "current axes" state.
	    ax.set_xticks(np.arange(len(chars)))
	    ax.set_xticklabels(list(chars))

	    ax.set_xlabel("characters")
	    ax.set_ylabel("freq")

	    # Second y-axis sharing the same x-axis, used for the pitch contour.
	    ax2 = ax.twinx()
	    ax2.plot(pitch, linewidth=5.0, color="red")
	    ax2.set_ylabel("F0")

	    if not output_fig:
	        plt.close(fig)
	    return fig


	def plot_avg_energy(energy, chars, fig_size=(30, 10), output_fig=False):
	    """Plot energy curves on top of the input characters.

	    Args:
	        energy (np.array): Energy values, drawn as a red line on a secondary
	            y-axis labelled "energy".
	        chars (str): Characters to place to the x-axis; one tick per
	            character at positions ``0 .. len(chars) - 1``.
	        fig_size (tuple, optional): Figure size. ``None`` falls back to
	            matplotlib's configured default.
	        output_fig (bool): When False the figure is closed before it is
	            returned.

	    Shapes:
	        energy: :math:`(T,)`

	    Returns:
	        matplotlib.figure.Figure: The figure holding the plot.
	    """
	    # Pass the size straight to the figure instead of temporarily rewriting
	    # the global rcParams: an exception while plotting can then no longer
	    # leave the process-wide default figure size modified.
	    fig, ax = plt.subplots(figsize=fig_size)

	    # One labelled tick per input character, set on the explicit axes
	    # rather than through pyplot's "current axes" state.
	    ax.set_xticks(np.arange(len(chars)))
	    ax.set_xticklabels(list(chars))

	    ax.set_xlabel("characters")
	    # NOTE(review): "freq" on the left axis looks carried over from the
	    # pitch plot; kept unchanged so rendered output stays the same.
	    ax.set_ylabel("freq")

	    # Second y-axis sharing the same x-axis, used for the energy contour.
	    ax2 = ax.twinx()
	    ax2.plot(energy, linewidth=5.0, color="red")
	    ax2.set_ylabel("energy")

	    if not output_fig:
	        plt.close(fig)
	    return fig


	def visualize(
	    alignment,
	    postnet_output,
	    text,
	    hop_length,
	    CONFIG,
	    tokenizer,
	    stop_tokens=None,
	    decoder_output=None,
	    output_path=None,
	    figsize=(8, 24),
	    output_fig=False,
	):
	    """Intended to be used in Notebooks.

	    Builds one figure with vertically stacked panels: the alignment heat map
	    (row 1), the stop-token curve (row 2, left empty when ``stop_tokens`` is
	    None), the postnet spectrogram (row 3) and, when ``decoder_output`` is
	    given, the decoder spectrogram (row 4).

	    Args:
	        alignment: Alignment matrix; transposed before display.
	        postnet_output: Spectrogram shown in row 3; transposed before display.
	        text (str): Input text whose characters label the alignment y-axis.
	            When ``CONFIG.use_phonemes`` is set it is first round-tripped
	            through ``tokenizer.text_to_ids`` / ``tokenizer.ids_to_text``.
	        hop_length (int): Hop length forwarded to ``librosa.display.specshow``.
	        CONFIG: Configuration object; ``use_phonemes`` and
	            ``audio["sample_rate"]``, ``audio["mel_fmin"]``,
	            ``audio["mel_fmax"]`` are read.
	        tokenizer: Object exposing ``text_to_ids`` and ``ids_to_text``; only
	            used when ``CONFIG.use_phonemes`` is truthy.
	        stop_tokens (optional): Stop-token values plotted in row 2.
	        decoder_output (optional): Spectrogram shown in row 4.
	        output_path (optional): When truthy, the figure is saved there and
	            then closed.
	        figsize (tuple): Figure size handed to ``plt.figure``.
	        output_fig (bool): When False the figure is closed before returning.

	    Returns:
	        matplotlib.figure.Figure: The assembled figure. (Earlier versions
	        returned None; returning the figure matches the other plot helpers
	        in this module and is harmless for callers that ignore the result.)
	    """
	    # ``import librosa`` alone does not guarantee that the ``display``
	    # submodule is loaded on every librosa release; import it explicitly.
	    import librosa.display  # pylint: disable=import-outside-toplevel

	    num_plot = 4 if decoder_output is not None else 3
	    label_fontsize = 16
	    fig = plt.figure(figsize=figsize)

	    plt.subplot(num_plot, 1, 1)
	    # "none" (string) disables interpolation, matching plot_alignment;
	    # the previous ``None`` merely selected matplotlib's default.
	    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation="none")
	    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
	    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
	    # compute phoneme representation and back
	    if CONFIG.use_phonemes:
	        seq = tokenizer.text_to_ids(text)
	        text = tokenizer.ids_to_text(seq)
	        print(text)
	    plt.yticks(range(len(text)), list(text))
	    plt.colorbar()

	    if stop_tokens is not None:
	        # plot stopnet predictions
	        plt.subplot(num_plot, 1, 2)
	        plt.plot(range(len(stop_tokens)), list(stop_tokens))

	    # Row 3 always shows the postnet spectrogram; row 4 shows the decoder
	    # spectrogram when one was supplied. Both panels are drawn identically.
	    panels = [(3, postnet_output)]
	    if decoder_output is not None:
	        panels.append((4, decoder_output))
	    for row, spec in panels:
	        plt.subplot(num_plot, 1, row)
	        librosa.display.specshow(
	            spec.T,
	            sr=CONFIG.audio["sample_rate"],
	            hop_length=hop_length,
	            x_axis="time",
	            y_axis="linear",
	            fmin=CONFIG.audio["mel_fmin"],
	            fmax=CONFIG.audio["mel_fmax"],
	        )
	        plt.xlabel("Time", fontsize=label_fontsize)
	        plt.ylabel("Hz", fontsize=label_fontsize)
	        plt.tight_layout()
	        plt.colorbar()

	    if output_path:
	        print(output_path)
	        fig.savefig(output_path)

	    # Close exactly once, and close *this* figure. The previous code called
	    # plt.close() twice when output_path was set and output_fig was False;
	    # the second call could close an unrelated, still-open figure.
	    if output_path or not output_fig:
	        plt.close(fig)
	    return fig