infinisoft
/

tts

Model card Files Files and versions

tts / TTS /vc /modules /freevc /speaker_encoder /audio.py

abuzahid's picture

Upload 542 files

127d53c almost 3 years ago

history blame contribute delete

2.5 kB

	import struct
	from pathlib import Path
	from typing import Optional, Union

	# import webrtcvad
	import librosa
	import numpy as np
	from scipy.ndimage.morphology import binary_dilation

	from TTS.vc.modules.freevc.speaker_encoder.hparams import *

	# Largest value representable by a signed 16-bit integer (32767).
	int16_max = (2**15) - 1


	def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], source_sr: Optional[int] = None):
	    """
	    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
	    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.

	    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
	    just .wav), or the waveform as a numpy array of floats.
	    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
	    preprocessing. After preprocessing, the waveform's sampling rate will match the data
	    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
	    this argument will be ignored. If None is given together with an in-memory waveform, no
	    resampling is performed.
	    :return: the resampled, volume-normalized and silence-trimmed waveform.
	    """
	    # Load the wav from disk if needed; sr=None keeps the file's native sampling rate,
	    # which then overrides whatever source_sr the caller passed.
	    if isinstance(fpath_or_wav, (str, Path)):
	        wav, source_sr = librosa.load(fpath_or_wav, sr=None)
	    else:
	        wav = fpath_or_wav

	    # Resample the wav if needed. The rates are passed by keyword because recent librosa
	    # releases no longer accept them positionally.
	    if source_sr is not None and source_sr != sampling_rate:
	        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)

	    # Apply the preprocessing: normalize volume (only ever amplifying) and shorten long silences
	    wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
	    # NOTE(review): trim_long_silences is not defined in the visible part of this file;
	    # confirm it is provided elsewhere (e.g. by the hparams wildcard import).
	    wav = trim_long_silences(wav)

	    return wav


	def wav_to_mel_spectrogram(wav):
	    """
	    Computes the mel spectrogram that the encoder consumes from a preprocessed audio waveform.
	    Note: this is not a log-mel spectrogram.

	    :param wav: the preprocessed waveform, expected to be at the hyperparameter sampling rate.
	    :return: the output of librosa's melspectrogram, cast to float32 and transposed.
	    """
	    # Window length and step come from the hyperparameters (presumably in milliseconds,
	    # given the division by 1000) and are converted to whole sample counts here.
	    window_samples = int(sampling_rate * mel_window_length / 1000)
	    step_samples = int(sampling_rate * mel_window_step / 1000)

	    mel = librosa.feature.melspectrogram(
	        y=wav,
	        sr=sampling_rate,
	        n_fft=window_samples,
	        hop_length=step_samples,
	        n_mels=mel_n_channels,
	    )
	    # Swap the two axes of librosa's output after the float32 cast.
	    return np.transpose(mel.astype(np.float32))


	def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
	    """
	    Scales a waveform so that its mean power, expressed as 10 * log10(mean(wav ** 2)),
	    matches the requested level.

	    :param wav: the waveform as a numpy array.
	    :param target_dBFS: the level, in dB, that the mean power of the result should have.
	    :param increase_only: if True, the waveform is returned unchanged when reaching the target
	    would require attenuating it.
	    :param decrease_only: if True, the waveform is returned unchanged when reaching the target
	    would require amplifying it.
	    :return: the scaled waveform, or wav itself when no scaling is applied (including when wav
	    is empty or has no measurable power).
	    :raises ValueError: if both increase_only and decrease_only are set.
	    """
	    if increase_only and decrease_only:
	        raise ValueError("Both increase only and decrease only are set")

	    # An empty waveform has no level to adjust; return early to avoid a NaN mean.
	    if np.size(wav) == 0:
	        return wav

	    mean_power = np.mean(wav**2)
	    # Pure silence has zero power: log10(0) is -inf, the gain would be infinite and
	    # 0 * inf would turn every sample into NaN. The same guard rejects a NaN power.
	    if not mean_power > 0:
	        return wav

	    dBFS_change = target_dBFS - 10 * np.log10(mean_power)
	    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
	        return wav
	    # A change in dB maps to an amplitude gain of 10 ** (dB / 20).
	    return wav * (10 ** (dBFS_change / 20))