| | import numpy as np |
| | import torch |
| | import librosa |
| | from librosa.core import load |
| | import matplotlib.pyplot as plt |
| | import pysptk |
| | import pyworld as pw |
| | from fastdtw import fastdtw |
| | from scipy import spatial |
| |
|
| | from librosa.filters import mel as librosa_mel_fn |
# Mel filterbank used by get_mel() and get_world_mel(). Its n_fft must match
# the n_fft=1024 those functions pass to the STFT.
mel_basis = librosa_mel_fn(
    sr=24000,
    n_fft=1024,
    n_mels=100,
    fmin=0,
    fmax=12000,
)
| |
|
| |
|
def _get_best_mcep_params(fs):
    """Look up the mel-cepstrum analysis settings for a sampling rate.

    Args:
        fs: Sampling rate in Hz. Settings exist for 16000, 22050, 24000,
            44100 and 48000.

    Returns:
        A two-element tuple that ``get_mcep`` unpacks as
        ``(mcep_dim, mcep_alpha)``.

    Raises:
        ValueError: If no setting is registered for ``fs``.
    """
    known_settings = (
        (16000, (23, 0.42)),
        (22050, (34, 0.45)),
        (24000, (34, 0.46)),
        (44100, (39, 0.53)),
        (48000, (39, 0.55)),
    )
    # Plain equality scan (rather than a dict lookup) so that any value that
    # compares equal to a supported rate is accepted.
    for rate, params in known_settings:
        if fs == rate:
            return params
    raise ValueError(f"Not found the setting for {fs}.")
| |
|
| |
|
def get_mel(wav_path):
    """Compute the natural-log mel spectrogram of an audio file.

    The file is loaded at 24 kHz, analysed with a 1024-sample Hann window and
    a 256-sample hop, and projected onto the module-level ``mel_basis``.

    Args:
        wav_path: Path of the audio file to load.

    Returns:
        2-D array of log-mel values (mel bands x frames). The frame axis is
        right-padded to a multiple of 8 using NumPy's ``'minimum'`` pad mode,
        and values are floored at 1e-5 before the logarithm.
    """
    audio, _ = load(wav_path, sr=24000)

    # Keep only whole 256-sample hops, then reflect-pad (1024 - 256) / 2 = 384
    # samples per side because the STFT below runs with center=False.
    usable = (audio.shape[0] // 256) * 256
    audio = np.pad(audio[:usable], 384, mode='reflect')

    spec = librosa.core.stft(audio, n_fft=1024, hop_length=256, win_length=1024,
                             window='hann', center=False)
    # Magnitude with a small constant under the square root.
    magnitude = np.sqrt(np.real(spec) ** 2 + np.imag(spec) ** 2 + (1e-9))
    mel = np.matmul(mel_basis, magnitude)

    leftover = mel.shape[-1] % 8
    if leftover != 0:
        mel = np.pad(mel, ((0, 0), (0, 8 - leftover)), 'minimum')

    return np.log(np.clip(mel, a_min=1e-5, a_max=None))
| |
|
| |
|
def get_world_mel(wav_path=None, sr=24000, wav=None):
    """Log-mel spectrogram of a WORLD resynthesis with the F0 contour zeroed.

    The waveform is analysed with pyworld (dio + stonemask, cheaptrick, d4c),
    resynthesised with ``f0 * 0``, and the result is run through the same
    STFT -> mel -> log pipeline that ``get_mel`` uses.

    Args:
        wav_path: Path of an audio file. When given it takes precedence over
            ``wav``; the file is always loaded at 24 kHz (the rate
            ``mel_basis`` is built for) and round-tripped through int16.
        sr: Sampling rate passed to the pyworld calls.
        wav: 1-D waveform used when ``wav_path`` is None.

    Returns:
        2-D array of log-mel values (mel bands x frames), with the frame axis
        right-padded to a multiple of 8.

    Raises:
        ValueError: If neither ``wav_path`` nor ``wav`` is provided.
    """
    if wav_path is not None:
        wav, _ = librosa.load(wav_path, sr=24000)
        # Quantise to 16-bit and back, ending up in float64 for pyworld.
        wav = (wav * 32767).astype(np.int16)
        wav = (wav / 32767).astype(np.float64)
    elif wav is None:
        raise ValueError("Either wav_path or wav must be provided.")
    else:
        # pyworld's analysis routines expect a contiguous double-precision
        # buffer; this is a no-op when the caller already passes one.
        wav = np.ascontiguousarray(wav, dtype=np.float64)

    # Keep only whole 256-sample hops.
    wav = wav[:(wav.shape[0] // 256) * 256]

    _f0, t = pw.dio(wav, sr)
    f0 = pw.stonemask(wav, _f0, t, sr)
    sp = pw.cheaptrick(wav, f0, t, sr)
    ap = pw.d4c(wav, f0, t, sr)
    # Resynthesise with an all-zero F0 contour.
    wav_hat = pw.synthesize(f0 * 0, sp, ap, sr)

    # Trim the resynthesis to the analysed length; the assert guards the case
    # where the synthesiser returned fewer samples than were analysed.
    wav_hat = wav_hat[:len(wav)]
    assert len(wav_hat) == len(wav)

    wav = wav_hat.astype(np.float32)
    # (1024 - 256) / 2 = 384 samples of reflect padding per side, because the
    # STFT runs with center=False.
    wav = np.pad(wav, 384, mode='reflect')
    stft = librosa.core.stft(wav, n_fft=1024, hop_length=256, win_length=1024,
                             window='hann', center=False)
    stftm = np.sqrt(np.real(stft) ** 2 + np.imag(stft) ** 2 + (1e-9))
    mel_spectrogram = np.matmul(mel_basis, stftm)

    leftover = mel_spectrogram.shape[-1] % 8
    if leftover != 0:
        mel_spectrogram = np.pad(mel_spectrogram, ((0, 0), (0, 8 - leftover)), 'minimum')

    log_mel_spectrogram = np.log(np.clip(mel_spectrogram, a_min=1e-5, a_max=None))
    return log_mel_spectrogram
| |
|
| |
|
def get_f0(wav_path, method='pyin', padding=True):
    """Extract a frame-level F0 contour from an audio file loaded at 24 kHz.

    Args:
        wav_path: Path of the audio file.
        method: ``'pyin'`` (librosa.pyin, unvoiced frames filled with 0) or
            ``'world'`` (pyworld dio refined by stonemask). Both use a
            256-sample hop and search between the notes C2 and C6.
        padding: When truthy, right-pad the contour with zeros so its length
            is a multiple of 8.

    Returns:
        1-D array of F0 values.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Reject unknown methods up front; otherwise ``f0`` would never be bound
    # and the function would die later with an UnboundLocalError.
    if method not in ('pyin', 'world'):
        raise ValueError(
            f"Unknown f0 extraction method {method!r}; expected 'pyin' or 'world'."
        )

    if method == 'pyin':
        wav, sr = load(wav_path, sr=24000)
        # Whole hops only, then (1024 - 256) / 2 = 384 samples of reflect
        # padding per side because pyin runs with center=False.
        wav = wav[:(wav.shape[0] // 256) * 256]
        wav = np.pad(wav, 384, mode='reflect')
        f0, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=256, center=False, sr=24000,
                                fmin=librosa.note_to_hz('C2'),
                                fmax=librosa.note_to_hz('C6'), fill_na=0)
    else:
        wav, sr = librosa.load(wav_path, sr=24000)
        # Quantise to 16-bit and back, ending up in float64 for pyworld.
        wav = (wav * 32767).astype(np.int16)
        wav = (wav / 32767).astype(np.float64)
        # frame_period is in milliseconds: one 256-sample hop at ``sr``.
        _f0, t = pw.dio(wav, fs=24000, frame_period=256 / sr * 1000,
                        f0_floor=librosa.note_to_hz('C2'),
                        f0_ceil=librosa.note_to_hz('C6'))
        f0 = pw.stonemask(wav, _f0, t, sr)
        # Drop the final frame -- presumably so the frame count lines up with
        # the pyin branch (TODO confirm).
        f0 = f0[:-1]

    if padding:
        leftover = f0.shape[-1] % 8
        if leftover != 0:
            f0 = np.pad(f0, (0, 8 - leftover), 'constant', constant_values=0)

    return f0
| |
|
| |
|
def get_mcep(x, n_fft=1024, n_shift=256, sr=24000):
    """Extract frame-wise mel-cepstral coefficients from an audio file.

    Args:
        x: Path of the audio file to load.
        n_fft: Analysis frame length in samples.
        n_shift: Hop between consecutive frames in samples.
        sr: Sampling rate to load the file at. It must be one of the rates
            known to ``_get_best_mcep_params``.

    Returns:
        2-D array with one row of ``pysptk.mcep`` output per frame; there are
        ``len(audio) // n_shift`` frames.

    Raises:
        ValueError: If ``sr`` has no registered mel-cepstrum setting, or
            (from ``np.stack``) if the audio is shorter than one hop.
    """
    # Resolve the analysis settings first so an unsupported rate fails before
    # any audio is read.
    mcep_dim, mcep_alpha = _get_best_mcep_params(sr)

    audio, _ = load(x, sr=sr)
    n_frame = audio.shape[0] // n_shift

    # Pad so the last frame (start n_shift * (n_frame - 1), length n_fft) fits
    # inside the signal. With the defaults this is 384 samples per side.
    overlap = max(n_fft - n_shift, 0)
    pad_left = overlap // 2
    audio = np.pad(audio, (pad_left, overlap - pad_left), mode='reflect')

    win = pysptk.sptk.hamming(n_fft)
    frames = [
        pysptk.mcep(audio[n_shift * i: n_shift * i + n_fft] * win,
                    mcep_dim, mcep_alpha,
                    eps=1e-6, etype=1)
        for i in range(n_frame)
    ]
    return np.stack(frames)
| |
|
| |
|
def get_matched_f0(x, y, method='world', n_fft=1024, n_shift=256):
    """Return the F0 contour of ``y`` time-warped onto the frames of ``x``.

    Mel-cepstra of both files are aligned with FastDTW; for every frame of
    ``x`` the F0 value of the first ``y`` frame paired with it is taken.

    Args:
        x: Path of the audio file that defines the output time axis.
        y: Path of the audio file whose F0 is extracted.
        method: F0 extraction method forwarded to ``get_f0``.
        n_fft: Frame length forwarded to ``get_mcep``.
        n_shift: Hop size forwarded to ``get_mcep``.

    Returns:
        1-D array with one F0 value per frame of ``x``, right-padded with
        zeros to a multiple of 8.
    """
    f0_y = get_f0(y, method=method, padding=False)

    mcep_x = get_mcep(x, n_fft=n_fft, n_shift=n_shift)
    mcep_y = get_mcep(y, n_fft=n_fft, n_shift=n_shift)

    _, path = fastdtw(mcep_x, mcep_y, dist=spatial.distance.euclidean)
    # Row 0 holds frame indices into x, row 1 the paired indices into y.
    x_idx, y_idx = np.array(path).T

    # Position of the first path entry for each distinct x frame. Keying on
    # the x frames (not on len(f0_y)) gives the result x's length even when
    # the two recordings differ in duration.
    _, first_hit = np.unique(x_idx, return_index=True)
    f0_y = f0_y[y_idx[first_hit]]

    leftover = f0_y.shape[-1] % 8
    if leftover != 0:
        f0_y = np.pad(f0_y, (0, 8 - leftover), 'constant', constant_values=0)

    return f0_y
| |
|
| |
|
def f0_to_coarse(f0, hparams):
    """Quantise an F0 contour (Hz) into integer bins on the mel scale.

    Frames whose mel value is exactly 0 (i.e. ``f0 == 0``) map to bin 0;
    every other frame maps linearly between ``f0_min`` -> 1 and
    ``f0_max`` -> ``f0_bin - 1`` and is clamped to that range.

    Args:
        f0: ``torch.Tensor`` or NumPy array of F0 values in Hz.
        hparams: Mapping providing ``'f0_bin'``, ``'f0_max'`` and ``'f0_min'``.

    Returns:
        Integer bins with the same shape as ``f0``: a ``long`` tensor for
        tensor input, otherwise an ``int`` NumPy array.

    Raises:
        AssertionError: If a bin falls outside ``[0, f0_bin - 1]``.
    """
    f0_bin = hparams['f0_bin']
    f0_max = hparams['f0_max']
    f0_min = hparams['f0_min']
    is_torch = isinstance(f0, torch.Tensor)

    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)
    # This creates a new array/tensor, so the in-place edits below never
    # touch the caller's ``f0``.
    f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)

    # Record unvoiced frames before rescaling moves values around.
    unvoiced = (f0_mel == 0)
    voiced = f0_mel > 0

    f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1

    # Clamp to the voiced range [1, f0_bin - 1], then restore bin 0.
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
    f0_mel[unvoiced] = 0

    f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int)

    # max()/min() are undefined on empty input, so only check non-empty
    # results. The bound follows f0_bin rather than a fixed 255.
    n_items = f0_coarse.numel() if is_torch else f0_coarse.size
    if n_items:
        assert f0_coarse.max() <= f0_bin - 1 and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min())
    return f0_coarse
| |
|
| |
|
def log_f0(f0, hparams):
    """Quantise an F0 contour (Hz) into integer bins on a log2 scale.

    Non-zero frames are converted to ``12 * log2(f0 / f0_min) + 1`` and then
    mapped linearly between ``f0_min`` -> 1 and ``f0_max`` -> ``f0_bin - 1``,
    clamped to that range. Frames with ``f0 == 0`` map to bin 0.

    Args:
        f0: Array-like of F0 values in Hz (any real dtype; not modified).
        hparams: Mapping providing ``'f0_bin'``, ``'f0_max'`` and ``'f0_min'``.

    Returns:
        ``int`` NumPy array of bins with the same shape as ``f0``.

    Raises:
        AssertionError: If a bin falls outside ``[0, f0_bin - 1]``.
    """
    f0_bin = hparams['f0_bin']
    f0_max = hparams['f0_max']
    f0_min = hparams['f0_min']

    f0 = np.asarray(f0)
    nonzero = f0 != 0

    # Always work in a floating dtype: a buffer that inherits an integer
    # dtype from ``f0`` would silently truncate the log values.
    work_dtype = np.result_type(f0.dtype, np.float32)
    f0_mel = np.zeros(f0.shape, dtype=work_dtype)
    f0_mel[nonzero] = 12 * np.log2(f0[nonzero] / f0_min) + 1
    f0_mel_min = 12 * np.log2(f0_min / f0_min) + 1
    f0_mel_max = 12 * np.log2(f0_max / f0_min) + 1

    # Record unvoiced frames before rescaling moves values around.
    unvoiced = (f0_mel == 0)
    positive = f0_mel > 0

    f0_mel[positive] = (f0_mel[positive] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1

    # Clamp to the voiced range [1, f0_bin - 1], then restore bin 0.
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
    f0_mel[unvoiced] = 0

    f0_coarse = np.rint(f0_mel).astype(int)
    # max()/min() raise on empty arrays, so only check non-empty results.
    if f0_coarse.size:
        assert f0_coarse.max() <= (f0_bin - 1) and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min())
    return f0_coarse
| |
|
| |
|
def show_plot(tensor):
    """Render ``tensor`` as an image with a colour bar and show it.

    Args:
        tensor: Object exposing ``.squeeze()`` and ``.cpu()`` -- presumably a
            ``torch.Tensor`` that is 2-D once squeezed (TODO confirm).
    """
    image = tensor.squeeze().cpu()

    figure, axes = plt.subplots(figsize=(12, 3))
    # origin="lower" puts row 0 at the bottom of the plot.
    heatmap = axes.imshow(image, aspect="auto", origin="lower", interpolation='none')
    plt.colorbar(heatmap, ax=axes)
    plt.tight_layout()
    figure.canvas.draw()
    plt.show()
| |
|
| |
|
if __name__ == '__main__':
    # Manual smoke run: extract both features from a local sample file.
    sample_path = 'target.wav'
    mel = get_mel(sample_path)
    f0 = get_f0(sample_path)