| # from nsf_hifigan.models import load_model | |
| from modules.BigVGAN.inference import load_model | |
| import librosa | |
| import torch | |
| import torch.nn.functional as F | |
| import torchaudio | |
| import torchaudio.transforms as transforms | |
| import numpy as np | |
| import soundfile as sf | |
class LogMelSpectrogram(torch.nn.Module):
    """Log-amplitude mel-spectrogram front-end for the BigVGAN 22 kHz / 80-band vocoder.

    Defaults reproduce the original hard-coded configuration exactly
    (22050 Hz, n_fft=1024, hop=256, 80 Slaney-normalized mel bands, 0–8 kHz),
    but every STFT/mel constant is now a constructor parameter so the same
    module can serve other vocoder configurations.
    """

    def __init__(
        self,
        sample_rate: int = 22050,
        n_fft: int = 1024,
        win_length: int = 1024,
        hop_length: int = 256,
        n_mels: int = 80,
        f_min: float = 0,
        f_max: float = 8000,
    ):
        super().__init__()
        # Kept so forward() derives its reflect-pad width from the same
        # values the transform uses — previously (1024 - 256) // 2 was a
        # magic number that could silently drift from the config above.
        self.n_fft = n_fft
        self.hop_length = hop_length
        # NOTE(review): attribute name keeps the original misspelling
        # ("melspctrogram") for backward compatibility with any external
        # code that may reach into this module.
        self.melspctrogram = transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_length,
            center=False,          # we pad manually in forward()
            power=1.0,             # amplitude (not power) spectrogram
            norm="slaney",
            n_mels=n_mels,
            mel_scale="slaney",
            f_max=f_max,
            f_min=f_min,
        )

    def forward(self, wav):
        """Return the log mel spectrogram of *wav*.

        Args:
            wav: waveform tensor whose last dimension is time
                 (batch/channel leading dims pass through the transform).

        Returns:
            ``log(clamp(mel, 1e-5))`` — the clamp guards against
            ``log(0)`` on silent frames.
        """
        # Reflect-pad so frame centers line up as with center=True framing,
        # while keeping center=False inside the transform.
        pad = (self.n_fft - self.hop_length) // 2
        wav = F.pad(wav, (pad, pad), "reflect")
        mel = self.melspctrogram(wav)
        logmel = torch.log(torch.clamp(mel, min=1e-5))
        return logmel
# --- Script entry: extract a mel spectrogram from music.mp3 and vocode it
# --- back to audio with BigVGAN. Requires the checkpoint below and CUDA.
hifigan, cfg = load_model('modules/BigVGAN/ckpt/bigvgan_22khz_80band/g_05000000', device='cuda')
M = LogMelSpectrogram()

# Load and resample to the vocoder's expected 22.05 kHz rate.
source, sr = torchaudio.load("music.mp3")
source = torchaudio.functional.resample(source, sr, 22050)

# Downmix multi-channel audio to mono: previously each channel became a
# separate batch item through the vocoder and everything but y_hat[0] was
# thrown away — wasted computation and silently dropped channel content.
if source.size(0) > 1:
    source = source.mean(dim=0, keepdim=True)

source = source.unsqueeze(0)   # (1, 1, T) — batch of one mono waveform
mel = M(source).squeeze(0)     # (1, n_mels, frames)

with torch.no_grad():
    # hifigan output: (batch, 1, T) -> squeeze the channel dim.
    y_hat = hifigan(mel.cuda()).cpu().numpy().squeeze(1)

sf.write('test.wav', y_hat[0], samplerate=22050)