HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis
Paper • 2010.05646 • Published
A HiFi-GAN vocoder trained from scratch on piano music, designed for audio reconstruction tasks.
Training: Trained from scratch (random initialization), NOT fine-tuned from a pretrained checkpoint.
pip install torch torchaudio librosa scipy
# Inference example: load the trained generator, turn an input recording into
# a mel spectrogram, synthesize a waveform from it, and save it as 16-bit PCM.
import json

import torch
import torchaudio
from scipy.io.wavfile import write

from env import AttrDict
from meldataset import mel_spectrogram
from models import Generator

# Run on the GPU when one is available; otherwise fall back to the CPU so the
# example still works on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load model
with open('config.json') as config_file:  # closes the handle after parsing
    config = AttrDict(json.load(config_file))
generator = Generator(config).to(device)
checkpoint = torch.load('generator_best.pt', map_location=device)
generator.load_state_dict(checkpoint['generator'])
generator.eval()
generator.remove_weight_norm()

# Load and process audio
wav, sr = torchaudio.load('your_audio.wav')
# Resample to the rate stored in the config (the same value that is passed to
# mel_spectrogram below) instead of repeating a literal.
if sr != config.sampling_rate:
    wav = torchaudio.functional.resample(wav, sr, config.sampling_rate)
wav = wav.mean(dim=0)  # stereo to mono

# Generate mel spectrogram
mel = mel_spectrogram(
    wav.unsqueeze(0).to(device),
    config.n_fft,
    config.num_mels,
    config.sampling_rate,
    config.hop_size,
    config.win_size,
    config.fmin,
    config.fmax,
)

# Generate audio
with torch.no_grad():
    generated = generator(mel)
audio = generated.squeeze().cpu().numpy()

# Save
# Clip before the int16 cast: a sample of +1.0 scales to 32768, which is one
# past the largest int16 value and would otherwise overflow.
pcm = (audio * 32768).clip(-32768, 32767).astype('int16')
write('output.wav', config.sampling_rate, pcm)
{
"upsample_rates": [8, 8, 2, 2],
"upsample_initial_channel": 512,
"resblock_kernel_sizes": [3, 7, 11],
"num_mels": 80,
"n_fft": 1024,
"hop_size": 256,
"sampling_rate": 22050
}
- config.json - Model architecture configuration
- generator_best.pt - Generator checkpoint (best validation error at step 34000)
- discriminator_optimizer.pt - Discriminator + optimizer checkpoint
- training_metrics.csv - Training step-by-step metrics
- training_loss_plot.png - Training loss visualization

If you use this model, consider citing HiFi-GAN:
@article{kong2020hifigan,
author = {Jungil Kong and Jaehyeon Kim and Jaekyoung Bae},
title = {HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis},
year = {2020},
eprint = {2010.05646},
archivePrefix = {arXiv}
}