Spaces:
No application file
No application file
"""
DiffSVC architecture with a WaveNet denoiser and an NSF-HiFiGAN vocoder.

Compared to v1, this version:
- No longer needs spec stats.
- Adds a dilation cycle to the WaveNet denoiser.
- Uses the log10 mel spectrogram.
- Matches the DiffSinger architecture more closely.
"""

from fish_diffusion.utils.pitch import pitch_to_scale

# Values shared by several sub-configs of ``model`` below; change them here
# so that every consumer stays in agreement.
sampling_rate = 44100
mel_channels = 128
hidden_size = 256

# Top-level model description. Each nested mapping carries a "type" string
# plus the keyword settings for that component.
model = {
    "type": "DiffSVC",
    "diffusion": {
        "type": "GaussianDiffusion",
        "mel_channels": mel_channels,
        "noise_schedule": "linear",
        "timesteps": 1000,
        "max_beta": 0.01,
        "s": 0.008,
        "noise_loss": "l1",
        "denoiser": {
            "type": "WaveNetDenoiser",
            "mel_channels": mel_channels,
            # Same width that all three encoders below project to.
            "d_encoder": hidden_size,
            "residual_channels": 512,
            "residual_layers": 20,
            "dilation_cycle": 4,
            "use_linear_bias": True,
        },
        "sampler_interval": 10,
        # Fixed spectrogram bounds given inline (see "spec stats" in the
        # module docstring).
        "spec_min": [-5],
        "spec_max": [0],
    },
    # The text, speaker and pitch encoders share one encoder type and all
    # emit ``hidden_size`` features; they differ in input size and options.
    "text_encoder": {
        "type": "NaiveProjectionEncoder",
        "input_size": 256,
        "output_size": hidden_size,
    },
    "speaker_encoder": {
        "type": "NaiveProjectionEncoder",
        # NOTE(review): presumably the number of speaker ids -- confirm.
        "input_size": 10,
        "output_size": hidden_size,
        "use_embedding": True,
    },
    "pitch_encoder": {
        "type": "NaiveProjectionEncoder",
        "input_size": 1,
        "output_size": hidden_size,
        "use_embedding": False,
        # The function object itself is stored here; it is not called in
        # this file.
        "preprocessing": pitch_to_scale,
    },
    "vocoder": {
        "type": "NsfHifiGAN",
        "checkpoint_path": "checkpoints/nsf_hifigan/model",
        "sampling_rate": sampling_rate,
        "mel_channels": mel_channels,
        # NOTE(review): presumably pairs with the log10 mel spectrogram
        # mentioned in the module docstring -- confirm.
        "use_natural_log": False,
    },
}