niobures committed
Commit 6aa95e5 · verified · 1 Parent(s): 1bf9996

Vocos (models, paper)

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +1 -0
  2. Vocos. Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis.pdf +3 -0
  3. alvocat-vocos-22khz/.gitattributes +35 -0
  4. alvocat-vocos-22khz/README.md +174 -0
  5. alvocat-vocos-22khz/config.yaml +33 -0
  6. alvocat-vocos-22khz/mel_spec_22khz_cat.onnx +3 -0
  7. alvocat-vocos-22khz/pytorch_model.bin +3 -0
  8. alvocat-vocos-22khz/source.txt +1 -0
  9. vocos-audioset-32khz/.gitattributes +35 -0
  10. vocos-audioset-32khz/README.md +34 -0
  11. vocos-audioset-32khz/config.yaml +24 -0
  12. vocos-audioset-32khz/source.txt +1 -0
  13. vocos-audioset-32khz/vocos_checkpoint_epoch=464_step=1001610_val_loss=7.1732.ckpt +3 -0
  14. vocos-encodec-24khz/.gitattributes +34 -0
  15. vocos-encodec-24khz/README.md +73 -0
  16. vocos-encodec-24khz/config.yaml +23 -0
  17. vocos-encodec-24khz/pytorch_model.bin +3 -0
  18. vocos-encodec-24khz/source.txt +1 -0
  19. vocos-mel-10ms-24khz/.gitattributes +35 -0
  20. vocos-mel-10ms-24khz/README.md +33 -0
  21. vocos-mel-10ms-24khz/config.yaml +31 -0
  22. vocos-mel-10ms-24khz/pytorch_model.bin +3 -0
  23. vocos-mel-10ms-24khz/source.txt +1 -0
  24. vocos-mel-22khz/.gitattributes +35 -0
  25. vocos-mel-22khz/README.md +182 -0
  26. vocos-mel-22khz/config.yaml +33 -0
  27. vocos-mel-22khz/mel_spec_22khz_univ.onnx +3 -0
  28. vocos-mel-22khz/pytorch_model.bin +3 -0
  29. vocos-mel-22khz/source.txt +1 -0
  30. vocos-mel-22khz/vocos_checkpoint_epoch=183_step=3690672_val_loss=3.8142.ckpt +3 -0
  31. vocos-mel-24khz-onnx/.gitattributes +35 -0
  32. vocos-mel-24khz-onnx/README.md +33 -0
  33. vocos-mel-24khz-onnx/config.yaml +24 -0
  34. vocos-mel-24khz-onnx/mel_spec_24khz.onnx +3 -0
  35. vocos-mel-24khz-onnx/source.txt +1 -0
  36. vocos-mel-24khz/.gitattributes +34 -0
  37. vocos-mel-24khz/README.md +71 -0
  38. vocos-mel-24khz/config.yaml +24 -0
  39. vocos-mel-24khz/pytorch_model.bin +3 -0
  40. vocos-mel-24khz/source.txt +1 -0
  41. vocos-mel-48khz-alpha1/.gitattributes +35 -0
  42. vocos-mel-48khz-alpha1/README.md +75 -0
  43. vocos-mel-48khz-alpha1/config.yaml +40 -0
  44. vocos-mel-48khz-alpha1/pytorch_model.bin +3 -0
  45. vocos-mel-48khz-alpha1/source.txt +1 -0
  46. vocos-mel-hifigan-compat-44100khz/.gitattributes +37 -0
  47. vocos-mel-hifigan-compat-44100khz/README.md +97 -0
  48. vocos-mel-hifigan-compat-44100khz/config.yaml +28 -0
  49. vocos-mel-hifigan-compat-44100khz/logs/version_0/config.yaml +151 -0
  50. vocos-mel-hifigan-compat-44100khz/logs/version_0/events.out.tfevents.1713993466.gpuserver +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Vocos.[[:space:]]Closing[[:space:]]the[[:space:]]gap[[:space:]]between[[:space:]]time-domain[[:space:]]and[[:space:]]Fourier-based[[:space:]]neural[[:space:]]vocoders[[:space:]]for[[:space:]]high-quality[[:space:]]audio[[:space:]]synthesis.pdf filter=lfs diff=lfs merge=lfs -text
Vocos. Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a5a8dcb0a6b18a77b7f0fabeeadb8d51149246337a3ab9035ca42ffe910b7eb3
+ size 6612764
alvocat-vocos-22khz/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
alvocat-vocos-22khz/README.md ADDED
@@ -0,0 +1,174 @@
+ ---
+ license: apache-2.0
+ datasets:
+ - projecte-aina/festcat_trimmed_denoised
+ - projecte-aina/openslr-slr69-ca-trimmed-denoised
+ tags:
+ - vocoder
+ - vocos
+ - tts
+ ---
+
+ # 🥑 alVoCat
+
+ <!-- Provide a quick summary of what the model is/does. -->
+ 🥑 alVoCat is a vocoder for Catalan TTS based on the Vocos architecture. It is highly performant and
+ produces high-quality audio; it works together with [🍵 Matxa](https://huggingface.co/BSC-LT/matcha-tts-cat-multiaccent),
+ and you can find our fork [here](https://github.com/langtech-bsc/vocos/tree/matcha) and a demo [here](https://huggingface.co/spaces/BSC-LT/matchatts-vocos-onnx-ca).
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+ **Vocos** is a fast neural vocoder designed to synthesize audio waveforms from acoustic features.
+ Unlike other typical GAN-based vocoders, Vocos does not model audio samples in the time domain.
+ Instead, it generates spectral coefficients, facilitating rapid audio reconstruction through
+ inverse Fourier transform.
+
+ This version of **Vocos** uses 80-bin mel spectrograms as acoustic features, which have been widespread
+ in the TTS domain since the introduction of [hifi-gan](https://github.com/jik876/hifi-gan/blob/master/meldataset.py).
+ The goal of this model is to provide an alternative to hifi-gan that is faster and compatible with the
+ acoustic output of several TTS models. This version is tailored to the Catalan language,
+ as it was trained only on Catalan speech datasets.
+
+ We are grateful to the authors for open-sourcing the code, which allowed us to modify and train this version.
+
+ ## Intended Uses and limitations
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+ The model is intended to serve as a vocoder that synthesizes audio waveforms from mel spectrograms. It is trained to generate speech; if it is used on other audio
+ domains, it may not produce high-quality samples.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ ### Installation
+
+ To use Vocos only in inference mode, install it using:
+
+ ```bash
+ pip install git+https://github.com/langtech-bsc/vocos.git@matcha
+ ```
+
+ ### Reconstruct audio from mel-spectrogram
+
+ ```python
+ import torch
+
+ from vocos import Vocos
+
+ vocos = Vocos.from_pretrained("projecte-aina/alvocat-vocos-22khz")
+
+ mel = torch.randn(1, 80, 256)  # B, C, T
+ audio = vocos.decode(mel)
+ ```
+
+ ### Copy-synthesis from a file:
+
+ ```python
+ import torchaudio
+
+ # `vocos` is the model loaded in the previous snippet
+ y, sr = torchaudio.load(YOUR_AUDIO_FILE)
+ if y.size(0) > 1:  # mix to mono
+     y = y.mean(dim=0, keepdim=True)
+ y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=22050)
+ y_hat = vocos(y)
+ ```
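+
+ If you compute mel spectrograms yourself, they must match the settings in `config.yaml`
+ (22050 Hz, `n_fft` 1024, hop 256, 80 bins, `f_max` 8000, slaney norm and scale). A minimal
+ torchaudio sketch of such a front end; the `power=1` and log compression follow the Vocos
+ feature extractor, but `vocos.feature_extractor(y)` remains the reliable path:
+
+ ```python
+ import torch
+ import torchaudio
+
+ # Front end matching config.yaml; "same" padding is handled inside Vocos itself.
+ mel_fn = torchaudio.transforms.MelSpectrogram(
+     sample_rate=22050, n_fft=1024, hop_length=256, n_mels=80,
+     f_min=0, f_max=8000, norm="slaney", mel_scale="slaney", power=1,
+ )
+ mel = torch.log(torch.clip(mel_fn(y), min=1e-7))  # `y` from the snippet above
+ ```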
+
+ ### Onnx
+
+ We also release an ONNX version of the model; you can try it in Colab:
+
+ <a target="_blank" href="https://colab.research.google.com/github/langtech-bsc/vocos/blob/matcha/notebooks/vocos_22khz_onnx_inference.ipynb">
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
+ </a>
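+
+ Outside the notebook, a minimal `onnxruntime` sketch; the input and output names are not
+ documented here, so this assumes only that the graph consumes a mel tensor — inspect the
+ session before relying on it:
+
+ ```python
+ import numpy as np
+ import onnxruntime as ort
+
+ session = ort.InferenceSession("mel_spec_22khz_cat.onnx")
+ print([i.name for i in session.get_inputs()],
+       [o.name for o in session.get_outputs()])  # inspect the actual interface
+
+ mel = np.random.randn(1, 80, 256).astype(np.float32)  # B, C, T
+ outputs = session.run(None, {session.get_inputs()[0].name: mel})
+ ```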
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ The model was trained on 3 Catalan speech datasets:
+
+ | Dataset   | Language | Hours |
+ |-----------|----------|-------|
+ | Festcat   | ca       | 22    |
+ | OpenSLR69 | ca       | 5     |
+ | LaFrescat | ca       | 3.5   |
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+ The model was trained for 1.5M steps and 1.3k epochs with a batch size of 16 for stability. We used a cosine scheduler with an initial learning rate of 5e-4.
+ We also modified the mel-spectrogram loss to use 128 bins and an fmax of 11025 instead of the input mel-spectrogram settings.
+
+ #### Training Hyperparameters
+
+ * initial_learning_rate: 5e-4
+ * scheduler: cosine without warmup or restarts
+ * mel_loss_coeff: 45
+ * mrd_loss_coeff: 0.1
+ * batch_size: 16
+ * num_samples: 16384
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ Evaluation was done using the metrics of the [original repo](https://github.com/gemelo-ai/vocos); after ~1000 epochs we achieve:
+
+ * val_loss: 3.57
+ * f1_score: 0.95
+ * mel_loss: 0.22
+ * periodicity_loss: 0.113
+ * pesq_score: 3.31
+ * pitch_loss: 31.61
+ * utmos_score: 3.33
+
+ ## Citation
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ If this code contributes to your research, please cite the work:
+
+ ```
+ @article{siuzdak2023vocos,
+   title={Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis},
+   author={Siuzdak, Hubert},
+   journal={arXiv preprint arXiv:2306.00814},
+   year={2023}
+ }
+ ```
+
+ ## Additional information
+
+ ### Author
+ The Language Technologies Unit from Barcelona Supercomputing Center.
+
+ ### Contact
+ For further information, please send an email to <langtech@bsc.es>.
+
+ ### Copyright
+ Copyright (c) 2024 by Language Technologies Unit, Barcelona Supercomputing Center.
+
+ ### License
+ [Creative Commons Attribution Non-commercial 4.0](https://www.creativecommons.org/licenses/by-nc/4.0/)
+
+ These models are free to use for non-commercial and research purposes. Commercial use is only possible through licensing by
+ the voice artists. For further information, contact <langtech@bsc.es> and <lafrescaproduccions@gmail.com>.
+
+ ### Funding
+
+ This work has been promoted and financed by the Generalitat de Catalunya through the [Aina project](https://projecteaina.cat/).
+
+ Part of the training of the model was made possible thanks to the compute time provided by the Galician Supercomputing Center CESGA
+ ([Centro de Supercomputación de Galicia](https://www.cesga.es/)).
alvocat-vocos-22khz/config.yaml ADDED
@@ -0,0 +1,33 @@
+ # pytorch_lightning==1.8.6
+
+ feature_extractor:
+   class_path: vocos.feature_extractors.MelSpectrogramFeatures
+   init_args:
+     sample_rate: 22050
+     n_fft: 1024
+     hop_length: 256
+     n_mels: 80
+     padding: same
+     f_min: 0
+     f_max: 8000
+     norm: "slaney"
+     mel_scale: "slaney"
+
+ backbone:
+   class_path: vocos.models.VocosBackbone
+   init_args:
+     input_channels: 80
+     dim: 512
+     intermediate_dim: 1536
+     num_layers: 8
+
+ head:
+   class_path: vocos.heads.ISTFTHead
+   init_args:
+     dim: 512
+     n_fft: 1024
+     hop_length: 256
+     padding: same
alvocat-vocos-22khz/mel_spec_22khz_cat.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8ab0744d7d49601ed8ad9be2927fcc99fb359cc90fe28bc9535c0484b3621de3
+ size 53883652
alvocat-vocos-22khz/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0af7b6f4b153819ada44a917135acf33944cdbb70cde0701eda3d100153799c7
+ size 54051047
alvocat-vocos-22khz/source.txt ADDED
@@ -0,0 +1 @@
+ https://huggingface.co/projecte-aina/alvocat-vocos-22khz
vocos-audioset-32khz/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
vocos-audioset-32khz/README.md ADDED
@@ -0,0 +1,34 @@
+ ---
+ license: apache-2.0
+ ---
+
+ This model was trained on Google's AudioSet (28 GB of data) for 1 million steps. (Originally planned for 2 million steps, but I'm exploring a better training schedule.)
+
+ You can regard it as a pretrained base model, which is common for language models but not for vocoders.
+
+ How to load and use this model:
+
+ ```python
+ import torch
+ import torchaudio
+ from scipy.io.wavfile import write
+ from vocos import Vocos
+
+ def safe_log(x: torch.Tensor, clip_val: float = 1e-7):
+     return torch.log(torch.clip(x, min=clip_val))
+
+ with torch.no_grad():
+     # load the Lightning checkpoint weights into a bare Vocos model
+     A = torch.load("./vocos_checkpoint_epoch=464_step=1001610_val_loss=7.1732.ckpt", map_location="cpu")
+     V = Vocos.from_hparams("./config.yaml")
+     V.load_state_dict(A["state_dict"], strict=False)
+     V.eval()
+
+     voice, sr = torchaudio.load("example.wav")
+     if sr != 32000:
+         raise ValueError("input must be sampled at 32000 Hz")
+     mel = torchaudio.transforms.MelSpectrogram(
+         sample_rate=32000, n_fft=2048, hop_length=1024, n_mels=128, center=True, power=1,
+     )(voice)
+     mel = safe_log(mel)
+     audio = V.decode(mel)
+     write("out.wav", 32000, audio.flatten().numpy())
+ ```
vocos-audioset-32khz/config.yaml ADDED
@@ -0,0 +1,24 @@
+ feature_extractor:
+   class_path: vocos.feature_extractors.MelSpectrogramFeatures
+   init_args:
+     sample_rate: 32000
+     n_fft: 2048
+     hop_length: 1024
+     n_mels: 128
+     padding: center
+
+ backbone:
+   class_path: vocos.models.VocosBackbone
+   init_args:
+     input_channels: 128
+     dim: 512
+     intermediate_dim: 1536
+     num_layers: 8
+
+ head:
+   class_path: vocos.heads.ISTFTHead
+   init_args:
+     dim: 512
+     n_fft: 2048
+     hop_length: 1024
+     padding: center
vocos-audioset-32khz/source.txt ADDED
@@ -0,0 +1 @@
+ https://huggingface.co/ZhangRC/vocos-audioset-32khz
vocos-audioset-32khz/vocos_checkpoint_epoch=464_step=1001610_val_loss=7.1732.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7043569c8f810cede62d02bf5480438d29e3dcff21cdfb6b0dce5a96e39e730a
+ size 681397231
vocos-encodec-24khz/.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
vocos-encodec-24khz/README.md ADDED
@@ -0,0 +1,73 @@
+ ---
+ license: mit
+ ---
+
+ # Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis
+
+ [Audio samples](https://charactr-platform.github.io/vocos/) |
+ Paper [[abs]](https://arxiv.org/abs/2306.00814) [[pdf]](https://arxiv.org/pdf/2306.00814.pdf)
+
+ Vocos is a fast neural vocoder designed to synthesize audio waveforms from acoustic features. Trained using a Generative
+ Adversarial Network (GAN) objective, Vocos can generate waveforms in a single forward pass. Unlike other typical
+ GAN-based vocoders, Vocos does not model audio samples in the time domain. Instead, it generates spectral
+ coefficients, facilitating rapid audio reconstruction through inverse Fourier transform.
+
+ ## Installation
+
+ To use Vocos only in inference mode, install it using:
+
+ ```bash
+ pip install vocos
+ ```
+
+ If you wish to train the model, install it with additional dependencies:
+
+ ```bash
+ pip install vocos[train]
+ ```
+
+ ## Usage
+
+ ### Reconstruct audio from EnCodec tokens
+
+ In addition to the audio tokens, you need to provide a `bandwidth_id` which corresponds to the embedding for bandwidth from the
+ list: `[1.5, 3.0, 6.0, 12.0]`.
+
+ ```python
+ import torch
+
+ from vocos import Vocos
+
+ vocos = Vocos.from_pretrained("charactr/vocos-encodec-24khz")
+
+ audio_tokens = torch.randint(low=0, high=1024, size=(8, 200))  # 8 codebooks, 200 frames
+ features = vocos.codes_to_features(audio_tokens)
+ bandwidth_id = torch.tensor([2])  # 6 kbps
+
+ audio = vocos.decode(features, bandwidth_id=bandwidth_id)
+ ```
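+
+ The `bandwidth_id` is simply the position of the target bandwidth in that list; a small
+ helper (not part of the library) to make the mapping explicit:
+
+ ```python
+ # Map a target bandwidth in kbps to its embedding index in [1.5, 3.0, 6.0, 12.0].
+ bandwidths = [1.5, 3.0, 6.0, 12.0]
+ bandwidth_id = torch.tensor([bandwidths.index(12.0)])  # -> tensor([3])
+ ```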
+
+ Copy-synthesis from a file: it extracts and quantizes features with EnCodec, then reconstructs them with Vocos in a
+ single forward pass.
+
+ ```python
+ import torchaudio
+
+ # `vocos` and `bandwidth_id` are defined in the previous snippets
+ y, sr = torchaudio.load(YOUR_AUDIO_FILE)
+ if y.size(0) > 1:  # mix to mono
+     y = y.mean(dim=0, keepdim=True)
+ y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=24000)
+
+ y_hat = vocos(y, bandwidth_id=bandwidth_id)
+ ```
+
+ ## Citation
+
+ If this code contributes to your research, please cite our work:
+
+ ```
+ @article{siuzdak2023vocos,
+   title={Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis},
+   author={Siuzdak, Hubert},
+   journal={arXiv preprint arXiv:2306.00814},
+   year={2023}
+ }
+ ```
+
+ ## License
+
+ The code in this repository is released under the MIT license.
vocos-encodec-24khz/config.yaml ADDED
@@ -0,0 +1,23 @@
+ feature_extractor:
+   class_path: vocos.feature_extractors.EncodecFeatures
+   init_args:
+     encodec_model: encodec_24khz
+     bandwidths: [1.5, 3.0, 6.0, 12.0]
+     train_codebooks: false
+
+ backbone:
+   class_path: vocos.models.VocosBackbone
+   init_args:
+     input_channels: 128
+     dim: 384
+     intermediate_dim: 1152
+     num_layers: 8
+     adanorm_num_embeddings: 4  # len(bandwidths)
+
+ head:
+   class_path: vocos.heads.ISTFTHead
+   init_args:
+     dim: 384
+     n_fft: 1280
+     hop_length: 320
+     padding: same
vocos-encodec-24khz/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7e95bb260b74a1bfc43c52d355831c951acb81c8960e9c62b79bd2b3ab1e3a90
+ size 40356708
vocos-encodec-24khz/source.txt ADDED
@@ -0,0 +1 @@
+ https://huggingface.co/charactr/vocos-encodec-24khz
vocos-mel-10ms-24khz/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
vocos-mel-10ms-24khz/README.md ADDED
@@ -0,0 +1,33 @@
+ ---
+ license: mit
+ ---
+
+ ## Reconstruct audio from mel-spectrogram with 10 ms frame shift
+
+ To use Vocos only in inference mode, install it using:
+
+ ```bash
+ pip install vocos
+ ```
+
+ Load the model and run inference:
+
+ ```python
+ import torch
+
+ from vocos import Vocos
+
+ vocos = Vocos.from_pretrained("meaningteam/vocos-mel-10ms-24khz")
+
+ audio = torch.randn(1, 24000)
+ mel = vocos.feature_extractor(audio)
+ prediction = vocos.decode(mel)
+ ```
+
+ ## Model details
+
+ This model was trained on the DNS Challenge dataset for 1M steps. Unlike `charactr/vocos-mel-24khz`, it uses a 10 ms frame shift (hop length 240 at 24 kHz).
+
+ ## License
+
+ The code in this repository is released under the MIT license.
vocos-mel-10ms-24khz/config.yaml ADDED
@@ -0,0 +1,31 @@
+ backbone:
+   class_path: vocos.models.VocosBackbone
+   init_args:
+     dim: 512
+     input_channels: 100
+     intermediate_dim: 1536
+     num_layers: 8
+ evaluate_periodicty: false
+ evaluate_pesq: true
+ evaluate_utmos: false
+ feature_extractor:
+   class_path: vocos.feature_extractors.MelSpectrogramFeatures
+   init_args:
+     hop_length: 240
+     n_fft: 960
+     n_mels: 100
+     padding: center
+     sample_rate: 24000
+ head:
+   class_path: vocos.heads.ISTFTHead
+   init_args:
+     dim: 512
+     hop_length: 240
+     n_fft: 960
+     padding: center
+ initial_learning_rate: 5e-4
+ mel_loss_coeff: 45
+ mrd_loss_coeff: 0.1
+ num_warmup_steps: 0
+ pretrain_mel_steps: 0
+ sample_rate: 24000
vocos-mel-10ms-24khz/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c3885f32d463665bcff9df6381a5f73e5ca12dbe77c960f02965f7fe85a4f275
+ size 54221351
vocos-mel-10ms-24khz/source.txt ADDED
@@ -0,0 +1 @@
+ https://huggingface.co/meaningteam/vocos-mel-10ms-24khz
vocos-mel-22khz/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
vocos-mel-22khz/README.md ADDED
@@ -0,0 +1,182 @@
+ ---
+ license: apache-2.0
+ datasets:
+ - projecte-aina/festcat_trimmed_denoised
+ - projecte-aina/openslr-slr69-ca-trimmed-denoised
+ - lj_speech
+ - blabble-io/libritts_r
+ tags:
+ - vocoder
+ - mel
+ - vocos
+ - hifigan
+ - tts
+ ---
+
+ # Vocos-mel-22khz
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+ **Vocos** is a fast neural vocoder designed to synthesize audio waveforms from acoustic features.
+ Unlike other typical GAN-based vocoders, Vocos does not model audio samples in the time domain.
+ Instead, it generates spectral coefficients, facilitating rapid audio reconstruction through
+ inverse Fourier transform.
+
+ This version of Vocos uses 80-bin mel spectrograms as acoustic features, which have been widespread
+ in the TTS domain since the introduction of [hifi-gan](https://github.com/jik876/hifi-gan/blob/master/meldataset.py).
+ The goal of this model is to provide an alternative to hifi-gan that is faster and compatible with the
+ acoustic output of several TTS models.
+
+ We are grateful to the authors for open-sourcing the code, which allowed us to modify and train this version.
+
+ ## Intended Uses and limitations
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+ The model is intended to serve as a vocoder that synthesizes audio waveforms from mel spectrograms. It is trained to generate speech; if it is used on other audio
+ domains, it may not produce high-quality samples.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ ### Installation
+
+ To use Vocos only in inference mode, install it using:
+
+ ```bash
+ pip install git+https://github.com/langtech-bsc/vocos.git@matcha
+ ```
+
+ ### Reconstruct audio from mel-spectrogram
+
+ ```python
+ import torch
+
+ from vocos import Vocos
+
+ vocos = Vocos.from_pretrained("BSC-LT/vocos-mel-22khz")
+
+ mel = torch.randn(1, 80, 256)  # B, C, T
+ audio = vocos.decode(mel)
+ ```
+ ### Integrate with existing TTS models (the glue code is sketched after the links):
+
+ * Matcha-TTS
+ <a target="_blank" href="https://colab.research.google.com/drive/1DvMR8z4XbyuhsSpeLZ83i95Ud0z6vQ2H">
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
+ </a>
+
+ * Fastpitch
+ <a target="_blank" href="https://colab.research.google.com/drive/1SA90s_TMoTLpxbWWBB4CxGKz0hVw4fwL">
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
+ </a>
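+
+ The notebooks above show full pipelines; the glue itself reduces to passing the acoustic
+ model's mel output to `vocos.decode`. A sketch with hypothetical names — `acoustic_model`
+ stands in for any model emitting 80-bin mels at 22.05 kHz:
+
+ ```python
+ mel = acoustic_model(text_inputs)  # hypothetical model; assumed shape (B, 80, T)
+ audio = vocos.decode(mel)          # waveform at 22050 Hz
+ ```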
+
+ ### Copy-synthesis from a file:
+
+ ```python
+ import torchaudio
+
+ # `vocos` is the model loaded in the previous snippet
+ y, sr = torchaudio.load(YOUR_AUDIO_FILE)
+ if y.size(0) > 1:  # mix to mono
+     y = y.mean(dim=0, keepdim=True)
+ y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=22050)
+ y_hat = vocos(y)
+ ```
+
+ ### Onnx
+
+ We also release an ONNX version of the model; you can try it in Colab:
+
+ <a target="_blank" href="https://colab.research.google.com/github/langtech-bsc/vocos/blob/matcha/notebooks/vocos_22khz_onnx_inference.ipynb">
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
+ </a>
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ The model was trained on 4 speech datasets:
+
+ | Dataset    | Language | Hours |
+ |------------|----------|-------|
+ | LibriTTS-r | en       | 585   |
+ | LJSpeech   | en       | 24    |
+ | Festcat    | ca       | 22    |
+ | OpenSLR69  | ca       | 5     |
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+ The model was trained for 1.8M steps and 183 epochs with a batch size of 16 for stability. We used a cosine scheduler with an initial learning rate of 5e-4.
+ We also modified the mel-spectrogram loss to use 128 bins and an fmax of 11025 instead of the input mel-spectrogram settings.
+
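+ A sketch of that modified loss — the form (an L1 distance between log-mel spectrograms,
+ as in the upstream reconstruction loss) is our assumption; only the 128 bins and the
+ fmax of 11025 come from the description above:
+
+ ```python
+ import torch
+ import torchaudio
+
+ mel_128 = torchaudio.transforms.MelSpectrogram(
+     sample_rate=22050, n_fft=1024, hop_length=256, n_mels=128, f_max=11025, power=1,
+ )
+
+ def mel_loss(y_hat: torch.Tensor, y: torch.Tensor, eps: float = 1e-7) -> torch.Tensor:
+     def log_mel(x: torch.Tensor) -> torch.Tensor:
+         return torch.log(torch.clip(mel_128(x), min=eps))
+     return torch.nn.functional.l1_loss(log_mel(y_hat), log_mel(y))
+ ```
+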
+ #### Training Hyperparameters
+
+ * initial_learning_rate: 5e-4
+ * scheduler: cosine without warmup or restarts
+ * mel_loss_coeff: 45
+ * mrd_loss_coeff: 0.1
+ * batch_size: 16
+ * num_samples: 16384
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ Evaluation was done using the metrics of the original repo; after 183 epochs we achieve:
+
+ * val_loss: 3.81
+ * f1_score: 0.94
+ * mel_loss: 0.25
+ * periodicity_loss: 0.132
+ * pesq_score: 3.16
+ * pitch_loss: 38.11
+ * utmos_score: 3.27
+
+ ## Citation
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ If this code contributes to your research, please cite the work:
+
+ ```
+ @article{siuzdak2023vocos,
+   title={Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis},
+   author={Siuzdak, Hubert},
+   journal={arXiv preprint arXiv:2306.00814},
+   year={2023}
+ }
+ ```
+
+ ## Additional information
+
+ ### Author
+ The Language Technologies Unit from Barcelona Supercomputing Center.
+
+ ### Contact
+ For further information, please send an email to <langtech@bsc.es>.
+
+ ### Copyright
+ Copyright (c) 2024 by Language Technologies Unit, Barcelona Supercomputing Center.
+
+ ### License
+ [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0)
+
+ ### Funding
+
+ This work has been promoted and financed by the Generalitat de Catalunya through the [Aina project](https://projecteaina.cat/).
vocos-mel-22khz/config.yaml ADDED
@@ -0,0 +1,33 @@
+ # pytorch_lightning==1.8.6
+
+ feature_extractor:
+   class_path: vocos.feature_extractors.MelSpectrogramFeatures
+   init_args:
+     sample_rate: 22050
+     n_fft: 1024
+     hop_length: 256
+     n_mels: 80
+     padding: same
+     f_min: 0
+     f_max: 8000
+     norm: "slaney"
+     mel_scale: "slaney"
+
+ backbone:
+   class_path: vocos.models.VocosBackbone
+   init_args:
+     input_channels: 80
+     dim: 512
+     intermediate_dim: 1536
+     num_layers: 8
+
+ head:
+   class_path: vocos.heads.ISTFTHead
+   init_args:
+     dim: 512
+     n_fft: 1024
+     hop_length: 256
+     padding: same
vocos-mel-22khz/mel_spec_22khz_univ.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8ab0744d7d49601ed8ad9be2927fcc99fb359cc90fe28bc9535c0484b3621de3
+ size 53883652
vocos-mel-22khz/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0af7b6f4b153819ada44a917135acf33944cdbb70cde0701eda3d100153799c7
+ size 54051047
vocos-mel-22khz/source.txt ADDED
@@ -0,0 +1 @@
+ https://huggingface.co/BSC-LT/vocos-mel-22khz
vocos-mel-22khz/vocos_checkpoint_epoch=183_step=3690672_val_loss=3.8142.ckpt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ec7cc19942235286d3243c6e47798af4510b1feda50901ea46d41073403f40c9
+ size 672720367
vocos-mel-24khz-onnx/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
vocos-mel-24khz-onnx/README.md ADDED
@@ -0,0 +1,33 @@
+ ---
+ license: mit
+ library: ONNX
+ base_model: charactr/vocos-mel-24khz
+ ---
+
+ **Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis**
+
+ [Audio samples](https://charactr-platform.github.io/vocos/) | Paper [[abs]](https://arxiv.org/abs/2306.00814) [[pdf]](https://arxiv.org/pdf/2306.00814.pdf)
+
+ Vocos is a fast neural vocoder designed to synthesize audio waveforms from acoustic features. Trained using a Generative Adversarial Network (GAN) objective, Vocos can generate waveforms in a single forward pass. Unlike other typical GAN-based vocoders, Vocos does not model audio samples in the time domain. Instead, it generates spectral coefficients, facilitating rapid audio reconstruction through inverse Fourier transform.
+
+ This is an ONNX version of the original 24 kHz mel-spectrogram [model](https://huggingface.co/charactr/vocos-mel-24khz). The model predicts spectrograms, and the ISTFT is performed outside ONNX, as ISTFT is still not implemented as an operator in ONNX.
+
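+ A minimal sketch of that split — the output layout (STFT magnitude and phase) is an
+ assumption, not documented here, so inspect `session.get_outputs()` before relying on it;
+ `n_fft` and `hop_length` come from `config.yaml`:
+
+ ```python
+ import numpy as np
+ import onnxruntime as ort
+ import torch
+
+ session = ort.InferenceSession("mel_spec_24khz.onnx")
+ mel = np.random.randn(1, 100, 256).astype(np.float32)  # B, C, T
+ outputs = session.run(None, {session.get_inputs()[0].name: mel})
+
+ # Assuming the graph returns the STFT magnitude and phase:
+ mag, phase = (torch.from_numpy(o) for o in outputs[:2])
+ spec = torch.polar(mag, phase)  # complex spectrogram
+ audio = torch.istft(spec, n_fft=1024, hop_length=256, win_length=1024,
+                     window=torch.hann_window(1024), center=True)
+ ```
+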
+ ## Usage
+
+ Try it out in Colab:
+
+ <a target="_blank" href="https://colab.research.google.com/drive/1J1tWd56D7CPwmVCP-pbMNzlRWYvlyADN">
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
+ </a>
+
+ ## Citation
+
+ ```
+ @article{siuzdak2023vocos,
+   title={Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis},
+   author={Siuzdak, Hubert},
+   journal={arXiv preprint arXiv:2306.00814},
+   year={2023}
+ }
+ ```
vocos-mel-24khz-onnx/config.yaml ADDED
@@ -0,0 +1,24 @@
+ feature_extractor:
+   class_path: vocos.feature_extractors.MelSpectrogramFeatures
+   init_args:
+     sample_rate: 24000
+     n_fft: 1024
+     hop_length: 256
+     n_mels: 100
+     padding: center
+
+ backbone:
+   class_path: vocos.models.VocosBackbone
+   init_args:
+     input_channels: 100
+     dim: 512
+     intermediate_dim: 1536
+     num_layers: 8
+
+ head:
+   class_path: vocos.heads.ISTFTHead
+   init_args:
+     dim: 512
+     n_fft: 1024
+     hop_length: 256
+     padding: center
vocos-mel-24khz-onnx/mel_spec_24khz.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a84c58728a769e8a98eeca75bb89102987db0028e5e3d44b45af2ae3ef0104e2
+ size 54156978
vocos-mel-24khz-onnx/source.txt ADDED
@@ -0,0 +1 @@
+ https://huggingface.co/wetdog/vocos-mel-24khz-onnx
vocos-mel-24khz/.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
vocos-mel-24khz/README.md ADDED
@@ -0,0 +1,71 @@
+ ---
+ license: mit
+ ---
+
+ # Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis
+
+ [Audio samples](https://charactr-platform.github.io/vocos/) |
+ Paper [[abs]](https://arxiv.org/abs/2306.00814) [[pdf]](https://arxiv.org/pdf/2306.00814.pdf)
+
+ Vocos is a fast neural vocoder designed to synthesize audio waveforms from acoustic features. Trained using a Generative
+ Adversarial Network (GAN) objective, Vocos can generate waveforms in a single forward pass. Unlike other typical
+ GAN-based vocoders, Vocos does not model audio samples in the time domain. Instead, it generates spectral
+ coefficients, facilitating rapid audio reconstruction through inverse Fourier transform.
+
+ ## Installation
+
+ To use Vocos only in inference mode, install it using:
+
+ ```bash
+ pip install vocos
+ ```
+
+ If you wish to train the model, install it with additional dependencies:
+
+ ```bash
+ pip install vocos[train]
+ ```
+
+ ## Usage
+
+ ### Reconstruct audio from mel-spectrogram
+
+ ```python
+ import torch
+
+ from vocos import Vocos
+
+ vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
+
+ mel = torch.randn(1, 100, 256)  # B, C, T
+ audio = vocos.decode(mel)
+ ```
+
+ Copy-synthesis from a file:
+
+ ```python
+ import torchaudio
+
+ # `vocos` is the model loaded in the previous snippet
+ y, sr = torchaudio.load(YOUR_AUDIO_FILE)
+ if y.size(0) > 1:  # mix to mono
+     y = y.mean(dim=0, keepdim=True)
+ y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=24000)
+ y_hat = vocos(y)
+ ```
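+
+ To listen to the result, write it back to disk (the file name is arbitrary):
+
+ ```python
+ torchaudio.save("reconstructed.wav", y_hat, 24000)
+ ```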
+
+ ## Citation
+
+ If this code contributes to your research, please cite our work:
+
+ ```
+ @article{siuzdak2023vocos,
+   title={Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis},
+   author={Siuzdak, Hubert},
+   journal={arXiv preprint arXiv:2306.00814},
+   year={2023}
+ }
+ ```
+
+ ## License
+
+ The code in this repository is released under the MIT license.
vocos-mel-24khz/config.yaml ADDED
@@ -0,0 +1,24 @@
+ feature_extractor:
+   class_path: vocos.feature_extractors.MelSpectrogramFeatures
+   init_args:
+     sample_rate: 24000
+     n_fft: 1024
+     hop_length: 256
+     n_mels: 100
+     padding: center
+
+ backbone:
+   class_path: vocos.models.VocosBackbone
+   init_args:
+     input_channels: 100
+     dim: 512
+     intermediate_dim: 1536
+     num_layers: 8
+
+ head:
+   class_path: vocos.heads.ISTFTHead
+   init_args:
+     dim: 512
+     n_fft: 1024
+     hop_length: 256
+     padding: center
vocos-mel-24khz/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:97ec976ad1fd67a33ab2682d29c0ac7df85234fae875aefcc5fb215681a91b2a
+ size 54365991
vocos-mel-24khz/source.txt ADDED
@@ -0,0 +1 @@
+ https://huggingface.co/charactr/vocos-mel-24khz
vocos-mel-48khz-alpha1/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
vocos-mel-48khz-alpha1/README.md ADDED
@@ -0,0 +1,75 @@
+ ---
+ license: mit
+ tags:
+ - audio
+ library_name: pytorch
+ ---
+
+ # Vocos
+
+ #### Note: This repo has no affiliation with the author of Vocos.
+
+ Pretrained Vocos model with a 48 kHz sampling rate, as opposed to the 24 kHz of the official model.
+
+ ## Usage
+ Make sure the Vocos library is installed:
+
+ ```bash
+ pip install vocos
+ ```
+
+ then load the model as usual:
+
+ ```python
+ from vocos import Vocos
+
+ vocos = Vocos.from_pretrained("kittn/vocos-mel-48khz-alpha1")
+ ```
+
+ For more detailed examples, see [github.com/charactr-platform/vocos#usage](https://github.com/charactr-platform/vocos#usage)
+
+ ## Evals
+ TODO
+
+ ## Training details
+ TODO
+
+ ## What is Vocos?
+
+ Here's a summary from the official repo [[link](https://github.com/charactr-platform/vocos)]:
+
+ > Vocos is a fast neural vocoder designed to synthesize audio waveforms from acoustic features. Trained using a Generative Adversarial Network (GAN) objective, Vocos can generate waveforms in a single forward pass. Unlike other typical GAN-based vocoders, Vocos does not model audio samples in the time domain. Instead, it generates spectral coefficients, facilitating rapid audio reconstruction through inverse Fourier transform.
+
+ For more details and other variants, check out the repo link above.
+
+ ## Model summary
+ ```bash
+ =================================================================
+ Layer (type:depth-idx)                        Param #
+ =================================================================
+ Vocos                                         --
+ ├─MelSpectrogramFeatures: 1-1                 --
+ │    └─MelSpectrogram: 2-1                    --
+ │    │    └─Spectrogram: 3-1                  --
+ │    │    └─MelScale: 3-2                     --
+ ├─VocosBackbone: 1-2                          --
+ │    └─Conv1d: 2-2                            918,528
+ │    └─LayerNorm: 2-3                         2,048
+ │    └─ModuleList: 2-4                        --
+ │    │    └─ConvNeXtBlock: 3-3                4,208,640
+ │    │    └─ConvNeXtBlock: 3-4                4,208,640
+ │    │    └─ConvNeXtBlock: 3-5                4,208,640
+ │    │    └─ConvNeXtBlock: 3-6                4,208,640
+ │    │    └─ConvNeXtBlock: 3-7                4,208,640
+ │    │    └─ConvNeXtBlock: 3-8                4,208,640
+ │    │    └─ConvNeXtBlock: 3-9                4,208,640
+ │    │    └─ConvNeXtBlock: 3-10               4,208,640
+ │    └─LayerNorm: 2-5                         2,048
+ ├─ISTFTHead: 1-3                              --
+ │    └─Linear: 2-6                            2,101,250
+ │    └─ISTFT: 2-7                             --
+ =================================================================
+ Total params: 36,692,994
+ Trainable params: 36,692,994
+ Non-trainable params: 0
+ =================================================================
+ ```
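+
+ To reproduce the parameter count above:
+
+ ```python
+ from vocos import Vocos
+
+ vocos = Vocos.from_pretrained("kittn/vocos-mel-48khz-alpha1")
+ print(f"{sum(p.numel() for p in vocos.parameters()):,}")  # expect 36,692,994
+ ```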
vocos-mel-48khz-alpha1/config.yaml ADDED
@@ -0,0 +1,40 @@
+ backbone:
+   class_path: vocos.models.VocosBackbone
+   init_args:
+     adanorm_num_embeddings: null
+     dim: 1024
+     input_channels: 128
+     intermediate_dim: 2048
+     layer_scale_init_value: null
+     num_layers: 8
+ decay_mel_coeff: false
+ enable_discriminator: true
+ evaluate_periodicty: true
+ evaluate_pesq: true
+ evaluate_utmos: true
+ feature_extractor:
+   class_path: vocos.feature_extractors.MelSpectrogramFeatures
+   init_args:
+     hop_length: 256
+     n_fft: 2048
+     n_mels: 128
+     padding: center
+     sample_rate: 48000
+ generator_period: 3
+ grad_acc: 1
+ head:
+   class_path: vocos.heads.ISTFTHead
+   init_args:
+     dim: 1024
+     hop_length: 256
+     n_fft: 2048
+     padding: center
+ initial_learning_rate: 0.0003
+ mel_loss_coeff: 15.0
+ mrd_loss_coeff: 0.1
+ num_warmup_steps: 500
+ pretrain_decoupled_steps: 0
+ pretrain_disc_steps: 500
+ pretrain_mel_steps: 0
+ pretrained_ckpt: null
+ sample_rate: 48000
vocos-mel-48khz-alpha1/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3315c87d130922dff1c4c0cfd153ac3ef037950ac0eba13f355bb38cbda46fc2
+ size 147342055
vocos-mel-48khz-alpha1/source.txt ADDED
@@ -0,0 +1 @@
+ https://huggingface.co/kittn/vocos-mel-48khz-alpha1
vocos-mel-hifigan-compat-44100khz/.gitattributes ADDED
@@ -0,0 +1,37 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ vocos_checkpoint_epoch=209_step=3924480_val_loss=3.7036_44100_11.ckpt filter=lfs diff=lfs merge=lfs -text
+ pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
vocos-mel-hifigan-compat-44100khz/README.md ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ pipeline_tag: audio-to-audio
4
+ tags:
5
+ - vocos
6
+ - hifigan
7
+ - tts
8
+ - melspectrogram
9
+ - vocoder
10
+ - mel
11
+ ---
12
+
13
+ ### Model Description
14
+
15
+ <!-- Provide a longer summary of what this model is. -->
16
+
17
+ **Vocos** is a fast neural vocoder designed to synthesize audio waveforms from acoustic features.
18
+ Unlike other typical GAN-based vocoders, Vocos does not model audio samples in the time domain.
19
+ Instead, it generates spectral coefficients, facilitating rapid audio reconstruction through
20
+ inverse Fourier transform.
21
+
22
+ This version of vocos uses 80-bin mel spectrograms as acoustic features which are widespread
23
+ in the TTS domain since the introduction of [hifi-gan](https://github.com/jik876/hifi-gan/blob/master/meldataset.py)
24
+ The goal of this model is to provide an alternative to hifi-gan that is faster and compatible with the
25
+ acoustic output of several TTS models.
26
+
27
+ ## Intended Uses and limitations
28
+
29
+ The model is aimed to serve as a vocoder to synthesize audio waveforms from mel spectrograms. Is trained to generate speech and if is used in other audio
30
+ domain is possible that the model won't produce high quality samples.
31
+
+ ### Installation
+
+ To use Vocos only in inference mode, install it using:
+
+ ```bash
+ pip install git+https://github.com/langtech-bsc/vocos.git@matcha
+ ```
+
+ ### Reconstruct audio from mel-spectrogram
+
+ ```python
+ import torch
+
+ from vocos import Vocos
+
+ vocos = Vocos.from_pretrained("patriotyk/vocos-mel-hifigan-compat-44100khz")
+
+ mel = torch.randn(1, 80, 256)  # (batch, n_mels, frames); random features for illustration
+ audio = vocos.decode(mel)      # waveform tensor of shape (batch, samples)
+ ```
+
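+ To vocode features extracted from real audio rather than a random tensor, the bundled feature
+ extractor can be used for a round trip. A minimal sketch, assuming `torchaudio` is installed, that
+ this fork keeps the upstream `feature_extractor` attribute, and that a local `speech.wav` file
+ exists (a hypothetical input, not shipped with the repo):
+
+ ```python
+ import torchaudio
+
+ from vocos import Vocos
+
+ vocos = Vocos.from_pretrained("patriotyk/vocos-mel-hifigan-compat-44100khz")
+
+ # Load an utterance, mix down to mono, and resample to the model's 44.1 kHz rate.
+ audio, sr = torchaudio.load("speech.wav")  # hypothetical input file
+ audio = audio.mean(dim=0, keepdim=True)
+ audio = torchaudio.functional.resample(audio, sr, 44100)
+
+ # Extract 80-bin mel features with the bundled extractor, then vocode them back.
+ mel = vocos.feature_extractor(audio)
+ reconstructed = vocos.decode(mel)
+ torchaudio.save("reconstructed.wav", reconstructed, 44100)
+ ```
+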
+ ### Training Data
+
+ The model was trained on a private dataset of more than 800 hours of Ukrainian audiobooks, prepared with the [narizaka](https://github.com/patriotyk/narizaka) tool.
+
+ ### Training Procedure
+
+ The model was trained for 2.0M steps (210 epochs) with a batch size of 20. We used a cosine scheduler
+ with an initial learning rate of 3e-4 (a sketch of this schedule follows the hyperparameter list below).
+ Training ran on two RTX 3090 GPUs and took about one month of continuous compute.
+
+ #### Training Hyperparameters
+
+ * initial_learning_rate: 3e-4
+ * scheduler: cosine without warmup or restarts
+ * mel_loss_coeff: 45
+ * mrd_loss_coeff: 1.0
+ * batch_size: 20
+ * num_samples: 32768
+
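+ As a rough illustration of how these settings pair up (not the actual training code, which lives in
+ the Vocos Lightning experiment, and assuming an AdamW optimizer as in upstream Vocos):
+
+ ```python
+ import torch
+
+ model = torch.nn.Linear(80, 512)  # stand-in for the Vocos backbone
+ optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
+ # Cosine decay over the full 2.0M steps, with no warmup or restarts.
+ scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=2_000_000)
+
+ for step in range(10):  # placeholder loop; batch_size=20 per the list above
+     loss = model(torch.randn(20, 80)).pow(2).mean()
+     loss.backward()
+     optimizer.step()
+     optimizer.zero_grad()
+     scheduler.step()  # one scheduler step per optimizer step
+ ```
+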
+ ## Evaluation
+
+ Evaluation was done using the metrics from the original repo; after 210 epochs we achieved:
+
+ * val_loss: 3.703
+ * f1_score: 0.950
+ * mel_loss: 0.248
+ * periodicity_loss: 0.127
+ * pesq_score: 3.399
+ * pitch_loss: 38.26
+ * utmos_score: 3.146
+
+ ## Citation
+
+ If this code contributes to your research, please cite the work:
+
+ ```
+ @article{siuzdak2023vocos,
+   title={Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis},
+   author={Siuzdak, Hubert},
+   journal={arXiv preprint arXiv:2306.00814},
+   year={2023}
+ }
+ ```
vocos-mel-hifigan-compat-44100khz/config.yaml ADDED
@@ -0,0 +1,28 @@
+ feature_extractor:
+   class_path: vocos.feature_extractors.MelSpectrogramFeatures
+   init_args:
+     sample_rate: 44100
+     n_fft: 2048
+     hop_length: 512
+     n_mels: 80
+     padding: same
+     f_min: 0
+     f_max: 8000
+     norm: "slaney"
+     mel_scale: "slaney"
+
+ backbone:
+   class_path: vocos.models.VocosBackbone
+   init_args:
+     input_channels: 80
+     dim: 512
+     intermediate_dim: 1536
+     num_layers: 8
+
+ head:
+   class_path: vocos.heads.ISTFTHead
+   init_args:
+     dim: 512
+     n_fft: 2048
+     hop_length: 512
+     padding: same
vocos-mel-hifigan-compat-44100khz/logs/version_0/config.yaml ADDED
+ # pytorch_lightning==1.8.6
+ seed_everything: 4444
+ trainer:
+   logger:
+     class_path: pytorch_lightning.loggers.TensorBoardLogger
+     init_args:
+       save_dir: /home/patriotyk/vocos/logs
+       name: lightning_logs
+       version: null
+       log_graph: false
+       default_hp_metric: true
+       prefix: ''
+       sub_dir: null
+       logdir: null
+       comment: ''
+       purge_step: null
+       max_queue: 10
+       flush_secs: 120
+       filename_suffix: ''
+       write_to_disk: true
+       comet_config:
+         disabled: true
+   enable_checkpointing: true
+   callbacks:
+   - class_path: pytorch_lightning.callbacks.LearningRateMonitor
+     init_args:
+       logging_interval: null
+       log_momentum: false
+   - class_path: pytorch_lightning.callbacks.ModelSummary
+     init_args:
+       max_depth: 2
+   - class_path: pytorch_lightning.callbacks.ModelCheckpoint
+     init_args:
+       dirpath: null
+       filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
+       monitor: val_loss
+       verbose: false
+       save_last: true
+       save_top_k: 3
+       save_weights_only: false
+       mode: min
+       auto_insert_metric_name: true
+       every_n_train_steps: null
+       train_time_interval: null
+       every_n_epochs: null
+       save_on_train_epoch_end: null
+   - class_path: vocos.helpers.GradNormCallback
+   default_root_dir: null
+   gradient_clip_val: null
+   gradient_clip_algorithm: null
+   num_nodes: 1
+   num_processes: null
+   devices: -1
+   gpus: null
+   auto_select_gpus: false
+   tpu_cores: null
+   ipus: null
+   enable_progress_bar: true
+   overfit_batches: 0.0
+   track_grad_norm: -1
+   check_val_every_n_epoch: 1
+   fast_dev_run: false
+   accumulate_grad_batches: null
+   max_epochs: null
+   min_epochs: null
+   max_steps: -1
+   min_steps: null
+   max_time: null
+   limit_train_batches: null
+   limit_val_batches: 100
+   limit_test_batches: null
+   limit_predict_batches: null
+   val_check_interval: null
+   log_every_n_steps: 100
+   accelerator: gpu
+   strategy: ddp
+   sync_batchnorm: false
+   precision: 32
+   enable_model_summary: true
+   num_sanity_val_steps: 2
+   resume_from_checkpoint: null
+   profiler: null
+   benchmark: null
+   deterministic: null
+   reload_dataloaders_every_n_epochs: 0
+   auto_lr_find: false
+   replace_sampler_ddp: true
+   detect_anomaly: false
+   auto_scale_batch_size: false
+   plugins: null
+   amp_backend: native
+   amp_level: null
+   move_metrics_to_cpu: false
+   multiple_trainloader_mode: max_size_cycle
+   inference_mode: true
+ data:
+   class_path: vocos.dataset.VocosDataModule
+   init_args:
+     train_params:
+       filelist_path: /home/patriotyk/tts_corpus_44100/train_vocos.txt
+       sampling_rate: 44100
+       num_samples: 32768
+       batch_size: 20
+       num_workers: 24
+     val_params:
+       filelist_path: /home/patriotyk/tts_corpus_44100/val_vocos.txt
+       sampling_rate: 44100
+       num_samples: 96768
+       batch_size: 20
+       num_workers: 24
+ model:
+   class_path: vocos.experiment.VocosExp
+   init_args:
+     feature_extractor:
+       class_path: vocos.feature_extractors.MelSpectrogramFeatures
+       init_args:
+         sample_rate: 44100
+         n_fft: 2048
+         hop_length: 512
+         n_mels: 80
+         padding: same
+         f_min: 0
+         f_max: 8000
+         norm: slaney
+         mel_scale: slaney
+     backbone:
+       class_path: vocos.models.VocosBackbone
+       init_args:
+         input_channels: 80
+         dim: 512
+         intermediate_dim: 1536
+         num_layers: 8
+         layer_scale_init_value: null
+         adanorm_num_embeddings: null
+     head:
+       class_path: vocos.heads.ISTFTHead
+       init_args:
+         dim: 512
+         n_fft: 2048
+         hop_length: 512
+         padding: same
+     sample_rate: 44100
+     initial_learning_rate: 0.0003
+     num_warmup_steps: 0
+     mel_loss_coeff: 45.0
+     mrd_loss_coeff: 1.0
+     pretrain_mel_steps: 0
+     decay_mel_coeff: false
+     evaluate_utmos: true
+     evaluate_pesq: true
+     evaluate_periodicty: true
vocos-mel-hifigan-compat-44100khz/logs/version_0/events.out.tfevents.1713993466.gpuserver ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:776d06fc99b9d864dedb323dda38c32b09759b7cb04e488437269fe68a9919db
+ size 303299046