Vocos (models, paper)
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- Vocos. Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis.pdf +3 -0
- alvocat-vocos-22khz/.gitattributes +35 -0
- alvocat-vocos-22khz/README.md +174 -0
- alvocat-vocos-22khz/config.yaml +33 -0
- alvocat-vocos-22khz/mel_spec_22khz_cat.onnx +3 -0
- alvocat-vocos-22khz/pytorch_model.bin +3 -0
- alvocat-vocos-22khz/source.txt +1 -0
- vocos-audioset-32khz/.gitattributes +35 -0
- vocos-audioset-32khz/README.md +34 -0
- vocos-audioset-32khz/config.yaml +24 -0
- vocos-audioset-32khz/source.txt +1 -0
- vocos-audioset-32khz/vocos_checkpoint_epoch=464_step=1001610_val_loss=7.1732.ckpt +3 -0
- vocos-encodec-24khz/.gitattributes +34 -0
- vocos-encodec-24khz/README.md +73 -0
- vocos-encodec-24khz/config.yaml +23 -0
- vocos-encodec-24khz/pytorch_model.bin +3 -0
- vocos-encodec-24khz/source.txt +1 -0
- vocos-mel-10ms-24khz/.gitattributes +35 -0
- vocos-mel-10ms-24khz/README.md +33 -0
- vocos-mel-10ms-24khz/config.yaml +31 -0
- vocos-mel-10ms-24khz/pytorch_model.bin +3 -0
- vocos-mel-10ms-24khz/source.txt +1 -0
- vocos-mel-22khz/.gitattributes +35 -0
- vocos-mel-22khz/README.md +182 -0
- vocos-mel-22khz/config.yaml +33 -0
- vocos-mel-22khz/mel_spec_22khz_univ.onnx +3 -0
- vocos-mel-22khz/pytorch_model.bin +3 -0
- vocos-mel-22khz/source.txt +1 -0
- vocos-mel-22khz/vocos_checkpoint_epoch=183_step=3690672_val_loss=3.8142.ckpt +3 -0
- vocos-mel-24khz-onnx/.gitattributes +35 -0
- vocos-mel-24khz-onnx/README.md +33 -0
- vocos-mel-24khz-onnx/config.yaml +24 -0
- vocos-mel-24khz-onnx/mel_spec_24khz.onnx +3 -0
- vocos-mel-24khz-onnx/source.txt +1 -0
- vocos-mel-24khz/.gitattributes +34 -0
- vocos-mel-24khz/README.md +71 -0
- vocos-mel-24khz/config.yaml +24 -0
- vocos-mel-24khz/pytorch_model.bin +3 -0
- vocos-mel-24khz/source.txt +1 -0
- vocos-mel-48khz-alpha1/.gitattributes +35 -0
- vocos-mel-48khz-alpha1/README.md +75 -0
- vocos-mel-48khz-alpha1/config.yaml +40 -0
- vocos-mel-48khz-alpha1/pytorch_model.bin +3 -0
- vocos-mel-48khz-alpha1/source.txt +1 -0
- vocos-mel-hifigan-compat-44100khz/.gitattributes +37 -0
- vocos-mel-hifigan-compat-44100khz/README.md +97 -0
- vocos-mel-hifigan-compat-44100khz/config.yaml +28 -0
- vocos-mel-hifigan-compat-44100khz/logs/version_0/config.yaml +151 -0
- vocos-mel-hifigan-compat-44100khz/logs/version_0/events.out.tfevents.1713993466.gpuserver +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
Vocos.[[:space:]]Closing[[:space:]]the[[:space:]]gap[[:space:]]between[[:space:]]time-domain[[:space:]]and[[:space:]]Fourier-based[[:space:]]neural[[:space:]]vocoders[[:space:]]for[[:space:]]high-quality[[:space:]]audio[[:space:]]synthesis.pdf filter=lfs diff=lfs merge=lfs -text
|
Vocos. Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a5a8dcb0a6b18a77b7f0fabeeadb8d51149246337a3ab9035ca42ffe910b7eb3
|
| 3 |
+
size 6612764
|
alvocat-vocos-22khz/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
alvocat-vocos-22khz/README.md
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
datasets:
|
| 4 |
+
- projecte-aina/festcat_trimmed_denoised
|
| 5 |
+
- projecte-aina/openslr-slr69-ca-trimmed-denoised
|
| 6 |
+
tags:
|
| 7 |
+
- vocoder
|
| 8 |
+
- vocos
|
| 9 |
+
- tts
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# 🥑 alVoCat
|
| 13 |
+
|
| 14 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 15 |
+
🥑 alVoCat is a vocoder for Catalan TTS, based on Vocos architecture. It is highly performant and
|
| 16 |
+
high quality, works together with [🍵 Matxa](https://huggingface.co/BSC-LT/matcha-tts-cat-multiaccent)
|
| 17 |
+
and you can find our fork [here](https://github.com/langtech-bsc/vocos/tree/matcha) and a demo [here](https://huggingface.co/spaces/BSC-LT/matchatts-vocos-onnx-ca).
|
| 18 |
+
|
| 19 |
+
## Model Details
|
| 20 |
+
|
| 21 |
+
### Model Description
|
| 22 |
+
|
| 23 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 24 |
+
|
| 25 |
+
**Vocos** is a fast neural vocoder designed to synthesize audio waveforms from acoustic features.
|
| 26 |
+
Unlike other typical GAN-based vocoders, Vocos does not model audio samples in the time domain.
|
| 27 |
+
Instead, it generates spectral coefficients, facilitating rapid audio reconstruction through
|
| 28 |
+
inverse Fourier transform.
|
| 29 |
+
|
| 30 |
+
This version of **Vocos** uses 80-bin mel spectrograms as acoustic features which are widespread
|
| 31 |
+
in the TTS domain since the introduction of [hifi-gan](https://github.com/jik876/hifi-gan/blob/master/meldataset.py)
|
| 32 |
+
The goal of this model is to provide an alternative to hifi-gan that is faster and compatible with the
|
| 33 |
+
acoustic output of several TTS models. This version is tailored for the Catalan language,
|
| 34 |
+
as it was trained only on Catalan speech datasets.
|
| 35 |
+
|
| 36 |
+
We are grateful with the authors for open sourcing the code allowing us to modify and train this version.
|
| 37 |
+
|
| 38 |
+
## Intended Uses and limitations
|
| 39 |
+
|
| 40 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 41 |
+
The model is aimed to serve as a vocoder to synthesize audio waveforms from mel spectrograms. Is trained to generate speech and if is used in other audio
|
| 42 |
+
domain is possible that the model won't produce high quality samples.
|
| 43 |
+
|
| 44 |
+
## How to Get Started with the Model
|
| 45 |
+
|
| 46 |
+
Use the code below to get started with the model.
|
| 47 |
+
|
| 48 |
+
### Installation
|
| 49 |
+
|
| 50 |
+
To use Vocos only in inference mode, install it using:
|
| 51 |
+
|
| 52 |
+
```bash
|
| 53 |
+
pip install git+https://github.com/langtech-bsc/vocos.git@matcha
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
### Reconstruct audio from mel-spectrogram
|
| 57 |
+
|
| 58 |
+
```python
|
| 59 |
+
import torch
|
| 60 |
+
|
| 61 |
+
from vocos import Vocos
|
| 62 |
+
|
| 63 |
+
vocos = Vocos.from_pretrained("projecte-aina/alvocat-vocos-22khz")
|
| 64 |
+
|
| 65 |
+
mel = torch.randn(1, 80, 256) # B, C, T
|
| 66 |
+
audio = vocos.decode(mel)
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
### Copy-synthesis from a file:
|
| 70 |
+
|
| 71 |
+
```python
|
| 72 |
+
import torchaudio
|
| 73 |
+
|
| 74 |
+
y, sr = torchaudio.load(YOUR_AUDIO_FILE)
|
| 75 |
+
if y.size(0) > 1: # mix to mono
|
| 76 |
+
y = y.mean(dim=0, keepdim=True)
|
| 77 |
+
y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=22050)
|
| 78 |
+
y_hat = vocos(y)
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
### Onnx
|
| 82 |
+
|
| 83 |
+
We also release an onnx version of the model, you can check in colab:
|
| 84 |
+
|
| 85 |
+
<a target="_blank" href="https://colab.research.google.com/github/langtech-bsc/vocos/blob/matcha/notebooks/vocos_22khz_onnx_inference.ipynb">
|
| 86 |
+
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
| 87 |
+
</a>
|
| 88 |
+
|
| 89 |
+
## Training Details
|
| 90 |
+
|
| 91 |
+
### Training Data
|
| 92 |
+
|
| 93 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 94 |
+
|
| 95 |
+
The model was trained on 3 Catalan speech datasets
|
| 96 |
+
|
| 97 |
+
| Dataset | Language | Hours |
|
| 98 |
+
|---------------------|----------|---------|
|
| 99 |
+
| Festcat | ca | 22 |
|
| 100 |
+
| OpenSLR69 | ca | 5 |
|
| 101 |
+
| LaFrescat | ca | 3.5 |
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
### Training Procedure
|
| 106 |
+
|
| 107 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 108 |
+
The model was trained for 1.5M steps and 1.3k epochs with a batch size of 16 for stability. We used a Cosine scheduler with an initial learning rate of 5e-4.
|
| 109 |
+
We also modified the mel spectrogram loss to use 128 bins and fmax of 11025 instead of the same input mel spectrogram.
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
#### Training Hyperparameters
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
* initial_learning_rate: 5e-4
|
| 116 |
+
* scheduler: cosine without warmup or restarts
|
| 117 |
+
* mel_loss_coeff: 45
|
| 118 |
+
* mrd_loss_coeff: 0.1
|
| 119 |
+
* batch_size: 16
|
| 120 |
+
* num_samples: 16384
|
| 121 |
+
|
| 122 |
+
## Evaluation
|
| 123 |
+
|
| 124 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 125 |
+
|
| 126 |
+
Evaluation was done using the metrics on the [original repo](https://github.com/gemelo-ai/vocos), after ~ 1000 epochs we achieve:
|
| 127 |
+
|
| 128 |
+
* val_loss: 3.57
|
| 129 |
+
* f1_score: 0.95
|
| 130 |
+
* mel_loss: 0.22
|
| 131 |
+
* periodicity_loss: 0.113
|
| 132 |
+
* pesq_score: 3.31
|
| 133 |
+
* pitch_loss: 31.61
|
| 134 |
+
* utmos_score: 3.33
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
## Citation
|
| 138 |
+
|
| 139 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 140 |
+
|
| 141 |
+
If this code contributes to your research, please cite the work:
|
| 142 |
+
|
| 143 |
+
```
|
| 144 |
+
@article{siuzdak2023vocos,
|
| 145 |
+
title={Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis},
|
| 146 |
+
author={Siuzdak, Hubert},
|
| 147 |
+
journal={arXiv preprint arXiv:2306.00814},
|
| 148 |
+
year={2023}
|
| 149 |
+
}
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
## Additional information
|
| 153 |
+
|
| 154 |
+
### Author
|
| 155 |
+
The Language Technologies Unit from Barcelona Supercomputing Center.
|
| 156 |
+
|
| 157 |
+
### Contact
|
| 158 |
+
For further information, please send an email to <langtech@bsc.es>.
|
| 159 |
+
|
| 160 |
+
### Copyright
|
| 161 |
+
Copyright(c) 2024 by Language Technologies Unit, Barcelona Supercomputing Center.
|
| 162 |
+
|
| 163 |
+
### License
|
| 164 |
+
[Creative Commons Attribution Non-commercial 4.0](https://www.creativecommons.org/licenses/by-nc/4.0/)
|
| 165 |
+
|
| 166 |
+
These models are free to use for non-commercial and research purposes. Commercial use is only possible through licensing by
|
| 167 |
+
the voice artists. For further information, contact <langtech@bsc.es> and <lafrescaproduccions@gmail.com>.
|
| 168 |
+
|
| 169 |
+
### Funding
|
| 170 |
+
|
| 171 |
+
This work has been promoted and financed by the Generalitat de Catalunya through the [Aina project](https://projecteaina.cat/).
|
| 172 |
+
|
| 173 |
+
Part of the training of the model was possible thanks to the compute time given by Galician Supercomputing Center CESGA
|
| 174 |
+
([Centro de Supercomputación de Galicia](https://www.cesga.es/))
|
alvocat-vocos-22khz/config.yaml
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytorch_lightning==1.8.6
|
| 2 |
+
|
| 3 |
+
feature_extractor:
|
| 4 |
+
class_path: vocos.feature_extractors.MelSpectrogramFeatures
|
| 5 |
+
init_args:
|
| 6 |
+
sample_rate: 22050
|
| 7 |
+
n_fft: 1024
|
| 8 |
+
hop_length: 256
|
| 9 |
+
n_mels: 80
|
| 10 |
+
padding: same
|
| 11 |
+
f_min: 0
|
| 12 |
+
f_max: 8000
|
| 13 |
+
norm: "slaney"
|
| 14 |
+
mel_scale: "slaney"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
backbone:
|
| 18 |
+
class_path: vocos.models.VocosBackbone
|
| 19 |
+
init_args:
|
| 20 |
+
input_channels: 80
|
| 21 |
+
dim: 512
|
| 22 |
+
intermediate_dim: 1536
|
| 23 |
+
num_layers: 8
|
| 24 |
+
|
| 25 |
+
head:
|
| 26 |
+
class_path: vocos.heads.ISTFTHead
|
| 27 |
+
init_args:
|
| 28 |
+
dim: 512
|
| 29 |
+
n_fft: 1024
|
| 30 |
+
hop_length: 256
|
| 31 |
+
padding: same
|
| 32 |
+
|
| 33 |
+
|
alvocat-vocos-22khz/mel_spec_22khz_cat.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ab0744d7d49601ed8ad9be2927fcc99fb359cc90fe28bc9535c0484b3621de3
|
| 3 |
+
size 53883652
|
alvocat-vocos-22khz/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0af7b6f4b153819ada44a917135acf33944cdbb70cde0701eda3d100153799c7
|
| 3 |
+
size 54051047
|
alvocat-vocos-22khz/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/projecte-aina/alvocat-vocos-22khz
|
vocos-audioset-32khz/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
vocos-audioset-32khz/README.md
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
---
|
| 4 |
+
|
| 5 |
+
This model is trained on Google's AudioSet (28GB data) for 1 million steps. (Originally planned 2 million steps, but I'm exploring better training schedule)
|
| 6 |
+
|
| 7 |
+
You can regard it as a pretrained base model, which is common in language models but not for vocoders.
|
| 8 |
+
|
| 9 |
+
How to load and use this model:
|
| 10 |
+
|
| 11 |
+
```python
|
| 12 |
+
import torch
|
| 13 |
+
import torchaudio
|
| 14 |
+
from scipy.io.wavfile import write
|
| 15 |
+
with torch.no_grad():
|
| 16 |
+
from vocos import Vocos
|
| 17 |
+
A = torch.load("./vocos_checkpoint_epoch=464_step=1001610_val_loss=7.1732.ckpt", map_location="cpu")
|
| 18 |
+
V = Vocos.from_hparams("./config.yaml")
|
| 19 |
+
V.load_state_dict(A['state_dict'], strict=False)
|
| 20 |
+
V.eval()
|
| 21 |
+
def safe_log(x: torch.Tensor, clip_val: float = 1e-7):
|
| 22 |
+
return torch.log(torch.clip(x, min=clip_val))
|
| 23 |
+
voice, sr = torchaudio.load('example.wav') # must be sample_rate=32000
|
| 24 |
+
if sr != 32000:
|
| 25 |
+
raise ValueError
|
| 26 |
+
mel = torchaudio.transforms.MelSpectrogram(
|
| 27 |
+
sample_rate=32000, n_fft=2048, hop_length=1024, n_mels=128, center=True, power=1,
|
| 28 |
+
)(voice)
|
| 29 |
+
mel = safe_log(mel)
|
| 30 |
+
audio = V.decode(mel)
|
| 31 |
+
write('out.wav', 32000, audio.flatten().numpy())
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
|
vocos-audioset-32khz/config.yaml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
feature_extractor:
|
| 2 |
+
class_path: vocos.feature_extractors.MelSpectrogramFeatures
|
| 3 |
+
init_args:
|
| 4 |
+
sample_rate: 32000
|
| 5 |
+
n_fft: 2048
|
| 6 |
+
hop_length: 1024
|
| 7 |
+
n_mels: 128
|
| 8 |
+
padding: center
|
| 9 |
+
|
| 10 |
+
backbone:
|
| 11 |
+
class_path: vocos.models.VocosBackbone
|
| 12 |
+
init_args:
|
| 13 |
+
input_channels: 128
|
| 14 |
+
dim: 512
|
| 15 |
+
intermediate_dim: 1536
|
| 16 |
+
num_layers: 8
|
| 17 |
+
|
| 18 |
+
head:
|
| 19 |
+
class_path: vocos.heads.ISTFTHead
|
| 20 |
+
init_args:
|
| 21 |
+
dim: 512
|
| 22 |
+
n_fft: 2048
|
| 23 |
+
hop_length: 1024
|
| 24 |
+
padding: center
|
vocos-audioset-32khz/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/ZhangRC/vocos-audioset-32khz
|
vocos-audioset-32khz/vocos_checkpoint_epoch=464_step=1001610_val_loss=7.1732.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7043569c8f810cede62d02bf5480438d29e3dcff21cdfb6b0dce5a96e39e730a
|
| 3 |
+
size 681397231
|
vocos-encodec-24khz/.gitattributes
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
vocos-encodec-24khz/README.md
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
---
|
| 4 |
+
|
| 5 |
+
# Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis
|
| 6 |
+
|
| 7 |
+
[Audio samples](https://charactr-platform.github.io/vocos/) |
|
| 8 |
+
Paper [[abs]](https://arxiv.org/abs/2306.00814) [[pdf]](https://arxiv.org/pdf/2306.00814.pdf)
|
| 9 |
+
|
| 10 |
+
Vocos is a fast neural vocoder designed to synthesize audio waveforms from acoustic features. Trained using a Generative
|
| 11 |
+
Adversarial Network (GAN) objective, Vocos can generate waveforms in a single forward pass. Unlike other typical
|
| 12 |
+
GAN-based vocoders, Vocos does not model audio samples in the time domain. Instead, it generates spectral
|
| 13 |
+
coefficients, facilitating rapid audio reconstruction through inverse Fourier transform.
|
| 14 |
+
|
| 15 |
+
## Installation
|
| 16 |
+
|
| 17 |
+
To use Vocos only in inference mode, install it using:
|
| 18 |
+
|
| 19 |
+
```bash
|
| 20 |
+
pip install vocos
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
If you wish to train the model, install it with additional dependencies:
|
| 24 |
+
|
| 25 |
+
```bash
|
| 26 |
+
pip install vocos[train]
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
## Usage
|
| 30 |
+
|
| 31 |
+
### Reconstruct audio from EnCodec tokens
|
| 32 |
+
|
| 33 |
+
Additionally, you need to provide a `bandwidth_id` which corresponds to the embedding for bandwidth from the
|
| 34 |
+
list: `[1.5, 3.0, 6.0, 12.0]`.
|
| 35 |
+
|
| 36 |
+
```python
|
| 37 |
+
vocos = Vocos.from_pretrained("charactr/vocos-encodec-24khz")
|
| 38 |
+
|
| 39 |
+
audio_tokens = torch.randint(low=0, high=1024, size=(8, 200)) # 8 codeboooks, 200 frames
|
| 40 |
+
features = vocos.codes_to_features(audio_tokens)
|
| 41 |
+
bandwidth_id = torch.tensor([2]) # 6 kbps
|
| 42 |
+
|
| 43 |
+
audio = vocos.decode(features, bandwidth_id=bandwidth_id)
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
Copy-synthesis from a file: It extracts and quantizes features with EnCodec, then reconstructs them with Vocos in a
|
| 47 |
+
single forward pass.
|
| 48 |
+
|
| 49 |
+
```python
|
| 50 |
+
y, sr = torchaudio.load(YOUR_AUDIO_FILE)
|
| 51 |
+
if y.size(0) > 1: # mix to mono
|
| 52 |
+
y = y.mean(dim=0, keepdim=True)
|
| 53 |
+
y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=24000)
|
| 54 |
+
|
| 55 |
+
y_hat = vocos(y, bandwidth_id=bandwidth_id)
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
## Citation
|
| 59 |
+
|
| 60 |
+
If this code contributes to your research, please cite our work:
|
| 61 |
+
|
| 62 |
+
```
|
| 63 |
+
@article{siuzdak2023vocos,
|
| 64 |
+
title={Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis},
|
| 65 |
+
author={Siuzdak, Hubert},
|
| 66 |
+
journal={arXiv preprint arXiv:2306.00814},
|
| 67 |
+
year={2023}
|
| 68 |
+
}
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
## License
|
| 72 |
+
|
| 73 |
+
The code in this repository is released under the MIT license.
|
vocos-encodec-24khz/config.yaml
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
feature_extractor:
|
| 2 |
+
class_path: vocos.feature_extractors.EncodecFeatures
|
| 3 |
+
init_args:
|
| 4 |
+
encodec_model: encodec_24khz
|
| 5 |
+
bandwidths: [1.5, 3.0, 6.0, 12.0]
|
| 6 |
+
train_codebooks: false
|
| 7 |
+
|
| 8 |
+
backbone:
|
| 9 |
+
class_path: vocos.models.VocosBackbone
|
| 10 |
+
init_args:
|
| 11 |
+
input_channels: 128
|
| 12 |
+
dim: 384
|
| 13 |
+
intermediate_dim: 1152
|
| 14 |
+
num_layers: 8
|
| 15 |
+
adanorm_num_embeddings: 4 # len(bandwidths)
|
| 16 |
+
|
| 17 |
+
head:
|
| 18 |
+
class_path: vocos.heads.ISTFTHead
|
| 19 |
+
init_args:
|
| 20 |
+
dim: 384
|
| 21 |
+
n_fft: 1280
|
| 22 |
+
hop_length: 320
|
| 23 |
+
padding: same
|
vocos-encodec-24khz/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e95bb260b74a1bfc43c52d355831c951acb81c8960e9c62b79bd2b3ab1e3a90
|
| 3 |
+
size 40356708
|
vocos-encodec-24khz/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/charactr/vocos-encodec-24khz
|
vocos-mel-10ms-24khz/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
vocos-mel-10ms-24khz/README.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
---
|
| 4 |
+
|
| 5 |
+
## Reconstruct audio from mel-spectrogram with 10 ms frame shift
|
| 6 |
+
|
| 7 |
+
To use Vocos only in inference mode, install it using:
|
| 8 |
+
|
| 9 |
+
```bash
|
| 10 |
+
pip install vocos
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
Load the model and run inference:
|
| 14 |
+
|
| 15 |
+
```python
|
| 16 |
+
import torch
|
| 17 |
+
|
| 18 |
+
from vocos import Vocos
|
| 19 |
+
|
| 20 |
+
vocos = Vocos.from_pretrained("meaningteam/vocos-mel-10ms-24khz")
|
| 21 |
+
|
| 22 |
+
audio = torch.randn(1, 24000)
|
| 23 |
+
mel = vocos.feature_extractor(audio)
|
| 24 |
+
prediction = vocos.decode(mel)
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
## Model details
|
| 28 |
+
|
| 29 |
+
This model was trained on the DNS Challenge dataset for 1M steps. Also, it has 10 ms frame shift compared to `charactr/vocos-mel-24khz`.
|
| 30 |
+
|
| 31 |
+
## License
|
| 32 |
+
|
| 33 |
+
The code in this repository is released under the MIT license.
|
vocos-mel-10ms-24khz/config.yaml
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
backbone:
|
| 2 |
+
class_path: vocos.models.VocosBackbone
|
| 3 |
+
init_args:
|
| 4 |
+
dim: 512
|
| 5 |
+
input_channels: 100
|
| 6 |
+
intermediate_dim: 1536
|
| 7 |
+
num_layers: 8
|
| 8 |
+
evaluate_periodicty: false
|
| 9 |
+
evaluate_pesq: true
|
| 10 |
+
evaluate_utmos: false
|
| 11 |
+
feature_extractor:
|
| 12 |
+
class_path: vocos.feature_extractors.MelSpectrogramFeatures
|
| 13 |
+
init_args:
|
| 14 |
+
hop_length: 240
|
| 15 |
+
n_fft: 960
|
| 16 |
+
n_mels: 100
|
| 17 |
+
padding: center
|
| 18 |
+
sample_rate: 24000
|
| 19 |
+
head:
|
| 20 |
+
class_path: vocos.heads.ISTFTHead
|
| 21 |
+
init_args:
|
| 22 |
+
dim: 512
|
| 23 |
+
hop_length: 240
|
| 24 |
+
n_fft: 960
|
| 25 |
+
padding: center
|
| 26 |
+
initial_learning_rate: 5e-4
|
| 27 |
+
mel_loss_coeff: 45
|
| 28 |
+
mrd_loss_coeff: 0.1
|
| 29 |
+
num_warmup_steps: 0
|
| 30 |
+
pretrain_mel_steps: 0
|
| 31 |
+
sample_rate: 24000
|
vocos-mel-10ms-24khz/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c3885f32d463665bcff9df6381a5f73e5ca12dbe77c960f02965f7fe85a4f275
|
| 3 |
+
size 54221351
|
vocos-mel-10ms-24khz/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/meaningteam/vocos-mel-10ms-24khz
|
vocos-mel-22khz/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
vocos-mel-22khz/README.md
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
datasets:
|
| 4 |
+
- projecte-aina/festcat_trimmed_denoised
|
| 5 |
+
- projecte-aina/openslr-slr69-ca-trimmed-denoised
|
| 6 |
+
- lj_speech
|
| 7 |
+
- blabble-io/libritts_r
|
| 8 |
+
tags:
|
| 9 |
+
- vocoder
|
| 10 |
+
- mel
|
| 11 |
+
- vocos
|
| 12 |
+
- hifigan
|
| 13 |
+
- tts
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
# Vocos-mel-22khz
|
| 17 |
+
|
| 18 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
## Model Details
|
| 23 |
+
|
| 24 |
+
### Model Description
|
| 25 |
+
|
| 26 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 27 |
+
|
| 28 |
+
**Vocos** is a fast neural vocoder designed to synthesize audio waveforms from acoustic features.
|
| 29 |
+
Unlike other typical GAN-based vocoders, Vocos does not model audio samples in the time domain.
|
| 30 |
+
Instead, it generates spectral coefficients, facilitating rapid audio reconstruction through
|
| 31 |
+
inverse Fourier transform.
|
| 32 |
+
|
| 33 |
+
This version of vocos uses 80-bin mel spectrograms as acoustic features which are widespread
|
| 34 |
+
in the TTS domain since the introduction of [hifi-gan](https://github.com/jik876/hifi-gan/blob/master/meldataset.py)
|
| 35 |
+
The goal of this model is to provide an alternative to hifi-gan that is faster and compatible with the
|
| 36 |
+
acoustic output of several TTS models.
|
| 37 |
+
|
| 38 |
+
We are grateful to the authors for open-sourcing the code, allowing us to modify and train this version.
|
| 39 |
+
|
| 40 |
+
## Intended Uses and limitations
|
| 41 |
+
|
| 42 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 43 |
+
The model is intended to serve as a vocoder that synthesizes audio waveforms from mel spectrograms. It is trained to generate speech, and if it is used in other audio
|
| 44 |
+
domains it is possible that the model won't produce high-quality samples.
|
| 45 |
+
|
| 46 |
+
## How to Get Started with the Model
|
| 47 |
+
|
| 48 |
+
Use the code below to get started with the model.
|
| 49 |
+
|
| 50 |
+
### Installation
|
| 51 |
+
|
| 52 |
+
To use Vocos only in inference mode, install it using:
|
| 53 |
+
|
| 54 |
+
```bash
|
| 55 |
+
pip install git+https://github.com/langtech-bsc/vocos.git@matcha
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
### Reconstruct audio from mel-spectrogram
|
| 59 |
+
|
| 60 |
+
```python
|
| 61 |
+
import torch
|
| 62 |
+
|
| 63 |
+
from vocos import Vocos
|
| 64 |
+
|
| 65 |
+
vocos = Vocos.from_pretrained("BSC-LT/vocos-mel-22khz")
|
| 66 |
+
|
| 67 |
+
mel = torch.randn(1, 80, 256) # B, C, T
|
| 68 |
+
audio = vocos.decode(mel)
|
| 69 |
+
```
|
| 70 |
+
### Integrate with existing TTS models:
|
| 71 |
+
|
| 72 |
+
* Matcha-TTS
|
| 73 |
+
<a target="_blank" href="https://colab.research.google.com/drive/1DvMR8z4XbyuhsSpeLZ83i95Ud0z6vQ2H">
|
| 74 |
+
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
| 75 |
+
</a>
|
| 76 |
+
|
| 77 |
+
* Fastpitch
|
| 78 |
+
<a target="_blank" href="https://colab.research.google.com/drive/1SA90s_TMoTLpxbWWBB4CxGKz0hVw4fwL">
|
| 79 |
+
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
| 80 |
+
</a>
|
| 81 |
+
|
| 82 |
+
### Copy-synthesis from a file:
|
| 83 |
+
|
| 84 |
+
```python
|
| 85 |
+
import torchaudio
|
| 86 |
+
|
| 87 |
+
y, sr = torchaudio.load(YOUR_AUDIO_FILE)
|
| 88 |
+
if y.size(0) > 1: # mix to mono
|
| 89 |
+
y = y.mean(dim=0, keepdim=True)
|
| 90 |
+
y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=22050)
|
| 91 |
+
y_hat = vocos(y)
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
### Onnx
|
| 96 |
+
|
| 97 |
+
We also release an ONNX version of the model, which you can try in Colab:
|
| 98 |
+
|
| 99 |
+
<a target="_blank" href="https://colab.research.google.com/github/langtech-bsc/vocos/blob/matcha/notebooks/vocos_22khz_onnx_inference.ipynb">
|
| 100 |
+
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
| 101 |
+
</a>
|
| 102 |
+
|
| 103 |
+
## Training Details
|
| 104 |
+
|
| 105 |
+
### Training Data
|
| 106 |
+
|
| 107 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 108 |
+
|
| 109 |
+
The model was trained on 4 speech datasets
|
| 110 |
+
|
| 111 |
+
| Dataset | Language | Hours |
|
| 112 |
+
|---------------------|----------|---------|
|
| 113 |
+
| LibriTTS-r | en | 585 |
|
| 114 |
+
| LJSpeech | en | 24 |
|
| 115 |
+
| Festcat | ca | 22 |
|
| 116 |
+
| OpenSLR69 | ca | 5 |
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
### Training Procedure
|
| 120 |
+
|
| 121 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 122 |
+
The model was trained for 1.8M steps and 183 epochs with a batch size of 16 for stability. We used a cosine scheduler with an initial learning rate of 5e-4.
|
| 123 |
+
We also modified the mel spectrogram loss to use 128 bins and fmax of 11025 instead of the same input mel spectrogram.
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
#### Training Hyperparameters
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
* initial_learning_rate: 5e-4
|
| 130 |
+
* scheduler: cosine without warmup or restarts
|
| 131 |
+
* mel_loss_coeff: 45
|
| 132 |
+
* mrd_loss_coeff: 0.1
|
| 133 |
+
* batch_size: 16
|
| 134 |
+
* num_samples: 16384
|
| 135 |
+
|
| 136 |
+
## Evaluation
|
| 137 |
+
|
| 138 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 139 |
+
|
| 140 |
+
Evaluation was done using the metrics from the original repo; after 183 epochs we achieved:
|
| 141 |
+
|
| 142 |
+
* val_loss: 3.81
|
| 143 |
+
* f1_score: 0.94
|
| 144 |
+
* mel_loss: 0.25
|
| 145 |
+
* periodicity_loss: 0.132
|
| 146 |
+
* pesq_score: 3.16
|
| 147 |
+
* pitch_loss: 38.11
|
| 148 |
+
* utmos_score: 3.27
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
## Citation
|
| 152 |
+
|
| 153 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 154 |
+
|
| 155 |
+
If this code contributes to your research, please cite the work:
|
| 156 |
+
|
| 157 |
+
```
|
| 158 |
+
@article{siuzdak2023vocos,
|
| 159 |
+
title={Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis},
|
| 160 |
+
author={Siuzdak, Hubert},
|
| 161 |
+
journal={arXiv preprint arXiv:2306.00814},
|
| 162 |
+
year={2023}
|
| 163 |
+
}
|
| 164 |
+
```
|
| 165 |
+
|
| 166 |
+
## Additional information
|
| 167 |
+
|
| 168 |
+
### Author
|
| 169 |
+
The Language Technologies Unit from Barcelona Supercomputing Center.
|
| 170 |
+
|
| 171 |
+
### Contact
|
| 172 |
+
For further information, please send an email to <langtech@bsc.es>.
|
| 173 |
+
|
| 174 |
+
### Copyright
|
| 175 |
+
Copyright(c) 2024 by Language Technologies Unit, Barcelona Supercomputing Center.
|
| 176 |
+
|
| 177 |
+
### License
|
| 178 |
+
[Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0)
|
| 179 |
+
|
| 180 |
+
### Funding
|
| 181 |
+
|
| 182 |
+
This work has been promoted and financed by the Generalitat de Catalunya through the [Aina project](https://projecteaina.cat/).
|
vocos-mel-22khz/config.yaml
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytorch_lightning==1.8.6
|
| 2 |
+
|
| 3 |
+
feature_extractor:
|
| 4 |
+
class_path: vocos.feature_extractors.MelSpectrogramFeatures
|
| 5 |
+
init_args:
|
| 6 |
+
sample_rate: 22050
|
| 7 |
+
n_fft: 1024
|
| 8 |
+
hop_length: 256
|
| 9 |
+
n_mels: 80
|
| 10 |
+
padding: same
|
| 11 |
+
f_min: 0
|
| 12 |
+
f_max: 8000
|
| 13 |
+
norm: "slaney"
|
| 14 |
+
mel_scale: "slaney"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
backbone:
|
| 18 |
+
class_path: vocos.models.VocosBackbone
|
| 19 |
+
init_args:
|
| 20 |
+
input_channels: 80
|
| 21 |
+
dim: 512
|
| 22 |
+
intermediate_dim: 1536
|
| 23 |
+
num_layers: 8
|
| 24 |
+
|
| 25 |
+
head:
|
| 26 |
+
class_path: vocos.heads.ISTFTHead
|
| 27 |
+
init_args:
|
| 28 |
+
dim: 512
|
| 29 |
+
n_fft: 1024
|
| 30 |
+
hop_length: 256
|
| 31 |
+
padding: same
|
| 32 |
+
|
| 33 |
+
|
vocos-mel-22khz/mel_spec_22khz_univ.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ab0744d7d49601ed8ad9be2927fcc99fb359cc90fe28bc9535c0484b3621de3
|
| 3 |
+
size 53883652
|
vocos-mel-22khz/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0af7b6f4b153819ada44a917135acf33944cdbb70cde0701eda3d100153799c7
|
| 3 |
+
size 54051047
|
vocos-mel-22khz/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/BSC-LT/vocos-mel-22khz
|
vocos-mel-22khz/vocos_checkpoint_epoch=183_step=3690672_val_loss=3.8142.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ec7cc19942235286d3243c6e47798af4510b1feda50901ea46d41073403f40c9
|
| 3 |
+
size 672720367
|
vocos-mel-24khz-onnx/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
vocos-mel-24khz-onnx/README.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
library: ONNX
|
| 4 |
+
base_model: charactr/vocos-mel-24khz
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
**Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis**
|
| 8 |
+
|
| 9 |
+
**Audio samples | Paper [abs] [pdf]**
|
| 10 |
+
|
| 11 |
+
Vocos is a fast neural vocoder designed to synthesize audio waveforms from acoustic features. Trained using a Generative Adversarial Network (GAN) objective, Vocos can generate waveforms in a single forward pass. Unlike other typical GAN-based vocoders, Vocos does not model audio samples in the time domain. Instead, it generates spectral coefficients, facilitating rapid audio reconstruction through inverse Fourier transform.
|
| 12 |
+
|
| 13 |
+
This is an ONNX version of the original 24khz mel spectrogram [model](https://huggingface.co/charactr/vocos-mel-24khz). The model predicts spectrograms, and the ISTFT is performed outside ONNX, as ISTFT is still not implemented as an operator in ONNX.
|
| 14 |
+
|
| 15 |
+
## Usage
|
| 16 |
+
|
| 17 |
+
Try out in colab:
|
| 18 |
+
|
| 19 |
+
<a target="_blank" href="https://colab.research.google.com/drive/1J1tWd56D7CPwmVCP-pbMNzlRWYvlyADN">
|
| 20 |
+
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
| 21 |
+
</a>
|
| 22 |
+
|
| 23 |
+
## Citation
|
| 24 |
+
|
| 25 |
+
```
|
| 26 |
+
@article{siuzdak2023vocos,
|
| 27 |
+
title={Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis},
|
| 28 |
+
author={Siuzdak, Hubert},
|
| 29 |
+
journal={arXiv preprint arXiv:2306.00814},
|
| 30 |
+
year={2023}
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
```
|
vocos-mel-24khz-onnx/config.yaml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
feature_extractor:
|
| 2 |
+
class_path: vocos.feature_extractors.MelSpectrogramFeatures
|
| 3 |
+
init_args:
|
| 4 |
+
sample_rate: 24000
|
| 5 |
+
n_fft: 1024
|
| 6 |
+
hop_length: 256
|
| 7 |
+
n_mels: 100
|
| 8 |
+
padding: center
|
| 9 |
+
|
| 10 |
+
backbone:
|
| 11 |
+
class_path: vocos.models.VocosBackbone
|
| 12 |
+
init_args:
|
| 13 |
+
input_channels: 100
|
| 14 |
+
dim: 512
|
| 15 |
+
intermediate_dim: 1536
|
| 16 |
+
num_layers: 8
|
| 17 |
+
|
| 18 |
+
head:
|
| 19 |
+
class_path: vocos.heads.ISTFTHead
|
| 20 |
+
init_args:
|
| 21 |
+
dim: 512
|
| 22 |
+
n_fft: 1024
|
| 23 |
+
hop_length: 256
|
| 24 |
+
padding: center
|
vocos-mel-24khz-onnx/mel_spec_24khz.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a84c58728a769e8a98eeca75bb89102987db0028e5e3d44b45af2ae3ef0104e2
|
| 3 |
+
size 54156978
|
vocos-mel-24khz-onnx/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/wetdog/vocos-mel-24khz-onnx
|
vocos-mel-24khz/.gitattributes
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
vocos-mel-24khz/README.md
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
---
|
| 4 |
+
|
| 5 |
+
# Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis
|
| 6 |
+
|
| 7 |
+
[Audio samples](https://charactr-platform.github.io/vocos/) |
|
| 8 |
+
Paper [[abs]](https://arxiv.org/abs/2306.00814) [[pdf]](https://arxiv.org/pdf/2306.00814.pdf)
|
| 9 |
+
|
| 10 |
+
Vocos is a fast neural vocoder designed to synthesize audio waveforms from acoustic features. Trained using a Generative
|
| 11 |
+
Adversarial Network (GAN) objective, Vocos can generate waveforms in a single forward pass. Unlike other typical
|
| 12 |
+
GAN-based vocoders, Vocos does not model audio samples in the time domain. Instead, it generates spectral
|
| 13 |
+
coefficients, facilitating rapid audio reconstruction through inverse Fourier transform.
|
| 14 |
+
|
| 15 |
+
## Installation
|
| 16 |
+
|
| 17 |
+
To use Vocos only in inference mode, install it using:
|
| 18 |
+
|
| 19 |
+
```bash
|
| 20 |
+
pip install vocos
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
If you wish to train the model, install it with additional dependencies:
|
| 24 |
+
|
| 25 |
+
```bash
|
| 26 |
+
pip install vocos[train]
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
## Usage
|
| 30 |
+
|
| 31 |
+
### Reconstruct audio from mel-spectrogram
|
| 32 |
+
|
| 33 |
+
```python
|
| 34 |
+
import torch
|
| 35 |
+
|
| 36 |
+
from vocos import Vocos
|
| 37 |
+
|
| 38 |
+
vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
|
| 39 |
+
|
| 40 |
+
mel = torch.randn(1, 100, 256) # B, C, T
|
| 41 |
+
audio = vocos.decode(mel)
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
Copy-synthesis from a file:
|
| 45 |
+
|
| 46 |
+
```python
|
| 47 |
+
import torchaudio
|
| 48 |
+
|
| 49 |
+
y, sr = torchaudio.load(YOUR_AUDIO_FILE)
|
| 50 |
+
if y.size(0) > 1: # mix to mono
|
| 51 |
+
y = y.mean(dim=0, keepdim=True)
|
| 52 |
+
y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=24000)
|
| 53 |
+
y_hat = vocos(y)
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
## Citation
|
| 57 |
+
|
| 58 |
+
If this code contributes to your research, please cite our work:
|
| 59 |
+
|
| 60 |
+
```
|
| 61 |
+
@article{siuzdak2023vocos,
|
| 62 |
+
title={Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis},
|
| 63 |
+
author={Siuzdak, Hubert},
|
| 64 |
+
journal={arXiv preprint arXiv:2306.00814},
|
| 65 |
+
year={2023}
|
| 66 |
+
}
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
## License
|
| 70 |
+
|
| 71 |
+
The code in this repository is released under the MIT license.
|
vocos-mel-24khz/config.yaml
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
feature_extractor:
|
| 2 |
+
class_path: vocos.feature_extractors.MelSpectrogramFeatures
|
| 3 |
+
init_args:
|
| 4 |
+
sample_rate: 24000
|
| 5 |
+
n_fft: 1024
|
| 6 |
+
hop_length: 256
|
| 7 |
+
n_mels: 100
|
| 8 |
+
padding: center
|
| 9 |
+
|
| 10 |
+
backbone:
|
| 11 |
+
class_path: vocos.models.VocosBackbone
|
| 12 |
+
init_args:
|
| 13 |
+
input_channels: 100
|
| 14 |
+
dim: 512
|
| 15 |
+
intermediate_dim: 1536
|
| 16 |
+
num_layers: 8
|
| 17 |
+
|
| 18 |
+
head:
|
| 19 |
+
class_path: vocos.heads.ISTFTHead
|
| 20 |
+
init_args:
|
| 21 |
+
dim: 512
|
| 22 |
+
n_fft: 1024
|
| 23 |
+
hop_length: 256
|
| 24 |
+
padding: center
|
vocos-mel-24khz/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:97ec976ad1fd67a33ab2682d29c0ac7df85234fae875aefcc5fb215681a91b2a
|
| 3 |
+
size 54365991
|
vocos-mel-24khz/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/charactr/vocos-mel-24khz
|
vocos-mel-48khz-alpha1/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
vocos-mel-48khz-alpha1/README.md
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
tags:
|
| 4 |
+
- audio
|
| 5 |
+
library_name: pytorch
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
# Vocos
|
| 9 |
+
|
| 10 |
+
#### Note: This repo has no affiliation with the author of Vocos.
|
| 11 |
+
|
| 12 |
+
Pretrained Vocos model with a 48kHz sampling rate, as opposed to 24kHz of the official.
|
| 13 |
+
|
| 14 |
+
## Usage
|
| 15 |
+
Make sure the Vocos library is installed:
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
pip install vocos
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
then, load the model as usual:
|
| 22 |
+
|
| 23 |
+
```python
|
| 24 |
+
from vocos import Vocos
|
| 25 |
+
vocos = Vocos.from_pretrained("kittn/vocos-mel-48khz-alpha1")
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
For more detailed examples, see [github.com/charactr-platform/vocos#usage](https://github.com/charactr-platform/vocos#usage)
|
| 29 |
+
|
| 30 |
+
## Evals
|
| 31 |
+
TODO
|
| 32 |
+
|
| 33 |
+
## Training details
|
| 34 |
+
TODO
|
| 35 |
+
|
| 36 |
+
## What is Vocos?
|
| 37 |
+
|
| 38 |
+
Here's a summary from the official repo [[link](https://github.com/charactr-platform/vocos)]:
|
| 39 |
+
|
| 40 |
+
> Vocos is a fast neural vocoder designed to synthesize audio waveforms from acoustic features. Trained using a Generative Adversarial Network (GAN) objective, Vocos can generate waveforms in a single forward pass. Unlike other typical GAN-based vocoders, Vocos does not model audio samples in the time domain. Instead, it generates spectral coefficients, facilitating rapid audio reconstruction through inverse Fourier transform.
|
| 41 |
+
|
| 42 |
+
For more details and other variants, check out the repo link above.
|
| 43 |
+
|
| 44 |
+
## Model summary
|
| 45 |
+
```bash
|
| 46 |
+
=================================================================
|
| 47 |
+
Layer (type:depth-idx) Param #
|
| 48 |
+
=================================================================
|
| 49 |
+
Vocos --
|
| 50 |
+
├─MelSpectrogramFeatures: 1-1 --
|
| 51 |
+
│ └─MelSpectrogram: 2-1 --
|
| 52 |
+
│ │ └─Spectrogram: 3-1 --
|
| 53 |
+
│ │ └─MelScale: 3-2 --
|
| 54 |
+
├─VocosBackbone: 1-2 --
|
| 55 |
+
│ └─Conv1d: 2-2 918,528
|
| 56 |
+
│ └─LayerNorm: 2-3 2,048
|
| 57 |
+
│ └─ModuleList: 2-4 --
|
| 58 |
+
│ │ └─ConvNeXtBlock: 3-3 4,208,640
|
| 59 |
+
│ │ └─ConvNeXtBlock: 3-4 4,208,640
|
| 60 |
+
│ │ └─ConvNeXtBlock: 3-5 4,208,640
|
| 61 |
+
│ │ └─ConvNeXtBlock: 3-6 4,208,640
|
| 62 |
+
│ │ └─ConvNeXtBlock: 3-7 4,208,640
|
| 63 |
+
│ │ └─ConvNeXtBlock: 3-8 4,208,640
|
| 64 |
+
│ │ └─ConvNeXtBlock: 3-9 4,208,640
|
| 65 |
+
│ │ └─ConvNeXtBlock: 3-10 4,208,640
|
| 66 |
+
│ └─LayerNorm: 2-5 2,048
|
| 67 |
+
├─ISTFTHead: 1-3 --
|
| 68 |
+
│ └─Linear: 2-6 2,101,250
|
| 69 |
+
│ └─ISTFT: 2-7 --
|
| 70 |
+
=================================================================
|
| 71 |
+
Total params: 36,692,994
|
| 72 |
+
Trainable params: 36,692,994
|
| 73 |
+
Non-trainable params: 0
|
| 74 |
+
=================================================================
|
| 75 |
+
```
|
vocos-mel-48khz-alpha1/config.yaml
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
backbone:
|
| 2 |
+
class_path: vocos.models.VocosBackbone
|
| 3 |
+
init_args:
|
| 4 |
+
adanorm_num_embeddings: null
|
| 5 |
+
dim: 1024
|
| 6 |
+
input_channels: 128
|
| 7 |
+
intermediate_dim: 2048
|
| 8 |
+
layer_scale_init_value: null
|
| 9 |
+
num_layers: 8
|
| 10 |
+
decay_mel_coeff: false
|
| 11 |
+
enable_discriminator: true
|
| 12 |
+
evaluate_periodicty: true
|
| 13 |
+
evaluate_pesq: true
|
| 14 |
+
evaluate_utmos: true
|
| 15 |
+
feature_extractor:
|
| 16 |
+
class_path: vocos.feature_extractors.MelSpectrogramFeatures
|
| 17 |
+
init_args:
|
| 18 |
+
hop_length: 256
|
| 19 |
+
n_fft: 2048
|
| 20 |
+
n_mels: 128
|
| 21 |
+
padding: center
|
| 22 |
+
sample_rate: 48000
|
| 23 |
+
generator_period: 3
|
| 24 |
+
grad_acc: 1
|
| 25 |
+
head:
|
| 26 |
+
class_path: vocos.heads.ISTFTHead
|
| 27 |
+
init_args:
|
| 28 |
+
dim: 1024
|
| 29 |
+
hop_length: 256
|
| 30 |
+
n_fft: 2048
|
| 31 |
+
padding: center
|
| 32 |
+
initial_learning_rate: 0.0003
|
| 33 |
+
mel_loss_coeff: 15.0
|
| 34 |
+
mrd_loss_coeff: 0.1
|
| 35 |
+
num_warmup_steps: 500
|
| 36 |
+
pretrain_decoupled_steps: 0
|
| 37 |
+
pretrain_disc_steps: 500
|
| 38 |
+
pretrain_mel_steps: 0
|
| 39 |
+
pretrained_ckpt: null
|
| 40 |
+
sample_rate: 48000
|
vocos-mel-48khz-alpha1/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3315c87d130922dff1c4c0cfd153ac3ef037950ac0eba13f355bb38cbda46fc2
|
| 3 |
+
size 147342055
|
vocos-mel-48khz-alpha1/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/kittn/vocos-mel-48khz-alpha1
|
vocos-mel-hifigan-compat-44100khz/.gitattributes
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
vocos_checkpoint_epoch=209_step=3924480_val_loss=3.7036_44100_11.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
|
vocos-mel-hifigan-compat-44100khz/README.md
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
pipeline_tag: audio-to-audio
|
| 4 |
+
tags:
|
| 5 |
+
- vocos
|
| 6 |
+
- hifigan
|
| 7 |
+
- tts
|
| 8 |
+
- melspectrogram
|
| 9 |
+
- vocoder
|
| 10 |
+
- mel
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
### Model Description
|
| 14 |
+
|
| 15 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 16 |
+
|
| 17 |
+
**Vocos** is a fast neural vocoder designed to synthesize audio waveforms from acoustic features.
|
| 18 |
+
Unlike other typical GAN-based vocoders, Vocos does not model audio samples in the time domain.
|
| 19 |
+
Instead, it generates spectral coefficients, facilitating rapid audio reconstruction through
|
| 20 |
+
inverse Fourier transform.
|
| 21 |
+
|
| 22 |
+
This version of vocos uses 80-bin mel spectrograms as acoustic features which are widespread
|
| 23 |
+
in the TTS domain since the introduction of [hifi-gan](https://github.com/jik876/hifi-gan/blob/master/meldataset.py).
|
| 24 |
+
The goal of this model is to provide an alternative to hifi-gan that is faster and compatible with the
|
| 25 |
+
acoustic output of several TTS models.
|
| 26 |
+
|
| 27 |
+
## Intended Uses and limitations
|
| 28 |
+
|
| 29 |
+
The model is intended to serve as a vocoder that synthesizes audio waveforms from mel spectrograms. It is trained to generate speech, and if it is used in other audio
|
| 30 |
+
domains, it is possible that the model won't produce high-quality samples.
|
| 31 |
+
|
| 32 |
+
### Installation
|
| 33 |
+
|
| 34 |
+
To use Vocos only in inference mode, install it using:
|
| 35 |
+
|
| 36 |
+
```bash
|
| 37 |
+
pip install git+https://github.com/langtech-bsc/vocos.git@matcha
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
### Reconstruct audio from mel-spectrogram
|
| 41 |
+
|
| 42 |
+
```python
|
| 43 |
+
import torch
|
| 44 |
+
|
| 45 |
+
from vocos import Vocos
|
| 46 |
+
|
| 47 |
+
vocos = Vocos.from_pretrained("patriotyk/vocos-mel-hifigan-compat-44100khz")
|
| 48 |
+
|
| 49 |
+
mel = torch.randn(1, 80, 256) # B, C, T
|
| 50 |
+
audio = vocos.decode(mel)
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
### Training Data
|
| 54 |
+
|
| 55 |
+
The model was trained on private 800+ hours dataset, made from Ukrainian audio books, using [narizaka](https://github.com/patriotyk/narizaka) tool.
|
| 56 |
+
|
| 57 |
+
### Training Procedure
|
| 58 |
+
|
| 59 |
+
The model was trained for 2.0M steps and 210 epochs with a batch size of 20. We used a Cosine scheduler with an initial learning rate of 3e-4.
|
| 60 |
+
We were using two RTX-3090 video cards for training, and it took about one month of continuous training.
|
| 61 |
+
|
| 62 |
+
#### Training Hyperparameters
|
| 63 |
+
|
| 64 |
+
* initial_learning_rate: 3e-4
|
| 65 |
+
* scheduler: cosine without warmup or restarts
|
| 66 |
+
* mel_loss_coeff: 45
|
| 67 |
+
* mrd_loss_coeff: 1.0
|
| 68 |
+
* batch_size: 20
|
| 69 |
+
* num_samples: 32768
|
| 70 |
+
|
| 71 |
+
## Evaluation
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
Evaluation was done using the metrics from the original repo; after 210 epochs we achieved:
|
| 75 |
+
|
| 76 |
+
* val_loss: 3.703
|
| 77 |
+
* f1_score: 0.950
|
| 78 |
+
* mel_loss: 0.248
|
| 79 |
+
* periodicity_loss: 0.127
|
| 80 |
+
* pesq_score: 3.399
|
| 81 |
+
* pitch_loss: 38.26
|
| 82 |
+
* utmos_score: 3.146
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
## Citation
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
If this code contributes to your research, please cite the work:
|
| 89 |
+
|
| 90 |
+
```
|
| 91 |
+
@article{siuzdak2023vocos,
|
| 92 |
+
title={Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis},
|
| 93 |
+
author={Siuzdak, Hubert},
|
| 94 |
+
journal={arXiv preprint arXiv:2306.00814},
|
| 95 |
+
year={2023}
|
| 96 |
+
}
|
| 97 |
+
```
|
vocos-mel-hifigan-compat-44100khz/config.yaml
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
feature_extractor:
|
| 2 |
+
class_path: vocos.feature_extractors.MelSpectrogramFeatures
|
| 3 |
+
init_args:
|
| 4 |
+
sample_rate: 44100
|
| 5 |
+
n_fft: 2048
|
| 6 |
+
hop_length: 512
|
| 7 |
+
n_mels: 80
|
| 8 |
+
padding: same
|
| 9 |
+
f_min: 0
|
| 10 |
+
f_max: 8000
|
| 11 |
+
norm: "slaney"
|
| 12 |
+
mel_scale: "slaney"
|
| 13 |
+
|
| 14 |
+
backbone:
|
| 15 |
+
class_path: vocos.models.VocosBackbone
|
| 16 |
+
init_args:
|
| 17 |
+
input_channels: 80
|
| 18 |
+
dim: 512
|
| 19 |
+
intermediate_dim: 1536
|
| 20 |
+
num_layers: 8
|
| 21 |
+
|
| 22 |
+
head:
|
| 23 |
+
class_path: vocos.heads.ISTFTHead
|
| 24 |
+
init_args:
|
| 25 |
+
dim: 512
|
| 26 |
+
n_fft: 2048
|
| 27 |
+
hop_length: 512
|
| 28 |
+
padding: same
|
vocos-mel-hifigan-compat-44100khz/logs/version_0/config.yaml
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytorch_lightning==1.8.6
|
| 2 |
+
seed_everything: 4444
|
| 3 |
+
trainer:
|
| 4 |
+
logger:
|
| 5 |
+
class_path: pytorch_lightning.loggers.TensorBoardLogger
|
| 6 |
+
init_args:
|
| 7 |
+
save_dir: /home/patriotyk/vocos/logs
|
| 8 |
+
name: lightning_logs
|
| 9 |
+
version: null
|
| 10 |
+
log_graph: false
|
| 11 |
+
default_hp_metric: true
|
| 12 |
+
prefix: ''
|
| 13 |
+
sub_dir: null
|
| 14 |
+
logdir: null
|
| 15 |
+
comment: ''
|
| 16 |
+
purge_step: null
|
| 17 |
+
max_queue: 10
|
| 18 |
+
flush_secs: 120
|
| 19 |
+
filename_suffix: ''
|
| 20 |
+
write_to_disk: true
|
| 21 |
+
comet_config:
|
| 22 |
+
disabled: true
|
| 23 |
+
enable_checkpointing: true
|
| 24 |
+
callbacks:
|
| 25 |
+
- class_path: pytorch_lightning.callbacks.LearningRateMonitor
|
| 26 |
+
init_args:
|
| 27 |
+
logging_interval: null
|
| 28 |
+
log_momentum: false
|
| 29 |
+
- class_path: pytorch_lightning.callbacks.ModelSummary
|
| 30 |
+
init_args:
|
| 31 |
+
max_depth: 2
|
| 32 |
+
- class_path: pytorch_lightning.callbacks.ModelCheckpoint
|
| 33 |
+
init_args:
|
| 34 |
+
dirpath: null
|
| 35 |
+
filename: vocos_checkpoint_{epoch}_{step}_{val_loss:.4f}
|
| 36 |
+
monitor: val_loss
|
| 37 |
+
verbose: false
|
| 38 |
+
save_last: true
|
| 39 |
+
save_top_k: 3
|
| 40 |
+
save_weights_only: false
|
| 41 |
+
mode: min
|
| 42 |
+
auto_insert_metric_name: true
|
| 43 |
+
every_n_train_steps: null
|
| 44 |
+
train_time_interval: null
|
| 45 |
+
every_n_epochs: null
|
| 46 |
+
save_on_train_epoch_end: null
|
| 47 |
+
- class_path: vocos.helpers.GradNormCallback
|
| 48 |
+
default_root_dir: null
|
| 49 |
+
gradient_clip_val: null
|
| 50 |
+
gradient_clip_algorithm: null
|
| 51 |
+
num_nodes: 1
|
| 52 |
+
num_processes: null
|
| 53 |
+
devices: -1
|
| 54 |
+
gpus: null
|
| 55 |
+
auto_select_gpus: false
|
| 56 |
+
tpu_cores: null
|
| 57 |
+
ipus: null
|
| 58 |
+
enable_progress_bar: true
|
| 59 |
+
overfit_batches: 0.0
|
| 60 |
+
track_grad_norm: -1
|
| 61 |
+
check_val_every_n_epoch: 1
|
| 62 |
+
fast_dev_run: false
|
| 63 |
+
accumulate_grad_batches: null
|
| 64 |
+
max_epochs: null
|
| 65 |
+
min_epochs: null
|
| 66 |
+
max_steps: -1
|
| 67 |
+
min_steps: null
|
| 68 |
+
max_time: null
|
| 69 |
+
limit_train_batches: null
|
| 70 |
+
limit_val_batches: 100
|
| 71 |
+
limit_test_batches: null
|
| 72 |
+
limit_predict_batches: null
|
| 73 |
+
val_check_interval: null
|
| 74 |
+
log_every_n_steps: 100
|
| 75 |
+
accelerator: gpu
|
| 76 |
+
strategy: ddp
|
| 77 |
+
sync_batchnorm: false
|
| 78 |
+
precision: 32
|
| 79 |
+
enable_model_summary: true
|
| 80 |
+
num_sanity_val_steps: 2
|
| 81 |
+
resume_from_checkpoint: null
|
| 82 |
+
profiler: null
|
| 83 |
+
benchmark: null
|
| 84 |
+
deterministic: null
|
| 85 |
+
reload_dataloaders_every_n_epochs: 0
|
| 86 |
+
auto_lr_find: false
|
| 87 |
+
replace_sampler_ddp: true
|
| 88 |
+
detect_anomaly: false
|
| 89 |
+
auto_scale_batch_size: false
|
| 90 |
+
plugins: null
|
| 91 |
+
amp_backend: native
|
| 92 |
+
amp_level: null
|
| 93 |
+
move_metrics_to_cpu: false
|
| 94 |
+
multiple_trainloader_mode: max_size_cycle
|
| 95 |
+
inference_mode: true
|
| 96 |
+
data:
|
| 97 |
+
class_path: vocos.dataset.VocosDataModule
|
| 98 |
+
init_args:
|
| 99 |
+
train_params:
|
| 100 |
+
filelist_path: /home/patriotyk/tts_corpus_44100/train_vocos.txt
|
| 101 |
+
sampling_rate: 44100
|
| 102 |
+
num_samples: 32768
|
| 103 |
+
batch_size: 20
|
| 104 |
+
num_workers: 24
|
| 105 |
+
val_params:
|
| 106 |
+
filelist_path: /home/patriotyk/tts_corpus_44100/val_vocos.txt
|
| 107 |
+
sampling_rate: 44100
|
| 108 |
+
num_samples: 96768
|
| 109 |
+
batch_size: 20
|
| 110 |
+
num_workers: 24
|
| 111 |
+
model:
|
| 112 |
+
class_path: vocos.experiment.VocosExp
|
| 113 |
+
init_args:
|
| 114 |
+
feature_extractor:
|
| 115 |
+
class_path: vocos.feature_extractors.MelSpectrogramFeatures
|
| 116 |
+
init_args:
|
| 117 |
+
sample_rate: 44100
|
| 118 |
+
n_fft: 2048
|
| 119 |
+
hop_length: 512
|
| 120 |
+
n_mels: 80
|
| 121 |
+
padding: same
|
| 122 |
+
f_min: 0
|
| 123 |
+
f_max: 8000
|
| 124 |
+
norm: slaney
|
| 125 |
+
mel_scale: slaney
|
| 126 |
+
backbone:
|
| 127 |
+
class_path: vocos.models.VocosBackbone
|
| 128 |
+
init_args:
|
| 129 |
+
input_channels: 80
|
| 130 |
+
dim: 512
|
| 131 |
+
intermediate_dim: 1536
|
| 132 |
+
num_layers: 8
|
| 133 |
+
layer_scale_init_value: null
|
| 134 |
+
adanorm_num_embeddings: null
|
| 135 |
+
head:
|
| 136 |
+
class_path: vocos.heads.ISTFTHead
|
| 137 |
+
init_args:
|
| 138 |
+
dim: 512
|
| 139 |
+
n_fft: 2048
|
| 140 |
+
hop_length: 512
|
| 141 |
+
padding: same
|
| 142 |
+
sample_rate: 44100
|
| 143 |
+
initial_learning_rate: 0.0003
|
| 144 |
+
num_warmup_steps: 0
|
| 145 |
+
mel_loss_coeff: 45.0
|
| 146 |
+
mrd_loss_coeff: 1.0
|
| 147 |
+
pretrain_mel_steps: 0
|
| 148 |
+
decay_mel_coeff: false
|
| 149 |
+
evaluate_utmos: true
|
| 150 |
+
evaluate_pesq: true
|
| 151 |
+
evaluate_periodicty: true
|
vocos-mel-hifigan-compat-44100khz/logs/version_0/events.out.tfevents.1713993466.gpuserver
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:776d06fc99b9d864dedb323dda38c32b09759b7cb04e488437269fe68a9919db
|
| 3 |
+
size 303299046
|