Duplicate from alefiury/free-svc

Browse files

Co-authored-by: Alef Iury Siqueira Ferreira <alefiury@users.noreply.huggingface.co>

Files changed (8) hide show

.gitattributes +35 -0
G_00014_0225000.pth +3 -0
README.md +94 -0
common.yaml +22 -0
config.yaml +93 -0
hyperparams.yaml +33 -0
rmvpe.pt +3 -0
spin.ckpt +3 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

G_00014_0225000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a132e8cb7656b69064fc705c65daf79cda76be1e435de5a6cb6126802f84b1e
+size 861739692

README.md ADDED Viewed

	@@ -0,0 +1,94 @@

+---
+license: cc-by-nc-sa-4.0
+language:
+- en
+- pt
+- es
+- zh
+- nl
+- fr
+- de
+- it
+- ja
+- pl
+pipeline_tag: audio-to-audio
+tags:
+- audio
+- voice
+- voice conversion
+- singing voice conversion
+- vc
+- svc
+- multilingual
+---
+# FreeSVC: Zero-shot Multilingual Singing Voice Conversion
+**FreeSVC** is a promising multilingual zero-shot singing voice conversion model. It converts singing voices across languages without requiring extensive language-specific training. See the [GitHub repository](https://github.com/freds0/free-svc) and the [arXiv pre-print](https://arxiv.org/abs/2501.05586).
+## Supported Languages
+| Language    | ID  | Status       | Speech Data | Singing Data |
+|------------|-----|--------------|-------------|--------------|
+| Chinese    | 0   | ✅ Full      | 255h        | 70h        |
+| Dutch      | 1   | ✅ Full      | Part of CML | -           |
+| English    | 2   | ✅ Full      | 921h        | 47h         |
+| French     | 3   | ✅ Full      | Part of CML | -           |
+| German     | 4   | ✅ Full      | Part of CML | -           |
+| Italian    | 5   | ✅ Full      | Part of CML | -           |
+| Japanese   | 6   | ✅ Full      | 30h         | -           |
+| Other*     | 7   | ⚠️ Partial   | -           | 10h         |
+| Polish     | 8   | ✅ Full      | Part of CML | -           |
+| Portuguese | 9   | ✅ Full      | Part of CML | -           |
+| Spanish    | 10  | ✅ Full      | Part of CML | -           |
+*Note: The "Other" category covers recordings of vocal techniques that carry no linguistic content (the VocalSet data listed under Training Datasets).
+## Model Overview
+FreeSVC leverages an enhanced VITS architecture integrated with Speaker-invariant Clustering (SPIN) and the ECAPA2 speaker encoder. This combination effectively separates speaker characteristics from linguistic content, ensuring high-quality and natural-sounding voice conversions across multiple languages.
+## Training Datasets
+FreeSVC was trained on a diverse set of speech and singing datasets covering multiple languages:
+| **Dataset**          | **Hours**  | **Language** | **Type**    |
+|----------------------|------------|--------------|--------------|
+| AISHELL-1            | 170h       | Chinese      | Speech      |
+| AISHELL-3            | 85h        | Chinese      | Speech      |
+| CML-TTS              | ~3100h     | 7 Languages  | Speech      |
+| HiFiTTS              | 292h       | English      | Speech      |
+| JVS                  | 30h        | Japanese     | Speech      |
+| LibriTTS-R           | 585h       | English      | Speech      |
+| NUS (NHSS)           | 7h         | English      | Speech, Singing        |
+| OpenSinger           | 50h        | Chinese      | Singing     |
+| Opencpop             | 5h         | Chinese      | Singing     |
+| PopBuTFy             | 10h, 40h   | Chinese, English | Singing |
+| POPCS                | 5h         | Chinese      | Singing     |
+| VCTK                 | 44h        | English      | Speech      |
+| VocalSet             | 10h        | Other      | Singing     |
+## License
+FreeSVC is released under the **Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)** license. This means:
+- The model **can only be used for research and non-commercial purposes**. Any commercial use is strictly prohibited.
+- Any derivative works must be **shared under the same license**.
+- Proper attribution must be given when using the model.
+Users must also **comply with the licenses of the original datasets** used for training. Some datasets may have additional restrictions beyond CC BY-NC-SA 4.0. Ensure you review and adhere to their terms before using the model.
+For full details, refer to the [CC BY-NC-SA 4.0 License](https://creativecommons.org/licenses/by-nc-sa/4.0/).
+## Citation
+```
+@INPROCEEDINGS{10890068,
+  author={Ferreira, Alef Iury and Gris, Lucas Rafael and Da Rosa, Augusto and Oliveira, Frederico and Casanova, Edresson and Sousa, Rafael and Junior, Arnaldo and Soares, Anderson and Filho, Arlindo Galvão},
+  booktitle={ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+  title={FreeSVC: Towards Zero-shot Multilingual Singing Voice Conversion},
+  year={2025},
+  volume={},
+  number={},
+  pages={1-5},
+  keywords={Training;Source coding;Zero shot learning;Refining;Signal processing;Data models;Acoustics;Multilingual;Data mining;Speech synthesis;Singing Voice Conversion;Synthesis of Singing Voices;Cross-lingual and multilingual aspects in speech synthesis},
+  doi={10.1109/ICASSP49660.2025.10890068}}
+```

common.yaml ADDED Viewed

	@@ -0,0 +1,22 @@

+path: ./logs/${hydra.job.config_name}/${now:%Y-%m-%d}/${now:%H-%M-%S}  # per-run output directory: config name, then date, then time
+log_level: INFO  # referenced from config.yaml as ${log_level}
+seed: 1  # NOTE(review): config.yaml also defines train.seed (1234); confirm which one the code reads
+tb_log_dir: tensorboard
+tqdm: true
+hydra:
+  run:
+    dir: ${path}  # Hydra's run directory is the top-level `path` value
+  job_logging:
+    formatters:
+      colorlog:
+        format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s:%(lineno)s:%(funcName)s()%(reset)s][%(log_color)s%(levelname)s%(reset)s]
+          - %(message)s'
+    handlers:
+      file:
+        filename: ${hydra.run.dir}/${hydra.job.name}_${now:%Y-%m-%d}_${now:%H-%M-%S}.log  # log file lives inside the run directory
+defaults:
+  - override hydra/job_logging: colorlog
+  - override hydra/hydra_logging: colorlog

config.yaml ADDED Viewed

	@@ -0,0 +1,93 @@

+defaults:  # Hydra defaults list: composes common.yaml into this config
+- common
+train:
+  # Training-loop settings: optimizer hyper-parameters, logging, validation
+  # and checkpoint intervals, and batch-sampling weights.
+  batch_size: 128
+  betas: [0.8, 0.99]
+  c_kl: 1.0
+  c_mel: 45
+  distributed: false  # BUG: multi-gpu is not working
+  use_multiprocessing: false  # BUG: multi-gpu is not working
+  epochs: 20
+  # Exponent floats are written with a decimal point so that YAML 1.1 loaders
+  # (e.g. stock PyYAML) resolve them as floats instead of strings.
+  eps: 1.0e-9
+  fp16_run: false
+  init_lr_ratio: 1
+  raise_error: false
+  learning_rate: 2.0e-4
+  log_interval: 10
+  log_level: ${log_level}  # interpolated from the top-level log_level key
+  lr_decay: 0.98
+  max_speclen: 128
+  port: 8005
+  resume_training: false  # set to false to finetune from a model
+  seed: 1234  # NOTE(review): common.yaml also defines a top-level seed (1); confirm which one is used
+  segment_size: 8960
+  use_sr: false
+  valid_epoch_interval: 1
+  valid_steps_interval: 1000
+  save_epoch_interval: 10
+  save_steps_interval: 1000
+  warmup_epochs: 0
+  # Earlier boolean settings, kept for reference:
+  # weighted_batch_speaker_sampling: false
+  # weighted_batch_lang_sampling: false
+  weighted_batch_speaker_sampling: 0.5
+  weighted_batch_lang_sampling: 0.5
+data:  # dataset locations and audio feature settings
+  dataset_dir: /raid/lucasgris/free-svc/data  # NOTE(review): machine-specific absolute path; adjust for your environment
+  filter_length: 1280
+  hop_length: 320
+  max_wav_value: 32768.0
+  mel_fmax: null
+  mel_fmin: 0.0
+  n_mel_channels: 80
+  num_workers: 64
+  # For pitch extraction, set the pitch_predictor (will compute in dataloader) or pitch_features_dir (will load from disk)
+  pitch_predictor: rmvpe # pm | crepe | harvest | dio | rmvpe | fcpe
+  pitch_features_dir: ${data.dataset_dir}/pitch_features/
+  sampling_rate: 24000
+  spectrogram_dir: null #${data.dataset_dir}/spectrograms # it is recommended NOT to use if you have small disk space
+  # For speaker embedding extraction, set the use_spk_emb to True and spk_embeddings_dir (will load from disk) or configure the model to compute it on forward
+  use_spk_emb: true
+  spk_embeddings_dir: ${data.dataset_dir}/spk_embeddings
+  # SR augmentation is deprecated; keep train.use_sr set to false
+  sr_min_max: [68, 92]
+  # For content feature extraction, set the content_feature_dir (will load from disk) or configure the model to compute it on forward
+  content_feature_dir: null
+  training_files: data/train.csv  # NOTE(review): relative path, unlike dataset_dir; presumably resolved against the working directory — confirm
+  validation_files: data/valid.csv
+  win_length: 1280
+model:  # network architecture sizes and encoder selection
+  save_dir: null
+  filter_channels: 768
+  finetune_from_model:  # NOTE(review): machine-specific absolute paths; presumably loaded when train.resume_training is false — confirm
+    discriminator: /raid/lucasgris/free-svc/D-freevc-24.pth
+    generator: /raid/lucasgris/free-svc/freevc-24.pth
+  hidden_channels: 192
+  inter_channels: 192
+  kernel_size: 3
+  n_heads: 2
+  n_layers_q: 3
+  n_layers: 6
+  p_dropout: 0.1
+  resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
+  resblock_kernel_sizes: [3,7,11]
+  resblock: 1
+  c_dim: 768
+  upsample_initial_channel: 512
+  upsample_kernel_sizes: [16,16,4,4]
+  upsample_rates: [10,8,2,2]  # product is 320, the same value as data.hop_length
+  use_spectral_norm: false
+  freeze_external_spk: true
+  device: cuda
+  # For online speaker embedding extraction, set the use_spk_emb to True and spk_encoder_type
+  use_spk_emb: false
+  gin_channels: null # gin_channels = spk_encoder.embedding_dim
+  spk_encoder_type: null # e.g. ECAPA2SpeakerEncoder16k
+  # For content feature extraction, set the content_encoder_type and content_encoder_ckpt
+  content_encoder_type: null # load from disk (data) - hubert | wavlm
+  content_encoder_ckpt: null # load from disk (data) - [path] | models/wavlm/WavLM-Large.pt | lengyue233/content-vec-best
+  post_content_encoder_type: vits-encoder-with-uv-emb # or freevc-bottleneck
+  coarse_f0: true
+  cond_f0_on_flow: false

hyperparams.yaml ADDED Viewed

	@@ -0,0 +1,33 @@

+defaults:  # NOTE(review): config.yaml already lists `common` in its own defaults; confirm the repeat here is intentional
+- common
+- config
+data:  # language-embedding settings plus data.* keys that config.yaml also defines
+  use_lang_emb: true
+  num_langs: 11  # equals the number of entries in lang2id
+  lang_dim: 192 # same size as hidden_channels to facilitate the addition
+  lang2id:  # language name -> integer ID (0 through 10)
+    chinese: 0
+    dutch: 1
+    english: 2
+    french: 3
+    german: 4
+    italian: 5
+    japanese: 6
+    other: 7
+    polish: 8
+    portuguese: 9
+    spanish: 10
+  use_spk_emb: false  # config.yaml sets data.use_spk_emb to true; NOTE(review): which value wins depends on Hydra composition order — confirm
+  spk_embeddings_dir: null # compute on forward (model)
+  spk_encoder_type: null # compute on forward (model) | ECAPA2SpeakerEncoder16k
+  content_encoder_type: null # compute on forward (model) | hubert
+  content_encoder_ckpt: null # compute on forward (model) | lengyue233/content-vec-best
+model:  # speaker-encoder and content-encoder selection
+  use_spk_emb: true  # config.yaml sets model.use_spk_emb to false
+  spk_encoder_type: ECAPA2SpeakerEncoder16k
+  spk_encoder_ckpt: null  # Not used for ECAPA2SpeakerEncoder16k
+  content_encoder_type: spin # hubert | wavlm | spin
+  content_encoder_config: models/spin/spin.yaml # path to the config file for the content encoder
+  content_encoder_ckpt: models/spin/spin.ckpt # or models/wavlm/WavLM-Large.pt; NOTE(review): this commit adds spin.ckpt at the repository root — confirm the expected location

rmvpe.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d49bd662038808878c9d7420e0f583f506fe69086cc384f0da88f0b3a4e1115
+size 368492925

spin.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:08b2f5082bc4b4748640a67316feaf4bc577d333d1af7f85cabf5b8fe816f6ee
+size 500185599