Text-to-Speech
ONNX
zero-shot
multilingual
Approximetal committed on
Commit
7b632eb
·
verified ·
1 Parent(s): 7f94024

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ pretrained_models/ckpts/autoregressive/encodec_4cb2048_giga.th filter=lfs diff=lfs merge=lfs -text
37
+ pretrained_models/demo/test.wav filter=lfs diff=lfs merge=lfs -text
38
+ pretrained_models/whisperx/whisperx-vad-segmentation.bak filter=lfs diff=lfs merge=lfs -text
pretrained_models/ckpts/autoregressive/config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "seed": 1,
3
+ "precision": "float16",
4
+ "num_workers": 8,
5
+ "resume": false,
6
+ "tb_write_every_n_steps": 100,
7
+ "print_every_n_steps": 400,
8
+ "val_every_n_steps": 1600,
9
+ "lr": 1e-05,
10
+ "batch_size": 100,
11
+ "weight_decay": 0.0,
12
+ "warmup_fraction": 0.1,
13
+ "num_epochs": 10,
14
+ "num_steps": 500000,
15
+ "gradient_accumulation_steps": 24,
16
+ "gradient_clip_val": 1.0,
17
+ "early_stop_step": 3200,
18
+ "early_stop_threshold": -1.0,
19
+ "exp_dir": "/data/scratch/pyp/exp_pyp/VoiceCraft/gigaspeech/tts_enhanced_330M",
20
+ "dataset": "gigaspeech",
21
+ "dataset_dir": "/data/scratch/pyp/datasets/gigaspeech_phn_enc_manifest/xl",
22
+ "pseudo_epoch_size": 3000,
23
+ "phn_folder_name": "phonemes",
24
+ "encodec_folder_name": "encodec_16khz_4codebooks",
25
+ "manifest_name": "manifest_large16khz_lessambi",
26
+ "pad_x": 0,
27
+ "max_num_tokens": 20000,
28
+ "val_max_num_tokens": 6000,
29
+ "num_buckets": 10,
30
+ "dynamic_batching": 1,
31
+ "audio_max_length": 16.0,
32
+ "audio_min_length": 1.0,
33
+ "text_max_length": 400,
34
+ "text_min_length": 10.0,
35
+ "encodec_sr": 50,
36
+ "mask_len_min": 1,
37
+ "mask_len_max": 600,
38
+ "drop_long": 1,
39
+ "eos": 2051,
40
+ "reduced_eog": 1,
41
+ "special_first": 0,
42
+ "n_special": 4,
43
+ "codebook_weight": "[2,1,1,1]",
44
+ "empty_token": 2048,
45
+ "optimizer_name": "AdamW",
46
+ "reduce_lr_start_step": 3000,
47
+ "reduce_lr_start_epoch": 4,
48
+ "clipping_update_period": 1000,
49
+ "max_mask_portion": 0.9,
50
+ "max_n_spans": 3,
51
+ "shuffle_mask_embedding": 0,
52
+ "mask_sample_dist": "poisson1",
53
+ "min_gap": 5,
54
+ "n_codebooks": 4,
55
+ "text_vocab_size": 120,
56
+ "text_pad_token": 120,
57
+ "audio_vocab_size": 2048,
58
+ "eog": 2049,
59
+ "audio_pad_token": 2050,
60
+ "d_model": 1024,
61
+ "audio_embedding_dim": 1024,
62
+ "text_embedding_dropout": 0.0,
63
+ "audio_embedding_dropout": 0.0,
64
+ "text_positional_embedding_dropout": 0.0,
65
+ "audio_positional_embedding_dropout": 0.0,
66
+ "trm_dropout": 0.0,
67
+ "nhead": 16,
68
+ "num_decoder_layers": 24,
69
+ "load_model_from": "./pretrained_models/giga330M.pth"
70
+ }
pretrained_models/ckpts/autoregressive/dac_SR_8codes_2048_hop960_speech.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66e0948b7e4315975e33a40aca48fe8f659f370967209bd91729864416cb4651
3
+ size 599785518
pretrained_models/ckpts/autoregressive/encodec_4cb2048_giga.th ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:caa0c595d4919527a9728d627150aa2a0b15b6d117b21855165851333dc63378
3
+ size 1167842971
pretrained_models/ckpts/autoregressive/multilingual_330M.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87f7dfbd2a179aebb5b69e829ec8b4509725d9f3ef5db5ffbbdb70353bae4ef8
3
+ size 5245363818
pretrained_models/ckpts/multilingual_grl/multilingual_grl.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67507ed9cdddfa35b52abe268a3be3ce3fa38106669bf849be4e484e8739eeb3
3
+ size 1345523969
pretrained_models/ckpts/multilingual_prosody/multilingual_prosody.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a21dc48082750e7d0f5898f247cbff792a7e88c94c9189a5455b9c8eedd7faf1
3
+ size 1379604641
pretrained_models/ckpts/prosody_encoder/pretssel_cfg.json ADDED
@@ -0,0 +1,822 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name": null,
3
+ "common": {
4
+ "_name": null,
5
+ "no_progress_bar": false,
6
+ "log_interval": 100,
7
+ "log_format": "simple",
8
+ "log_file": null,
9
+ "aim_repo": null,
10
+ "aim_run_hash": null,
11
+ "tensorboard_logdir": "/checkpoint/mjhwang/experiments/230930-noiseaug_p2v-mls_multilingual_6lang/231005-noiseaug_p2v-mls_multilingual_6lang-alignfix.config_v2.langemb1.vuv_logit1.denoise.ngpu16",
12
+ "wandb_project": null,
13
+ "azureml_logging": false,
14
+ "seed": 1,
15
+ "cpu": false,
16
+ "tpu": false,
17
+ "bf16": false,
18
+ "fp16": false,
19
+ "memory_efficient_fp16": false,
20
+ "fp16_no_flatten_grads": false,
21
+ "fp16_init_scale": 128,
22
+ "fp16_scale_window": null,
23
+ "fp16_scale_tolerance": 0.0,
24
+ "on_cpu_convert_precision": false,
25
+ "min_loss_scale": 0.0001,
26
+ "threshold_loss_scale": null,
27
+ "amp": false,
28
+ "amp_batch_retries": 2,
29
+ "amp_init_scale": 128,
30
+ "amp_scale_window": null,
31
+ "user_dir": null,
32
+ "empty_cache_freq": 0,
33
+ "all_gather_list_size": 9999999,
34
+ "model_parallel_size": 1,
35
+ "quantization_config_path": null,
36
+ "profile": false,
37
+ "reset_logging": false,
38
+ "suppress_crashes": false,
39
+ "use_plasma_view": false,
40
+ "plasma_path": "/tmp/plasma",
41
+ "log_nvidia_smi": false,
42
+ "use_tutel_moe": false
43
+ },
44
+ "common_eval": {
45
+ "_name": null,
46
+ "path": null,
47
+ "post_process": null,
48
+ "quiet": false,
49
+ "model_overrides": "{}",
50
+ "results_path": null,
51
+ "is_moe": false,
52
+ "moe_generation": false
53
+ },
54
+ "distributed_training": {
55
+ "_name": null,
56
+ "distributed_world_size": 16,
57
+ "distributed_num_procs": 8,
58
+ "distributed_rank": 0,
59
+ "distributed_backend": "nccl",
60
+ "distributed_init_method": "tcp://learnfair0791:15129",
61
+ "distributed_port": 15129,
62
+ "device_id": 0,
63
+ "distributed_no_spawn": false,
64
+ "ddp_backend": "legacy_ddp",
65
+ "ddp_comm_hook": "none",
66
+ "bucket_cap_mb": 25,
67
+ "fix_batches_to_gpus": false,
68
+ "find_unused_parameters": true,
69
+ "gradient_as_bucket_view": false,
70
+ "fast_stat_sync": false,
71
+ "heartbeat_timeout": -1,
72
+ "broadcast_buffers": false,
73
+ "slowmo_momentum": null,
74
+ "slowmo_base_algorithm": "localsgd",
75
+ "localsgd_frequency": 3,
76
+ "nprocs_per_node": 8,
77
+ "pipeline_model_parallel": false,
78
+ "pipeline_balance": null,
79
+ "pipeline_devices": null,
80
+ "pipeline_chunks": 0,
81
+ "pipeline_encoder_balance": null,
82
+ "pipeline_encoder_devices": null,
83
+ "pipeline_decoder_balance": null,
84
+ "pipeline_decoder_devices": null,
85
+ "pipeline_checkpoint": "never",
86
+ "zero_sharding": "none",
87
+ "fp16": false,
88
+ "bf16": false,
89
+ "memory_efficient_fp16": false,
90
+ "tpu": false,
91
+ "no_reshard_after_forward": false,
92
+ "fp32_reduce_scatter": false,
93
+ "cpu_offload": false,
94
+ "use_sharded_state": false,
95
+ "not_fsdp_flatten_parameters": false,
96
+ "freeze_up_to_layer": null
97
+ },
98
+ "dataset": {
99
+ "_name": null,
100
+ "num_workers": 0,
101
+ "num_workers_valid": 0,
102
+ "skip_invalid_size_inputs_valid_test": true,
103
+ "max_tokens": 300000,
104
+ "batch_size": null,
105
+ "required_batch_size_multiple": 8,
106
+ "required_seq_len_multiple": 1,
107
+ "dataset_impl": null,
108
+ "data_buffer_size": 10,
109
+ "train_subset": "train_wenet_cmn_9_10,train_wenet_cmn_8_10,train_wenet_cmn_7_10,train_wenet_cmn_6_10,train_wenet_cmn_5_10,train_wenet_cmn_4_10,train_wenet_cmn_3_10,train_wenet_cmn_2_10,train_wenet_cmn_1_10,train_wenet_cmn_0_10,train_mls_en_9_10,train_mls_en_8_10,train_mls_en_7_10,train_mls_en_6_10,train_mls_en_5_10,train_mls_en_4_10,train_mls_en_3_10,train_mls_en_2_10,train_mls_en_1_10,train_mls_en_0_10,train_mls_deu,train_mls_fra,train_mls_spa,train_cv12_cmn,train_mls_ita,train_cv12_cmn_2,train_vl107_cmn",
110
+ "valid_subset": "dev_all",
111
+ "combine_valid_subsets": null,
112
+ "ignore_unused_valid_subsets": false,
113
+ "validate_interval": 1,
114
+ "validate_interval_updates": 5000,
115
+ "validate_after_updates": 0,
116
+ "fixed_validation_seed": null,
117
+ "disable_validation": false,
118
+ "max_tokens_valid": 300000,
119
+ "batch_size_valid": null,
120
+ "max_valid_steps": null,
121
+ "curriculum": 0,
122
+ "gen_subset": "test",
123
+ "num_shards": 1,
124
+ "shard_id": 0,
125
+ "grouped_shuffling": false,
126
+ "update_epoch_batch_itr": false,
127
+ "update_ordered_indices_seed": false
128
+ },
129
+ "optimization": {
130
+ "_name": null,
131
+ "max_epoch": 0,
132
+ "max_update": 500000,
133
+ "stop_time_hours": 0.0,
134
+ "clip_norm": 1.0,
135
+ "clip_norm_type": "l2",
136
+ "sentence_avg": false,
137
+ "update_freq": [
138
+ 4
139
+ ],
140
+ "lr": [
141
+ 0.0001
142
+ ],
143
+ "stop_min_lr": -1.0,
144
+ "use_bmuf": false,
145
+ "skip_remainder_batch": false
146
+ },
147
+ "checkpoint": {
148
+ "_name": null,
149
+ "save_dir": "/checkpoint/mjhwang/experiments/230930-noiseaug_p2v-mls_multilingual_6lang/231005-noiseaug_p2v-mls_multilingual_6lang-alignfix.config_v2.langemb1.vuv_logit1.denoise.ngpu16",
150
+ "restore_file": "checkpoint_last.pt",
151
+ "continue_once": null,
152
+ "finetune_from_model": null,
153
+ "ignore_suffix": false,
154
+ "reset_dataloader": true,
155
+ "reset_lr_scheduler": false,
156
+ "reset_meters": false,
157
+ "reset_optimizer": false,
158
+ "optimizer_overrides": "{}",
159
+ "save_interval": 1,
160
+ "save_interval_updates": 10000,
161
+ "keep_interval_updates": 1,
162
+ "keep_interval_updates_pattern": -1,
163
+ "keep_last_epochs": -1,
164
+ "keep_best_checkpoints": 10,
165
+ "no_save": false,
166
+ "no_epoch_checkpoints": true,
167
+ "no_last_checkpoints": false,
168
+ "no_best_checkpoints": false,
169
+ "no_save_optimizer_state": false,
170
+ "no_save_optimizer_state_on_training_finished": false,
171
+ "synchronize_checkpoints_before_copy": false,
172
+ "symlink_best_and_last_checkpoints": false,
173
+ "best_checkpoint_metric": "mse_loss",
174
+ "maximize_best_checkpoint_metric": false,
175
+ "patience": 20,
176
+ "checkpoint_suffix": "",
177
+ "checkpoint_shard_count": 1,
178
+ "load_checkpoint_on_all_dp_ranks": false,
179
+ "write_checkpoints_asynchronously": false,
180
+ "s3_upload_path": null,
181
+ "replication_count": 1,
182
+ "model_parallel_size": 1
183
+ },
184
+ "bmuf": {
185
+ "_name": null,
186
+ "block_lr": 1.0,
187
+ "block_momentum": 0.875,
188
+ "global_sync_iter": 50,
189
+ "warmup_iterations": 500,
190
+ "use_nbm": false,
191
+ "average_sync": false,
192
+ "distributed_world_size": 16
193
+ },
194
+ "generation": {
195
+ "_name": null,
196
+ "beam": 5,
197
+ "beam_mt": 0,
198
+ "nbest": 1,
199
+ "max_len_a": 0.0,
200
+ "max_len_b": 200,
201
+ "max_len_a_mt": 0.0,
202
+ "max_len_b_mt": 200,
203
+ "min_len": 1,
204
+ "match_source_len": false,
205
+ "unnormalized": false,
206
+ "no_early_stop": false,
207
+ "no_beamable_mm": false,
208
+ "lenpen": 1.0,
209
+ "lenpen_mt": 1.0,
210
+ "unkpen": 0.0,
211
+ "blankpen": 0.0,
212
+ "replace_unk": null,
213
+ "sacrebleu": false,
214
+ "score_reference": false,
215
+ "prefix_size": 0,
216
+ "no_repeat_ngram_size": 0,
217
+ "sampling": false,
218
+ "sampling_topk": -1,
219
+ "sampling_topp": -1.0,
220
+ "constraints": null,
221
+ "temperature": 1.0,
222
+ "diverse_beam_groups": -1,
223
+ "diverse_beam_strength": 0.5,
224
+ "diversity_rate": -1.0,
225
+ "print_alignment": null,
226
+ "print_step": false,
227
+ "lm_path": null,
228
+ "lm_weight": 0.0,
229
+ "iter_decode_eos_penalty": 0.0,
230
+ "iter_decode_max_iter": 10,
231
+ "iter_decode_force_max_iter": false,
232
+ "iter_decode_with_beam": 1,
233
+ "iter_decode_with_external_reranker": false,
234
+ "retain_iter_history": false,
235
+ "retain_dropout": false,
236
+ "retain_dropout_modules": null,
237
+ "decoding_format": null,
238
+ "no_seed_provided": false,
239
+ "eos_token": null
240
+ },
241
+ "eval_lm": {
242
+ "_name": null,
243
+ "output_word_probs": false,
244
+ "output_word_stats": false,
245
+ "context_window": 0,
246
+ "softmax_batch": 9223372036854775807,
247
+ "stats_path": null,
248
+ "max_valid_steps": null
249
+ },
250
+ "interactive": {
251
+ "_name": null,
252
+ "buffer_size": 0,
253
+ "input": "-"
254
+ },
255
+ "model": {
256
+ "no_progress_bar": false,
257
+ "log_interval": 100,
258
+ "log_format": "simple",
259
+ "log_file": null,
260
+ "aim_repo": null,
261
+ "aim_run_hash": null,
262
+ "tensorboard_logdir": "/checkpoint/mjhwang/experiments/230930-noiseaug_p2v-mls_multilingual_6lang/231005-noiseaug_p2v-mls_multilingual_6lang-alignfix.config_v2.langemb1.vuv_logit1.denoise.ngpu16",
263
+ "wandb_project": null,
264
+ "azureml_logging": false,
265
+ "seed": 1,
266
+ "cpu": false,
267
+ "tpu": false,
268
+ "bf16": false,
269
+ "fp16": false,
270
+ "memory_efficient_fp16": false,
271
+ "fp16_no_flatten_grads": false,
272
+ "fp16_init_scale": 128,
273
+ "fp16_scale_window": null,
274
+ "fp16_scale_tolerance": 0.0,
275
+ "on_cpu_convert_precision": false,
276
+ "min_loss_scale": 0.0001,
277
+ "threshold_loss_scale": null,
278
+ "amp": false,
279
+ "amp_batch_retries": 2,
280
+ "amp_init_scale": 128,
281
+ "amp_scale_window": null,
282
+ "user_dir": null,
283
+ "empty_cache_freq": 0,
284
+ "all_gather_list_size": 9999999,
285
+ "model_parallel_size": 1,
286
+ "quantization_config_path": null,
287
+ "profile": false,
288
+ "reset_logging": false,
289
+ "suppress_crashes": false,
290
+ "use_plasma_view": false,
291
+ "plasma_path": "/tmp/plasma",
292
+ "log_nvidia_smi": false,
293
+ "use_tutel_moe": false,
294
+ "tokenizer": null,
295
+ "bpe": null,
296
+ "optimizer": "adam",
297
+ "lr_scheduler": "fixed",
298
+ "simul_type": null,
299
+ "criterion": "nar_prosody2vec",
300
+ "scoring": "bleu",
301
+ "task": "prosody2vec",
302
+ "num_workers": 0,
303
+ "num_workers_valid": 0,
304
+ "skip_invalid_size_inputs_valid_test": true,
305
+ "max_tokens": 300000,
306
+ "batch_size": null,
307
+ "required_batch_size_multiple": 8,
308
+ "required_seq_len_multiple": 1,
309
+ "dataset_impl": null,
310
+ "data_buffer_size": 10,
311
+ "train_subset": "train_wenet_cmn_9_10,train_wenet_cmn_8_10,train_wenet_cmn_7_10,train_wenet_cmn_6_10,train_wenet_cmn_5_10,train_wenet_cmn_4_10,train_wenet_cmn_3_10,train_wenet_cmn_2_10,train_wenet_cmn_1_10,train_wenet_cmn_0_10,train_mls_en_9_10,train_mls_en_8_10,train_mls_en_7_10,train_mls_en_6_10,train_mls_en_5_10,train_mls_en_4_10,train_mls_en_3_10,train_mls_en_2_10,train_mls_en_1_10,train_mls_en_0_10,train_mls_deu,train_mls_fra,train_mls_spa,train_cv12_cmn,train_mls_ita,train_cv12_cmn_2,train_vl107_cmn",
312
+ "valid_subset": "dev_all",
313
+ "combine_valid_subsets": null,
314
+ "ignore_unused_valid_subsets": false,
315
+ "validate_interval": 1,
316
+ "validate_interval_updates": 5000,
317
+ "validate_after_updates": 0,
318
+ "fixed_validation_seed": null,
319
+ "disable_validation": false,
320
+ "max_tokens_valid": "300000",
321
+ "batch_size_valid": null,
322
+ "max_valid_steps": null,
323
+ "curriculum": 0,
324
+ "gen_subset": "test",
325
+ "num_shards": 1,
326
+ "shard_id": 0,
327
+ "grouped_shuffling": false,
328
+ "update_epoch_batch_itr": false,
329
+ "update_ordered_indices_seed": false,
330
+ "distributed_world_size": 16,
331
+ "distributed_num_procs": 8,
332
+ "distributed_rank": 0,
333
+ "distributed_backend": "nccl",
334
+ "distributed_init_method": null,
335
+ "distributed_port": 15129,
336
+ "device_id": 0,
337
+ "distributed_no_spawn": false,
338
+ "ddp_backend": "legacy_ddp",
339
+ "ddp_comm_hook": "none",
340
+ "bucket_cap_mb": 25,
341
+ "fix_batches_to_gpus": false,
342
+ "find_unused_parameters": true,
343
+ "gradient_as_bucket_view": false,
344
+ "fast_stat_sync": false,
345
+ "heartbeat_timeout": -1,
346
+ "broadcast_buffers": false,
347
+ "slowmo_momentum": null,
348
+ "slowmo_base_algorithm": "localsgd",
349
+ "localsgd_frequency": 3,
350
+ "nprocs_per_node": 8,
351
+ "pipeline_model_parallel": false,
352
+ "pipeline_balance": null,
353
+ "pipeline_devices": null,
354
+ "pipeline_chunks": 0,
355
+ "pipeline_encoder_balance": null,
356
+ "pipeline_encoder_devices": null,
357
+ "pipeline_decoder_balance": null,
358
+ "pipeline_decoder_devices": null,
359
+ "pipeline_checkpoint": "never",
360
+ "zero_sharding": "none",
361
+ "no_reshard_after_forward": false,
362
+ "fp32_reduce_scatter": false,
363
+ "cpu_offload": false,
364
+ "use_sharded_state": false,
365
+ "not_fsdp_flatten_parameters": false,
366
+ "freeze_up_to_layer": null,
367
+ "arch": "nar_p2v",
368
+ "max_epoch": 0,
369
+ "max_update": 500000,
370
+ "stop_time_hours": 0,
371
+ "clip_norm": 1.0,
372
+ "clip_norm_type": "l2",
373
+ "sentence_avg": false,
374
+ "update_freq": [
375
+ 4
376
+ ],
377
+ "lr": [
378
+ 0.0001
379
+ ],
380
+ "stop_min_lr": -1.0,
381
+ "use_bmuf": false,
382
+ "skip_remainder_batch": false,
383
+ "save_dir": "/checkpoint/mjhwang/experiments/230930-noiseaug_p2v-mls_multilingual_6lang/231005-noiseaug_p2v-mls_multilingual_6lang-alignfix.config_v2.langemb1.vuv_logit1.denoise.ngpu16",
384
+ "restore_file": "checkpoint_last.pt",
385
+ "continue_once": null,
386
+ "finetune_from_model": null,
387
+ "ignore_suffix": false,
388
+ "reset_dataloader": true,
389
+ "reset_lr_scheduler": false,
390
+ "reset_meters": false,
391
+ "reset_optimizer": false,
392
+ "optimizer_overrides": "{}",
393
+ "save_interval": 1,
394
+ "save_interval_updates": 10000,
395
+ "keep_interval_updates": 1,
396
+ "keep_interval_updates_pattern": -1,
397
+ "keep_last_epochs": -1,
398
+ "keep_best_checkpoints": 10,
399
+ "no_save": false,
400
+ "no_epoch_checkpoints": true,
401
+ "no_last_checkpoints": false,
402
+ "no_best_checkpoints": false,
403
+ "no_save_optimizer_state": false,
404
+ "no_save_optimizer_state_on_training_finished": false,
405
+ "synchronize_checkpoints_before_copy": false,
406
+ "symlink_best_and_last_checkpoints": false,
407
+ "best_checkpoint_metric": "mse_loss",
408
+ "maximize_best_checkpoint_metric": false,
409
+ "patience": 20,
410
+ "checkpoint_suffix": "",
411
+ "checkpoint_shard_count": 1,
412
+ "load_checkpoint_on_all_dp_ranks": false,
413
+ "write_checkpoints_asynchronously": false,
414
+ "s3_upload_path": null,
415
+ "replication_count": 1,
416
+ "store_ema": false,
417
+ "ema_decay": 0.9999,
418
+ "ema_start_update": 0,
419
+ "ema_seed_model": null,
420
+ "ema_update_freq": 1,
421
+ "ema_fp32": false,
422
+ "load_prosody_encoder_from": null,
423
+ "freeze_prosody_encoder": false,
424
+ "unit_encoder_arch": "daft_exprt_encoder",
425
+ "prosody_encoder_arch": "ecapa_tdnn2",
426
+ "decoder_arch": "daft_exprt_decoder",
427
+ "data": "/large_experiments/seamless/ust/mjhwang/data/denoise_prosody2vec/mls_multilingual_6lang_xlsr_10k_noiseaug",
428
+ "config_yaml": "config_v2.yaml",
429
+ "max_source_positions": 300000,
430
+ "max_target_positions": 300000,
431
+ "n_frames_per_step": 1,
432
+ "eos_prob_threshold": 0.5,
433
+ "eval_inference": true,
434
+ "eval_tb_nsample": 8,
435
+ "eval_bleu": false,
436
+ "vocoder": "griffin_lim",
437
+ "spec_bwd_max_iter": 8,
438
+ "jit_data_offloading": true,
439
+ "jit_data_root": "/scratch/slurm_tmpdir/${SLURM_JOB_ID}",
440
+ "adam_betas": "(0.9, 0.98)",
441
+ "adam_eps": 1e-08,
442
+ "weight_decay": 0.0,
443
+ "use_old_adam": false,
444
+ "fp16_adam_stats": false,
445
+ "block_wise": false,
446
+ "force_anneal": null,
447
+ "lr_shrink": 0.1,
448
+ "warmup_updates": 1000,
449
+ "ctc_weight": 0.0,
450
+ "forward_sum_weight": 1.0,
451
+ "bin_loss_start_ratio": 0.1,
452
+ "bin_loss_warmup_steps": 6000,
453
+ "film_regul_weight": 0.001,
454
+ "pros_consist_weight": 0.0,
455
+ "denoise_target": true,
456
+ "snr_threshold": 2000000000000000.0,
457
+ "pad": 1,
458
+ "eos": 2,
459
+ "unk": 3,
460
+ "use_spkr_emb": 0,
461
+ "use_lang_emb": 1,
462
+ "prosody_embed_dim": 512,
463
+ "use_ucmvn": 0,
464
+ "use_spec_augment": 1,
465
+ "use_prosody_layernorm": 1,
466
+ "var_pred_hidden_dim": 512,
467
+ "var_pred_kernel_size": 5,
468
+ "var_pred_n_bins": -1,
469
+ "add_variance_parallel": 1,
470
+ "use_film_decoder": 1,
471
+ "predict_var_vuv": 1,
472
+ "predict_vuv_logit": 1,
473
+ "predict_frm_f0_vuv": 0,
474
+ "no_seed_provided": false,
475
+ "speaker_embed_dim": 192,
476
+ "use_utterance_speaker_embed": false,
477
+ "lang_embed_dim": 64,
478
+ "_name": "nar_p2v",
479
+ "lang_to_id": {
480
+ "cmn": 0,
481
+ "deu": 1,
482
+ "eng": 2,
483
+ "fra": 3,
484
+ "ita": 4,
485
+ "spa": 5
486
+ },
487
+ "pitch_min": 0.0,
488
+ "pitch_max": 6.858574643755327,
489
+ "energy_min": 0.0,
490
+ "energy_max": 6.360039234161377,
491
+ "speaker_emb_path": null,
492
+ "input_feat_per_channel": 80,
493
+ "input_channels": 1,
494
+ "speaker_to_id": null,
495
+ "dropout": 0.2,
496
+ "fft_hidden_dim": 1024,
497
+ "fft_kernel_size": 9,
498
+ "attention_dropout": 0.0,
499
+ "encoder_layers": 4,
500
+ "encoder_embed_dim": 256,
501
+ "encoder_attention_heads": 2,
502
+ "output_frame_dim": 80,
503
+ "prosody_channels": [
504
+ 512,
505
+ 512,
506
+ 512,
507
+ 512,
508
+ 1536
509
+ ],
510
+ "prosody_kernel_sizes": [
511
+ 5,
512
+ 3,
513
+ 3,
514
+ 3,
515
+ 1
516
+ ],
517
+ "prosody_dilations": [
518
+ 1,
519
+ 2,
520
+ 3,
521
+ 4,
522
+ 1
523
+ ],
524
+ "prosody_attention_channels": 128,
525
+ "prosody_res2net_scale": 8,
526
+ "prosody_se_channels": 128,
527
+ "prosody_global_context": true,
528
+ "prosody_groups": [
529
+ 1,
530
+ 1,
531
+ 1,
532
+ 1,
533
+ 1
534
+ ],
535
+ "decoder_layers": 4,
536
+ "decoder_embed_dim": 256,
537
+ "decoder_attention_heads": 2,
538
+ "var_pred_dropout": 0.5,
539
+ "add_postnet": true,
540
+ "postnet_dropout": 0.5,
541
+ "postnet_layers": 5,
542
+ "postnet_conv_dim": 512,
543
+ "postnet_conv_kernel_size": 5,
544
+ "upsampling": "gaussian"
545
+ },
546
+ "task": {
547
+ "no_progress_bar": false,
548
+ "log_interval": 100,
549
+ "log_format": "simple",
550
+ "log_file": null,
551
+ "aim_repo": null,
552
+ "aim_run_hash": null,
553
+ "tensorboard_logdir": "/checkpoint/mjhwang/experiments/230930-noiseaug_p2v-mls_multilingual_6lang/231005-noiseaug_p2v-mls_multilingual_6lang-alignfix.config_v2.langemb1.vuv_logit1.denoise.ngpu16",
554
+ "wandb_project": null,
555
+ "azureml_logging": false,
556
+ "seed": 1,
557
+ "cpu": false,
558
+ "tpu": false,
559
+ "bf16": false,
560
+ "fp16": false,
561
+ "memory_efficient_fp16": false,
562
+ "fp16_no_flatten_grads": false,
563
+ "fp16_init_scale": 128,
564
+ "fp16_scale_window": null,
565
+ "fp16_scale_tolerance": 0.0,
566
+ "on_cpu_convert_precision": false,
567
+ "min_loss_scale": 0.0001,
568
+ "threshold_loss_scale": null,
569
+ "amp": false,
570
+ "amp_batch_retries": 2,
571
+ "amp_init_scale": 128,
572
+ "amp_scale_window": null,
573
+ "user_dir": null,
574
+ "empty_cache_freq": 0,
575
+ "all_gather_list_size": 9999999,
576
+ "model_parallel_size": 1,
577
+ "quantization_config_path": null,
578
+ "profile": false,
579
+ "reset_logging": false,
580
+ "suppress_crashes": false,
581
+ "use_plasma_view": false,
582
+ "plasma_path": "/tmp/plasma",
583
+ "log_nvidia_smi": false,
584
+ "use_tutel_moe": false,
585
+ "tokenizer": null,
586
+ "bpe": null,
587
+ "optimizer": "adam",
588
+ "lr_scheduler": "fixed",
589
+ "simul_type": null,
590
+ "criterion": "nar_prosody2vec",
591
+ "scoring": "bleu",
592
+ "task": "prosody2vec",
593
+ "num_workers": 0,
594
+ "num_workers_valid": 0,
595
+ "skip_invalid_size_inputs_valid_test": true,
596
+ "max_tokens": 300000,
597
+ "batch_size": null,
598
+ "required_batch_size_multiple": 8,
599
+ "required_seq_len_multiple": 1,
600
+ "dataset_impl": null,
601
+ "data_buffer_size": 10,
602
+ "train_subset": "train_wenet_cmn_9_10,train_wenet_cmn_8_10,train_wenet_cmn_7_10,train_wenet_cmn_6_10,train_wenet_cmn_5_10,train_wenet_cmn_4_10,train_wenet_cmn_3_10,train_wenet_cmn_2_10,train_wenet_cmn_1_10,train_wenet_cmn_0_10,train_mls_en_9_10,train_mls_en_8_10,train_mls_en_7_10,train_mls_en_6_10,train_mls_en_5_10,train_mls_en_4_10,train_mls_en_3_10,train_mls_en_2_10,train_mls_en_1_10,train_mls_en_0_10,train_mls_deu,train_mls_fra,train_mls_spa,train_cv12_cmn,train_mls_ita,train_cv12_cmn_2,train_vl107_cmn",
603
+ "valid_subset": "dev_all",
604
+ "combine_valid_subsets": null,
605
+ "ignore_unused_valid_subsets": false,
606
+ "validate_interval": 1,
607
+ "validate_interval_updates": 5000,
608
+ "validate_after_updates": 0,
609
+ "fixed_validation_seed": null,
610
+ "disable_validation": false,
611
+ "max_tokens_valid": "300000",
612
+ "batch_size_valid": null,
613
+ "max_valid_steps": null,
614
+ "curriculum": 0,
615
+ "gen_subset": "test",
616
+ "num_shards": 1,
617
+ "shard_id": 0,
618
+ "grouped_shuffling": false,
619
+ "update_epoch_batch_itr": false,
620
+ "update_ordered_indices_seed": false,
621
+ "distributed_world_size": 16,
622
+ "distributed_num_procs": 8,
623
+ "distributed_rank": 0,
624
+ "distributed_backend": "nccl",
625
+ "distributed_init_method": null,
626
+ "distributed_port": 15129,
627
+ "device_id": 0,
628
+ "distributed_no_spawn": false,
629
+ "ddp_backend": "legacy_ddp",
630
+ "ddp_comm_hook": "none",
631
+ "bucket_cap_mb": 25,
632
+ "fix_batches_to_gpus": false,
633
+ "find_unused_parameters": true,
634
+ "gradient_as_bucket_view": false,
635
+ "fast_stat_sync": false,
636
+ "heartbeat_timeout": -1,
637
+ "broadcast_buffers": false,
638
+ "slowmo_momentum": null,
639
+ "slowmo_base_algorithm": "localsgd",
640
+ "localsgd_frequency": 3,
641
+ "nprocs_per_node": 8,
642
+ "pipeline_model_parallel": false,
643
+ "pipeline_balance": null,
644
+ "pipeline_devices": null,
645
+ "pipeline_chunks": 0,
646
+ "pipeline_encoder_balance": null,
647
+ "pipeline_encoder_devices": null,
648
+ "pipeline_decoder_balance": null,
649
+ "pipeline_decoder_devices": null,
650
+ "pipeline_checkpoint": "never",
651
+ "zero_sharding": "none",
652
+ "no_reshard_after_forward": false,
653
+ "fp32_reduce_scatter": false,
654
+ "cpu_offload": false,
655
+ "use_sharded_state": false,
656
+ "not_fsdp_flatten_parameters": false,
657
+ "freeze_up_to_layer": null,
658
+ "arch": "nar_p2v",
659
+ "max_epoch": 0,
660
+ "max_update": 500000,
661
+ "stop_time_hours": 0,
662
+ "clip_norm": 1.0,
663
+ "clip_norm_type": "l2",
664
+ "sentence_avg": false,
665
+ "update_freq": [
666
+ 4
667
+ ],
668
+ "lr": [
669
+ 0.0001
670
+ ],
671
+ "stop_min_lr": -1.0,
672
+ "use_bmuf": false,
673
+ "skip_remainder_batch": false,
674
+ "save_dir": "/checkpoint/mjhwang/experiments/230930-noiseaug_p2v-mls_multilingual_6lang/231005-noiseaug_p2v-mls_multilingual_6lang-alignfix.config_v2.langemb1.vuv_logit1.denoise.ngpu16",
675
+ "restore_file": "checkpoint_last.pt",
676
+ "continue_once": null,
677
+ "finetune_from_model": null,
678
+ "ignore_suffix": false,
679
+ "reset_dataloader": true,
680
+ "reset_lr_scheduler": false,
681
+ "reset_meters": false,
682
+ "reset_optimizer": false,
683
+ "optimizer_overrides": "{}",
684
+ "save_interval": 1,
685
+ "save_interval_updates": 10000,
686
+ "keep_interval_updates": 1,
687
+ "keep_interval_updates_pattern": -1,
688
+ "keep_last_epochs": -1,
689
+ "keep_best_checkpoints": 10,
690
+ "no_save": false,
691
+ "no_epoch_checkpoints": true,
692
+ "no_last_checkpoints": false,
693
+ "no_best_checkpoints": false,
694
+ "no_save_optimizer_state": false,
695
+ "no_save_optimizer_state_on_training_finished": false,
696
+ "synchronize_checkpoints_before_copy": false,
697
+ "symlink_best_and_last_checkpoints": false,
698
+ "best_checkpoint_metric": "mse_loss",
699
+ "maximize_best_checkpoint_metric": false,
700
+ "patience": 20,
701
+ "checkpoint_suffix": "",
702
+ "checkpoint_shard_count": 1,
703
+ "load_checkpoint_on_all_dp_ranks": false,
704
+ "write_checkpoints_asynchronously": false,
705
+ "s3_upload_path": null,
706
+ "replication_count": 1,
707
+ "store_ema": false,
708
+ "ema_decay": 0.9999,
709
+ "ema_start_update": 0,
710
+ "ema_seed_model": null,
711
+ "ema_update_freq": 1,
712
+ "ema_fp32": false,
713
+ "load_prosody_encoder_from": null,
714
+ "freeze_prosody_encoder": false,
715
+ "unit_encoder_arch": "daft_exprt_encoder",
716
+ "prosody_encoder_arch": "ecapa_tdnn2",
717
+ "decoder_arch": "daft_exprt_decoder",
718
+ "data": "/large_experiments/seamless/ust/mjhwang/data/denoise_prosody2vec/mls_multilingual_6lang_xlsr_10k_noiseaug",
719
+ "config_yaml": "config_v2.yaml",
720
+ "max_source_positions": 300000,
721
+ "max_target_positions": 300000,
722
+ "n_frames_per_step": 1,
723
+ "eos_prob_threshold": 0.5,
724
+ "eval_inference": true,
725
+ "eval_tb_nsample": 8,
726
+ "eval_bleu": false,
727
+ "vocoder": "griffin_lim",
728
+ "spec_bwd_max_iter": 8,
729
+ "jit_data_offloading": true,
730
+ "jit_data_root": "/scratch/slurm_tmpdir/${SLURM_JOB_ID}",
731
+ "adam_betas": "(0.9, 0.98)",
732
+ "adam_eps": 1e-08,
733
+ "weight_decay": 0.0,
734
+ "use_old_adam": false,
735
+ "fp16_adam_stats": false,
736
+ "block_wise": false,
737
+ "force_anneal": null,
738
+ "lr_shrink": 0.1,
739
+ "warmup_updates": 1000,
740
+ "ctc_weight": 0.0,
741
+ "forward_sum_weight": 1.0,
742
+ "bin_loss_start_ratio": 0.1,
743
+ "bin_loss_warmup_steps": 6000,
744
+ "film_regul_weight": 0.001,
745
+ "pros_consist_weight": 0.0,
746
+ "denoise_target": true,
747
+ "snr_threshold": 2000000000000000.0,
748
+ "pad": 1,
749
+ "eos": 2,
750
+ "unk": 3,
751
+ "use_spkr_emb": 0,
752
+ "use_lang_emb": 1,
753
+ "prosody_embed_dim": 512,
754
+ "use_ucmvn": 0,
755
+ "use_spec_augment": 1,
756
+ "use_prosody_layernorm": 1,
757
+ "var_pred_hidden_dim": 512,
758
+ "var_pred_kernel_size": 5,
759
+ "var_pred_n_bins": -1,
760
+ "add_variance_parallel": 1,
761
+ "use_film_decoder": 1,
762
+ "predict_var_vuv": 1,
763
+ "predict_vuv_logit": 1,
764
+ "predict_frm_f0_vuv": 0,
765
+ "no_seed_provided": false,
766
+ "speaker_embed_dim": 192,
767
+ "use_utterance_speaker_embed": false,
768
+ "lang_embed_dim": 64,
769
+ "_name": "prosody2vec"
770
+ },
771
+ "criterion": {
772
+ "_name": "nar_prosody2vec",
773
+ "ctc_weight": 0.0,
774
+ "forward_sum_weight": 1.0,
775
+ "bin_loss_start_ratio": 0.1,
776
+ "bin_loss_warmup_steps": 6000,
777
+ "film_regul_weight": 0.001,
778
+ "pros_consist_weight": 0.0,
779
+ "denoise_target": true,
780
+ "snr_threshold": 2000000000000000.0
781
+ },
782
+ "optimizer": {
783
+ "_name": "adam",
784
+ "adam_betas": "(0.9, 0.98)",
785
+ "adam_eps": 1e-08,
786
+ "weight_decay": 0.0,
787
+ "use_old_adam": false,
788
+ "fp16_adam_stats": false,
789
+ "tpu": false,
790
+ "lr": [
791
+ 0.0001
792
+ ],
793
+ "block_wise": false
794
+ },
795
+ "lr_scheduler": {
796
+ "_name": "fixed",
797
+ "force_anneal": null,
798
+ "lr_shrink": 0.1,
799
+ "warmup_updates": 1000,
800
+ "lr": [
801
+ 0.0001
802
+ ]
803
+ },
804
+ "scoring": {
805
+ "_name": "bleu",
806
+ "pad": 1,
807
+ "eos": 2,
808
+ "unk": 3
809
+ },
810
+ "bpe": null,
811
+ "tokenizer": null,
812
+ "ema": {
813
+ "_name": null,
814
+ "store_ema": false,
815
+ "ema_decay": 0.9999,
816
+ "ema_start_update": 0,
817
+ "ema_seed_model": null,
818
+ "ema_update_freq": 1,
819
+ "ema_fp32": false
820
+ },
821
+ "simul_type": null
822
+ }
pretrained_models/ckpts/prosody_encoder/prosody_encoder_UnitY2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf01eaec68b9f8a078ac80550a9ad7de3857fb52f3aac126e5de31aa036bd015
3
+ size 14402800
pretrained_models/ckpts/vocos-mel-24khz/.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
pretrained_models/ckpts/vocos-mel-24khz/README.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
4
+
5
+ # Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis
6
+
7
+ [Audio samples](https://charactr-platform.github.io/vocos/) |
8
+ Paper [[abs]](https://arxiv.org/abs/2306.00814) [[pdf]](https://arxiv.org/pdf/2306.00814.pdf)
9
+
10
+ Vocos is a fast neural vocoder designed to synthesize audio waveforms from acoustic features. Trained using a Generative
11
+ Adversarial Network (GAN) objective, Vocos can generate waveforms in a single forward pass. Unlike other typical
12
+ GAN-based vocoders, Vocos does not model audio samples in the time domain. Instead, it generates spectral
13
+ coefficients, facilitating rapid audio reconstruction through the inverse Fourier transform.
14
+
15
+ ## Installation
16
+
17
+ To use Vocos only in inference mode, install it using:
18
+
19
+ ```bash
20
+ pip install vocos
21
+ ```
22
+
23
+ If you wish to train the model, install it with additional dependencies:
24
+
25
+ ```bash
26
+ pip install "vocos[train]"
27
+ ```
28
+
29
+ ## Usage
30
+
31
+ ### Reconstruct audio from mel-spectrogram
32
+
33
+ ```python
34
+ import torch
35
+
36
+ from vocos import Vocos
37
+
38
+ vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
39
+
40
+ mel = torch.randn(1, 100, 256) # B, C, T
41
+ audio = vocos.decode(mel)
42
+ ```
43
+
44
+ Copy-synthesis from a file:
45
+
46
+ ```python
47
+ import torchaudio
48
+
49
+ y, sr = torchaudio.load(YOUR_AUDIO_FILE)
50
+ if y.size(0) > 1: # mix to mono
51
+     y = y.mean(dim=0, keepdim=True)
52
+ y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=24000)
53
+ y_hat = vocos(y)
54
+ ```
55
+
56
+ ## Citation
57
+
58
+ If this code contributes to your research, please cite our work:
59
+
60
+ ```
61
+ @article{siuzdak2023vocos,
62
+ title={Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis},
63
+ author={Siuzdak, Hubert},
64
+ journal={arXiv preprint arXiv:2306.00814},
65
+ year={2023}
66
+ }
67
+ ```
68
+
69
+ ## License
70
+
71
+ The code in this repository is released under the MIT license.
pretrained_models/ckpts/vocos-mel-24khz/config.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ feature_extractor:
2
+   class_path: vocos.feature_extractors.MelSpectrogramFeatures
3
+   init_args:
4
+     sample_rate: 24000
5
+     n_fft: 1024
6
+     hop_length: 256
7
+     n_mels: 100
8
+     padding: center
9
+
10
+ backbone:
11
+   class_path: vocos.models.VocosBackbone
12
+   init_args:
13
+     input_channels: 100
14
+     dim: 512
15
+     intermediate_dim: 1536
16
+     num_layers: 8
17
+
18
+ head:
19
+   class_path: vocos.heads.ISTFTHead
20
+   init_args:
21
+     dim: 512
22
+     n_fft: 1024
23
+     hop_length: 256
24
+     padding: center
pretrained_models/ckpts/vocos-mel-24khz/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97ec976ad1fd67a33ab2682d29c0ac7df85234fae875aefcc5fb215681a91b2a
3
+ size 54365991
pretrained_models/data/multilingual_grl/vocab.txt ADDED
@@ -0,0 +1,898 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ !
3
+ "
4
+ #1
5
+ #2
6
+ #3
7
+ #4
8
+ (zh)a1
9
+ (zh)a2
10
+ (zh)a3
11
+ (zh)a4
12
+ (zh)a5
13
+ (zh)ai1
14
+ (zh)ai2
15
+ (zh)ai3
16
+ (zh)ai4
17
+ (zh)ai5
18
+ (zh)an1
19
+ (zh)an2
20
+ (zh)an3
21
+ (zh)an4
22
+ (zh)an5
23
+ (zh)ang1
24
+ (zh)ang2
25
+ (zh)ang3
26
+ (zh)ang4
27
+ (zh)ang5
28
+ (zh)ao1
29
+ (zh)ao2
30
+ (zh)ao3
31
+ (zh)ao4
32
+ (zh)ao5
33
+ (zh)b
34
+ (zh)c
35
+ (zh)ch
36
+ (zh)d
37
+ (zh)e1
38
+ (zh)e2
39
+ (zh)e3
40
+ (zh)e4
41
+ (zh)e5
42
+ (zh)ei1
43
+ (zh)ei2
44
+ (zh)ei3
45
+ (zh)ei4
46
+ (zh)ei5
47
+ (zh)en1
48
+ (zh)en2
49
+ (zh)en3
50
+ (zh)en4
51
+ (zh)en5
52
+ (zh)eng1
53
+ (zh)eng2
54
+ (zh)eng3
55
+ (zh)eng4
56
+ (zh)eng5
57
+ (zh)er1
58
+ (zh)er2
59
+ (zh)er3
60
+ (zh)er4
61
+ (zh)er5
62
+ (zh)f
63
+ (zh)g
64
+ (zh)h
65
+ (zh)i1
66
+ (zh)i2
67
+ (zh)i3
68
+ (zh)i4
69
+ (zh)i5
70
+ (zh)ia1
71
+ (zh)ia2
72
+ (zh)ia3
73
+ (zh)ia4
74
+ (zh)ia5
75
+ (zh)ian1
76
+ (zh)ian2
77
+ (zh)ian3
78
+ (zh)ian4
79
+ (zh)ian5
80
+ (zh)iang1
81
+ (zh)iang2
82
+ (zh)iang3
83
+ (zh)iang4
84
+ (zh)iang5
85
+ (zh)iao1
86
+ (zh)iao2
87
+ (zh)iao3
88
+ (zh)iao4
89
+ (zh)iao5
90
+ (zh)ie1
91
+ (zh)ie2
92
+ (zh)ie3
93
+ (zh)ie4
94
+ (zh)ie5
95
+ (zh)in1
96
+ (zh)in2
97
+ (zh)in3
98
+ (zh)in4
99
+ (zh)in5
100
+ (zh)ing1
101
+ (zh)ing2
102
+ (zh)ing3
103
+ (zh)ing4
104
+ (zh)ing5
105
+ (zh)iong1
106
+ (zh)iong2
107
+ (zh)iong3
108
+ (zh)iong4
109
+ (zh)iong5
110
+ (zh)iou1
111
+ (zh)iou2
112
+ (zh)iou3
113
+ (zh)iou4
114
+ (zh)iou5
115
+ (zh)j
116
+ (zh)k
117
+ (zh)l
118
+ (zh)m
119
+ (zh)n
120
+ (zh)o1
121
+ (zh)o2
122
+ (zh)o3
123
+ (zh)o4
124
+ (zh)o5
125
+ (zh)ong1
126
+ (zh)ong2
127
+ (zh)ong3
128
+ (zh)ong4
129
+ (zh)ong5
130
+ (zh)ou1
131
+ (zh)ou2
132
+ (zh)ou3
133
+ (zh)ou4
134
+ (zh)ou5
135
+ (zh)p
136
+ (zh)q
137
+ (zh)r
138
+ (zh)s
139
+ (zh)sh
140
+ (zh)t
141
+ (zh)u1
142
+ (zh)u2
143
+ (zh)u3
144
+ (zh)u4
145
+ (zh)u5
146
+ (zh)ua1
147
+ (zh)ua2
148
+ (zh)ua3
149
+ (zh)ua4
150
+ (zh)ua5
151
+ (zh)uai1
152
+ (zh)uai2
153
+ (zh)uai3
154
+ (zh)uai4
155
+ (zh)uai5
156
+ (zh)uan1
157
+ (zh)uan2
158
+ (zh)uan3
159
+ (zh)uan4
160
+ (zh)uan5
161
+ (zh)uang1
162
+ (zh)uang2
163
+ (zh)uang3
164
+ (zh)uang4
165
+ (zh)uang5
166
+ (zh)uei1
167
+ (zh)uei2
168
+ (zh)uei3
169
+ (zh)uei4
170
+ (zh)uei5
171
+ (zh)uen1
172
+ (zh)uen2
173
+ (zh)uen3
174
+ (zh)uen4
175
+ (zh)uen5
176
+ (zh)ueng1
177
+ (zh)ueng2
178
+ (zh)ueng3
179
+ (zh)ueng4
180
+ (zh)ueng5
181
+ (zh)uo1
182
+ (zh)uo2
183
+ (zh)uo3
184
+ (zh)uo4
185
+ (zh)uo5
186
+ (zh)v1
187
+ (zh)v2
188
+ (zh)v3
189
+ (zh)v4
190
+ (zh)v5
191
+ (zh)van1
192
+ (zh)van2
193
+ (zh)van3
194
+ (zh)van4
195
+ (zh)van5
196
+ (zh)ve1
197
+ (zh)ve2
198
+ (zh)ve3
199
+ (zh)ve4
200
+ (zh)ve5
201
+ (zh)vn1
202
+ (zh)vn2
203
+ (zh)vn3
204
+ (zh)vn4
205
+ (zh)vn5
206
+ (zh)w
207
+ (zh)x
208
+ (zh)y
209
+ (zh)z
210
+ (zh)zh
211
+ (de)a
212
+ (de)aɪ
213
+ (de)aʊ
214
+ (de)b
215
+ (de)bʲ
216
+ (de)c
217
+ (de)d
218
+ (de)dʑ
219
+ (de)dʒ
220
+ (de)e
221
+ (de)eː
222
+ (de)f
223
+ (de)h
224
+ (de)i
225
+ (de)iː
226
+ (de)j
227
+ (de)k
228
+ (de)kʲ
229
+ (de)l
230
+ (de)m
231
+ (de)mʲ
232
+ (de)n
233
+ (de)o
234
+ (de)oɪ
235
+ (de)oː
236
+ (de)p
237
+ (de)pf
238
+ (de)pʲ
239
+ (de)r
240
+ (de)s
241
+ (de)t
242
+ (de)ts
243
+ (de)tɕ
244
+ (de)tʃ
245
+ (de)tʲ
246
+ (de)u
247
+ (de)uː
248
+ (de)v
249
+ (de)vʲ
250
+ (de)w
251
+ (de)x
252
+ (de)y
253
+ (de)yː
254
+ (de)z
255
+ (de)ç
256
+ (de)ð
257
+ (de)øː
258
+ (de)ŋ
259
+ (de)œ
260
+ (de)ɑ
261
+ (de)ɑː
262
+ (de)ɔ
263
+ (de)ɔø
264
+ (de)ɔː
265
+ (de)ɕ
266
+ (de)ə
267
+ (de)ɛ
268
+ (de)ɛɪ
269
+ (de)ɛː
270
+ (de)ɜ
271
+ (de)ɡ
272
+ (de)ɡʲ
273
+ (de)ɣ
274
+ (de)ɨ
275
+ (de)ɪ
276
+ (de)ɲ
277
+ (de)ɲʲ
278
+ (de)ɾ
279
+ (de)ʃ
280
+ (de)ʊ
281
+ (de)ʑ
282
+ (de)ʒ
283
+ (de)θ
284
+ (el)
285
+ (en)a
286
+ (en)aɪ
287
+ (en)aɪə
288
+ (en)aɪɚ
289
+ (en)aʊ
290
+ (en)aː
291
+ (en)b
292
+ (en)bʲ
293
+ (en)c
294
+ (en)d
295
+ (en)dʑ
296
+ (en)dʒ
297
+ (en)e
298
+ (en)eə
299
+ (en)eɪ
300
+ (en)f
301
+ (en)h
302
+ (en)i
303
+ (en)iə
304
+ (en)iː
305
+ (en)iːː
306
+ (en)j
307
+ (en)k
308
+ (en)l
309
+ (en)m
310
+ (en)n
311
+ (en)nʲ
312
+ (en)o
313
+ (en)oʊ
314
+ (en)oː
315
+ (en)oːɹ
316
+ (en)p
317
+ (en)q
318
+ (en)r
319
+ (en)s
320
+ (en)t
321
+ (en)tɕ
322
+ (en)tʃ
323
+ (en)u
324
+ (en)uː
325
+ (en)v
326
+ (en)w
327
+ (en)x
328
+ (en)z
329
+ (en)æ
330
+ (en)ææ
331
+ (en)ç
332
+ (en)ð
333
+ (en)ŋ
334
+ (en)ɐ
335
+ (en)ɐɐ
336
+ (en)ɑ
337
+ (en)ɑː
338
+ (en)ɑːɹ
339
+ (en)ɒ
340
+ (en)ɔ
341
+ (en)ɔɪ
342
+ (en)ɔː
343
+ (en)ɔːɹ
344
+ (en)ɕ
345
+ (en)ə
346
+ (en)əl
347
+ (en)əʊ
348
+ (en)ɚ
349
+ (en)ɛ
350
+ (en)ɛɹ
351
+ (en)ɛː
352
+ (en)ɜː
353
+ (en)ɡ
354
+ (en)ɡʲ
355
+ (en)ɣ
356
+ (en)ɨ
357
+ (en)ɪ
358
+ (en)ɪɹ
359
+ (en)ɪː
360
+ (en)ɬ
361
+ (en)ɲ
362
+ (en)ɲʲ
363
+ (en)ɹ
364
+ (en)ɾ
365
+ (en)ʁ
366
+ (en)ʃ
367
+ (en)ʊ
368
+ (en)ʊə
369
+ (en)ʊɹ
370
+ (en)ʌ
371
+ (en)ʍ
372
+ (en)ʒ
373
+ (en)ʔ
374
+ (en)θ
375
+ (en)ᵻ
376
+ (es)a
377
+ (es)aɪ
378
+ (es)aʊ
379
+ (es)b
380
+ (es)c
381
+ (es)d
382
+ (es)dʒ
383
+ (es)e
384
+ (es)eɪ
385
+ (es)eʊ
386
+ (es)f
387
+ (es)h
388
+ (es)i
389
+ (es)iː
390
+ (es)j
391
+ (es)k
392
+ (es)l
393
+ (es)m
394
+ (es)n
395
+ (es)o
396
+ (es)oɪ
397
+ (es)p
398
+ (es)pː
399
+ (es)r
400
+ (es)s
401
+ (es)t
402
+ (es)ts
403
+ (es)tʃ
404
+ (es)u
405
+ (es)v
406
+ (es)w
407
+ (es)x
408
+ (es)z
409
+ (es)ð
410
+ (es)ŋ
411
+ (es)ə
412
+ (es)ɛ
413
+ (es)ɟ
414
+ (es)ɡ
415
+ (es)ɣ
416
+ (es)ɫ
417
+ (es)ɲ
418
+ (es)ɾ
419
+ (es)ʃ
420
+ (es)ʎ
421
+ (es)ʝ
422
+ (es)β
423
+ (es)θ
424
+ (fr)a
425
+ (fr)aɪ
426
+ (fr)aʊ
427
+ (fr)aː
428
+ (fr)b
429
+ (fr)c
430
+ (fr)d
431
+ (fr)dʒ
432
+ (fr)e
433
+ (fr)eʊ
434
+ (fr)f
435
+ (fr)h
436
+ (fr)i
437
+ (fr)iʰr
438
+ (fr)iː
439
+ (fr)j
440
+ (fr)k
441
+ (fr)l
442
+ (fr)m
443
+ (fr)n
444
+ (fr)o
445
+ (fr)oː
446
+ (fr)p
447
+ (fr)r
448
+ (fr)s
449
+ (fr)t
450
+ (fr)tʃ
451
+ (fr)u
452
+ (fr)uː
453
+ (fr)v
454
+ (fr)w
455
+ (fr)x
456
+ (fr)y
457
+ (fr)yː
458
+ (fr)z
459
+ (fr)ç
460
+ (fr)ð
461
+ (fr)ø
462
+ (fr)øː
463
+ (fr)ŋ
464
+ (fr)œ
465
+ (fr)ɑ
466
+ (fr)ɔ
467
+ (fr)ə
468
+ (fr)ɛ
469
+ (fr)ɡ
470
+ (fr)ɣ
471
+ (fr)ɪ
472
+ (fr)ɪː
473
+ (fr)ɲ
474
+ (fr)ʁ
475
+ (fr)ʃ
476
+ (fr)ʎ
477
+ (fr)ʒ
478
+ (fr)ʰl
479
+ (fr)θ
480
+ (id)a
481
+ (id)aɪ
482
+ (id)aʊ
483
+ (id)b
484
+ (id)d
485
+ (id)dʒ
486
+ (id)e
487
+ (id)f
488
+ (id)h
489
+ (id)i
490
+ (id)j
491
+ (id)k
492
+ (id)l
493
+ (id)m
494
+ (id)n
495
+ (id)o
496
+ (id)p
497
+ (id)r
498
+ (id)s
499
+ (id)t
500
+ (id)tʃ
501
+ (id)u
502
+ (id)v
503
+ (id)w
504
+ (id)x
505
+ (id)z
506
+ (id)ç
507
+ (id)ŋ
508
+ (id)ɔ
509
+ (id)ə
510
+ (id)ɛ
511
+ (id)ɡ
512
+ (id)ɲ
513
+ (id)ɹ
514
+ (id)ʔ
515
+ (id)χ
516
+ (it)a
517
+ (it)aɪ
518
+ (it)aʊ
519
+ (it)aː
520
+ (it)b
521
+ (it)bː
522
+ (it)c
523
+ (it)d
524
+ (it)dz
525
+ (it)dzː
526
+ (it)dʒ
527
+ (it)dʒː
528
+ (it)dː
529
+ (it)e
530
+ (it)eɪ
531
+ (it)eʊ
532
+ (it)eː
533
+ (it)f
534
+ (it)fː
535
+ (it)h
536
+ (it)i
537
+ (it)iː
538
+ (it)j
539
+ (it)k
540
+ (it)kː
541
+ (it)l
542
+ (it)m
543
+ (it)mː
544
+ (it)n
545
+ (it)o
546
+ (it)oɪ
547
+ (it)oː
548
+ (it)p
549
+ (it)pː
550
+ (it)r
551
+ (it)s
552
+ (it)ss
553
+ (it)t
554
+ (it)ts
555
+ (it)tsː
556
+ (it)tʃ
557
+ (it)tʃː
558
+ (it)tː
559
+ (it)u
560
+ (it)uɪ
561
+ (it)uː
562
+ (it)v
563
+ (it)vʲ
564
+ (it)vː
565
+ (it)w
566
+ (it)y
567
+ (it)z
568
+ (it)ŋ
569
+ (it)ɔ
570
+ (it)ɔː
571
+ (it)ə
572
+ (it)əː
573
+ (it)ɛ
574
+ (it)ɛɪ
575
+ (it)ɛː
576
+ (it)ɟ
577
+ (it)ɡ
578
+ (it)ɡː
579
+ (it)ɪ
580
+ (it)ɪː
581
+ (it)ɲ
582
+ (it)ɹ
583
+ (it)ɾ
584
+ (it)ʃ
585
+ (it)ʊ
586
+ (it)ʊː
587
+ (it)ʎ
588
+ (it)ʒ
589
+ (it)ʝ
590
+ (it)ː
591
+ (it)θ
592
+ (it)θː
593
+ (pl)
594
+ (pt)a
595
+ (pt)aɪ
596
+ (pt)aʊ
597
+ (pt)aː
598
+ (pt)b
599
+ (pt)c
600
+ (pt)d
601
+ (pt)dʒ
602
+ (pt)e
603
+ (pt)eɪ
604
+ (pt)eʊ
605
+ (pt)f
606
+ (pt)h
607
+ (pt)i
608
+ (pt)iʊ
609
+ (pt)iː
610
+ (pt)j
611
+ (pt)k
612
+ (pt)l
613
+ (pt)m
614
+ (pt)n
615
+ (pt)o
616
+ (pt)oɪ
617
+ (pt)oː
618
+ (pt)p
619
+ (pt)r
620
+ (pt)s
621
+ (pt)t
622
+ (pt)ts
623
+ (pt)tʃ
624
+ (pt)u
625
+ (pt)uɪ
626
+ (pt)uː
627
+ (pt)v
628
+ (pt)w
629
+ (pt)x
630
+ (pt)y
631
+ (pt)z
632
+ (pt)æ
633
+ (pt)ç
634
+ (pt)ð
635
+ (pt)ŋ
636
+ (pt)ɐ
637
+ (pt)ɑ
638
+ (pt)ɔ
639
+ (pt)ɔɪ
640
+ (pt)ə
641
+ (pt)ɛ
642
+ (pt)ɛɪ
643
+ (pt)ɛʊ
644
+ (pt)ɡ
645
+ (pt)ɣ
646
+ (pt)ɪ
647
+ (pt)ɲ
648
+ (pt)ɹ
649
+ (pt)ɾ
650
+ (pt)ʃ
651
+ (pt)ʊ
652
+ (pt)ʎ
653
+ (pt)ʒ
654
+ (pt)θ
655
+ (ru)a
656
+ (ru)b
657
+ (ru)bʲ
658
+ (ru)c
659
+ (ru)d
660
+ (ru)dʒʲ
661
+ (ru)dʲ
662
+ (ru)e
663
+ (ru)eː
664
+ (ru)f
665
+ (ru)fʲ
666
+ (ru)i
667
+ (ru)iː
668
+ (ru)j
669
+ (ru)ja
670
+ (ru)ju
671
+ (ru)k
672
+ (ru)kʲ
673
+ (ru)l
674
+ (ru)m
675
+ (ru)mʲ
676
+ (ru)n
677
+ (ru)nʲ
678
+ (ru)o
679
+ (ru)p
680
+ (ru)pʲ
681
+ (ru)r
682
+ (ru)rʲ
683
+ (ru)s
684
+ (ru)sʲ
685
+ (ru)t
686
+ (ru)ts
687
+ (ru)tʃʲ
688
+ (ru)tʲ
689
+ (ru)u
690
+ (ru)v
691
+ (ru)vʲ
692
+ (ru)w
693
+ (ru)x
694
+ (ru)y
695
+ (ru)z
696
+ (ru)ç
697
+ (ru)ð
698
+ (ru)ŋ
699
+ (ru)ɑ
700
+ (ru)ɔ
701
+ (ru)ɕ
702
+ (ru)ə
703
+ (ru)ɛ
704
+ (ru)ɡ
705
+ (ru)ɡʲ
706
+ (ru)ɣ
707
+ (ru)ɪ
708
+ (ru)ɭ
709
+ (ru)ɭʲ
710
+ (ru)ɵ
711
+ (ru)ʃ
712
+ (ru)ʌ
713
+ (ru)ʑ
714
+ (ru)ʒ
715
+ (ru)θ
716
+ (vi)a
717
+ (vi)a2
718
+ (vi)a4
719
+ (vi)a5
720
+ (vi)a6
721
+ (vi)aɜ
722
+ (vi)aɪ4
723
+ (vi)aʊɜ
724
+ (vi)aː
725
+ (vi)aː2
726
+ (vi)aː4
727
+ (vi)aː5
728
+ (vi)aː6
729
+ (vi)aːɜ
730
+ (vi)aːɪ
731
+ (vi)b
732
+ (vi)c
733
+ (vi)d
734
+ (vi)e
735
+ (vi)e1
736
+ (vi)e2
737
+ (vi)e4
738
+ (vi)e5
739
+ (vi)e6
740
+ (vi)e7
741
+ (vi)eɜ
742
+ (vi)f
743
+ (vi)h
744
+ (vi)i
745
+ (vi)i2
746
+ (vi)i4
747
+ (vi)i5
748
+ (vi)i6
749
+ (vi)iə
750
+ (vi)iə2
751
+ (vi)iə4
752
+ (vi)iə5
753
+ (vi)iə6
754
+ (vi)iəɜ
755
+ (vi)iɛ
756
+ (vi)iɛ1
757
+ (vi)iɛ2
758
+ (vi)iɛ4
759
+ (vi)iɛ5
760
+ (vi)iɛ6
761
+ (vi)iɛɜ
762
+ (vi)iɜ
763
+ (vi)j
764
+ (vi)k
765
+ (vi)kh
766
+ (vi)l
767
+ (vi)m
768
+ (vi)n
769
+ (vi)o
770
+ (vi)o1
771
+ (vi)o2
772
+ (vi)o4
773
+ (vi)o5
774
+ (vi)o6
775
+ (vi)oɜ
776
+ (vi)p
777
+ (vi)s
778
+ (vi)t
779
+ (vi)tʃ
780
+ (vi)u
781
+ (vi)u2
782
+ (vi)u4
783
+ (vi)u5
784
+ (vi)u6
785
+ (vi)uə
786
+ (vi)uə2
787
+ (vi)uə4
788
+ (vi)uə5
789
+ (vi)uə6
790
+ (vi)uəɜ
791
+ (vi)uɜ
792
+ (vi)v
793
+ (vi)w
794
+ (vi)x
795
+ (vi)y
796
+ (vi)y2
797
+ (vi)y4
798
+ (vi)y5
799
+ (vi)y6
800
+ (vi)yə
801
+ (vi)yə2
802
+ (vi)yə4
803
+ (vi)yə5
804
+ (vi)yə6
805
+ (vi)yə7
806
+ (vi)yəɜ
807
+ (vi)yɜ
808
+ (vi)z
809
+ (vi)ð
810
+ (vi)ŋ
811
+ (vi)ɔ
812
+ (vi)ɔ2
813
+ (vi)ɔ4
814
+ (vi)ɔ5
815
+ (vi)ɔ6
816
+ (vi)ɔɜ
817
+ (vi)ɗ
818
+ (vi)ə
819
+ (vi)ə1
820
+ (vi)ə2
821
+ (vi)ə4
822
+ (vi)ə5
823
+ (vi)ə6
824
+ (vi)əɜ
825
+ (vi)əɪ
826
+ (vi)əɪ2
827
+ (vi)əɪ4
828
+ (vi)əɪ5
829
+ (vi)əɪ6
830
+ (vi)əɪɜ
831
+ (vi)əː
832
+ (vi)əː2
833
+ (vi)əː4
834
+ (vi)əː5
835
+ (vi)əː6
836
+ (vi)əːɜ
837
+ (vi)əːʊ
838
+ (vi)əːʊɜ
839
+ (vi)ɛ
840
+ (vi)ɛ2
841
+ (vi)ɛ4
842
+ (vi)ɛ5
843
+ (vi)ɛ6
844
+ (vi)ɛɜ
845
+ (vi)ɡ
846
+ (vi)ɣ
847
+ (vi)ɲ
848
+ (vi)ʐ
849
+ (vi)ʒ
850
+ ,
851
+ .
852
+ 1
853
+ ?
854
+ ^
855
+ _
856
+ a
857
+
858
+ b
859
+ c
860
+ d
861
+
862
+ e
863
+ f
864
+ i
865
+ j
866
+ k
867
+ l
868
+ m
869
+ n
870
+ o
871
+
872
+ p
873
+ r
874
+ s
875
+ t
876
+ ts
877
+
878
+ u
879
+ v
880
+ w
881
+ x
882
+ z
883
+ ¡
884
+ ç
885
+ ð
886
+ ŋ
887
+ ɔ
888
+ ɛ
889
+ ɡ
890
+ ɣ
891
+ ɲ
892
+ ʃ
893
+ ʎ
894
+ ̃
895
+ ̩
896
+ ̪
897
+ θ
898
+
pretrained_models/data/multilingual_prosody/vocab.txt ADDED
@@ -0,0 +1,898 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ !
3
+ "
4
+ #1
5
+ #2
6
+ #3
7
+ #4
8
+ (zh)a1
9
+ (zh)a2
10
+ (zh)a3
11
+ (zh)a4
12
+ (zh)a5
13
+ (zh)ai1
14
+ (zh)ai2
15
+ (zh)ai3
16
+ (zh)ai4
17
+ (zh)ai5
18
+ (zh)an1
19
+ (zh)an2
20
+ (zh)an3
21
+ (zh)an4
22
+ (zh)an5
23
+ (zh)ang1
24
+ (zh)ang2
25
+ (zh)ang3
26
+ (zh)ang4
27
+ (zh)ang5
28
+ (zh)ao1
29
+ (zh)ao2
30
+ (zh)ao3
31
+ (zh)ao4
32
+ (zh)ao5
33
+ (zh)b
34
+ (zh)c
35
+ (zh)ch
36
+ (zh)d
37
+ (zh)e1
38
+ (zh)e2
39
+ (zh)e3
40
+ (zh)e4
41
+ (zh)e5
42
+ (zh)ei1
43
+ (zh)ei2
44
+ (zh)ei3
45
+ (zh)ei4
46
+ (zh)ei5
47
+ (zh)en1
48
+ (zh)en2
49
+ (zh)en3
50
+ (zh)en4
51
+ (zh)en5
52
+ (zh)eng1
53
+ (zh)eng2
54
+ (zh)eng3
55
+ (zh)eng4
56
+ (zh)eng5
57
+ (zh)er1
58
+ (zh)er2
59
+ (zh)er3
60
+ (zh)er4
61
+ (zh)er5
62
+ (zh)f
63
+ (zh)g
64
+ (zh)h
65
+ (zh)i1
66
+ (zh)i2
67
+ (zh)i3
68
+ (zh)i4
69
+ (zh)i5
70
+ (zh)ia1
71
+ (zh)ia2
72
+ (zh)ia3
73
+ (zh)ia4
74
+ (zh)ia5
75
+ (zh)ian1
76
+ (zh)ian2
77
+ (zh)ian3
78
+ (zh)ian4
79
+ (zh)ian5
80
+ (zh)iang1
81
+ (zh)iang2
82
+ (zh)iang3
83
+ (zh)iang4
84
+ (zh)iang5
85
+ (zh)iao1
86
+ (zh)iao2
87
+ (zh)iao3
88
+ (zh)iao4
89
+ (zh)iao5
90
+ (zh)ie1
91
+ (zh)ie2
92
+ (zh)ie3
93
+ (zh)ie4
94
+ (zh)ie5
95
+ (zh)in1
96
+ (zh)in2
97
+ (zh)in3
98
+ (zh)in4
99
+ (zh)in5
100
+ (zh)ing1
101
+ (zh)ing2
102
+ (zh)ing3
103
+ (zh)ing4
104
+ (zh)ing5
105
+ (zh)iong1
106
+ (zh)iong2
107
+ (zh)iong3
108
+ (zh)iong4
109
+ (zh)iong5
110
+ (zh)iou1
111
+ (zh)iou2
112
+ (zh)iou3
113
+ (zh)iou4
114
+ (zh)iou5
115
+ (zh)j
116
+ (zh)k
117
+ (zh)l
118
+ (zh)m
119
+ (zh)n
120
+ (zh)o1
121
+ (zh)o2
122
+ (zh)o3
123
+ (zh)o4
124
+ (zh)o5
125
+ (zh)ong1
126
+ (zh)ong2
127
+ (zh)ong3
128
+ (zh)ong4
129
+ (zh)ong5
130
+ (zh)ou1
131
+ (zh)ou2
132
+ (zh)ou3
133
+ (zh)ou4
134
+ (zh)ou5
135
+ (zh)p
136
+ (zh)q
137
+ (zh)r
138
+ (zh)s
139
+ (zh)sh
140
+ (zh)t
141
+ (zh)u1
142
+ (zh)u2
143
+ (zh)u3
144
+ (zh)u4
145
+ (zh)u5
146
+ (zh)ua1
147
+ (zh)ua2
148
+ (zh)ua3
149
+ (zh)ua4
150
+ (zh)ua5
151
+ (zh)uai1
152
+ (zh)uai2
153
+ (zh)uai3
154
+ (zh)uai4
155
+ (zh)uai5
156
+ (zh)uan1
157
+ (zh)uan2
158
+ (zh)uan3
159
+ (zh)uan4
160
+ (zh)uan5
161
+ (zh)uang1
162
+ (zh)uang2
163
+ (zh)uang3
164
+ (zh)uang4
165
+ (zh)uang5
166
+ (zh)uei1
167
+ (zh)uei2
168
+ (zh)uei3
169
+ (zh)uei4
170
+ (zh)uei5
171
+ (zh)uen1
172
+ (zh)uen2
173
+ (zh)uen3
174
+ (zh)uen4
175
+ (zh)uen5
176
+ (zh)ueng1
177
+ (zh)ueng2
178
+ (zh)ueng3
179
+ (zh)ueng4
180
+ (zh)ueng5
181
+ (zh)uo1
182
+ (zh)uo2
183
+ (zh)uo3
184
+ (zh)uo4
185
+ (zh)uo5
186
+ (zh)v1
187
+ (zh)v2
188
+ (zh)v3
189
+ (zh)v4
190
+ (zh)v5
191
+ (zh)van1
192
+ (zh)van2
193
+ (zh)van3
194
+ (zh)van4
195
+ (zh)van5
196
+ (zh)ve1
197
+ (zh)ve2
198
+ (zh)ve3
199
+ (zh)ve4
200
+ (zh)ve5
201
+ (zh)vn1
202
+ (zh)vn2
203
+ (zh)vn3
204
+ (zh)vn4
205
+ (zh)vn5
206
+ (zh)w
207
+ (zh)x
208
+ (zh)y
209
+ (zh)z
210
+ (zh)zh
211
+ (de)a
212
+ (de)aɪ
213
+ (de)aʊ
214
+ (de)b
215
+ (de)bʲ
216
+ (de)c
217
+ (de)d
218
+ (de)dʑ
219
+ (de)dʒ
220
+ (de)e
221
+ (de)eː
222
+ (de)f
223
+ (de)h
224
+ (de)i
225
+ (de)iː
226
+ (de)j
227
+ (de)k
228
+ (de)kʲ
229
+ (de)l
230
+ (de)m
231
+ (de)mʲ
232
+ (de)n
233
+ (de)o
234
+ (de)oɪ
235
+ (de)oː
236
+ (de)p
237
+ (de)pf
238
+ (de)pʲ
239
+ (de)r
240
+ (de)s
241
+ (de)t
242
+ (de)ts
243
+ (de)tɕ
244
+ (de)tʃ
245
+ (de)tʲ
246
+ (de)u
247
+ (de)uː
248
+ (de)v
249
+ (de)vʲ
250
+ (de)w
251
+ (de)x
252
+ (de)y
253
+ (de)yː
254
+ (de)z
255
+ (de)ç
256
+ (de)ð
257
+ (de)øː
258
+ (de)ŋ
259
+ (de)œ
260
+ (de)ɑ
261
+ (de)ɑː
262
+ (de)ɔ
263
+ (de)ɔø
264
+ (de)ɔː
265
+ (de)ɕ
266
+ (de)ə
267
+ (de)ɛ
268
+ (de)ɛɪ
269
+ (de)ɛː
270
+ (de)ɜ
271
+ (de)ɡ
272
+ (de)ɡʲ
273
+ (de)ɣ
274
+ (de)ɨ
275
+ (de)ɪ
276
+ (de)ɲ
277
+ (de)ɲʲ
278
+ (de)ɾ
279
+ (de)ʃ
280
+ (de)ʊ
281
+ (de)ʑ
282
+ (de)ʒ
283
+ (de)θ
284
+ (el)
285
+ (en)a
286
+ (en)aɪ
287
+ (en)aɪə
288
+ (en)aɪɚ
289
+ (en)aʊ
290
+ (en)aː
291
+ (en)b
292
+ (en)bʲ
293
+ (en)c
294
+ (en)d
295
+ (en)dʑ
296
+ (en)dʒ
297
+ (en)e
298
+ (en)eə
299
+ (en)eɪ
300
+ (en)f
301
+ (en)h
302
+ (en)i
303
+ (en)iə
304
+ (en)iː
305
+ (en)iːː
306
+ (en)j
307
+ (en)k
308
+ (en)l
309
+ (en)m
310
+ (en)n
311
+ (en)nʲ
312
+ (en)o
313
+ (en)oʊ
314
+ (en)oː
315
+ (en)oːɹ
316
+ (en)p
317
+ (en)q
318
+ (en)r
319
+ (en)s
320
+ (en)t
321
+ (en)tɕ
322
+ (en)tʃ
323
+ (en)u
324
+ (en)uː
325
+ (en)v
326
+ (en)w
327
+ (en)x
328
+ (en)z
329
+ (en)æ
330
+ (en)ææ
331
+ (en)ç
332
+ (en)ð
333
+ (en)ŋ
334
+ (en)ɐ
335
+ (en)ɐɐ
336
+ (en)ɑ
337
+ (en)ɑː
338
+ (en)ɑːɹ
339
+ (en)ɒ
340
+ (en)ɔ
341
+ (en)ɔɪ
342
+ (en)ɔː
343
+ (en)ɔːɹ
344
+ (en)ɕ
345
+ (en)ə
346
+ (en)əl
347
+ (en)əʊ
348
+ (en)ɚ
349
+ (en)ɛ
350
+ (en)ɛɹ
351
+ (en)ɛː
352
+ (en)ɜː
353
+ (en)ɡ
354
+ (en)ɡʲ
355
+ (en)ɣ
356
+ (en)ɨ
357
+ (en)ɪ
358
+ (en)ɪɹ
359
+ (en)ɪː
360
+ (en)ɬ
361
+ (en)ɲ
362
+ (en)ɲʲ
363
+ (en)ɹ
364
+ (en)ɾ
365
+ (en)ʁ
366
+ (en)ʃ
367
+ (en)ʊ
368
+ (en)ʊə
369
+ (en)ʊɹ
370
+ (en)ʌ
371
+ (en)ʍ
372
+ (en)ʒ
373
+ (en)ʔ
374
+ (en)θ
375
+ (en)ᵻ
376
+ (es)a
377
+ (es)aɪ
378
+ (es)aʊ
379
+ (es)b
380
+ (es)c
381
+ (es)d
382
+ (es)dʒ
383
+ (es)e
384
+ (es)eɪ
385
+ (es)eʊ
386
+ (es)f
387
+ (es)h
388
+ (es)i
389
+ (es)iː
390
+ (es)j
391
+ (es)k
392
+ (es)l
393
+ (es)m
394
+ (es)n
395
+ (es)o
396
+ (es)oɪ
397
+ (es)p
398
+ (es)pː
399
+ (es)r
400
+ (es)s
401
+ (es)t
402
+ (es)ts
403
+ (es)tʃ
404
+ (es)u
405
+ (es)v
406
+ (es)w
407
+ (es)x
408
+ (es)z
409
+ (es)ð
410
+ (es)ŋ
411
+ (es)ə
412
+ (es)ɛ
413
+ (es)ɟ
414
+ (es)ɡ
415
+ (es)ɣ
416
+ (es)ɫ
417
+ (es)ɲ
418
+ (es)ɾ
419
+ (es)ʃ
420
+ (es)ʎ
421
+ (es)ʝ
422
+ (es)β
423
+ (es)θ
424
+ (fr)a
425
+ (fr)aɪ
426
+ (fr)aʊ
427
+ (fr)aː
428
+ (fr)b
429
+ (fr)c
430
+ (fr)d
431
+ (fr)dʒ
432
+ (fr)e
433
+ (fr)eʊ
434
+ (fr)f
435
+ (fr)h
436
+ (fr)i
437
+ (fr)iʰr
438
+ (fr)iː
439
+ (fr)j
440
+ (fr)k
441
+ (fr)l
442
+ (fr)m
443
+ (fr)n
444
+ (fr)o
445
+ (fr)oː
446
+ (fr)p
447
+ (fr)r
448
+ (fr)s
449
+ (fr)t
450
+ (fr)tʃ
451
+ (fr)u
452
+ (fr)uː
453
+ (fr)v
454
+ (fr)w
455
+ (fr)x
456
+ (fr)y
457
+ (fr)yː
458
+ (fr)z
459
+ (fr)ç
460
+ (fr)ð
461
+ (fr)ø
462
+ (fr)øː
463
+ (fr)ŋ
464
+ (fr)œ
465
+ (fr)ɑ
466
+ (fr)ɔ
467
+ (fr)ə
468
+ (fr)ɛ
469
+ (fr)ɡ
470
+ (fr)ɣ
471
+ (fr)ɪ
472
+ (fr)ɪː
473
+ (fr)ɲ
474
+ (fr)ʁ
475
+ (fr)ʃ
476
+ (fr)ʎ
477
+ (fr)ʒ
478
+ (fr)ʰl
479
+ (fr)θ
480
+ (id)a
481
+ (id)aɪ
482
+ (id)aʊ
483
+ (id)b
484
+ (id)d
485
+ (id)dʒ
486
+ (id)e
487
+ (id)f
488
+ (id)h
489
+ (id)i
490
+ (id)j
491
+ (id)k
492
+ (id)l
493
+ (id)m
494
+ (id)n
495
+ (id)o
496
+ (id)p
497
+ (id)r
498
+ (id)s
499
+ (id)t
500
+ (id)tʃ
501
+ (id)u
502
+ (id)v
503
+ (id)w
504
+ (id)x
505
+ (id)z
506
+ (id)ç
507
+ (id)ŋ
508
+ (id)ɔ
509
+ (id)ə
510
+ (id)ɛ
511
+ (id)ɡ
512
+ (id)ɲ
513
+ (id)ɹ
514
+ (id)ʔ
515
+ (id)χ
516
+ (it)a
517
+ (it)aɪ
518
+ (it)aʊ
519
+ (it)aː
520
+ (it)b
521
+ (it)bː
522
+ (it)c
523
+ (it)d
524
+ (it)dz
525
+ (it)dzː
526
+ (it)dʒ
527
+ (it)dʒː
528
+ (it)dː
529
+ (it)e
530
+ (it)eɪ
531
+ (it)eʊ
532
+ (it)eː
533
+ (it)f
534
+ (it)fː
535
+ (it)h
536
+ (it)i
537
+ (it)iː
538
+ (it)j
539
+ (it)k
540
+ (it)kː
541
+ (it)l
542
+ (it)m
543
+ (it)mː
544
+ (it)n
545
+ (it)o
546
+ (it)oɪ
547
+ (it)oː
548
+ (it)p
549
+ (it)pː
550
+ (it)r
551
+ (it)s
552
+ (it)ss
553
+ (it)t
554
+ (it)ts
555
+ (it)tsː
556
+ (it)tʃ
557
+ (it)tʃː
558
+ (it)tː
559
+ (it)u
560
+ (it)uɪ
561
+ (it)uː
562
+ (it)v
563
+ (it)vʲ
564
+ (it)vː
565
+ (it)w
566
+ (it)y
567
+ (it)z
568
+ (it)ŋ
569
+ (it)ɔ
570
+ (it)ɔː
571
+ (it)ə
572
+ (it)əː
573
+ (it)ɛ
574
+ (it)ɛɪ
575
+ (it)ɛː
576
+ (it)ɟ
577
+ (it)ɡ
578
+ (it)ɡː
579
+ (it)ɪ
580
+ (it)ɪː
581
+ (it)ɲ
582
+ (it)ɹ
583
+ (it)ɾ
584
+ (it)ʃ
585
+ (it)ʊ
586
+ (it)ʊː
587
+ (it)ʎ
588
+ (it)ʒ
589
+ (it)ʝ
590
+ (it)ː
591
+ (it)θ
592
+ (it)θː
593
+ (pl)
594
+ (pt)a
595
+ (pt)aɪ
596
+ (pt)aʊ
597
+ (pt)aː
598
+ (pt)b
599
+ (pt)c
600
+ (pt)d
601
+ (pt)dʒ
602
+ (pt)e
603
+ (pt)eɪ
604
+ (pt)eʊ
605
+ (pt)f
606
+ (pt)h
607
+ (pt)i
608
+ (pt)iʊ
609
+ (pt)iː
610
+ (pt)j
611
+ (pt)k
612
+ (pt)l
613
+ (pt)m
614
+ (pt)n
615
+ (pt)o
616
+ (pt)oɪ
617
+ (pt)oː
618
+ (pt)p
619
+ (pt)r
620
+ (pt)s
621
+ (pt)t
622
+ (pt)ts
623
+ (pt)tʃ
624
+ (pt)u
625
+ (pt)uɪ
626
+ (pt)uː
627
+ (pt)v
628
+ (pt)w
629
+ (pt)x
630
+ (pt)y
631
+ (pt)z
632
+ (pt)æ
633
+ (pt)ç
634
+ (pt)ð
635
+ (pt)ŋ
636
+ (pt)ɐ
637
+ (pt)ɑ
638
+ (pt)ɔ
639
+ (pt)ɔɪ
640
+ (pt)ə
641
+ (pt)ɛ
642
+ (pt)ɛɪ
643
+ (pt)ɛʊ
644
+ (pt)ɡ
645
+ (pt)ɣ
646
+ (pt)ɪ
647
+ (pt)ɲ
648
+ (pt)ɹ
649
+ (pt)ɾ
650
+ (pt)ʃ
651
+ (pt)ʊ
652
+ (pt)ʎ
653
+ (pt)ʒ
654
+ (pt)θ
655
+ (ru)a
656
+ (ru)b
657
+ (ru)bʲ
658
+ (ru)c
659
+ (ru)d
660
+ (ru)dʒʲ
661
+ (ru)dʲ
662
+ (ru)e
663
+ (ru)eː
664
+ (ru)f
665
+ (ru)fʲ
666
+ (ru)i
667
+ (ru)iː
668
+ (ru)j
669
+ (ru)ja
670
+ (ru)ju
671
+ (ru)k
672
+ (ru)kʲ
673
+ (ru)l
674
+ (ru)m
675
+ (ru)mʲ
676
+ (ru)n
677
+ (ru)nʲ
678
+ (ru)o
679
+ (ru)p
680
+ (ru)pʲ
681
+ (ru)r
682
+ (ru)rʲ
683
+ (ru)s
684
+ (ru)sʲ
685
+ (ru)t
686
+ (ru)ts
687
+ (ru)tʃʲ
688
+ (ru)tʲ
689
+ (ru)u
690
+ (ru)v
691
+ (ru)vʲ
692
+ (ru)w
693
+ (ru)x
694
+ (ru)y
695
+ (ru)z
696
+ (ru)ç
697
+ (ru)ð
698
+ (ru)ŋ
699
+ (ru)ɑ
700
+ (ru)ɔ
701
+ (ru)ɕ
702
+ (ru)ə
703
+ (ru)ɛ
704
+ (ru)ɡ
705
+ (ru)ɡʲ
706
+ (ru)ɣ
707
+ (ru)ɪ
708
+ (ru)ɭ
709
+ (ru)ɭʲ
710
+ (ru)ɵ
711
+ (ru)ʃ
712
+ (ru)ʌ
713
+ (ru)ʑ
714
+ (ru)ʒ
715
+ (ru)θ
716
+ (vi)a
717
+ (vi)a2
718
+ (vi)a4
719
+ (vi)a5
720
+ (vi)a6
721
+ (vi)aɜ
722
+ (vi)aɪ4
723
+ (vi)aʊɜ
724
+ (vi)aː
725
+ (vi)aː2
726
+ (vi)aː4
727
+ (vi)aː5
728
+ (vi)aː6
729
+ (vi)aːɜ
730
+ (vi)aːɪ
731
+ (vi)b
732
+ (vi)c
733
+ (vi)d
734
+ (vi)e
735
+ (vi)e1
736
+ (vi)e2
737
+ (vi)e4
738
+ (vi)e5
739
+ (vi)e6
740
+ (vi)e7
741
+ (vi)eɜ
742
+ (vi)f
743
+ (vi)h
744
+ (vi)i
745
+ (vi)i2
746
+ (vi)i4
747
+ (vi)i5
748
+ (vi)i6
749
+ (vi)iə
750
+ (vi)iə2
751
+ (vi)iə4
752
+ (vi)iə5
753
+ (vi)iə6
754
+ (vi)iəɜ
755
+ (vi)iɛ
756
+ (vi)iɛ1
757
+ (vi)iɛ2
758
+ (vi)iɛ4
759
+ (vi)iɛ5
760
+ (vi)iɛ6
761
+ (vi)iɛɜ
762
+ (vi)iɜ
763
+ (vi)j
764
+ (vi)k
765
+ (vi)kh
766
+ (vi)l
767
+ (vi)m
768
+ (vi)n
769
+ (vi)o
770
+ (vi)o1
771
+ (vi)o2
772
+ (vi)o4
773
+ (vi)o5
774
+ (vi)o6
775
+ (vi)oɜ
776
+ (vi)p
777
+ (vi)s
778
+ (vi)t
779
+ (vi)tʃ
780
+ (vi)u
781
+ (vi)u2
782
+ (vi)u4
783
+ (vi)u5
784
+ (vi)u6
785
+ (vi)uə
786
+ (vi)uə2
787
+ (vi)uə4
788
+ (vi)uə5
789
+ (vi)uə6
790
+ (vi)uəɜ
791
+ (vi)uɜ
792
+ (vi)v
793
+ (vi)w
794
+ (vi)x
795
+ (vi)y
796
+ (vi)y2
797
+ (vi)y4
798
+ (vi)y5
799
+ (vi)y6
800
+ (vi)yə
801
+ (vi)yə2
802
+ (vi)yə4
803
+ (vi)yə5
804
+ (vi)yə6
805
+ (vi)yə7
806
+ (vi)yəɜ
807
+ (vi)yɜ
808
+ (vi)z
809
+ (vi)ð
810
+ (vi)ŋ
811
+ (vi)ɔ
812
+ (vi)ɔ2
813
+ (vi)ɔ4
814
+ (vi)ɔ5
815
+ (vi)ɔ6
816
+ (vi)ɔɜ
817
+ (vi)ɗ
818
+ (vi)ə
819
+ (vi)ə1
820
+ (vi)ə2
821
+ (vi)ə4
822
+ (vi)ə5
823
+ (vi)ə6
824
+ (vi)əɜ
825
+ (vi)əɪ
826
+ (vi)əɪ2
827
+ (vi)əɪ4
828
+ (vi)əɪ5
829
+ (vi)əɪ6
830
+ (vi)əɪɜ
831
+ (vi)əː
832
+ (vi)əː2
833
+ (vi)əː4
834
+ (vi)əː5
835
+ (vi)əː6
836
+ (vi)əːɜ
837
+ (vi)əːʊ
838
+ (vi)əːʊɜ
839
+ (vi)ɛ
840
+ (vi)ɛ2
841
+ (vi)ɛ4
842
+ (vi)ɛ5
843
+ (vi)ɛ6
844
+ (vi)ɛɜ
845
+ (vi)ɡ
846
+ (vi)ɣ
847
+ (vi)ɲ
848
+ (vi)ʐ
849
+ (vi)ʒ
850
+ ,
851
+ .
852
+ 1
853
+ ?
854
+ ^
855
+ _
856
+ a
857
+
858
+ b
859
+ c
860
+ d
861
+
862
+ e
863
+ f
864
+ i
865
+ j
866
+ k
867
+ l
868
+ m
869
+ n
870
+ o
871
+
872
+ p
873
+ r
874
+ s
875
+ t
876
+ ts
877
+
878
+ u
879
+ v
880
+ w
881
+ x
882
+ z
883
+ ¡
884
+ ç
885
+ ð
886
+ ŋ
887
+ ɔ
888
+ ɛ
889
+ ɡ
890
+ ɣ
891
+ ɲ
892
+ ʃ
893
+ ʎ
894
+ ̃
895
+ ̩
896
+ ̪
897
+ θ
898
+
pretrained_models/demo/test.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6913e2ff86f5d12b70ed46e78363dd7b33ce4642dd32e0641a614dfc0c81a92
3
+ size 464084
pretrained_models/denoiser_model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe5eb64fa2e4154c83f8e4935e82871c850c154387ee892e0ab65fe179e7d8c9
3
+ size 16104687
pretrained_models/uvr5/Kim_Vocal_1.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f313140ef8fecc3041881b60ecb993d985a0281a138b2fb634aa8901aebc38cb
3
+ size 66759214
pretrained_models/uvr5/MDX-Net-Kim-Vocal1.json ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "Kim Vocal 1",
3
+ "aggression_setting": "5",
4
+ "window_size": "320",
5
+ "batch_size": "Default",
6
+ "crop_size": "256",
7
+ "is_tta": false,
8
+ "is_output_image": false,
9
+ "is_post_process": false,
10
+ "is_high_end_process": false,
11
+ "post_process_threshold": "0.2",
12
+ "vr_voc_inst_secondary_model": "No Model Selected",
13
+ "vr_other_secondary_model": "No Model Selected",
14
+ "vr_bass_secondary_model": "No Model Selected",
15
+ "vr_drums_secondary_model": "No Model Selected",
16
+ "vr_is_secondary_model_activate": false,
17
+ "vr_voc_inst_secondary_model_scale": "0.9",
18
+ "vr_other_secondary_model_scale": "0.7",
19
+ "vr_bass_secondary_model_scale": "0.5",
20
+ "vr_drums_secondary_model_scale": "0.5",
21
+ "demucs_model": "Choose Model",
22
+ "segment": "Default",
23
+ "overlap": "0.25",
24
+ "shifts": "2",
25
+ "chunks_demucs": "Auto",
26
+ "margin_demucs": "44100",
27
+ "is_chunk_demucs": false,
28
+ "is_chunk_mdxnet": false,
29
+ "is_primary_stem_only_Demucs": false,
30
+ "is_secondary_stem_only_Demucs": false,
31
+ "is_split_mode": true,
32
+ "is_demucs_combine_stems": true,
33
+ "demucs_voc_inst_secondary_model": "No Model Selected",
34
+ "demucs_other_secondary_model": "No Model Selected",
35
+ "demucs_bass_secondary_model": "No Model Selected",
36
+ "demucs_drums_secondary_model": "No Model Selected",
37
+ "demucs_is_secondary_model_activate": false,
38
+ "demucs_voc_inst_secondary_model_scale": "0.9",
39
+ "demucs_other_secondary_model_scale": "0.7",
40
+ "demucs_bass_secondary_model_scale": "0.5",
41
+ "demucs_drums_secondary_model_scale": "0.5",
42
+ "demucs_pre_proc_model": "No Model Selected",
43
+ "is_demucs_pre_proc_model_activate": false,
44
+ "is_demucs_pre_proc_model_inst_mix": false,
45
+ "mdx_net_model": "Kim Vocal 1",
46
+ "chunks": "Auto",
47
+ "margin": "44100",
48
+ "compensate": "Auto",
49
+ "is_denoise": false,
50
+ "is_invert_spec": false,
51
+ "is_mixer_mode": false,
52
+ "mdx_batch_size": "Default",
53
+ "mdx_voc_inst_secondary_model": "No Model Selected",
54
+ "mdx_other_secondary_model": "No Model Selected",
55
+ "mdx_bass_secondary_model": "No Model Selected",
56
+ "mdx_drums_secondary_model": "No Model Selected",
57
+ "mdx_is_secondary_model_activate": false,
58
+ "mdx_voc_inst_secondary_model_scale": "0.9",
59
+ "mdx_other_secondary_model_scale": "0.7",
60
+ "mdx_bass_secondary_model_scale": "0.5",
61
+ "mdx_drums_secondary_model_scale": "0.5",
62
+ "is_save_all_outputs_ensemble": true,
63
+ "is_append_ensemble_name": false,
64
+ "chosen_audio_tool": "Manual Ensemble",
65
+ "choose_algorithm": "Min Spec",
66
+ "time_stretch_rate": "2.0",
67
+ "pitch_rate": "2.0",
68
+ "is_gpu_conversion": true,
69
+ "is_primary_stem_only": false,
70
+ "is_secondary_stem_only": false,
71
+ "is_testing_audio": false,
72
+ "is_add_model_name": false,
73
+ "is_accept_any_input": false,
74
+ "is_task_complete": false,
75
+ "is_normalization": false,
76
+ "is_create_model_folder": false,
77
+ "mp3_bit_set": "320k",
78
+ "save_format": "WAV",
79
+ "wav_type_set": "PCM_16",
80
+ "user_code": "",
81
+ "help_hints_var": false,
82
+ "model_sample_mode": false,
83
+ "model_sample_mode_duration": "30",
84
+ "demucs_stems": "All Stems"
85
+ }
pretrained_models/uvr5/model_data.json ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "0ddfc0eb5792638ad5dc27850236c246": {
3
+ "compensate": 1.035,
4
+ "mdx_dim_f_set": 2048,
5
+ "mdx_dim_t_set": 8,
6
+ "mdx_n_fft_scale_set": 6144,
7
+ "primary_stem": "Vocals"
8
+ },
9
+ "26d308f91f3423a67dc69a6d12a8793d": {
10
+ "compensate": 1.035,
11
+ "mdx_dim_f_set": 2048,
12
+ "mdx_dim_t_set": 9,
13
+ "mdx_n_fft_scale_set": 8192,
14
+ "primary_stem": "Other"
15
+ },
16
+ "2cdd429caac38f0194b133884160f2c6": {
17
+ "compensate": 1.045,
18
+ "mdx_dim_f_set": 3072,
19
+ "mdx_dim_t_set": 8,
20
+ "mdx_n_fft_scale_set": 7680,
21
+ "primary_stem": "Instrumental"
22
+ },
23
+ "2f5501189a2f6db6349916fabe8c90de": {
24
+ "compensate": 1.035,
25
+ "mdx_dim_f_set": 2048,
26
+ "mdx_dim_t_set": 8,
27
+ "mdx_n_fft_scale_set": 6144,
28
+ "primary_stem": "Vocals"
29
+ },
30
+ "398580b6d5d973af3120df54cee6759d": {
31
+ "compensate": 1.75,
32
+ "mdx_dim_f_set": 3072,
33
+ "mdx_dim_t_set": 8,
34
+ "mdx_n_fft_scale_set": 7680,
35
+ "primary_stem": "Vocals"
36
+ },
37
+ "488b3e6f8bd3717d9d7c428476be2d75": {
38
+ "compensate": 1.035,
39
+ "mdx_dim_f_set": 3072,
40
+ "mdx_dim_t_set": 8,
41
+ "mdx_n_fft_scale_set": 7680,
42
+ "primary_stem": "Instrumental"
43
+ },
44
+ "4910e7827f335048bdac11fa967772f9": {
45
+ "compensate": 1.035,
46
+ "mdx_dim_f_set": 2048,
47
+ "mdx_dim_t_set": 7,
48
+ "mdx_n_fft_scale_set": 4096,
49
+ "primary_stem": "Drums"
50
+ },
51
+ "53c4baf4d12c3e6c3831bb8f5b532b93": {
52
+ "compensate": 1.043,
53
+ "mdx_dim_f_set": 3072,
54
+ "mdx_dim_t_set": 8,
55
+ "mdx_n_fft_scale_set": 7680,
56
+ "primary_stem": "Vocals"
57
+ },
58
+ "5d343409ef0df48c7d78cce9f0106781": {
59
+ "compensate": 1.075,
60
+ "mdx_dim_f_set": 3072,
61
+ "mdx_dim_t_set": 8,
62
+ "mdx_n_fft_scale_set": 7680,
63
+ "primary_stem": "Vocals"
64
+ },
65
+ "5f6483271e1efb9bfb59e4a3e6d4d098": {
66
+ "compensate": 1.035,
67
+ "mdx_dim_f_set": 2048,
68
+ "mdx_dim_t_set": 9,
69
+ "mdx_n_fft_scale_set": 6144,
70
+ "primary_stem": "Vocals"
71
+ },
72
+ "65ab5919372a128e4167f5e01a8fda85": {
73
+ "compensate": 1.035,
74
+ "mdx_dim_f_set": 2048,
75
+ "mdx_dim_t_set": 8,
76
+ "mdx_n_fft_scale_set": 8192,
77
+ "primary_stem": "Other"
78
+ },
79
+ "6703e39f36f18aa7855ee1047765621d": {
80
+ "compensate": 1.035,
81
+ "mdx_dim_f_set": 2048,
82
+ "mdx_dim_t_set": 9,
83
+ "mdx_n_fft_scale_set": 16384,
84
+ "primary_stem": "Bass"
85
+ },
86
+ "6b31de20e84392859a3d09d43f089515": {
87
+ "compensate": 1.035,
88
+ "mdx_dim_f_set": 2048,
89
+ "mdx_dim_t_set": 8,
90
+ "mdx_n_fft_scale_set": 6144,
91
+ "primary_stem": "Vocals"
92
+ },
93
+ "867595e9de46f6ab699008295df62798": {
94
+ "compensate": 1.03,
95
+ "mdx_dim_f_set": 3072,
96
+ "mdx_dim_t_set": 8,
97
+ "mdx_n_fft_scale_set": 7680,
98
+ "primary_stem": "Vocals"
99
+ },
100
+ "a3cd63058945e777505c01d2507daf37": {
101
+ "compensate": 1.03,
102
+ "mdx_dim_f_set": 2048,
103
+ "mdx_dim_t_set": 8,
104
+ "mdx_n_fft_scale_set": 6144,
105
+ "primary_stem": "Vocals"
106
+ },
107
+ "b33d9b3950b6cbf5fe90a32608924700": {
108
+ "compensate": 1.03,
109
+ "mdx_dim_f_set": 3072,
110
+ "mdx_dim_t_set": 8,
111
+ "mdx_n_fft_scale_set": 7680,
112
+ "primary_stem": "Vocals"
113
+ },
114
+ "c3b29bdce8c4fa17ec609e16220330ab": {
115
+ "compensate": 1.035,
116
+ "mdx_dim_f_set": 2048,
117
+ "mdx_dim_t_set": 8,
118
+ "mdx_n_fft_scale_set": 16384,
119
+ "primary_stem": "Bass"
120
+ },
121
+ "ceed671467c1f64ebdfac8a2490d0d52": {
122
+ "compensate": 1.035,
123
+ "mdx_dim_f_set": 3072,
124
+ "mdx_dim_t_set": 8,
125
+ "mdx_n_fft_scale_set": 7680,
126
+ "primary_stem": "Instrumental"
127
+ },
128
+ "d2a1376f310e4f7fa37fb9b5774eb701": {
129
+ "compensate": 1.035,
130
+ "mdx_dim_f_set": 3072,
131
+ "mdx_dim_t_set": 8,
132
+ "mdx_n_fft_scale_set": 7680,
133
+ "primary_stem": "Instrumental"
134
+ },
135
+ "d7bff498db9324db933d913388cba6be": {
136
+ "compensate": 1.035,
137
+ "mdx_dim_f_set": 2048,
138
+ "mdx_dim_t_set": 8,
139
+ "mdx_n_fft_scale_set": 6144,
140
+ "primary_stem": "Vocals"
141
+ },
142
+ "d94058f8c7f1fae4164868ae8ae66b20": {
143
+ "compensate": 1.035,
144
+ "mdx_dim_f_set": 2048,
145
+ "mdx_dim_t_set": 8,
146
+ "mdx_n_fft_scale_set": 6144,
147
+ "primary_stem": "Vocals"
148
+ },
149
+ "dc41ede5961d50f277eb846db17f5319": {
150
+ "compensate": 1.035,
151
+ "mdx_dim_f_set": 2048,
152
+ "mdx_dim_t_set": 9,
153
+ "mdx_n_fft_scale_set": 4096,
154
+ "primary_stem": "Drums"
155
+ },
156
+ "e5572e58abf111f80d8241d2e44e7fa4": {
157
+ "compensate": 1.028,
158
+ "mdx_dim_f_set": 3072,
159
+ "mdx_dim_t_set": 8,
160
+ "mdx_n_fft_scale_set": 7680,
161
+ "primary_stem": "Instrumental"
162
+ },
163
+ "e7324c873b1f615c35c1967f912db92a": {
164
+ "compensate": 1.03,
165
+ "mdx_dim_f_set": 3072,
166
+ "mdx_dim_t_set": 8,
167
+ "mdx_n_fft_scale_set": 7680,
168
+ "primary_stem": "Vocals"
169
+ },
170
+ "1c56ec0224f1d559c42fd6fd2a67b154": {
171
+ "compensate": 1.025,
172
+ "mdx_dim_f_set": 2048,
173
+ "mdx_dim_t_set": 8,
174
+ "mdx_n_fft_scale_set": 5120,
175
+ "primary_stem": "Instrumental"
176
+ },
177
+ "f2df6d6863d8f435436d8b561594ff49": {
178
+ "compensate": 1.035,
179
+ "mdx_dim_f_set": 3072,
180
+ "mdx_dim_t_set": 8,
181
+ "mdx_n_fft_scale_set": 7680,
182
+ "primary_stem": "Instrumental"
183
+ },
184
+ "b06327a00d5e5fbc7d96e1781bbdb596": {
185
+ "compensate": 1.035,
186
+ "mdx_dim_f_set": 3072,
187
+ "mdx_dim_t_set": 8,
188
+ "mdx_n_fft_scale_set": 6144,
189
+ "primary_stem": "Instrumental"
190
+ },
191
+ "94ff780b977d3ca07c7a343dab2e25dd": {
192
+ "compensate": 1.039,
193
+ "mdx_dim_f_set": 3072,
194
+ "mdx_dim_t_set": 8,
195
+ "mdx_n_fft_scale_set": 6144,
196
+ "primary_stem": "Instrumental"
197
+ },
198
+ "73492b58195c3b52d34590d5474452f6": {
199
+ "compensate": 1.043,
200
+ "mdx_dim_f_set": 3072,
201
+ "mdx_dim_t_set": 8,
202
+ "mdx_n_fft_scale_set": 7680,
203
+ "primary_stem": "Vocals"
204
+ },
205
+ "970b3f9492014d18fefeedfe4773cb42": {
206
+ "compensate": 1.009,
207
+ "mdx_dim_f_set": 3072,
208
+ "mdx_dim_t_set": 8,
209
+ "mdx_n_fft_scale_set": 7680,
210
+ "primary_stem": "Vocals"
211
+ },
212
+ "1d64a6d2c30f709b8c9b4ce1366d96ee": {
213
+ "compensate": 1.065,
214
+ "mdx_dim_f_set": 2048,
215
+ "mdx_dim_t_set": 8,
216
+ "mdx_n_fft_scale_set": 5120,
217
+ "primary_stem": "Instrumental"
218
+ },
219
+ "203f2a3955221b64df85a41af87cf8f0": {
220
+ "compensate": 1.035,
221
+ "mdx_dim_f_set": 3072,
222
+ "mdx_dim_t_set": 8,
223
+ "mdx_n_fft_scale_set": 6144,
224
+ "primary_stem": "Instrumental"
225
+ },
226
+ "291c2049608edb52648b96e27eb80e95": {
227
+ "compensate": 1.035,
228
+ "mdx_dim_f_set": 3072,
229
+ "mdx_dim_t_set": 8,
230
+ "mdx_n_fft_scale_set": 6144,
231
+ "primary_stem": "Instrumental"
232
+ },
233
+ "ead8d05dab12ec571d67549b3aab03fc": {
234
+ "compensate": 1.035,
235
+ "mdx_dim_f_set": 3072,
236
+ "mdx_dim_t_set": 8,
237
+ "mdx_n_fft_scale_set": 6144,
238
+ "primary_stem": "Instrumental"
239
+ },
240
+ "cc63408db3d80b4d85b0287d1d7c9632": {
241
+ "compensate": 1.033,
242
+ "mdx_dim_f_set": 3072,
243
+ "mdx_dim_t_set": 8,
244
+ "mdx_n_fft_scale_set": 6144,
245
+ "primary_stem": "Instrumental"
246
+ },
247
+ "cd5b2989ad863f116c855db1dfe24e39": {
248
+ "compensate": 1.035,
249
+ "mdx_dim_f_set": 3072,
250
+ "mdx_dim_t_set": 9,
251
+ "mdx_n_fft_scale_set": 6144,
252
+ "primary_stem": "Other"
253
+ },
254
+ "55657dd70583b0fedfba5f67df11d711": {
255
+ "compensate": 1.022,
256
+ "mdx_dim_f_set": 3072,
257
+ "mdx_dim_t_set": 8,
258
+ "mdx_n_fft_scale_set": 6144,
259
+ "primary_stem": "Instrumental"
260
+ },
261
+ "b6bccda408a436db8500083ef3491e8b": {
262
+ "compensate": 1.02,
263
+ "mdx_dim_f_set": 3072,
264
+ "mdx_dim_t_set": 8,
265
+ "mdx_n_fft_scale_set": 7680,
266
+ "primary_stem": "Instrumental"
267
+ },
268
+ "8a88db95c7fb5dbe6a095ff2ffb428b1": {
269
+ "compensate": 1.026,
270
+ "mdx_dim_f_set": 2048,
271
+ "mdx_dim_t_set": 8,
272
+ "mdx_n_fft_scale_set": 5120,
273
+ "primary_stem": "Instrumental"
274
+ },
275
+ "b78da4afc6512f98e4756f5977f5c6b9": {
276
+ "compensate": 1.021,
277
+ "mdx_dim_f_set": 3072,
278
+ "mdx_dim_t_set": 8,
279
+ "mdx_n_fft_scale_set": 7680,
280
+ "primary_stem": "Instrumental"
281
+ },
282
+ "77d07b2667ddf05b9e3175941b4454a0": {
283
+ "compensate": 1.021,
284
+ "mdx_dim_f_set": 3072,
285
+ "mdx_dim_t_set": 8,
286
+ "mdx_n_fft_scale_set": 7680,
287
+ "primary_stem": "Vocals"
288
+ },
289
+ "2154254ee89b2945b97a7efed6e88820": {
290
+ "config_yaml": "model_2_stem_061321.yaml"
291
+ },
292
+ "063aadd735d58150722926dcbf5852a9": {
293
+ "config_yaml": "model_2_stem_061321.yaml"
294
+ },
295
+ "fe96801369f6a148df2720f5ced88c19": {
296
+ "config_yaml": "model3.yaml"
297
+ },
298
+ "02e8b226f85fb566e5db894b9931c640": {
299
+ "config_yaml": "model2.yaml"
300
+ },
301
+ "e3de6d861635ab9c1d766149edd680d6": {
302
+ "config_yaml": "model1.yaml"
303
+ },
304
+ "3f2936c554ab73ce2e396d54636bd373": {
305
+ "config_yaml": "modelB.yaml"
306
+ },
307
+ "890d0f6f82d7574bca741a9e8bcb8168": {
308
+ "config_yaml": "modelB.yaml"
309
+ },
310
+ "63a3cb8c37c474681049be4ad1ba8815": {
311
+ "config_yaml": "modelB.yaml"
312
+ },
313
+ "a7fc5d719743c7fd6b61bd2b4d48b9f0": {
314
+ "config_yaml": "modelA.yaml"
315
+ },
316
+ "3567f3dee6e77bf366fcb1c7b8bc3745": {
317
+ "config_yaml": "modelA.yaml"
318
+ },
319
+ "a28f4d717bd0d34cd2ff7a3b0a3d065e": {
320
+ "config_yaml": "modelA.yaml"
321
+ },
322
+ "c9971a18da20911822593dc81caa8be9": {
323
+ "config_yaml": "sndfx.yaml"
324
+ },
325
+ "57d94d5ed705460d21c75a5ac829a605": {
326
+ "config_yaml": "sndfx.yaml"
327
+ },
328
+ "e7a25f8764f25a52c1b96c4946e66ba2": {
329
+ "config_yaml": "sndfx.yaml"
330
+ },
331
+ "104081d24e37217086ce5fde09147ee1": {
332
+ "config_yaml": "model_2_stem_061321.yaml"
333
+ },
334
+ "1e6165b601539f38d0a9330f3facffeb": {
335
+ "config_yaml": "model_2_stem_061321.yaml"
336
+ },
337
+ "fe0108464ce0d8271be5ab810891bd7c": {
338
+ "config_yaml": "model_2_stem_full_band.yaml"
339
+ }
340
+ }
pretrained_models/uvr5/model_name_mapper.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "UVR_MDXNET_1_9703": "UVR-MDX-NET 1",
3
+ "UVR_MDXNET_2_9682": "UVR-MDX-NET 2",
4
+ "UVR_MDXNET_3_9662": "UVR-MDX-NET 3",
5
+ "UVR_MDXNET_KARA": "UVR-MDX-NET Karaoke",
6
+ "UVR_MDXNET_Main": "UVR-MDX-NET Main",
7
+ "UVR-MDX-NET-Inst_1": "UVR-MDX-NET Inst 1",
8
+ "UVR-MDX-NET-Inst_2": "UVR-MDX-NET Inst 2",
9
+ "UVR-MDX-NET-Inst_3": "UVR-MDX-NET Inst 3",
10
+ "UVR-MDX-NET-Inst_4": "UVR-MDX-NET Inst 4",
11
+ "UVR-MDX-NET-Inst_Main": "UVR-MDX-NET Inst Main",
12
+ "UVR-MDX-NET-Inst_Main_2": "UVR-MDX-NET Inst Main 2",
13
+ "UVR-MDX-NET-Inst_HQ_1": "UVR-MDX-NET Inst HQ 1",
14
+ "UVR-MDX-NET-Inst_HQ_2": "UVR-MDX-NET Inst HQ 2",
15
+ "UVR-MDX-NET-Inst_HQ_3": "UVR-MDX-NET Inst HQ 3",
16
+ "UVR_MDXNET_KARA_2": "UVR-MDX-NET Karaoke 2",
17
+ "UVR-MDX-NET-Voc_FT": "UVR-MDX-NET Voc FT",
18
+ "Kim_Vocal_1": "Kim Vocal 1",
19
+ "Kim_Vocal_2": "Kim Vocal 2",
20
+ "Kim_Inst": "Kim Inst",
21
+ "Reverb_HQ_By_FoxJoy": "Reverb HQ"
22
+ }
pretrained_models/whisperx/whisperx-vad-segmentation.bak ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea
3
+ size 17719103
pretrained_models/whisperx/whisperx-vad-segmentation.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:665a2c2576a9d8f09edde3b0ac2aa7aeb242fe2c0134584c5b6fa829b960266c
3
+ size 17724661