a43992899 committed on
Commit
0163095
·
verified ·
1 Parent(s): de23e09

Upload folder using huggingface_hub

Browse files
ckpt_00360000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8c379ea2d3cbde1c8ba1b9717975220e79ba3f556bb161766fd5e4585dcd59c
3
+ size 1360444883
config.yaml ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ generator:
2
+ name: SoundStream
3
+ config:
4
+ n_filters: 32
5
+ D: 256
6
+ target_bandwidths:
7
+ - 0.5
8
+ - 1
9
+ - 1.5
10
+ - 2
11
+ - 4
12
+ - 6
13
+ ratios:
14
+ - 8
15
+ - 5
16
+ - 4
17
+ - 2
18
+ sample_rate: 16000
19
+ bins: 1024
20
+ d_list:
21
+ - mfd
22
+ mfd:
23
+ name: MultiFrequencyDiscriminator
24
+ config:
25
+ hop_lengths:
26
+ - 32
27
+ - 64
28
+ - 128
29
+ - 256
30
+ - 512
31
+ - 1024
32
+ hidden_channels:
33
+ - 64
34
+ - 128
35
+ - 256
36
+ - 512
37
+ - 512
38
+ - 512
39
+ domain: double
40
+ mel_scale: true
41
+ sample_rate: 16000
42
+ mpd:
43
+ name: MultiPeriodDiscriminator
44
+ config:
45
+ period_sizes:
46
+ - 2
47
+ - 3
48
+ - 5
49
+ - 7
50
+ - 11
51
+ period_kernel_size: 5
52
+ msd:
53
+ name: MultiScaleDiscriminator
54
+ config:
55
+ num_scales: 3
56
+ pool_kernel_size: 4
57
+ pool_stride: 2
58
+ optimizer:
59
+ g:
60
+ name: AdamW
61
+ config:
62
+ lr: 0.0002
63
+ betas:
64
+ - 0.8
65
+ - 0.99
66
+ eps: 1.0e-06
67
+ d:
68
+ name: AdamW
69
+ config:
70
+ lr: 0.0002
71
+ betas:
72
+ - 0.8
73
+ - 0.99
74
+ eps: 1.0e-06
75
+ lr_scheduler:
76
+ g:
77
+ name: ExponentialLR
78
+ config:
79
+ gamma: 0.999
80
+ d:
81
+ name: ExponentialLR
82
+ config:
83
+ gamma: 0.999
84
+ criterion:
85
+ g_criterion:
86
+ name: losses.generator_loss.GeneratorSTFTLoss
87
+ config:
88
+ use_mel_loss: false
89
+ adv_criterion: MSEGLoss
90
+ mel_loss_weight: 45
91
+ use_feature_match: true
92
+ feat_match_loss_weight: 20
93
+ use_full_stft_loss: true
94
+ use_sub_stft_loss: true
95
+ full_stft_loss_weight: 1
96
+ sub_stft_loss_weight: 1
97
+ mel_scale_loss:
98
+ sampling_rate: 16000
99
+ n_fft: 1024
100
+ num_mels: 80
101
+ hop_size: 160
102
+ win_size: 800
103
+ fmin: 0
104
+ full_multi_scale_stft_loss:
105
+ fft_sizes:
106
+ - 512
107
+ - 1024
108
+ - 2048
109
+ win_sizes:
110
+ - 480
111
+ - 960
112
+ - 1200
113
+ hop_sizes:
114
+ - 120
115
+ - 240
116
+ - 300
117
+ sub_multi_scale_stft_loss:
118
+ num_bands: 6
119
+ fft_sizes:
120
+ - 128
121
+ - 256
122
+ - 256
123
+ win_sizes:
124
+ - 80
125
+ - 120
126
+ - 200
127
+ hop_sizes:
128
+ - 20
129
+ - 40
130
+ - 50
131
+ d_criterion:
132
+ name: losses.discriminator_loss.MSEDiscriminatorLoss
133
+ config: null
134
+ commit_loss_weight: 1.0
135
+ codebook_loss_weight: 75
136
+ training_file: /aifs4su/data/zheny/fairseq/vae_v2/codec_final/list/train.txt
137
+ validation_file: /aifs4su/data/zheny/fairseq/vae_v2/codec_final/list/valid.txt
138
+ seed: 2333
139
+ cudnn_deterministic: false
140
+ tensorboard: true
141
+ checkpoint_interval: 5000
142
+ summary_interval: 100
143
+ validation_interval: 500
144
+ num_epoches: 20
145
+ print_freq: 10
146
+ discriminator_iter_start: 0
147
+ num_ckpt_keep: 10
148
+ segment_size: 16000
149
+ audio_norm_scale: 0.95
150
+ batch_size: 48
151
+ num_workers: 8
152
+ num_plots: 8
153
+ local_rank: 1000000
154
+ basic_model_config: config/codec_16k_6kbps_v3_vqdp.yaml
155
+ exp_model_config: null
156
+ log_dir: 0518_20w_ckpts
157
+ ngpus_per_node: 8
158
+ sample_rate: 16000
159
+ model_ckpt_dir: 0518_20w_ckpts/model_ckpts
semantic_ckpts/hf_1_325000/config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "HubertModel"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 1,
9
+ "classifier_proj_size": 256,
10
+ "conv_bias": false,
11
+ "conv_dim": [
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512
19
+ ],
20
+ "conv_kernel": [
21
+ 10,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 2,
27
+ 2
28
+ ],
29
+ "conv_stride": [
30
+ 5,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2
37
+ ],
38
+ "ctc_loss_reduction": "sum",
39
+ "ctc_zero_infinity": false,
40
+ "do_stable_layer_norm": false,
41
+ "eos_token_id": 2,
42
+ "feat_extract_activation": "gelu",
43
+ "feat_extract_norm": "group",
44
+ "feat_proj_dropout": 0.0,
45
+ "feat_proj_layer_norm": true,
46
+ "final_dropout": 0.1,
47
+ "hidden_act": "gelu",
48
+ "hidden_dropout": 0.1,
49
+ "hidden_size": 768,
50
+ "initializer_range": 0.02,
51
+ "intermediate_size": 3072,
52
+ "layer_norm_eps": 1e-05,
53
+ "layerdrop": 0.1,
54
+ "mask_feature_length": 10,
55
+ "mask_feature_min_masks": 0,
56
+ "mask_feature_prob": 0.0,
57
+ "mask_time_length": 10,
58
+ "mask_time_min_masks": 2,
59
+ "mask_time_prob": 0.05,
60
+ "model_type": "hubert",
61
+ "num_attention_heads": 12,
62
+ "num_conv_pos_embedding_groups": 16,
63
+ "num_conv_pos_embeddings": 128,
64
+ "num_feat_extract_layers": 7,
65
+ "num_hidden_layers": 12,
66
+ "pad_token_id": 0,
67
+ "torch_dtype": "float32",
68
+ "transformers_version": "4.27.3",
69
+ "use_weighted_layer_sum": false,
70
+ "vocab_size": 32
71
+ }
semantic_ckpts/hf_1_325000/preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": false,
8
+ "sampling_rate": 16000
9
+ }
10
+
semantic_ckpts/hf_1_325000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5ddbd7fa2468483cb9b2aa53117813471543dd278e65870333a56c54305f527
3
+ size 377555286