Jmica committed on
Commit
2b8cfb9
·
verified ·
1 Parent(s): e7db70b

Upload config.yaml

Browse files
Files changed (1) hide show
  1. config.yaml +120 -0
config.yaml ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset:
2
+ bpe_model: bpe.model
3
+ sample_rate: 24000
4
+ squeeze: false
5
+ mel:
6
+ sample_rate: 24000
7
+ n_fft: 1024
8
+ hop_length: 256
9
+ win_length: 1024
10
+ n_mels: 100
11
+ mel_fmin: 0
12
+ normalize: false
13
+
14
+ gpt:
15
+ model_dim: 1280
16
+ max_mel_tokens: 1815
17
+ max_text_tokens: 600
18
+ heads: 20
19
+ use_mel_codes_as_input: true
20
+ mel_length_compression: 1024
21
+ layers: 24
22
+ number_text_tokens: 12000
23
+ number_mel_codes: 8194
24
+ start_mel_token: 8192
25
+ stop_mel_token: 8193
26
+ start_text_token: 0
27
+ stop_text_token: 1
28
+ train_solo_embeddings: false
29
+ condition_type: "conformer_perceiver"
30
+ condition_module:
31
+ output_size: 512
32
+ linear_units: 2048
33
+ attention_heads: 8
34
+ num_blocks: 6
35
+ input_layer: "conv2d2"
36
+ perceiver_mult: 2
37
+ emo_condition_module:
38
+ output_size: 512
39
+ linear_units: 1024
40
+ attention_heads: 4
41
+ num_blocks: 4
42
+ input_layer: "conv2d2"
43
+ perceiver_mult: 2
44
+
45
+ semantic_codec:
46
+ codebook_size: 8192
47
+ hidden_size: 1024
48
+ codebook_dim: 8
49
+ vocos_dim: 384
50
+ vocos_intermediate_dim: 2048
51
+ vocos_num_layers: 12
52
+
53
+ s2mel:
54
+ preprocess_params:
55
+ sr: 22050
56
+ spect_params:
57
+ n_fft: 1024
58
+ win_length: 1024
59
+ hop_length: 256
60
+ n_mels: 80
61
+ fmin: 0
62
+ fmax: "None"
63
+
64
+ dit_type: "DiT"
65
+ reg_loss_type: "l1"
66
+ style_encoder:
67
+ dim: 192
68
+ length_regulator:
69
+ channels: 512
70
+ is_discrete: false
71
+ in_channels: 1024
72
+ content_codebook_size: 2048
73
+ sampling_ratios: [1, 1, 1, 1]
74
+ vector_quantize: false
75
+ n_codebooks: 1
76
+ quantizer_dropout: 0.0
77
+ f0_condition: false
78
+ n_f0_bins: 512
79
+ DiT:
80
+ hidden_dim: 512
81
+ num_heads: 8
82
+ depth: 13
83
+ class_dropout_prob: 0.1
84
+ block_size: 8192
85
+ in_channels: 80
86
+ style_condition: true
87
+ final_layer_type: 'wavenet'
88
+ target: 'mel'
89
+ content_dim: 512
90
+ content_codebook_size: 1024
91
+ content_type: 'discrete'
92
+ f0_condition: false
93
+ n_f0_bins: 512
94
+ content_codebooks: 1
95
+ is_causal: false
96
+ long_skip_connection: true
97
+ zero_prompt_speech_token: false
98
+ time_as_token: false
99
+ style_as_token: false
100
+ uvit_skip_connection: true
101
+ add_resblock_in_transformer: false
102
+ wavenet:
103
+ hidden_dim: 512
104
+ num_layers: 8
105
+ kernel_size: 5
106
+ dilation_rate: 1
107
+ p_dropout: 0.2
108
+ style_condition: true
109
+
110
+ gpt_checkpoint: gpt.pth
111
+ w2v_stat: wav2vec2bert_stats.pt
112
+ s2mel_checkpoint: s2mel.pth
113
+ emo_matrix: feat2.pt
114
+ spk_matrix: feat1.pt
115
+ emo_num: [3, 17, 2, 8, 4, 5, 10, 24]
116
+ qwen_emo_path: qwen0.6bemo4-merge/
117
+ vocoder:
118
+ type: "bigvgan"
119
+ name: "nvidia/bigvgan_v2_22khz_80band_256x"
120
+ version: 2.0