mingyue66 committed on
Commit
6131fb0
·
verified ·
1 Parent(s): d66643d

Upload config.json

Browse files
Files changed (1) hide show
  1. config.json +219 -0
config.json ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_encoder_config": {
3
+ "model_type": "zipformer",
4
+ "feature_dim": 80,
5
+ "output_downsampling_factor": 2,
6
+ "num_encoder_layers": [
7
+ 2,
8
+ 2,
9
+ 4,
10
+ 5,
11
+ 4,
12
+ 2
13
+ ],
14
+ "downsampling_factor": [
15
+ 1,
16
+ 2,
17
+ 4,
18
+ 8,
19
+ 4,
20
+ 2
21
+ ],
22
+ "encoder_dim": [
23
+ 192,
24
+ 256,
25
+ 512,
26
+ 768,
27
+ 512,
28
+ 256
29
+ ],
30
+ "feedforward_dim": [
31
+ 576,
32
+ 768,
33
+ 1536,
34
+ 2304,
35
+ 1536,
36
+ 768
37
+ ],
38
+ "warmup_batches": 4000.0,
39
+ "dropout": null,
40
+ "num_heads": [
41
+ 4,
42
+ 4,
43
+ 4,
44
+ 8,
45
+ 4,
46
+ 4
47
+ ],
48
+ "query_head_dim": [
49
+ 32
50
+ ],
51
+ "value_head_dim": [
52
+ 12
53
+ ],
54
+ "pos_head_dim": [
55
+ 4
56
+ ],
57
+ "pos_dim": 48,
58
+ "encoder_unmasked_dim": [
59
+ 192,
60
+ 192,
61
+ 256,
62
+ 256,
63
+ 256,
64
+ 192
65
+ ],
66
+ "cnn_module_kernel": [
67
+ 31,
68
+ 31,
69
+ 15,
70
+ 15,
71
+ 15,
72
+ 31
73
+ ],
74
+ "causal": false,
75
+ "chunk_size": [
76
+ 16,
77
+ 32,
78
+ 64,
79
+ -1
80
+ ],
81
+ "left_context_frames": [
82
+ 64,
83
+ 128,
84
+ 256,
85
+ -1
86
+ ]
87
+ },
88
+ "llm_config": {
89
+ "vocab_size": 152064,
90
+ "max_position_embeddings": 32768,
91
+ "hidden_size": 3584,
92
+ "intermediate_size": 18944,
93
+ "num_hidden_layers": 28,
94
+ "num_attention_heads": 28,
95
+ "use_sliding_window": false,
96
+ "sliding_window": 131072,
97
+ "max_window_layers": 28,
98
+ "num_key_value_heads": 4,
99
+ "hidden_act": "silu",
100
+ "initializer_range": 0.02,
101
+ "rms_norm_eps": 1e-06,
102
+ "use_cache": true,
103
+ "rope_theta": 1000000.0,
104
+ "attention_dropout": 0.0,
105
+ "torch_dtype": "float16",
106
+ "tie_word_embeddings": false,
107
+ "architectures": [
108
+ "Qwen2ForCausalLM"
109
+ ],
110
+ "bos_token_id": 151643,
111
+ "eos_token_id": 151645,
112
+ "_name_or_path": "/projects/bejv/models/Qwen2.5-7B-Instruct",
113
+ "transformers_version": "4.38.2",
114
+ "model_type": "qwen2"
115
+ },
116
+ "use_flash_attn": false,
117
+ "audio_encoder_projector_ds_rate": 8,
118
+ "exclude_from_checkpoint": [
119
+ "audio_encoder",
120
+ "voice_encoder",
121
+ "llm"
122
+ ],
123
+ "tag_audio_boundary": false,
124
+ "audio_token": "<|AUDIO|>",
125
+ "model_type": "audio-llm-dual-audio-tokens-anchor-num",
126
+ "max_length": 800,
127
+ "voice_encoder_config": {
128
+ "model_type": "zipformer",
129
+ "feature_dim": 80,
130
+ "output_downsampling_factor": 2,
131
+ "num_encoder_layers": [
132
+ 2,
133
+ 2,
134
+ 4,
135
+ 5,
136
+ 4,
137
+ 2
138
+ ],
139
+ "downsampling_factor": [
140
+ 1,
141
+ 2,
142
+ 4,
143
+ 8,
144
+ 4,
145
+ 2
146
+ ],
147
+ "encoder_dim": [
148
+ 192,
149
+ 256,
150
+ 512,
151
+ 768,
152
+ 512,
153
+ 256
154
+ ],
155
+ "feedforward_dim": [
156
+ 576,
157
+ 768,
158
+ 1536,
159
+ 2304,
160
+ 1536,
161
+ 768
162
+ ],
163
+ "warmup_batches": 4000.0,
164
+ "dropout": null,
165
+ "num_heads": [
166
+ 4,
167
+ 4,
168
+ 4,
169
+ 8,
170
+ 4,
171
+ 4
172
+ ],
173
+ "query_head_dim": [
174
+ 32
175
+ ],
176
+ "value_head_dim": [
177
+ 12
178
+ ],
179
+ "pos_head_dim": [
180
+ 4
181
+ ],
182
+ "pos_dim": 48,
183
+ "encoder_unmasked_dim": [
184
+ 192,
185
+ 192,
186
+ 256,
187
+ 256,
188
+ 256,
189
+ 192
190
+ ],
191
+ "cnn_module_kernel": [
192
+ 31,
193
+ 31,
194
+ 15,
195
+ 15,
196
+ 15,
197
+ 31
198
+ ],
199
+ "causal": false,
200
+ "chunk_size": [
201
+ 16,
202
+ 32,
203
+ 64,
204
+ -1
205
+ ],
206
+ "left_context_frames": [
207
+ 64,
208
+ 128,
209
+ 256,
210
+ -1
211
+ ]
212
+ },
213
+ "semantic_projector_ds_rate": 4,
214
+ "voice_projector_ds_rate": 4,
215
+ "semantic_anchor_interval": 8,
216
+ "voice_anchor_interval": 8,
217
+ "insert_anchors_at_ends": true,
218
+ "digit_embedding_path": "/projects/bejv/code/Auden/examples/multi_asr_llm/models/audio_llm_dual_audio_tokens_anchor_num/digit_token_embeddings.pt"
219
+ }