mazesmazes committed on
Commit
0a2d19e
·
verified ·
1 Parent(s): c6a2b6b

Training in progress, step 500

Browse files
Files changed (4) hide show
  1. config.json +255 -0
  2. generation_config.json +11 -0
  3. model.safetensors +3 -0
  4. training_args.bin +3 -0
config.json ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ASRModel"
4
+ ],
5
+ "attn_implementation": "flash_attention_2",
6
+ "audio_config": {
7
+ "_name_or_path": "zai-org/GLM-ASR-Nano-2512",
8
+ "architectures": [
9
+ "GlmAsrForConditionalGeneration"
10
+ ],
11
+ "audio_config": {
12
+ "_name_or_path": "",
13
+ "add_cross_attention": false,
14
+ "architectures": null,
15
+ "attention_dropout": 0.0,
16
+ "bos_token_id": null,
17
+ "chunk_size_feed_forward": 0,
18
+ "cross_attention_hidden_size": null,
19
+ "decoder_start_token_id": null,
20
+ "dtype": null,
21
+ "eos_token_id": null,
22
+ "finetuning_task": null,
23
+ "head_dim": 64,
24
+ "hidden_act": "gelu",
25
+ "hidden_size": 1280,
26
+ "id2label": {
27
+ "0": "LABEL_0",
28
+ "1": "LABEL_1"
29
+ },
30
+ "initializer_range": 0.02,
31
+ "intermediate_size": 5120,
32
+ "is_decoder": false,
33
+ "is_encoder_decoder": false,
34
+ "label2id": {
35
+ "LABEL_0": 0,
36
+ "LABEL_1": 1
37
+ },
38
+ "max_position_embeddings": 1500,
39
+ "model_type": "glmasr_encoder",
40
+ "num_attention_heads": 20,
41
+ "num_hidden_layers": 32,
42
+ "num_key_value_heads": 20,
43
+ "num_mel_bins": 128,
44
+ "output_attentions": false,
45
+ "output_hidden_states": false,
46
+ "pad_token_id": null,
47
+ "partial_rotary_factor": 0.5,
48
+ "prefix": null,
49
+ "problem_type": null,
50
+ "return_dict": true,
51
+ "rope_parameters": {
52
+ "partial_rotary_factor": 0.5,
53
+ "rope_theta": 10000.0,
54
+ "rope_type": "default"
55
+ },
56
+ "sep_token_id": null,
57
+ "task_specific_params": null,
58
+ "tie_word_embeddings": true,
59
+ "tokenizer_class": null
60
+ },
61
+ "audio_token_id": 59260,
62
+ "dtype": "bfloat16",
63
+ "hidden_size": 2048,
64
+ "model_type": "glmasr",
65
+ "num_mel_bins": 128,
66
+ "projector_hidden_act": "gelu",
67
+ "text_config": {
68
+ "_name_or_path": "",
69
+ "add_cross_attention": false,
70
+ "architectures": null,
71
+ "attention_bias": false,
72
+ "attention_dropout": 0.0,
73
+ "bos_token_id": 1,
74
+ "chunk_size_feed_forward": 0,
75
+ "cross_attention_hidden_size": null,
76
+ "decoder_start_token_id": null,
77
+ "dtype": null,
78
+ "eos_token_id": [
79
+ 59246,
80
+ 59253,
81
+ 59255
82
+ ],
83
+ "finetuning_task": null,
84
+ "head_dim": 128,
85
+ "hidden_act": "silu",
86
+ "hidden_size": 2048,
87
+ "id2label": {
88
+ "0": "LABEL_0",
89
+ "1": "LABEL_1"
90
+ },
91
+ "initializer_range": 0.02,
92
+ "intermediate_size": 6144,
93
+ "is_decoder": false,
94
+ "is_encoder_decoder": false,
95
+ "label2id": {
96
+ "LABEL_0": 0,
97
+ "LABEL_1": 1
98
+ },
99
+ "max_position_embeddings": 8192,
100
+ "mlp_bias": false,
101
+ "model_type": "llama",
102
+ "num_attention_heads": 16,
103
+ "num_hidden_layers": 28,
104
+ "num_key_value_heads": 4,
105
+ "output_attentions": false,
106
+ "output_hidden_states": false,
107
+ "pad_token_id": null,
108
+ "prefix": null,
109
+ "pretraining_tp": 1,
110
+ "problem_type": null,
111
+ "return_dict": true,
112
+ "rms_norm_eps": 1e-05,
113
+ "rope_parameters": {
114
+ "rope_theta": 10000.0,
115
+ "rope_type": "default"
116
+ },
117
+ "sep_token_id": null,
118
+ "task_specific_params": null,
119
+ "tie_word_embeddings": false,
120
+ "tokenizer_class": null,
121
+ "use_cache": true,
122
+ "vocab_size": 59264
123
+ },
124
+ "vocab_size": 59264
125
+ },
126
+ "audio_model_id": "zai-org/GLM-ASR-Nano-2512",
127
+ "audio_sample_rate": 16000,
128
+ "auto_map": {
129
+ "AutoConfig": "asr_config.ASRConfig",
130
+ "AutoModel": "asr_modeling.ASRModel",
131
+ "AutoModelForSpeechSeq2Seq": "asr_modeling.ASRModel",
132
+ "AutoProcessor": "asr_processing.ASRProcessor"
133
+ },
134
+ "custom_pipelines": {
135
+ "automatic-speech-recognition": {
136
+ "impl": "asr_pipeline.ASRPipeline",
137
+ "pt": [
138
+ "AutoModelForSpeechSeq2Seq"
139
+ ],
140
+ "tf": [],
141
+ "type": "audio"
142
+ }
143
+ },
144
+ "downsample_rate": 5,
145
+ "dtype": "bfloat16",
146
+ "encoder_conv_layers": [
147
+ [
148
+ 1,
149
+ 3,
150
+ 1
151
+ ],
152
+ [
153
+ 1,
154
+ 3,
155
+ 2
156
+ ]
157
+ ],
158
+ "encoder_dim": 1280,
159
+ "encoder_stride": 2,
160
+ "inference_warmup_tokens": 10,
161
+ "label_smoothing": 0.0,
162
+ "length_penalty": 1.0,
163
+ "llm_dim": 1024,
164
+ "max_new_tokens": 96,
165
+ "model_dtype": "bfloat16",
166
+ "model_type": "asr_model",
167
+ "no_repeat_ngram_size": 0,
168
+ "num_beams": 1,
169
+ "num_experts": 4,
170
+ "num_experts_per_tok": 2,
171
+ "pipeline_tag": "automatic-speech-recognition",
172
+ "projector_dropout": 0.0,
173
+ "projector_hidden_dim": null,
174
+ "projector_init_std": 0.02,
175
+ "projector_input_noise": 0.0,
176
+ "projector_num_layers": 2,
177
+ "projector_pool_stride": 4,
178
+ "projector_type": "mlp",
179
+ "qformer_hidden_size": null,
180
+ "qformer_intermediate_size": null,
181
+ "qformer_num_heads": 16,
182
+ "qformer_num_layers": 2,
183
+ "qformer_window_size": 15,
184
+ "repetition_penalty": 1.0,
185
+ "router_aux_loss_coef": 0.01,
186
+ "system_prompt": "/no_think /system_override",
187
+ "text_config": {
188
+ "_name_or_path": "Qwen/Qwen3-0.6B",
189
+ "architectures": [
190
+ "Qwen3ForCausalLM"
191
+ ],
192
+ "attention_bias": false,
193
+ "attention_dropout": 0.0,
194
+ "dtype": "bfloat16",
195
+ "eos_token_id": 151645,
196
+ "head_dim": 128,
197
+ "hidden_act": "silu",
198
+ "hidden_size": 1024,
199
+ "initializer_range": 0.02,
200
+ "intermediate_size": 3072,
201
+ "layer_types": [
202
+ "full_attention",
203
+ "full_attention",
204
+ "full_attention",
205
+ "full_attention",
206
+ "full_attention",
207
+ "full_attention",
208
+ "full_attention",
209
+ "full_attention",
210
+ "full_attention",
211
+ "full_attention",
212
+ "full_attention",
213
+ "full_attention",
214
+ "full_attention",
215
+ "full_attention",
216
+ "full_attention",
217
+ "full_attention",
218
+ "full_attention",
219
+ "full_attention",
220
+ "full_attention",
221
+ "full_attention",
222
+ "full_attention",
223
+ "full_attention",
224
+ "full_attention",
225
+ "full_attention",
226
+ "full_attention",
227
+ "full_attention",
228
+ "full_attention",
229
+ "full_attention"
230
+ ],
231
+ "max_position_embeddings": 40960,
232
+ "max_window_layers": 28,
233
+ "model_type": "qwen3",
234
+ "num_attention_heads": 16,
235
+ "num_hidden_layers": 28,
236
+ "num_key_value_heads": 8,
237
+ "pad_token_id": 151643,
238
+ "rms_norm_eps": 1e-06,
239
+ "rope_parameters": {
240
+ "rope_theta": 1000000,
241
+ "rope_type": "default"
242
+ },
243
+ "sliding_window": null,
244
+ "tie_word_embeddings": true,
245
+ "use_cache": true,
246
+ "use_sliding_window": false,
247
+ "vocab_size": 151670
248
+ },
249
+ "text_model_id": "Qwen/Qwen3-0.6B",
250
+ "transformers_version": "5.0.0.dev0",
251
+ "use_cache": false,
252
+ "use_specaugment": true,
253
+ "user_prompt": "Please transcribe this English audio into text: <audio>",
254
+ "vocab_size": 151670
255
+ }
generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151645,
4
+ "length_penalty": 1.0,
5
+ "max_new_tokens": 96,
6
+ "no_repeat_ngram_size": 0,
7
+ "num_beams": 1,
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.0,
10
+ "transformers_version": "5.0.0.dev0"
11
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34ef36beee7b81b43b3a975fb4d9737a2e5d96e04e2c93cbd2aa9fc5c11a8e8a
3
+ size 12583152
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8ee6c25214c15acbc5f2381a05f268e4d13b92ac59a9e9dee0a436f2dc53165
3
+ size 5265