mazesmazes committed on
Commit
3cc13cb
·
verified ·
1 Parent(s): d5548e7

Training in progress, step 2000

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
chat_template.jinja ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n' }}
86
+ {%- if true %}
87
+ {{- '<think>\n\n</think>\n\n' }}
88
+ {%- endif %}
89
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "apply_spec_augment": true,
3
+ "architectures": [
4
+ "ASRModel"
5
+ ],
6
+ "attn_implementation": "flash_attention_2",
7
+ "audio_config": {
8
+ "_name_or_path": "zai-org/GLM-ASR-Nano-2512",
9
+ "architectures": [
10
+ "GlmAsrForConditionalGeneration"
11
+ ],
12
+ "audio_config": {
13
+ "_name_or_path": "",
14
+ "architectures": null,
15
+ "attention_dropout": 0.0,
16
+ "chunk_size_feed_forward": 0,
17
+ "dtype": null,
18
+ "head_dim": 64,
19
+ "hidden_act": "gelu",
20
+ "hidden_size": 1280,
21
+ "id2label": {
22
+ "0": "LABEL_0",
23
+ "1": "LABEL_1"
24
+ },
25
+ "initializer_range": 0.02,
26
+ "intermediate_size": 5120,
27
+ "is_encoder_decoder": false,
28
+ "label2id": {
29
+ "LABEL_0": 0,
30
+ "LABEL_1": 1
31
+ },
32
+ "max_position_embeddings": 1500,
33
+ "model_type": "glmasr_encoder",
34
+ "num_attention_heads": 20,
35
+ "num_hidden_layers": 32,
36
+ "num_key_value_heads": 20,
37
+ "num_mel_bins": 128,
38
+ "output_attentions": false,
39
+ "output_hidden_states": false,
40
+ "partial_rotary_factor": 0.5,
41
+ "problem_type": null,
42
+ "return_dict": true,
43
+ "rope_parameters": {
44
+ "partial_rotary_factor": 0.5,
45
+ "rope_theta": 10000.0,
46
+ "rope_type": "default"
47
+ }
48
+ },
49
+ "audio_token_id": 59260,
50
+ "dtype": "float32",
51
+ "hidden_size": 2048,
52
+ "model_type": "glmasr",
53
+ "num_mel_bins": 128,
54
+ "projector_hidden_act": "gelu",
55
+ "text_config": {
56
+ "_name_or_path": "",
57
+ "architectures": null,
58
+ "attention_bias": false,
59
+ "attention_dropout": 0.0,
60
+ "bos_token_id": 1,
61
+ "chunk_size_feed_forward": 0,
62
+ "dtype": null,
63
+ "eos_token_id": [
64
+ 59246,
65
+ 59253,
66
+ 59255
67
+ ],
68
+ "head_dim": 128,
69
+ "hidden_act": "silu",
70
+ "hidden_size": 2048,
71
+ "id2label": {
72
+ "0": "LABEL_0",
73
+ "1": "LABEL_1"
74
+ },
75
+ "initializer_range": 0.02,
76
+ "intermediate_size": 6144,
77
+ "is_encoder_decoder": false,
78
+ "label2id": {
79
+ "LABEL_0": 0,
80
+ "LABEL_1": 1
81
+ },
82
+ "max_position_embeddings": 8192,
83
+ "mlp_bias": false,
84
+ "model_type": "llama",
85
+ "num_attention_heads": 16,
86
+ "num_hidden_layers": 28,
87
+ "num_key_value_heads": 4,
88
+ "output_attentions": false,
89
+ "output_hidden_states": false,
90
+ "pad_token_id": null,
91
+ "pretraining_tp": 1,
92
+ "problem_type": null,
93
+ "return_dict": true,
94
+ "rms_norm_eps": 1e-05,
95
+ "rope_parameters": {
96
+ "rope_theta": 10000.0,
97
+ "rope_type": "default"
98
+ },
99
+ "tie_word_embeddings": false,
100
+ "use_cache": true,
101
+ "vocab_size": 59264
102
+ },
103
+ "vocab_size": 59264
104
+ },
105
+ "audio_model_id": "zai-org/GLM-ASR-Nano-2512",
106
+ "audio_sample_rate": 16000,
107
+ "auto_map": {
108
+ "AutoConfig": "asr_config.ASRConfig",
109
+ "AutoModel": "asr_modeling.ASRModel",
110
+ "AutoModelForSpeechSeq2Seq": "asr_modeling.ASRModel",
111
+ "AutoProcessor": "asr_processing.ASRProcessor"
112
+ },
113
+ "bos_token_id": null,
114
+ "custom_pipelines": {
115
+ "automatic-speech-recognition": {
116
+ "impl": "asr_pipeline.ASRPipeline",
117
+ "pt": [
118
+ "AutoModelForSpeechSeq2Seq"
119
+ ],
120
+ "tf": [],
121
+ "type": "audio"
122
+ }
123
+ },
124
+ "do_sample": false,
125
+ "downsample_rate": 5,
126
+ "dtype": "float32",
127
+ "encoder": {
128
+ "_name_or_path": "zai-org/GLM-ASR-Nano-2512",
129
+ "architectures": [
130
+ "GlmAsrForConditionalGeneration"
131
+ ],
132
+ "audio_config": {
133
+ "_name_or_path": "",
134
+ "architectures": null,
135
+ "attention_dropout": 0.0,
136
+ "chunk_size_feed_forward": 0,
137
+ "dtype": null,
138
+ "head_dim": 64,
139
+ "hidden_act": "gelu",
140
+ "hidden_size": 1280,
141
+ "id2label": {
142
+ "0": "LABEL_0",
143
+ "1": "LABEL_1"
144
+ },
145
+ "initializer_range": 0.02,
146
+ "intermediate_size": 5120,
147
+ "is_encoder_decoder": false,
148
+ "label2id": {
149
+ "LABEL_0": 0,
150
+ "LABEL_1": 1
151
+ },
152
+ "max_position_embeddings": 1500,
153
+ "model_type": "glmasr_encoder",
154
+ "num_attention_heads": 20,
155
+ "num_hidden_layers": 32,
156
+ "num_key_value_heads": 20,
157
+ "num_mel_bins": 128,
158
+ "output_attentions": false,
159
+ "output_hidden_states": false,
160
+ "partial_rotary_factor": 0.5,
161
+ "problem_type": null,
162
+ "return_dict": true,
163
+ "rope_parameters": {
164
+ "partial_rotary_factor": 0.5,
165
+ "rope_theta": 10000.0,
166
+ "rope_type": "default"
167
+ }
168
+ },
169
+ "audio_token_id": 59260,
170
+ "dtype": "float32",
171
+ "hidden_size": 2048,
172
+ "model_type": "glmasr",
173
+ "num_mel_bins": 128,
174
+ "projector_hidden_act": "gelu",
175
+ "text_config": {
176
+ "_name_or_path": "",
177
+ "architectures": null,
178
+ "attention_bias": false,
179
+ "attention_dropout": 0.0,
180
+ "bos_token_id": 1,
181
+ "chunk_size_feed_forward": 0,
182
+ "dtype": null,
183
+ "eos_token_id": [
184
+ 59246,
185
+ 59253,
186
+ 59255
187
+ ],
188
+ "head_dim": 128,
189
+ "hidden_act": "silu",
190
+ "hidden_size": 2048,
191
+ "id2label": {
192
+ "0": "LABEL_0",
193
+ "1": "LABEL_1"
194
+ },
195
+ "initializer_range": 0.02,
196
+ "intermediate_size": 6144,
197
+ "is_encoder_decoder": false,
198
+ "label2id": {
199
+ "LABEL_0": 0,
200
+ "LABEL_1": 1
201
+ },
202
+ "max_position_embeddings": 8192,
203
+ "mlp_bias": false,
204
+ "model_type": "llama",
205
+ "num_attention_heads": 16,
206
+ "num_hidden_layers": 28,
207
+ "num_key_value_heads": 4,
208
+ "output_attentions": false,
209
+ "output_hidden_states": false,
210
+ "pad_token_id": null,
211
+ "pretraining_tp": 1,
212
+ "problem_type": null,
213
+ "return_dict": true,
214
+ "rms_norm_eps": 1e-05,
215
+ "rope_parameters": {
216
+ "rope_theta": 10000.0,
217
+ "rope_type": "default"
218
+ },
219
+ "tie_word_embeddings": false,
220
+ "use_cache": true,
221
+ "vocab_size": 59264
222
+ },
223
+ "vocab_size": 59264
224
+ },
225
+ "encoder_conv_layers": [
226
+ [
227
+ 1,
228
+ 3,
229
+ 1
230
+ ],
231
+ [
232
+ 1,
233
+ 3,
234
+ 2
235
+ ]
236
+ ],
237
+ "encoder_dim": 1280,
238
+ "eos_token_id": 151645,
239
+ "freeze_audio_encoder": false,
240
+ "freeze_language_model": true,
241
+ "freeze_projector": false,
242
+ "freeze_text_embed_tokens": true,
243
+ "label_smoothing": 0.0,
244
+ "length_penalty": 1.0,
245
+ "llm_dim": 1024,
246
+ "lora_alpha": 32,
247
+ "lora_dropout": 0.0,
248
+ "lora_rank": 8,
249
+ "lora_target_modules": [
250
+ "q_proj",
251
+ "k_proj",
252
+ "v_proj",
253
+ "o_proj",
254
+ "gate_proj",
255
+ "up_proj",
256
+ "down_proj"
257
+ ],
258
+ "mask_feature_length": 10,
259
+ "mask_feature_min_masks": 0,
260
+ "mask_feature_prob": 0.0,
261
+ "mask_time_length": 10,
262
+ "mask_time_min_masks": 2,
263
+ "mask_time_prob": 0.05,
264
+ "max_new_tokens": 256,
265
+ "min_new_tokens": 0,
266
+ "model_dtype": "float32",
267
+ "model_type": "asr_model",
268
+ "no_repeat_ngram_size": 0,
269
+ "num_beams": 1,
270
+ "num_experts": 4,
271
+ "num_experts_per_tok": 2,
272
+ "pad_token_id": 151643,
273
+ "pipeline_tag": "automatic-speech-recognition",
274
+ "pretrained_model_path": "mazesmazes/tiny-audio-next-encoder",
275
+ "projector_dropout": 0.0,
276
+ "projector_hidden_dim": 1024,
277
+ "projector_pool_stride": 4,
278
+ "projector_type": "mlp",
279
+ "qformer_hidden_size": null,
280
+ "qformer_intermediate_size": null,
281
+ "qformer_num_heads": 16,
282
+ "qformer_num_layers": 2,
283
+ "qformer_window_size": 15,
284
+ "repetition_penalty": 1.0,
285
+ "router_aux_loss_coef": 0.01,
286
+ "system_prompt": "",
287
+ "temperature": null,
288
+ "text_config": {
289
+ "_name_or_path": "Qwen/Qwen3-0.6B",
290
+ "architectures": [
291
+ "Qwen3ForCausalLM"
292
+ ],
293
+ "attention_bias": false,
294
+ "attention_dropout": 0.0,
295
+ "bos_token_id": null,
296
+ "dtype": "float32",
297
+ "eos_token_id": 151645,
298
+ "head_dim": 128,
299
+ "hidden_act": "silu",
300
+ "hidden_size": 1024,
301
+ "initializer_range": 0.02,
302
+ "intermediate_size": 3072,
303
+ "layer_types": [
304
+ "full_attention",
305
+ "full_attention",
306
+ "full_attention",
307
+ "full_attention",
308
+ "full_attention",
309
+ "full_attention",
310
+ "full_attention",
311
+ "full_attention",
312
+ "full_attention",
313
+ "full_attention",
314
+ "full_attention",
315
+ "full_attention",
316
+ "full_attention",
317
+ "full_attention",
318
+ "full_attention",
319
+ "full_attention",
320
+ "full_attention",
321
+ "full_attention",
322
+ "full_attention",
323
+ "full_attention",
324
+ "full_attention",
325
+ "full_attention",
326
+ "full_attention",
327
+ "full_attention",
328
+ "full_attention",
329
+ "full_attention",
330
+ "full_attention",
331
+ "full_attention"
332
+ ],
333
+ "max_position_embeddings": 40960,
334
+ "max_window_layers": 28,
335
+ "model_type": "qwen3",
336
+ "num_attention_heads": 16,
337
+ "num_hidden_layers": 28,
338
+ "num_key_value_heads": 8,
339
+ "pad_token_id": 151643,
340
+ "rms_norm_eps": 1e-06,
341
+ "rope_parameters": {
342
+ "rope_theta": 1000000,
343
+ "rope_type": "default"
344
+ },
345
+ "sliding_window": null,
346
+ "tie_word_embeddings": true,
347
+ "use_cache": true,
348
+ "use_sliding_window": false,
349
+ "vocab_size": 151670
350
+ },
351
+ "text_model_id": "Qwen/Qwen3-0.6B",
352
+ "top_k": null,
353
+ "top_p": null,
354
+ "transformers_version": "5.7.0",
355
+ "use_cache": false,
356
+ "use_lora": false,
357
+ "vocab_size": 151670
358
+ }
generation_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": false,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "length_penalty": 1.0,
9
+ "max_new_tokens": 256,
10
+ "min_new_tokens": 0,
11
+ "no_repeat_ngram_size": 0,
12
+ "num_beams": 1,
13
+ "pad_token_id": 151643,
14
+ "repetition_penalty": 1.0,
15
+ "transformers_version": "5.7.0",
16
+ "use_cache": true
17
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f603808de744d8bf8908f2064f7f3c40f05b756f33767f73ef48bf13ce82496
3
+ size 25174432
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33b674fb8444e2553eae8f1b261093371920a28ef75b5c18f4deb3f9217ed0ba
3
+ size 11422834
tokenizer_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<audio>"
10
+ ],
11
+ "is_local": false,
12
+ "local_files_only": false,
13
+ "model_max_length": 131072,
14
+ "pad_token": "<|endoftext|>",
15
+ "split_special_tokens": false,
16
+ "tokenizer_class": "Qwen2Tokenizer",
17
+ "unk_token": null
18
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a569c150a324521151dbfb83dc500d6af8450313084538a126f450600782a0c
3
+ size 5393