AEmotionStudio committed on
Commit
17b3762
·
verified ·
1 Parent(s): 46a9e1e

Mirror config.json from ACE-Step/acestep-transcriber

Browse files
checkpoints/acestep-transcriber/config.json ADDED
@@ -0,0 +1,635 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2_5OmniForConditionalGeneration"
4
+ ],
5
+ "enable_audio_output": true,
6
+ "enable_talker": true,
7
+ "hidden_size": 3584,
8
+ "keys_to_ignore_at_inference": [
9
+ "past_key_values",
10
+ "hidden_states",
11
+ "attention_mask",
12
+ "hidden_states",
13
+ "attention_mask",
14
+ "hidden_states",
15
+ "attention_mask",
16
+ "hidden_states",
17
+ "attention_mask"
18
+ ],
19
+ "model_type": "qwen2_5_omni",
20
+ "pad_token_id": 151643,
21
+ "talker_config": {
22
+ "_name_or_path": "Qwen2.5-Omni-7B/talker",
23
+ "architectures": [
24
+ "Qwen2OmniTalkerForConditionalGeneration"
25
+ ],
26
+ "attention_dropout": 0.0,
27
+ "audio_end_token_id": 151648,
28
+ "audio_start_token_id": 151647,
29
+ "audio_token_index": 151646,
30
+ "embedding_size": 3584,
31
+ "head_dim": 128,
32
+ "hidden_act": "silu",
33
+ "hidden_size": 896,
34
+ "image_token_index": 151655,
35
+ "init_std": 0.02,
36
+ "initializer_range": 0.02,
37
+ "intermediate_size": 18944,
38
+ "layer_types": [
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention",
52
+ "full_attention",
53
+ "full_attention",
54
+ "full_attention",
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention",
59
+ "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention"
63
+ ],
64
+ "max_position_embeddings": 32768,
65
+ "max_window_layers": 28,
66
+ "model_type": "qwen2_5_omni_talker",
67
+ "num_attention_heads": 12,
68
+ "num_hidden_layers": 24,
69
+ "num_key_value_heads": 4,
70
+ "position_id_per_seconds": 25,
71
+ "rms_norm_eps": 1e-06,
72
+ "rope_scaling": {
73
+ "mrope_section": [
74
+ 16,
75
+ 24,
76
+ 24
77
+ ],
78
+ "rope_type": "default",
79
+ "type": "default"
80
+ },
81
+ "rope_theta": 1000000.0,
82
+ "seconds_per_chunk": 2,
83
+ "sliding_window": null,
84
+ "spatial_merge_size": 2,
85
+ "torch_dtype": "bfloat16",
86
+ "tts_codec_end_token_id": 8294,
87
+ "tts_codec_mask_token_id": 8296,
88
+ "tts_codec_pad_token_id": 8292,
89
+ "tts_codec_start_token_id": 8293,
90
+ "tts_text_end_token_id": 151861,
91
+ "tts_text_pad_token_id": 151859,
92
+ "tts_text_start_token_id": 151860,
93
+ "use_cache": false,
94
+ "use_sliding_window": false,
95
+ "video_token_index": 151656,
96
+ "vision_end_token_id": 151653,
97
+ "vision_start_token_id": 151652,
98
+ "vocab_size": 8448
99
+ },
100
+ "thinker_config": {
101
+ "_name_or_path": "Qwen2.5-Omni-7B/thinker",
102
+ "architectures": [
103
+ "Qwen2OmniNaViTThinkerForConditionalGeneration"
104
+ ],
105
+ "audio_config": {
106
+ "_name_or_path": "",
107
+ "activation_dropout": 0.0,
108
+ "activation_function": "gelu",
109
+ "add_cross_attention": false,
110
+ "architectures": null,
111
+ "attention_dropout": 0.0,
112
+ "bad_words_ids": null,
113
+ "begin_suppress_tokens": null,
114
+ "bos_token_id": null,
115
+ "chunk_size_feed_forward": 0,
116
+ "cross_attention_hidden_size": null,
117
+ "d_model": 1280,
118
+ "decoder_start_token_id": null,
119
+ "diversity_penalty": 0.0,
120
+ "do_sample": false,
121
+ "dropout": 0.0,
122
+ "early_stopping": false,
123
+ "encoder_attention_heads": 20,
124
+ "encoder_ffn_dim": 5120,
125
+ "encoder_layerdrop": 0.0,
126
+ "encoder_layers": 32,
127
+ "encoder_no_repeat_ngram_size": 0,
128
+ "eos_token_id": null,
129
+ "exponential_decay_length_penalty": null,
130
+ "finetuning_task": null,
131
+ "forced_bos_token_id": null,
132
+ "forced_eos_token_id": null,
133
+ "id2label": {
134
+ "0": "LABEL_0",
135
+ "1": "LABEL_1"
136
+ },
137
+ "init_std": 0.02,
138
+ "initializer_range": 0.02,
139
+ "is_decoder": false,
140
+ "is_encoder_decoder": false,
141
+ "label2id": {
142
+ "LABEL_0": 0,
143
+ "LABEL_1": 1
144
+ },
145
+ "length_penalty": 1.0,
146
+ "max_length": 20,
147
+ "max_source_positions": 1500,
148
+ "min_length": 0,
149
+ "model_type": "qwen2_5_omni_audio_encoder",
150
+ "n_window": 100,
151
+ "no_repeat_ngram_size": 0,
152
+ "num_beam_groups": 1,
153
+ "num_beams": 1,
154
+ "num_hidden_layers": 32,
155
+ "num_mel_bins": 128,
156
+ "num_return_sequences": 1,
157
+ "output_attentions": false,
158
+ "output_dim": 3584,
159
+ "output_hidden_states": false,
160
+ "output_scores": false,
161
+ "pad_token_id": null,
162
+ "prefix": null,
163
+ "problem_type": null,
164
+ "pruned_heads": {},
165
+ "remove_invalid_values": false,
166
+ "repetition_penalty": 1.0,
167
+ "return_dict": true,
168
+ "return_dict_in_generate": false,
169
+ "scale_embedding": false,
170
+ "sep_token_id": null,
171
+ "suppress_tokens": null,
172
+ "task_specific_params": null,
173
+ "temperature": 1.0,
174
+ "tf_legacy_loss": false,
175
+ "tie_encoder_decoder": false,
176
+ "tie_word_embeddings": true,
177
+ "tokenizer_class": null,
178
+ "top_k": 50,
179
+ "top_p": 1.0,
180
+ "torch_dtype": null,
181
+ "torchscript": false,
182
+ "typical_p": 1.0,
183
+ "use_bfloat16": false
184
+ },
185
+ "audio_end_token_id": 151648,
186
+ "audio_start_token_id": 151647,
187
+ "audio_token_index": 151646,
188
+ "bos_token_id": 151644,
189
+ "eos_token_id": 151645,
190
+ "ignore_index": -100,
191
+ "image_token_index": 151655,
192
+ "init_std": 0.02,
193
+ "initializer_range": 0.02,
194
+ "model_type": "qwen2_5_omni_thinker",
195
+ "pad_token_id": 151643,
196
+ "position_id_per_seconds": 25,
197
+ "seconds_per_chunk": 2,
198
+ "text_config": {
199
+ "_name_or_path": "",
200
+ "add_cross_attention": false,
201
+ "architectures": null,
202
+ "attention_dropout": 0.0,
203
+ "bad_words_ids": null,
204
+ "begin_suppress_tokens": null,
205
+ "bos_token_id": null,
206
+ "chunk_size_feed_forward": 0,
207
+ "cross_attention_hidden_size": null,
208
+ "decoder_start_token_id": null,
209
+ "diversity_penalty": 0.0,
210
+ "do_sample": false,
211
+ "early_stopping": false,
212
+ "encoder_no_repeat_ngram_size": 0,
213
+ "eos_token_id": null,
214
+ "exponential_decay_length_penalty": null,
215
+ "finetuning_task": null,
216
+ "forced_bos_token_id": null,
217
+ "forced_eos_token_id": null,
218
+ "hidden_act": "silu",
219
+ "hidden_size": 3584,
220
+ "id2label": {
221
+ "0": "LABEL_0",
222
+ "1": "LABEL_1"
223
+ },
224
+ "init_std": 0.02,
225
+ "initializer_range": 0.02,
226
+ "intermediate_size": 18944,
227
+ "is_decoder": false,
228
+ "is_encoder_decoder": false,
229
+ "label2id": {
230
+ "LABEL_0": 0,
231
+ "LABEL_1": 1
232
+ },
233
+ "layer_types": [
234
+ "full_attention",
235
+ "full_attention",
236
+ "full_attention",
237
+ "full_attention",
238
+ "full_attention",
239
+ "full_attention",
240
+ "full_attention",
241
+ "full_attention",
242
+ "full_attention",
243
+ "full_attention",
244
+ "full_attention",
245
+ "full_attention",
246
+ "full_attention",
247
+ "full_attention",
248
+ "full_attention",
249
+ "full_attention",
250
+ "full_attention",
251
+ "full_attention",
252
+ "full_attention",
253
+ "full_attention",
254
+ "full_attention",
255
+ "full_attention",
256
+ "full_attention",
257
+ "full_attention",
258
+ "full_attention",
259
+ "full_attention",
260
+ "full_attention",
261
+ "full_attention"
262
+ ],
263
+ "length_penalty": 1.0,
264
+ "max_length": 20,
265
+ "max_position_embeddings": 32768,
266
+ "max_window_layers": 28,
267
+ "min_length": 0,
268
+ "model_type": "qwen2_5_omni_text",
269
+ "no_repeat_ngram_size": 0,
270
+ "num_attention_heads": 28,
271
+ "num_beam_groups": 1,
272
+ "num_beams": 1,
273
+ "num_hidden_layers": 28,
274
+ "num_key_value_heads": 4,
275
+ "num_return_sequences": 1,
276
+ "output_attentions": false,
277
+ "output_hidden_states": false,
278
+ "output_scores": false,
279
+ "pad_token_id": null,
280
+ "prefix": null,
281
+ "problem_type": null,
282
+ "pruned_heads": {},
283
+ "remove_invalid_values": false,
284
+ "repetition_penalty": 1.0,
285
+ "return_dict": true,
286
+ "return_dict_in_generate": false,
287
+ "rms_norm_eps": 1e-06,
288
+ "rope_scaling": {
289
+ "mrope_section": [
290
+ 16,
291
+ 24,
292
+ 24
293
+ ],
294
+ "rope_type": "default",
295
+ "type": "default"
296
+ },
297
+ "rope_theta": 1000000.0,
298
+ "sep_token_id": null,
299
+ "sliding_window": null,
300
+ "suppress_tokens": null,
301
+ "task_specific_params": null,
302
+ "temperature": 1.0,
303
+ "tf_legacy_loss": false,
304
+ "tie_encoder_decoder": false,
305
+ "tie_word_embeddings": false,
306
+ "tokenizer_class": null,
307
+ "top_k": 50,
308
+ "top_p": 1.0,
309
+ "torch_dtype": null,
310
+ "torchscript": false,
311
+ "typical_p": 1.0,
312
+ "use_bfloat16": false,
313
+ "use_cache": true,
314
+ "use_sliding_window": false,
315
+ "vocab_size": 152064
316
+ },
317
+ "torch_dtype": "bfloat16",
318
+ "user_token_id": 872,
319
+ "video_token_index": 151656,
320
+ "vision_config": {
321
+ "_name_or_path": "",
322
+ "add_cross_attention": false,
323
+ "architectures": null,
324
+ "bad_words_ids": null,
325
+ "begin_suppress_tokens": null,
326
+ "bos_token_id": null,
327
+ "chunk_size_feed_forward": 0,
328
+ "cross_attention_hidden_size": null,
329
+ "decoder_start_token_id": null,
330
+ "depth": 32,
331
+ "diversity_penalty": 0.0,
332
+ "do_sample": false,
333
+ "early_stopping": false,
334
+ "embed_dim": 1280,
335
+ "encoder_no_repeat_ngram_size": 0,
336
+ "eos_token_id": null,
337
+ "exponential_decay_length_penalty": null,
338
+ "finetuning_task": null,
339
+ "forced_bos_token_id": null,
340
+ "forced_eos_token_id": null,
341
+ "fullatt_block_indexes": [
342
+ 7,
343
+ 15,
344
+ 23,
345
+ 31
346
+ ],
347
+ "hidden_act": "silu",
348
+ "hidden_size": 1280,
349
+ "id2label": {
350
+ "0": "LABEL_0",
351
+ "1": "LABEL_1"
352
+ },
353
+ "in_channels": 3,
354
+ "in_chans": 3,
355
+ "init_std": 0.02,
356
+ "initializer_range": 0.02,
357
+ "intermediate_size": 3420,
358
+ "is_decoder": false,
359
+ "is_encoder_decoder": false,
360
+ "label2id": {
361
+ "LABEL_0": 0,
362
+ "LABEL_1": 1
363
+ },
364
+ "length_penalty": 1.0,
365
+ "max_length": 20,
366
+ "min_length": 0,
367
+ "model_type": "qwen2_5_omni_vision_encoder",
368
+ "no_repeat_ngram_size": 0,
369
+ "num_beam_groups": 1,
370
+ "num_beams": 1,
371
+ "num_heads": 16,
372
+ "num_return_sequences": 1,
373
+ "out_hidden_size": 3584,
374
+ "output_attentions": false,
375
+ "output_hidden_states": false,
376
+ "output_scores": false,
377
+ "pad_token_id": null,
378
+ "patch_size": 14,
379
+ "prefix": null,
380
+ "problem_type": null,
381
+ "pruned_heads": {},
382
+ "remove_invalid_values": false,
383
+ "repetition_penalty": 1.0,
384
+ "return_dict": true,
385
+ "return_dict_in_generate": false,
386
+ "sep_token_id": null,
387
+ "spatial_merge_size": 2,
388
+ "spatial_patch_size": 14,
389
+ "suppress_tokens": null,
390
+ "task_specific_params": null,
391
+ "temperature": 1.0,
392
+ "temporal_patch_size": 2,
393
+ "tf_legacy_loss": false,
394
+ "tie_encoder_decoder": false,
395
+ "tie_word_embeddings": true,
396
+ "tokenizer_class": null,
397
+ "tokens_per_second": 25,
398
+ "top_k": 50,
399
+ "top_p": 1.0,
400
+ "torch_dtype": null,
401
+ "torchscript": false,
402
+ "typical_p": 1.0,
403
+ "use_bfloat16": false,
404
+ "window_size": 112
405
+ },
406
+ "vision_end_token_id": 151653,
407
+ "vision_start_token_id": 151652,
408
+ "vision_token_id": 151654
409
+ },
410
+ "token2wav_config": {
411
+ "bigvgan_config": {
412
+ "_name_or_path": "",
413
+ "add_cross_attention": false,
414
+ "architectures": null,
415
+ "bad_words_ids": null,
416
+ "begin_suppress_tokens": null,
417
+ "bos_token_id": null,
418
+ "chunk_size_feed_forward": 0,
419
+ "cross_attention_hidden_size": null,
420
+ "decoder_start_token_id": null,
421
+ "diversity_penalty": 0.0,
422
+ "do_sample": false,
423
+ "early_stopping": false,
424
+ "encoder_no_repeat_ngram_size": 0,
425
+ "eos_token_id": null,
426
+ "exponential_decay_length_penalty": null,
427
+ "finetuning_task": null,
428
+ "forced_bos_token_id": null,
429
+ "forced_eos_token_id": null,
430
+ "id2label": {
431
+ "0": "LABEL_0",
432
+ "1": "LABEL_1"
433
+ },
434
+ "is_decoder": false,
435
+ "is_encoder_decoder": false,
436
+ "label2id": {
437
+ "LABEL_0": 0,
438
+ "LABEL_1": 1
439
+ },
440
+ "length_penalty": 1.0,
441
+ "max_length": 20,
442
+ "mel_dim": 80,
443
+ "min_length": 0,
444
+ "model_type": "qwen2_5_omni_bigvgan",
445
+ "no_repeat_ngram_size": 0,
446
+ "num_beam_groups": 1,
447
+ "num_beams": 1,
448
+ "num_return_sequences": 1,
449
+ "output_attentions": false,
450
+ "output_hidden_states": false,
451
+ "output_scores": false,
452
+ "pad_token_id": null,
453
+ "prefix": null,
454
+ "problem_type": null,
455
+ "pruned_heads": {},
456
+ "remove_invalid_values": false,
457
+ "repetition_penalty": 1.0,
458
+ "resblock_dilation_sizes": [
459
+ [
460
+ 1,
461
+ 3,
462
+ 5
463
+ ],
464
+ [
465
+ 1,
466
+ 3,
467
+ 5
468
+ ],
469
+ [
470
+ 1,
471
+ 3,
472
+ 5
473
+ ]
474
+ ],
475
+ "resblock_kernel_sizes": [
476
+ 3,
477
+ 7,
478
+ 11
479
+ ],
480
+ "return_dict": true,
481
+ "return_dict_in_generate": false,
482
+ "sep_token_id": null,
483
+ "suppress_tokens": null,
484
+ "task_specific_params": null,
485
+ "temperature": 1.0,
486
+ "tf_legacy_loss": false,
487
+ "tie_encoder_decoder": false,
488
+ "tie_word_embeddings": true,
489
+ "tokenizer_class": null,
490
+ "top_k": 50,
491
+ "top_p": 1.0,
492
+ "torch_dtype": null,
493
+ "torchscript": false,
494
+ "typical_p": 1.0,
495
+ "upsample_initial_channel": 1536,
496
+ "upsample_kernel_sizes": [
497
+ 11,
498
+ 7,
499
+ 4,
500
+ 4,
501
+ 4,
502
+ 4
503
+ ],
504
+ "upsample_rates": [
505
+ 5,
506
+ 3,
507
+ 2,
508
+ 2,
509
+ 2,
510
+ 2
511
+ ],
512
+ "use_bfloat16": false,
513
+ "use_bias_at_final": false
514
+ },
515
+ "dit_config": {
516
+ "_name_or_path": "",
517
+ "add_cross_attention": false,
518
+ "architectures": null,
519
+ "bad_words_ids": null,
520
+ "begin_suppress_tokens": null,
521
+ "block_size": 24,
522
+ "bos_token_id": null,
523
+ "chunk_size_feed_forward": 0,
524
+ "cross_attention_hidden_size": null,
525
+ "decoder_start_token_id": null,
526
+ "depth": 22,
527
+ "dim": 1024,
528
+ "diversity_penalty": 0.0,
529
+ "do_sample": false,
530
+ "dropout": 0.1,
531
+ "early_stopping": false,
532
+ "emb_dim": 512,
533
+ "enc_attention_channels": 64,
534
+ "enc_channels": [
535
+ 256,
536
+ 256,
537
+ 256,
538
+ 256,
539
+ 768
540
+ ],
541
+ "enc_dilations": [
542
+ 1,
543
+ 2,
544
+ 3,
545
+ 4,
546
+ 1
547
+ ],
548
+ "enc_dim": 128,
549
+ "enc_emb_dim": 192,
550
+ "enc_global_context": true,
551
+ "enc_kernel_sizes": [
552
+ 5,
553
+ 3,
554
+ 3,
555
+ 3,
556
+ 1
557
+ ],
558
+ "enc_lin_neurons": 192,
559
+ "enc_res2net_scale": 2,
560
+ "enc_se_channels": 64,
561
+ "encoder_no_repeat_ngram_size": 0,
562
+ "eos_token_id": null,
563
+ "exponential_decay_length_penalty": null,
564
+ "ff_mult": 2,
565
+ "finetuning_task": null,
566
+ "forced_bos_token_id": null,
567
+ "forced_eos_token_id": null,
568
+ "head_dim": 64,
569
+ "heads": 16,
570
+ "hidden_size": 1024,
571
+ "id2label": {
572
+ "0": "LABEL_0",
573
+ "1": "LABEL_1"
574
+ },
575
+ "is_decoder": false,
576
+ "is_encoder_decoder": false,
577
+ "label2id": {
578
+ "LABEL_0": 0,
579
+ "LABEL_1": 1
580
+ },
581
+ "length_penalty": 1.0,
582
+ "look_ahead_layers": [
583
+ 10
584
+ ],
585
+ "look_backward_layers": [
586
+ 0,
587
+ 20
588
+ ],
589
+ "max_length": 20,
590
+ "max_position_embeddings": 32768,
591
+ "mel_dim": 80,
592
+ "min_length": 0,
593
+ "model_type": "qwen2_5_omni_dit",
594
+ "no_repeat_ngram_size": 0,
595
+ "num_attention_heads": 16,
596
+ "num_beam_groups": 1,
597
+ "num_beams": 1,
598
+ "num_embeds": 8193,
599
+ "num_hidden_layers": 22,
600
+ "num_return_sequences": 1,
601
+ "output_attentions": false,
602
+ "output_hidden_states": false,
603
+ "output_scores": false,
604
+ "pad_token_id": null,
605
+ "prefix": null,
606
+ "problem_type": null,
607
+ "pruned_heads": {},
608
+ "remove_invalid_values": false,
609
+ "repeats": 2,
610
+ "repetition_penalty": 1.0,
611
+ "return_dict": true,
612
+ "return_dict_in_generate": false,
613
+ "rope_theta": 10000.0,
614
+ "sep_token_id": null,
615
+ "suppress_tokens": null,
616
+ "task_specific_params": null,
617
+ "temperature": 1.0,
618
+ "tf_legacy_loss": false,
619
+ "tie_encoder_decoder": false,
620
+ "tie_word_embeddings": true,
621
+ "tokenizer_class": null,
622
+ "top_k": 50,
623
+ "top_p": 1.0,
624
+ "torch_dtype": "float32",
625
+ "torchscript": false,
626
+ "typical_p": 1.0,
627
+ "use_bfloat16": false
628
+ },
629
+ "model_type": "qwen2_5_omni_token2wav",
630
+ "pad_token_id": 151643,
631
+ "torch_dtype": "bfloat16"
632
+ },
633
+ "torch_dtype": "bfloat16",
634
+ "transformers_version": "4.53.3"
635
+ }