lematt1991 committed on
Commit
dc6f393
·
verified ·
1 Parent(s): 741c530

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -1,209 +1,8 @@
1
  {
2
- "audio_video_model": {
3
- "video_model": {
4
- "clip_vision_model": {
5
- "architecture": "vit_pe_core_large_patch14_336",
6
- "do_pooling": true,
7
- "global_pool": "map",
8
- "initializer_range": 0.02,
9
- "model_args": {},
10
- "num_labels": 1024,
11
- "model_type": "timm_wrapper"
12
- },
13
- "transformer": {
14
- "vocab_size": 151936,
15
- "max_position_embeddings": 10000,
16
- "hidden_size": 768,
17
- "intermediate_size": 2048,
18
- "num_hidden_layers": 4,
19
- "num_attention_heads": 6,
20
- "use_sliding_window": false,
21
- "sliding_window": null,
22
- "max_window_layers": 28,
23
- "num_key_value_heads": 6,
24
- "head_dim": 128,
25
- "hidden_act": "silu",
26
- "initializer_range": 0.02,
27
- "rms_norm_eps": 1e-05,
28
- "use_cache": true,
29
- "rope_theta": 20000,
30
- "rope_scaling": null,
31
- "attention_bias": false,
32
- "attention_dropout": 0.0,
33
- "layer_types": [
34
- "full_attention",
35
- "full_attention",
36
- "full_attention",
37
- "full_attention"
38
- ],
39
- "return_dict": true,
40
- "output_hidden_states": false,
41
- "torchscript": false,
42
- "dtype": null,
43
- "pruned_heads": {},
44
- "tie_word_embeddings": false,
45
- "chunk_size_feed_forward": 0,
46
- "is_encoder_decoder": false,
47
- "is_decoder": false,
48
- "cross_attention_hidden_size": null,
49
- "add_cross_attention": false,
50
- "tie_encoder_decoder": false,
51
- "architectures": null,
52
- "finetuning_task": null,
53
- "id2label": {
54
- "0": "LABEL_0",
55
- "1": "LABEL_1"
56
- },
57
- "label2id": {
58
- "LABEL_0": 0,
59
- "LABEL_1": 1
60
- },
61
- "task_specific_params": null,
62
- "problem_type": null,
63
- "tokenizer_class": null,
64
- "prefix": null,
65
- "bos_token_id": null,
66
- "pad_token_id": null,
67
- "eos_token_id": null,
68
- "sep_token_id": null,
69
- "decoder_start_token_id": null,
70
- "max_length": 20,
71
- "min_length": 0,
72
- "do_sample": false,
73
- "early_stopping": false,
74
- "num_beams": 1,
75
- "temperature": 1.0,
76
- "top_k": 50,
77
- "top_p": 1.0,
78
- "typical_p": 1.0,
79
- "repetition_penalty": 1.0,
80
- "length_penalty": 1.0,
81
- "no_repeat_ngram_size": 0,
82
- "encoder_no_repeat_ngram_size": 0,
83
- "bad_words_ids": null,
84
- "num_return_sequences": 1,
85
- "output_scores": false,
86
- "return_dict_in_generate": false,
87
- "forced_bos_token_id": null,
88
- "forced_eos_token_id": null,
89
- "remove_invalid_values": false,
90
- "exponential_decay_length_penalty": null,
91
- "suppress_tokens": null,
92
- "begin_suppress_tokens": null,
93
- "num_beam_groups": 1,
94
- "diversity_penalty": 0.0,
95
- "_name_or_path": "",
96
- "transformers_version": "4.57.0.dev0",
97
- "tf_legacy_loss": false,
98
- "use_bfloat16": false,
99
- "model_type": "qwen3",
100
- "output_attentions": false
101
- },
102
- "text_model": {
103
- "return_dict": true,
104
- "output_hidden_states": false,
105
- "torchscript": false,
106
- "dtype": "float32",
107
- "pruned_heads": {},
108
- "tie_word_embeddings": true,
109
- "chunk_size_feed_forward": 0,
110
- "is_encoder_decoder": false,
111
- "is_decoder": false,
112
- "cross_attention_hidden_size": null,
113
- "add_cross_attention": false,
114
- "tie_encoder_decoder": false,
115
- "architectures": [
116
- "ModernBertForMaskedLM"
117
- ],
118
- "finetuning_task": null,
119
- "id2label": {
120
- "0": "LABEL_0",
121
- "1": "LABEL_1"
122
- },
123
- "label2id": {
124
- "LABEL_0": 0,
125
- "LABEL_1": 1
126
- },
127
- "task_specific_params": null,
128
- "problem_type": null,
129
- "tokenizer_class": null,
130
- "prefix": null,
131
- "bos_token_id": 50281,
132
- "pad_token_id": 50283,
133
- "eos_token_id": 50282,
134
- "sep_token_id": 50282,
135
- "decoder_start_token_id": null,
136
- "max_length": 20,
137
- "min_length": 0,
138
- "do_sample": false,
139
- "early_stopping": false,
140
- "num_beams": 1,
141
- "temperature": 1.0,
142
- "top_k": 50,
143
- "top_p": 1.0,
144
- "typical_p": 1.0,
145
- "repetition_penalty": 1.0,
146
- "length_penalty": 1.0,
147
- "no_repeat_ngram_size": 0,
148
- "encoder_no_repeat_ngram_size": 0,
149
- "bad_words_ids": null,
150
- "num_return_sequences": 1,
151
- "output_scores": false,
152
- "return_dict_in_generate": false,
153
- "forced_bos_token_id": null,
154
- "forced_eos_token_id": null,
155
- "remove_invalid_values": false,
156
- "exponential_decay_length_penalty": null,
157
- "suppress_tokens": null,
158
- "begin_suppress_tokens": null,
159
- "num_beam_groups": 1,
160
- "diversity_penalty": 0.0,
161
- "_name_or_path": "answerdotai/ModernBERT-large",
162
- "transformers_version": "4.57.0.dev0",
163
- "cls_token_id": 50281,
164
- "gradient_checkpointing": false,
165
- "layer_norm_eps": 1e-05,
166
- "model_type": "modernbert",
167
- "position_embedding_type": "absolute",
168
- "tf_legacy_loss": false,
169
- "use_bfloat16": false,
170
- "vocab_size": 50368,
171
- "max_position_embeddings": 8192,
172
- "hidden_size": 1024,
173
- "intermediate_size": 2624,
174
- "num_hidden_layers": 28,
175
- "num_attention_heads": 16,
176
- "initializer_range": 0.02,
177
- "initializer_cutoff_factor": 2.0,
178
- "norm_eps": 1e-05,
179
- "norm_bias": false,
180
- "global_rope_theta": 160000.0,
181
- "attention_bias": false,
182
- "attention_dropout": 0.0,
183
- "hidden_activation": "gelu",
184
- "global_attn_every_n_layers": 3,
185
- "local_attention": 128,
186
- "local_rope_theta": 10000.0,
187
- "embedding_dropout": 0.0,
188
- "mlp_bias": false,
189
- "mlp_dropout": 0.0,
190
- "decoder_bias": true,
191
- "classifier_pooling": "mean",
192
- "classifier_dropout": 0.0,
193
- "classifier_bias": false,
194
- "classifier_activation": "gelu",
195
- "deterministic_flash_attn": false,
196
- "sparse_prediction": false,
197
- "sparse_pred_ignore_index": -100,
198
- "repad_logits_with_grad": false,
199
- "output_attentions": false
200
- },
201
- "output_dim": 1024,
202
- "fixed_len_video": false,
203
- "nth_text_layer": 22
204
- },
205
- "audio_model": {
206
- "dac_vae_encoder": {
207
  "encoder_hidden_size": 64,
208
  "downsampling_ratios": [
209
  2,
@@ -218,396 +17,62 @@
218
  "quantizer_dropout": 0,
219
  "sampling_rate": 48000
220
  },
221
- "transformer": {
222
- "vocab_size": 151936,
223
- "max_position_embeddings": 10000,
224
- "hidden_size": 768,
225
- "intermediate_size": 2048,
226
- "num_hidden_layers": 12,
227
- "num_attention_heads": 6,
228
- "use_sliding_window": false,
229
- "sliding_window": null,
230
- "max_window_layers": 28,
231
- "num_key_value_heads": 6,
232
- "head_dim": 128,
233
- "hidden_act": "silu",
234
- "initializer_range": 0.02,
235
- "rms_norm_eps": 1e-05,
236
- "use_cache": true,
237
- "rope_theta": 20000,
238
- "rope_scaling": null,
239
- "attention_bias": false,
240
- "attention_dropout": 0.0,
241
- "layer_types": [
242
- "full_attention",
243
- "full_attention",
244
- "full_attention",
245
- "full_attention",
246
- "full_attention",
247
- "full_attention",
248
- "full_attention",
249
- "full_attention",
250
- "full_attention",
251
- "full_attention",
252
- "full_attention",
253
- "full_attention"
254
- ],
255
- "return_dict": true,
256
- "output_hidden_states": false,
257
- "torchscript": false,
258
- "dtype": null,
259
- "pruned_heads": {},
260
- "tie_word_embeddings": false,
261
- "chunk_size_feed_forward": 0,
262
- "is_encoder_decoder": false,
263
- "is_decoder": false,
264
- "cross_attention_hidden_size": null,
265
- "add_cross_attention": false,
266
- "tie_encoder_decoder": false,
267
- "architectures": null,
268
- "finetuning_task": null,
269
- "id2label": {
270
- "0": "LABEL_0",
271
- "1": "LABEL_1"
272
- },
273
- "label2id": {
274
- "LABEL_0": 0,
275
- "LABEL_1": 1
276
- },
277
- "task_specific_params": null,
278
- "problem_type": null,
279
- "tokenizer_class": null,
280
- "prefix": null,
281
- "bos_token_id": null,
282
- "pad_token_id": null,
283
- "eos_token_id": null,
284
- "sep_token_id": null,
285
- "decoder_start_token_id": null,
286
- "max_length": 20,
287
- "min_length": 0,
288
- "do_sample": false,
289
- "early_stopping": false,
290
- "num_beams": 1,
291
- "temperature": 1.0,
292
- "top_k": 50,
293
- "top_p": 1.0,
294
- "typical_p": 1.0,
295
- "repetition_penalty": 1.0,
296
- "length_penalty": 1.0,
297
- "no_repeat_ngram_size": 0,
298
- "encoder_no_repeat_ngram_size": 0,
299
- "bad_words_ids": null,
300
- "num_return_sequences": 1,
301
- "output_scores": false,
302
- "return_dict_in_generate": false,
303
- "forced_bos_token_id": null,
304
- "forced_eos_token_id": null,
305
- "remove_invalid_values": false,
306
- "exponential_decay_length_penalty": null,
307
- "suppress_tokens": null,
308
- "begin_suppress_tokens": null,
309
- "num_beam_groups": 1,
310
- "diversity_penalty": 0.0,
311
- "_name_or_path": "",
312
- "transformers_version": "4.57.0.dev0",
313
- "tf_legacy_loss": false,
314
- "use_bfloat16": false,
315
- "model_type": "qwen3",
316
- "output_attentions": false
317
  },
318
- "text_model": {
319
- "return_dict": true,
320
- "output_hidden_states": false,
321
- "torchscript": false,
322
- "dtype": "float32",
323
- "pruned_heads": {},
324
- "tie_word_embeddings": true,
325
- "chunk_size_feed_forward": 0,
326
- "is_encoder_decoder": false,
327
- "is_decoder": false,
328
- "cross_attention_hidden_size": null,
329
- "add_cross_attention": false,
330
- "tie_encoder_decoder": false,
331
- "architectures": [
332
- "ModernBertForMaskedLM"
333
- ],
334
- "finetuning_task": null,
335
- "id2label": {
336
- "0": "LABEL_0",
337
- "1": "LABEL_1"
338
- },
339
- "label2id": {
340
- "LABEL_0": 0,
341
- "LABEL_1": 1
342
- },
343
- "task_specific_params": null,
344
- "problem_type": null,
345
- "tokenizer_class": null,
346
- "prefix": null,
347
- "bos_token_id": 50281,
348
- "pad_token_id": 50283,
349
- "eos_token_id": 50282,
350
- "sep_token_id": 50282,
351
- "decoder_start_token_id": null,
352
- "max_length": 20,
353
- "min_length": 0,
354
- "do_sample": false,
355
- "early_stopping": false,
356
- "num_beams": 1,
357
- "temperature": 1.0,
358
- "top_k": 50,
359
- "top_p": 1.0,
360
- "typical_p": 1.0,
361
- "repetition_penalty": 1.0,
362
- "length_penalty": 1.0,
363
- "no_repeat_ngram_size": 0,
364
- "encoder_no_repeat_ngram_size": 0,
365
- "bad_words_ids": null,
366
- "num_return_sequences": 1,
367
- "output_scores": false,
368
- "return_dict_in_generate": false,
369
- "forced_bos_token_id": null,
370
- "forced_eos_token_id": null,
371
- "remove_invalid_values": false,
372
- "exponential_decay_length_penalty": null,
373
- "suppress_tokens": null,
374
- "begin_suppress_tokens": null,
375
- "num_beam_groups": 1,
376
- "diversity_penalty": 0.0,
377
- "_name_or_path": "answerdotai/ModernBERT-large",
378
- "transformers_version": "4.57.0.dev0",
379
- "cls_token_id": 50281,
380
- "gradient_checkpointing": false,
381
- "layer_norm_eps": 1e-05,
382
- "model_type": "modernbert",
383
- "position_embedding_type": "absolute",
384
- "tf_legacy_loss": false,
385
- "use_bfloat16": false,
386
- "vocab_size": 50368,
387
- "max_position_embeddings": 8192,
388
- "hidden_size": 1024,
389
- "intermediate_size": 2624,
390
- "num_hidden_layers": 28,
391
- "num_attention_heads": 16,
392
  "initializer_range": 0.02,
393
- "initializer_cutoff_factor": 2.0,
394
- "norm_eps": 1e-05,
395
- "norm_bias": false,
396
- "global_rope_theta": 160000.0,
397
- "attention_bias": false,
398
- "attention_dropout": 0.0,
399
- "hidden_activation": "gelu",
400
- "global_attn_every_n_layers": 3,
401
- "local_attention": 128,
402
- "local_rope_theta": 10000.0,
403
- "embedding_dropout": 0.0,
404
- "mlp_bias": false,
405
- "mlp_dropout": 0.0,
406
- "decoder_bias": true,
407
- "classifier_pooling": "mean",
408
- "classifier_dropout": 0.0,
409
- "classifier_bias": false,
410
- "classifier_activation": "gelu",
411
- "deterministic_flash_attn": false,
412
- "sparse_prediction": false,
413
- "sparse_pred_ignore_index": -100,
414
- "repad_logits_with_grad": false,
415
- "output_attentions": false
416
  },
417
- "output_dim": 1024,
418
- "nth_text_layer": 22
419
- },
420
- "transformer": {
421
- "vocab_size": 151936,
422
- "max_position_embeddings": 10000,
423
  "hidden_size": 768,
424
  "intermediate_size": 2048,
425
- "num_hidden_layers": 6,
426
  "num_attention_heads": 6,
427
- "use_sliding_window": false,
428
- "sliding_window": null,
429
- "max_window_layers": 28,
430
- "num_key_value_heads": 6,
431
  "head_dim": 128,
 
432
  "hidden_act": "silu",
 
433
  "initializer_range": 0.02,
434
  "rms_norm_eps": 1e-05,
435
- "use_cache": true,
436
- "rope_theta": 20000,
437
- "rope_scaling": null,
438
- "attention_bias": false,
439
- "attention_dropout": 0.0,
440
- "layer_types": [
441
- "full_attention",
442
- "full_attention",
443
- "full_attention",
444
- "full_attention",
445
- "full_attention",
446
- "full_attention"
447
- ],
448
- "return_dict": true,
449
- "output_hidden_states": false,
450
- "torchscript": false,
451
- "dtype": null,
452
- "pruned_heads": {},
453
- "tie_word_embeddings": false,
454
- "chunk_size_feed_forward": 0,
455
- "is_encoder_decoder": false,
456
- "is_decoder": false,
457
- "cross_attention_hidden_size": null,
458
- "add_cross_attention": false,
459
- "tie_encoder_decoder": false,
460
- "architectures": null,
461
- "finetuning_task": null,
462
- "id2label": {
463
- "0": "LABEL_0",
464
- "1": "LABEL_1"
465
- },
466
- "label2id": {
467
- "LABEL_0": 0,
468
- "LABEL_1": 1
469
  },
470
- "task_specific_params": null,
471
- "problem_type": null,
472
- "tokenizer_class": null,
473
- "prefix": null,
474
- "bos_token_id": null,
475
- "pad_token_id": null,
476
- "eos_token_id": null,
477
- "sep_token_id": null,
478
- "decoder_start_token_id": null,
479
- "max_length": 20,
480
- "min_length": 0,
481
- "do_sample": false,
482
- "early_stopping": false,
483
- "num_beams": 1,
484
- "temperature": 1.0,
485
- "top_k": 50,
486
- "top_p": 1.0,
487
- "typical_p": 1.0,
488
- "repetition_penalty": 1.0,
489
- "length_penalty": 1.0,
490
- "no_repeat_ngram_size": 0,
491
- "encoder_no_repeat_ngram_size": 0,
492
- "bad_words_ids": null,
493
- "num_return_sequences": 1,
494
- "output_scores": false,
495
- "return_dict_in_generate": false,
496
- "forced_bos_token_id": null,
497
- "forced_eos_token_id": null,
498
- "remove_invalid_values": false,
499
- "exponential_decay_length_penalty": null,
500
- "suppress_tokens": null,
501
- "begin_suppress_tokens": null,
502
- "num_beam_groups": 1,
503
- "diversity_penalty": 0.0,
504
- "_name_or_path": "",
505
- "transformers_version": "4.57.0.dev0",
506
- "tf_legacy_loss": false,
507
- "use_bfloat16": false,
508
- "model_type": "qwen3",
509
- "output_attentions": false
510
- }
511
- },
512
- "text_model": {
513
- "return_dict": true,
514
- "output_hidden_states": false,
515
- "torchscript": false,
516
- "dtype": "float32",
517
- "pruned_heads": {},
518
- "tie_word_embeddings": true,
519
- "chunk_size_feed_forward": 0,
520
- "is_encoder_decoder": false,
521
- "is_decoder": false,
522
- "cross_attention_hidden_size": null,
523
- "add_cross_attention": false,
524
- "tie_encoder_decoder": false,
525
- "architectures": [
526
- "ModernBertForMaskedLM"
527
- ],
528
- "finetuning_task": null,
529
- "id2label": {
530
- "0": "LABEL_0",
531
- "1": "LABEL_1"
532
- },
533
- "label2id": {
534
- "LABEL_0": 0,
535
- "LABEL_1": 1
536
  },
537
- "task_specific_params": null,
538
- "problem_type": null,
539
- "tokenizer_class": null,
540
- "prefix": null,
541
- "bos_token_id": 50281,
542
- "pad_token_id": 50283,
543
- "eos_token_id": 50282,
544
- "sep_token_id": 50282,
545
- "decoder_start_token_id": null,
546
- "max_length": 20,
547
- "min_length": 0,
548
- "do_sample": false,
549
- "early_stopping": false,
550
- "num_beams": 1,
551
- "temperature": 1.0,
552
- "top_k": 50,
553
- "top_p": 1.0,
554
- "typical_p": 1.0,
555
- "repetition_penalty": 1.0,
556
- "length_penalty": 1.0,
557
- "no_repeat_ngram_size": 0,
558
- "encoder_no_repeat_ngram_size": 0,
559
- "bad_words_ids": null,
560
- "num_return_sequences": 1,
561
- "output_scores": false,
562
- "return_dict_in_generate": false,
563
- "forced_bos_token_id": null,
564
- "forced_eos_token_id": null,
565
- "remove_invalid_values": false,
566
- "exponential_decay_length_penalty": null,
567
- "suppress_tokens": null,
568
- "begin_suppress_tokens": null,
569
- "num_beam_groups": 1,
570
- "diversity_penalty": 0.0,
571
- "_name_or_path": "answerdotai/ModernBERT-large",
572
- "transformers_version": "4.57.0.dev0",
573
- "cls_token_id": 50281,
574
- "gradient_checkpointing": false,
575
- "layer_norm_eps": 1e-05,
576
- "model_type": "modernbert",
577
- "position_embedding_type": "absolute",
578
- "tf_legacy_loss": false,
579
- "use_bfloat16": false,
580
- "vocab_size": 50368,
581
- "max_position_embeddings": 8192,
582
- "hidden_size": 1024,
583
- "intermediate_size": 2624,
584
- "num_hidden_layers": 28,
585
- "num_attention_heads": 16,
586
  "initializer_range": 0.02,
587
- "initializer_cutoff_factor": 2.0,
588
- "norm_eps": 1e-05,
589
- "norm_bias": false,
590
- "global_rope_theta": 160000.0,
591
  "attention_bias": false,
592
- "attention_dropout": 0.0,
593
- "hidden_activation": "gelu",
594
- "global_attn_every_n_layers": 3,
595
- "local_attention": 128,
596
- "local_rope_theta": 10000.0,
597
- "embedding_dropout": 0.0,
598
- "mlp_bias": false,
599
- "mlp_dropout": 0.0,
600
- "decoder_bias": true,
601
- "classifier_pooling": "mean",
602
- "classifier_dropout": 0.0,
603
- "classifier_bias": false,
604
- "classifier_activation": "gelu",
605
- "deterministic_flash_attn": false,
606
- "sparse_prediction": false,
607
- "sparse_pred_ignore_index": -100,
608
- "repad_logits_with_grad": false,
609
- "output_attentions": false
610
- },
611
- "output_dim": 1024,
612
- "nth_text_layer": 22
613
  }
 
1
  {
2
+ "model_type": "pe_audio_video",
3
+ "audio_video_config": {
4
+ "audio_config": {
5
+ "dac_config": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  "encoder_hidden_size": 64,
7
  "downsampling_ratios": [
8
  2,
 
17
  "quantizer_dropout": 0,
18
  "sampling_rate": 48000
19
  },
20
+ "hidden_size": 768,
21
+ "intermediate_size": 2048,
22
+ "num_hidden_layers": 12,
23
+ "num_attention_heads": 6,
24
+ "head_dim": 128,
25
+ "num_key_value_heads": null,
26
+ "hidden_act": "silu",
27
+ "max_position_embeddings": 10000,
28
+ "initializer_range": 0.02,
29
+ "rms_norm_eps": 1e-05,
30
+ "rope_parameters": {
31
+ "rope_theta": 20000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  },
33
+ "attention_bias": false,
34
+ "attention_dropout": 0.0
35
+ },
36
+ "video_config": {
37
+ "vision_config": {
38
+ "architecture": "vit_pe_core_large_patch14_336",
39
+ "do_pooling": true,
40
+ "global_pool": "map",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  "initializer_range": 0.02,
42
+ "model_args": {},
43
+ "num_labels": 1024,
44
+ "model_type": "timm_wrapper"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  },
 
 
 
 
 
 
46
  "hidden_size": 768,
47
  "intermediate_size": 2048,
48
+ "num_hidden_layers": 4,
49
  "num_attention_heads": 6,
 
 
 
 
50
  "head_dim": 128,
51
+ "num_key_value_heads": null,
52
  "hidden_act": "silu",
53
+ "max_position_embeddings": 10000,
54
  "initializer_range": 0.02,
55
  "rms_norm_eps": 1e-05,
56
+ "rope_parameters": {
57
+ "rope_theta": 20000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  },
59
+ "attention_bias": false,
60
+ "attention_dropout": 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  },
62
+ "hidden_size": 768,
63
+ "intermediate_size": 2048,
64
+ "num_hidden_layers": 6,
65
+ "num_attention_heads": 6,
66
+ "head_dim": 128,
67
+ "num_key_value_heads": null,
68
+ "hidden_act": "silu",
69
+ "max_position_embeddings": 10000,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  "initializer_range": 0.02,
71
+ "rms_norm_eps": 1e-05,
72
+ "rope_parameters": {
73
+ "rope_theta": 20000
74
+ },
75
  "attention_bias": false,
76
+ "attention_dropout": 0.0
77
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1020444e1853e50d26280211715d2f48dcc110731411622e5176ec87acd0bed5
3
- size 3683318128
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd492748b94339c873b0d5795cf85a2262f21118f7d57bfb9abc8d61adefa682
3
+ size 3388617568
preprocessor_config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "feature_extractor_type": "PEAudioVideoFeatureExtractor",
3
  "feature_size": 1,
4
  "hop_length": 1920,
5
  "padding_side": "right",
 
1
  {
2
+ "feature_extractor_type": "PeAudioFeatureExtractor",
3
  "feature_size": 1,
4
  "hop_length": 1920,
5
  "padding_side": "right",
video_preprocessor_config.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "do_sample_frames": false,
3
  "num_frames": null,
4
  "size": {
 
1
  {
2
+ "video_processor_type": "PeVideoVideoProcessor",
3
  "do_sample_frames": false,
4
  "num_frames": null,
5
  "size": {