lematt1991 committed on
Commit
bbb02cf
·
verified ·
1 Parent(s): e9b996f

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -1,209 +1,8 @@
1
  {
2
- "audio_video_model": {
3
- "video_model": {
4
- "clip_vision_model": {
5
- "architecture": "vit_pe_core_large_patch14_336",
6
- "do_pooling": true,
7
- "global_pool": "map",
8
- "initializer_range": 0.02,
9
- "model_args": {},
10
- "num_labels": 1024,
11
- "model_type": "timm_wrapper"
12
- },
13
- "transformer": {
14
- "vocab_size": 151936,
15
- "max_position_embeddings": 10000,
16
- "hidden_size": 1024,
17
- "intermediate_size": 2752,
18
- "num_hidden_layers": 4,
19
- "num_attention_heads": 8,
20
- "use_sliding_window": false,
21
- "sliding_window": null,
22
- "max_window_layers": 28,
23
- "num_key_value_heads": 8,
24
- "head_dim": 128,
25
- "hidden_act": "silu",
26
- "initializer_range": 0.02,
27
- "rms_norm_eps": 1e-05,
28
- "use_cache": true,
29
- "rope_theta": 20000,
30
- "rope_scaling": null,
31
- "attention_bias": false,
32
- "attention_dropout": 0.0,
33
- "layer_types": [
34
- "full_attention",
35
- "full_attention",
36
- "full_attention",
37
- "full_attention"
38
- ],
39
- "return_dict": true,
40
- "output_hidden_states": false,
41
- "torchscript": false,
42
- "dtype": null,
43
- "pruned_heads": {},
44
- "tie_word_embeddings": false,
45
- "chunk_size_feed_forward": 0,
46
- "is_encoder_decoder": false,
47
- "is_decoder": false,
48
- "cross_attention_hidden_size": null,
49
- "add_cross_attention": false,
50
- "tie_encoder_decoder": false,
51
- "architectures": null,
52
- "finetuning_task": null,
53
- "id2label": {
54
- "0": "LABEL_0",
55
- "1": "LABEL_1"
56
- },
57
- "label2id": {
58
- "LABEL_0": 0,
59
- "LABEL_1": 1
60
- },
61
- "task_specific_params": null,
62
- "problem_type": null,
63
- "tokenizer_class": null,
64
- "prefix": null,
65
- "bos_token_id": null,
66
- "pad_token_id": null,
67
- "eos_token_id": null,
68
- "sep_token_id": null,
69
- "decoder_start_token_id": null,
70
- "max_length": 20,
71
- "min_length": 0,
72
- "do_sample": false,
73
- "early_stopping": false,
74
- "num_beams": 1,
75
- "temperature": 1.0,
76
- "top_k": 50,
77
- "top_p": 1.0,
78
- "typical_p": 1.0,
79
- "repetition_penalty": 1.0,
80
- "length_penalty": 1.0,
81
- "no_repeat_ngram_size": 0,
82
- "encoder_no_repeat_ngram_size": 0,
83
- "bad_words_ids": null,
84
- "num_return_sequences": 1,
85
- "output_scores": false,
86
- "return_dict_in_generate": false,
87
- "forced_bos_token_id": null,
88
- "forced_eos_token_id": null,
89
- "remove_invalid_values": false,
90
- "exponential_decay_length_penalty": null,
91
- "suppress_tokens": null,
92
- "begin_suppress_tokens": null,
93
- "num_beam_groups": 1,
94
- "diversity_penalty": 0.0,
95
- "_name_or_path": "",
96
- "transformers_version": "4.57.0.dev0",
97
- "tf_legacy_loss": false,
98
- "use_bfloat16": false,
99
- "model_type": "qwen3",
100
- "output_attentions": false
101
- },
102
- "text_model": {
103
- "return_dict": true,
104
- "output_hidden_states": false,
105
- "torchscript": false,
106
- "dtype": "float32",
107
- "pruned_heads": {},
108
- "tie_word_embeddings": true,
109
- "chunk_size_feed_forward": 0,
110
- "is_encoder_decoder": false,
111
- "is_decoder": false,
112
- "cross_attention_hidden_size": null,
113
- "add_cross_attention": false,
114
- "tie_encoder_decoder": false,
115
- "architectures": [
116
- "ModernBertForMaskedLM"
117
- ],
118
- "finetuning_task": null,
119
- "id2label": {
120
- "0": "LABEL_0",
121
- "1": "LABEL_1"
122
- },
123
- "label2id": {
124
- "LABEL_0": 0,
125
- "LABEL_1": 1
126
- },
127
- "task_specific_params": null,
128
- "problem_type": null,
129
- "tokenizer_class": null,
130
- "prefix": null,
131
- "bos_token_id": 50281,
132
- "pad_token_id": 50283,
133
- "eos_token_id": 50282,
134
- "sep_token_id": 50282,
135
- "decoder_start_token_id": null,
136
- "max_length": 20,
137
- "min_length": 0,
138
- "do_sample": false,
139
- "early_stopping": false,
140
- "num_beams": 1,
141
- "temperature": 1.0,
142
- "top_k": 50,
143
- "top_p": 1.0,
144
- "typical_p": 1.0,
145
- "repetition_penalty": 1.0,
146
- "length_penalty": 1.0,
147
- "no_repeat_ngram_size": 0,
148
- "encoder_no_repeat_ngram_size": 0,
149
- "bad_words_ids": null,
150
- "num_return_sequences": 1,
151
- "output_scores": false,
152
- "return_dict_in_generate": false,
153
- "forced_bos_token_id": null,
154
- "forced_eos_token_id": null,
155
- "remove_invalid_values": false,
156
- "exponential_decay_length_penalty": null,
157
- "suppress_tokens": null,
158
- "begin_suppress_tokens": null,
159
- "num_beam_groups": 1,
160
- "diversity_penalty": 0.0,
161
- "_name_or_path": "answerdotai/ModernBERT-large",
162
- "transformers_version": "4.57.0.dev0",
163
- "cls_token_id": 50281,
164
- "gradient_checkpointing": false,
165
- "layer_norm_eps": 1e-05,
166
- "model_type": "modernbert",
167
- "position_embedding_type": "absolute",
168
- "tf_legacy_loss": false,
169
- "use_bfloat16": false,
170
- "vocab_size": 50368,
171
- "max_position_embeddings": 8192,
172
- "hidden_size": 1024,
173
- "intermediate_size": 2624,
174
- "num_hidden_layers": 28,
175
- "num_attention_heads": 16,
176
- "initializer_range": 0.02,
177
- "initializer_cutoff_factor": 2.0,
178
- "norm_eps": 1e-05,
179
- "norm_bias": false,
180
- "global_rope_theta": 160000.0,
181
- "attention_bias": false,
182
- "attention_dropout": 0.0,
183
- "hidden_activation": "gelu",
184
- "global_attn_every_n_layers": 3,
185
- "local_attention": 128,
186
- "local_rope_theta": 10000.0,
187
- "embedding_dropout": 0.0,
188
- "mlp_bias": false,
189
- "mlp_dropout": 0.0,
190
- "decoder_bias": true,
191
- "classifier_pooling": "mean",
192
- "classifier_dropout": 0.0,
193
- "classifier_bias": false,
194
- "classifier_activation": "gelu",
195
- "deterministic_flash_attn": false,
196
- "sparse_prediction": false,
197
- "sparse_pred_ignore_index": -100,
198
- "repad_logits_with_grad": false,
199
- "output_attentions": false
200
- },
201
- "output_dim": 1024,
202
- "fixed_len_video": false,
203
- "nth_text_layer": 22
204
- },
205
- "audio_model": {
206
- "dac_vae_encoder": {
207
  "encoder_hidden_size": 64,
208
  "downsampling_ratios": [
209
  2,
@@ -218,400 +17,62 @@
218
  "quantizer_dropout": 0,
219
  "sampling_rate": 48000
220
  },
221
- "transformer": {
222
- "vocab_size": 151936,
223
- "max_position_embeddings": 10000,
224
- "hidden_size": 1024,
225
- "intermediate_size": 2752,
226
- "num_hidden_layers": 16,
227
- "num_attention_heads": 8,
228
- "use_sliding_window": false,
229
- "sliding_window": null,
230
- "max_window_layers": 28,
231
- "num_key_value_heads": 8,
232
- "head_dim": 128,
233
- "hidden_act": "silu",
234
- "initializer_range": 0.02,
235
- "rms_norm_eps": 1e-05,
236
- "use_cache": true,
237
- "rope_theta": 20000,
238
- "rope_scaling": null,
239
- "attention_bias": false,
240
- "attention_dropout": 0.0,
241
- "layer_types": [
242
- "full_attention",
243
- "full_attention",
244
- "full_attention",
245
- "full_attention",
246
- "full_attention",
247
- "full_attention",
248
- "full_attention",
249
- "full_attention",
250
- "full_attention",
251
- "full_attention",
252
- "full_attention",
253
- "full_attention",
254
- "full_attention",
255
- "full_attention",
256
- "full_attention",
257
- "full_attention"
258
- ],
259
- "return_dict": true,
260
- "output_hidden_states": false,
261
- "torchscript": false,
262
- "dtype": null,
263
- "pruned_heads": {},
264
- "tie_word_embeddings": false,
265
- "chunk_size_feed_forward": 0,
266
- "is_encoder_decoder": false,
267
- "is_decoder": false,
268
- "cross_attention_hidden_size": null,
269
- "add_cross_attention": false,
270
- "tie_encoder_decoder": false,
271
- "architectures": null,
272
- "finetuning_task": null,
273
- "id2label": {
274
- "0": "LABEL_0",
275
- "1": "LABEL_1"
276
- },
277
- "label2id": {
278
- "LABEL_0": 0,
279
- "LABEL_1": 1
280
- },
281
- "task_specific_params": null,
282
- "problem_type": null,
283
- "tokenizer_class": null,
284
- "prefix": null,
285
- "bos_token_id": null,
286
- "pad_token_id": null,
287
- "eos_token_id": null,
288
- "sep_token_id": null,
289
- "decoder_start_token_id": null,
290
- "max_length": 20,
291
- "min_length": 0,
292
- "do_sample": false,
293
- "early_stopping": false,
294
- "num_beams": 1,
295
- "temperature": 1.0,
296
- "top_k": 50,
297
- "top_p": 1.0,
298
- "typical_p": 1.0,
299
- "repetition_penalty": 1.0,
300
- "length_penalty": 1.0,
301
- "no_repeat_ngram_size": 0,
302
- "encoder_no_repeat_ngram_size": 0,
303
- "bad_words_ids": null,
304
- "num_return_sequences": 1,
305
- "output_scores": false,
306
- "return_dict_in_generate": false,
307
- "forced_bos_token_id": null,
308
- "forced_eos_token_id": null,
309
- "remove_invalid_values": false,
310
- "exponential_decay_length_penalty": null,
311
- "suppress_tokens": null,
312
- "begin_suppress_tokens": null,
313
- "num_beam_groups": 1,
314
- "diversity_penalty": 0.0,
315
- "_name_or_path": "",
316
- "transformers_version": "4.57.0.dev0",
317
- "tf_legacy_loss": false,
318
- "use_bfloat16": false,
319
- "model_type": "qwen3",
320
- "output_attentions": false
321
  },
322
- "text_model": {
323
- "return_dict": true,
324
- "output_hidden_states": false,
325
- "torchscript": false,
326
- "dtype": "float32",
327
- "pruned_heads": {},
328
- "tie_word_embeddings": true,
329
- "chunk_size_feed_forward": 0,
330
- "is_encoder_decoder": false,
331
- "is_decoder": false,
332
- "cross_attention_hidden_size": null,
333
- "add_cross_attention": false,
334
- "tie_encoder_decoder": false,
335
- "architectures": [
336
- "ModernBertForMaskedLM"
337
- ],
338
- "finetuning_task": null,
339
- "id2label": {
340
- "0": "LABEL_0",
341
- "1": "LABEL_1"
342
- },
343
- "label2id": {
344
- "LABEL_0": 0,
345
- "LABEL_1": 1
346
- },
347
- "task_specific_params": null,
348
- "problem_type": null,
349
- "tokenizer_class": null,
350
- "prefix": null,
351
- "bos_token_id": 50281,
352
- "pad_token_id": 50283,
353
- "eos_token_id": 50282,
354
- "sep_token_id": 50282,
355
- "decoder_start_token_id": null,
356
- "max_length": 20,
357
- "min_length": 0,
358
- "do_sample": false,
359
- "early_stopping": false,
360
- "num_beams": 1,
361
- "temperature": 1.0,
362
- "top_k": 50,
363
- "top_p": 1.0,
364
- "typical_p": 1.0,
365
- "repetition_penalty": 1.0,
366
- "length_penalty": 1.0,
367
- "no_repeat_ngram_size": 0,
368
- "encoder_no_repeat_ngram_size": 0,
369
- "bad_words_ids": null,
370
- "num_return_sequences": 1,
371
- "output_scores": false,
372
- "return_dict_in_generate": false,
373
- "forced_bos_token_id": null,
374
- "forced_eos_token_id": null,
375
- "remove_invalid_values": false,
376
- "exponential_decay_length_penalty": null,
377
- "suppress_tokens": null,
378
- "begin_suppress_tokens": null,
379
- "num_beam_groups": 1,
380
- "diversity_penalty": 0.0,
381
- "_name_or_path": "answerdotai/ModernBERT-large",
382
- "transformers_version": "4.57.0.dev0",
383
- "cls_token_id": 50281,
384
- "gradient_checkpointing": false,
385
- "layer_norm_eps": 1e-05,
386
- "model_type": "modernbert",
387
- "position_embedding_type": "absolute",
388
- "tf_legacy_loss": false,
389
- "use_bfloat16": false,
390
- "vocab_size": 50368,
391
- "max_position_embeddings": 8192,
392
- "hidden_size": 1024,
393
- "intermediate_size": 2624,
394
- "num_hidden_layers": 28,
395
- "num_attention_heads": 16,
396
  "initializer_range": 0.02,
397
- "initializer_cutoff_factor": 2.0,
398
- "norm_eps": 1e-05,
399
- "norm_bias": false,
400
- "global_rope_theta": 160000.0,
401
- "attention_bias": false,
402
- "attention_dropout": 0.0,
403
- "hidden_activation": "gelu",
404
- "global_attn_every_n_layers": 3,
405
- "local_attention": 128,
406
- "local_rope_theta": 10000.0,
407
- "embedding_dropout": 0.0,
408
- "mlp_bias": false,
409
- "mlp_dropout": 0.0,
410
- "decoder_bias": true,
411
- "classifier_pooling": "mean",
412
- "classifier_dropout": 0.0,
413
- "classifier_bias": false,
414
- "classifier_activation": "gelu",
415
- "deterministic_flash_attn": false,
416
- "sparse_prediction": false,
417
- "sparse_pred_ignore_index": -100,
418
- "repad_logits_with_grad": false,
419
- "output_attentions": false
420
  },
421
- "output_dim": 1024,
422
- "nth_text_layer": 22
423
- },
424
- "transformer": {
425
- "vocab_size": 151936,
426
- "max_position_embeddings": 10000,
427
  "hidden_size": 1024,
428
  "intermediate_size": 2752,
429
- "num_hidden_layers": 6,
430
  "num_attention_heads": 8,
431
- "use_sliding_window": false,
432
- "sliding_window": null,
433
- "max_window_layers": 28,
434
- "num_key_value_heads": 8,
435
  "head_dim": 128,
 
436
  "hidden_act": "silu",
 
437
  "initializer_range": 0.02,
438
  "rms_norm_eps": 1e-05,
439
- "use_cache": true,
440
- "rope_theta": 20000,
441
- "rope_scaling": null,
442
- "attention_bias": false,
443
- "attention_dropout": 0.0,
444
- "layer_types": [
445
- "full_attention",
446
- "full_attention",
447
- "full_attention",
448
- "full_attention",
449
- "full_attention",
450
- "full_attention"
451
- ],
452
- "return_dict": true,
453
- "output_hidden_states": false,
454
- "torchscript": false,
455
- "dtype": null,
456
- "pruned_heads": {},
457
- "tie_word_embeddings": false,
458
- "chunk_size_feed_forward": 0,
459
- "is_encoder_decoder": false,
460
- "is_decoder": false,
461
- "cross_attention_hidden_size": null,
462
- "add_cross_attention": false,
463
- "tie_encoder_decoder": false,
464
- "architectures": null,
465
- "finetuning_task": null,
466
- "id2label": {
467
- "0": "LABEL_0",
468
- "1": "LABEL_1"
469
- },
470
- "label2id": {
471
- "LABEL_0": 0,
472
- "LABEL_1": 1
473
  },
474
- "task_specific_params": null,
475
- "problem_type": null,
476
- "tokenizer_class": null,
477
- "prefix": null,
478
- "bos_token_id": null,
479
- "pad_token_id": null,
480
- "eos_token_id": null,
481
- "sep_token_id": null,
482
- "decoder_start_token_id": null,
483
- "max_length": 20,
484
- "min_length": 0,
485
- "do_sample": false,
486
- "early_stopping": false,
487
- "num_beams": 1,
488
- "temperature": 1.0,
489
- "top_k": 50,
490
- "top_p": 1.0,
491
- "typical_p": 1.0,
492
- "repetition_penalty": 1.0,
493
- "length_penalty": 1.0,
494
- "no_repeat_ngram_size": 0,
495
- "encoder_no_repeat_ngram_size": 0,
496
- "bad_words_ids": null,
497
- "num_return_sequences": 1,
498
- "output_scores": false,
499
- "return_dict_in_generate": false,
500
- "forced_bos_token_id": null,
501
- "forced_eos_token_id": null,
502
- "remove_invalid_values": false,
503
- "exponential_decay_length_penalty": null,
504
- "suppress_tokens": null,
505
- "begin_suppress_tokens": null,
506
- "num_beam_groups": 1,
507
- "diversity_penalty": 0.0,
508
- "_name_or_path": "",
509
- "transformers_version": "4.57.0.dev0",
510
- "tf_legacy_loss": false,
511
- "use_bfloat16": false,
512
- "model_type": "qwen3",
513
- "output_attentions": false
514
- }
515
- },
516
- "text_model": {
517
- "return_dict": true,
518
- "output_hidden_states": false,
519
- "torchscript": false,
520
- "dtype": "float32",
521
- "pruned_heads": {},
522
- "tie_word_embeddings": true,
523
- "chunk_size_feed_forward": 0,
524
- "is_encoder_decoder": false,
525
- "is_decoder": false,
526
- "cross_attention_hidden_size": null,
527
- "add_cross_attention": false,
528
- "tie_encoder_decoder": false,
529
- "architectures": [
530
- "ModernBertForMaskedLM"
531
- ],
532
- "finetuning_task": null,
533
- "id2label": {
534
- "0": "LABEL_0",
535
- "1": "LABEL_1"
536
- },
537
- "label2id": {
538
- "LABEL_0": 0,
539
- "LABEL_1": 1
540
  },
541
- "task_specific_params": null,
542
- "problem_type": null,
543
- "tokenizer_class": null,
544
- "prefix": null,
545
- "bos_token_id": 50281,
546
- "pad_token_id": 50283,
547
- "eos_token_id": 50282,
548
- "sep_token_id": 50282,
549
- "decoder_start_token_id": null,
550
- "max_length": 20,
551
- "min_length": 0,
552
- "do_sample": false,
553
- "early_stopping": false,
554
- "num_beams": 1,
555
- "temperature": 1.0,
556
- "top_k": 50,
557
- "top_p": 1.0,
558
- "typical_p": 1.0,
559
- "repetition_penalty": 1.0,
560
- "length_penalty": 1.0,
561
- "no_repeat_ngram_size": 0,
562
- "encoder_no_repeat_ngram_size": 0,
563
- "bad_words_ids": null,
564
- "num_return_sequences": 1,
565
- "output_scores": false,
566
- "return_dict_in_generate": false,
567
- "forced_bos_token_id": null,
568
- "forced_eos_token_id": null,
569
- "remove_invalid_values": false,
570
- "exponential_decay_length_penalty": null,
571
- "suppress_tokens": null,
572
- "begin_suppress_tokens": null,
573
- "num_beam_groups": 1,
574
- "diversity_penalty": 0.0,
575
- "_name_or_path": "answerdotai/ModernBERT-large",
576
- "transformers_version": "4.57.0.dev0",
577
- "cls_token_id": 50281,
578
- "gradient_checkpointing": false,
579
- "layer_norm_eps": 1e-05,
580
- "model_type": "modernbert",
581
- "position_embedding_type": "absolute",
582
- "tf_legacy_loss": false,
583
- "use_bfloat16": false,
584
- "vocab_size": 50368,
585
- "max_position_embeddings": 8192,
586
  "hidden_size": 1024,
587
- "intermediate_size": 2624,
588
- "num_hidden_layers": 28,
589
- "num_attention_heads": 16,
 
 
 
 
590
  "initializer_range": 0.02,
591
- "initializer_cutoff_factor": 2.0,
592
- "norm_eps": 1e-05,
593
- "norm_bias": false,
594
- "global_rope_theta": 160000.0,
595
  "attention_bias": false,
596
- "attention_dropout": 0.0,
597
- "hidden_activation": "gelu",
598
- "global_attn_every_n_layers": 3,
599
- "local_attention": 128,
600
- "local_rope_theta": 10000.0,
601
- "embedding_dropout": 0.0,
602
- "mlp_bias": false,
603
- "mlp_dropout": 0.0,
604
- "decoder_bias": true,
605
- "classifier_pooling": "mean",
606
- "classifier_dropout": 0.0,
607
- "classifier_bias": false,
608
- "classifier_activation": "gelu",
609
- "deterministic_flash_attn": false,
610
- "sparse_prediction": false,
611
- "sparse_pred_ignore_index": -100,
612
- "repad_logits_with_grad": false,
613
- "output_attentions": false
614
- },
615
- "output_dim": 1024,
616
- "nth_text_layer": 22
617
  }
 
1
  {
2
+ "model_type": "pe_audio_video",
3
+ "audio_video_config": {
4
+ "audio_config": {
5
+ "dac_config": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  "encoder_hidden_size": 64,
7
  "downsampling_ratios": [
8
  2,
 
17
  "quantizer_dropout": 0,
18
  "sampling_rate": 48000
19
  },
20
+ "hidden_size": 1024,
21
+ "intermediate_size": 2752,
22
+ "num_hidden_layers": 16,
23
+ "num_attention_heads": 8,
24
+ "head_dim": 128,
25
+ "num_key_value_heads": null,
26
+ "hidden_act": "silu",
27
+ "max_position_embeddings": 10000,
28
+ "initializer_range": 0.02,
29
+ "rms_norm_eps": 1e-05,
30
+ "rope_parameters": {
31
+ "rope_theta": 20000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  },
33
+ "attention_bias": false,
34
+ "attention_dropout": 0.0
35
+ },
36
+ "video_config": {
37
+ "vision_config": {
38
+ "architecture": "vit_pe_core_large_patch14_336",
39
+ "do_pooling": true,
40
+ "global_pool": "map",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  "initializer_range": 0.02,
42
+ "model_args": {},
43
+ "num_labels": 1024,
44
+ "model_type": "timm_wrapper"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  },
 
 
 
 
 
 
46
  "hidden_size": 1024,
47
  "intermediate_size": 2752,
48
+ "num_hidden_layers": 4,
49
  "num_attention_heads": 8,
 
 
 
 
50
  "head_dim": 128,
51
+ "num_key_value_heads": null,
52
  "hidden_act": "silu",
53
+ "max_position_embeddings": 10000,
54
  "initializer_range": 0.02,
55
  "rms_norm_eps": 1e-05,
56
+ "rope_parameters": {
57
+ "rope_theta": 20000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  },
59
+ "attention_bias": false,
60
+ "attention_dropout": 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  "hidden_size": 1024,
63
+ "intermediate_size": 2752,
64
+ "num_hidden_layers": 6,
65
+ "num_attention_heads": 8,
66
+ "head_dim": 128,
67
+ "num_key_value_heads": null,
68
+ "hidden_act": "silu",
69
+ "max_position_embeddings": 10000,
70
  "initializer_range": 0.02,
71
+ "rms_norm_eps": 1e-05,
72
+ "rope_parameters": {
73
+ "rope_theta": 20000
74
+ },
75
  "attention_bias": false,
76
+ "attention_dropout": 0.0
77
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14497c4ceb814166361dfea06ec9cc9bade4cc41b6162f4a8710f6d75ae814d8
3
- size 4430166416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db30c3c4f7d5cc0a096ab3eb10acd472b68486a7ea87ed074dd7332d8335a884
3
+ size 4135465848
preprocessor_config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "feature_extractor_type": "PEAudioVideoFeatureExtractor",
3
  "feature_size": 1,
4
  "hop_length": 1920,
5
  "padding_side": "right",
 
1
  {
2
+ "feature_extractor_type": "PeAudioFeatureExtractor",
3
  "feature_size": 1,
4
  "hop_length": 1920,
5
  "padding_side": "right",
video_preprocessor_config.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "do_sample_frames": false,
3
  "num_frames": null,
4
  "size": {
 
1
  {
2
+ "video_processor_type": "PeVideoVideoProcessor",
3
  "do_sample_frames": false,
4
  "num_frames": null,
5
  "size": {