Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

config.json +390 -180
model.safetensors +2 -2
preprocessor_config.json +7 -7

config.json CHANGED Viewed

@@ -1,7 +1,14 @@
 {
-    "video_encoder": {
-        "backbone": "PE-Core-L14-336",
-        "backbone_checkpoint": null,
         "transformer": {
             "vocab_size": 151936,
             "max_position_embeddings": 10000,
@@ -90,86 +97,357 @@
             "use_bfloat16": false,
             "model_type": "qwen3",
             "output_attentions": false
-        }
     },
-    "audio_codec": {
-        "encoder_hidden_size": 64,
-        "downsampling_ratios": [
-            2,
-            8,
-            10,
-            12
-        ],
-        "decoder_hidden_size": 1536,
-        "n_codebooks": 16,
-        "codebook_size": 1024,
-        "codebook_dim": 128,
-        "quantizer_dropout": 0,
-        "sampling_rate": 48000
     },
-    "audio_encoder": {
-        "vocab_size": 151936,
-        "max_position_embeddings": 10000,
-        "hidden_size": 1792,
-        "intermediate_size": 4800,
-        "num_hidden_layers": 28,
-        "num_attention_heads": 14,
-        "use_sliding_window": false,
-        "sliding_window": null,
-        "max_window_layers": 28,
-        "num_key_value_heads": 14,
-        "head_dim": 128,
-        "hidden_act": "silu",
-        "initializer_range": 0.02,
-        "rms_norm_eps": 1e-05,
-        "use_cache": true,
-        "rope_theta": 20000,
-        "rope_scaling": null,
-        "attention_bias": false,
-        "attention_dropout": 0.0,
-        "layer_types": [
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention",
-            "full_attention"
-        ],
         "return_dict": true,
         "output_hidden_states": false,
         "torchscript": false,
-        "dtype": null,
         "pruned_heads": {},
-        "tie_word_embeddings": false,
         "chunk_size_feed_forward": 0,
         "is_encoder_decoder": false,
         "is_decoder": false,
         "cross_attention_hidden_size": null,
         "add_cross_attention": false,
         "tie_encoder_decoder": false,
-        "architectures": null,
         "finetuning_task": null,
         "id2label": {
             "0": "LABEL_0",
@@ -183,10 +461,10 @@
         "problem_type": null,
         "tokenizer_class": null,
         "prefix": null,
-        "bos_token_id": null,
-        "pad_token_id": null,
-        "eos_token_id": null,
-        "sep_token_id": null,
         "decoder_start_token_id": null,
         "max_length": 20,
         "min_length": 0,
@@ -213,14 +491,47 @@
         "begin_suppress_tokens": null,
         "num_beam_groups": 1,
         "diversity_penalty": 0.0,
-        "_name_or_path": "",
         "transformers_version": "4.57.0.dev0",
         "tf_legacy_loss": false,
         "use_bfloat16": false,
-        "model_type": "qwen3",
         "output_attentions": false
     },
-    "audio_video_encoder": {
         "vocab_size": 151936,
         "max_position_embeddings": 10000,
         "hidden_size": 1792,
@@ -311,107 +622,6 @@
         "model_type": "qwen3",
         "output_attentions": false
     },
-    "text_encoder": {
-        "nth_layer": 22,
-        "return_dict": true,
-        "output_hidden_states": false,
-        "torchscript": false,
-        "dtype": "float32",
-        "pruned_heads": {},
-        "tie_word_embeddings": true,
-        "chunk_size_feed_forward": 0,
-        "is_encoder_decoder": false,
-        "is_decoder": false,
-        "cross_attention_hidden_size": null,
-        "add_cross_attention": false,
-        "tie_encoder_decoder": false,
-        "architectures": [
-            "ModernBertForMaskedLM"
-        ],
-        "finetuning_task": null,
-        "id2label": {
-            "0": "LABEL_0",
-            "1": "LABEL_1"
-        },
-        "label2id": {
-            "LABEL_0": 0,
-            "LABEL_1": 1
-        },
-        "task_specific_params": null,
-        "problem_type": null,
-        "tokenizer_class": null,
-        "prefix": null,
-        "bos_token_id": 50281,
-        "pad_token_id": 50283,
-        "eos_token_id": 50282,
-        "sep_token_id": 50282,
-        "decoder_start_token_id": null,
-        "max_length": 20,
-        "min_length": 0,
-        "do_sample": false,
-        "early_stopping": false,
-        "num_beams": 1,
-        "temperature": 1.0,
-        "top_k": 50,
-        "top_p": 1.0,
-        "typical_p": 1.0,
-        "repetition_penalty": 1.0,
-        "length_penalty": 1.0,
-        "no_repeat_ngram_size": 0,
-        "encoder_no_repeat_ngram_size": 0,
-        "bad_words_ids": null,
-        "num_return_sequences": 1,
-        "output_scores": false,
-        "return_dict_in_generate": false,
-        "forced_bos_token_id": null,
-        "forced_eos_token_id": null,
-        "remove_invalid_values": false,
-        "exponential_decay_length_penalty": null,
-        "suppress_tokens": null,
-        "begin_suppress_tokens": null,
-        "num_beam_groups": 1,
-        "diversity_penalty": 0.0,
-        "_name_or_path": "answerdotai/ModernBERT-large",
-        "cls_token_id": 50281,
-        "gradient_checkpointing": false,
-        "layer_norm_eps": 1e-05,
-        "model_type": "modernbert",
-        "position_embedding_type": "absolute",
-        "tf_legacy_loss": false,
-        "use_bfloat16": false,
-        "vocab_size": 50368,
-        "max_position_embeddings": 8192,
-        "hidden_size": 1024,
-        "intermediate_size": 2624,
-        "num_hidden_layers": 28,
-        "num_attention_heads": 16,
-        "initializer_range": 0.02,
-        "initializer_cutoff_factor": 2.0,
-        "norm_eps": 1e-05,
-        "norm_bias": false,
-        "global_rope_theta": 160000.0,
-        "attention_bias": false,
-        "attention_dropout": 0.0,
-        "hidden_activation": "gelu",
-        "global_attn_every_n_layers": 3,
-        "local_attention": 128,
-        "local_rope_theta": 10000.0,
-        "embedding_dropout": 0.0,
-        "mlp_bias": false,
-        "mlp_dropout": 0.0,
-        "decoder_bias": true,
-        "classifier_pooling": "mean",
-        "classifier_dropout": 0.0,
-        "classifier_bias": false,
-        "classifier_activation": "gelu",
-        "deterministic_flash_attn": false,
-        "sparse_prediction": false,
-        "sparse_pred_ignore_index": -100,
-        "repad_logits_with_grad": false,
-        "output_attentions": false
-    },
-    "separate_text_heads": true,
     "output_dim": 1024,
-    "contrastive_head_norm_type": "none",
-    "fixed_len_video": false
 }

 {
+    "video_model": {
+        "clip_vision_model": {
+            "architecture": "vit_pe_core_large_patch14_336",
+            "do_pooling": true,
+            "global_pool": "map",
+            "initializer_range": 0.02,
+            "model_args": {},
+            "num_labels": 1024,
+            "model_type": "timm_wrapper"
+        },
         "transformer": {
             "vocab_size": 151936,
             "max_position_embeddings": 10000,
             "use_bfloat16": false,
             "model_type": "qwen3",
             "output_attentions": false
+        },
+        "text_model": {
+            "return_dict": true,
+            "output_hidden_states": false,
+            "torchscript": false,
+            "dtype": "float32",
+            "pruned_heads": {},
+            "tie_word_embeddings": true,
+            "chunk_size_feed_forward": 0,
+            "is_encoder_decoder": false,
+            "is_decoder": false,
+            "cross_attention_hidden_size": null,
+            "add_cross_attention": false,
+            "tie_encoder_decoder": false,
+            "architectures": [
+                "ModernBertForMaskedLM"
+            ],
+            "finetuning_task": null,
+            "id2label": {
+                "0": "LABEL_0",
+                "1": "LABEL_1"
+            },
+            "label2id": {
+                "LABEL_0": 0,
+                "LABEL_1": 1
+            },
+            "task_specific_params": null,
+            "problem_type": null,
+            "tokenizer_class": null,
+            "prefix": null,
+            "bos_token_id": 50281,
+            "pad_token_id": 50283,
+            "eos_token_id": 50282,
+            "sep_token_id": 50282,
+            "decoder_start_token_id": null,
+            "max_length": 20,
+            "min_length": 0,
+            "do_sample": false,
+            "early_stopping": false,
+            "num_beams": 1,
+            "temperature": 1.0,
+            "top_k": 50,
+            "top_p": 1.0,
+            "typical_p": 1.0,
+            "repetition_penalty": 1.0,
+            "length_penalty": 1.0,
+            "no_repeat_ngram_size": 0,
+            "encoder_no_repeat_ngram_size": 0,
+            "bad_words_ids": null,
+            "num_return_sequences": 1,
+            "output_scores": false,
+            "return_dict_in_generate": false,
+            "forced_bos_token_id": null,
+            "forced_eos_token_id": null,
+            "remove_invalid_values": false,
+            "exponential_decay_length_penalty": null,
+            "suppress_tokens": null,
+            "begin_suppress_tokens": null,
+            "num_beam_groups": 1,
+            "diversity_penalty": 0.0,
+            "_name_or_path": "answerdotai/ModernBERT-large",
+            "transformers_version": "4.57.0.dev0",
+            "cls_token_id": 50281,
+            "gradient_checkpointing": false,
+            "layer_norm_eps": 1e-05,
+            "model_type": "modernbert",
+            "position_embedding_type": "absolute",
+            "tf_legacy_loss": false,
+            "use_bfloat16": false,
+            "vocab_size": 50368,
+            "max_position_embeddings": 8192,
+            "hidden_size": 1024,
+            "intermediate_size": 2624,
+            "num_hidden_layers": 28,
+            "num_attention_heads": 16,
+            "initializer_range": 0.02,
+            "initializer_cutoff_factor": 2.0,
+            "norm_eps": 1e-05,
+            "norm_bias": false,
+            "global_rope_theta": 160000.0,
+            "attention_bias": false,
+            "attention_dropout": 0.0,
+            "hidden_activation": "gelu",
+            "global_attn_every_n_layers": 3,
+            "local_attention": 128,
+            "local_rope_theta": 10000.0,
+            "embedding_dropout": 0.0,
+            "mlp_bias": false,
+            "mlp_dropout": 0.0,
+            "decoder_bias": true,
+            "classifier_pooling": "mean",
+            "classifier_dropout": 0.0,
+            "classifier_bias": false,
+            "classifier_activation": "gelu",
+            "deterministic_flash_attn": false,
+            "sparse_prediction": false,
+            "sparse_pred_ignore_index": -100,
+            "repad_logits_with_grad": false,
+            "output_attentions": false
+        },
+        "output_dim": 1024,
+        "fixed_len_video": false,
+        "nth_text_layer": 22
     },
+    "audio_model": {
+        "dac_vae_encoder": {
+            "encoder_hidden_size": 64,
+            "downsampling_ratios": [
+                2,
+                8,
+                10,
+                12
+            ],
+            "decoder_hidden_size": 1536,
+            "n_codebooks": 16,
+            "codebook_size": 1024,
+            "codebook_dim": 128,
+            "quantizer_dropout": 0,
+            "sampling_rate": 48000
+        },
+        "transformer": {
+            "vocab_size": 151936,
+            "max_position_embeddings": 10000,
+            "hidden_size": 1792,
+            "intermediate_size": 4800,
+            "num_hidden_layers": 28,
+            "num_attention_heads": 14,
+            "use_sliding_window": false,
+            "sliding_window": null,
+            "max_window_layers": 28,
+            "num_key_value_heads": 14,
+            "head_dim": 128,
+            "hidden_act": "silu",
+            "initializer_range": 0.02,
+            "rms_norm_eps": 1e-05,
+            "use_cache": true,
+            "rope_theta": 20000,
+            "rope_scaling": null,
+            "attention_bias": false,
+            "attention_dropout": 0.0,
+            "layer_types": [
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention",
+                "full_attention"
+            ],
+            "return_dict": true,
+            "output_hidden_states": false,
+            "torchscript": false,
+            "dtype": null,
+            "pruned_heads": {},
+            "tie_word_embeddings": false,
+            "chunk_size_feed_forward": 0,
+            "is_encoder_decoder": false,
+            "is_decoder": false,
+            "cross_attention_hidden_size": null,
+            "add_cross_attention": false,
+            "tie_encoder_decoder": false,
+            "architectures": null,
+            "finetuning_task": null,
+            "id2label": {
+                "0": "LABEL_0",
+                "1": "LABEL_1"
+            },
+            "label2id": {
+                "LABEL_0": 0,
+                "LABEL_1": 1
+            },
+            "task_specific_params": null,
+            "problem_type": null,
+            "tokenizer_class": null,
+            "prefix": null,
+            "bos_token_id": null,
+            "pad_token_id": null,
+            "eos_token_id": null,
+            "sep_token_id": null,
+            "decoder_start_token_id": null,
+            "max_length": 20,
+            "min_length": 0,
+            "do_sample": false,
+            "early_stopping": false,
+            "num_beams": 1,
+            "temperature": 1.0,
+            "top_k": 50,
+            "top_p": 1.0,
+            "typical_p": 1.0,
+            "repetition_penalty": 1.0,
+            "length_penalty": 1.0,
+            "no_repeat_ngram_size": 0,
+            "encoder_no_repeat_ngram_size": 0,
+            "bad_words_ids": null,
+            "num_return_sequences": 1,
+            "output_scores": false,
+            "return_dict_in_generate": false,
+            "forced_bos_token_id": null,
+            "forced_eos_token_id": null,
+            "remove_invalid_values": false,
+            "exponential_decay_length_penalty": null,
+            "suppress_tokens": null,
+            "begin_suppress_tokens": null,
+            "num_beam_groups": 1,
+            "diversity_penalty": 0.0,
+            "_name_or_path": "",
+            "transformers_version": "4.57.0.dev0",
+            "tf_legacy_loss": false,
+            "use_bfloat16": false,
+            "model_type": "qwen3",
+            "output_attentions": false
+        },
+        "text_model": {
+            "return_dict": true,
+            "output_hidden_states": false,
+            "torchscript": false,
+            "dtype": "float32",
+            "pruned_heads": {},
+            "tie_word_embeddings": true,
+            "chunk_size_feed_forward": 0,
+            "is_encoder_decoder": false,
+            "is_decoder": false,
+            "cross_attention_hidden_size": null,
+            "add_cross_attention": false,
+            "tie_encoder_decoder": false,
+            "architectures": [
+                "ModernBertForMaskedLM"
+            ],
+            "finetuning_task": null,
+            "id2label": {
+                "0": "LABEL_0",
+                "1": "LABEL_1"
+            },
+            "label2id": {
+                "LABEL_0": 0,
+                "LABEL_1": 1
+            },
+            "task_specific_params": null,
+            "problem_type": null,
+            "tokenizer_class": null,
+            "prefix": null,
+            "bos_token_id": 50281,
+            "pad_token_id": 50283,
+            "eos_token_id": 50282,
+            "sep_token_id": 50282,
+            "decoder_start_token_id": null,
+            "max_length": 20,
+            "min_length": 0,
+            "do_sample": false,
+            "early_stopping": false,
+            "num_beams": 1,
+            "temperature": 1.0,
+            "top_k": 50,
+            "top_p": 1.0,
+            "typical_p": 1.0,
+            "repetition_penalty": 1.0,
+            "length_penalty": 1.0,
+            "no_repeat_ngram_size": 0,
+            "encoder_no_repeat_ngram_size": 0,
+            "bad_words_ids": null,
+            "num_return_sequences": 1,
+            "output_scores": false,
+            "return_dict_in_generate": false,
+            "forced_bos_token_id": null,
+            "forced_eos_token_id": null,
+            "remove_invalid_values": false,
+            "exponential_decay_length_penalty": null,
+            "suppress_tokens": null,
+            "begin_suppress_tokens": null,
+            "num_beam_groups": 1,
+            "diversity_penalty": 0.0,
+            "_name_or_path": "answerdotai/ModernBERT-large",
+            "transformers_version": "4.57.0.dev0",
+            "cls_token_id": 50281,
+            "gradient_checkpointing": false,
+            "layer_norm_eps": 1e-05,
+            "model_type": "modernbert",
+            "position_embedding_type": "absolute",
+            "tf_legacy_loss": false,
+            "use_bfloat16": false,
+            "vocab_size": 50368,
+            "max_position_embeddings": 8192,
+            "hidden_size": 1024,
+            "intermediate_size": 2624,
+            "num_hidden_layers": 28,
+            "num_attention_heads": 16,
+            "initializer_range": 0.02,
+            "initializer_cutoff_factor": 2.0,
+            "norm_eps": 1e-05,
+            "norm_bias": false,
+            "global_rope_theta": 160000.0,
+            "attention_bias": false,
+            "attention_dropout": 0.0,
+            "hidden_activation": "gelu",
+            "global_attn_every_n_layers": 3,
+            "local_attention": 128,
+            "local_rope_theta": 10000.0,
+            "embedding_dropout": 0.0,
+            "mlp_bias": false,
+            "mlp_dropout": 0.0,
+            "decoder_bias": true,
+            "classifier_pooling": "mean",
+            "classifier_dropout": 0.0,
+            "classifier_bias": false,
+            "classifier_activation": "gelu",
+            "deterministic_flash_attn": false,
+            "sparse_prediction": false,
+            "sparse_pred_ignore_index": -100,
+            "repad_logits_with_grad": false,
+            "output_attentions": false
+        },
+        "output_dim": 1024,
+        "nth_text_layer": 22
     },
+    "text_model": {
         "return_dict": true,
         "output_hidden_states": false,
         "torchscript": false,
+        "dtype": "float32",
         "pruned_heads": {},
+        "tie_word_embeddings": true,
         "chunk_size_feed_forward": 0,
         "is_encoder_decoder": false,
         "is_decoder": false,
         "cross_attention_hidden_size": null,
         "add_cross_attention": false,
         "tie_encoder_decoder": false,
+        "architectures": [
+            "ModernBertForMaskedLM"
+        ],
         "finetuning_task": null,
         "id2label": {
             "0": "LABEL_0",
         "problem_type": null,
         "tokenizer_class": null,
         "prefix": null,
+        "bos_token_id": 50281,
+        "pad_token_id": 50283,
+        "eos_token_id": 50282,
+        "sep_token_id": 50282,
         "decoder_start_token_id": null,
         "max_length": 20,
         "min_length": 0,
         "begin_suppress_tokens": null,
         "num_beam_groups": 1,
         "diversity_penalty": 0.0,
+        "_name_or_path": "answerdotai/ModernBERT-large",
         "transformers_version": "4.57.0.dev0",
+        "cls_token_id": 50281,
+        "gradient_checkpointing": false,
+        "layer_norm_eps": 1e-05,
+        "model_type": "modernbert",
+        "position_embedding_type": "absolute",
         "tf_legacy_loss": false,
         "use_bfloat16": false,
+        "vocab_size": 50368,
+        "max_position_embeddings": 8192,
+        "hidden_size": 1024,
+        "intermediate_size": 2624,
+        "num_hidden_layers": 28,
+        "num_attention_heads": 16,
+        "initializer_range": 0.02,
+        "initializer_cutoff_factor": 2.0,
+        "norm_eps": 1e-05,
+        "norm_bias": false,
+        "global_rope_theta": 160000.0,
+        "attention_bias": false,
+        "attention_dropout": 0.0,
+        "hidden_activation": "gelu",
+        "global_attn_every_n_layers": 3,
+        "local_attention": 128,
+        "local_rope_theta": 10000.0,
+        "embedding_dropout": 0.0,
+        "mlp_bias": false,
+        "mlp_dropout": 0.0,
+        "decoder_bias": true,
+        "classifier_pooling": "mean",
+        "classifier_dropout": 0.0,
+        "classifier_bias": false,
+        "classifier_activation": "gelu",
+        "deterministic_flash_attn": false,
+        "sparse_prediction": false,
+        "sparse_pred_ignore_index": -100,
+        "repad_logits_with_grad": false,
         "output_attentions": false
     },
+    "transformer": {
         "vocab_size": 151936,
         "max_position_embeddings": 10000,
         "hidden_size": 1792,
         "model_type": "qwen3",
         "output_attentions": false
     },
     "output_dim": 1024,
+    "nth_text_layer": 22
 }

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:108e38c62cbeabcd56961ea85c217da733db84907f81b59f61598b2b90de3704
-size 10627032564

 version https://git-lfs.github.com/spec/v1
+oid sha256:da71d199c9222e045334d1bdd2ceba7a1bd57fa0072949f035f1d49971538d92
+size 9234167264

preprocessor_config.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-  "feature_extractor_type": "PerceptionEncoderAVFeatureExtractor",
-  "feature_size": 1,
-  "hop_length": 1920,
-  "padding_side": "right",
-  "padding_value": 0.0,
-  "return_attention_mask": true,
-  "sampling_rate": 48000
 }

 {
+    "feature_extractor_type": "PerceptionEncoderAudioVideoFeatureExtractor",
+    "feature_size": 1,
+    "hop_length": 1920,
+    "padding_side": "right",
+    "padding_value": 0.0,
+    "return_attention_mask": true,
+    "sampling_rate": 48000
 }