Upload folder using huggingface_hub
Browse files- .gitattributes +8 -0
- config.json +38 -10
- generation_config.json +2 -2
- onnx/audio_encoder.onnx +3 -0
- onnx/audio_encoder.onnx_data +3 -0
- onnx/audio_encoder_fp16.onnx +3 -0
- onnx/audio_encoder_fp16.onnx_data +3 -0
- onnx/decoder_model_merged.onnx +3 -0
- onnx/decoder_model_merged.onnx_data +3 -0
- onnx/decoder_model_merged_fp16.onnx +3 -0
- onnx/decoder_model_merged_fp16.onnx_data +3 -0
- onnx/embed_tokens.onnx +3 -0
- onnx/embed_tokens.onnx_data +3 -0
- onnx/embed_tokens_fp16.onnx +3 -0
- onnx/embed_tokens_fp16.onnx_data +3 -0
- onnx/vision_encoder.onnx +3 -0
- onnx/vision_encoder.onnx_data +3 -0
- onnx/vision_encoder_fp16.onnx +3 -0
- onnx/vision_encoder_fp16.onnx_data +3 -0
- processor_config.json +92 -0
- tokenizer.json +2 -2
- tokenizer_config.json +0 -0
.gitattributes
CHANGED
|
@@ -34,3 +34,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
onnx/audio_encoder.onnx_data filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
onnx/audio_encoder_fp16.onnx_data filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
onnx/decoder_model_merged.onnx_data filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
onnx/decoder_model_merged_fp16.onnx_data filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
onnx/embed_tokens.onnx_data filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
onnx/embed_tokens_fp16.onnx_data filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
onnx/vision_encoder.onnx_data filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
onnx/vision_encoder_fp16.onnx_data filter=lfs diff=lfs merge=lfs -text
|
config.json
CHANGED
|
@@ -12,6 +12,7 @@
|
|
| 12 |
"conf_num_hidden_layers": 2,
|
| 13 |
"conf_reduction_factor": 4,
|
| 14 |
"conf_residual_weight": 0.5,
|
|
|
|
| 15 |
"gradient_clipping": 10000000000.0,
|
| 16 |
"hidden_size": 64,
|
| 17 |
"input_feat_size": 128,
|
|
@@ -42,7 +43,6 @@
|
|
| 42 |
2
|
| 43 |
]
|
| 44 |
],
|
| 45 |
-
"torch_dtype": "bfloat16",
|
| 46 |
"vocab_offset": 262272,
|
| 47 |
"vocab_size": 128
|
| 48 |
},
|
|
@@ -50,6 +50,7 @@
|
|
| 50 |
"audio_token_id": 262273,
|
| 51 |
"boa_token_id": 256000,
|
| 52 |
"boi_token_id": 255999,
|
|
|
|
| 53 |
"eoa_token_id": 262272,
|
| 54 |
"eoi_token_id": 262144,
|
| 55 |
"eos_token_id": [
|
|
@@ -72,6 +73,9 @@
|
|
| 72 |
"altup_num_inputs": 4,
|
| 73 |
"attention_bias": false,
|
| 74 |
"attention_dropout": 0.0,
|
|
|
|
|
|
|
|
|
|
| 75 |
"final_logit_softcapping": 30.0,
|
| 76 |
"head_dim": 32,
|
| 77 |
"hidden_activation": "gelu_pytorch_tanh",
|
|
@@ -97,21 +101,30 @@
|
|
| 97 |
"num_hidden_layers": 4,
|
| 98 |
"num_key_value_heads": 1,
|
| 99 |
"num_kv_shared_layers": 2,
|
|
|
|
| 100 |
"rms_norm_eps": 1e-06,
|
| 101 |
-
"
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
"sliding_window": 512,
|
| 105 |
-
"
|
| 106 |
"use_cache": true,
|
| 107 |
"vocab_size": 262400,
|
| 108 |
"vocab_size_per_layer_input": 262144
|
| 109 |
},
|
| 110 |
-
"
|
| 111 |
-
"transformers_version": "
|
| 112 |
"vision_config": {
|
| 113 |
"architecture": "mobilenetv5_300m_enc",
|
| 114 |
"do_pooling": false,
|
|
|
|
| 115 |
"hidden_size": 2048,
|
| 116 |
"initializer_range": 0.02,
|
| 117 |
"label_names": [
|
|
@@ -277,9 +290,24 @@
|
|
| 277 |
"model_type": "gemma3n_vision",
|
| 278 |
"num_classes": 2,
|
| 279 |
"rms_norm_eps": 1e-06,
|
| 280 |
-
"torch_dtype": "bfloat16",
|
| 281 |
"vocab_offset": 262144,
|
| 282 |
"vocab_size": 128
|
| 283 |
},
|
| 284 |
-
"vision_soft_tokens_per_image": 256
|
| 285 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
"conf_num_hidden_layers": 2,
|
| 13 |
"conf_reduction_factor": 4,
|
| 14 |
"conf_residual_weight": 0.5,
|
| 15 |
+
"dtype": "bfloat16",
|
| 16 |
"gradient_clipping": 10000000000.0,
|
| 17 |
"hidden_size": 64,
|
| 18 |
"input_feat_size": 128,
|
|
|
|
| 43 |
2
|
| 44 |
]
|
| 45 |
],
|
|
|
|
| 46 |
"vocab_offset": 262272,
|
| 47 |
"vocab_size": 128
|
| 48 |
},
|
|
|
|
| 50 |
"audio_token_id": 262273,
|
| 51 |
"boa_token_id": 256000,
|
| 52 |
"boi_token_id": 255999,
|
| 53 |
+
"dtype": "bfloat16",
|
| 54 |
"eoa_token_id": 262272,
|
| 55 |
"eoi_token_id": 262144,
|
| 56 |
"eos_token_id": [
|
|
|
|
| 73 |
"altup_num_inputs": 4,
|
| 74 |
"attention_bias": false,
|
| 75 |
"attention_dropout": 0.0,
|
| 76 |
+
"bos_token_id": 2,
|
| 77 |
+
"dtype": "bfloat16",
|
| 78 |
+
"eos_token_id": 1,
|
| 79 |
"final_logit_softcapping": 30.0,
|
| 80 |
"head_dim": 32,
|
| 81 |
"hidden_activation": "gelu_pytorch_tanh",
|
|
|
|
| 101 |
"num_hidden_layers": 4,
|
| 102 |
"num_key_value_heads": 1,
|
| 103 |
"num_kv_shared_layers": 2,
|
| 104 |
+
"pad_token_id": 0,
|
| 105 |
"rms_norm_eps": 1e-06,
|
| 106 |
+
"rope_parameters": {
|
| 107 |
+
"full_attention": {
|
| 108 |
+
"rope_theta": 1000000.0,
|
| 109 |
+
"rope_type": "default"
|
| 110 |
+
},
|
| 111 |
+
"sliding_attention": {
|
| 112 |
+
"rope_theta": 10000.0,
|
| 113 |
+
"rope_type": "default"
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
"sliding_window": 512,
|
| 117 |
+
"tie_word_embeddings": true,
|
| 118 |
"use_cache": true,
|
| 119 |
"vocab_size": 262400,
|
| 120 |
"vocab_size_per_layer_input": 262144
|
| 121 |
},
|
| 122 |
+
"tie_word_embeddings": true,
|
| 123 |
+
"transformers_version": "5.3.0.dev0",
|
| 124 |
"vision_config": {
|
| 125 |
"architecture": "mobilenetv5_300m_enc",
|
| 126 |
"do_pooling": false,
|
| 127 |
+
"dtype": "bfloat16",
|
| 128 |
"hidden_size": 2048,
|
| 129 |
"initializer_range": 0.02,
|
| 130 |
"label_names": [
|
|
|
|
| 290 |
"model_type": "gemma3n_vision",
|
| 291 |
"num_classes": 2,
|
| 292 |
"rms_norm_eps": 1e-06,
|
|
|
|
| 293 |
"vocab_offset": 262144,
|
| 294 |
"vocab_size": 128
|
| 295 |
},
|
| 296 |
+
"vision_soft_tokens_per_image": 256,
|
| 297 |
+
"transformers.js_config": {
|
| 298 |
+
"use_external_data_format": {
|
| 299 |
+
"decoder_model_merged.onnx": 1,
|
| 300 |
+
"embed_tokens.onnx": 1,
|
| 301 |
+
"audio_encoder.onnx": 1,
|
| 302 |
+
"vision_encoder.onnx": 1,
|
| 303 |
+
"decoder_model_merged_fp16.onnx": 1,
|
| 304 |
+
"embed_tokens_fp16.onnx": 1,
|
| 305 |
+
"audio_encoder_fp16.onnx": 1,
|
| 306 |
+
"vision_encoder_fp16.onnx": 1
|
| 307 |
+
},
|
| 308 |
+
"kv_cache_dtype": {
|
| 309 |
+
"q4f16": "float16",
|
| 310 |
+
"fp16": "float16"
|
| 311 |
+
}
|
| 312 |
+
}
|
| 313 |
+
}
|
generation_config.json
CHANGED
|
@@ -9,6 +9,6 @@
|
|
| 9 |
"pad_token_id": 0,
|
| 10 |
"top_k": 64,
|
| 11 |
"top_p": 0.95,
|
| 12 |
-
"transformers_version": "
|
| 13 |
-
"trust_remote_code":
|
| 14 |
}
|
|
|
|
| 9 |
"pad_token_id": 0,
|
| 10 |
"top_k": 64,
|
| 11 |
"top_p": 0.95,
|
| 12 |
+
"transformers_version": "5.3.0.dev0",
|
| 13 |
+
"trust_remote_code": false
|
| 14 |
}
|
onnx/audio_encoder.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:99d0c9ae5f59d629428e1b7b95679a99608fca45e3d4db5738f71f2b38e7d8c8
|
| 3 |
+
size 85829
|
onnx/audio_encoder.onnx_data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ad01ced648acb6d8be96d0b496090af96a7cf5e6ff6738ab7eb5f5b7993cc36e
|
| 3 |
+
size 1185280
|
onnx/audio_encoder_fp16.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6c3eef215db75dcca84489c384939b45371765ef42c221f9389d03837f907c96
|
| 3 |
+
size 88445
|
onnx/audio_encoder_fp16.onnx_data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:46bb776b1ddc24ffdda01b65c604ffa6818a36970b394d4cac2ceb46535384c2
|
| 3 |
+
size 591360
|
onnx/decoder_model_merged.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:23dad85daa5513a2fec1de07a5f29735beeeac6a987a191813147c4eb98f8e4f
|
| 3 |
+
size 221367
|
onnx/decoder_model_merged.onnx_data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:582a0f3703579ab00ba5c5d53be8433ec1e0a9e961500768f774d261288e7072
|
| 3 |
+
size 42157056
|
onnx/decoder_model_merged_fp16.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9753fd50a89b0da45648b5546e3a38fa7ff28befda15b01062500eb34aad665f
|
| 3 |
+
size 244912
|
onnx/decoder_model_merged_fp16.onnx_data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1efc35ba3fe0f28d52485f4ea064e6f1acd0833bd3dbe044614a3b06f5df24c9
|
| 3 |
+
size 21073920
|
onnx/embed_tokens.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:841e9b1cd993484b3eaf7fa6ecd70710cbfb3bc7826f15daab502fd42d22c268
|
| 3 |
+
size 2518
|
onnx/embed_tokens.onnx_data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3914f667ab7d56d508a3a7185c7dc58cb24eaf62989c4013e35b120813ca562b
|
| 3 |
+
size 41975808
|
onnx/embed_tokens_fp16.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fe288814cb8a3f8f28a3b05825921087ffbd7e603b7bccff47aa4d117fbe8fc2
|
| 3 |
+
size 2987
|
onnx/embed_tokens_fp16.onnx_data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a40d3d603e327ded01e83138127ed2426666da0a3e4dd83d55922562838a982f
|
| 3 |
+
size 20987904
|
onnx/vision_encoder.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:77de514a12de8b120542a24ca52e303a4fe881cc18c6d49586421d79c6b64d13
|
| 3 |
+
size 98161
|
onnx/vision_encoder.onnx_data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:138633e802dbe842c925eafacacdcf459d14ed241169fbcb1ccbeeaeb6ce0e83
|
| 3 |
+
size 2586496
|
onnx/vision_encoder_fp16.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bbff8e53a81de7ef0cc8c477f760061ec78cedfbe0868a8f0292c221c6db81c1
|
| 3 |
+
size 96373
|
onnx/vision_encoder_fp16.onnx_data
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:20d1d561e54868f967dd6c626cffb1fdbe9f2b1fda177a760f62bf2074510890
|
| 3 |
+
size 1294528
|
processor_config.json
CHANGED
|
@@ -1,5 +1,97 @@
|
|
| 1 |
{
|
| 2 |
"audio_seq_length": 188,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
"image_seq_length": 256,
|
| 4 |
"processor_class": "Gemma3nProcessor"
|
| 5 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"audio_seq_length": 188,
|
| 3 |
+
"feature_extractor": {
|
| 4 |
+
"crop_size": null,
|
| 5 |
+
"data_format": "channels_first",
|
| 6 |
+
"default_to_square": false,
|
| 7 |
+
"device": null,
|
| 8 |
+
"disable_grouping": null,
|
| 9 |
+
"dither": 0.0,
|
| 10 |
+
"do_center_crop": null,
|
| 11 |
+
"do_convert_rgb": null,
|
| 12 |
+
"do_normalize": false,
|
| 13 |
+
"do_rescale": true,
|
| 14 |
+
"do_resize": true,
|
| 15 |
+
"feature_extractor_type": "Gemma3nAudioFeatureExtractor",
|
| 16 |
+
"feature_size": 128,
|
| 17 |
+
"fft_length": 1024,
|
| 18 |
+
"fft_overdrive": true,
|
| 19 |
+
"frame_length": 512,
|
| 20 |
+
"hop_length": 160,
|
| 21 |
+
"image_mean": [
|
| 22 |
+
0.5,
|
| 23 |
+
0.5,
|
| 24 |
+
0.5
|
| 25 |
+
],
|
| 26 |
+
"image_processor_type": "SiglipImageProcessorFast",
|
| 27 |
+
"image_seq_length": 256,
|
| 28 |
+
"image_std": [
|
| 29 |
+
0.5,
|
| 30 |
+
0.5,
|
| 31 |
+
0.5
|
| 32 |
+
],
|
| 33 |
+
"input_data_format": null,
|
| 34 |
+
"input_scale_factor": 1.0,
|
| 35 |
+
"max_frequency": 7600.0,
|
| 36 |
+
"mel_floor": 1e-05,
|
| 37 |
+
"min_frequency": 125.0,
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"padding_value": 0.0,
|
| 40 |
+
"per_bin_mean": null,
|
| 41 |
+
"per_bin_stddev": null,
|
| 42 |
+
"preemphasis": 0.97,
|
| 43 |
+
"preemphasis_htk_flavor": true,
|
| 44 |
+
"resample": 2,
|
| 45 |
+
"rescale_factor": 0.00392156862745098,
|
| 46 |
+
"return_attention_mask": true,
|
| 47 |
+
"return_tensors": null,
|
| 48 |
+
"sampling_rate": 16000,
|
| 49 |
+
"size": {
|
| 50 |
+
"height": 768,
|
| 51 |
+
"width": 768
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"image_processor": {
|
| 55 |
+
"data_format": "channels_first",
|
| 56 |
+
"default_to_square": false,
|
| 57 |
+
"dither": 0.0,
|
| 58 |
+
"do_normalize": false,
|
| 59 |
+
"do_rescale": true,
|
| 60 |
+
"do_resize": true,
|
| 61 |
+
"feature_size": 128,
|
| 62 |
+
"fft_length": 1024,
|
| 63 |
+
"fft_overdrive": true,
|
| 64 |
+
"frame_length": 512,
|
| 65 |
+
"hop_length": 160,
|
| 66 |
+
"image_mean": [
|
| 67 |
+
0.5,
|
| 68 |
+
0.5,
|
| 69 |
+
0.5
|
| 70 |
+
],
|
| 71 |
+
"image_processor_type": "SiglipImageProcessorFast",
|
| 72 |
+
"image_seq_length": 256,
|
| 73 |
+
"image_std": [
|
| 74 |
+
0.5,
|
| 75 |
+
0.5,
|
| 76 |
+
0.5
|
| 77 |
+
],
|
| 78 |
+
"input_scale_factor": 1.0,
|
| 79 |
+
"max_frequency": 7600.0,
|
| 80 |
+
"mel_floor": 1e-05,
|
| 81 |
+
"min_frequency": 125.0,
|
| 82 |
+
"padding_side": "right",
|
| 83 |
+
"padding_value": 0.0,
|
| 84 |
+
"preemphasis": 0.97,
|
| 85 |
+
"preemphasis_htk_flavor": true,
|
| 86 |
+
"resample": 2,
|
| 87 |
+
"rescale_factor": 0.00392156862745098,
|
| 88 |
+
"return_attention_mask": true,
|
| 89 |
+
"sampling_rate": 16000,
|
| 90 |
+
"size": {
|
| 91 |
+
"height": 768,
|
| 92 |
+
"width": 768
|
| 93 |
+
}
|
| 94 |
+
},
|
| 95 |
"image_seq_length": 256,
|
| 96 |
"processor_class": "Gemma3nProcessor"
|
| 97 |
}
|
tokenizer.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:44cb3d7d545cf895311e004d9a2b2ce823be5eb84c9aa31f73858b607c44c924
|
| 3 |
+
size 20366294
|
tokenizer_config.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|