erfanzar committed on
Commit
ff3f5d5
·
verified ·
1 Parent(s): de5130f

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. README.md +159 -0
  2. checkpoint_metadata.json +6 -0
  3. config.json +489 -0
  4. generation_config.json +72 -0
  5. model/model/embed_vision/embedding_projection/kernel/.zarray +1 -0
  6. model/model/language_model/embed_tokens/embedding/.zarray +1 -0
  7. model/model/language_model/layers/0/input_layernorm/kernel/.zarray +1 -0
  8. model/model/language_model/layers/0/input_layernorm/kernel/0 +0 -0
  9. model/model/language_model/layers/0/layer_scalar/.zarray +1 -0
  10. model/model/language_model/layers/0/layer_scalar/0 +0 -0
  11. model/model/language_model/layers/0/mlp/down_proj/kernel/.zarray +1 -0
  12. model/model/language_model/layers/0/mlp/gate_proj/kernel/.zarray +1 -0
  13. model/model/language_model/layers/0/mlp/up_proj/kernel/.zarray +1 -0
  14. model/model/language_model/layers/0/post_attention_layernorm/kernel/.zarray +1 -0
  15. model/model/language_model/layers/0/post_attention_layernorm/kernel/0 +0 -0
  16. model/model/language_model/layers/0/post_feedforward_layernorm/kernel/.zarray +1 -0
  17. model/model/language_model/layers/0/post_feedforward_layernorm/kernel/0 +0 -0
  18. model/model/language_model/layers/0/pre_feedforward_layernorm/kernel/.zarray +1 -0
  19. model/model/language_model/layers/0/pre_feedforward_layernorm/kernel/0 +0 -0
  20. model/model/language_model/layers/0/self_attn/k_norm/kernel/.zarray +1 -0
  21. model/model/language_model/layers/0/self_attn/k_norm/kernel/0 +0 -0
  22. model/model/language_model/layers/0/self_attn/k_proj/kernel/.zarray +1 -0
  23. model/model/language_model/layers/0/self_attn/o_proj/kernel/.zarray +1 -0
  24. model/model/language_model/layers/0/self_attn/q_norm/kernel/.zarray +1 -0
  25. model/model/language_model/layers/0/self_attn/q_norm/kernel/0 +0 -0
  26. model/model/language_model/layers/0/self_attn/q_proj/kernel/.zarray +1 -0
  27. model/model/language_model/layers/0/self_attn/v_proj/kernel/.zarray +1 -0
  28. model/model/language_model/layers/1/input_layernorm/kernel/.zarray +1 -0
  29. model/model/language_model/layers/1/input_layernorm/kernel/0 +0 -0
  30. model/model/language_model/layers/1/layer_scalar/.zarray +1 -0
  31. model/model/language_model/layers/1/layer_scalar/0 +0 -0
  32. model/model/language_model/layers/1/mlp/down_proj/kernel/.zarray +1 -0
  33. model/model/language_model/layers/1/mlp/gate_proj/kernel/.zarray +1 -0
  34. model/model/language_model/layers/1/mlp/up_proj/kernel/.zarray +1 -0
  35. model/model/language_model/layers/1/post_attention_layernorm/kernel/.zarray +1 -0
  36. model/model/language_model/layers/1/post_feedforward_layernorm/kernel/.zarray +1 -0
  37. model/model/language_model/layers/1/post_feedforward_layernorm/kernel/0 +0 -0
  38. model/model/language_model/layers/1/pre_feedforward_layernorm/kernel/.zarray +1 -0
  39. model/model/language_model/layers/1/pre_feedforward_layernorm/kernel/0 +0 -0
  40. model/model/language_model/layers/1/self_attn/k_norm/kernel/.zarray +1 -0
  41. model/model/language_model/layers/1/self_attn/k_norm/kernel/0 +0 -0
  42. model/model/language_model/layers/1/self_attn/k_proj/kernel/.zarray +1 -0
  43. model/model/language_model/layers/1/self_attn/q_norm/kernel/.zarray +1 -0
  44. model/model/language_model/layers/1/self_attn/v_proj/kernel/.zarray +1 -0
  45. model/model/language_model/layers/10/mlp/gate_proj/kernel/.zarray +1 -0
  46. model/model/language_model/layers/10/mlp/up_proj/kernel/.zarray +1 -0
  47. model/model/language_model/layers/13/mlp/down_proj/kernel/.zarray +1 -0
  48. preprocessor_config.json +21 -0
  49. tensorstore_index.json +0 -0
  50. tokenizer_config.json +54 -0
README.md ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: easydel
3
+ pipeline_tag: image-text-to-text
4
+ tags:
5
+ - easydel
6
+ - jax
7
+ - "gemma4"
8
+ - "ImageTextToText"
9
+ - "vanilla"
10
+ ---
11
+
12
+ <p align="center">
13
+ <img alt="EasyDeL" src="https://raw.githubusercontent.com/erfanzar/easydel/main/images/easydel-logo-with-text.png" height="80">
14
+ </p>
15
+
16
+ <h1 align="center">google/gemma-4-31B</h1>
17
+
18
+ <div align="center">
19
+ EasyDeL checkpoint converted from google/gemma-4-31B.
20
+ </div>
21
+
22
+ ## Overview
23
+
24
+ This checkpoint is intended to be loaded with EasyDeL on JAX (CPU/GPU/TPU). It supports sharded loading with `auto_shard_model=True` and configurable precision via `dtype`, `param_dtype`, and `precision`.
25
+
26
+ ## Quickstart
27
+
28
+ ```python
29
+ import easydel as ed
30
+ from jax import numpy as jnp, lax
31
+
32
+ repo_id = "/dev/shm/conv/gemma-4-31B"  # local conversion path; replace with this repository's Hub id or your own checkpoint directory
33
+
34
+ dtype = jnp.bfloat16 # try jnp.float16 on many GPUs
35
+
36
+ model = ed.AutoEasyDeLModelForImageTextToText.from_pretrained(
37
+ repo_id,
38
+ dtype=dtype,
39
+ param_dtype=dtype,
40
+ precision=lax.Precision("fastest"),
41
+ sharding_axis_names=("dp", "fsdp", "ep", "tp", "sp"),
42
+ sharding_axis_dims=(1, -1, 1, 1, 1),
43
+ config_kwargs=ed.EasyDeLBaseConfigDict(
44
+ attn_dtype=dtype,
45
+ attn_mechanism=ed.AttentionMechanisms.VANILLA,
46
+ fsdp_is_ep_bound=True,
47
+ sp_is_ep_bound=True,
48
+ moe_method=ed.MoEMethods.FUSED_MOE,
49
+ ),
50
+ auto_shard_model=True,
51
+ partition_axis=ed.PartitionAxis(),
52
+ )
53
+ ```
54
+
55
+ If the repository only provides PyTorch weights, pass `from_torch=True` to `from_pretrained(...)`.
56
+
57
+ ## Sharding & Parallelism (Multi-Device)
58
+
59
+ EasyDeL can scale to multiple devices by creating a logical device mesh. Most EasyDeL loaders use a 5D mesh:
60
+
61
+ - `dp`: data parallel (replicated parameters, different batch shards)
62
+ - `fsdp`: fully sharded data parallel (shards parameters across devices to save memory; often the largest axis)
63
+ - `ep`: expert parallel (MoE; keep `1` for non-MoE models)
64
+ - `tp`: tensor parallel (splits large matmuls)
65
+ - `sp`: sequence parallel (splits sequence dimension)
66
+
67
+ Use `sharding_axis_names=("dp","fsdp","ep","tp","sp")` and choose `sharding_axis_dims` so that their product equals your device count.
68
+ You can use `-1` in `sharding_axis_dims` to let EasyDeL infer the remaining dimension.
69
+
70
+ <details>
71
+ <summary>Example sharding configs</summary>
72
+
73
+ ```python
74
+ # 8 devices, pure FSDP
75
+ sharding_axis_dims = (1, 8, 1, 1, 1)
76
+
77
+ # 8 devices, 2-way DP x 4-way FSDP
78
+ sharding_axis_dims = (2, 4, 1, 1, 1)
79
+
80
+ # 8 devices, 4-way FSDP x 2-way TP
81
+ sharding_axis_dims = (1, 4, 1, 2, 1)
82
+ ```
83
+ </details>
84
+
85
+ ## Using via `eLargeModel` (ELM)
86
+
87
+ `eLargeModel` is a higher-level interface that wires together loading, sharding, training, and eSurge inference from a single config.
88
+
89
+ ```python
90
+ from easydel import eLargeModel
91
+
92
+ repo_id = "/dev/shm/conv/gemma-4-31B"  # local conversion path; replace with this repository's Hub id or your own checkpoint directory
93
+
94
+ elm = eLargeModel.from_pretrained(repo_id) # task is auto-detected
95
+ elm.set_dtype("bf16")
96
+ elm.set_sharding(axis_names=("dp", "fsdp", "ep", "tp", "sp"), axis_dims=(1, -1, 1, 1, 1))
97
+
98
+ model = elm.build_model()
99
+ # Optional: build an inference engine
100
+ # engine = elm.build_esurge()
101
+ ```
102
+
103
+ <details>
104
+ <summary>ELM YAML config example</summary>
105
+
106
+ ```yaml
107
+ model:
108
+ name_or_path: "/dev/shm/conv/gemma-4-31B"  # local conversion path; replace with this repository's Hub id or your own checkpoint directory
109
+
110
+ loader:
111
+ dtype: bf16
112
+ param_dtype: bf16
113
+
114
+ sharding:
115
+ axis_dims: [1, -1, 1, 1, 1]
116
+ auto_shard_model: true
117
+ ```
118
+ </details>
119
+
120
+ ## Features
121
+
122
+ **EasyDeL:**
123
+ - JAX native implementation and sharded execution
124
+ - Configurable attention backends via `AttentionMechanisms.*`
125
+ - Precision control via `dtype`, `param_dtype`, and `precision`
126
+
127
+ ## Installation
128
+
129
+ ```bash
130
+ pip install easydel
131
+ ```
132
+
133
+ ## Links
134
+
135
+ - EasyDeL GitHub: https://github.com/erfanzar/EasyDeL
136
+ - Docs: https://easydel.readthedocs.io/en/latest/
137
+
138
+ ## Supported Tasks
139
+
140
+ - ImageTextToText
141
+
142
+ ## Limitations
143
+
144
+ - Refer to the original model card for training data, evaluation, and intended use.
145
+
146
+ ## License
147
+
148
+ EasyDeL is released under the Apache-2.0 license. The license for this model's weights may differ; please consult the original repository.
149
+
150
+ ## Citation
151
+
152
+ ```bibtex
153
+ @misc{ZareChavoshi_2023,
154
+ title={EasyDeL: An open-source library for enhancing and streamlining the training process of machine learning models},
155
+ url={https://github.com/erfanzar/EasyDeL},
156
+ author={Zare Chavoshi, Erfan},
157
+ year={2023}
158
+ }
159
+ ```
checkpoint_metadata.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2026-04-09T04:25:04.408345",
3
+ "custom_metadata": {
4
+ "step": 0
5
+ }
6
+ }
config.json ADDED
@@ -0,0 +1,489 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_external_rope_config_kwargs": {},
3
+ "add_cross_attention": false,
4
+ "architectures": [
5
+ "Gemma4ForConditionalGeneration"
6
+ ],
7
+ "attn_mechanism": "vanilla",
8
+ "audio_config": null,
9
+ "audio_token_id": 258881,
10
+ "backend": null,
11
+ "bits": null,
12
+ "blocksize_b": 1,
13
+ "blocksize_k": 512,
14
+ "blocksize_q": 512,
15
+ "boa_token_id": 256000,
16
+ "boi_token_id": 255999,
17
+ "bos_token_id": null,
18
+ "cross_attention_hidden_size": null,
19
+ "decode_attn_mechanism": null,
20
+ "decoder_start_token_id": null,
21
+ "dtype": "bfloat16",
22
+ "easy_method": "train",
23
+ "eoa_token_id": 258883,
24
+ "eoa_token_index": 258883,
25
+ "eoi_token_id": 258882,
26
+ "eos_token_id": null,
27
+ "fcm_max_ratio": 0.0,
28
+ "fcm_min_ratio": 0.0,
29
+ "flash_attention_backward_pass_impl": "triton",
30
+ "fsdp_is_ep_bound": true,
31
+ "gradient_checkpointing": "",
32
+ "gradient_checkpointing_targets": null,
33
+ "hardware_abstraction": false,
34
+ "image_token_id": 258880,
35
+ "initializer_range": 0.02,
36
+ "is_decoder": false,
37
+ "kv_cache_quantization_config": null,
38
+ "kv_cache_sharding_sequence_axis_name": "sp",
39
+ "kvdtype": "bfloat16",
40
+ "lmhead_chunksize": null,
41
+ "max_position_embeddings": null,
42
+ "mla_attn_dtype": "bfloat16",
43
+ "mla_attn_mechanism": "auto",
44
+ "mla_attn_softmax_dtype": "float32",
45
+ "model_type": "gemma4",
46
+ "moe_force_xla_gmm": false,
47
+ "moe_method": "fused_moe",
48
+ "moe_tiling_size_batch": 4,
49
+ "moe_tiling_size_dim": 128,
50
+ "moe_tiling_size_seqlen": 128,
51
+ "operation_configs": null,
52
+ "pad_token_id": null,
53
+ "pallas_k_block_size": 128,
54
+ "pallas_m_block_size": 128,
55
+ "pallas_n_block_size": 128,
56
+ "partition_axis": {
57
+ "attention_dim_axis": null,
58
+ "attention_kv_dim_axis": null,
59
+ "batch_axis": [
60
+ "fsdp",
61
+ "dp"
62
+ ],
63
+ "bias_head_sequence_axis": null,
64
+ "bias_key_sequence_axis": null,
65
+ "data_parallel_axis": "dp",
66
+ "decode_attention_dim_axis": null,
67
+ "decode_attention_kv_dim_axis": null,
68
+ "decode_batch_axis": [
69
+ "fsdp",
70
+ "dp"
71
+ ],
72
+ "decode_head_axis": "tp",
73
+ "decode_key_sequence_axis": "sp",
74
+ "decode_kv_head_axis": "tp",
75
+ "decode_query_sequence_axis": null,
76
+ "expert_axis": "ep",
77
+ "expert_gate_axis": null,
78
+ "expert_parallel_axis": "ep",
79
+ "fully_sharded_data_parallel_axis": "fsdp",
80
+ "head_axis": "tp",
81
+ "hidden_state_axis": "tp",
82
+ "key_sequence_axis": "sp",
83
+ "kv_head_axis": "tp",
84
+ "mlp_intermediate_axis": "tp",
85
+ "query_sequence_axis": "sp",
86
+ "sequence_axis": "sp",
87
+ "sequence_parallel_axis": "sp",
88
+ "tensor_parallel_axis": "tp",
89
+ "vocab_axis": "tp"
90
+ },
91
+ "platform": null,
92
+ "precompute_masks": true,
93
+ "pretraining_tp": 1,
94
+ "qmm_platform_override": null,
95
+ "qmm_tpu_path_override": null,
96
+ "quantization_config": null,
97
+ "scan_attention_layers": false,
98
+ "scan_mlp_chunk_size": 1024,
99
+ "scan_ring_attention": true,
100
+ "sep_token_id": null,
101
+ "sequence_axis_name": "sp",
102
+ "sharding_axis_dims": [
103
+ 1,
104
+ -1,
105
+ 1,
106
+ 1,
107
+ 1
108
+ ],
109
+ "sharding_axis_names": [
110
+ "dp",
111
+ "fsdp",
112
+ "ep",
113
+ "tp",
114
+ "sp"
115
+ ],
116
+ "sharding_dcn_axis_dims": null,
117
+ "sp_is_ep_bound": true,
118
+ "text_config": {
119
+ "_external_rope_config_kwargs": {},
120
+ "add_cross_attention": false,
121
+ "attention_bias": false,
122
+ "attention_dropout": 0.0,
123
+ "attention_k_eq_v": true,
124
+ "attn_dtype": "bfloat16",
125
+ "attn_mechanism": "vanilla",
126
+ "attn_softmax_dtype": "float32",
127
+ "backend": null,
128
+ "bits": null,
129
+ "blocksize_b": 1,
130
+ "blocksize_k": 512,
131
+ "blocksize_q": 512,
132
+ "bos_token_id": 2,
133
+ "cross_attention_hidden_size": null,
134
+ "decode_attn_mechanism": null,
135
+ "decoder_start_token_id": null,
136
+ "dtype": "bfloat16",
137
+ "easy_method": "train",
138
+ "enable_moe_block": false,
139
+ "eos_token_id": 1,
140
+ "expert_intermediate_size": null,
141
+ "fcm_max_ratio": 0.0,
142
+ "fcm_min_ratio": 0.0,
143
+ "final_logit_softcapping": 30.0,
144
+ "flash_attention_backward_pass_impl": "triton",
145
+ "fsdp_is_ep_bound": true,
146
+ "global_head_dim": 512,
147
+ "gradient_checkpointing": "",
148
+ "gradient_checkpointing_targets": null,
149
+ "hardware_abstraction": false,
150
+ "head_dim": 256,
151
+ "hidden_activation": "gelu_pytorch_tanh",
152
+ "hidden_size": 5376,
153
+ "hidden_size_per_layer_input": 0,
154
+ "initializer_range": 0.02,
155
+ "intermediate_size": 21504,
156
+ "is_decoder": false,
157
+ "kv_cache_quantization_config": null,
158
+ "kv_cache_sharding_sequence_axis_name": "sp",
159
+ "kvdtype": "bfloat16",
160
+ "layer_types": [
161
+ "sliding_attention",
162
+ "sliding_attention",
163
+ "sliding_attention",
164
+ "sliding_attention",
165
+ "sliding_attention",
166
+ "full_attention",
167
+ "sliding_attention",
168
+ "sliding_attention",
169
+ "sliding_attention",
170
+ "sliding_attention",
171
+ "sliding_attention",
172
+ "full_attention",
173
+ "sliding_attention",
174
+ "sliding_attention",
175
+ "sliding_attention",
176
+ "sliding_attention",
177
+ "sliding_attention",
178
+ "full_attention",
179
+ "sliding_attention",
180
+ "sliding_attention",
181
+ "sliding_attention",
182
+ "sliding_attention",
183
+ "sliding_attention",
184
+ "full_attention",
185
+ "sliding_attention",
186
+ "sliding_attention",
187
+ "sliding_attention",
188
+ "sliding_attention",
189
+ "sliding_attention",
190
+ "full_attention",
191
+ "sliding_attention",
192
+ "sliding_attention",
193
+ "sliding_attention",
194
+ "sliding_attention",
195
+ "sliding_attention",
196
+ "full_attention",
197
+ "sliding_attention",
198
+ "sliding_attention",
199
+ "sliding_attention",
200
+ "sliding_attention",
201
+ "sliding_attention",
202
+ "full_attention",
203
+ "sliding_attention",
204
+ "sliding_attention",
205
+ "sliding_attention",
206
+ "sliding_attention",
207
+ "sliding_attention",
208
+ "full_attention",
209
+ "sliding_attention",
210
+ "sliding_attention",
211
+ "sliding_attention",
212
+ "sliding_attention",
213
+ "sliding_attention",
214
+ "full_attention",
215
+ "sliding_attention",
216
+ "sliding_attention",
217
+ "sliding_attention",
218
+ "sliding_attention",
219
+ "sliding_attention",
220
+ "full_attention"
221
+ ],
222
+ "lmhead_chunksize": null,
223
+ "max_position_embeddings": 262144,
224
+ "mla_attn_dtype": "bfloat16",
225
+ "mla_attn_mechanism": "auto",
226
+ "mla_attn_softmax_dtype": "float32",
227
+ "model_type": "gemma4_text",
228
+ "moe_force_xla_gmm": false,
229
+ "moe_intermediate_size": null,
230
+ "moe_method": "fused_moe",
231
+ "moe_tiling_size_batch": 4,
232
+ "moe_tiling_size_dim": 128,
233
+ "moe_tiling_size_seqlen": 128,
234
+ "num_attention_heads": 32,
235
+ "num_experts": null,
236
+ "num_global_key_value_heads": 4,
237
+ "num_hidden_layers": 60,
238
+ "num_key_value_heads": 16,
239
+ "num_kv_shared_layers": 0,
240
+ "num_local_experts": null,
241
+ "operation_configs": null,
242
+ "pad_token_id": 0,
243
+ "pallas_k_block_size": 128,
244
+ "pallas_m_block_size": 128,
245
+ "pallas_n_block_size": 128,
246
+ "partition_axis": {
247
+ "attention_dim_axis": null,
248
+ "attention_kv_dim_axis": null,
249
+ "batch_axis": [
250
+ "fsdp",
251
+ "dp"
252
+ ],
253
+ "bias_head_sequence_axis": null,
254
+ "bias_key_sequence_axis": null,
255
+ "data_parallel_axis": "dp",
256
+ "decode_attention_dim_axis": null,
257
+ "decode_attention_kv_dim_axis": null,
258
+ "decode_batch_axis": [
259
+ "fsdp",
260
+ "dp"
261
+ ],
262
+ "decode_head_axis": "tp",
263
+ "decode_key_sequence_axis": "sp",
264
+ "decode_kv_head_axis": "tp",
265
+ "decode_query_sequence_axis": null,
266
+ "expert_axis": "ep",
267
+ "expert_gate_axis": null,
268
+ "expert_parallel_axis": "ep",
269
+ "fully_sharded_data_parallel_axis": "fsdp",
270
+ "head_axis": "tp",
271
+ "hidden_state_axis": "tp",
272
+ "key_sequence_axis": "sp",
273
+ "kv_head_axis": "tp",
274
+ "mlp_intermediate_axis": "tp",
275
+ "query_sequence_axis": "sp",
276
+ "sequence_axis": "sp",
277
+ "sequence_parallel_axis": "sp",
278
+ "tensor_parallel_axis": "tp",
279
+ "vocab_axis": "tp"
280
+ },
281
+ "platform": null,
282
+ "precompute_masks": true,
283
+ "pretraining_tp": 1,
284
+ "qmm_platform_override": null,
285
+ "qmm_tpu_path_override": null,
286
+ "quantization_config": null,
287
+ "rms_norm_eps": 1e-06,
288
+ "rope_parameters": {
289
+ "full_attention": {
290
+ "partial_rotary_factor": 0.25,
291
+ "rope_theta": 1000000.0,
292
+ "rope_type": "proportional",
293
+ "type": "proportional"
294
+ },
295
+ "sliding_attention": {
296
+ "rope_theta": 10000.0,
297
+ "rope_type": "default",
298
+ "type": "default"
299
+ }
300
+ },
301
+ "scan_attention_layers": false,
302
+ "scan_layers": false,
303
+ "scan_mlp_chunk_size": 1024,
304
+ "scan_ring_attention": true,
305
+ "sep_token_id": null,
306
+ "sequence_axis_name": "sp",
307
+ "sharding_axis_dims": [
308
+ 1,
309
+ -1,
310
+ 1,
311
+ 1,
312
+ 1
313
+ ],
314
+ "sharding_axis_names": [
315
+ "dp",
316
+ "fsdp",
317
+ "ep",
318
+ "tp",
319
+ "sp"
320
+ ],
321
+ "sharding_dcn_axis_dims": null,
322
+ "sliding_window": 1024,
323
+ "sp_is_ep_bound": true,
324
+ "tie_encoder_decoder": false,
325
+ "tie_word_embeddings": true,
326
+ "top_k_experts": null,
327
+ "use_bidirectional_attention": "vision",
328
+ "use_cache": true,
329
+ "use_double_wide_mlp": false,
330
+ "use_expert_tensor_mode": false,
331
+ "use_qmm_best_config": false,
332
+ "use_ring_of_experts": false,
333
+ "use_scan_mlp": false,
334
+ "use_sharded_kv_caching": false,
335
+ "use_sharding_constraint": false,
336
+ "vocab_size": 262144,
337
+ "vocab_size_per_layer_input": 262144
338
+ },
339
+ "tie_encoder_decoder": false,
340
+ "tie_word_embeddings": true,
341
+ "transformers_version": "5.5.0",
342
+ "use_expert_tensor_mode": false,
343
+ "use_qmm_best_config": false,
344
+ "use_ring_of_experts": false,
345
+ "use_scan_mlp": false,
346
+ "use_sharded_kv_caching": false,
347
+ "use_sharding_constraint": false,
348
+ "video_token_id": 258884,
349
+ "vision_config": {
350
+ "_external_rope_config_kwargs": {},
351
+ "add_cross_attention": false,
352
+ "attention_bias": false,
353
+ "attention_dropout": 0.0,
354
+ "attn_dtype": "bfloat16",
355
+ "attn_mechanism": "vanilla",
356
+ "attn_softmax_dtype": "float32",
357
+ "backend": null,
358
+ "bits": null,
359
+ "blocksize_b": 1,
360
+ "blocksize_k": 512,
361
+ "blocksize_q": 512,
362
+ "bos_token_id": null,
363
+ "cross_attention_hidden_size": null,
364
+ "decode_attn_mechanism": null,
365
+ "decoder_start_token_id": null,
366
+ "default_output_length": 280,
367
+ "dtype": "bfloat16",
368
+ "easy_method": "train",
369
+ "eos_token_id": null,
370
+ "fcm_max_ratio": 0.0,
371
+ "fcm_min_ratio": 0.0,
372
+ "flash_attention_backward_pass_impl": "triton",
373
+ "fsdp_is_ep_bound": true,
374
+ "global_head_dim": 72,
375
+ "gradient_checkpointing": "",
376
+ "gradient_checkpointing_targets": null,
377
+ "hardware_abstraction": false,
378
+ "head_dim": 72,
379
+ "hidden_activation": "gelu_pytorch_tanh",
380
+ "hidden_size": 1152,
381
+ "initializer_range": 0.02,
382
+ "intermediate_size": 4304,
383
+ "is_decoder": false,
384
+ "kv_cache_quantization_config": null,
385
+ "kv_cache_sharding_sequence_axis_name": "sp",
386
+ "kvdtype": "bfloat16",
387
+ "lmhead_chunksize": null,
388
+ "max_position_embeddings": 131072,
389
+ "mla_attn_dtype": "bfloat16",
390
+ "mla_attn_mechanism": "auto",
391
+ "mla_attn_softmax_dtype": "float32",
392
+ "model_type": "gemma4_vision",
393
+ "moe_force_xla_gmm": false,
394
+ "moe_method": "fused_moe",
395
+ "moe_tiling_size_batch": 4,
396
+ "moe_tiling_size_dim": 128,
397
+ "moe_tiling_size_seqlen": 128,
398
+ "num_attention_heads": 16,
399
+ "num_hidden_layers": 27,
400
+ "num_key_value_heads": 16,
401
+ "operation_configs": null,
402
+ "pad_token_id": null,
403
+ "pallas_k_block_size": 128,
404
+ "pallas_m_block_size": 128,
405
+ "pallas_n_block_size": 128,
406
+ "partition_axis": {
407
+ "attention_dim_axis": null,
408
+ "attention_kv_dim_axis": null,
409
+ "batch_axis": [
410
+ "fsdp",
411
+ "dp"
412
+ ],
413
+ "bias_head_sequence_axis": null,
414
+ "bias_key_sequence_axis": null,
415
+ "data_parallel_axis": "dp",
416
+ "decode_attention_dim_axis": null,
417
+ "decode_attention_kv_dim_axis": null,
418
+ "decode_batch_axis": [
419
+ "fsdp",
420
+ "dp"
421
+ ],
422
+ "decode_head_axis": "tp",
423
+ "decode_key_sequence_axis": "sp",
424
+ "decode_kv_head_axis": "tp",
425
+ "decode_query_sequence_axis": null,
426
+ "expert_axis": "ep",
427
+ "expert_gate_axis": null,
428
+ "expert_parallel_axis": "ep",
429
+ "fully_sharded_data_parallel_axis": "fsdp",
430
+ "head_axis": "tp",
431
+ "hidden_state_axis": "tp",
432
+ "key_sequence_axis": "sp",
433
+ "kv_head_axis": "tp",
434
+ "mlp_intermediate_axis": "tp",
435
+ "query_sequence_axis": "sp",
436
+ "sequence_axis": "sp",
437
+ "sequence_parallel_axis": "sp",
438
+ "tensor_parallel_axis": "tp",
439
+ "vocab_axis": "tp"
440
+ },
441
+ "patch_size": 16,
442
+ "platform": null,
443
+ "pooling_kernel_size": 3,
444
+ "position_embedding_size": 10240,
445
+ "precompute_masks": true,
446
+ "pretraining_tp": 1,
447
+ "qmm_platform_override": null,
448
+ "qmm_tpu_path_override": null,
449
+ "quantization_config": null,
450
+ "rms_norm_eps": 1e-06,
451
+ "rope_parameters": {
452
+ "rope_theta": 100.0,
453
+ "rope_type": "default",
454
+ "type": "default"
455
+ },
456
+ "scan_attention_layers": false,
457
+ "scan_mlp_chunk_size": 1024,
458
+ "scan_ring_attention": true,
459
+ "sep_token_id": null,
460
+ "sequence_axis_name": "sp",
461
+ "sharding_axis_dims": [
462
+ 1,
463
+ -1,
464
+ 1,
465
+ 1,
466
+ 1
467
+ ],
468
+ "sharding_axis_names": [
469
+ "dp",
470
+ "fsdp",
471
+ "ep",
472
+ "tp",
473
+ "sp"
474
+ ],
475
+ "sharding_dcn_axis_dims": null,
476
+ "sp_is_ep_bound": true,
477
+ "standardize": true,
478
+ "tie_encoder_decoder": false,
479
+ "tie_word_embeddings": true,
480
+ "use_clipped_linears": false,
481
+ "use_expert_tensor_mode": false,
482
+ "use_qmm_best_config": false,
483
+ "use_ring_of_experts": false,
484
+ "use_scan_mlp": false,
485
+ "use_sharded_kv_caching": false,
486
+ "use_sharding_constraint": false
487
+ },
488
+ "vision_soft_tokens_per_image": 280
489
+ }
generation_config.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": null,
3
+ "assistant_confidence_threshold": null,
4
+ "assistant_early_exit": null,
5
+ "assistant_lookbehind": null,
6
+ "bad_words_ids": null,
7
+ "begin_suppress_tokens": null,
8
+ "bos_token_id": 2,
9
+ "cache_config": null,
10
+ "cache_implementation": null,
11
+ "compile_config": null,
12
+ "constraints": null,
13
+ "continuous_batching_config": null,
14
+ "decoder_start_token_id": null,
15
+ "disable_compile": null,
16
+ "diversity_penalty": null,
17
+ "do_sample": true,
18
+ "dola_layers": null,
19
+ "early_stopping": null,
20
+ "encoder_no_repeat_ngram_size": null,
21
+ "encoder_repetition_penalty": null,
22
+ "eos_token_id": 1,
23
+ "epsilon_cutoff": null,
24
+ "eta_cutoff": null,
25
+ "exponential_decay_length_penalty": null,
26
+ "force_words_ids": null,
27
+ "forced_bos_token_id": null,
28
+ "forced_eos_token_id": null,
29
+ "guidance_scale": null,
30
+ "is_assistant": null,
31
+ "length_penalty": null,
32
+ "low_memory": null,
33
+ "max_length": null,
34
+ "max_matching_ngram_size": null,
35
+ "max_new_tokens": null,
36
+ "max_time": null,
37
+ "min_length": null,
38
+ "min_new_tokens": null,
39
+ "min_p": null,
40
+ "no_repeat_ngram_size": null,
41
+ "num_assistant_tokens": null,
42
+ "num_assistant_tokens_schedule": null,
43
+ "num_beam_groups": null,
44
+ "num_beams": null,
45
+ "num_return_sequences": null,
46
+ "output_attentions": null,
47
+ "output_hidden_states": null,
48
+ "output_logits": null,
49
+ "output_scores": null,
50
+ "pad_token_id": 0,
51
+ "penalty_alpha": null,
52
+ "prefill_chunk_size": null,
53
+ "prompt_lookup_num_tokens": null,
54
+ "remove_invalid_values": null,
55
+ "renormalize_logits": null,
56
+ "repetition_penalty": null,
57
+ "return_dict_in_generate": null,
58
+ "sequence_bias": null,
59
+ "stop_strings": null,
60
+ "suppress_tokens": null,
61
+ "target_lookbehind": null,
62
+ "temperature": 1.0,
63
+ "token_healing": null,
64
+ "top_h": null,
65
+ "top_k": 64,
66
+ "top_p": 0.95,
67
+ "transformers_version": "5.5.0",
68
+ "trust_remote_code": false,
69
+ "typical_p": null,
70
+ "use_cache": null,
71
+ "watermarking_config": null
72
+ }
model/model/embed_vision/embedding_projection/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1152,5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1152,5376],"zarr_format":2}
model/model/language_model/embed_tokens/embedding/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[65536,5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[262144,5376],"zarr_format":2}
model/model/language_model/layers/0/input_layernorm/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376],"zarr_format":2}
model/model/language_model/layers/0/input_layernorm/kernel/0 ADDED
Binary file (6.09 kB). View file
 
model/model/language_model/layers/0/layer_scalar/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1],"zarr_format":2}
model/model/language_model/layers/0/layer_scalar/0 ADDED
Binary file (11 Bytes). View file
 
model/model/language_model/layers/0/mlp/down_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[21504,1344],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[21504,5376],"zarr_format":2}
model/model/language_model/layers/0/mlp/gate_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1344,21504],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,21504],"zarr_format":2}
model/model/language_model/layers/0/mlp/up_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1344,21504],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,21504],"zarr_format":2}
model/model/language_model/layers/0/post_attention_layernorm/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376],"zarr_format":2}
model/model/language_model/layers/0/post_attention_layernorm/kernel/0 ADDED
Binary file (6.39 kB). View file
 
model/model/language_model/layers/0/post_feedforward_layernorm/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376],"zarr_format":2}
model/model/language_model/layers/0/post_feedforward_layernorm/kernel/0 ADDED
Binary file (7.01 kB). View file
 
model/model/language_model/layers/0/pre_feedforward_layernorm/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376],"zarr_format":2}
model/model/language_model/layers/0/pre_feedforward_layernorm/kernel/0 ADDED
Binary file (6.04 kB). View file
 
model/model/language_model/layers/0/self_attn/k_norm/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[256],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[256],"zarr_format":2}
model/model/language_model/layers/0/self_attn/k_norm/kernel/0 ADDED
Binary file (18 Bytes). View file
 
model/model/language_model/layers/0/self_attn/k_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1344,4096],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,4096],"zarr_format":2}
model/model/language_model/layers/0/self_attn/o_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[8192,1344],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[8192,5376],"zarr_format":2}
model/model/language_model/layers/0/self_attn/q_norm/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[256],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[256],"zarr_format":2}
model/model/language_model/layers/0/self_attn/q_norm/kernel/0 ADDED
Binary file (18 Bytes). View file
 
model/model/language_model/layers/0/self_attn/q_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1344,8192],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,8192],"zarr_format":2}
model/model/language_model/layers/0/self_attn/v_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1344,4096],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,4096],"zarr_format":2}
model/model/language_model/layers/1/input_layernorm/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376],"zarr_format":2}
model/model/language_model/layers/1/input_layernorm/kernel/0 ADDED
Binary file (6.12 kB). View file
 
model/model/language_model/layers/1/layer_scalar/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1],"zarr_format":2}
model/model/language_model/layers/1/layer_scalar/0 ADDED
Binary file (11 Bytes). View file
 
model/model/language_model/layers/1/mlp/down_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[21504,1344],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[21504,5376],"zarr_format":2}
model/model/language_model/layers/1/mlp/gate_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1344,21504],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,21504],"zarr_format":2}
model/model/language_model/layers/1/mlp/up_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1344,21504],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,21504],"zarr_format":2}
model/model/language_model/layers/1/post_attention_layernorm/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376],"zarr_format":2}
model/model/language_model/layers/1/post_feedforward_layernorm/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376],"zarr_format":2}
model/model/language_model/layers/1/post_feedforward_layernorm/kernel/0 ADDED
Binary file (6.78 kB). View file
 
model/model/language_model/layers/1/pre_feedforward_layernorm/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376],"zarr_format":2}
model/model/language_model/layers/1/pre_feedforward_layernorm/kernel/0 ADDED
Binary file (6.72 kB). View file
 
model/model/language_model/layers/1/self_attn/k_norm/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[256],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[256],"zarr_format":2}
model/model/language_model/layers/1/self_attn/k_norm/kernel/0 ADDED
Binary file (18 Bytes). View file
 
model/model/language_model/layers/1/self_attn/k_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1344,4096],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,4096],"zarr_format":2}
model/model/language_model/layers/1/self_attn/q_norm/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[256],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[256],"zarr_format":2}
model/model/language_model/layers/1/self_attn/v_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1344,4096],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,4096],"zarr_format":2}
model/model/language_model/layers/10/mlp/gate_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1344,21504],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,21504],"zarr_format":2}
model/model/language_model/layers/10/mlp/up_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[1344,21504],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,21504],"zarr_format":2}
model/model/language_model/layers/13/mlp/down_proj/kernel/.zarray ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chunks":[21504,1344],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[21504,5376],"zarr_format":2}
preprocessor_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dither": 0.0,
3
+ "feature_extractor_type": "Gemma4AudioFeatureExtractor",
4
+ "feature_size": 128,
5
+ "fft_length": 512,
6
+ "fft_overdrive": false,
7
+ "frame_length": 320,
8
+ "hop_length": 160,
9
+ "input_scale_factor": 1.0,
10
+ "max_frequency": 8000.0,
11
+ "mel_floor": 0.001,
12
+ "min_frequency": 0.0,
13
+ "padding_side": "right",
14
+ "padding_value": 0.0,
15
+ "per_bin_mean": null,
16
+ "per_bin_stddev": null,
17
+ "preemphasis": 0.0,
18
+ "preemphasis_htk_flavor": true,
19
+ "return_attention_mask": true,
20
+ "sampling_rate": 16000
21
+ }
tensorstore_index.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<eos>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": false,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "left",
45
+ "processor_class": "Gemma4Processor",
46
+ "soc_token": "<|channel>",
47
+ "sot_token": "<|turn>",
48
+ "stc_token": "<|tool_call>",
49
+ "std_token": "<|tool>",
50
+ "str_token": "<|tool_response>",
51
+ "think_token": "<|think|>",
52
+ "tokenizer_class": "GemmaTokenizer",
53
+ "unk_token": "<unk>"
54
+ }