erfanzar committed on Apr 9

Commit

ff3f5d5

verified ·

1 Parent(s): de5130f

Add files using upload-large-folder tool

Browse files

Files changed (50) hide show

README.md +159 -0
checkpoint_metadata.json +6 -0
config.json +489 -0
generation_config.json +72 -0
model/model/embed_vision/embedding_projection/kernel/.zarray +1 -0
model/model/language_model/embed_tokens/embedding/.zarray +1 -0
model/model/language_model/layers/0/input_layernorm/kernel/.zarray +1 -0
model/model/language_model/layers/0/input_layernorm/kernel/0 +0 -0
model/model/language_model/layers/0/layer_scalar/.zarray +1 -0
model/model/language_model/layers/0/layer_scalar/0 +0 -0
model/model/language_model/layers/0/mlp/down_proj/kernel/.zarray +1 -0
model/model/language_model/layers/0/mlp/gate_proj/kernel/.zarray +1 -0
model/model/language_model/layers/0/mlp/up_proj/kernel/.zarray +1 -0
model/model/language_model/layers/0/post_attention_layernorm/kernel/.zarray +1 -0
model/model/language_model/layers/0/post_attention_layernorm/kernel/0 +0 -0
model/model/language_model/layers/0/post_feedforward_layernorm/kernel/.zarray +1 -0
model/model/language_model/layers/0/post_feedforward_layernorm/kernel/0 +0 -0
model/model/language_model/layers/0/pre_feedforward_layernorm/kernel/.zarray +1 -0
model/model/language_model/layers/0/pre_feedforward_layernorm/kernel/0 +0 -0
model/model/language_model/layers/0/self_attn/k_norm/kernel/.zarray +1 -0
model/model/language_model/layers/0/self_attn/k_norm/kernel/0 +0 -0
model/model/language_model/layers/0/self_attn/k_proj/kernel/.zarray +1 -0
model/model/language_model/layers/0/self_attn/o_proj/kernel/.zarray +1 -0
model/model/language_model/layers/0/self_attn/q_norm/kernel/.zarray +1 -0
model/model/language_model/layers/0/self_attn/q_norm/kernel/0 +0 -0
model/model/language_model/layers/0/self_attn/q_proj/kernel/.zarray +1 -0
model/model/language_model/layers/0/self_attn/v_proj/kernel/.zarray +1 -0
model/model/language_model/layers/1/input_layernorm/kernel/.zarray +1 -0
model/model/language_model/layers/1/input_layernorm/kernel/0 +0 -0
model/model/language_model/layers/1/layer_scalar/.zarray +1 -0
model/model/language_model/layers/1/layer_scalar/0 +0 -0
model/model/language_model/layers/1/mlp/down_proj/kernel/.zarray +1 -0
model/model/language_model/layers/1/mlp/gate_proj/kernel/.zarray +1 -0
model/model/language_model/layers/1/mlp/up_proj/kernel/.zarray +1 -0
model/model/language_model/layers/1/post_attention_layernorm/kernel/.zarray +1 -0
model/model/language_model/layers/1/post_feedforward_layernorm/kernel/.zarray +1 -0
model/model/language_model/layers/1/post_feedforward_layernorm/kernel/0 +0 -0
model/model/language_model/layers/1/pre_feedforward_layernorm/kernel/.zarray +1 -0
model/model/language_model/layers/1/pre_feedforward_layernorm/kernel/0 +0 -0
model/model/language_model/layers/1/self_attn/k_norm/kernel/.zarray +1 -0
model/model/language_model/layers/1/self_attn/k_norm/kernel/0 +0 -0
model/model/language_model/layers/1/self_attn/k_proj/kernel/.zarray +1 -0
model/model/language_model/layers/1/self_attn/q_norm/kernel/.zarray +1 -0
model/model/language_model/layers/1/self_attn/v_proj/kernel/.zarray +1 -0
model/model/language_model/layers/10/mlp/gate_proj/kernel/.zarray +1 -0
model/model/language_model/layers/10/mlp/up_proj/kernel/.zarray +1 -0
model/model/language_model/layers/13/mlp/down_proj/kernel/.zarray +1 -0
preprocessor_config.json +21 -0
tensorstore_index.json +0 -0
tokenizer_config.json +54 -0

README.md ADDED Viewed

	@@ -0,0 +1,159 @@

+---
+library_name: easydel
+pipeline_tag: image-text-to-text
+tags:
+  - easydel
+  - jax
+  - "gemma4"
+  - "ImageTextToText"
+  - "vanilla"
+---
+<p align="center">
+  <img alt="EasyDeL" src="https://raw.githubusercontent.com/erfanzar/easydel/main/images/easydel-logo-with-text.png" height="80">
+</p>
+<h1 align="center">google/gemma-4-31B</h1>
+<div align="center">
+  EasyDeL checkpoint converted from google/gemma-4-31B.
+</div>
+## Overview
+This checkpoint is intended to be loaded with EasyDeL on JAX (CPU/GPU/TPU). It supports sharded loading with `auto_shard_model=True` and configurable precision via `dtype`, `param_dtype`, and `precision`.
+## Quickstart
+```python
+import easydel as ed
+from jax import numpy as jnp, lax
+repo_id = "/dev/shm/conv/gemma-4-31B"  # local conversion path; replace with this repository's Hub ID or your own checkpoint directory
+dtype = jnp.bfloat16  # try jnp.float16 on many GPUs
+model = ed.AutoEasyDeLModelForImageTextToText.from_pretrained(
+    repo_id,
+    dtype=dtype,
+    param_dtype=dtype,
+    precision=lax.Precision("fastest"),
+    sharding_axis_names=("dp", "fsdp", "ep", "tp", "sp"),
+    sharding_axis_dims=(1, -1, 1, 1, 1),
+    config_kwargs=ed.EasyDeLBaseConfigDict(
+        attn_dtype=dtype,
+        attn_mechanism=ed.AttentionMechanisms.VANILLA,
+        fsdp_is_ep_bound=True,
+        sp_is_ep_bound=True,
+        moe_method=ed.MoEMethods.FUSED_MOE,
+    ),
+    auto_shard_model=True,
+    partition_axis=ed.PartitionAxis(),
+)
+```
+If the repository only provides PyTorch weights, pass `from_torch=True` to `from_pretrained(...)`.
+## Sharding & Parallelism (Multi-Device)
+EasyDeL can scale to multiple devices by creating a logical device mesh. Most EasyDeL loaders use a 5D mesh:
+- `dp`: data parallel (replicated parameters, different batch shards)
+- `fsdp`: parameter sharding (memory saver; often the biggest axis)
+- `ep`: expert parallel (MoE; keep `1` for non-MoE models)
+- `tp`: tensor parallel (splits large matmuls)
+- `sp`: sequence parallel (splits sequence dimension)
+Use `sharding_axis_names=("dp","fsdp","ep","tp","sp")` and choose `sharding_axis_dims` so that their product equals your device count.
+You can use `-1` in `sharding_axis_dims` to let EasyDeL infer the remaining dimension.
+<details>
+<summary>Example sharding configs</summary>
+```python
+# 8 devices, pure FSDP
+sharding_axis_dims = (1, 8, 1, 1, 1)
+# 8 devices, 2-way DP x 4-way FSDP
+sharding_axis_dims = (2, 4, 1, 1, 1)
+# 8 devices, 4-way FSDP x 2-way TP
+sharding_axis_dims = (1, 4, 1, 2, 1)
+```
+</details>
+## Using via `eLargeModel` (ELM)
+`eLargeModel` is a higher-level interface that wires together loading, sharding, training, and eSurge inference from a single config.
+```python
+from easydel import eLargeModel
+repo_id = "/dev/shm/conv/gemma-4-31B"  # local conversion path; replace with this repository's Hub ID or your own checkpoint directory
+elm = eLargeModel.from_pretrained(repo_id)  # task is auto-detected
+elm.set_dtype("bf16")
+elm.set_sharding(axis_names=("dp", "fsdp", "ep", "tp", "sp"), axis_dims=(1, -1, 1, 1, 1))
+model = elm.build_model()
+# Optional: build an inference engine
+# engine = elm.build_esurge()
+```
+<details>
+<summary>ELM YAML config example</summary>
+```yaml
+model:
+  name_or_path: "/dev/shm/conv/gemma-4-31B"  # local conversion path; replace with this repository's Hub ID or your own checkpoint directory
+loader:
+  dtype: bf16
+  param_dtype: bf16
+sharding:
+  axis_dims: [1, -1, 1, 1, 1]
+  auto_shard_model: true
+```
+</details>
+## Features
+**EasyDeL:**
+- JAX native implementation and sharded execution
+- Configurable attention backends via `AttentionMechanisms.*`
+- Precision control via `dtype`, `param_dtype`, and `precision`
+## Installation
+```bash
+pip install easydel
+```
+## Links
+- EasyDeL GitHub: https://github.com/erfanzar/EasyDeL
+- Docs: https://easydel.readthedocs.io/en/latest/
+## Supported Tasks
+- ImageTextToText
+## Limitations
+- Refer to the original model card for training data, evaluation, and intended use.
+## License
+EasyDeL is released under the Apache-2.0 license. The license for this model's weights may differ; please consult the original repository.
+## Citation
+```bibtex
+@misc{ZareChavoshi_2023,
+    title={EasyDeL: An open-source library for enhancing and streamlining the training process of machine learning models},
+    url={https://github.com/erfanzar/EasyDeL},
+    author={Zare Chavoshi, Erfan},
+    year={2023}
+}
+```

checkpoint_metadata.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "timestamp": "2026-04-09T04:25:04.408345",
+  "custom_metadata": {
+    "step": 0
+  }
+}

config.json ADDED Viewed

	@@ -0,0 +1,489 @@

+{
+  "_external_rope_config_kwargs": {},
+  "add_cross_attention": false,
+  "architectures": [
+    "Gemma4ForConditionalGeneration"
+  ],
+  "attn_mechanism": "vanilla",
+  "audio_config": null,
+  "audio_token_id": 258881,
+  "backend": null,
+  "bits": null,
+  "blocksize_b": 1,
+  "blocksize_k": 512,
+  "blocksize_q": 512,
+  "boa_token_id": 256000,
+  "boi_token_id": 255999,
+  "bos_token_id": null,
+  "cross_attention_hidden_size": null,
+  "decode_attn_mechanism": null,
+  "decoder_start_token_id": null,
+  "dtype": "bfloat16",
+  "easy_method": "train",
+  "eoa_token_id": 258883,
+  "eoa_token_index": 258883,
+  "eoi_token_id": 258882,
+  "eos_token_id": null,
+  "fcm_max_ratio": 0.0,
+  "fcm_min_ratio": 0.0,
+  "flash_attention_backward_pass_impl": "triton",
+  "fsdp_is_ep_bound": true,
+  "gradient_checkpointing": "",
+  "gradient_checkpointing_targets": null,
+  "hardware_abstraction": false,
+  "image_token_id": 258880,
+  "initializer_range": 0.02,
+  "is_decoder": false,
+  "kv_cache_quantization_config": null,
+  "kv_cache_sharding_sequence_axis_name": "sp",
+  "kvdtype": "bfloat16",
+  "lmhead_chunksize": null,
+  "max_position_embeddings": null,
+  "mla_attn_dtype": "bfloat16",
+  "mla_attn_mechanism": "auto",
+  "mla_attn_softmax_dtype": "float32",
+  "model_type": "gemma4",
+  "moe_force_xla_gmm": false,
+  "moe_method": "fused_moe",
+  "moe_tiling_size_batch": 4,
+  "moe_tiling_size_dim": 128,
+  "moe_tiling_size_seqlen": 128,
+  "operation_configs": null,
+  "pad_token_id": null,
+  "pallas_k_block_size": 128,
+  "pallas_m_block_size": 128,
+  "pallas_n_block_size": 128,
+  "partition_axis": {
+    "attention_dim_axis": null,
+    "attention_kv_dim_axis": null,
+    "batch_axis": [
+      "fsdp",
+      "dp"
+    ],
+    "bias_head_sequence_axis": null,
+    "bias_key_sequence_axis": null,
+    "data_parallel_axis": "dp",
+    "decode_attention_dim_axis": null,
+    "decode_attention_kv_dim_axis": null,
+    "decode_batch_axis": [
+      "fsdp",
+      "dp"
+    ],
+    "decode_head_axis": "tp",
+    "decode_key_sequence_axis": "sp",
+    "decode_kv_head_axis": "tp",
+    "decode_query_sequence_axis": null,
+    "expert_axis": "ep",
+    "expert_gate_axis": null,
+    "expert_parallel_axis": "ep",
+    "fully_sharded_data_parallel_axis": "fsdp",
+    "head_axis": "tp",
+    "hidden_state_axis": "tp",
+    "key_sequence_axis": "sp",
+    "kv_head_axis": "tp",
+    "mlp_intermediate_axis": "tp",
+    "query_sequence_axis": "sp",
+    "sequence_axis": "sp",
+    "sequence_parallel_axis": "sp",
+    "tensor_parallel_axis": "tp",
+    "vocab_axis": "tp"
+  },
+  "platform": null,
+  "precompute_masks": true,
+  "pretraining_tp": 1,
+  "qmm_platform_override": null,
+  "qmm_tpu_path_override": null,
+  "quantization_config": null,
+  "scan_attention_layers": false,
+  "scan_mlp_chunk_size": 1024,
+  "scan_ring_attention": true,
+  "sep_token_id": null,
+  "sequence_axis_name": "sp",
+  "sharding_axis_dims": [
+    1,
+    -1,
+    1,
+    1,
+    1
+  ],
+  "sharding_axis_names": [
+    "dp",
+    "fsdp",
+    "ep",
+    "tp",
+    "sp"
+  ],
+  "sharding_dcn_axis_dims": null,
+  "sp_is_ep_bound": true,
+  "text_config": {
+    "_external_rope_config_kwargs": {},
+    "add_cross_attention": false,
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attention_k_eq_v": true,
+    "attn_dtype": "bfloat16",
+    "attn_mechanism": "vanilla",
+    "attn_softmax_dtype": "float32",
+    "backend": null,
+    "bits": null,
+    "blocksize_b": 1,
+    "blocksize_k": 512,
+    "blocksize_q": 512,
+    "bos_token_id": 2,
+    "cross_attention_hidden_size": null,
+    "decode_attn_mechanism": null,
+    "decoder_start_token_id": null,
+    "dtype": "bfloat16",
+    "easy_method": "train",
+    "enable_moe_block": false,
+    "eos_token_id": 1,
+    "expert_intermediate_size": null,
+    "fcm_max_ratio": 0.0,
+    "fcm_min_ratio": 0.0,
+    "final_logit_softcapping": 30.0,
+    "flash_attention_backward_pass_impl": "triton",
+    "fsdp_is_ep_bound": true,
+    "global_head_dim": 512,
+    "gradient_checkpointing": "",
+    "gradient_checkpointing_targets": null,
+    "hardware_abstraction": false,
+    "head_dim": 256,
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 5376,
+    "hidden_size_per_layer_input": 0,
+    "initializer_range": 0.02,
+    "intermediate_size": 21504,
+    "is_decoder": false,
+    "kv_cache_quantization_config": null,
+    "kv_cache_sharding_sequence_axis_name": "sp",
+    "kvdtype": "bfloat16",
+    "layer_types": [
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention"
+    ],
+    "lmhead_chunksize": null,
+    "max_position_embeddings": 262144,
+    "mla_attn_dtype": "bfloat16",
+    "mla_attn_mechanism": "auto",
+    "mla_attn_softmax_dtype": "float32",
+    "model_type": "gemma4_text",
+    "moe_force_xla_gmm": false,
+    "moe_intermediate_size": null,
+    "moe_method": "fused_moe",
+    "moe_tiling_size_batch": 4,
+    "moe_tiling_size_dim": 128,
+    "moe_tiling_size_seqlen": 128,
+    "num_attention_heads": 32,
+    "num_experts": null,
+    "num_global_key_value_heads": 4,
+    "num_hidden_layers": 60,
+    "num_key_value_heads": 16,
+    "num_kv_shared_layers": 0,
+    "num_local_experts": null,
+    "operation_configs": null,
+    "pad_token_id": 0,
+    "pallas_k_block_size": 128,
+    "pallas_m_block_size": 128,
+    "pallas_n_block_size": 128,
+    "partition_axis": {
+      "attention_dim_axis": null,
+      "attention_kv_dim_axis": null,
+      "batch_axis": [
+        "fsdp",
+        "dp"
+      ],
+      "bias_head_sequence_axis": null,
+      "bias_key_sequence_axis": null,
+      "data_parallel_axis": "dp",
+      "decode_attention_dim_axis": null,
+      "decode_attention_kv_dim_axis": null,
+      "decode_batch_axis": [
+        "fsdp",
+        "dp"
+      ],
+      "decode_head_axis": "tp",
+      "decode_key_sequence_axis": "sp",
+      "decode_kv_head_axis": "tp",
+      "decode_query_sequence_axis": null,
+      "expert_axis": "ep",
+      "expert_gate_axis": null,
+      "expert_parallel_axis": "ep",
+      "fully_sharded_data_parallel_axis": "fsdp",
+      "head_axis": "tp",
+      "hidden_state_axis": "tp",
+      "key_sequence_axis": "sp",
+      "kv_head_axis": "tp",
+      "mlp_intermediate_axis": "tp",
+      "query_sequence_axis": "sp",
+      "sequence_axis": "sp",
+      "sequence_parallel_axis": "sp",
+      "tensor_parallel_axis": "tp",
+      "vocab_axis": "tp"
+    },
+    "platform": null,
+    "precompute_masks": true,
+    "pretraining_tp": 1,
+    "qmm_platform_override": null,
+    "qmm_tpu_path_override": null,
+    "quantization_config": null,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "full_attention": {
+        "partial_rotary_factor": 0.25,
+        "rope_theta": 1000000.0,
+        "rope_type": "proportional",
+        "type": "proportional"
+      },
+      "sliding_attention": {
+        "rope_theta": 10000.0,
+        "rope_type": "default",
+        "type": "default"
+      }
+    },
+    "scan_attention_layers": false,
+    "scan_layers": false,
+    "scan_mlp_chunk_size": 1024,
+    "scan_ring_attention": true,
+    "sep_token_id": null,
+    "sequence_axis_name": "sp",
+    "sharding_axis_dims": [
+      1,
+      -1,
+      1,
+      1,
+      1
+    ],
+    "sharding_axis_names": [
+      "dp",
+      "fsdp",
+      "ep",
+      "tp",
+      "sp"
+    ],
+    "sharding_dcn_axis_dims": null,
+    "sliding_window": 1024,
+    "sp_is_ep_bound": true,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "top_k_experts": null,
+    "use_bidirectional_attention": "vision",
+    "use_cache": true,
+    "use_double_wide_mlp": false,
+    "use_expert_tensor_mode": false,
+    "use_qmm_best_config": false,
+    "use_ring_of_experts": false,
+    "use_scan_mlp": false,
+    "use_sharded_kv_caching": false,
+    "use_sharding_constraint": false,
+    "vocab_size": 262144,
+    "vocab_size_per_layer_input": 262144
+  },
+  "tie_encoder_decoder": false,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.5.0",
+  "use_expert_tensor_mode": false,
+  "use_qmm_best_config": false,
+  "use_ring_of_experts": false,
+  "use_scan_mlp": false,
+  "use_sharded_kv_caching": false,
+  "use_sharding_constraint": false,
+  "video_token_id": 258884,
+  "vision_config": {
+    "_external_rope_config_kwargs": {},
+    "add_cross_attention": false,
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attn_dtype": "bfloat16",
+    "attn_mechanism": "vanilla",
+    "attn_softmax_dtype": "float32",
+    "backend": null,
+    "bits": null,
+    "blocksize_b": 1,
+    "blocksize_k": 512,
+    "blocksize_q": 512,
+    "bos_token_id": null,
+    "cross_attention_hidden_size": null,
+    "decode_attn_mechanism": null,
+    "decoder_start_token_id": null,
+    "default_output_length": 280,
+    "dtype": "bfloat16",
+    "easy_method": "train",
+    "eos_token_id": null,
+    "fcm_max_ratio": 0.0,
+    "fcm_min_ratio": 0.0,
+    "flash_attention_backward_pass_impl": "triton",
+    "fsdp_is_ep_bound": true,
+    "global_head_dim": 72,
+    "gradient_checkpointing": "",
+    "gradient_checkpointing_targets": null,
+    "hardware_abstraction": false,
+    "head_dim": 72,
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "initializer_range": 0.02,
+    "intermediate_size": 4304,
+    "is_decoder": false,
+    "kv_cache_quantization_config": null,
+    "kv_cache_sharding_sequence_axis_name": "sp",
+    "kvdtype": "bfloat16",
+    "lmhead_chunksize": null,
+    "max_position_embeddings": 131072,
+    "mla_attn_dtype": "bfloat16",
+    "mla_attn_mechanism": "auto",
+    "mla_attn_softmax_dtype": "float32",
+    "model_type": "gemma4_vision",
+    "moe_force_xla_gmm": false,
+    "moe_method": "fused_moe",
+    "moe_tiling_size_batch": 4,
+    "moe_tiling_size_dim": 128,
+    "moe_tiling_size_seqlen": 128,
+    "num_attention_heads": 16,
+    "num_hidden_layers": 27,
+    "num_key_value_heads": 16,
+    "operation_configs": null,
+    "pad_token_id": null,
+    "pallas_k_block_size": 128,
+    "pallas_m_block_size": 128,
+    "pallas_n_block_size": 128,
+    "partition_axis": {
+      "attention_dim_axis": null,
+      "attention_kv_dim_axis": null,
+      "batch_axis": [
+        "fsdp",
+        "dp"
+      ],
+      "bias_head_sequence_axis": null,
+      "bias_key_sequence_axis": null,
+      "data_parallel_axis": "dp",
+      "decode_attention_dim_axis": null,
+      "decode_attention_kv_dim_axis": null,
+      "decode_batch_axis": [
+        "fsdp",
+        "dp"
+      ],
+      "decode_head_axis": "tp",
+      "decode_key_sequence_axis": "sp",
+      "decode_kv_head_axis": "tp",
+      "decode_query_sequence_axis": null,
+      "expert_axis": "ep",
+      "expert_gate_axis": null,
+      "expert_parallel_axis": "ep",
+      "fully_sharded_data_parallel_axis": "fsdp",
+      "head_axis": "tp",
+      "hidden_state_axis": "tp",
+      "key_sequence_axis": "sp",
+      "kv_head_axis": "tp",
+      "mlp_intermediate_axis": "tp",
+      "query_sequence_axis": "sp",
+      "sequence_axis": "sp",
+      "sequence_parallel_axis": "sp",
+      "tensor_parallel_axis": "tp",
+      "vocab_axis": "tp"
+    },
+    "patch_size": 16,
+    "platform": null,
+    "pooling_kernel_size": 3,
+    "position_embedding_size": 10240,
+    "precompute_masks": true,
+    "pretraining_tp": 1,
+    "qmm_platform_override": null,
+    "qmm_tpu_path_override": null,
+    "quantization_config": null,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "rope_theta": 100.0,
+      "rope_type": "default",
+      "type": "default"
+    },
+    "scan_attention_layers": false,
+    "scan_mlp_chunk_size": 1024,
+    "scan_ring_attention": true,
+    "sep_token_id": null,
+    "sequence_axis_name": "sp",
+    "sharding_axis_dims": [
+      1,
+      -1,
+      1,
+      1,
+      1
+    ],
+    "sharding_axis_names": [
+      "dp",
+      "fsdp",
+      "ep",
+      "tp",
+      "sp"
+    ],
+    "sharding_dcn_axis_dims": null,
+    "sp_is_ep_bound": true,
+    "standardize": true,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "use_clipped_linears": false,
+    "use_expert_tensor_mode": false,
+    "use_qmm_best_config": false,
+    "use_ring_of_experts": false,
+    "use_scan_mlp": false,
+    "use_sharded_kv_caching": false,
+    "use_sharding_constraint": false
+  },
+  "vision_soft_tokens_per_image": 280
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,72 @@

+{
+  "_from_model_config": null,
+  "assistant_confidence_threshold": null,
+  "assistant_early_exit": null,
+  "assistant_lookbehind": null,
+  "bad_words_ids": null,
+  "begin_suppress_tokens": null,
+  "bos_token_id": 2,
+  "cache_config": null,
+  "cache_implementation": null,
+  "compile_config": null,
+  "constraints": null,
+  "continuous_batching_config": null,
+  "decoder_start_token_id": null,
+  "disable_compile": null,
+  "diversity_penalty": null,
+  "do_sample": true,
+  "dola_layers": null,
+  "early_stopping": null,
+  "encoder_no_repeat_ngram_size": null,
+  "encoder_repetition_penalty": null,
+  "eos_token_id": 1,
+  "epsilon_cutoff": null,
+  "eta_cutoff": null,
+  "exponential_decay_length_penalty": null,
+  "force_words_ids": null,
+  "forced_bos_token_id": null,
+  "forced_eos_token_id": null,
+  "guidance_scale": null,
+  "is_assistant": null,
+  "length_penalty": null,
+  "low_memory": null,
+  "max_length": null,
+  "max_matching_ngram_size": null,
+  "max_new_tokens": null,
+  "max_time": null,
+  "min_length": null,
+  "min_new_tokens": null,
+  "min_p": null,
+  "no_repeat_ngram_size": null,
+  "num_assistant_tokens": null,
+  "num_assistant_tokens_schedule": null,
+  "num_beam_groups": null,
+  "num_beams": null,
+  "num_return_sequences": null,
+  "output_attentions": null,
+  "output_hidden_states": null,
+  "output_logits": null,
+  "output_scores": null,
+  "pad_token_id": 0,
+  "penalty_alpha": null,
+  "prefill_chunk_size": null,
+  "prompt_lookup_num_tokens": null,
+  "remove_invalid_values": null,
+  "renormalize_logits": null,
+  "repetition_penalty": null,
+  "return_dict_in_generate": null,
+  "sequence_bias": null,
+  "stop_strings": null,
+  "suppress_tokens": null,
+  "target_lookbehind": null,
+  "temperature": 1.0,
+  "token_healing": null,
+  "top_h": null,
+  "top_k": 64,
+  "top_p": 0.95,
+  "transformers_version": "5.5.0",
+  "trust_remote_code": false,
+  "typical_p": null,
+  "use_cache": null,
+  "watermarking_config": null
+}

model/model/embed_vision/embedding_projection/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[1152,5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1152,5376],"zarr_format":2}

model/model/language_model/embed_tokens/embedding/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[65536,5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[262144,5376],"zarr_format":2}

model/model/language_model/layers/0/input_layernorm/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376],"zarr_format":2}

model/model/language_model/layers/0/input_layernorm/kernel/0 ADDED Viewed

Binary file (6.09 kB). View file

model/model/language_model/layers/0/layer_scalar/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[1],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1],"zarr_format":2}

model/model/language_model/layers/0/layer_scalar/0 ADDED Viewed

Binary file (11 Bytes). View file

model/model/language_model/layers/0/mlp/down_proj/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[21504,1344],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[21504,5376],"zarr_format":2}

model/model/language_model/layers/0/mlp/gate_proj/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[1344,21504],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,21504],"zarr_format":2}

model/model/language_model/layers/0/mlp/up_proj/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[1344,21504],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,21504],"zarr_format":2}

model/model/language_model/layers/0/post_attention_layernorm/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376],"zarr_format":2}

model/model/language_model/layers/0/post_attention_layernorm/kernel/0 ADDED Viewed

Binary file (6.39 kB). View file

model/model/language_model/layers/0/post_feedforward_layernorm/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376],"zarr_format":2}

model/model/language_model/layers/0/post_feedforward_layernorm/kernel/0 ADDED Viewed

Binary file (7.01 kB). View file

model/model/language_model/layers/0/pre_feedforward_layernorm/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376],"zarr_format":2}

model/model/language_model/layers/0/pre_feedforward_layernorm/kernel/0 ADDED Viewed

Binary file (6.04 kB). View file

model/model/language_model/layers/0/self_attn/k_norm/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[256],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[256],"zarr_format":2}

model/model/language_model/layers/0/self_attn/k_norm/kernel/0 ADDED Viewed

Binary file (18 Bytes). View file

model/model/language_model/layers/0/self_attn/k_proj/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[1344,4096],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,4096],"zarr_format":2}

model/model/language_model/layers/0/self_attn/o_proj/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[8192,1344],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[8192,5376],"zarr_format":2}

model/model/language_model/layers/0/self_attn/q_norm/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[256],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[256],"zarr_format":2}

model/model/language_model/layers/0/self_attn/q_norm/kernel/0 ADDED Viewed

Binary file (18 Bytes). View file

model/model/language_model/layers/0/self_attn/q_proj/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[1344,8192],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,8192],"zarr_format":2}

model/model/language_model/layers/0/self_attn/v_proj/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[1344,4096],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,4096],"zarr_format":2}

model/model/language_model/layers/1/input_layernorm/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376],"zarr_format":2}

model/model/language_model/layers/1/input_layernorm/kernel/0 ADDED Viewed

Binary file (6.12 kB). View file

model/model/language_model/layers/1/layer_scalar/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[1],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1],"zarr_format":2}

model/model/language_model/layers/1/layer_scalar/0 ADDED Viewed

Binary file (11 Bytes). View file

model/model/language_model/layers/1/mlp/down_proj/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[21504,1344],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[21504,5376],"zarr_format":2}

model/model/language_model/layers/1/mlp/gate_proj/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[1344,21504],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,21504],"zarr_format":2}

model/model/language_model/layers/1/mlp/up_proj/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[1344,21504],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,21504],"zarr_format":2}

model/model/language_model/layers/1/post_attention_layernorm/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376],"zarr_format":2}

model/model/language_model/layers/1/post_feedforward_layernorm/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376],"zarr_format":2}

model/model/language_model/layers/1/post_feedforward_layernorm/kernel/0 ADDED Viewed

Binary file (6.78 kB). View file

model/model/language_model/layers/1/pre_feedforward_layernorm/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[5376],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376],"zarr_format":2}

model/model/language_model/layers/1/pre_feedforward_layernorm/kernel/0 ADDED Viewed

Binary file (6.72 kB). View file

model/model/language_model/layers/1/self_attn/k_norm/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[256],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[256],"zarr_format":2}

model/model/language_model/layers/1/self_attn/k_norm/kernel/0 ADDED Viewed

Binary file (18 Bytes). View file

model/model/language_model/layers/1/self_attn/k_proj/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[1344,4096],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,4096],"zarr_format":2}

model/model/language_model/layers/1/self_attn/q_norm/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[256],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[256],"zarr_format":2}

model/model/language_model/layers/1/self_attn/v_proj/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[1344,4096],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,4096],"zarr_format":2}

model/model/language_model/layers/10/mlp/gate_proj/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[1344,21504],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,21504],"zarr_format":2}

model/model/language_model/layers/10/mlp/up_proj/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[1344,21504],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[5376,21504],"zarr_format":2}

model/model/language_model/layers/13/mlp/down_proj/kernel/.zarray ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"chunks":[21504,1344],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[21504,5376],"zarr_format":2}

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "dither": 0.0,
+  "feature_extractor_type": "Gemma4AudioFeatureExtractor",
+  "feature_size": 128,
+  "fft_length": 512,
+  "fft_overdrive": false,
+  "frame_length": 320,
+  "hop_length": 160,
+  "input_scale_factor": 1.0,
+  "max_frequency": 8000.0,
+  "mel_floor": 0.001,
+  "min_frequency": 0.0,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "per_bin_mean": null,
+  "per_bin_stddev": null,
+  "preemphasis": 0.0,
+  "preemphasis_htk_flavor": true,
+  "return_attention_mask": true,
+  "sampling_rate": 16000
+}

tensorstore_index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,54 @@

+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<eos>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "is_local": false,
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "model_specific_special_tokens": {
+    "audio_token": "<|audio|>",
+    "boa_token": "<|audio>",
+    "boi_token": "<|image>",
+    "eoa_token": "<audio|>",
+    "eoc_token": "<channel|>",
+    "eoi_token": "<image|>",
+    "eot_token": "<turn|>",
+    "escape_token": "<|\"|>",
+    "etc_token": "<tool_call|>",
+    "etd_token": "<tool|>",
+    "etr_token": "<tool_response|>",
+    "image_token": "<|image|>",
+    "soc_token": "<|channel>",
+    "sot_token": "<|turn>",
+    "stc_token": "<|tool_call>",
+    "std_token": "<|tool>",
+    "str_token": "<|tool_response>",
+    "think_token": "<|think|>"
+  },
+  "pad_token": "<pad>",
+  "padding_side": "left",
+  "processor_class": "Gemma4Processor",
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>"
+}