diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b2992a9536cf20e9f164510346c9aa3e6ac3126b
--- /dev/null
+++ b/README.md
@@ -0,0 +1,159 @@
+---
+library_name: easydel
+pipeline_tag: image-text-to-text
+tags:
+ - easydel
+ - jax
+ - "qwen3_5"
+ - "ImageTextToText"
+ - "vanilla"
+---
+
+
+
+
+
+# Qwen/Qwen3.5-0.8B-Base
+
+
+ EasyDeL checkpoint converted from Qwen/Qwen3.5-0.8B-Base.
+
+
+## Overview
+
+This checkpoint is intended to be loaded with EasyDeL on JAX (CPU/GPU/TPU). It supports sharded loading with `auto_shard_model=True` and configurable precision via `dtype`, `param_dtype`, and `precision`.
+
+## Quickstart
+
+```python
+import easydel as ed
+from jax import numpy as jnp, lax
+
+repo_id = "/dev/shm/conv/Qwen3.5-0.8B-Base"  # local conversion path; replace with this checkpoint's Hub repo id or your own local path
+
+dtype = jnp.bfloat16 # try jnp.float16 on many GPUs
+
+model = ed.AutoEasyDeLModelForImageTextToText.from_pretrained(
+ repo_id,
+ dtype=dtype,
+ param_dtype=dtype,
+ precision=lax.Precision("fastest"),
+ sharding_axis_names=("dp", "fsdp", "ep", "tp", "sp"),
+ sharding_axis_dims=(1, -1, 1, 1, 1),
+ config_kwargs=ed.EasyDeLBaseConfigDict(
+ attn_dtype=dtype,
+ attn_mechanism=ed.AttentionMechanisms.VANILLA,
+ fsdp_is_ep_bound=True,
+ sp_is_ep_bound=True,
+ moe_method=ed.MoEMethods.FUSED_MOE,
+ ),
+ auto_shard_model=True,
+ partition_axis=ed.PartitionAxis(),
+)
+```
+
+If the repository only provides PyTorch weights, pass `from_torch=True` to `from_pretrained(...)`.
+
+## Sharding & Parallelism (Multi-Device)
+
+EasyDeL can scale to multiple devices by creating a logical device mesh. Most EasyDeL loaders use a 5D mesh:
+
+- `dp`: data parallel (replicated parameters, different batch shards)
+- `fsdp`: parameter sharding (memory saver; often the biggest axis)
+- `ep`: expert parallel (MoE; keep `1` for non-MoE models)
+- `tp`: tensor parallel (splits large matmuls)
+- `sp`: sequence parallel (splits sequence dimension)
+
+Use `sharding_axis_names=("dp","fsdp","ep","tp","sp")` and choose `sharding_axis_dims` so that their product equals your device count.
+You can use `-1` in `sharding_axis_dims` to let EasyDeL infer the remaining dimension.
+
+
+### Example sharding configs
+
+```python
+# 8 devices, pure FSDP
+sharding_axis_dims = (1, 8, 1, 1, 1)
+
+# 8 devices, 2-way DP x 4-way FSDP
+sharding_axis_dims = (2, 4, 1, 1, 1)
+
+# 8 devices, 4-way FSDP x 2-way TP
+sharding_axis_dims = (1, 4, 1, 2, 1)
+```
+
+
+## Using via `eLargeModel` (ELM)
+
+`eLargeModel` is a higher-level interface that wires together loading, sharding, training, and eSurge inference from a single config.
+
+```python
+from easydel import eLargeModel
+
+repo_id = "/dev/shm/conv/Qwen3.5-0.8B-Base"  # local conversion path; replace with this checkpoint's Hub repo id or your own local path
+
+elm = eLargeModel.from_pretrained(repo_id) # task is auto-detected
+elm.set_dtype("bf16")
+elm.set_sharding(axis_names=("dp", "fsdp", "ep", "tp", "sp"), axis_dims=(1, -1, 1, 1, 1))
+
+model = elm.build_model()
+# Optional: build an inference engine
+# engine = elm.build_esurge()
+```
+
+
+### ELM YAML config example
+
+```yaml
+model:
+ name_or_path: "/dev/shm/conv/Qwen3.5-0.8B-Base"  # local conversion path; replace with this checkpoint's Hub repo id or your own local path
+
+loader:
+ dtype: bf16
+ param_dtype: bf16
+
+sharding:
+ axis_dims: [1, -1, 1, 1, 1]
+ auto_shard_model: true
+```
+
+
+## Features
+
+**EasyDeL:**
+- JAX native implementation and sharded execution
+- Configurable attention backends via `AttentionMechanisms.*`
+- Precision control via `dtype`, `param_dtype`, and `precision`
+
+## Installation
+
+```bash
+pip install easydel
+```
+
+## Links
+
+- EasyDeL GitHub: https://github.com/erfanzar/EasyDeL
+- Docs: https://easydel.readthedocs.io/en/latest/
+
+## Supported Tasks
+
+- ImageTextToText
+
+## Limitations
+
+- Refer to the original model card for training data, evaluation, and intended use.
+
+## License
+
+EasyDeL is released under the Apache-2.0 license. The license for this model's weights may differ; please consult the original repository.
+
+## Citation
+
+```bibtex
+@misc{ZareChavoshi_2023,
+ title={EasyDeL: An open-source library for enhancing and streamlining the training process of machine learning models},
+ url={https://github.com/erfanzar/EasyDeL},
+ author={Zare Chavoshi, Erfan},
+ year={2023}
+}
+```
diff --git a/chat_template.jinja b/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..0ef09f214eaa6d9bca297988afc1454b5827b2c7
--- /dev/null
+++ b/chat_template.jinja
@@ -0,0 +1,154 @@
+{%- set image_count = namespace(value=0) %}{#- Running image counter; held in a namespace so render_content can update it from inside its loop. -#}
+{%- set video_count = namespace(value=0) %}{#- Running video counter, same mechanism as image_count. -#}
+{%- macro render_content(content, do_vision_count, is_system_content=false) %}{#- Render one message's content to text. A string is emitted as-is; a list of parts is emitted item by item, with image/video parts replaced by vision placeholder tokens. do_vision_count: when true, increment the image/video counters. is_system_content: when true, image/video parts raise an error. -#}
+ {%- if content is string %}
+ {{- content }}
+ {%- elif content is iterable and content is not mapping %}
+ {%- for item in content %}
+ {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
+ {%- if is_system_content %}
+ {{- raise_exception('System message cannot contain images.') }}
+ {%- endif %}
+ {%- if do_vision_count %}
+ {%- set image_count.value = image_count.value + 1 %}
+ {%- endif %}
+ {%- if add_vision_id %}{#- optional 'Picture N: ' label emitted before the image placeholder -#}
+ {{- 'Picture ' ~ image_count.value ~ ': ' }}
+ {%- endif %}
+ {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
+ {%- elif 'video' in item or item.type == 'video' %}
+ {%- if is_system_content %}
+ {{- raise_exception('System message cannot contain videos.') }}
+ {%- endif %}
+ {%- if do_vision_count %}
+ {%- set video_count.value = video_count.value + 1 %}
+ {%- endif %}
+ {%- if add_vision_id %}{#- optional 'Video N: ' label emitted before the video placeholder -#}
+ {{- 'Video ' ~ video_count.value ~ ': ' }}
+ {%- endif %}
+ {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
+ {%- elif 'text' in item %}
+ {{- item.text }}
+ {%- else %}
+ {{- raise_exception('Unexpected item type in content.') }}
+ {%- endif %}
+ {%- endfor %}
+ {%- elif content is none or content is undefined %}{#- missing content renders as an empty string -#}
+ {{- '' }}
+ {%- else %}
+ {{- raise_exception('Unexpected content type.') }}
+ {%- endif %}
+{%- endmacro %}
+{%- if not messages %}{#- An empty conversation cannot be rendered. -#}
+    {{- raise_exception('No messages provided.') }}
+{%- endif %}
+{%- if tools and tools is iterable and tools is not mapping %}{#- With tools: open a system turn that lists each tool as JSON inside <tools>...</tools>, then the tool-call format instructions, then the caller's own system text (if any). -#}
+    {{- '<|im_start|>system\n' }}
+    {{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>" }}
+    {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
+    {%- if messages[0].role == 'system' %}
+        {%- set content = render_content(messages[0].content, false, true)|trim %}
+        {%- if content %}
+            {{- '\n\n' + content }}
+        {%- endif %}
+    {%- endif %}
+    {{- '<|im_end|>\n' }}
+{%- else %}{#- Without tools: emit a system turn only when the first message is a system message. -#}
+    {%- if messages[0].role == 'system' %}
+        {%- set content = render_content(messages[0].content, false, true)|trim %}
+        {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}{#- Scan the messages in reverse for the last genuine user query, i.e. a user turn whose text is not entirely a <tool_response>...</tool_response> wrapper. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" %}
+        {%- set content = render_content(message.content, false)|trim %}
+        {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}{#- the markers must be non-empty: startswith('') and endswith('') are always true, which would make every conversation fail below -#}
+            {%- set ns.multi_step_tool = false %}
+            {%- set ns.last_query_index = index %}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if ns.multi_step_tool %}{#- still true means no genuine user query was found -#}
+    {{- raise_exception('No user query found in messages.') }}
+{%- endif %}
+{%- for message in messages %}{#- Render every turn in order; the system message was already emitted above, so here it is only checked for position. -#}
+    {%- set content = render_content(message.content, true)|trim %}
+    {%- if message.role == "system" %}
+        {%- if not loop.first %}
+            {{- raise_exception('System message must be at the beginning.') }}
+        {%- endif %}
+    {%- elif message.role == "user" %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}{#- prefer an explicit reasoning_content field; otherwise split an inline <think>...</think> block out of the content -#}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- set reasoning_content = reasoning_content|trim %}
+        {%- if loop.index0 > ns.last_query_index %}{#- only assistant turns after the last user query keep their reasoning block; earlier turns are emitted without it -#}
+            {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if tool_call.function is defined %}{#- accept both the {function: {...}} wrapper and a flat tool-call object -#}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {%- if loop.first %}
+                    {%- if content|trim %}
+                        {{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                    {%- else %}
+                        {{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                    {%- endif %}
+                {%- else %}
+                    {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+                {%- endif %}
+                {%- if tool_call.arguments is defined %}
+                    {%- for args_name, args_value in tool_call.arguments|items %}
+                        {{- '<parameter=' + args_name + '>\n' }}
+                        {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
+                        {{- args_value }}
+                        {{- '\n</parameter>\n' }}
+                    {%- endfor %}
+                {%- endif %}
+                {{- '</function>\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}{#- consecutive tool results are grouped into a single user turn, each wrapped in <tool_response>...</tool_response> -#}
+        {%- if loop.previtem and loop.previtem.role != "tool" %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if not loop.last and loop.nextitem.role != "tool" %}
+            {{- '<|im_end|>\n' }}
+        {%- elif loop.last %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- else %}
+        {{- raise_exception('Unexpected message role.') }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}{#- Open the assistant turn. Thinking is opt-in: only enable_thinking=true leaves the <think> block open for the model to fill; otherwise an empty, already-closed <think></think> block is emitted. -#}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is true %}
+        {{- '<think>\n' }}
+    {%- else %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e49425bb06b520c2b26ab2e9c3f5985822a14a44
--- /dev/null
+++ b/config.json
@@ -0,0 +1,473 @@
+{
+ "_external_rope_config_kwargs": {},
+ "add_cross_attention": false,
+ "architectures": [
+ "Qwen3_5ForConditionalGeneration"
+ ],
+ "attn_mechanism": "vanilla",
+ "backend": null,
+ "bits": null,
+ "blocksize_b": 1,
+ "blocksize_k": 512,
+ "blocksize_q": 512,
+ "bos_token_id": null,
+ "cross_attention_hidden_size": null,
+ "decode_attn_mechanism": null,
+ "decoder_start_token_id": null,
+ "easy_method": "train",
+ "eos_token_id": null,
+ "fcm_max_ratio": 0.0,
+ "fcm_min_ratio": 0.0,
+ "flash_attention_backward_pass_impl": "triton",
+ "fsdp_is_ep_bound": true,
+ "gradient_checkpointing": "",
+ "gradient_checkpointing_targets": null,
+ "hardware_abstraction": false,
+ "image_token_id": 248056,
+ "is_decoder": false,
+ "kv_cache_quantization_config": null,
+ "kv_cache_sharding_sequence_axis_name": "sp",
+ "kvdtype": "bfloat16",
+ "lmhead_chunksize": null,
+ "max_position_embeddings": null,
+ "mla_attn_dtype": "bfloat16",
+ "mla_attn_mechanism": "auto",
+ "mla_attn_softmax_dtype": "float32",
+ "model_type": "qwen3_5",
+ "moe_force_xla_gmm": false,
+ "moe_method": "fused_moe",
+ "moe_tiling_size_batch": 4,
+ "moe_tiling_size_dim": 128,
+ "moe_tiling_size_seqlen": 128,
+ "operation_configs": null,
+ "pad_token_id": null,
+ "pallas_k_block_size": 128,
+ "pallas_m_block_size": 128,
+ "pallas_n_block_size": 128,
+ "partition_axis": {
+ "attention_dim_axis": null,
+ "attention_kv_dim_axis": null,
+ "batch_axis": [
+ "fsdp",
+ "dp"
+ ],
+ "bias_head_sequence_axis": null,
+ "bias_key_sequence_axis": null,
+ "data_parallel_axis": "dp",
+ "decode_attention_dim_axis": null,
+ "decode_attention_kv_dim_axis": null,
+ "decode_batch_axis": [
+ "fsdp",
+ "dp"
+ ],
+ "decode_head_axis": "tp",
+ "decode_key_sequence_axis": "sp",
+ "decode_kv_head_axis": "tp",
+ "decode_query_sequence_axis": null,
+ "expert_axis": "ep",
+ "expert_gate_axis": null,
+ "expert_parallel_axis": "ep",
+ "fully_sharded_data_parallel_axis": "fsdp",
+ "head_axis": "tp",
+ "hidden_state_axis": "tp",
+ "key_sequence_axis": "sp",
+ "kv_head_axis": "tp",
+ "mlp_intermediate_axis": "tp",
+ "query_sequence_axis": "sp",
+ "sequence_axis": "sp",
+ "sequence_parallel_axis": "sp",
+ "tensor_parallel_axis": "tp",
+ "vocab_axis": "tp"
+ },
+ "platform": null,
+ "precompute_masks": true,
+ "pretraining_tp": 1,
+ "qmm_platform_override": null,
+ "qmm_tpu_path_override": null,
+ "quantization_config": null,
+ "scan_attention_layers": false,
+ "scan_mlp_chunk_size": 1024,
+ "scan_ring_attention": true,
+ "sep_token_id": null,
+ "sequence_axis_name": "sp",
+ "sharding_axis_dims": [
+ 1,
+ -1,
+ 1,
+ 1,
+ 1
+ ],
+ "sharding_axis_names": [
+ "dp",
+ "fsdp",
+ "ep",
+ "tp",
+ "sp"
+ ],
+ "sharding_dcn_axis_dims": null,
+ "sp_is_ep_bound": true,
+ "text_config": {
+ "_external_rope_config_kwargs": {},
+ "add_cross_attention": false,
+ "architectures": [
+ "Qwen3_5ForConditionalGeneration"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "attn_dtype": "bfloat16",
+ "attn_mechanism": "vanilla",
+ "attn_output_gate": true,
+ "attn_softmax_dtype": "float32",
+ "backend": null,
+ "bits": null,
+ "blocksize_b": 1,
+ "blocksize_k": 512,
+ "blocksize_q": 512,
+ "bos_token_id": null,
+ "cross_attention_hidden_size": null,
+ "decode_attn_mechanism": null,
+ "decoder_sparse_step": 1,
+ "decoder_start_token_id": null,
+ "dtype": "bfloat16",
+ "easy_method": "train",
+ "eos_token_id": 248044,
+ "fcm_max_ratio": 0.0,
+ "fcm_min_ratio": 0.0,
+ "flash_attention_backward_pass_impl": "triton",
+ "fsdp_is_ep_bound": true,
+ "full_attention_interval": 4,
+ "gradient_checkpointing": "",
+ "gradient_checkpointing_targets": null,
+ "hardware_abstraction": false,
+ "head_dim": 256,
+ "hidden_act": "silu",
+ "hidden_size": 1024,
+ "initializer_range": 0.02,
+ "intermediate_size": 3584,
+ "is_decoder": false,
+ "kv_cache_quantization_config": null,
+ "kv_cache_sharding_sequence_axis_name": "sp",
+ "kvdtype": "bfloat16",
+ "layer_types": [
+ "linear_attention",
+ "linear_attention",
+ "linear_attention",
+ "full_attention",
+ "linear_attention",
+ "linear_attention",
+ "linear_attention",
+ "full_attention",
+ "linear_attention",
+ "linear_attention",
+ "linear_attention",
+ "full_attention",
+ "linear_attention",
+ "linear_attention",
+ "linear_attention",
+ "full_attention",
+ "linear_attention",
+ "linear_attention",
+ "linear_attention",
+ "full_attention",
+ "linear_attention",
+ "linear_attention",
+ "linear_attention",
+ "full_attention"
+ ],
+ "linear_attention_separate_proj": true,
+ "linear_conv_kernel_dim": 4,
+ "linear_key_head_dim": 128,
+ "linear_num_key_heads": 16,
+ "linear_num_value_heads": 16,
+ "linear_value_head_dim": 128,
+ "lmhead_chunksize": null,
+ "mamba_ssm_dtype": "float32",
+ "max_position_embeddings": 262144,
+ "mla_attn_dtype": "bfloat16",
+ "mla_attn_mechanism": "auto",
+ "mla_attn_softmax_dtype": "float32",
+ "mlp_only_layers": [
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 8,
+ 9,
+ 10,
+ 11,
+ 12,
+ 13,
+ 14,
+ 15,
+ 16,
+ 17,
+ 18,
+ 19,
+ 20,
+ 21,
+ 22,
+ 23
+ ],
+ "model_type": "qwen3_5_text",
+ "moe_force_xla_gmm": false,
+ "moe_intermediate_size": 512,
+ "moe_method": "fused_moe",
+ "moe_tiling_size_batch": 4,
+ "moe_tiling_size_dim": 128,
+ "moe_tiling_size_seqlen": 128,
+ "mtp_num_hidden_layers": 1,
+ "mtp_use_dedicated_embeddings": false,
+ "norm_topk_prob": true,
+ "num_attention_heads": 8,
+ "num_experts": 256,
+ "num_experts_per_tok": 8,
+ "num_hidden_layers": 24,
+ "num_key_value_heads": 2,
+ "num_local_experts": 256,
+ "operation_configs": null,
+ "output_router_logits": false,
+ "pad_token_id": null,
+ "pallas_k_block_size": 128,
+ "pallas_m_block_size": 128,
+ "pallas_n_block_size": 128,
+ "partial_rotary_factor": 0.25,
+ "partition_axis": {
+ "attention_dim_axis": null,
+ "attention_kv_dim_axis": null,
+ "batch_axis": [
+ "fsdp",
+ "dp"
+ ],
+ "bias_head_sequence_axis": null,
+ "bias_key_sequence_axis": null,
+ "data_parallel_axis": "dp",
+ "decode_attention_dim_axis": null,
+ "decode_attention_kv_dim_axis": null,
+ "decode_batch_axis": [
+ "fsdp",
+ "dp"
+ ],
+ "decode_head_axis": "tp",
+ "decode_key_sequence_axis": "sp",
+ "decode_kv_head_axis": "tp",
+ "decode_query_sequence_axis": null,
+ "expert_axis": "ep",
+ "expert_gate_axis": null,
+ "expert_parallel_axis": "ep",
+ "fully_sharded_data_parallel_axis": "fsdp",
+ "head_axis": "tp",
+ "hidden_state_axis": "tp",
+ "key_sequence_axis": "sp",
+ "kv_head_axis": "tp",
+ "mlp_intermediate_axis": "tp",
+ "query_sequence_axis": "sp",
+ "sequence_axis": "sp",
+ "sequence_parallel_axis": "sp",
+ "tensor_parallel_axis": "tp",
+ "vocab_axis": "tp"
+ },
+ "platform": null,
+ "precompute_masks": true,
+ "pretraining_tp": 1,
+ "qmm_platform_override": null,
+ "qmm_tpu_path_override": null,
+ "quantization_config": null,
+ "rms_norm_eps": 1e-06,
+ "rope_parameters": {
+ "mrope_interleaved": true,
+ "mrope_section": [
+ 11,
+ 11,
+ 10
+ ],
+ "partial_rotary_factor": 0.25,
+ "rope_theta": 10000000,
+ "rope_type": "default",
+ "type": "default"
+ },
+ "rope_theta": 10000000,
+ "router_aux_loss_coef": 0.001,
+ "scan_attention_layers": false,
+ "scan_mlp_chunk_size": 1024,
+ "scan_ring_attention": true,
+ "sep_token_id": null,
+ "sequence_axis_name": "sp",
+ "sharding_axis_dims": [
+ 1,
+ -1,
+ 1,
+ 1,
+ 1
+ ],
+ "sharding_axis_names": [
+ "dp",
+ "fsdp",
+ "ep",
+ "tp",
+ "sp"
+ ],
+ "sharding_dcn_axis_dims": null,
+ "shared_expert_intermediate_size": 512,
+ "sp_is_ep_bound": true,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "use_cache": true,
+ "use_expert_tensor_mode": false,
+ "use_qmm_best_config": false,
+ "use_ring_of_experts": false,
+ "use_scan_mlp": false,
+ "use_sharded_kv_caching": false,
+ "use_sharding_constraint": false,
+ "vocab_size": 248320
+ },
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "transformers_version": "5.5.0",
+ "use_expert_tensor_mode": false,
+ "use_qmm_best_config": false,
+ "use_ring_of_experts": false,
+ "use_scan_mlp": false,
+ "use_sharded_kv_caching": false,
+ "use_sharding_constraint": false,
+ "video_token_id": 248057,
+ "vision_config": {
+ "_external_rope_config_kwargs": {},
+ "add_cross_attention": false,
+ "architectures": [
+ "Qwen3_5ForConditionalGeneration"
+ ],
+ "attn_dtype": "bfloat16",
+ "attn_mechanism": "vanilla",
+ "attn_softmax_dtype": "float32",
+ "backend": null,
+ "bits": null,
+ "blocksize_b": 1,
+ "blocksize_k": 512,
+ "blocksize_q": 512,
+ "bos_token_id": null,
+ "cross_attention_hidden_size": null,
+ "decode_attn_mechanism": null,
+ "decoder_start_token_id": null,
+ "deepstack_visual_indexes": [],
+ "depth": 12,
+ "easy_method": "train",
+ "embed_dim": 768,
+ "eos_token_id": null,
+ "fcm_max_ratio": 0.0,
+ "fcm_min_ratio": 0.0,
+ "flash_attention_backward_pass_impl": "triton",
+ "fsdp_is_ep_bound": true,
+ "gradient_checkpointing": "",
+ "gradient_checkpointing_targets": null,
+ "hardware_abstraction": false,
+ "hidden_act": "gelu_pytorch_tanh",
+ "hidden_size": 768,
+ "in_channels": 3,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "is_decoder": false,
+ "kv_cache_quantization_config": null,
+ "kv_cache_sharding_sequence_axis_name": "sp",
+ "kvdtype": "bfloat16",
+ "lmhead_chunksize": null,
+ "max_position_embeddings": null,
+ "mla_attn_dtype": "bfloat16",
+ "mla_attn_mechanism": "auto",
+ "mla_attn_softmax_dtype": "float32",
+ "model_type": "qwen3_5",
+ "moe_force_xla_gmm": false,
+ "moe_method": "fused_moe",
+ "moe_tiling_size_batch": 4,
+ "moe_tiling_size_dim": 128,
+ "moe_tiling_size_seqlen": 128,
+ "num_attention_heads": 12,
+ "num_heads": 12,
+ "num_position_embeddings": 2304,
+ "operation_configs": null,
+ "out_hidden_size": 1024,
+ "pad_token_id": null,
+ "pallas_k_block_size": 128,
+ "pallas_m_block_size": 128,
+ "pallas_n_block_size": 128,
+ "partition_axis": {
+ "attention_dim_axis": null,
+ "attention_kv_dim_axis": null,
+ "batch_axis": [
+ "fsdp",
+ "dp"
+ ],
+ "bias_head_sequence_axis": null,
+ "bias_key_sequence_axis": null,
+ "data_parallel_axis": "dp",
+ "decode_attention_dim_axis": null,
+ "decode_attention_kv_dim_axis": null,
+ "decode_batch_axis": [
+ "fsdp",
+ "dp"
+ ],
+ "decode_head_axis": "tp",
+ "decode_key_sequence_axis": "sp",
+ "decode_kv_head_axis": "tp",
+ "decode_query_sequence_axis": null,
+ "expert_axis": "ep",
+ "expert_gate_axis": null,
+ "expert_parallel_axis": "ep",
+ "fully_sharded_data_parallel_axis": "fsdp",
+ "head_axis": "tp",
+ "hidden_state_axis": "tp",
+ "key_sequence_axis": "sp",
+ "kv_head_axis": "tp",
+ "mlp_intermediate_axis": "tp",
+ "query_sequence_axis": "sp",
+ "sequence_axis": "sp",
+ "sequence_parallel_axis": "sp",
+ "tensor_parallel_axis": "tp",
+ "vocab_axis": "tp"
+ },
+ "patch_size": 16,
+ "platform": null,
+ "precompute_masks": true,
+ "pretraining_tp": 1,
+ "qmm_platform_override": null,
+ "qmm_tpu_path_override": null,
+ "quantization_config": null,
+ "scan_attention_layers": false,
+ "scan_mlp_chunk_size": 1024,
+ "scan_ring_attention": true,
+ "sep_token_id": null,
+ "sequence_axis_name": "sp",
+ "sharding_axis_dims": [
+ 1,
+ -1,
+ 1,
+ 1,
+ 1
+ ],
+ "sharding_axis_names": [
+ "dp",
+ "fsdp",
+ "ep",
+ "tp",
+ "sp"
+ ],
+ "sharding_dcn_axis_dims": null,
+ "sp_is_ep_bound": true,
+ "spatial_merge_size": 2,
+ "temporal_patch_size": 2,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "tokens_per_second": 2.0,
+ "use_expert_tensor_mode": false,
+ "use_qmm_best_config": false,
+ "use_ring_of_experts": false,
+ "use_scan_mlp": false,
+ "use_sharded_kv_caching": false,
+ "use_sharding_constraint": false
+ },
+ "vision_end_token_id": 248054,
+ "vision_start_token_id": 248053
+}
diff --git a/model/model/language_model/embed_tokens/embedding/.zarray b/model/model/language_model/embed_tokens/embedding/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..711f2529b21c2f4435446397dce15c51ebbdd3d3
--- /dev/null
+++ b/model/model/language_model/embed_tokens/embedding/.zarray
@@ -0,0 +1 @@
+{"chunks":[62080,1024],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[248320,1024],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/input_layernorm/kernel/.zarray b/model/model/language_model/layers/0/input_layernorm/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..0566d8c6344b2bc36bc482d2262468c7a92e0582
--- /dev/null
+++ b/model/model/language_model/layers/0/input_layernorm/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[1024],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/input_layernorm/kernel/0 b/model/model/language_model/layers/0/input_layernorm/kernel/0
new file mode 100644
index 0000000000000000000000000000000000000000..433d6181d9f790c85a53eedcefc8516a0d2225b6
Binary files /dev/null and b/model/model/language_model/layers/0/input_layernorm/kernel/0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/A_log/.zarray b/model/model/language_model/layers/0/linear_attn/A_log/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1
--- /dev/null
+++ b/model/model/language_model/layers/0/linear_attn/A_log/.zarray
@@ -0,0 +1 @@
+{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/linear_attn/A_log/0 b/model/model/language_model/layers/0/linear_attn/A_log/0
new file mode 100644
index 0000000000000000000000000000000000000000..a320d85b6af3b571a6fcc30a66efbf74120750c6
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/A_log/0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/conv1d/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/conv1d/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..5f4f6d0e28041fb4b10478ee77629459f48d3285
--- /dev/null
+++ b/model/model/language_model/layers/0/linear_attn/conv1d/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[4,1,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4,1,6144],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/linear_attn/conv1d/kernel/0.0.0 b/model/model/language_model/layers/0/linear_attn/conv1d/kernel/0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..552ce665d7c76ffc220dbd50227061c0d1edebd5
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/conv1d/kernel/0.0.0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/dt_bias/.zarray b/model/model/language_model/layers/0/linear_attn/dt_bias/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1
--- /dev/null
+++ b/model/model/language_model/layers/0/linear_attn/dt_bias/.zarray
@@ -0,0 +1 @@
+{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/linear_attn/dt_bias/0 b/model/model/language_model/layers/0/linear_attn/dt_bias/0
new file mode 100644
index 0000000000000000000000000000000000000000..e97bd1f79084ae9651a61e5a488962e8557c2b56
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/dt_bias/0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..de96511727cb0375605dff9d883760adf580d921
--- /dev/null
+++ b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[256,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/0.0 b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/0.0
new file mode 100644
index 0000000000000000000000000000000000000000..d77c3881da82245021b9d28ad9d51ffe1481172b
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/0.0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/1.0 b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/1.0
new file mode 100644
index 0000000000000000000000000000000000000000..d8bc18181b2dc180b1bc9aa1b36333eef98ff156
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/1.0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/2.0 b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/2.0
new file mode 100644
index 0000000000000000000000000000000000000000..409b2d2b99fa367cf891fcec6e4068b21a52ceb7
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/2.0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/1.0 b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/1.0
new file mode 100644
index 0000000000000000000000000000000000000000..fef45a62ae9383aeac8d59b216505e621e95cfe1
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/1.0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/2.0 b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/2.0
new file mode 100644
index 0000000000000000000000000000000000000000..d66de4b1412fc4aedef4e5d611ed5ce24a14e6b9
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/2.0 differ
diff --git a/model/model/language_model/layers/19/input_layernorm/kernel/0 b/model/model/language_model/layers/19/input_layernorm/kernel/0
new file mode 100644
index 0000000000000000000000000000000000000000..f106053ccb5f8f402181121be5a3b3c20f086caf
Binary files /dev/null and b/model/model/language_model/layers/19/input_layernorm/kernel/0 differ
diff --git a/model/model/language_model/layers/19/post_attention_layernorm/kernel/.zarray b/model/model/language_model/layers/19/post_attention_layernorm/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..0566d8c6344b2bc36bc482d2262468c7a92e0582
--- /dev/null
+++ b/model/model/language_model/layers/19/post_attention_layernorm/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[1024],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/19/self_attn/k_proj/kernel/.zarray b/model/model/language_model/layers/19/self_attn/k_proj/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..36234540fe1b82a927003cd0f5ed4bcb3039a8d4
--- /dev/null
+++ b/model/model/language_model/layers/19/self_attn/k_proj/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[256,512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,512],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/19/self_attn/q_norm/kernel/.zarray b/model/model/language_model/layers/19/self_attn/q_norm/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..653c877efa83da0ab518995885715f7e8b4275d7
--- /dev/null
+++ b/model/model/language_model/layers/19/self_attn/q_norm/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[256],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[256],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/19/self_attn/v_proj/kernel/.zarray b/model/model/language_model/layers/19/self_attn/v_proj/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..36234540fe1b82a927003cd0f5ed4bcb3039a8d4
--- /dev/null
+++ b/model/model/language_model/layers/19/self_attn/v_proj/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[256,512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,512],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/2/input_layernorm/kernel/.zarray b/model/model/language_model/layers/2/input_layernorm/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..0566d8c6344b2bc36bc482d2262468c7a92e0582
--- /dev/null
+++ b/model/model/language_model/layers/2/input_layernorm/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[1024],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/2/input_layernorm/kernel/0 b/model/model/language_model/layers/2/input_layernorm/kernel/0
new file mode 100644
index 0000000000000000000000000000000000000000..706e73f7cd04a53f863cad02a1d3c4b11dcf2396
Binary files /dev/null and b/model/model/language_model/layers/2/input_layernorm/kernel/0 differ
diff --git a/model/model/language_model/layers/2/linear_attn/conv1d/kernel/.zarray b/model/model/language_model/layers/2/linear_attn/conv1d/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..5f4f6d0e28041fb4b10478ee77629459f48d3285
--- /dev/null
+++ b/model/model/language_model/layers/2/linear_attn/conv1d/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[4,1,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4,1,6144],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/2/linear_attn/dt_bias/0 b/model/model/language_model/layers/2/linear_attn/dt_bias/0
new file mode 100644
index 0000000000000000000000000000000000000000..89aa4dd1f989686d55cfcea012bdd56f6572275b
Binary files /dev/null and b/model/model/language_model/layers/2/linear_attn/dt_bias/0 differ
diff --git a/model/model/language_model/layers/2/linear_attn/in_proj_a/kernel/3.0 b/model/model/language_model/layers/2/linear_attn/in_proj_a/kernel/3.0
new file mode 100644
index 0000000000000000000000000000000000000000..b2f9c840422ff637b6020ea8f2567216a835349b
Binary files /dev/null and b/model/model/language_model/layers/2/linear_attn/in_proj_a/kernel/3.0 differ
diff --git a/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/.zarray b/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..de96511727cb0375605dff9d883760adf580d921
--- /dev/null
+++ b/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[256,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/0.0 b/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/0.0
new file mode 100644
index 0000000000000000000000000000000000000000..9828c26e66281c0b1830768247e7bc9d7070f143
Binary files /dev/null and b/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/0.0 differ
diff --git a/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/1.0 b/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/1.0
new file mode 100644
index 0000000000000000000000000000000000000000..ce2d208830a419b1f71c4b49d9e09d3dc95f1ed2
Binary files /dev/null and b/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/1.0 differ
diff --git a/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/3.0 b/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/3.0
new file mode 100644
index 0000000000000000000000000000000000000000..f2e05f805579915dd3ae5116a8d02b84aefb706e
Binary files /dev/null and b/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/3.0 differ
diff --git a/model/model/language_model/layers/2/linear_attn/in_proj_z/kernel/.zarray b/model/model/language_model/layers/2/linear_attn/in_proj_z/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..c1d4bca0cadb8de14bbcf4269f75130fae4b5e12
--- /dev/null
+++ b/model/model/language_model/layers/2/linear_attn/in_proj_z/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[256,2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,2048],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/2/linear_attn/norm/kernel/0 b/model/model/language_model/layers/2/linear_attn/norm/kernel/0
new file mode 100644
index 0000000000000000000000000000000000000000..9ba5a3510daf31c0fa8ea3cc04e22a03ac74a99c
Binary files /dev/null and b/model/model/language_model/layers/2/linear_attn/norm/kernel/0 differ
diff --git a/model/model/language_model/layers/2/mlp/down_proj/kernel/.zarray b/model/model/language_model/layers/2/mlp/down_proj/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..d430451028a2753e4ce16636cfe899c607fae561
--- /dev/null
+++ b/model/model/language_model/layers/2/mlp/down_proj/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[3584,256],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[3584,1024],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/2/mlp/up_proj/kernel/.zarray b/model/model/language_model/layers/2/mlp/up_proj/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..9069ead8bf11c7bc932be9123c95ddc3a4360c93
--- /dev/null
+++ b/model/model/language_model/layers/2/mlp/up_proj/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[256,3584],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,3584],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/2/post_attention_layernorm/kernel/.zarray b/model/model/language_model/layers/2/post_attention_layernorm/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..0566d8c6344b2bc36bc482d2262468c7a92e0582
--- /dev/null
+++ b/model/model/language_model/layers/2/post_attention_layernorm/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[1024],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/2/post_attention_layernorm/kernel/0 b/model/model/language_model/layers/2/post_attention_layernorm/kernel/0
new file mode 100644
index 0000000000000000000000000000000000000000..7459699fd571d825095ee14203347344d4786612
Binary files /dev/null and b/model/model/language_model/layers/2/post_attention_layernorm/kernel/0 differ
diff --git a/model/model/language_model/layers/20/input_layernorm/kernel/.zarray b/model/model/language_model/layers/20/input_layernorm/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..0566d8c6344b2bc36bc482d2262468c7a92e0582
--- /dev/null
+++ b/model/model/language_model/layers/20/input_layernorm/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[1024],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/20/input_layernorm/kernel/0 b/model/model/language_model/layers/20/input_layernorm/kernel/0
new file mode 100644
index 0000000000000000000000000000000000000000..54f289c5de162088278c7916ba79db0df7e65495
Binary files /dev/null and b/model/model/language_model/layers/20/input_layernorm/kernel/0 differ
diff --git a/model/model/language_model/layers/20/linear_attn/A_log/.zarray b/model/model/language_model/layers/20/linear_attn/A_log/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1
--- /dev/null
+++ b/model/model/language_model/layers/20/linear_attn/A_log/.zarray
@@ -0,0 +1 @@
+{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/20/linear_attn/A_log/0 b/model/model/language_model/layers/20/linear_attn/A_log/0
new file mode 100644
index 0000000000000000000000000000000000000000..99faa4e9472cda2e968bf6ba1ec188f581dc08f7
Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/A_log/0 differ
diff --git a/model/model/language_model/layers/20/linear_attn/conv1d/kernel/.zarray b/model/model/language_model/layers/20/linear_attn/conv1d/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..5f4f6d0e28041fb4b10478ee77629459f48d3285
--- /dev/null
+++ b/model/model/language_model/layers/20/linear_attn/conv1d/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[4,1,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4,1,6144],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/20/linear_attn/conv1d/kernel/0.0.0 b/model/model/language_model/layers/20/linear_attn/conv1d/kernel/0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..4bed7ee37bf1295110266286321ccdddcee4e8d3
Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/conv1d/kernel/0.0.0 differ
diff --git a/model/model/language_model/layers/20/linear_attn/dt_bias/.zarray b/model/model/language_model/layers/20/linear_attn/dt_bias/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1
--- /dev/null
+++ b/model/model/language_model/layers/20/linear_attn/dt_bias/.zarray
@@ -0,0 +1 @@
+{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/20/linear_attn/dt_bias/0 b/model/model/language_model/layers/20/linear_attn/dt_bias/0
new file mode 100644
index 0000000000000000000000000000000000000000..f3f9e9f0f3a46a90509e5abc5ee804483637a474
Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/dt_bias/0 differ
diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/.zarray b/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..de96511727cb0375605dff9d883760adf580d921
--- /dev/null
+++ b/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[256,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/1.0 b/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/1.0
new file mode 100644
index 0000000000000000000000000000000000000000..3347a88cf0dde541152c6582c88acc68efa32e78
Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/1.0 differ
diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/2.0 b/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/2.0
new file mode 100644
index 0000000000000000000000000000000000000000..3a0dbaf17b87c12d92e1ba7dd136bfd2b4d1634c
Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/2.0 differ
diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/3.0 b/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/3.0
new file mode 100644
index 0000000000000000000000000000000000000000..32cacdb1dfe49859bf5d473401415ed804d38978
Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/3.0 differ
diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/.zarray b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..de96511727cb0375605dff9d883760adf580d921
--- /dev/null
+++ b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[256,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/0.0 b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/0.0
new file mode 100644
index 0000000000000000000000000000000000000000..23bf6bf9d04f958801acbd12e2a81b458f0f7f11
Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/0.0 differ
diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/1.0 b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/1.0
new file mode 100644
index 0000000000000000000000000000000000000000..2663b90fda7f413a2f30b95a6a0bb2dee1911070
Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/1.0 differ
diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/2.0 b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/2.0
new file mode 100644
index 0000000000000000000000000000000000000000..c55314003eee271ab08f6cb3ff1d973e9f75f2c6
Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/2.0 differ
diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/3.0 b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/3.0
new file mode 100644
index 0000000000000000000000000000000000000000..8b0999cb98b47282b2e81c6e36bbdfffd77e8e38
Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/3.0 differ
diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_qkv/kernel/.zarray b/model/model/language_model/layers/20/linear_attn/in_proj_qkv/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..c7a1625024939060efef29beeead903e4170d0ac
--- /dev/null
+++ b/model/model/language_model/layers/20/linear_attn/in_proj_qkv/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[256,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,6144],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_z/kernel/.zarray b/model/model/language_model/layers/20/linear_attn/in_proj_z/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..c1d4bca0cadb8de14bbcf4269f75130fae4b5e12
--- /dev/null
+++ b/model/model/language_model/layers/20/linear_attn/in_proj_z/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[256,2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,2048],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/20/linear_attn/norm/kernel/.zarray b/model/model/language_model/layers/20/linear_attn/norm/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..3eeba2a45304285824238b9edd4f261c3d5d6f01
--- /dev/null
+++ b/model/model/language_model/layers/20/linear_attn/norm/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[128],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/20/mlp/gate_proj/kernel/.zarray b/model/model/language_model/layers/20/mlp/gate_proj/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..9069ead8bf11c7bc932be9123c95ddc3a4360c93
--- /dev/null
+++ b/model/model/language_model/layers/20/mlp/gate_proj/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[256,3584],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,3584],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/20/mlp/up_proj/kernel/.zarray b/model/model/language_model/layers/20/mlp/up_proj/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..9069ead8bf11c7bc932be9123c95ddc3a4360c93
--- /dev/null
+++ b/model/model/language_model/layers/20/mlp/up_proj/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[256,3584],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,3584],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/20/post_attention_layernorm/kernel/0 b/model/model/language_model/layers/20/post_attention_layernorm/kernel/0
new file mode 100644
index 0000000000000000000000000000000000000000..794cf8e9ff80ad5f6f7ef59dd4f05007639d4ca1
Binary files /dev/null and b/model/model/language_model/layers/20/post_attention_layernorm/kernel/0 differ
diff --git a/model/model/language_model/layers/21/input_layernorm/kernel/.zarray b/model/model/language_model/layers/21/input_layernorm/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..0566d8c6344b2bc36bc482d2262468c7a92e0582
--- /dev/null
+++ b/model/model/language_model/layers/21/input_layernorm/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[1024],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/21/linear_attn/A_log/0 b/model/model/language_model/layers/21/linear_attn/A_log/0
new file mode 100644
index 0000000000000000000000000000000000000000..1d2dd8b7e783676a9eb45793a85b8bda06467e3d
Binary files /dev/null and b/model/model/language_model/layers/21/linear_attn/A_log/0 differ
diff --git a/model/model/language_model/layers/21/linear_attn/conv1d/kernel/0.0.0 b/model/model/language_model/layers/21/linear_attn/conv1d/kernel/0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..4a74fcb15801900be3813fc67dbed01709b6f2d1
Binary files /dev/null and b/model/model/language_model/layers/21/linear_attn/conv1d/kernel/0.0.0 differ
diff --git a/model/model/language_model/layers/21/linear_attn/dt_bias/.zarray b/model/model/language_model/layers/21/linear_attn/dt_bias/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1
--- /dev/null
+++ b/model/model/language_model/layers/21/linear_attn/dt_bias/.zarray
@@ -0,0 +1 @@
+{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/21/linear_attn/in_proj_a/kernel/2.0 b/model/model/language_model/layers/21/linear_attn/in_proj_a/kernel/2.0
new file mode 100644
index 0000000000000000000000000000000000000000..5b36624557647bc367a14b53030799421517ab59
Binary files /dev/null and b/model/model/language_model/layers/21/linear_attn/in_proj_a/kernel/2.0 differ
diff --git a/model/model/language_model/layers/21/linear_attn/in_proj_b/kernel/.zarray b/model/model/language_model/layers/21/linear_attn/in_proj_b/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..de96511727cb0375605dff9d883760adf580d921
--- /dev/null
+++ b/model/model/language_model/layers/21/linear_attn/in_proj_b/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[256,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/21/linear_attn/in_proj_b/kernel/3.0 b/model/model/language_model/layers/21/linear_attn/in_proj_b/kernel/3.0
new file mode 100644
index 0000000000000000000000000000000000000000..5ada231fe525597887531333f531a01f20e6a64b
Binary files /dev/null and b/model/model/language_model/layers/21/linear_attn/in_proj_b/kernel/3.0 differ
diff --git a/model/model/language_model/layers/21/linear_attn/norm/kernel/0 b/model/model/language_model/layers/21/linear_attn/norm/kernel/0
new file mode 100644
index 0000000000000000000000000000000000000000..f862a97770995da7e84c9a6744106d1f91fb72fc
Binary files /dev/null and b/model/model/language_model/layers/21/linear_attn/norm/kernel/0 differ
diff --git a/model/model/language_model/layers/21/mlp/down_proj/kernel/.zarray b/model/model/language_model/layers/21/mlp/down_proj/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..d430451028a2753e4ce16636cfe899c607fae561
--- /dev/null
+++ b/model/model/language_model/layers/21/mlp/down_proj/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[3584,256],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[3584,1024],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/22/input_layernorm/kernel/.zarray b/model/model/language_model/layers/22/input_layernorm/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..0566d8c6344b2bc36bc482d2262468c7a92e0582
--- /dev/null
+++ b/model/model/language_model/layers/22/input_layernorm/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[1024],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/visual/blocks/9/norm2/bias/0 b/model/model/visual/blocks/9/norm2/bias/0
new file mode 100644
index 0000000000000000000000000000000000000000..df23544af2c54751cd9c5dc44910c9b463296052
Binary files /dev/null and b/model/model/visual/blocks/9/norm2/bias/0 differ
diff --git a/model/model/visual/merger/norm/bias/.zarray b/model/model/visual/merger/norm/bias/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..108bf2997962926e388231bffd83dd858a8cfa34
--- /dev/null
+++ b/model/model/visual/merger/norm/bias/.zarray
@@ -0,0 +1 @@
+{"chunks":[768],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[768],"zarr_format":2}
\ No newline at end of file
diff --git a/tensorstore_index.json b/tensorstore_index.json
new file mode 100644
index 0000000000000000000000000000000000000000..7947e94b4d19eb03fe34d93f91596c733d274a2a
--- /dev/null
+++ b/tensorstore_index.json
@@ -0,0 +1,3597 @@
+{
+ "format": "tensorstore",
+ "version": "easydel",
+ "prefixes": {
+ "model": [
+ {
+ "path": "model/model/language_model/embed_tokens/embedding",
+ "shape": [
+ 248320,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/self_attn/k_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/self_attn/k_proj/kernel",
+ "shape": [
+ 1024,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/self_attn/o_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/self_attn/q_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/self_attn/q_proj/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/self_attn/v_proj/kernel",
+ "shape": [
+ 1024,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/self_attn/k_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/self_attn/k_proj/kernel",
+ "shape": [
+ 1024,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/self_attn/o_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/self_attn/q_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/self_attn/q_proj/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/self_attn/v_proj/kernel",
+ "shape": [
+ 1024,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/self_attn/k_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/self_attn/k_proj/kernel",
+ "shape": [
+ 1024,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/self_attn/o_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/self_attn/q_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/self_attn/q_proj/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/self_attn/v_proj/kernel",
+ "shape": [
+ 1024,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/self_attn/k_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/self_attn/k_proj/kernel",
+ "shape": [
+ 1024,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/self_attn/o_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/self_attn/q_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/self_attn/q_proj/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/self_attn/v_proj/kernel",
+ "shape": [
+ 1024,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/self_attn/k_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/self_attn/k_proj/kernel",
+ "shape": [
+ 1024,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/self_attn/o_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/self_attn/q_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/self_attn/q_proj/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/self_attn/v_proj/kernel",
+ "shape": [
+ 1024,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/self_attn/k_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/self_attn/k_proj/kernel",
+ "shape": [
+ 1024,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/self_attn/o_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/self_attn/q_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/self_attn/q_proj/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/self_attn/v_proj/kernel",
+ "shape": [
+ 1024,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/input_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 1024,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 1024,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 1024,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/mlp/down_proj/kernel",
+ "shape": [
+ 3584,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/mlp/gate_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/mlp/up_proj/kernel",
+ "shape": [
+ 1024,
+ 3584
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/post_attention_layernorm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/norm/kernel",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/attn/proj/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/attn/proj/kernel",
+ "shape": [
+ 768,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/attn/qkv/bias",
+ "shape": [
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/attn/qkv/kernel",
+ "shape": [
+ 768,
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/mlp/linear_fc1/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/mlp/linear_fc1/kernel",
+ "shape": [
+ 768,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/mlp/linear_fc2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/mlp/linear_fc2/kernel",
+ "shape": [
+ 3072,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/norm1/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/norm1/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/norm2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/norm2/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/attn/proj/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/attn/proj/kernel",
+ "shape": [
+ 768,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/attn/qkv/bias",
+ "shape": [
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/attn/qkv/kernel",
+ "shape": [
+ 768,
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/mlp/linear_fc1/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/mlp/linear_fc1/kernel",
+ "shape": [
+ 768,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/mlp/linear_fc2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/mlp/linear_fc2/kernel",
+ "shape": [
+ 3072,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/norm1/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/norm1/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/norm2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/norm2/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/attn/proj/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/attn/proj/kernel",
+ "shape": [
+ 768,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/attn/qkv/bias",
+ "shape": [
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/attn/qkv/kernel",
+ "shape": [
+ 768,
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/mlp/linear_fc1/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/mlp/linear_fc1/kernel",
+ "shape": [
+ 768,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/mlp/linear_fc2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/mlp/linear_fc2/kernel",
+ "shape": [
+ 3072,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/norm1/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/norm1/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/norm2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/norm2/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/attn/proj/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/attn/proj/kernel",
+ "shape": [
+ 768,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/attn/qkv/bias",
+ "shape": [
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/attn/qkv/kernel",
+ "shape": [
+ 768,
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/mlp/linear_fc1/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/mlp/linear_fc1/kernel",
+ "shape": [
+ 768,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/mlp/linear_fc2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/mlp/linear_fc2/kernel",
+ "shape": [
+ 3072,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/norm1/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/norm1/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/norm2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/norm2/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/attn/proj/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/attn/proj/kernel",
+ "shape": [
+ 768,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/attn/qkv/bias",
+ "shape": [
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/attn/qkv/kernel",
+ "shape": [
+ 768,
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/mlp/linear_fc1/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/mlp/linear_fc1/kernel",
+ "shape": [
+ 768,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/mlp/linear_fc2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/mlp/linear_fc2/kernel",
+ "shape": [
+ 3072,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/norm1/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/norm1/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/norm2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/norm2/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/attn/proj/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/attn/proj/kernel",
+ "shape": [
+ 768,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/attn/qkv/bias",
+ "shape": [
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/attn/qkv/kernel",
+ "shape": [
+ 768,
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/mlp/linear_fc1/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/mlp/linear_fc1/kernel",
+ "shape": [
+ 768,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/mlp/linear_fc2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/mlp/linear_fc2/kernel",
+ "shape": [
+ 3072,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/norm1/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/norm1/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/norm2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/norm2/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/attn/proj/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/attn/proj/kernel",
+ "shape": [
+ 768,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/attn/qkv/bias",
+ "shape": [
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/attn/qkv/kernel",
+ "shape": [
+ 768,
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/mlp/linear_fc1/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/mlp/linear_fc1/kernel",
+ "shape": [
+ 768,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/mlp/linear_fc2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/mlp/linear_fc2/kernel",
+ "shape": [
+ 3072,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/norm1/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/norm1/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/norm2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/norm2/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/attn/proj/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/attn/proj/kernel",
+ "shape": [
+ 768,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/attn/qkv/bias",
+ "shape": [
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/attn/qkv/kernel",
+ "shape": [
+ 768,
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/mlp/linear_fc1/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/mlp/linear_fc1/kernel",
+ "shape": [
+ 768,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/mlp/linear_fc2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/mlp/linear_fc2/kernel",
+ "shape": [
+ 3072,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/norm1/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/norm1/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/norm2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/norm2/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/attn/proj/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/attn/proj/kernel",
+ "shape": [
+ 768,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/attn/qkv/bias",
+ "shape": [
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/attn/qkv/kernel",
+ "shape": [
+ 768,
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/mlp/linear_fc1/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/mlp/linear_fc1/kernel",
+ "shape": [
+ 768,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/mlp/linear_fc2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/mlp/linear_fc2/kernel",
+ "shape": [
+ 3072,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/norm1/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/norm1/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/norm2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/norm2/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/attn/proj/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/attn/proj/kernel",
+ "shape": [
+ 768,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/attn/qkv/bias",
+ "shape": [
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/attn/qkv/kernel",
+ "shape": [
+ 768,
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/mlp/linear_fc1/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/mlp/linear_fc1/kernel",
+ "shape": [
+ 768,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/mlp/linear_fc2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/mlp/linear_fc2/kernel",
+ "shape": [
+ 3072,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/norm1/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/norm1/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/norm2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/norm2/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/attn/proj/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/attn/proj/kernel",
+ "shape": [
+ 768,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/attn/qkv/bias",
+ "shape": [
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/attn/qkv/kernel",
+ "shape": [
+ 768,
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/mlp/linear_fc1/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/mlp/linear_fc1/kernel",
+ "shape": [
+ 768,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/mlp/linear_fc2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/mlp/linear_fc2/kernel",
+ "shape": [
+ 3072,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/norm1/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/norm1/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/norm2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/norm2/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/attn/proj/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/attn/proj/kernel",
+ "shape": [
+ 768,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/attn/qkv/bias",
+ "shape": [
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/attn/qkv/kernel",
+ "shape": [
+ 768,
+ 2304
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/mlp/linear_fc1/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/mlp/linear_fc1/kernel",
+ "shape": [
+ 768,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/mlp/linear_fc2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/mlp/linear_fc2/kernel",
+ "shape": [
+ 3072,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/norm1/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/norm1/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/norm2/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/norm2/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/merger/linear_fc1/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/merger/linear_fc1/kernel",
+ "shape": [
+ 3072,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/merger/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/merger/linear_fc2/kernel",
+ "shape": [
+ 3072,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/merger/norm/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/merger/norm/scale",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/patch_embed/proj/bias",
+ "shape": [
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/patch_embed/proj/kernel",
+ "shape": [
+ 2,
+ 16,
+ 16,
+ 3,
+ 768
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/pos_embed/embedding",
+ "shape": [
+ 2304,
+ 768
+ ],
+ "dtype": "bfloat16"
+ }
+ ]
+ }
+}
\ No newline at end of file
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b4a37b2a6fd3ab3317cd7bac72855be1a843b2bb
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,31 @@
+{
+ "add_prefix_space": false,
+ "audio_bos_token": "<|audio_start|>",
+ "audio_eos_token": "<|audio_end|>",
+ "audio_token": "<|audio_pad|>",
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "image_token": "<|image_pad|>",
+ "is_local": false,
+ "model_max_length": 262144,
+ "model_specific_special_tokens": {
+ "audio_bos_token": "<|audio_start|>",
+ "audio_eos_token": "<|audio_end|>",
+ "audio_token": "<|audio_pad|>",
+ "image_token": "<|image_pad|>",
+ "video_token": "<|video_pad|>",
+ "vision_bos_token": "<|vision_start|>",
+ "vision_eos_token": "<|vision_end|>"
+ },
+ "pad_token": "<|endoftext|>",
+ "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ "split_special_tokens": false,
+ "tokenizer_class": "TokenizersBackend",
+ "unk_token": null,
+ "video_token": "<|video_pad|>",
+ "vision_bos_token": "<|vision_start|>",
+ "vision_eos_token": "<|vision_end|>"
+}