diff --git a/chat_template.jinja b/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..0ef09f214eaa6d9bca297988afc1454b5827b2c7
--- /dev/null
+++ b/chat_template.jinja
@@ -0,0 +1,154 @@
+{#- Running totals of image/video parts seen so far; namespace objects are used so render_content can update them with "set x.value = ...". -#}{%- set image_count = namespace(value=0) %}
+{%- set video_count = namespace(value=0) %}
+{#- render_content(content, do_vision_count, is_system_content=false): renders one message's content. A string is emitted as-is; a non-mapping iterable is rendered part by part (image, video or text); none/undefined emits an empty string; anything else raises. -#}{%- macro render_content(content, do_vision_count, is_system_content=false) %}
+ {%- if content is string %}
+ {{- content }}
+ {#- Any iterable that is not a mapping is walked as a list of content parts. -#}{%- elif content is iterable and content is not mapping %}
+ {%- for item in content %}
+ {#- Image part: carries an 'image' or 'image_url' key, or has type == 'image'. Rejected inside system content. -#}{%- if 'image' in item or 'image_url' in item or item.type == 'image' %}
+ {%- if is_system_content %}
+ {{- raise_exception('System message cannot contain images.') }}
+ {%- endif %}
+ {#- The counter only advances when the caller requests counting, so passes made with do_vision_count=false leave the numbering untouched. -#}{%- if do_vision_count %}
+ {%- set image_count.value = image_count.value + 1 %}
+ {%- endif %}
+ {#- add_vision_id is not set anywhere in this template; presumably a render variable supplied by the caller - TODO confirm. When truthy, a "Picture N: " label precedes the placeholder. -#}{%- if add_vision_id %}
+ {{- 'Picture ' ~ image_count.value ~ ': ' }}
+ {%- endif %}
+ {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
+ {#- Video part: handled like an image, with its own counter, "Video N: " label and video placeholder. -#}{%- elif 'video' in item or item.type == 'video' %}
+ {%- if is_system_content %}
+ {{- raise_exception('System message cannot contain videos.') }}
+ {%- endif %}
+ {%- if do_vision_count %}
+ {%- set video_count.value = video_count.value + 1 %}
+ {%- endif %}
+ {%- if add_vision_id %}
+ {{- 'Video ' ~ video_count.value ~ ': ' }}
+ {%- endif %}
+ {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
+ {#- Text part: its text field is emitted unchanged. -#}{%- elif 'text' in item %}
+ {{- item.text }}
+ {%- else %}
+ {{- raise_exception('Unexpected item type in content.') }}
+ {%- endif %}
+ {%- endfor %}
+ {#- Missing content renders as an empty string rather than an error. -#}{%- elif content is none or content is undefined %}
+ {{- '' }}
+ {%- else %}
+ {{- raise_exception('Unexpected content type.') }}
+ {%- endif %}
+{%- endmacro %}
+{#- Guard: an empty or missing conversation cannot be rendered. -#}{%- if not messages %}
+ {{- raise_exception('No messages provided.') }}
+{%- endif %}
+{#- System preamble. When a non-mapping iterable of tools is supplied, a system turn is always emitted: the JSON tool definitions wrapped in a tools block, then the tool-call format instructions, then the caller's own system text (if any). Without tools, a system turn is emitted only when the first message has role 'system'. -#}{%- if tools and tools is iterable and tools is not mapping %}
+ {{- '<|im_start|>system\n' }}
+ {{- "# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n</tools>" }}
+ {#- The example below must show the same function/parameter markup that the assistant branch of the message loop emits for tool calls. -#}{{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
+ {%- if messages[0].role == 'system' %}
+ {#- Rendered with do_vision_count=false and is_system_content=true, so images/videos in the system message raise. -#}{%- set content = render_content(messages[0].content, false, true)|trim %}
+ {%- if content %}
+ {{- '\n\n' + content }}
+ {%- endif %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {%- set content = render_content(messages[0].content, false, true)|trim %}
+ {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{#- Find the index of the last genuine user query by scanning the conversation backwards. ns.multi_step_tool stays true until such a message is found; ns.last_query_index is later used to decide which assistant turns keep their reasoning. -#}{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {#- Convert the position in the reversed list back to an index into messages. -#}{%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" %}
+ {#- do_vision_count=false: this is a look-ahead pass and must not advance the image/video counters. -#}{%- set content = render_content(message.content, false)|trim %}
+ {#- A user turn that is entirely a tool_response wrapper is a tool result, not a query, so it is skipped. The markers must be non-empty: startswith('') and endswith('') are true for every string, which would make this scan reject every user message and always raise below. -#}{%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{#- Still true here means no user message qualified as a query. -#}{%- if ns.multi_step_tool %}
+ {{- raise_exception('No user query found in messages.') }}
+{%- endif %}
+{#- Main pass: render every message as a turn delimited by im_start/im_end. This is the only pass that counts vision items (do_vision_count=true). -#}{%- for message in messages %}
+ {%- set content = render_content(message.content, true)|trim %}
+ {#- The system turn is emitted before this loop; here it is only checked to be the first message. -#}{%- if message.role == "system" %}
+ {%- if not loop.first %}
+ {{- raise_exception('System message must be at the beginning.') }}
+ {%- endif %}
+ {%- elif message.role == "user" %}
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {#- Reasoning comes from an explicit reasoning_content string when present; otherwise it is split out of an inline think block in the content. -#}{%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is string %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {#- The separator must be the closing think tag: an empty separator makes the test always true and makes split() raise ValueError. -#}{%- if '</think>' in content %}
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- set reasoning_content = reasoning_content|trim %}
+ {#- Only assistant turns after the last user query keep their think block; earlier turns are emitted without reasoning. -#}{%- if loop.index0 > ns.last_query_index %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content + '\n</think>\n\n' + content }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}
+ {%- for tool_call in message.tool_calls %}
+ {#- Accept both a wrapped call (fields under .function) and a flat call with name/arguments at top level. -#}{%- if tool_call.function is defined %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {#- Separator rules: a blank line between visible content and the first call, nothing before the first call when content is empty, one newline between consecutive calls. -#}{%- if loop.first %}
+ {%- if content|trim %}
+ {{- '\n\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+ {%- else %}
+ {{- '<tool_call>\n<function=' + tool_call.name + '>\n' }}
+ {%- endif %}
+ {%- else %}
+ {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+ {%- endif %}
+ {%- if tool_call.arguments is defined %}
+ {%- for args_name, args_value in tool_call.arguments|items %}
+ {{- '<parameter=' + args_name + '>\n' }}
+ {#- Mappings and non-string sequences are serialized as JSON; every other value is stringified. -#}{%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
+ {{- args_value }}
+ {{- '\n</parameter>\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '</function>\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {#- Consecutive tool results share one user turn: it is opened when the previous message is not a tool message and closed when the next one is not (or at the end). -#}{%- if loop.previtem and loop.previtem.role != "tool" %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- content }}
+ {{- '\n</tool_response>' }}
+ {%- if not loop.last and loop.nextitem.role != "tool" %}
+ {{- '<|im_end|>\n' }}
+ {%- elif loop.last %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- else %}
+ {{- raise_exception('Unexpected message role.') }}
+ {%- endif %}
+{%- endfor %}
+{#- Generation prompt: opens the assistant turn the model will complete. -#}{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {#- Thinking is opt-in: only an explicit enable_thinking=true leaves the think block open for the model to fill. -#}{%- if enable_thinking is defined and enable_thinking is true %}
+ {{- '<think>\n' }}
+ {#- Otherwise an empty, already-closed think block is pre-filled so the model goes straight to the answer. -#}{%- else %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/checkpoint_metadata.json b/checkpoint_metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..8f81209eda0565ce47c0ec8e0a2a5e4e5891dae6
--- /dev/null
+++ b/checkpoint_metadata.json
@@ -0,0 +1,6 @@
+{
+ "timestamp": "2026-04-09T05:12:06.203634",
+ "custom_metadata": {
+ "step": 0
+ }
+}
\ No newline at end of file
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f3f42db6e5ed757e0c884a6abfe6eb40decafe0
--- /dev/null
+++ b/config.json
@@ -0,0 +1,473 @@
+{
+ "_external_rope_config_kwargs": {},
+ "add_cross_attention": false,
+ "architectures": [
+ "Qwen3_5ForConditionalGeneration"
+ ],
+ "attn_mechanism": "vanilla",
+ "backend": null,
+ "bits": null,
+ "blocksize_b": 1,
+ "blocksize_k": 512,
+ "blocksize_q": 512,
+ "bos_token_id": null,
+ "cross_attention_hidden_size": null,
+ "decode_attn_mechanism": null,
+ "decoder_start_token_id": null,
+ "easy_method": "train",
+ "eos_token_id": null,
+ "fcm_max_ratio": 0.0,
+ "fcm_min_ratio": 0.0,
+ "flash_attention_backward_pass_impl": "triton",
+ "fsdp_is_ep_bound": true,
+ "gradient_checkpointing": "",
+ "gradient_checkpointing_targets": null,
+ "hardware_abstraction": false,
+ "image_token_id": 248056,
+ "is_decoder": false,
+ "kv_cache_quantization_config": null,
+ "kv_cache_sharding_sequence_axis_name": "sp",
+ "kvdtype": "bfloat16",
+ "lmhead_chunksize": null,
+ "max_position_embeddings": null,
+ "mla_attn_dtype": "bfloat16",
+ "mla_attn_mechanism": "auto",
+ "mla_attn_softmax_dtype": "float32",
+ "model_type": "qwen3_5",
+ "moe_force_xla_gmm": false,
+ "moe_method": "fused_moe",
+ "moe_tiling_size_batch": 4,
+ "moe_tiling_size_dim": 128,
+ "moe_tiling_size_seqlen": 128,
+ "operation_configs": null,
+ "pad_token_id": null,
+ "pallas_k_block_size": 128,
+ "pallas_m_block_size": 128,
+ "pallas_n_block_size": 128,
+ "partition_axis": {
+ "attention_dim_axis": null,
+ "attention_kv_dim_axis": null,
+ "batch_axis": [
+ "fsdp",
+ "dp"
+ ],
+ "bias_head_sequence_axis": null,
+ "bias_key_sequence_axis": null,
+ "data_parallel_axis": "dp",
+ "decode_attention_dim_axis": null,
+ "decode_attention_kv_dim_axis": null,
+ "decode_batch_axis": [
+ "fsdp",
+ "dp"
+ ],
+ "decode_head_axis": "tp",
+ "decode_key_sequence_axis": "sp",
+ "decode_kv_head_axis": "tp",
+ "decode_query_sequence_axis": null,
+ "expert_axis": "ep",
+ "expert_gate_axis": null,
+ "expert_parallel_axis": "ep",
+ "fully_sharded_data_parallel_axis": "fsdp",
+ "head_axis": "tp",
+ "hidden_state_axis": "tp",
+ "key_sequence_axis": "sp",
+ "kv_head_axis": "tp",
+ "mlp_intermediate_axis": "tp",
+ "query_sequence_axis": "sp",
+ "sequence_axis": "sp",
+ "sequence_parallel_axis": "sp",
+ "tensor_parallel_axis": "tp",
+ "vocab_axis": "tp"
+ },
+ "platform": null,
+ "precompute_masks": true,
+ "pretraining_tp": 1,
+ "qmm_platform_override": null,
+ "qmm_tpu_path_override": null,
+ "quantization_config": null,
+ "scan_attention_layers": false,
+ "scan_mlp_chunk_size": 1024,
+ "scan_ring_attention": true,
+ "sep_token_id": null,
+ "sequence_axis_name": "sp",
+ "sharding_axis_dims": [
+ 1,
+ -1,
+ 1,
+ 1,
+ 1
+ ],
+ "sharding_axis_names": [
+ "dp",
+ "fsdp",
+ "ep",
+ "tp",
+ "sp"
+ ],
+ "sharding_dcn_axis_dims": null,
+ "sp_is_ep_bound": true,
+ "text_config": {
+ "_external_rope_config_kwargs": {},
+ "add_cross_attention": false,
+ "architectures": [
+ "Qwen3_5ForConditionalGeneration"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "attn_dtype": "bfloat16",
+ "attn_mechanism": "vanilla",
+ "attn_output_gate": true,
+ "attn_softmax_dtype": "float32",
+ "backend": null,
+ "bits": null,
+ "blocksize_b": 1,
+ "blocksize_k": 512,
+ "blocksize_q": 512,
+ "bos_token_id": null,
+ "cross_attention_hidden_size": null,
+ "decode_attn_mechanism": null,
+ "decoder_sparse_step": 1,
+ "decoder_start_token_id": null,
+ "dtype": "bfloat16",
+ "easy_method": "train",
+ "eos_token_id": 248044,
+ "fcm_max_ratio": 0.0,
+ "fcm_min_ratio": 0.0,
+ "flash_attention_backward_pass_impl": "triton",
+ "fsdp_is_ep_bound": true,
+ "full_attention_interval": 4,
+ "gradient_checkpointing": "",
+ "gradient_checkpointing_targets": null,
+ "hardware_abstraction": false,
+ "head_dim": 256,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 6144,
+ "is_decoder": false,
+ "kv_cache_quantization_config": null,
+ "kv_cache_sharding_sequence_axis_name": "sp",
+ "kvdtype": "bfloat16",
+ "layer_types": [
+ "linear_attention",
+ "linear_attention",
+ "linear_attention",
+ "full_attention",
+ "linear_attention",
+ "linear_attention",
+ "linear_attention",
+ "full_attention",
+ "linear_attention",
+ "linear_attention",
+ "linear_attention",
+ "full_attention",
+ "linear_attention",
+ "linear_attention",
+ "linear_attention",
+ "full_attention",
+ "linear_attention",
+ "linear_attention",
+ "linear_attention",
+ "full_attention",
+ "linear_attention",
+ "linear_attention",
+ "linear_attention",
+ "full_attention"
+ ],
+ "linear_attention_separate_proj": true,
+ "linear_conv_kernel_dim": 4,
+ "linear_key_head_dim": 128,
+ "linear_num_key_heads": 16,
+ "linear_num_value_heads": 16,
+ "linear_value_head_dim": 128,
+ "lmhead_chunksize": null,
+ "mamba_ssm_dtype": "float32",
+ "max_position_embeddings": 262144,
+ "mla_attn_dtype": "bfloat16",
+ "mla_attn_mechanism": "auto",
+ "mla_attn_softmax_dtype": "float32",
+ "mlp_only_layers": [
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 5,
+ 6,
+ 7,
+ 8,
+ 9,
+ 10,
+ 11,
+ 12,
+ 13,
+ 14,
+ 15,
+ 16,
+ 17,
+ 18,
+ 19,
+ 20,
+ 21,
+ 22,
+ 23
+ ],
+ "model_type": "qwen3_5_text",
+ "moe_force_xla_gmm": false,
+ "moe_intermediate_size": 512,
+ "moe_method": "fused_moe",
+ "moe_tiling_size_batch": 4,
+ "moe_tiling_size_dim": 128,
+ "moe_tiling_size_seqlen": 128,
+ "mtp_num_hidden_layers": 1,
+ "mtp_use_dedicated_embeddings": false,
+ "norm_topk_prob": true,
+ "num_attention_heads": 8,
+ "num_experts": 256,
+ "num_experts_per_tok": 8,
+ "num_hidden_layers": 24,
+ "num_key_value_heads": 2,
+ "num_local_experts": 256,
+ "operation_configs": null,
+ "output_router_logits": false,
+ "pad_token_id": null,
+ "pallas_k_block_size": 128,
+ "pallas_m_block_size": 128,
+ "pallas_n_block_size": 128,
+ "partial_rotary_factor": 0.25,
+ "partition_axis": {
+ "attention_dim_axis": null,
+ "attention_kv_dim_axis": null,
+ "batch_axis": [
+ "fsdp",
+ "dp"
+ ],
+ "bias_head_sequence_axis": null,
+ "bias_key_sequence_axis": null,
+ "data_parallel_axis": "dp",
+ "decode_attention_dim_axis": null,
+ "decode_attention_kv_dim_axis": null,
+ "decode_batch_axis": [
+ "fsdp",
+ "dp"
+ ],
+ "decode_head_axis": "tp",
+ "decode_key_sequence_axis": "sp",
+ "decode_kv_head_axis": "tp",
+ "decode_query_sequence_axis": null,
+ "expert_axis": "ep",
+ "expert_gate_axis": null,
+ "expert_parallel_axis": "ep",
+ "fully_sharded_data_parallel_axis": "fsdp",
+ "head_axis": "tp",
+ "hidden_state_axis": "tp",
+ "key_sequence_axis": "sp",
+ "kv_head_axis": "tp",
+ "mlp_intermediate_axis": "tp",
+ "query_sequence_axis": "sp",
+ "sequence_axis": "sp",
+ "sequence_parallel_axis": "sp",
+ "tensor_parallel_axis": "tp",
+ "vocab_axis": "tp"
+ },
+ "platform": null,
+ "precompute_masks": true,
+ "pretraining_tp": 1,
+ "qmm_platform_override": null,
+ "qmm_tpu_path_override": null,
+ "quantization_config": null,
+ "rms_norm_eps": 1e-06,
+ "rope_parameters": {
+ "mrope_interleaved": true,
+ "mrope_section": [
+ 11,
+ 11,
+ 10
+ ],
+ "partial_rotary_factor": 0.25,
+ "rope_theta": 10000000,
+ "rope_type": "default",
+ "type": "default"
+ },
+ "rope_theta": 10000000,
+ "router_aux_loss_coef": 0.001,
+ "scan_attention_layers": false,
+ "scan_mlp_chunk_size": 1024,
+ "scan_ring_attention": true,
+ "sep_token_id": null,
+ "sequence_axis_name": "sp",
+ "sharding_axis_dims": [
+ 1,
+ -1,
+ 1,
+ 1,
+ 1
+ ],
+ "sharding_axis_names": [
+ "dp",
+ "fsdp",
+ "ep",
+ "tp",
+ "sp"
+ ],
+ "sharding_dcn_axis_dims": null,
+ "shared_expert_intermediate_size": 512,
+ "sp_is_ep_bound": true,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "use_cache": true,
+ "use_expert_tensor_mode": false,
+ "use_qmm_best_config": false,
+ "use_ring_of_experts": false,
+ "use_scan_mlp": false,
+ "use_sharded_kv_caching": false,
+ "use_sharding_constraint": false,
+ "vocab_size": 248320
+ },
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "transformers_version": "5.5.0",
+ "use_expert_tensor_mode": false,
+ "use_qmm_best_config": false,
+ "use_ring_of_experts": false,
+ "use_scan_mlp": false,
+ "use_sharded_kv_caching": false,
+ "use_sharding_constraint": false,
+ "video_token_id": 248057,
+ "vision_config": {
+ "_external_rope_config_kwargs": {},
+ "add_cross_attention": false,
+ "architectures": [
+ "Qwen3_5ForConditionalGeneration"
+ ],
+ "attn_dtype": "bfloat16",
+ "attn_mechanism": "vanilla",
+ "attn_softmax_dtype": "float32",
+ "backend": null,
+ "bits": null,
+ "blocksize_b": 1,
+ "blocksize_k": 512,
+ "blocksize_q": 512,
+ "bos_token_id": null,
+ "cross_attention_hidden_size": null,
+ "decode_attn_mechanism": null,
+ "decoder_start_token_id": null,
+ "deepstack_visual_indexes": [],
+ "depth": 24,
+ "easy_method": "train",
+ "embed_dim": 1024,
+ "eos_token_id": null,
+ "fcm_max_ratio": 0.0,
+ "fcm_min_ratio": 0.0,
+ "flash_attention_backward_pass_impl": "triton",
+ "fsdp_is_ep_bound": true,
+ "gradient_checkpointing": "",
+ "gradient_checkpointing_targets": null,
+ "hardware_abstraction": false,
+ "hidden_act": "gelu_pytorch_tanh",
+ "hidden_size": 1024,
+ "in_channels": 3,
+ "initializer_range": 0.02,
+ "intermediate_size": 4096,
+ "is_decoder": false,
+ "kv_cache_quantization_config": null,
+ "kv_cache_sharding_sequence_axis_name": "sp",
+ "kvdtype": "bfloat16",
+ "lmhead_chunksize": null,
+ "max_position_embeddings": null,
+ "mla_attn_dtype": "bfloat16",
+ "mla_attn_mechanism": "auto",
+ "mla_attn_softmax_dtype": "float32",
+ "model_type": "qwen3_5",
+ "moe_force_xla_gmm": false,
+ "moe_method": "fused_moe",
+ "moe_tiling_size_batch": 4,
+ "moe_tiling_size_dim": 128,
+ "moe_tiling_size_seqlen": 128,
+ "num_attention_heads": 16,
+ "num_heads": 16,
+ "num_position_embeddings": 2304,
+ "operation_configs": null,
+ "out_hidden_size": 2048,
+ "pad_token_id": null,
+ "pallas_k_block_size": 128,
+ "pallas_m_block_size": 128,
+ "pallas_n_block_size": 128,
+ "partition_axis": {
+ "attention_dim_axis": null,
+ "attention_kv_dim_axis": null,
+ "batch_axis": [
+ "fsdp",
+ "dp"
+ ],
+ "bias_head_sequence_axis": null,
+ "bias_key_sequence_axis": null,
+ "data_parallel_axis": "dp",
+ "decode_attention_dim_axis": null,
+ "decode_attention_kv_dim_axis": null,
+ "decode_batch_axis": [
+ "fsdp",
+ "dp"
+ ],
+ "decode_head_axis": "tp",
+ "decode_key_sequence_axis": "sp",
+ "decode_kv_head_axis": "tp",
+ "decode_query_sequence_axis": null,
+ "expert_axis": "ep",
+ "expert_gate_axis": null,
+ "expert_parallel_axis": "ep",
+ "fully_sharded_data_parallel_axis": "fsdp",
+ "head_axis": "tp",
+ "hidden_state_axis": "tp",
+ "key_sequence_axis": "sp",
+ "kv_head_axis": "tp",
+ "mlp_intermediate_axis": "tp",
+ "query_sequence_axis": "sp",
+ "sequence_axis": "sp",
+ "sequence_parallel_axis": "sp",
+ "tensor_parallel_axis": "tp",
+ "vocab_axis": "tp"
+ },
+ "patch_size": 16,
+ "platform": null,
+ "precompute_masks": true,
+ "pretraining_tp": 1,
+ "qmm_platform_override": null,
+ "qmm_tpu_path_override": null,
+ "quantization_config": null,
+ "scan_attention_layers": false,
+ "scan_mlp_chunk_size": 1024,
+ "scan_ring_attention": true,
+ "sep_token_id": null,
+ "sequence_axis_name": "sp",
+ "sharding_axis_dims": [
+ 1,
+ -1,
+ 1,
+ 1,
+ 1
+ ],
+ "sharding_axis_names": [
+ "dp",
+ "fsdp",
+ "ep",
+ "tp",
+ "sp"
+ ],
+ "sharding_dcn_axis_dims": null,
+ "sp_is_ep_bound": true,
+ "spatial_merge_size": 2,
+ "temporal_patch_size": 2,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "tokens_per_second": 2.0,
+ "use_expert_tensor_mode": false,
+ "use_qmm_best_config": false,
+ "use_ring_of_experts": false,
+ "use_scan_mlp": false,
+ "use_sharded_kv_caching": false,
+ "use_sharding_constraint": false
+ },
+ "vision_end_token_id": 248054,
+ "vision_start_token_id": 248053
+}
diff --git a/model/model/language_model/embed_tokens/embedding/.zarray b/model/model/language_model/embed_tokens/embedding/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..dfcb4ffa7e0bbca3569d481cc366e7c2e4f1fea5
--- /dev/null
+++ b/model/model/language_model/embed_tokens/embedding/.zarray
@@ -0,0 +1 @@
+{"chunks":[62080,2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[248320,2048],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/input_layernorm/kernel/.zarray b/model/model/language_model/layers/0/input_layernorm/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..3d403f2b8224d7972b69bca7c2c483d6b3c57b57
--- /dev/null
+++ b/model/model/language_model/layers/0/input_layernorm/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/input_layernorm/kernel/0 b/model/model/language_model/layers/0/input_layernorm/kernel/0
new file mode 100644
index 0000000000000000000000000000000000000000..bdc350ba7b48847e8d2d9d1c2a02949a1b47807c
Binary files /dev/null and b/model/model/language_model/layers/0/input_layernorm/kernel/0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/A_log/.zarray b/model/model/language_model/layers/0/linear_attn/A_log/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1
--- /dev/null
+++ b/model/model/language_model/layers/0/linear_attn/A_log/.zarray
@@ -0,0 +1 @@
+{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/linear_attn/A_log/0 b/model/model/language_model/layers/0/linear_attn/A_log/0
new file mode 100644
index 0000000000000000000000000000000000000000..1352a4b2d875d72753a8c385abbf87b4d4212748
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/A_log/0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/conv1d/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/conv1d/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..5f4f6d0e28041fb4b10478ee77629459f48d3285
--- /dev/null
+++ b/model/model/language_model/layers/0/linear_attn/conv1d/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[4,1,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4,1,6144],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/linear_attn/conv1d/kernel/0.0.0 b/model/model/language_model/layers/0/linear_attn/conv1d/kernel/0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..424eba11ba0a14e9cda88dbedf43c7e2102ef43c
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/conv1d/kernel/0.0.0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/dt_bias/.zarray b/model/model/language_model/layers/0/linear_attn/dt_bias/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1
--- /dev/null
+++ b/model/model/language_model/layers/0/linear_attn/dt_bias/.zarray
@@ -0,0 +1 @@
+{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/linear_attn/dt_bias/0 b/model/model/language_model/layers/0/linear_attn/dt_bias/0
new file mode 100644
index 0000000000000000000000000000000000000000..c5353d6982bee441ff4c1e24e4f7d711421fd3b2
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/dt_bias/0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..dcdd7c94b2b0674831c22988747c3b77160dc86c
--- /dev/null
+++ b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[512,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/0.0 b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/0.0
new file mode 100644
index 0000000000000000000000000000000000000000..1e14011041bcf25dbde4179bc3716104bc58aef0
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/0.0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/1.0 b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/1.0
new file mode 100644
index 0000000000000000000000000000000000000000..cc63abd5d918ab9fb9c77a0397326d8690d320db
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/1.0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/2.0 b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/2.0
new file mode 100644
index 0000000000000000000000000000000000000000..5836a2604a7ff7efec37b34a252171ca010b51d5
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/2.0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/3.0 b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/3.0
new file mode 100644
index 0000000000000000000000000000000000000000..7ee5848d713f8abb1fe05e70ad64779150e732b6
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/3.0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..dcdd7c94b2b0674831c22988747c3b77160dc86c
--- /dev/null
+++ b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[512,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/0.0 b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/0.0
new file mode 100644
index 0000000000000000000000000000000000000000..60c228f6c634078c36d1f50708b1f440600cd930
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/0.0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/1.0 b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/1.0
new file mode 100644
index 0000000000000000000000000000000000000000..e823c971c974662915c9ff335c7e8003f82d96eb
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/1.0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/2.0 b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/2.0
new file mode 100644
index 0000000000000000000000000000000000000000..f36836bcd324a6d5c088cce73ff72c5da668e0a2
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/2.0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/3.0 b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/3.0
new file mode 100644
index 0000000000000000000000000000000000000000..07192713ad2d1c6275f68df4c40ae77c57d4e3aa
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/3.0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_qkv/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/in_proj_qkv/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..6efec79918803a8fdb7fab431f109c04dad3ac3c
--- /dev/null
+++ b/model/model/language_model/layers/0/linear_attn/in_proj_qkv/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[512,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,6144],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_z/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/in_proj_z/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..059f1aabbdc9e18d982e2db50174668264fc96e4
--- /dev/null
+++ b/model/model/language_model/layers/0/linear_attn/in_proj_z/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[512,2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,2048],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/linear_attn/norm/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/norm/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..3eeba2a45304285824238b9edd4f261c3d5d6f01
--- /dev/null
+++ b/model/model/language_model/layers/0/linear_attn/norm/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[128],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/linear_attn/norm/kernel/0 b/model/model/language_model/layers/0/linear_attn/norm/kernel/0
new file mode 100644
index 0000000000000000000000000000000000000000..c261eea19ec869e9d0e8c33266c2f64ddf7ada8a
Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/norm/kernel/0 differ
diff --git a/model/model/language_model/layers/0/linear_attn/out_proj/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/out_proj/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..1f43215c04ea8738b5721bed11256b14599d4e36
--- /dev/null
+++ b/model/model/language_model/layers/0/linear_attn/out_proj/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[2048,512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,2048],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/mlp/down_proj/kernel/.zarray b/model/model/language_model/layers/0/mlp/down_proj/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..ea5e3899eef8cfa2cf94f843bf6ce55302a6c9b6
--- /dev/null
+++ b/model/model/language_model/layers/0/mlp/down_proj/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[6144,512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[6144,2048],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/mlp/gate_proj/kernel/.zarray b/model/model/language_model/layers/0/mlp/gate_proj/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..6efec79918803a8fdb7fab431f109c04dad3ac3c
--- /dev/null
+++ b/model/model/language_model/layers/0/mlp/gate_proj/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[512,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,6144],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/mlp/up_proj/kernel/.zarray b/model/model/language_model/layers/0/mlp/up_proj/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..6efec79918803a8fdb7fab431f109c04dad3ac3c
--- /dev/null
+++ b/model/model/language_model/layers/0/mlp/up_proj/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[512,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,6144],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/post_attention_layernorm/kernel/.zarray b/model/model/language_model/layers/0/post_attention_layernorm/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..3d403f2b8224d7972b69bca7c2c483d6b3c57b57
--- /dev/null
+++ b/model/model/language_model/layers/0/post_attention_layernorm/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/0/post_attention_layernorm/kernel/0 b/model/model/language_model/layers/0/post_attention_layernorm/kernel/0
new file mode 100644
index 0000000000000000000000000000000000000000..b56ef507f57d632aba75ad2c0215430a9bdc2a24
Binary files /dev/null and b/model/model/language_model/layers/0/post_attention_layernorm/kernel/0 differ
diff --git a/model/model/language_model/layers/1/input_layernorm/kernel/.zarray b/model/model/language_model/layers/1/input_layernorm/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..3d403f2b8224d7972b69bca7c2c483d6b3c57b57
--- /dev/null
+++ b/model/model/language_model/layers/1/input_layernorm/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/1/input_layernorm/kernel/0 b/model/model/language_model/layers/1/input_layernorm/kernel/0
new file mode 100644
index 0000000000000000000000000000000000000000..cde235dbae6ba57d361c9f82ad3679ab8eeb2ded
Binary files /dev/null and b/model/model/language_model/layers/1/input_layernorm/kernel/0 differ
diff --git a/model/model/language_model/layers/1/linear_attn/A_log/.zarray b/model/model/language_model/layers/1/linear_attn/A_log/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1
--- /dev/null
+++ b/model/model/language_model/layers/1/linear_attn/A_log/.zarray
@@ -0,0 +1 @@
+{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/1/linear_attn/A_log/0 b/model/model/language_model/layers/1/linear_attn/A_log/0
new file mode 100644
index 0000000000000000000000000000000000000000..ce54b390189ebac616a4797a472b62a770d43187
Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/A_log/0 differ
diff --git a/model/model/language_model/layers/1/linear_attn/conv1d/kernel/.zarray b/model/model/language_model/layers/1/linear_attn/conv1d/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..5f4f6d0e28041fb4b10478ee77629459f48d3285
--- /dev/null
+++ b/model/model/language_model/layers/1/linear_attn/conv1d/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[4,1,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4,1,6144],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/1/linear_attn/conv1d/kernel/0.0.0 b/model/model/language_model/layers/1/linear_attn/conv1d/kernel/0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..7270ad5188adc63dbfa9497ede39b8049b5f87a8
Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/conv1d/kernel/0.0.0 differ
diff --git a/model/model/language_model/layers/1/linear_attn/dt_bias/.zarray b/model/model/language_model/layers/1/linear_attn/dt_bias/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1
--- /dev/null
+++ b/model/model/language_model/layers/1/linear_attn/dt_bias/.zarray
@@ -0,0 +1 @@
+{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/1/linear_attn/dt_bias/0 b/model/model/language_model/layers/1/linear_attn/dt_bias/0
new file mode 100644
index 0000000000000000000000000000000000000000..47c68d1a02863b57b741876952620ce9681113f7
Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/dt_bias/0 differ
diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/.zarray b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..dcdd7c94b2b0674831c22988747c3b77160dc86c
--- /dev/null
+++ b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[512,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/0.0 b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/0.0
new file mode 100644
index 0000000000000000000000000000000000000000..051da798301ef0e9de219d6d6b4eb42eb00ccb8c
Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/0.0 differ
diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/1.0 b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/1.0
new file mode 100644
index 0000000000000000000000000000000000000000..ee31a1f9fa0fa35067d49393a10dfdf6bbd3b335
Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/1.0 differ
diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/2.0 b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/2.0
new file mode 100644
index 0000000000000000000000000000000000000000..604062e7628893f2ea876250b614f22864d5ee2a
Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/2.0 differ
diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/3.0 b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/3.0
new file mode 100644
index 0000000000000000000000000000000000000000..fbd7cbfa5c15bc585e02f64a59a8cac3825a658c
Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/3.0 differ
diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/.zarray b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..dcdd7c94b2b0674831c22988747c3b77160dc86c
--- /dev/null
+++ b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[512,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/0.0 b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/0.0
new file mode 100644
index 0000000000000000000000000000000000000000..eaa2bedb9c687152a0c112c424aa76a3874baddf
Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/0.0 differ
diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/1.0 b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/1.0
new file mode 100644
index 0000000000000000000000000000000000000000..451ac165e5cf2ed1cbbf893d61754a97fe78864c
Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/1.0 differ
diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/2.0 b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/2.0
new file mode 100644
index 0000000000000000000000000000000000000000..b69480a1a297561305809eaa91c9e6150c4f9cf7
Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/2.0 differ
diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/3.0 b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/3.0
new file mode 100644
index 0000000000000000000000000000000000000000..a9f7f3add47f09fe7124f2bf2d7bd5ba0caa7b16
Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/3.0 differ
diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_qkv/kernel/.zarray b/model/model/language_model/layers/1/linear_attn/in_proj_qkv/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..6efec79918803a8fdb7fab431f109c04dad3ac3c
--- /dev/null
+++ b/model/model/language_model/layers/1/linear_attn/in_proj_qkv/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[512,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,6144],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_z/kernel/.zarray b/model/model/language_model/layers/1/linear_attn/in_proj_z/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..059f1aabbdc9e18d982e2db50174668264fc96e4
--- /dev/null
+++ b/model/model/language_model/layers/1/linear_attn/in_proj_z/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[512,2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,2048],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/1/linear_attn/norm/kernel/.zarray b/model/model/language_model/layers/1/linear_attn/norm/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..3eeba2a45304285824238b9edd4f261c3d5d6f01
--- /dev/null
+++ b/model/model/language_model/layers/1/linear_attn/norm/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[128],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/1/linear_attn/norm/kernel/0 b/model/model/language_model/layers/1/linear_attn/norm/kernel/0
new file mode 100644
index 0000000000000000000000000000000000000000..2a2e8dfaac352c0b9aa94ded6cbf4afc1a0dda81
Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/norm/kernel/0 differ
diff --git a/model/model/language_model/layers/1/linear_attn/out_proj/kernel/.zarray b/model/model/language_model/layers/1/linear_attn/out_proj/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..1f43215c04ea8738b5721bed11256b14599d4e36
--- /dev/null
+++ b/model/model/language_model/layers/1/linear_attn/out_proj/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[2048,512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,2048],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/1/mlp/down_proj/kernel/.zarray b/model/model/language_model/layers/1/mlp/down_proj/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..ea5e3899eef8cfa2cf94f843bf6ce55302a6c9b6
--- /dev/null
+++ b/model/model/language_model/layers/1/mlp/down_proj/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[6144,512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[6144,2048],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/1/mlp/gate_proj/kernel/.zarray b/model/model/language_model/layers/1/mlp/gate_proj/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..6efec79918803a8fdb7fab431f109c04dad3ac3c
--- /dev/null
+++ b/model/model/language_model/layers/1/mlp/gate_proj/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[512,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,6144],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/1/post_attention_layernorm/kernel/0 b/model/model/language_model/layers/1/post_attention_layernorm/kernel/0
new file mode 100644
index 0000000000000000000000000000000000000000..49ff9124a63982799c3fa7b8fc7e8839c28e2860
Binary files /dev/null and b/model/model/language_model/layers/1/post_attention_layernorm/kernel/0 differ
diff --git a/model/model/language_model/layers/10/linear_attn/A_log/0 b/model/model/language_model/layers/10/linear_attn/A_log/0
new file mode 100644
index 0000000000000000000000000000000000000000..9b7d55c3cc74659ede4575e900825ae10b6f6e7d
Binary files /dev/null and b/model/model/language_model/layers/10/linear_attn/A_log/0 differ
diff --git a/model/model/language_model/layers/10/linear_attn/dt_bias/.zarray b/model/model/language_model/layers/10/linear_attn/dt_bias/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1
--- /dev/null
+++ b/model/model/language_model/layers/10/linear_attn/dt_bias/.zarray
@@ -0,0 +1 @@
+{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/10/linear_attn/in_proj_a/kernel/.zarray b/model/model/language_model/layers/10/linear_attn/in_proj_a/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..dcdd7c94b2b0674831c22988747c3b77160dc86c
--- /dev/null
+++ b/model/model/language_model/layers/10/linear_attn/in_proj_a/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[512,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/10/linear_attn/in_proj_a/kernel/0.0 b/model/model/language_model/layers/10/linear_attn/in_proj_a/kernel/0.0
new file mode 100644
index 0000000000000000000000000000000000000000..f0334323d743df81fb473dcd09ac038fc9845f2f
Binary files /dev/null and b/model/model/language_model/layers/10/linear_attn/in_proj_a/kernel/0.0 differ
diff --git a/model/model/language_model/layers/10/linear_attn/in_proj_b/kernel/3.0 b/model/model/language_model/layers/10/linear_attn/in_proj_b/kernel/3.0
new file mode 100644
index 0000000000000000000000000000000000000000..5fb8e020347f7d0c20f7bdf7c8bb01f23ea9191a
Binary files /dev/null and b/model/model/language_model/layers/10/linear_attn/in_proj_b/kernel/3.0 differ
diff --git a/model/model/language_model/layers/13/post_attention_layernorm/kernel/.zarray b/model/model/language_model/layers/13/post_attention_layernorm/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..3d403f2b8224d7972b69bca7c2c483d6b3c57b57
--- /dev/null
+++ b/model/model/language_model/layers/13/post_attention_layernorm/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/13/post_attention_layernorm/kernel/0 b/model/model/language_model/layers/13/post_attention_layernorm/kernel/0
new file mode 100644
index 0000000000000000000000000000000000000000..60f740ddc78e66bf49ee925d45c80b9445b5e86a
Binary files /dev/null and b/model/model/language_model/layers/13/post_attention_layernorm/kernel/0 differ
diff --git a/model/model/language_model/layers/14/input_layernorm/kernel/.zarray b/model/model/language_model/layers/14/input_layernorm/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..3d403f2b8224d7972b69bca7c2c483d6b3c57b57
--- /dev/null
+++ b/model/model/language_model/layers/14/input_layernorm/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/14/linear_attn/conv1d/kernel/0.0.0 b/model/model/language_model/layers/14/linear_attn/conv1d/kernel/0.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..a81a2444be66d06e25c69462d2659424959fe60c
Binary files /dev/null and b/model/model/language_model/layers/14/linear_attn/conv1d/kernel/0.0.0 differ
diff --git a/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/.zarray b/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..dcdd7c94b2b0674831c22988747c3b77160dc86c
--- /dev/null
+++ b/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[512,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/1.0 b/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/1.0
new file mode 100644
index 0000000000000000000000000000000000000000..93f6e0a102b71adc960ff8c5dbea8445369bd385
Binary files /dev/null and b/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/1.0 differ
diff --git a/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/3.0 b/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/3.0
new file mode 100644
index 0000000000000000000000000000000000000000..c99b2619c2c7cd6227fe4a2113f021e21c550888
Binary files /dev/null and b/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/3.0 differ
diff --git a/model/model/language_model/layers/14/linear_attn/in_proj_b/kernel/.zarray b/model/model/language_model/layers/14/linear_attn/in_proj_b/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..dcdd7c94b2b0674831c22988747c3b77160dc86c
--- /dev/null
+++ b/model/model/language_model/layers/14/linear_attn/in_proj_b/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[512,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,16],"zarr_format":2}
\ No newline at end of file
diff --git a/model/model/language_model/layers/14/linear_attn/in_proj_b/kernel/1.0 b/model/model/language_model/layers/14/linear_attn/in_proj_b/kernel/1.0
new file mode 100644
index 0000000000000000000000000000000000000000..61e4cd6f836a2ffa00d99a36030dbf68a7b5186c
Binary files /dev/null and b/model/model/language_model/layers/14/linear_attn/in_proj_b/kernel/1.0 differ
diff --git a/model/model/language_model/layers/14/linear_attn/in_proj_z/kernel/.zarray b/model/model/language_model/layers/14/linear_attn/in_proj_z/kernel/.zarray
new file mode 100644
index 0000000000000000000000000000000000000000..059f1aabbdc9e18d982e2db50174668264fc96e4
--- /dev/null
+++ b/model/model/language_model/layers/14/linear_attn/in_proj_z/kernel/.zarray
@@ -0,0 +1 @@
+{"chunks":[512,2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,2048],"zarr_format":2}
\ No newline at end of file
diff --git a/tensorstore_index.json b/tensorstore_index.json
new file mode 100644
index 0000000000000000000000000000000000000000..1156d6d8c90b54e01d416192f9567802d3d5f2f1
--- /dev/null
+++ b/tensorstore_index.json
@@ -0,0 +1,4653 @@
+{
+ "format": "tensorstore",
+ "version": "easydel",
+ "prefixes": {
+ "model": [
+ {
+ "path": "model/model/language_model/embed_tokens/embedding",
+ "shape": [
+ 248320,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/0/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/1/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/10/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/self_attn/k_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/self_attn/k_proj/kernel",
+ "shape": [
+ 2048,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/self_attn/o_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/self_attn/q_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/self_attn/q_proj/kernel",
+ "shape": [
+ 2048,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/11/self_attn/v_proj/kernel",
+ "shape": [
+ 2048,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/12/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/13/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/14/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/self_attn/k_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/self_attn/k_proj/kernel",
+ "shape": [
+ 2048,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/self_attn/o_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/self_attn/q_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/self_attn/q_proj/kernel",
+ "shape": [
+ 2048,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/15/self_attn/v_proj/kernel",
+ "shape": [
+ 2048,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/16/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/17/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/18/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/self_attn/k_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/self_attn/k_proj/kernel",
+ "shape": [
+ 2048,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/self_attn/o_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/self_attn/q_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/self_attn/q_proj/kernel",
+ "shape": [
+ 2048,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/19/self_attn/v_proj/kernel",
+ "shape": [
+ 2048,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/2/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/20/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/21/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/22/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/self_attn/k_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/self_attn/k_proj/kernel",
+ "shape": [
+ 2048,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/self_attn/o_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/self_attn/q_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/self_attn/q_proj/kernel",
+ "shape": [
+ 2048,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/23/self_attn/v_proj/kernel",
+ "shape": [
+ 2048,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/self_attn/k_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/self_attn/k_proj/kernel",
+ "shape": [
+ 2048,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/self_attn/o_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/self_attn/q_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/self_attn/q_proj/kernel",
+ "shape": [
+ 2048,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/3/self_attn/v_proj/kernel",
+ "shape": [
+ 2048,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/4/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/5/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/6/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/self_attn/k_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/self_attn/k_proj/kernel",
+ "shape": [
+ 2048,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/self_attn/o_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/self_attn/q_norm/kernel",
+ "shape": [
+ 256
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/self_attn/q_proj/kernel",
+ "shape": [
+ 2048,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/7/self_attn/v_proj/kernel",
+ "shape": [
+ 2048,
+ 512
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/8/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/input_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/A_log",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/conv1d/kernel",
+ "shape": [
+ 4,
+ 1,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/dt_bias",
+ "shape": [
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/in_proj_a/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/in_proj_b/kernel",
+ "shape": [
+ 2048,
+ 16
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/in_proj_qkv/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/in_proj_z/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/norm/kernel",
+ "shape": [
+ 128
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/linear_attn/out_proj/kernel",
+ "shape": [
+ 2048,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/mlp/down_proj/kernel",
+ "shape": [
+ 6144,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/mlp/gate_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/mlp/up_proj/kernel",
+ "shape": [
+ 2048,
+ 6144
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/layers/9/post_attention_layernorm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/language_model/norm/kernel",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/0/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/1/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/10/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/11/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/12/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/12/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/12/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/12/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/12/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/12/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/12/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/12/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/12/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/12/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/12/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/12/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/13/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/13/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/13/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/13/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/13/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/13/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/13/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/13/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/13/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/13/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/13/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/13/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/14/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/14/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/14/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/14/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/14/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/14/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/14/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/14/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/14/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/14/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/14/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/14/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/15/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/15/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/15/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/15/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/15/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/15/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/15/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/15/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/15/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/15/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/15/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/15/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/16/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/16/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/16/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/16/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/16/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/16/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/16/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/16/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/16/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/16/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/16/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/16/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/17/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/17/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/17/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/17/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/17/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/17/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/17/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/17/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/17/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/17/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/17/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/17/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/18/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/18/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/18/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/18/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/18/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/18/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/18/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/18/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/18/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/18/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/18/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/18/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/19/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/19/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/19/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/19/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/19/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/19/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/19/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/19/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/19/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/19/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/19/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/19/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/2/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/20/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/20/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/20/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/20/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/20/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/20/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/20/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/20/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/20/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/20/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/20/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/20/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/21/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/21/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/21/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/21/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/21/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/21/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/21/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/21/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/21/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/21/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/21/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/21/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/22/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/22/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/22/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/22/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/22/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/22/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/22/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/22/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/22/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/22/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/22/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/22/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/23/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/23/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/23/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/23/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/23/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/23/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/23/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/23/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/23/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/23/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/23/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/23/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/3/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/4/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/5/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/6/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/7/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/8/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/attn/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/attn/proj/kernel",
+ "shape": [
+ 1024,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/attn/qkv/bias",
+ "shape": [
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/attn/qkv/kernel",
+ "shape": [
+ 1024,
+ 3072
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/mlp/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/mlp/linear_fc1/kernel",
+ "shape": [
+ 1024,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/mlp/linear_fc2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/mlp/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/norm1/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/norm1/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/norm2/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/blocks/9/norm2/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/merger/linear_fc1/bias",
+ "shape": [
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/merger/linear_fc1/kernel",
+ "shape": [
+ 4096,
+ 4096
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/merger/linear_fc2/bias",
+ "shape": [
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/merger/linear_fc2/kernel",
+ "shape": [
+ 4096,
+ 2048
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/merger/norm/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/merger/norm/scale",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/patch_embed/proj/bias",
+ "shape": [
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/patch_embed/proj/kernel",
+ "shape": [
+ 2,
+ 16,
+ 16,
+ 3,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ },
+ {
+ "path": "model/model/visual/pos_embed/embedding",
+ "shape": [
+ 2304,
+ 1024
+ ],
+ "dtype": "bfloat16"
+ }
+ ]
+ }
+}
\ No newline at end of file
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..b4a37b2a6fd3ab3317cd7bac72855be1a843b2bb
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,31 @@
+{
+ "add_prefix_space": false,
+ "audio_bos_token": "<|audio_start|>",
+ "audio_eos_token": "<|audio_end|>",
+ "audio_token": "<|audio_pad|>",
+ "backend": "tokenizers",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|endoftext|>",
+ "errors": "replace",
+ "image_token": "<|image_pad|>",
+ "is_local": false,
+ "model_max_length": 262144,
+ "model_specific_special_tokens": {
+ "audio_bos_token": "<|audio_start|>",
+ "audio_eos_token": "<|audio_end|>",
+ "audio_token": "<|audio_pad|>",
+ "image_token": "<|image_pad|>",
+ "video_token": "<|video_pad|>",
+ "vision_bos_token": "<|vision_start|>",
+ "vision_eos_token": "<|vision_end|>"
+ },
+ "pad_token": "<|endoftext|>",
+ "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+ "split_special_tokens": false,
+ "tokenizer_class": "TokenizersBackend",
+ "unk_token": null,
+ "video_token": "<|video_pad|>",
+ "vision_bos_token": "<|vision_start|>",
+ "vision_eos_token": "<|vision_end|>"
+}