diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b2992a9536cf20e9f164510346c9aa3e6ac3126b --- /dev/null +++ b/README.md @@ -0,0 +1,159 @@ +--- +library_name: easydel +pipeline_tag: image-to-text +tags: + - easydel + - jax + - "qwen3_5" + - "ImageTextToText" + - "vanilla" +--- + +

+ EasyDeL +

+ +

Qwen/Qwen3.5-0.8B-Base

+ +
+ EasyDeL checkpoint converted from Qwen/Qwen3.5-0.8B-Base. +
+ +## Overview + +This checkpoint is intended to be loaded with EasyDeL on JAX (CPU/GPU/TPU). It supports sharded loading with `auto_shard_model=True` and configurable precision via `dtype`, `param_dtype`, and `precision`. + +## Quickstart + +```python +import easydel as ed +from jax import numpy as jnp, lax + +# NOTE: the path below is the local directory used during conversion; +# replace it with this checkpoint's Hub repo id or your own local checkpoint directory. +repo_id = "/dev/shm/conv/Qwen3.5-0.8B-Base" + +dtype = jnp.bfloat16 # try jnp.float16 on many GPUs + +model = ed.AutoEasyDeLModelForImageTextToText.from_pretrained( + repo_id, + dtype=dtype, + param_dtype=dtype, + precision=lax.Precision("fastest"), + sharding_axis_names=("dp", "fsdp", "ep", "tp", "sp"), + sharding_axis_dims=(1, -1, 1, 1, 1), + config_kwargs=ed.EasyDeLBaseConfigDict( + attn_dtype=dtype, + attn_mechanism=ed.AttentionMechanisms.VANILLA, + fsdp_is_ep_bound=True, + sp_is_ep_bound=True, + moe_method=ed.MoEMethods.FUSED_MOE, + ), + auto_shard_model=True, + partition_axis=ed.PartitionAxis(), +) +``` + +If the repository only provides PyTorch weights, pass `from_torch=True` to `from_pretrained(...)`. + +## Sharding & Parallelism (Multi-Device) + +EasyDeL can scale to multiple devices by creating a logical device mesh. Most EasyDeL loaders use a 5D mesh: + +- `dp`: data parallel (replicated parameters, different batch shards) +- `fsdp`: parameter sharding (memory saver; often the biggest axis) +- `ep`: expert parallel (MoE; keep `1` for non-MoE models) +- `tp`: tensor parallel (splits large matmuls) +- `sp`: sequence parallel (splits sequence dimension) + +Use `sharding_axis_names=("dp","fsdp","ep","tp","sp")` and choose `sharding_axis_dims` so that their product equals your device count. +You can use `-1` in `sharding_axis_dims` to let EasyDeL infer the remaining dimension. + +
+Example sharding configs + +```python +# 8 devices, pure FSDP +sharding_axis_dims = (1, 8, 1, 1, 1) + +# 8 devices, 2-way DP x 4-way FSDP +sharding_axis_dims = (2, 4, 1, 1, 1) + +# 8 devices, 4-way FSDP x 2-way TP +sharding_axis_dims = (1, 4, 1, 2, 1) +``` +
+ +## Using via `eLargeModel` (ELM) + +`eLargeModel` is a higher-level interface that wires together loading, sharding, training, and eSurge inference from a single config. + +```python +from easydel import eLargeModel + +# NOTE: the path below is the local directory used during conversion; +# replace it with this checkpoint's Hub repo id or your own local checkpoint directory. +repo_id = "/dev/shm/conv/Qwen3.5-0.8B-Base" + +elm = eLargeModel.from_pretrained(repo_id) # task is auto-detected +elm.set_dtype("bf16") +elm.set_sharding(axis_names=("dp", "fsdp", "ep", "tp", "sp"), axis_dims=(1, -1, 1, 1, 1)) + +model = elm.build_model() +# Optional: build an inference engine +# engine = elm.build_esurge() +``` + +
+ELM YAML config example + +```yaml +model: + # NOTE: local conversion directory; replace with this checkpoint's Hub repo id or your own local path. + name_or_path: "/dev/shm/conv/Qwen3.5-0.8B-Base" + +loader: + dtype: bf16 + param_dtype: bf16 + +sharding: + axis_dims: [1, -1, 1, 1, 1] + auto_shard_model: true +``` +
+ +## Features + +**EasyDeL:** +- JAX native implementation and sharded execution +- Configurable attention backends via `AttentionMechanisms.*` +- Precision control via `dtype`, `param_dtype`, and `precision` + +## Installation + +```bash +pip install easydel +``` + +## Links + +- EasyDeL GitHub: https://github.com/erfanzar/EasyDeL +- Docs: https://easydel.readthedocs.io/en/latest/ + +## Supported Tasks + +- ImageTextToText + +## Limitations + +- Refer to the original model card for training data, evaluation, and intended use. + +## License + +EasyDeL is released under the Apache-2.0 license. The license for this model's weights may differ; please consult the original repository. + +## Citation + +```bibtex +@misc{Zare Chavoshi_2023, + title={EasyDeL: An open-source library for enhancing and streamlining the training process of machine learning models}, + url={https://github.com/erfanzar/EasyDeL}, + author={Zare Chavoshi, Erfan}, + year={2023} +} +``` diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..0ef09f214eaa6d9bca297988afc1454b5827b2c7 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,154 @@ +{%- set image_count = namespace(value=0) %} +{%- set video_count = namespace(value=0) %} +{%- macro render_content(content, do_vision_count, is_system_content=false) %} + {%- if content is string %} + {{- content }} + {%- elif content is iterable and content is not mapping %} + {%- for item in content %} + {%- if 'image' in item or 'image_url' in item or item.type == 'image' %} + {%- if is_system_content %} + {{- raise_exception('System message cannot contain images.') }} + {%- endif %} + {%- if do_vision_count %} + {%- set image_count.value = image_count.value + 1 %} + {%- endif %} + {%- if add_vision_id %} + {{- 'Picture ' ~ image_count.value ~ ': ' }} + {%- endif %} + {{- '<|vision_start|><|image_pad|><|vision_end|>' }} + {%- elif 'video' in item or item.type == 'video' %} + {%- if 
is_system_content %} + {{- raise_exception('System message cannot contain videos.') }} + {%- endif %} + {%- if do_vision_count %} + {%- set video_count.value = video_count.value + 1 %} + {%- endif %} + {%- if add_vision_id %} + {{- 'Video ' ~ video_count.value ~ ': ' }} + {%- endif %} + {{- '<|vision_start|><|video_pad|><|vision_end|>' }} + {%- elif 'text' in item %} + {{- item.text }} + {%- else %} + {{- raise_exception('Unexpected item type in content.') }} + {%- endif %} + {%- endfor %} + {%- elif content is none or content is undefined %} + {{- '' }} + {%- else %} + {{- raise_exception('Unexpected content type.') }} + {%- endif %} +{%- endmacro %} +{%- if not messages %} + {{- raise_exception('No messages provided.') }} +{%- endif %} +{%- if tools and tools is iterable and tools is not mapping %} + {{- '<|im_start|>system\n' }} + {{- "# Tools\n\nYou have access to the following functions:\n\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n" }} + {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' }} + {%- if messages[0].role == 'system' %} + {%- set content = render_content(messages[0].content, false, true)|trim %} + {%- if content %} + {{- '\n\n' + content }} + {%- endif %} + {%- endif %} + {{- '<|im_end|>\n' }} +{%- else %} + {%- if messages[0].role == 'system' %} + {%- set content = render_content(messages[0].content, false, true)|trim %} + {{- 
'<|im_start|>system\n' + content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" %} + {%- set content = render_content(message.content, false)|trim %} + {%- if not(content.startswith('') and content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if ns.multi_step_tool %} + {{- raise_exception('No user query found in messages.') }} +{%- endif %} +{%- for message in messages %} + {%- set content = render_content(message.content, true)|trim %} + {%- if message.role == "system" %} + {%- if not loop.first %} + {{- raise_exception('System message must be at the beginning.') }} + {%- endif %} + {%- elif message.role == "user" %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- set reasoning_content = reasoning_content|trim %} + {%- if loop.index0 > ns.last_query_index %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content + '\n\n\n' + content }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if loop.first 
%} + {%- if content|trim %} + {{- '\n\n\n\n' }} + {%- else %} + {{- '\n\n' }} + {%- endif %} + {%- else %} + {{- '\n\n\n' }} + {%- endif %} + {%- if tool_call.arguments is defined %} + {%- for args_name, args_value in tool_call.arguments|items %} + {{- '\n' }} + {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %} + {{- args_value }} + {{- '\n\n' }} + {%- endfor %} + {%- endif %} + {{- '\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.previtem and loop.previtem.role != "tool" %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if not loop.last and loop.nextitem.role != "tool" %} + {{- '<|im_end|>\n' }} + {%- elif loop.last %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- else %} + {{- raise_exception('Unexpected message role.') }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is true %} + {{- '\n' }} + {%- else %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e49425bb06b520c2b26ab2e9c3f5985822a14a44 --- /dev/null +++ b/config.json @@ -0,0 +1,473 @@ +{ + "_external_rope_config_kwargs": {}, + "add_cross_attention": false, + "architectures": [ + "Qwen3_5ForConditionalGeneration" + ], + "attn_mechanism": "vanilla", + "backend": null, + "bits": null, + "blocksize_b": 1, + "blocksize_k": 512, + "blocksize_q": 512, + "bos_token_id": null, + "cross_attention_hidden_size": null, + "decode_attn_mechanism": null, + "decoder_start_token_id": null, + "easy_method": "train", + "eos_token_id": null, + "fcm_max_ratio": 0.0, + "fcm_min_ratio": 0.0, + "flash_attention_backward_pass_impl": "triton", + "fsdp_is_ep_bound": true, + 
"gradient_checkpointing": "", + "gradient_checkpointing_targets": null, + "hardware_abstraction": false, + "image_token_id": 248056, + "is_decoder": false, + "kv_cache_quantization_config": null, + "kv_cache_sharding_sequence_axis_name": "sp", + "kvdtype": "bfloat16", + "lmhead_chunksize": null, + "max_position_embeddings": null, + "mla_attn_dtype": "bfloat16", + "mla_attn_mechanism": "auto", + "mla_attn_softmax_dtype": "float32", + "model_type": "qwen3_5", + "moe_force_xla_gmm": false, + "moe_method": "fused_moe", + "moe_tiling_size_batch": 4, + "moe_tiling_size_dim": 128, + "moe_tiling_size_seqlen": 128, + "operation_configs": null, + "pad_token_id": null, + "pallas_k_block_size": 128, + "pallas_m_block_size": 128, + "pallas_n_block_size": 128, + "partition_axis": { + "attention_dim_axis": null, + "attention_kv_dim_axis": null, + "batch_axis": [ + "fsdp", + "dp" + ], + "bias_head_sequence_axis": null, + "bias_key_sequence_axis": null, + "data_parallel_axis": "dp", + "decode_attention_dim_axis": null, + "decode_attention_kv_dim_axis": null, + "decode_batch_axis": [ + "fsdp", + "dp" + ], + "decode_head_axis": "tp", + "decode_key_sequence_axis": "sp", + "decode_kv_head_axis": "tp", + "decode_query_sequence_axis": null, + "expert_axis": "ep", + "expert_gate_axis": null, + "expert_parallel_axis": "ep", + "fully_sharded_data_parallel_axis": "fsdp", + "head_axis": "tp", + "hidden_state_axis": "tp", + "key_sequence_axis": "sp", + "kv_head_axis": "tp", + "mlp_intermediate_axis": "tp", + "query_sequence_axis": "sp", + "sequence_axis": "sp", + "sequence_parallel_axis": "sp", + "tensor_parallel_axis": "tp", + "vocab_axis": "tp" + }, + "platform": null, + "precompute_masks": true, + "pretraining_tp": 1, + "qmm_platform_override": null, + "qmm_tpu_path_override": null, + "quantization_config": null, + "scan_attention_layers": false, + "scan_mlp_chunk_size": 1024, + "scan_ring_attention": true, + "sep_token_id": null, + "sequence_axis_name": "sp", + "sharding_axis_dims": [ + 1, 
+ -1, + 1, + 1, + 1 + ], + "sharding_axis_names": [ + "dp", + "fsdp", + "ep", + "tp", + "sp" + ], + "sharding_dcn_axis_dims": null, + "sp_is_ep_bound": true, + "text_config": { + "_external_rope_config_kwargs": {}, + "add_cross_attention": false, + "architectures": [ + "Qwen3_5ForConditionalGeneration" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attn_dtype": "bfloat16", + "attn_mechanism": "vanilla", + "attn_output_gate": true, + "attn_softmax_dtype": "float32", + "backend": null, + "bits": null, + "blocksize_b": 1, + "blocksize_k": 512, + "blocksize_q": 512, + "bos_token_id": null, + "cross_attention_hidden_size": null, + "decode_attn_mechanism": null, + "decoder_sparse_step": 1, + "decoder_start_token_id": null, + "dtype": "bfloat16", + "easy_method": "train", + "eos_token_id": 248044, + "fcm_max_ratio": 0.0, + "fcm_min_ratio": 0.0, + "flash_attention_backward_pass_impl": "triton", + "fsdp_is_ep_bound": true, + "full_attention_interval": 4, + "gradient_checkpointing": "", + "gradient_checkpointing_targets": null, + "hardware_abstraction": false, + "head_dim": 256, + "hidden_act": "silu", + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 3584, + "is_decoder": false, + "kv_cache_quantization_config": null, + "kv_cache_sharding_sequence_axis_name": "sp", + "kvdtype": "bfloat16", + "layer_types": [ + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention" + ], + "linear_attention_separate_proj": true, + "linear_conv_kernel_dim": 4, + "linear_key_head_dim": 128, + 
"linear_num_key_heads": 16, + "linear_num_value_heads": 16, + "linear_value_head_dim": 128, + "lmhead_chunksize": null, + "mamba_ssm_dtype": "float32", + "max_position_embeddings": 262144, + "mla_attn_dtype": "bfloat16", + "mla_attn_mechanism": "auto", + "mla_attn_softmax_dtype": "float32", + "mlp_only_layers": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23 + ], + "model_type": "qwen3_5_text", + "moe_force_xla_gmm": false, + "moe_intermediate_size": 512, + "moe_method": "fused_moe", + "moe_tiling_size_batch": 4, + "moe_tiling_size_dim": 128, + "moe_tiling_size_seqlen": 128, + "mtp_num_hidden_layers": 1, + "mtp_use_dedicated_embeddings": false, + "norm_topk_prob": true, + "num_attention_heads": 8, + "num_experts": 256, + "num_experts_per_tok": 8, + "num_hidden_layers": 24, + "num_key_value_heads": 2, + "num_local_experts": 256, + "operation_configs": null, + "output_router_logits": false, + "pad_token_id": null, + "pallas_k_block_size": 128, + "pallas_m_block_size": 128, + "pallas_n_block_size": 128, + "partial_rotary_factor": 0.25, + "partition_axis": { + "attention_dim_axis": null, + "attention_kv_dim_axis": null, + "batch_axis": [ + "fsdp", + "dp" + ], + "bias_head_sequence_axis": null, + "bias_key_sequence_axis": null, + "data_parallel_axis": "dp", + "decode_attention_dim_axis": null, + "decode_attention_kv_dim_axis": null, + "decode_batch_axis": [ + "fsdp", + "dp" + ], + "decode_head_axis": "tp", + "decode_key_sequence_axis": "sp", + "decode_kv_head_axis": "tp", + "decode_query_sequence_axis": null, + "expert_axis": "ep", + "expert_gate_axis": null, + "expert_parallel_axis": "ep", + "fully_sharded_data_parallel_axis": "fsdp", + "head_axis": "tp", + "hidden_state_axis": "tp", + "key_sequence_axis": "sp", + "kv_head_axis": "tp", + "mlp_intermediate_axis": "tp", + "query_sequence_axis": "sp", + "sequence_axis": "sp", + "sequence_parallel_axis": "sp", + "tensor_parallel_axis": 
"tp", + "vocab_axis": "tp" + }, + "platform": null, + "precompute_masks": true, + "pretraining_tp": 1, + "qmm_platform_override": null, + "qmm_tpu_path_override": null, + "quantization_config": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "mrope_interleaved": true, + "mrope_section": [ + 11, + 11, + 10 + ], + "partial_rotary_factor": 0.25, + "rope_theta": 10000000, + "rope_type": "default", + "type": "default" + }, + "rope_theta": 10000000, + "router_aux_loss_coef": 0.001, + "scan_attention_layers": false, + "scan_mlp_chunk_size": 1024, + "scan_ring_attention": true, + "sep_token_id": null, + "sequence_axis_name": "sp", + "sharding_axis_dims": [ + 1, + -1, + 1, + 1, + 1 + ], + "sharding_axis_names": [ + "dp", + "fsdp", + "ep", + "tp", + "sp" + ], + "sharding_dcn_axis_dims": null, + "shared_expert_intermediate_size": 512, + "sp_is_ep_bound": true, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "use_cache": true, + "use_expert_tensor_mode": false, + "use_qmm_best_config": false, + "use_ring_of_experts": false, + "use_scan_mlp": false, + "use_sharded_kv_caching": false, + "use_sharding_constraint": false, + "vocab_size": 248320 + }, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "transformers_version": "5.5.0", + "use_expert_tensor_mode": false, + "use_qmm_best_config": false, + "use_ring_of_experts": false, + "use_scan_mlp": false, + "use_sharded_kv_caching": false, + "use_sharding_constraint": false, + "video_token_id": 248057, + "vision_config": { + "_external_rope_config_kwargs": {}, + "add_cross_attention": false, + "architectures": [ + "Qwen3_5ForConditionalGeneration" + ], + "attn_dtype": "bfloat16", + "attn_mechanism": "vanilla", + "attn_softmax_dtype": "float32", + "backend": null, + "bits": null, + "blocksize_b": 1, + "blocksize_k": 512, + "blocksize_q": 512, + "bos_token_id": null, + "cross_attention_hidden_size": null, + "decode_attn_mechanism": null, + "decoder_start_token_id": null, + 
"deepstack_visual_indexes": [], + "depth": 12, + "easy_method": "train", + "embed_dim": 768, + "eos_token_id": null, + "fcm_max_ratio": 0.0, + "fcm_min_ratio": 0.0, + "flash_attention_backward_pass_impl": "triton", + "fsdp_is_ep_bound": true, + "gradient_checkpointing": "", + "gradient_checkpointing_targets": null, + "hardware_abstraction": false, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 768, + "in_channels": 3, + "initializer_range": 0.02, + "intermediate_size": 3072, + "is_decoder": false, + "kv_cache_quantization_config": null, + "kv_cache_sharding_sequence_axis_name": "sp", + "kvdtype": "bfloat16", + "lmhead_chunksize": null, + "max_position_embeddings": null, + "mla_attn_dtype": "bfloat16", + "mla_attn_mechanism": "auto", + "mla_attn_softmax_dtype": "float32", + "model_type": "qwen3_5", + "moe_force_xla_gmm": false, + "moe_method": "fused_moe", + "moe_tiling_size_batch": 4, + "moe_tiling_size_dim": 128, + "moe_tiling_size_seqlen": 128, + "num_attention_heads": 12, + "num_heads": 12, + "num_position_embeddings": 2304, + "operation_configs": null, + "out_hidden_size": 1024, + "pad_token_id": null, + "pallas_k_block_size": 128, + "pallas_m_block_size": 128, + "pallas_n_block_size": 128, + "partition_axis": { + "attention_dim_axis": null, + "attention_kv_dim_axis": null, + "batch_axis": [ + "fsdp", + "dp" + ], + "bias_head_sequence_axis": null, + "bias_key_sequence_axis": null, + "data_parallel_axis": "dp", + "decode_attention_dim_axis": null, + "decode_attention_kv_dim_axis": null, + "decode_batch_axis": [ + "fsdp", + "dp" + ], + "decode_head_axis": "tp", + "decode_key_sequence_axis": "sp", + "decode_kv_head_axis": "tp", + "decode_query_sequence_axis": null, + "expert_axis": "ep", + "expert_gate_axis": null, + "expert_parallel_axis": "ep", + "fully_sharded_data_parallel_axis": "fsdp", + "head_axis": "tp", + "hidden_state_axis": "tp", + "key_sequence_axis": "sp", + "kv_head_axis": "tp", + "mlp_intermediate_axis": "tp", + "query_sequence_axis": "sp", + 
"sequence_axis": "sp", + "sequence_parallel_axis": "sp", + "tensor_parallel_axis": "tp", + "vocab_axis": "tp" + }, + "patch_size": 16, + "platform": null, + "precompute_masks": true, + "pretraining_tp": 1, + "qmm_platform_override": null, + "qmm_tpu_path_override": null, + "quantization_config": null, + "scan_attention_layers": false, + "scan_mlp_chunk_size": 1024, + "scan_ring_attention": true, + "sep_token_id": null, + "sequence_axis_name": "sp", + "sharding_axis_dims": [ + 1, + -1, + 1, + 1, + 1 + ], + "sharding_axis_names": [ + "dp", + "fsdp", + "ep", + "tp", + "sp" + ], + "sharding_dcn_axis_dims": null, + "sp_is_ep_bound": true, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokens_per_second": 2.0, + "use_expert_tensor_mode": false, + "use_qmm_best_config": false, + "use_ring_of_experts": false, + "use_scan_mlp": false, + "use_sharded_kv_caching": false, + "use_sharding_constraint": false + }, + "vision_end_token_id": 248054, + "vision_start_token_id": 248053 +} diff --git a/model/model/language_model/embed_tokens/embedding/.zarray b/model/model/language_model/embed_tokens/embedding/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..711f2529b21c2f4435446397dce15c51ebbdd3d3 --- /dev/null +++ b/model/model/language_model/embed_tokens/embedding/.zarray @@ -0,0 +1 @@ +{"chunks":[62080,1024],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[248320,1024],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/input_layernorm/kernel/.zarray b/model/model/language_model/layers/0/input_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..0566d8c6344b2bc36bc482d2262468c7a92e0582 --- /dev/null +++ b/model/model/language_model/layers/0/input_layernorm/kernel/.zarray @@ -0,0 +1 @@ 
+{"chunks":[1024],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/input_layernorm/kernel/0 b/model/model/language_model/layers/0/input_layernorm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..433d6181d9f790c85a53eedcefc8516a0d2225b6 Binary files /dev/null and b/model/model/language_model/layers/0/input_layernorm/kernel/0 differ diff --git a/model/model/language_model/layers/0/linear_attn/A_log/.zarray b/model/model/language_model/layers/0/linear_attn/A_log/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1 --- /dev/null +++ b/model/model/language_model/layers/0/linear_attn/A_log/.zarray @@ -0,0 +1 @@ +{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/linear_attn/A_log/0 b/model/model/language_model/layers/0/linear_attn/A_log/0 new file mode 100644 index 0000000000000000000000000000000000000000..a320d85b6af3b571a6fcc30a66efbf74120750c6 Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/A_log/0 differ diff --git a/model/model/language_model/layers/0/linear_attn/conv1d/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/conv1d/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..5f4f6d0e28041fb4b10478ee77629459f48d3285 --- /dev/null +++ b/model/model/language_model/layers/0/linear_attn/conv1d/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[4,1,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4,1,6144],"zarr_format":2} \ No newline at end 
of file diff --git a/model/model/language_model/layers/0/linear_attn/conv1d/kernel/0.0.0 b/model/model/language_model/layers/0/linear_attn/conv1d/kernel/0.0.0 new file mode 100644 index 0000000000000000000000000000000000000000..552ce665d7c76ffc220dbd50227061c0d1edebd5 Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/conv1d/kernel/0.0.0 differ diff --git a/model/model/language_model/layers/0/linear_attn/dt_bias/.zarray b/model/model/language_model/layers/0/linear_attn/dt_bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1 --- /dev/null +++ b/model/model/language_model/layers/0/linear_attn/dt_bias/.zarray @@ -0,0 +1 @@ +{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/linear_attn/dt_bias/0 b/model/model/language_model/layers/0/linear_attn/dt_bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..e97bd1f79084ae9651a61e5a488962e8557c2b56 Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/dt_bias/0 differ diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..de96511727cb0375605dff9d883760adf580d921 --- /dev/null +++ b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[256,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/0.0 
b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/0.0 new file mode 100644 index 0000000000000000000000000000000000000000..d77c3881da82245021b9d28ad9d51ffe1481172b Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/0.0 differ diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/1.0 b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/1.0 new file mode 100644 index 0000000000000000000000000000000000000000..d8bc18181b2dc180b1bc9aa1b36333eef98ff156 Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/1.0 differ diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/2.0 b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/2.0 new file mode 100644 index 0000000000000000000000000000000000000000..409b2d2b99fa367cf891fcec6e4068b21a52ceb7 Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/2.0 differ diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/1.0 b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/1.0 new file mode 100644 index 0000000000000000000000000000000000000000..fef45a62ae9383aeac8d59b216505e621e95cfe1 Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/1.0 differ diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/2.0 b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/2.0 new file mode 100644 index 0000000000000000000000000000000000000000..d66de4b1412fc4aedef4e5d611ed5ce24a14e6b9 Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/2.0 differ diff --git a/model/model/language_model/layers/19/input_layernorm/kernel/0 b/model/model/language_model/layers/19/input_layernorm/kernel/0 new file mode 100644 index 
0000000000000000000000000000000000000000..f106053ccb5f8f402181121be5a3b3c20f086caf Binary files /dev/null and b/model/model/language_model/layers/19/input_layernorm/kernel/0 differ diff --git a/model/model/language_model/layers/19/post_attention_layernorm/kernel/.zarray b/model/model/language_model/layers/19/post_attention_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..0566d8c6344b2bc36bc482d2262468c7a92e0582 --- /dev/null +++ b/model/model/language_model/layers/19/post_attention_layernorm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[1024],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/19/self_attn/k_proj/kernel/.zarray b/model/model/language_model/layers/19/self_attn/k_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..36234540fe1b82a927003cd0f5ed4bcb3039a8d4 --- /dev/null +++ b/model/model/language_model/layers/19/self_attn/k_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[256,512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,512],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/19/self_attn/q_norm/kernel/.zarray b/model/model/language_model/layers/19/self_attn/q_norm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..653c877efa83da0ab518995885715f7e8b4275d7 --- /dev/null +++ b/model/model/language_model/layers/19/self_attn/q_norm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[256],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[256],"zarr_format":2} \ No newline at end of file diff --git 
a/model/model/language_model/layers/19/self_attn/v_proj/kernel/.zarray b/model/model/language_model/layers/19/self_attn/v_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..36234540fe1b82a927003cd0f5ed4bcb3039a8d4 --- /dev/null +++ b/model/model/language_model/layers/19/self_attn/v_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[256,512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,512],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/2/input_layernorm/kernel/.zarray b/model/model/language_model/layers/2/input_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..0566d8c6344b2bc36bc482d2262468c7a92e0582 --- /dev/null +++ b/model/model/language_model/layers/2/input_layernorm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[1024],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/2/input_layernorm/kernel/0 b/model/model/language_model/layers/2/input_layernorm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..706e73f7cd04a53f863cad02a1d3c4b11dcf2396 Binary files /dev/null and b/model/model/language_model/layers/2/input_layernorm/kernel/0 differ diff --git a/model/model/language_model/layers/2/linear_attn/conv1d/kernel/.zarray b/model/model/language_model/layers/2/linear_attn/conv1d/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..5f4f6d0e28041fb4b10478ee77629459f48d3285 --- /dev/null +++ b/model/model/language_model/layers/2/linear_attn/conv1d/kernel/.zarray @@ -0,0 +1 @@ 
+{"chunks":[4,1,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4,1,6144],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/2/linear_attn/dt_bias/0 b/model/model/language_model/layers/2/linear_attn/dt_bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..89aa4dd1f989686d55cfcea012bdd56f6572275b Binary files /dev/null and b/model/model/language_model/layers/2/linear_attn/dt_bias/0 differ diff --git a/model/model/language_model/layers/2/linear_attn/in_proj_a/kernel/3.0 b/model/model/language_model/layers/2/linear_attn/in_proj_a/kernel/3.0 new file mode 100644 index 0000000000000000000000000000000000000000..b2f9c840422ff637b6020ea8f2567216a835349b Binary files /dev/null and b/model/model/language_model/layers/2/linear_attn/in_proj_a/kernel/3.0 differ diff --git a/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/.zarray b/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..de96511727cb0375605dff9d883760adf580d921 --- /dev/null +++ b/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[256,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/0.0 b/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/0.0 new file mode 100644 index 0000000000000000000000000000000000000000..9828c26e66281c0b1830768247e7bc9d7070f143 Binary files /dev/null and b/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/0.0 differ diff --git a/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/1.0 
b/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/1.0 new file mode 100644 index 0000000000000000000000000000000000000000..ce2d208830a419b1f71c4b49d9e09d3dc95f1ed2 Binary files /dev/null and b/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/1.0 differ diff --git a/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/3.0 b/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/3.0 new file mode 100644 index 0000000000000000000000000000000000000000..f2e05f805579915dd3ae5116a8d02b84aefb706e Binary files /dev/null and b/model/model/language_model/layers/2/linear_attn/in_proj_b/kernel/3.0 differ diff --git a/model/model/language_model/layers/2/linear_attn/in_proj_z/kernel/.zarray b/model/model/language_model/layers/2/linear_attn/in_proj_z/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..c1d4bca0cadb8de14bbcf4269f75130fae4b5e12 --- /dev/null +++ b/model/model/language_model/layers/2/linear_attn/in_proj_z/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[256,2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,2048],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/2/linear_attn/norm/kernel/0 b/model/model/language_model/layers/2/linear_attn/norm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..9ba5a3510daf31c0fa8ea3cc04e22a03ac74a99c Binary files /dev/null and b/model/model/language_model/layers/2/linear_attn/norm/kernel/0 differ diff --git a/model/model/language_model/layers/2/mlp/down_proj/kernel/.zarray b/model/model/language_model/layers/2/mlp/down_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..d430451028a2753e4ce16636cfe899c607fae561 --- /dev/null +++ b/model/model/language_model/layers/2/mlp/down_proj/kernel/.zarray @@ -0,0 +1 @@ 
+{"chunks":[3584,256],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[3584,1024],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/2/mlp/up_proj/kernel/.zarray b/model/model/language_model/layers/2/mlp/up_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..9069ead8bf11c7bc932be9123c95ddc3a4360c93 --- /dev/null +++ b/model/model/language_model/layers/2/mlp/up_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[256,3584],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,3584],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/2/post_attention_layernorm/kernel/.zarray b/model/model/language_model/layers/2/post_attention_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..0566d8c6344b2bc36bc482d2262468c7a92e0582 --- /dev/null +++ b/model/model/language_model/layers/2/post_attention_layernorm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[1024],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/2/post_attention_layernorm/kernel/0 b/model/model/language_model/layers/2/post_attention_layernorm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..7459699fd571d825095ee14203347344d4786612 Binary files /dev/null and b/model/model/language_model/layers/2/post_attention_layernorm/kernel/0 differ diff --git a/model/model/language_model/layers/20/input_layernorm/kernel/.zarray b/model/model/language_model/layers/20/input_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..0566d8c6344b2bc36bc482d2262468c7a92e0582 
--- /dev/null +++ b/model/model/language_model/layers/20/input_layernorm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[1024],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/20/input_layernorm/kernel/0 b/model/model/language_model/layers/20/input_layernorm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..54f289c5de162088278c7916ba79db0df7e65495 Binary files /dev/null and b/model/model/language_model/layers/20/input_layernorm/kernel/0 differ diff --git a/model/model/language_model/layers/20/linear_attn/A_log/.zarray b/model/model/language_model/layers/20/linear_attn/A_log/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1 --- /dev/null +++ b/model/model/language_model/layers/20/linear_attn/A_log/.zarray @@ -0,0 +1 @@ +{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/20/linear_attn/A_log/0 b/model/model/language_model/layers/20/linear_attn/A_log/0 new file mode 100644 index 0000000000000000000000000000000000000000..99faa4e9472cda2e968bf6ba1ec188f581dc08f7 Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/A_log/0 differ diff --git a/model/model/language_model/layers/20/linear_attn/conv1d/kernel/.zarray b/model/model/language_model/layers/20/linear_attn/conv1d/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..5f4f6d0e28041fb4b10478ee77629459f48d3285 --- /dev/null +++ b/model/model/language_model/layers/20/linear_attn/conv1d/kernel/.zarray @@ -0,0 +1 @@ 
+{"chunks":[4,1,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4,1,6144],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/20/linear_attn/conv1d/kernel/0.0.0 b/model/model/language_model/layers/20/linear_attn/conv1d/kernel/0.0.0 new file mode 100644 index 0000000000000000000000000000000000000000..4bed7ee37bf1295110266286321ccdddcee4e8d3 Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/conv1d/kernel/0.0.0 differ diff --git a/model/model/language_model/layers/20/linear_attn/dt_bias/.zarray b/model/model/language_model/layers/20/linear_attn/dt_bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1 --- /dev/null +++ b/model/model/language_model/layers/20/linear_attn/dt_bias/.zarray @@ -0,0 +1 @@ +{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/20/linear_attn/dt_bias/0 b/model/model/language_model/layers/20/linear_attn/dt_bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..f3f9e9f0f3a46a90509e5abc5ee804483637a474 Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/dt_bias/0 differ diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/.zarray b/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..de96511727cb0375605dff9d883760adf580d921 --- /dev/null +++ b/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/.zarray @@ -0,0 +1 @@ 
+{"chunks":[256,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/1.0 b/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/1.0 new file mode 100644 index 0000000000000000000000000000000000000000..3347a88cf0dde541152c6582c88acc68efa32e78 Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/1.0 differ diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/2.0 b/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/2.0 new file mode 100644 index 0000000000000000000000000000000000000000..3a0dbaf17b87c12d92e1ba7dd136bfd2b4d1634c Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/2.0 differ diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/3.0 b/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/3.0 new file mode 100644 index 0000000000000000000000000000000000000000..32cacdb1dfe49859bf5d473401415ed804d38978 Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/in_proj_a/kernel/3.0 differ diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/.zarray b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..de96511727cb0375605dff9d883760adf580d921 --- /dev/null +++ b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[256,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/0.0 
b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/0.0 new file mode 100644 index 0000000000000000000000000000000000000000..23bf6bf9d04f958801acbd12e2a81b458f0f7f11 Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/0.0 differ diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/1.0 b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/1.0 new file mode 100644 index 0000000000000000000000000000000000000000..2663b90fda7f413a2f30b95a6a0bb2dee1911070 Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/1.0 differ diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/2.0 b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/2.0 new file mode 100644 index 0000000000000000000000000000000000000000..c55314003eee271ab08f6cb3ff1d973e9f75f2c6 Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/2.0 differ diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/3.0 b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/3.0 new file mode 100644 index 0000000000000000000000000000000000000000..8b0999cb98b47282b2e81c6e36bbdfffd77e8e38 Binary files /dev/null and b/model/model/language_model/layers/20/linear_attn/in_proj_b/kernel/3.0 differ diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_qkv/kernel/.zarray b/model/model/language_model/layers/20/linear_attn/in_proj_qkv/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..c7a1625024939060efef29beeead903e4170d0ac --- /dev/null +++ b/model/model/language_model/layers/20/linear_attn/in_proj_qkv/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[256,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,6144],"zarr_format":2} \ No newline at end of 
file diff --git a/model/model/language_model/layers/20/linear_attn/in_proj_z/kernel/.zarray b/model/model/language_model/layers/20/linear_attn/in_proj_z/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..c1d4bca0cadb8de14bbcf4269f75130fae4b5e12 --- /dev/null +++ b/model/model/language_model/layers/20/linear_attn/in_proj_z/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[256,2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,2048],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/20/linear_attn/norm/kernel/.zarray b/model/model/language_model/layers/20/linear_attn/norm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..3eeba2a45304285824238b9edd4f261c3d5d6f01 --- /dev/null +++ b/model/model/language_model/layers/20/linear_attn/norm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[128],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/20/mlp/gate_proj/kernel/.zarray b/model/model/language_model/layers/20/mlp/gate_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..9069ead8bf11c7bc932be9123c95ddc3a4360c93 --- /dev/null +++ b/model/model/language_model/layers/20/mlp/gate_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[256,3584],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,3584],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/20/mlp/up_proj/kernel/.zarray b/model/model/language_model/layers/20/mlp/up_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..9069ead8bf11c7bc932be9123c95ddc3a4360c93 --- 
/dev/null +++ b/model/model/language_model/layers/20/mlp/up_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[256,3584],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,3584],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/20/post_attention_layernorm/kernel/0 b/model/model/language_model/layers/20/post_attention_layernorm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..794cf8e9ff80ad5f6f7ef59dd4f05007639d4ca1 Binary files /dev/null and b/model/model/language_model/layers/20/post_attention_layernorm/kernel/0 differ diff --git a/model/model/language_model/layers/21/input_layernorm/kernel/.zarray b/model/model/language_model/layers/21/input_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..0566d8c6344b2bc36bc482d2262468c7a92e0582 --- /dev/null +++ b/model/model/language_model/layers/21/input_layernorm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[1024],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/21/linear_attn/A_log/0 b/model/model/language_model/layers/21/linear_attn/A_log/0 new file mode 100644 index 0000000000000000000000000000000000000000..1d2dd8b7e783676a9eb45793a85b8bda06467e3d Binary files /dev/null and b/model/model/language_model/layers/21/linear_attn/A_log/0 differ diff --git a/model/model/language_model/layers/21/linear_attn/conv1d/kernel/0.0.0 b/model/model/language_model/layers/21/linear_attn/conv1d/kernel/0.0.0 new file mode 100644 index 0000000000000000000000000000000000000000..4a74fcb15801900be3813fc67dbed01709b6f2d1 Binary files /dev/null and b/model/model/language_model/layers/21/linear_attn/conv1d/kernel/0.0.0 differ diff --git 
a/model/model/language_model/layers/21/linear_attn/dt_bias/.zarray b/model/model/language_model/layers/21/linear_attn/dt_bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1 --- /dev/null +++ b/model/model/language_model/layers/21/linear_attn/dt_bias/.zarray @@ -0,0 +1 @@ +{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/21/linear_attn/in_proj_a/kernel/2.0 b/model/model/language_model/layers/21/linear_attn/in_proj_a/kernel/2.0 new file mode 100644 index 0000000000000000000000000000000000000000..5b36624557647bc367a14b53030799421517ab59 Binary files /dev/null and b/model/model/language_model/layers/21/linear_attn/in_proj_a/kernel/2.0 differ diff --git a/model/model/language_model/layers/21/linear_attn/in_proj_b/kernel/.zarray b/model/model/language_model/layers/21/linear_attn/in_proj_b/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..de96511727cb0375605dff9d883760adf580d921 --- /dev/null +++ b/model/model/language_model/layers/21/linear_attn/in_proj_b/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[256,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024,16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/21/linear_attn/in_proj_b/kernel/3.0 b/model/model/language_model/layers/21/linear_attn/in_proj_b/kernel/3.0 new file mode 100644 index 0000000000000000000000000000000000000000..5ada231fe525597887531333f531a01f20e6a64b Binary files /dev/null and b/model/model/language_model/layers/21/linear_attn/in_proj_b/kernel/3.0 differ diff --git a/model/model/language_model/layers/21/linear_attn/norm/kernel/0 
b/model/model/language_model/layers/21/linear_attn/norm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..f862a97770995da7e84c9a6744106d1f91fb72fc Binary files /dev/null and b/model/model/language_model/layers/21/linear_attn/norm/kernel/0 differ diff --git a/model/model/language_model/layers/21/mlp/down_proj/kernel/.zarray b/model/model/language_model/layers/21/mlp/down_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..d430451028a2753e4ce16636cfe899c607fae561 --- /dev/null +++ b/model/model/language_model/layers/21/mlp/down_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[3584,256],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[3584,1024],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/22/input_layernorm/kernel/.zarray b/model/model/language_model/layers/22/input_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..0566d8c6344b2bc36bc482d2262468c7a92e0582 --- /dev/null +++ b/model/model/language_model/layers/22/input_layernorm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[1024],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[1024],"zarr_format":2} \ No newline at end of file diff --git a/model/model/visual/blocks/9/norm2/bias/0 b/model/model/visual/blocks/9/norm2/bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..df23544af2c54751cd9c5dc44910c9b463296052 Binary files /dev/null and b/model/model/visual/blocks/9/norm2/bias/0 differ diff --git a/model/model/visual/merger/norm/bias/.zarray b/model/model/visual/merger/norm/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..108bf2997962926e388231bffd83dd858a8cfa34 --- /dev/null +++ b/model/model/visual/merger/norm/bias/.zarray @@ -0,0 +1 @@ 
+{"chunks":[768],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[768],"zarr_format":2} \ No newline at end of file diff --git a/tensorstore_index.json b/tensorstore_index.json new file mode 100644 index 0000000000000000000000000000000000000000..7947e94b4d19eb03fe34d93f91596c733d274a2a --- /dev/null +++ b/tensorstore_index.json @@ -0,0 +1,3597 @@ +{ + "format": "tensorstore", + "version": "easydel", + "prefixes": { + "model": [ + { + "path": "model/model/language_model/embed_tokens/embedding", + "shape": [ + 248320, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/in_proj_z/kernel", + "shape": [ + 1024, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { 
+ "path": "model/model/language_model/layers/0/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/in_proj_z/kernel", + "shape": [ + 1024, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": 
"bfloat16" + }, + { + "path": "model/model/language_model/layers/1/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/in_proj_z/kernel", + "shape": [ + 1024, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/mlp/gate_proj/kernel", + "shape": [ 
+ 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/self_attn/k_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/self_attn/k_proj/kernel", + "shape": [ + 1024, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/self_attn/o_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/self_attn/q_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/self_attn/q_proj/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/self_attn/v_proj/kernel", + "shape": [ + 1024, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/A_log", + 
"shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/in_proj_z/kernel", + "shape": [ + 1024, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": 
"model/model/language_model/layers/13/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/linear_attn/in_proj_z/kernel", + "shape": [ + 1024, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], 
+ "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/in_proj_z/kernel", + "shape": [ + 1024, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": 
"model/model/language_model/layers/15/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/self_attn/k_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/self_attn/k_proj/kernel", + "shape": [ + 1024, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/self_attn/o_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/self_attn/q_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/self_attn/q_proj/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/self_attn/v_proj/kernel", + "shape": [ + 1024, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": 
"bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/in_proj_z/kernel", + "shape": [ + 1024, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/in_proj_z/kernel", + 
"shape": [ + 1024, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/linear_attn/in_proj_z/kernel", + "shape": [ + 1024, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": 
"model/model/language_model/layers/18/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/self_attn/k_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/self_attn/k_proj/kernel", + "shape": [ + 1024, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/self_attn/o_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/self_attn/q_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + 
}, + { + "path": "model/model/language_model/layers/19/self_attn/q_proj/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/self_attn/v_proj/kernel", + "shape": [ + 1024, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/in_proj_z/kernel", + "shape": [ + 1024, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], 
+ "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/in_proj_z/kernel", + "shape": [ + 1024, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": 
"model/model/language_model/layers/20/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/in_proj_z/kernel", + "shape": [ + 1024, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + 
"dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/in_proj_z/kernel", + "shape": [ + 1024, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": 
"model/model/language_model/layers/23/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/self_attn/k_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/self_attn/k_proj/kernel", + "shape": [ + 1024, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/self_attn/o_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/self_attn/q_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/self_attn/q_proj/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/self_attn/v_proj/kernel", + "shape": [ + 1024, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + 
"path": "model/model/language_model/layers/3/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/self_attn/k_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/self_attn/k_proj/kernel", + "shape": [ + 1024, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/self_attn/o_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/self_attn/q_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/self_attn/q_proj/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/self_attn/v_proj/kernel", + "shape": [ + 1024, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/in_proj_z/kernel", + "shape": [ + 1024, + 2048 + ], + "dtype": 
"bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/in_proj_z/kernel", + "shape": [ + 1024, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/norm/kernel", + "shape": [ + 128 + 
], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/in_proj_z/kernel", + "shape": [ + 1024, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/out_proj/kernel", + 
"shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/self_attn/k_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/self_attn/k_proj/kernel", + "shape": [ + 1024, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/self_attn/o_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/self_attn/q_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/self_attn/q_proj/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": 
"model/model/language_model/layers/7/self_attn/v_proj/kernel", + "shape": [ + 1024, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/in_proj_z/kernel", + "shape": [ + 1024, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": 
"bfloat16" + }, + { + "path": "model/model/language_model/layers/9/input_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/in_proj_a/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/in_proj_b/kernel", + "shape": [ + 1024, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/in_proj_qkv/kernel", + "shape": [ + 1024, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/in_proj_z/kernel", + "shape": [ + 1024, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/mlp/down_proj/kernel", + "shape": [ + 3584, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/mlp/gate_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/mlp/up_proj/kernel", + "shape": [ + 1024, + 3584 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/post_attention_layernorm/kernel", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/norm/kernel", + "shape": [ + 1024 + ], + "dtype": 
"bfloat16" + }, + { + "path": "model/model/visual/blocks/0/attn/proj/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/attn/proj/kernel", + "shape": [ + 768, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/attn/qkv/bias", + "shape": [ + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/attn/qkv/kernel", + "shape": [ + 768, + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/mlp/linear_fc1/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/mlp/linear_fc1/kernel", + "shape": [ + 768, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/mlp/linear_fc2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/mlp/linear_fc2/kernel", + "shape": [ + 3072, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/norm1/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/norm1/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/norm2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/norm2/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/attn/proj/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/attn/proj/kernel", + "shape": [ + 768, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/attn/qkv/bias", + "shape": [ + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/attn/qkv/kernel", + "shape": [ + 768, + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/mlp/linear_fc1/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": 
"model/model/visual/blocks/1/mlp/linear_fc1/kernel", + "shape": [ + 768, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/mlp/linear_fc2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/mlp/linear_fc2/kernel", + "shape": [ + 3072, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/norm1/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/norm1/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/norm2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/norm2/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/attn/proj/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/attn/proj/kernel", + "shape": [ + 768, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/attn/qkv/bias", + "shape": [ + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/attn/qkv/kernel", + "shape": [ + 768, + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/mlp/linear_fc1/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/mlp/linear_fc1/kernel", + "shape": [ + 768, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/mlp/linear_fc2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/mlp/linear_fc2/kernel", + "shape": [ + 3072, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/norm1/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/norm1/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": 
"model/model/visual/blocks/10/norm2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/norm2/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/attn/proj/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/attn/proj/kernel", + "shape": [ + 768, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/attn/qkv/bias", + "shape": [ + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/attn/qkv/kernel", + "shape": [ + 768, + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/mlp/linear_fc1/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/mlp/linear_fc1/kernel", + "shape": [ + 768, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/mlp/linear_fc2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/mlp/linear_fc2/kernel", + "shape": [ + 3072, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/norm1/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/norm1/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/norm2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/norm2/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/attn/proj/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/attn/proj/kernel", + "shape": [ + 768, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/attn/qkv/bias", + "shape": [ + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/attn/qkv/kernel", + 
"shape": [ + 768, + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/mlp/linear_fc1/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/mlp/linear_fc1/kernel", + "shape": [ + 768, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/mlp/linear_fc2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/mlp/linear_fc2/kernel", + "shape": [ + 3072, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/norm1/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/norm1/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/norm2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/norm2/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/attn/proj/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/attn/proj/kernel", + "shape": [ + 768, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/attn/qkv/bias", + "shape": [ + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/attn/qkv/kernel", + "shape": [ + 768, + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/mlp/linear_fc1/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/mlp/linear_fc1/kernel", + "shape": [ + 768, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/mlp/linear_fc2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/mlp/linear_fc2/kernel", + "shape": [ + 3072, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/norm1/bias", + "shape": [ + 768 + ], 
+ "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/norm1/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/norm2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/norm2/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/attn/proj/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/attn/proj/kernel", + "shape": [ + 768, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/attn/qkv/bias", + "shape": [ + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/attn/qkv/kernel", + "shape": [ + 768, + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/mlp/linear_fc1/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/mlp/linear_fc1/kernel", + "shape": [ + 768, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/mlp/linear_fc2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/mlp/linear_fc2/kernel", + "shape": [ + 3072, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/norm1/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/norm1/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/norm2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/norm2/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/attn/proj/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/attn/proj/kernel", + "shape": [ + 768, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": 
"model/model/visual/blocks/5/attn/qkv/bias", + "shape": [ + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/attn/qkv/kernel", + "shape": [ + 768, + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/mlp/linear_fc1/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/mlp/linear_fc1/kernel", + "shape": [ + 768, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/mlp/linear_fc2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/mlp/linear_fc2/kernel", + "shape": [ + 3072, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/norm1/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/norm1/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/norm2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/norm2/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/attn/proj/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/attn/proj/kernel", + "shape": [ + 768, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/attn/qkv/bias", + "shape": [ + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/attn/qkv/kernel", + "shape": [ + 768, + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/mlp/linear_fc1/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/mlp/linear_fc1/kernel", + "shape": [ + 768, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/mlp/linear_fc2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": 
"model/model/visual/blocks/6/mlp/linear_fc2/kernel", + "shape": [ + 3072, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/norm1/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/norm1/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/norm2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/norm2/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/attn/proj/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/attn/proj/kernel", + "shape": [ + 768, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/attn/qkv/bias", + "shape": [ + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/attn/qkv/kernel", + "shape": [ + 768, + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/mlp/linear_fc1/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/mlp/linear_fc1/kernel", + "shape": [ + 768, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/mlp/linear_fc2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/mlp/linear_fc2/kernel", + "shape": [ + 3072, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/norm1/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/norm1/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/norm2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/norm2/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/attn/proj/bias", + "shape": [ + 768 + 
], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/attn/proj/kernel", + "shape": [ + 768, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/attn/qkv/bias", + "shape": [ + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/attn/qkv/kernel", + "shape": [ + 768, + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/mlp/linear_fc1/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/mlp/linear_fc1/kernel", + "shape": [ + 768, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/mlp/linear_fc2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/mlp/linear_fc2/kernel", + "shape": [ + 3072, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/norm1/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/norm1/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/norm2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/norm2/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/attn/proj/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/attn/proj/kernel", + "shape": [ + 768, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/attn/qkv/bias", + "shape": [ + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/attn/qkv/kernel", + "shape": [ + 768, + 2304 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/mlp/linear_fc1/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/mlp/linear_fc1/kernel", + "shape": [ + 768, + 3072 + ], + "dtype": "bfloat16" + 
}, + { + "path": "model/model/visual/blocks/9/mlp/linear_fc2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/mlp/linear_fc2/kernel", + "shape": [ + 3072, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/norm1/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/norm1/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/norm2/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/norm2/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/merger/linear_fc1/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/merger/linear_fc1/kernel", + "shape": [ + 3072, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/merger/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/merger/linear_fc2/kernel", + "shape": [ + 3072, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/merger/norm/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/merger/norm/scale", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/patch_embed/proj/bias", + "shape": [ + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/patch_embed/proj/kernel", + "shape": [ + 2, + 16, + 16, + 3, + 768 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/pos_embed/embedding", + "shape": [ + 2304, + 768 + ], + "dtype": "bfloat16" + } + ] + } +} \ No newline at end of file diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b4a37b2a6fd3ab3317cd7bac72855be1a843b2bb --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,31 @@ +{ + "add_prefix_space": 
false, + "audio_bos_token": "<|audio_start|>", + "audio_eos_token": "<|audio_end|>", + "audio_token": "<|audio_pad|>", + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "image_token": "<|image_pad|>", + "is_local": false, + "model_max_length": 262144, + "model_specific_special_tokens": { + "audio_bos_token": "<|audio_start|>", + "audio_eos_token": "<|audio_end|>", + "audio_token": "<|audio_pad|>", + "image_token": "<|image_pad|>", + "video_token": "<|video_pad|>", + "vision_bos_token": "<|vision_start|>", + "vision_eos_token": "<|vision_end|>" + }, + "pad_token": "<|endoftext|>", + "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + "split_special_tokens": false, + "tokenizer_class": "TokenizersBackend", + "unk_token": null, + "video_token": "<|video_pad|>", + "vision_bos_token": "<|vision_start|>", + "vision_eos_token": "<|vision_end|>" +}