diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..0ef09f214eaa6d9bca297988afc1454b5827b2c7 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,154 @@ +{%- set image_count = namespace(value=0) %} +{%- set video_count = namespace(value=0) %} +{%- macro render_content(content, do_vision_count, is_system_content=false) %} + {%- if content is string %} + {{- content }} + {%- elif content is iterable and content is not mapping %} + {%- for item in content %} + {%- if 'image' in item or 'image_url' in item or item.type == 'image' %} + {%- if is_system_content %} + {{- raise_exception('System message cannot contain images.') }} + {%- endif %} + {%- if do_vision_count %} + {%- set image_count.value = image_count.value + 1 %} + {%- endif %} + {%- if add_vision_id %} + {{- 'Picture ' ~ image_count.value ~ ': ' }} + {%- endif %} + {{- '<|vision_start|><|image_pad|><|vision_end|>' }} + {%- elif 'video' in item or item.type == 'video' %} + {%- if is_system_content %} + {{- raise_exception('System message cannot contain videos.') }} + {%- endif %} + {%- if do_vision_count %} + {%- set video_count.value = video_count.value + 1 %} + {%- endif %} + {%- if add_vision_id %} + {{- 'Video ' ~ video_count.value ~ ': ' }} + {%- endif %} + {{- '<|vision_start|><|video_pad|><|vision_end|>' }} + {%- elif 'text' in item %} + {{- item.text }} + {%- else %} + {{- raise_exception('Unexpected item type in content.') }} + {%- endif %} + {%- endfor %} + {%- elif content is none or content is undefined %} + {{- '' }} + {%- else %} + {{- raise_exception('Unexpected content type.') }} + {%- endif %} +{%- endmacro %} +{%- if not messages %} + {{- raise_exception('No messages provided.') }} +{%- endif %} +{%- if tools and tools is iterable and tools is not mapping %} + {{- '<|im_start|>system\n' }} + {{- "# Tools\n\nYou have access to the following functions:\n\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n" }} + {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' }} + {%- if messages[0].role == 'system' %} + {%- set content = render_content(messages[0].content, false, true)|trim %} + {%- if content %} + {{- '\n\n' + content }} + {%- endif %} + {%- endif %} + {{- '<|im_end|>\n' }} +{%- else %} + {%- if messages[0].role == 'system' %} + {%- set content = render_content(messages[0].content, false, true)|trim %} + {{- '<|im_start|>system\n' + content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" %} + {%- set content = render_content(message.content, false)|trim %} + {%- if not(content.startswith('') and content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if ns.multi_step_tool %} + {{- raise_exception('No user query found in messages.') }} +{%- endif %} +{%- for message in messages %} + {%- set content = render_content(message.content, true)|trim %} + {%- if message.role == "system" %} + {%- if not loop.first %} + {{- raise_exception('System message must be at the beginning.') }} + {%- endif %} + {%- elif message.role == "user" %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- set reasoning_content = reasoning_content|trim %} + {%- if loop.index0 > ns.last_query_index %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content + '\n\n\n' + content }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if loop.first %} + {%- if content|trim %} + {{- '\n\n\n\n' }} + {%- else %} + {{- '\n\n' }} + {%- endif %} + {%- else %} + {{- '\n\n\n' }} + {%- endif %} + {%- if tool_call.arguments is defined %} + {%- for args_name, args_value in tool_call.arguments|items %} + {{- '\n' }} + {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %} + {{- args_value }} + {{- '\n\n' }} + {%- endfor %} + {%- endif %} + {{- '\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.previtem and loop.previtem.role != "tool" %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if not loop.last and loop.nextitem.role != "tool" %} + {{- '<|im_end|>\n' }} + {%- elif loop.last %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- else %} + {{- raise_exception('Unexpected message role.') }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is true %} + {{- '\n' }} + {%- else %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/checkpoint_metadata.json b/checkpoint_metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8f81209eda0565ce47c0ec8e0a2a5e4e5891dae6 --- /dev/null +++ b/checkpoint_metadata.json @@ -0,0 +1,6 @@ +{ + "timestamp": "2026-04-09T05:12:06.203634", + "custom_metadata": { + "step": 0 + } +} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..5f3f42db6e5ed757e0c884a6abfe6eb40decafe0 --- /dev/null +++ b/config.json @@ -0,0 +1,473 @@ +{ + "_external_rope_config_kwargs": {}, + "add_cross_attention": false, + "architectures": [ + "Qwen3_5ForConditionalGeneration" + ], + "attn_mechanism": "vanilla", + "backend": null, + "bits": null, + "blocksize_b": 1, + "blocksize_k": 512, + "blocksize_q": 512, + "bos_token_id": null, + "cross_attention_hidden_size": null, + "decode_attn_mechanism": null, + "decoder_start_token_id": null, + "easy_method": "train", + "eos_token_id": null, + "fcm_max_ratio": 0.0, + "fcm_min_ratio": 0.0, + "flash_attention_backward_pass_impl": "triton", + "fsdp_is_ep_bound": true, + "gradient_checkpointing": "", + "gradient_checkpointing_targets": null, + "hardware_abstraction": false, + "image_token_id": 248056, + "is_decoder": false, + "kv_cache_quantization_config": null, + "kv_cache_sharding_sequence_axis_name": "sp", + "kvdtype": "bfloat16", + "lmhead_chunksize": null, + "max_position_embeddings": null, + "mla_attn_dtype": "bfloat16", + "mla_attn_mechanism": "auto", + "mla_attn_softmax_dtype": "float32", + "model_type": "qwen3_5", + "moe_force_xla_gmm": false, + "moe_method": "fused_moe", + "moe_tiling_size_batch": 4, + "moe_tiling_size_dim": 128, + "moe_tiling_size_seqlen": 128, + "operation_configs": null, + "pad_token_id": null, + "pallas_k_block_size": 128, + "pallas_m_block_size": 128, + "pallas_n_block_size": 128, + "partition_axis": { + "attention_dim_axis": null, + "attention_kv_dim_axis": null, + "batch_axis": [ + "fsdp", + "dp" + ], + "bias_head_sequence_axis": null, + "bias_key_sequence_axis": null, + "data_parallel_axis": "dp", + "decode_attention_dim_axis": null, + "decode_attention_kv_dim_axis": null, + "decode_batch_axis": [ + "fsdp", + "dp" + ], + "decode_head_axis": "tp", + "decode_key_sequence_axis": "sp", + "decode_kv_head_axis": "tp", + "decode_query_sequence_axis": null, + "expert_axis": "ep", + "expert_gate_axis": null, + "expert_parallel_axis": "ep", + "fully_sharded_data_parallel_axis": "fsdp", + "head_axis": "tp", + "hidden_state_axis": "tp", + "key_sequence_axis": "sp", + "kv_head_axis": "tp", + "mlp_intermediate_axis": "tp", + "query_sequence_axis": "sp", + "sequence_axis": "sp", + "sequence_parallel_axis": "sp", + "tensor_parallel_axis": "tp", + "vocab_axis": "tp" + }, + "platform": null, + "precompute_masks": true, + "pretraining_tp": 1, + "qmm_platform_override": null, + "qmm_tpu_path_override": null, + "quantization_config": null, + "scan_attention_layers": false, + "scan_mlp_chunk_size": 1024, + "scan_ring_attention": true, + "sep_token_id": null, + "sequence_axis_name": "sp", + "sharding_axis_dims": [ + 1, + -1, + 1, + 1, + 1 + ], + "sharding_axis_names": [ + "dp", + "fsdp", + "ep", + "tp", + "sp" + ], + "sharding_dcn_axis_dims": null, + "sp_is_ep_bound": true, + "text_config": { + "_external_rope_config_kwargs": {}, + "add_cross_attention": false, + "architectures": [ + "Qwen3_5ForConditionalGeneration" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attn_dtype": "bfloat16", + "attn_mechanism": "vanilla", + "attn_output_gate": true, + "attn_softmax_dtype": "float32", + "backend": null, + "bits": null, + "blocksize_b": 1, + "blocksize_k": 512, + "blocksize_q": 512, + "bos_token_id": null, + "cross_attention_hidden_size": null, + "decode_attn_mechanism": null, + "decoder_sparse_step": 1, + "decoder_start_token_id": null, + "dtype": "bfloat16", + "easy_method": "train", + "eos_token_id": 248044, + "fcm_max_ratio": 0.0, + "fcm_min_ratio": 0.0, + "flash_attention_backward_pass_impl": "triton", + "fsdp_is_ep_bound": true, + "full_attention_interval": 4, + "gradient_checkpointing": "", + "gradient_checkpointing_targets": null, + "hardware_abstraction": false, + "head_dim": 256, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 6144, + "is_decoder": false, + "kv_cache_quantization_config": null, + "kv_cache_sharding_sequence_axis_name": "sp", + "kvdtype": "bfloat16", + "layer_types": [ + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention" + ], + "linear_attention_separate_proj": true, + "linear_conv_kernel_dim": 4, + "linear_key_head_dim": 128, + "linear_num_key_heads": 16, + "linear_num_value_heads": 16, + "linear_value_head_dim": 128, + "lmhead_chunksize": null, + "mamba_ssm_dtype": "float32", + "max_position_embeddings": 262144, + "mla_attn_dtype": "bfloat16", + "mla_attn_mechanism": "auto", + "mla_attn_softmax_dtype": "float32", + "mlp_only_layers": [ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + 23 + ], + "model_type": "qwen3_5_text", + "moe_force_xla_gmm": false, + "moe_intermediate_size": 512, + "moe_method": "fused_moe", + "moe_tiling_size_batch": 4, + "moe_tiling_size_dim": 128, + "moe_tiling_size_seqlen": 128, + "mtp_num_hidden_layers": 1, + "mtp_use_dedicated_embeddings": false, + "norm_topk_prob": true, + "num_attention_heads": 8, + "num_experts": 256, + "num_experts_per_tok": 8, + "num_hidden_layers": 24, + "num_key_value_heads": 2, + "num_local_experts": 256, + "operation_configs": null, + "output_router_logits": false, + "pad_token_id": null, + "pallas_k_block_size": 128, + "pallas_m_block_size": 128, + "pallas_n_block_size": 128, + "partial_rotary_factor": 0.25, + "partition_axis": { + "attention_dim_axis": null, + "attention_kv_dim_axis": null, + "batch_axis": [ + "fsdp", + "dp" + ], + "bias_head_sequence_axis": null, + "bias_key_sequence_axis": null, + "data_parallel_axis": "dp", + "decode_attention_dim_axis": null, + "decode_attention_kv_dim_axis": null, + "decode_batch_axis": [ + "fsdp", + "dp" + ], + "decode_head_axis": "tp", + "decode_key_sequence_axis": "sp", + "decode_kv_head_axis": "tp", + "decode_query_sequence_axis": null, + "expert_axis": "ep", + "expert_gate_axis": null, + "expert_parallel_axis": "ep", + "fully_sharded_data_parallel_axis": "fsdp", + "head_axis": "tp", + "hidden_state_axis": "tp", + "key_sequence_axis": "sp", + "kv_head_axis": "tp", + "mlp_intermediate_axis": "tp", + "query_sequence_axis": "sp", + "sequence_axis": "sp", + "sequence_parallel_axis": "sp", + "tensor_parallel_axis": "tp", + "vocab_axis": "tp" + }, + "platform": null, + "precompute_masks": true, + "pretraining_tp": 1, + "qmm_platform_override": null, + "qmm_tpu_path_override": null, + "quantization_config": null, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "mrope_interleaved": true, + "mrope_section": [ + 11, + 11, + 10 + ], + "partial_rotary_factor": 0.25, + "rope_theta": 10000000, + "rope_type": "default", + "type": "default" + }, + "rope_theta": 10000000, + "router_aux_loss_coef": 0.001, + "scan_attention_layers": false, + "scan_mlp_chunk_size": 1024, + "scan_ring_attention": true, + "sep_token_id": null, + "sequence_axis_name": "sp", + "sharding_axis_dims": [ + 1, + -1, + 1, + 1, + 1 + ], + "sharding_axis_names": [ + "dp", + "fsdp", + "ep", + "tp", + "sp" + ], + "sharding_dcn_axis_dims": null, + "shared_expert_intermediate_size": 512, + "sp_is_ep_bound": true, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "use_cache": true, + "use_expert_tensor_mode": false, + "use_qmm_best_config": false, + "use_ring_of_experts": false, + "use_scan_mlp": false, + "use_sharded_kv_caching": false, + "use_sharding_constraint": false, + "vocab_size": 248320 + }, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "transformers_version": "5.5.0", + "use_expert_tensor_mode": false, + "use_qmm_best_config": false, + "use_ring_of_experts": false, + "use_scan_mlp": false, + "use_sharded_kv_caching": false, + "use_sharding_constraint": false, + "video_token_id": 248057, + "vision_config": { + "_external_rope_config_kwargs": {}, + "add_cross_attention": false, + "architectures": [ + "Qwen3_5ForConditionalGeneration" + ], + "attn_dtype": "bfloat16", + "attn_mechanism": "vanilla", + "attn_softmax_dtype": "float32", + "backend": null, + "bits": null, + "blocksize_b": 1, + "blocksize_k": 512, + "blocksize_q": 512, + "bos_token_id": null, + "cross_attention_hidden_size": null, + "decode_attn_mechanism": null, + "decoder_start_token_id": null, + "deepstack_visual_indexes": [], + "depth": 24, + "easy_method": "train", + "embed_dim": 1024, + "eos_token_id": null, + "fcm_max_ratio": 0.0, + "fcm_min_ratio": 0.0, + "flash_attention_backward_pass_impl": "triton", + "fsdp_is_ep_bound": true, + "gradient_checkpointing": "", + "gradient_checkpointing_targets": null, + "hardware_abstraction": false, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1024, + "in_channels": 3, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "kv_cache_quantization_config": null, + "kv_cache_sharding_sequence_axis_name": "sp", + "kvdtype": "bfloat16", + "lmhead_chunksize": null, + "max_position_embeddings": null, + "mla_attn_dtype": "bfloat16", + "mla_attn_mechanism": "auto", + "mla_attn_softmax_dtype": "float32", + "model_type": "qwen3_5", + "moe_force_xla_gmm": false, + "moe_method": "fused_moe", + "moe_tiling_size_batch": 4, + "moe_tiling_size_dim": 128, + "moe_tiling_size_seqlen": 128, + "num_attention_heads": 16, + "num_heads": 16, + "num_position_embeddings": 2304, + "operation_configs": null, + "out_hidden_size": 2048, + "pad_token_id": null, + "pallas_k_block_size": 128, + "pallas_m_block_size": 128, + "pallas_n_block_size": 128, + "partition_axis": { + "attention_dim_axis": null, + "attention_kv_dim_axis": null, + "batch_axis": [ + "fsdp", + "dp" + ], + "bias_head_sequence_axis": null, + "bias_key_sequence_axis": null, + "data_parallel_axis": "dp", + "decode_attention_dim_axis": null, + "decode_attention_kv_dim_axis": null, + "decode_batch_axis": [ + "fsdp", + "dp" + ], + "decode_head_axis": "tp", + "decode_key_sequence_axis": "sp", + "decode_kv_head_axis": "tp", + "decode_query_sequence_axis": null, + "expert_axis": "ep", + "expert_gate_axis": null, + "expert_parallel_axis": "ep", + "fully_sharded_data_parallel_axis": "fsdp", + "head_axis": "tp", + "hidden_state_axis": "tp", + "key_sequence_axis": "sp", + "kv_head_axis": "tp", + "mlp_intermediate_axis": "tp", + "query_sequence_axis": "sp", + "sequence_axis": "sp", + "sequence_parallel_axis": "sp", + "tensor_parallel_axis": "tp", + "vocab_axis": "tp" + }, + "patch_size": 16, + "platform": null, + "precompute_masks": true, + "pretraining_tp": 1, + "qmm_platform_override": null, + "qmm_tpu_path_override": null, + "quantization_config": null, + "scan_attention_layers": false, + "scan_mlp_chunk_size": 1024, + "scan_ring_attention": true, + "sep_token_id": null, + "sequence_axis_name": "sp", + "sharding_axis_dims": [ + 1, + -1, + 1, + 1, + 1 + ], + "sharding_axis_names": [ + "dp", + "fsdp", + "ep", + "tp", + "sp" + ], + "sharding_dcn_axis_dims": null, + "sp_is_ep_bound": true, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokens_per_second": 2.0, + "use_expert_tensor_mode": false, + "use_qmm_best_config": false, + "use_ring_of_experts": false, + "use_scan_mlp": false, + "use_sharded_kv_caching": false, + "use_sharding_constraint": false + }, + "vision_end_token_id": 248054, + "vision_start_token_id": 248053 +} diff --git a/model/model/language_model/embed_tokens/embedding/.zarray b/model/model/language_model/embed_tokens/embedding/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..dfcb4ffa7e0bbca3569d481cc366e7c2e4f1fea5 --- /dev/null +++ b/model/model/language_model/embed_tokens/embedding/.zarray @@ -0,0 +1 @@ +{"chunks":[62080,2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[248320,2048],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/input_layernorm/kernel/.zarray b/model/model/language_model/layers/0/input_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..3d403f2b8224d7972b69bca7c2c483d6b3c57b57 --- /dev/null +++ b/model/model/language_model/layers/0/input_layernorm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/input_layernorm/kernel/0 b/model/model/language_model/layers/0/input_layernorm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..bdc350ba7b48847e8d2d9d1c2a02949a1b47807c Binary files /dev/null and b/model/model/language_model/layers/0/input_layernorm/kernel/0 differ diff --git a/model/model/language_model/layers/0/linear_attn/A_log/.zarray b/model/model/language_model/layers/0/linear_attn/A_log/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1 --- /dev/null +++ b/model/model/language_model/layers/0/linear_attn/A_log/.zarray @@ -0,0 +1 @@ +{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/linear_attn/A_log/0 b/model/model/language_model/layers/0/linear_attn/A_log/0 new file mode 100644 index 0000000000000000000000000000000000000000..1352a4b2d875d72753a8c385abbf87b4d4212748 Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/A_log/0 differ diff --git a/model/model/language_model/layers/0/linear_attn/conv1d/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/conv1d/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..5f4f6d0e28041fb4b10478ee77629459f48d3285 --- /dev/null +++ b/model/model/language_model/layers/0/linear_attn/conv1d/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[4,1,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4,1,6144],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/linear_attn/conv1d/kernel/0.0.0 b/model/model/language_model/layers/0/linear_attn/conv1d/kernel/0.0.0 new file mode 100644 index 0000000000000000000000000000000000000000..424eba11ba0a14e9cda88dbedf43c7e2102ef43c Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/conv1d/kernel/0.0.0 differ diff --git a/model/model/language_model/layers/0/linear_attn/dt_bias/.zarray b/model/model/language_model/layers/0/linear_attn/dt_bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1 --- /dev/null +++ b/model/model/language_model/layers/0/linear_attn/dt_bias/.zarray @@ -0,0 +1 @@ +{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/linear_attn/dt_bias/0 b/model/model/language_model/layers/0/linear_attn/dt_bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..c5353d6982bee441ff4c1e24e4f7d711421fd3b2 Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/dt_bias/0 differ diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..dcdd7c94b2b0674831c22988747c3b77160dc86c --- /dev/null +++ b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[512,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/0.0 b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/0.0 new file mode 100644 index 0000000000000000000000000000000000000000..1e14011041bcf25dbde4179bc3716104bc58aef0 Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/0.0 differ diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/1.0 b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/1.0 new file mode 100644 index 0000000000000000000000000000000000000000..cc63abd5d918ab9fb9c77a0397326d8690d320db Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/1.0 differ diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/2.0 b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/2.0 new file mode 100644 index 0000000000000000000000000000000000000000..5836a2604a7ff7efec37b34a252171ca010b51d5 Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/2.0 differ diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/3.0 b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/3.0 new file mode 100644 index 0000000000000000000000000000000000000000..7ee5848d713f8abb1fe05e70ad64779150e732b6 Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_a/kernel/3.0 differ diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..dcdd7c94b2b0674831c22988747c3b77160dc86c --- /dev/null +++ b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[512,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/0.0 b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/0.0 new file mode 100644 index 0000000000000000000000000000000000000000..60c228f6c634078c36d1f50708b1f440600cd930 Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/0.0 differ diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/1.0 b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/1.0 new file mode 100644 index 0000000000000000000000000000000000000000..e823c971c974662915c9ff335c7e8003f82d96eb Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/1.0 differ diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/2.0 b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/2.0 new file mode 100644 index 0000000000000000000000000000000000000000..f36836bcd324a6d5c088cce73ff72c5da668e0a2 Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/2.0 differ diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/3.0 b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/3.0 new file mode 100644 index 0000000000000000000000000000000000000000..07192713ad2d1c6275f68df4c40ae77c57d4e3aa Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/in_proj_b/kernel/3.0 differ diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_qkv/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/in_proj_qkv/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..6efec79918803a8fdb7fab431f109c04dad3ac3c --- /dev/null +++ b/model/model/language_model/layers/0/linear_attn/in_proj_qkv/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[512,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,6144],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/linear_attn/in_proj_z/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/in_proj_z/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..059f1aabbdc9e18d982e2db50174668264fc96e4 --- /dev/null +++ b/model/model/language_model/layers/0/linear_attn/in_proj_z/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[512,2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,2048],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/linear_attn/norm/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/norm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..3eeba2a45304285824238b9edd4f261c3d5d6f01 --- /dev/null +++ b/model/model/language_model/layers/0/linear_attn/norm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[128],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/linear_attn/norm/kernel/0 b/model/model/language_model/layers/0/linear_attn/norm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..c261eea19ec869e9d0e8c33266c2f64ddf7ada8a Binary files /dev/null and b/model/model/language_model/layers/0/linear_attn/norm/kernel/0 differ diff --git a/model/model/language_model/layers/0/linear_attn/out_proj/kernel/.zarray b/model/model/language_model/layers/0/linear_attn/out_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..1f43215c04ea8738b5721bed11256b14599d4e36 --- /dev/null +++ b/model/model/language_model/layers/0/linear_attn/out_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[2048,512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,2048],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/mlp/down_proj/kernel/.zarray b/model/model/language_model/layers/0/mlp/down_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..ea5e3899eef8cfa2cf94f843bf6ce55302a6c9b6 --- /dev/null +++ b/model/model/language_model/layers/0/mlp/down_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[6144,512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[6144,2048],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/mlp/gate_proj/kernel/.zarray b/model/model/language_model/layers/0/mlp/gate_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..6efec79918803a8fdb7fab431f109c04dad3ac3c --- /dev/null +++ b/model/model/language_model/layers/0/mlp/gate_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[512,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,6144],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/mlp/up_proj/kernel/.zarray b/model/model/language_model/layers/0/mlp/up_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..6efec79918803a8fdb7fab431f109c04dad3ac3c --- /dev/null +++ b/model/model/language_model/layers/0/mlp/up_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[512,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,6144],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/post_attention_layernorm/kernel/.zarray b/model/model/language_model/layers/0/post_attention_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..3d403f2b8224d7972b69bca7c2c483d6b3c57b57 --- /dev/null +++ b/model/model/language_model/layers/0/post_attention_layernorm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/0/post_attention_layernorm/kernel/0 b/model/model/language_model/layers/0/post_attention_layernorm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..b56ef507f57d632aba75ad2c0215430a9bdc2a24 Binary files /dev/null and b/model/model/language_model/layers/0/post_attention_layernorm/kernel/0 differ diff --git a/model/model/language_model/layers/1/input_layernorm/kernel/.zarray b/model/model/language_model/layers/1/input_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..3d403f2b8224d7972b69bca7c2c483d6b3c57b57 --- /dev/null +++ b/model/model/language_model/layers/1/input_layernorm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/1/input_layernorm/kernel/0 b/model/model/language_model/layers/1/input_layernorm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..cde235dbae6ba57d361c9f82ad3679ab8eeb2ded Binary files /dev/null and b/model/model/language_model/layers/1/input_layernorm/kernel/0 differ diff --git a/model/model/language_model/layers/1/linear_attn/A_log/.zarray b/model/model/language_model/layers/1/linear_attn/A_log/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1 --- /dev/null +++ b/model/model/language_model/layers/1/linear_attn/A_log/.zarray @@ -0,0 +1 @@ +{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/1/linear_attn/A_log/0 b/model/model/language_model/layers/1/linear_attn/A_log/0 new file mode 100644 index 0000000000000000000000000000000000000000..ce54b390189ebac616a4797a472b62a770d43187 Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/A_log/0 differ diff --git a/model/model/language_model/layers/1/linear_attn/conv1d/kernel/.zarray b/model/model/language_model/layers/1/linear_attn/conv1d/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..5f4f6d0e28041fb4b10478ee77629459f48d3285 --- /dev/null +++ b/model/model/language_model/layers/1/linear_attn/conv1d/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[4,1,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4,1,6144],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/1/linear_attn/conv1d/kernel/0.0.0 b/model/model/language_model/layers/1/linear_attn/conv1d/kernel/0.0.0 new file mode 100644 index 0000000000000000000000000000000000000000..7270ad5188adc63dbfa9497ede39b8049b5f87a8 Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/conv1d/kernel/0.0.0 differ diff --git a/model/model/language_model/layers/1/linear_attn/dt_bias/.zarray b/model/model/language_model/layers/1/linear_attn/dt_bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1 --- /dev/null +++ b/model/model/language_model/layers/1/linear_attn/dt_bias/.zarray @@ -0,0 +1 @@ +{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/1/linear_attn/dt_bias/0 b/model/model/language_model/layers/1/linear_attn/dt_bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..47c68d1a02863b57b741876952620ce9681113f7 Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/dt_bias/0 differ diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/.zarray b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..dcdd7c94b2b0674831c22988747c3b77160dc86c --- /dev/null +++ b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[512,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/0.0 b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/0.0 new file mode 100644 index 0000000000000000000000000000000000000000..051da798301ef0e9de219d6d6b4eb42eb00ccb8c Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/0.0 differ diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/1.0 b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/1.0 new file mode 100644 index 0000000000000000000000000000000000000000..ee31a1f9fa0fa35067d49393a10dfdf6bbd3b335 Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/1.0 differ diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/2.0 b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/2.0 new file mode 100644 index 0000000000000000000000000000000000000000..604062e7628893f2ea876250b614f22864d5ee2a Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/2.0 differ diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/3.0 b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/3.0 new file mode 100644 index 0000000000000000000000000000000000000000..fbd7cbfa5c15bc585e02f64a59a8cac3825a658c Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/in_proj_a/kernel/3.0 differ diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/.zarray b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..dcdd7c94b2b0674831c22988747c3b77160dc86c --- /dev/null +++ b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[512,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/0.0 b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/0.0 new file mode 100644 index 0000000000000000000000000000000000000000..eaa2bedb9c687152a0c112c424aa76a3874baddf Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/0.0 differ diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/1.0 b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/1.0 new file mode 100644 index 0000000000000000000000000000000000000000..451ac165e5cf2ed1cbbf893d61754a97fe78864c Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/1.0 differ diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/2.0 b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/2.0 new file mode 100644 index 0000000000000000000000000000000000000000..b69480a1a297561305809eaa91c9e6150c4f9cf7 Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/2.0 differ diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/3.0 b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/3.0 new file mode 100644 index 0000000000000000000000000000000000000000..a9f7f3add47f09fe7124f2bf2d7bd5ba0caa7b16 Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/in_proj_b/kernel/3.0 differ diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_qkv/kernel/.zarray b/model/model/language_model/layers/1/linear_attn/in_proj_qkv/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..6efec79918803a8fdb7fab431f109c04dad3ac3c --- /dev/null +++ b/model/model/language_model/layers/1/linear_attn/in_proj_qkv/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[512,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,6144],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/1/linear_attn/in_proj_z/kernel/.zarray b/model/model/language_model/layers/1/linear_attn/in_proj_z/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..059f1aabbdc9e18d982e2db50174668264fc96e4 --- /dev/null +++ b/model/model/language_model/layers/1/linear_attn/in_proj_z/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[512,2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,2048],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/1/linear_attn/norm/kernel/.zarray b/model/model/language_model/layers/1/linear_attn/norm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..3eeba2a45304285824238b9edd4f261c3d5d6f01 --- /dev/null +++ b/model/model/language_model/layers/1/linear_attn/norm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[128],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/1/linear_attn/norm/kernel/0 b/model/model/language_model/layers/1/linear_attn/norm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..2a2e8dfaac352c0b9aa94ded6cbf4afc1a0dda81 Binary files /dev/null and b/model/model/language_model/layers/1/linear_attn/norm/kernel/0 differ diff --git a/model/model/language_model/layers/1/linear_attn/out_proj/kernel/.zarray b/model/model/language_model/layers/1/linear_attn/out_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..1f43215c04ea8738b5721bed11256b14599d4e36 --- /dev/null +++ b/model/model/language_model/layers/1/linear_attn/out_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[2048,512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,2048],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/1/mlp/down_proj/kernel/.zarray b/model/model/language_model/layers/1/mlp/down_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..ea5e3899eef8cfa2cf94f843bf6ce55302a6c9b6 --- /dev/null +++ b/model/model/language_model/layers/1/mlp/down_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[6144,512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[6144,2048],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/1/mlp/gate_proj/kernel/.zarray b/model/model/language_model/layers/1/mlp/gate_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..6efec79918803a8fdb7fab431f109c04dad3ac3c --- /dev/null +++ b/model/model/language_model/layers/1/mlp/gate_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[512,6144],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,6144],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/1/post_attention_layernorm/kernel/0 b/model/model/language_model/layers/1/post_attention_layernorm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..49ff9124a63982799c3fa7b8fc7e8839c28e2860 Binary files /dev/null and b/model/model/language_model/layers/1/post_attention_layernorm/kernel/0 differ diff --git a/model/model/language_model/layers/10/linear_attn/A_log/0 b/model/model/language_model/layers/10/linear_attn/A_log/0 new file mode 100644 index 0000000000000000000000000000000000000000..9b7d55c3cc74659ede4575e900825ae10b6f6e7d Binary files /dev/null and b/model/model/language_model/layers/10/linear_attn/A_log/0 differ diff --git a/model/model/language_model/layers/10/linear_attn/dt_bias/.zarray b/model/model/language_model/layers/10/linear_attn/dt_bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..07ae7835f16ff037d7f7a0a5c4baa6abc54e40b1 --- /dev/null +++ b/model/model/language_model/layers/10/linear_attn/dt_bias/.zarray @@ -0,0 +1 @@ +{"chunks":[16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/10/linear_attn/in_proj_a/kernel/.zarray b/model/model/language_model/layers/10/linear_attn/in_proj_a/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..dcdd7c94b2b0674831c22988747c3b77160dc86c --- /dev/null +++ b/model/model/language_model/layers/10/linear_attn/in_proj_a/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[512,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/10/linear_attn/in_proj_a/kernel/0.0 b/model/model/language_model/layers/10/linear_attn/in_proj_a/kernel/0.0 new file mode 100644 index 0000000000000000000000000000000000000000..f0334323d743df81fb473dcd09ac038fc9845f2f Binary files /dev/null and b/model/model/language_model/layers/10/linear_attn/in_proj_a/kernel/0.0 differ diff --git a/model/model/language_model/layers/10/linear_attn/in_proj_b/kernel/3.0 b/model/model/language_model/layers/10/linear_attn/in_proj_b/kernel/3.0 new file mode 100644 index 0000000000000000000000000000000000000000..5fb8e020347f7d0c20f7bdf7c8bb01f23ea9191a Binary files /dev/null and b/model/model/language_model/layers/10/linear_attn/in_proj_b/kernel/3.0 differ diff --git a/model/model/language_model/layers/13/post_attention_layernorm/kernel/.zarray b/model/model/language_model/layers/13/post_attention_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..3d403f2b8224d7972b69bca7c2c483d6b3c57b57 --- /dev/null +++ b/model/model/language_model/layers/13/post_attention_layernorm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/13/post_attention_layernorm/kernel/0 b/model/model/language_model/layers/13/post_attention_layernorm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..60f740ddc78e66bf49ee925d45c80b9445b5e86a Binary files /dev/null and b/model/model/language_model/layers/13/post_attention_layernorm/kernel/0 differ diff --git a/model/model/language_model/layers/14/input_layernorm/kernel/.zarray b/model/model/language_model/layers/14/input_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..3d403f2b8224d7972b69bca7c2c483d6b3c57b57 --- /dev/null +++ b/model/model/language_model/layers/14/input_layernorm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/14/linear_attn/conv1d/kernel/0.0.0 b/model/model/language_model/layers/14/linear_attn/conv1d/kernel/0.0.0 new file mode 100644 index 0000000000000000000000000000000000000000..a81a2444be66d06e25c69462d2659424959fe60c Binary files /dev/null and b/model/model/language_model/layers/14/linear_attn/conv1d/kernel/0.0.0 differ diff --git a/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/.zarray b/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..dcdd7c94b2b0674831c22988747c3b77160dc86c --- /dev/null +++ b/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[512,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/1.0 b/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/1.0 new file mode 100644 index 0000000000000000000000000000000000000000..93f6e0a102b71adc960ff8c5dbea8445369bd385 Binary files /dev/null and b/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/1.0 differ diff --git a/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/3.0 b/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/3.0 new file mode 100644 index 0000000000000000000000000000000000000000..c99b2619c2c7cd6227fe4a2113f021e21c550888 Binary files /dev/null and b/model/model/language_model/layers/14/linear_attn/in_proj_a/kernel/3.0 differ diff --git a/model/model/language_model/layers/14/linear_attn/in_proj_b/kernel/.zarray b/model/model/language_model/layers/14/linear_attn/in_proj_b/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..dcdd7c94b2b0674831c22988747c3b77160dc86c --- /dev/null +++ b/model/model/language_model/layers/14/linear_attn/in_proj_b/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[512,16],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,16],"zarr_format":2} \ No newline at end of file diff --git a/model/model/language_model/layers/14/linear_attn/in_proj_b/kernel/1.0 b/model/model/language_model/layers/14/linear_attn/in_proj_b/kernel/1.0 new file mode 100644 index 0000000000000000000000000000000000000000..61e4cd6f836a2ffa00d99a36030dbf68a7b5186c Binary files /dev/null and b/model/model/language_model/layers/14/linear_attn/in_proj_b/kernel/1.0 differ diff --git a/model/model/language_model/layers/14/linear_attn/in_proj_z/kernel/.zarray b/model/model/language_model/layers/14/linear_attn/in_proj_z/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..059f1aabbdc9e18d982e2db50174668264fc96e4 --- /dev/null +++ b/model/model/language_model/layers/14/linear_attn/in_proj_z/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[512,2048],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2048,2048],"zarr_format":2} \ No newline at end of file diff --git a/tensorstore_index.json b/tensorstore_index.json new file mode 100644 index 0000000000000000000000000000000000000000..1156d6d8c90b54e01d416192f9567802d3d5f2f1 --- /dev/null +++ b/tensorstore_index.json @@ -0,0 +1,4653 @@ +{ + "format": "tensorstore", + "version": "easydel", + "prefixes": { + "model": [ + { + "path": "model/model/language_model/embed_tokens/embedding", + "shape": [ + 248320, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/0/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/1/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/10/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/self_attn/k_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/self_attn/k_proj/kernel", + "shape": [ + 2048, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/self_attn/o_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/self_attn/q_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/self_attn/q_proj/kernel", + "shape": [ + 2048, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/11/self_attn/v_proj/kernel", + "shape": [ + 2048, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/12/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/13/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/14/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/self_attn/k_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/self_attn/k_proj/kernel", + "shape": [ + 2048, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/self_attn/o_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/self_attn/q_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/self_attn/q_proj/kernel", + "shape": [ + 2048, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/15/self_attn/v_proj/kernel", + "shape": [ + 2048, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/16/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/17/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/18/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/self_attn/k_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/self_attn/k_proj/kernel", + "shape": [ + 2048, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/self_attn/o_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/self_attn/q_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/self_attn/q_proj/kernel", + "shape": [ + 2048, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/19/self_attn/v_proj/kernel", + "shape": [ + 2048, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/2/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/20/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/21/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/22/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/self_attn/k_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/self_attn/k_proj/kernel", + "shape": [ + 2048, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/self_attn/o_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/self_attn/q_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/self_attn/q_proj/kernel", + "shape": [ + 2048, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/23/self_attn/v_proj/kernel", + "shape": [ + 2048, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/self_attn/k_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/self_attn/k_proj/kernel", + "shape": [ + 2048, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/self_attn/o_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/self_attn/q_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/self_attn/q_proj/kernel", + "shape": [ + 2048, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/3/self_attn/v_proj/kernel", + "shape": [ + 2048, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/4/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/5/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/6/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/self_attn/k_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/self_attn/k_proj/kernel", + "shape": [ + 2048, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/self_attn/o_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/self_attn/q_norm/kernel", + "shape": [ + 256 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/self_attn/q_proj/kernel", + "shape": [ + 2048, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/7/self_attn/v_proj/kernel", + "shape": [ + 2048, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/8/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/input_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/A_log", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/conv1d/kernel", + "shape": [ + 4, + 1, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/dt_bias", + "shape": [ + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/in_proj_a/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/in_proj_b/kernel", + "shape": [ + 2048, + 16 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/in_proj_qkv/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/in_proj_z/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/norm/kernel", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/linear_attn/out_proj/kernel", + "shape": [ + 2048, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/mlp/down_proj/kernel", + "shape": [ + 6144, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/mlp/gate_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/mlp/up_proj/kernel", + "shape": [ + 2048, + 6144 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/layers/9/post_attention_layernorm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/language_model/norm/kernel", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/0/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/1/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/10/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/11/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/12/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/12/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/12/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/12/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/12/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/12/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/12/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/12/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/12/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/12/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/12/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/12/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/13/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/13/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/13/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/13/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/13/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/13/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/13/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/13/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/13/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/13/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/13/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/13/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/14/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/14/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/14/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/14/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/14/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/14/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/14/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/14/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/14/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/14/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/14/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/14/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/15/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/15/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/15/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/15/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/15/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/15/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/15/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/15/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/15/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/15/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/15/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/15/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/16/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/16/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/16/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/16/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/16/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/16/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/16/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/16/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/16/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/16/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/16/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/16/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/17/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/17/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/17/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/17/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/17/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/17/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/17/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/17/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/17/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/17/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/17/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/17/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/18/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/18/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/18/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/18/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/18/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/18/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/18/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/18/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/18/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/18/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/18/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/18/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/19/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/19/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/19/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/19/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/19/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/19/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/19/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/19/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/19/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/19/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/19/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/19/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/2/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/20/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/20/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/20/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/20/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/20/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/20/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/20/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/20/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/20/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/20/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/20/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/20/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/21/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/21/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/21/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/21/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/21/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/21/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/21/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/21/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/21/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/21/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/21/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/21/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/22/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/22/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/22/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/22/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/22/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/22/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/22/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/22/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/22/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/22/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/22/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/22/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/23/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/23/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/23/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/23/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/23/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/23/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/23/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/23/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/23/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/23/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/23/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/23/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/3/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/4/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/5/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/6/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/7/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/8/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/attn/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/attn/proj/kernel", + "shape": [ + 1024, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/attn/qkv/bias", + "shape": [ + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/attn/qkv/kernel", + "shape": [ + 1024, + 3072 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/mlp/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/mlp/linear_fc1/kernel", + "shape": [ + 1024, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/mlp/linear_fc2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/mlp/linear_fc2/kernel", + "shape": [ + 4096, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/norm1/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/norm1/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/norm2/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/blocks/9/norm2/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/merger/linear_fc1/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/merger/linear_fc1/kernel", + "shape": [ + 4096, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/merger/linear_fc2/bias", + "shape": [ + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/merger/linear_fc2/kernel", + "shape": [ + 4096, + 2048 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/merger/norm/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/merger/norm/scale", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/patch_embed/proj/bias", + "shape": [ + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/patch_embed/proj/kernel", + "shape": [ + 2, + 16, + 16, + 3, + 1024 + ], + "dtype": "bfloat16" + }, + { + "path": "model/model/visual/pos_embed/embedding", + "shape": [ + 2304, + 1024 + ], + "dtype": "bfloat16" + } + ] + } +} \ No newline at end of file diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b4a37b2a6fd3ab3317cd7bac72855be1a843b2bb --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,31 @@ +{ + "add_prefix_space": false, + "audio_bos_token": "<|audio_start|>", + "audio_eos_token": "<|audio_end|>", + "audio_token": "<|audio_pad|>", + "backend": "tokenizers", + "bos_token": null, + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "errors": "replace", + "image_token": "<|image_pad|>", + "is_local": false, + "model_max_length": 262144, + "model_specific_special_tokens": { + "audio_bos_token": "<|audio_start|>", + "audio_eos_token": "<|audio_end|>", + "audio_token": "<|audio_pad|>", + "image_token": "<|image_pad|>", + "video_token": "<|video_pad|>", + "vision_bos_token": "<|vision_start|>", + "vision_eos_token": "<|vision_end|>" + }, + "pad_token": "<|endoftext|>", + "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + "split_special_tokens": false, + "tokenizer_class": "TokenizersBackend", + "unk_token": null, + "video_token": "<|video_pad|>", + "vision_bos_token": "<|vision_start|>", + "vision_eos_token": "<|vision_end|>" +}