diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..8762a2669af9875fde0a424b7036dfecea14d0ee --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,397 @@ +{#- + In addition to the normal inputs of `messages` and `tools`, this template also accepts the + following kwargs: + - "builtin_tools": A list, can contain "browser" and/or "python". + - "model_identity": A string that optionally describes the model identity. + - "reasoning_effort": A string that describes the reasoning effort, defaults to "medium". + #} + +{#- Tool Definition Rendering ============================================== #} +{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%} + {%- if param_spec.type == "array" -%} + {%- if param_spec['items'] -%} + {%- if param_spec['items']['type'] == "string" -%} + {{- "string[]" }} + {%- elif param_spec['items']['type'] == "number" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "integer" -%} + {{- "number[]" }} + {%- elif param_spec['items']['type'] == "boolean" -%} + {{- "boolean[]" }} + {%- else -%} + {%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%} + {%- if inner_type == "object | object" or inner_type|length > 50 -%} + {{- "any[]" }} + {%- else -%} + {{- inner_type + "[]" }} + {%- endif -%} + {%- endif -%} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- else -%} + {{- "any[]" }} + {%- if param_spec.nullable -%} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%} + {#- Handle array of types like ["object", "object"] from Union[dict, list] #} + {%- if param_spec.type | length > 1 -%} + {{- param_spec.type | join(" | ") }} + {%- else -%} + {{- param_spec.type[0] }} + {%- endif -%} + {%- elif param_spec.oneOf -%} + {#- Handle oneOf schemas - check for complex unions and fallback to any #} + {%- set has_object_variants = false -%} + {%- for variant in param_spec.oneOf -%} + {%- if variant.type == "object" -%} + {%- set has_object_variants = true -%} + {%- endif -%} + {%- endfor -%} + {%- if has_object_variants and param_spec.oneOf|length > 1 -%} + {{- "any" }} + {%- else -%} + {%- for variant in param_spec.oneOf -%} + {{- render_typescript_type(variant, required_params) -}} + {%- if variant.description %} + {{- "// " + variant.description }} + {%- endif -%} + {%- if variant.default is defined %} + {{ "// default: " + variant.default|tojson }} + {%- endif -%} + {%- if not loop.last %} + {{- " | " }} + {% endif -%} + {%- endfor -%} + {%- endif -%} + {%- elif param_spec.type == "string" -%} + {%- if param_spec.enum -%} + {{- '"' + param_spec.enum|join('" | "') + '"' -}} + {%- else -%} + {{- "string" }} + {%- if param_spec.nullable %} + {{- " | null" }} + {%- endif -%} + {%- endif -%} + {%- elif param_spec.type == "number" -%} + {{- "number" }} + {%- elif param_spec.type == "integer" -%} + {{- "number" }} + {%- elif param_spec.type == "boolean" -%} + {{- "boolean" }} + + {%- elif param_spec.type == "object" -%} + {%- if param_spec.properties -%} + {{- "{ +" }} + {%- for prop_name, prop_spec in param_spec.properties.items() -%} + {{- prop_name -}} + {%- if prop_name not in (param_spec.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{ render_typescript_type(prop_spec, param_spec.required or []) }} + {%- if not loop.last -%} + {{-", " }} + {%- endif -%} + {%- endfor -%} + {{- "}" }} + {%- else -%} + {{- "object" }} + {%- endif -%} + {%- else -%} + {{- "any" }} + {%- endif -%} +{%- endmacro -%} + +{%- macro render_tool_namespace(namespace_name, tools) -%} + {{- "## " + namespace_name + " + +" }} + {{- "namespace " + namespace_name + " { + +" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- "// " + tool.description + " +" }} + {{- "type "+ tool.name + " = " }} + {%- if tool.parameters and tool.parameters.properties %} + {{- "(_: { +" }} + {%- for param_name, param_spec in tool.parameters.properties.items() %} + {%- if param_spec.description %} + {{- "// " + param_spec.description + " +" }} + {%- endif %} + {{- param_name }} + {%- if param_name not in (tool.parameters.required or []) -%} + {{- "?" }} + {%- endif -%} + {{- ": " }} + {{- render_typescript_type(param_spec, tool.parameters.required or []) }} + {%- if param_spec.default is defined -%} + {%- if param_spec.enum %} + {{- ", // default: " + param_spec.default }} + {%- elif param_spec.oneOf %} + {{- "// default: " + param_spec.default }} + {%- else %} + {{- ", // default: " + param_spec.default|tojson }} + {%- endif -%} + {%- endif -%} + {%- if not loop.last %} + {{- ", +" }} + {%- else %} + {{- " +" }} + {%- endif -%} + {%- endfor %} + {{- "}) => any; + +" }} + {%- else -%} + {{- "() => any; + +" }} + {%- endif -%} + {%- endfor %} + {{- "} // namespace " + namespace_name }} +{%- endmacro -%} + +{%- macro render_builtin_tools(browser_tool, python_tool) -%} + {%- if browser_tool %} + {{- "## browser + +" }} + {{- "// Tool for browsing. +" }} + {{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`. +" }} + {{- "// Cite information from the tool using the following format: +" }} + {{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`. +" }} + {{- "// Do not quote more than 10 words directly from the tool output. +" }} + {{- "// sources=web (default: web) +" }} + {{- "namespace browser { + +" }} + {{- "// Searches for information related to `query` and displays `topn` results. +" }} + {{- "type search = (_: { +" }} + {{- "query: string, +" }} + {{- "topn?: number, // default: 10 +" }} + {{- "source?: string, +" }} + {{- "}) => any; + +" }} + {{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines. +" }} + {{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`. +" }} + {{- "// If `cursor` is not provided, the most recent page is implied. +" }} + {{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`. +" }} + {{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available. +" }} + {{- "// Use this function without `id` to scroll to a new location of an opened page. +" }} + {{- "type open = (_: { +" }} + {{- "id?: number | string, // default: -1 +" }} + {{- "cursor?: number, // default: -1 +" }} + {{- "loc?: number, // default: -1 +" }} + {{- "num_lines?: number, // default: -1 +" }} + {{- "view_source?: boolean, // default: false +" }} + {{- "source?: string, +" }} + {{- "}) => any; + +" }} + {{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`. +" }} + {{- "type find = (_: { +" }} + {{- "pattern: string, +" }} + {{- "cursor?: number, // default: -1 +" }} + {{- "}) => any; + +" }} + {{- "} // namespace browser + +" }} + {%- endif -%} + + {%- if python_tool %} + {{- "## python + +" }} + {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files). + +" }} + {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster. + +" }} + {%- endif -%} +{%- endmacro -%} + +{#- System Message Construction ============================================ #} +{%- macro build_system_message() -%} + {%- if model_identity is not defined %} + {%- set model_identity = "You are ChatGPT, a large language model trained by OpenAI." %} + {%- endif %} + {{- model_identity + " +" }} + {{- "Knowledge cutoff: 2024-06 +" }} + {{- "Current date: " + strftime_now("%Y-%m-%d") + " + +" }} + {%- if reasoning_effort is not defined %} + {%- set reasoning_effort = "medium" %} + {%- endif %} + {{- "Reasoning: " + reasoning_effort + " + +" }} + {%- if builtin_tools %} + {{- "# Tools + +" }} + {%- set available_builtin_tools = namespace(browser=false, python=false) %} + {%- for tool in builtin_tools %} + {%- if tool == "browser" %} + {%- set available_builtin_tools.browser = true %} + {%- elif tool == "python" %} + {%- set available_builtin_tools.python = true %} + {%- endif %} + {%- endfor %} + {{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }} + {%- endif -%} + {{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }} + {%- if tools -%} + {{- " +Calls to these tools must go to the commentary channel: 'functions'." }} + {%- endif -%} +{%- endmacro -%} + +{#- Main Template Logic ================================================= #} +{#- Set defaults #} + +{#- Render system message #} +{{- "<|start|>system<|message|>" }} +{{- build_system_message() }} +{{- "<|end|>" }} + +{#- Extract developer message #} +{%- if messages[0].role == "developer" or messages[0].role == "system" %} + {%- set developer_message = messages[0].content %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set developer_message = "" %} + {%- set loop_messages = messages %} +{%- endif %} + +{#- Render developer message #} +{%- if developer_message or tools %} + {{- "<|start|>developer<|message|>" }} + {%- if developer_message %} + {{- "# Instructions + +" }} + {{- developer_message }} + {%- endif %} + {%- if tools -%} + {{- " + +" }} + {{- "# Tools + +" }} + {{- render_tool_namespace("functions", tools) }} + {%- endif -%} + {{- "<|end|>" }} +{%- endif %} + +{#- Render messages #} +{%- set last_tool_call = namespace(name=none) %} +{%- for message in loop_messages -%} + {#- At this point only assistant/user/tool messages should remain #} + {%- if message.role == 'assistant' -%} + {#- Checks to ensure the messages are being passed in the format we expect #} + {%- if "content" in message %} + {%- if "<|channel|>analysis<|message|>" in message.content or "<|channel|>final<|message|>" in message.content %} + {{- raise_exception("You have passed a message containing <|channel|> tags in the content field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }} + {%- endif %} + {%- endif %} + {%- if "thinking" in message %} + {%- if "<|channel|>analysis<|message|>" in message.thinking or "<|channel|>final<|message|>" in message.thinking %} + {{- raise_exception("You have passed a message containing <|channel|> tags in the thinking field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }} + {%- endif %} + {%- endif %} + {%- if "tool_calls" in message %} + {#- We assume max 1 tool call per message, and so we infer the tool call name #} + {#- in "tool" messages from the most recent assistant tool call name #} + {%- set tool_call = message.tool_calls[0] %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {%- if message.content and message.thinking %} + {{- raise_exception("Cannot pass both content and thinking in an assistant message with tool calls! Put the analysis message in one or the other, but not both.") }} + {%- elif message.content %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }} + {%- elif message.thinking %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {%- endif %} + {{- "<|start|>assistant to=" }} + {{- "functions." + tool_call.name + "<|channel|>commentary " }} + {{- (tool_call.content_type if tool_call.content_type is defined else "json") + "<|message|>" }} + {{- tool_call.arguments|tojson }} + {{- "<|call|>" }} + {%- set last_tool_call.name = tool_call.name %} + {%- elif loop.last and not add_generation_prompt %} + {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #} + {#- This is a situation that should only occur in training, never in inference. #} + {%- if "thinking" in message %} + {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }} + {%- endif %} + {#- <|return|> indicates the end of generation, but <|end|> does not #} + {#- <|return|> should never be an input to the model, but we include it as the final token #} + {#- when training, so the model learns to emit it. #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }} + {%- else %} + {#- CoT is dropped during all previous turns, so we never render it for inference #} + {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }} + {%- set last_tool_call.name = none %} + {%- endif %} + {%- elif message.role == 'tool' -%} + {%- if last_tool_call.name is none %} + {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }} + {%- endif %} + {{- "<|start|>functions." + last_tool_call.name }} + {{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }} + {%- elif message.role == 'user' -%} + {{- "<|start|>user<|message|>" + message.content + "<|end|>" }} + {%- endif -%} +{%- endfor -%} + +{#- Generation prompt #} +{%- if add_generation_prompt -%} +<|start|>assistant +{%- endif -%} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..b8b5678ab42d78f9c1035df1e591ee689c30d2ed --- /dev/null +++ b/config.json @@ -0,0 +1,172 @@ +{ + "_external_rope_config_kwargs": {}, + "architectures": [ + "GptOssForCausalLM" + ], + "attention_bias": true, + "attention_dropout": 0.0, + "attn_mechanism": "vanilla", + "backend": null, + "bits": null, + "blocksize_b": 1, + "blocksize_k": 128, + "blocksize_q": 128, + "decode_attn_mechanism": null, + "dtype": "bfloat16", + "easy_method": "train", + "eos_token_id": 200002, + "experts_per_token": 4, + "fcm_max_ratio": 0.0, + "fcm_min_ratio": 0.0, + "flash_attention_backward_pass_impl": "triton", + "fsdp_is_ep_bound": true, + "gradient_checkpointing": "", + "gradient_checkpointing_targets": null, + "hardware_abstraction": false, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2880, + "initial_context_length": 4096, + "initializer_range": 0.02, + "intermediate_size": 2880, + "kv_cache_quantization_config": null, + "kv_cache_sharding_sequence_axis_name": "sp", + "layer_types": [ + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention", + "sliding_attention", + "full_attention" + ], + "max_position_embeddings": 131072, + "mlp_activations_limit": 7.0, + "model_type": "gpt_oss", + "moe_force_xla_gmm": false, + "moe_method": "fused_moe", + "moe_tiling_size_batch": 4, + "moe_tiling_size_dim": 128, + "moe_tiling_size_seqlen": 128, + "num_attention_heads": 64, + "num_experts_per_tok": 4, + "num_hidden_layers": 36, + "num_key_value_heads": 8, + "num_local_experts": 128, + "operation_configs": null, + "output_router_logits": false, + "pad_token_id": 199999, + "pallas_k_block_size": 128, + "pallas_m_block_size": 128, + "pallas_n_block_size": 128, + "partition_axis": { + "attention_dim_axis": null, + "attention_kv_dim_axis": null, + "batch_axis": [ + "fsdp", + "dp" + ], + "bias_head_sequence_axis": null, + "bias_key_sequence_axis": null, + "data_parallel_axis": "dp", + "decode_attention_dim_axis": null, + "decode_attention_kv_dim_axis": null, + "decode_batch_axis": [ + "fsdp", + "dp" + ], + "decode_head_axis": "tp", + "decode_key_sequence_axis": "sp", + "decode_kv_head_axis": "tp", + "decode_query_sequence_axis": null, + "expert_axis": "ep", + "expert_gate_axis": null, + "expert_parallel_axis": "ep", + "fully_sharded_data_parallel_axis": "fsdp", + "head_axis": "tp", + "hidden_state_axis": "tp", + "key_sequence_axis": "sp", + "kv_head_axis": "tp", + "mlp_intermediate_axis": "tp", + "query_sequence_axis": "sp", + "sequence_axis": "sp", + "sequence_parallel_axis": "sp", + "tensor_parallel_axis": "tp", + "vocab_axis": "tp" + }, + "platform": null, + "precompute_masks": true, + "pretraining_tp": 1, + "quantization_config": null, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "beta_fast": 32.0, + "beta_slow": 1.0, + "factor": 32.0, + "original_max_position_embeddings": 4096, + "rope_type": "yarn", + "truncate": false + }, + "rope_theta": 150000, + "router_aux_loss_coef": 0.9, + "scan_attention_layers": false, + "scan_mlp_chunk_size": 1024, + "scan_ring_attention": true, + "sequence_axis_name": "sp", + "sharding_axis_dims": [ + 1, + -1, + 1, + 1, + 1 + ], + "sharding_axis_names": [ + "dp", + "fsdp", + "ep", + "tp", + "sp" + ], + "sharding_dcn_axis_dims": null, + "sliding_window": 128, + "sp_is_ep_bound": true, + "swiglu_limit": 7.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.3", + "use_cache": true, + "use_expert_tensor_mode": false, + "use_ring_of_experts": false, + "use_scan_mlp": false, + "use_sharded_kv_caching": false, + "use_sharding_constraint": false, + "vocab_size": 201088 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a7ec9dbff8e420d56f60d32677f5a390ff6de28 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,11 @@ +{ + "bos_token_id": 199998, + "do_sample": true, + "eos_token_id": [ + 200002, + 199999 + ], + "pad_token_id": 199999, + "transformers_version": "4.57.3", + "trust_remote_code": false +} diff --git a/model/params/model/layers/6/input_layernorm/kernel/0 b/model/params/model/layers/6/input_layernorm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..9e22de891f28bb79aa21af30e09a8635ddd44445 Binary files /dev/null and b/model/params/model/layers/6/input_layernorm/kernel/0 differ diff --git a/model/params/model/layers/6/mlp/experts/down_proj/bias/.zarray b/model/params/model/layers/6/mlp/experts/down_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..cc9594b998fdead47c5dfb2bc2873564428a9a2c --- /dev/null +++ b/model/params/model/layers/6/mlp/experts/down_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[32,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/6/self_attn/o_proj/kernel/.zarray b/model/params/model/layers/6/self_attn/o_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..0d72be683eb2b53390d4eb11bf07343ad17f37d7 --- /dev/null +++ b/model/params/model/layers/6/self_attn/o_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[4096,720],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4096,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/6/self_attn/q_proj/bias/.zarray b/model/params/model/layers/6/self_attn/q_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..6cb36eccfd612cd6d39e72807aaa8527aa28075d --- /dev/null +++ b/model/params/model/layers/6/self_attn/q_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[4096],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4096],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/6/self_attn/q_proj/bias/0 b/model/params/model/layers/6/self_attn/q_proj/bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..bbcff036de8e91f866c54f156c55eb85fdcdf5b3 Binary files /dev/null and b/model/params/model/layers/6/self_attn/q_proj/bias/0 differ diff --git a/model/params/model/layers/6/self_attn/sinks/.zarray b/model/params/model/layers/6/self_attn/sinks/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..75fb6ffa552a3ab35caf4dd9123b81fce2aa7eaa --- /dev/null +++ b/model/params/model/layers/6/self_attn/sinks/.zarray @@ -0,0 +1 @@ +{"chunks":[64],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[64],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/6/self_attn/v_proj/bias/.zarray b/model/params/model/layers/6/self_attn/v_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..82cda9219c678aba952afd8b130fd0430cff4770 --- /dev/null +++ b/model/params/model/layers/6/self_attn/v_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[512],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/7/input_layernorm/kernel/.zarray b/model/params/model/layers/7/input_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..a6b4a137bbd7791e040a35594e3226e4b578df4d --- /dev/null +++ b/model/params/model/layers/7/input_layernorm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/7/input_layernorm/kernel/0 b/model/params/model/layers/7/input_layernorm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..6f5ff6e2e9f0e7e1fc5c01c6aae60b82a9af4a0b Binary files /dev/null and b/model/params/model/layers/7/input_layernorm/kernel/0 differ diff --git a/model/params/model/layers/7/mlp/experts/down_proj/bias/.zarray b/model/params/model/layers/7/mlp/experts/down_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..cc9594b998fdead47c5dfb2bc2873564428a9a2c --- /dev/null +++ b/model/params/model/layers/7/mlp/experts/down_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[32,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/7/mlp/experts/down_proj/kernel/.zarray b/model/params/model/layers/7/mlp/experts/down_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..dea4c25661820bbcbab3394fdafa9a934335bdbf --- /dev/null +++ b/model/params/model/layers/7/mlp/experts/down_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[32,2880,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128,2880,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/7/mlp/experts/gate_proj/bias/.zarray b/model/params/model/layers/7/mlp/experts/gate_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..cc9594b998fdead47c5dfb2bc2873564428a9a2c --- /dev/null +++ b/model/params/model/layers/7/mlp/experts/gate_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[32,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/7/mlp/experts/gate_proj/kernel/.zarray b/model/params/model/layers/7/mlp/experts/gate_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..dea4c25661820bbcbab3394fdafa9a934335bdbf --- /dev/null +++ b/model/params/model/layers/7/mlp/experts/gate_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[32,2880,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128,2880,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/7/mlp/experts/up_proj/bias/.zarray b/model/params/model/layers/7/mlp/experts/up_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..cc9594b998fdead47c5dfb2bc2873564428a9a2c --- /dev/null +++ b/model/params/model/layers/7/mlp/experts/up_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[32,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/7/mlp/experts/up_proj/kernel/.zarray b/model/params/model/layers/7/mlp/experts/up_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..dea4c25661820bbcbab3394fdafa9a934335bdbf --- /dev/null +++ b/model/params/model/layers/7/mlp/experts/up_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[32,2880,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128,2880,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/7/mlp/router/bias/.zarray b/model/params/model/layers/7/mlp/router/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..3eeba2a45304285824238b9edd4f261c3d5d6f01 --- /dev/null +++ b/model/params/model/layers/7/mlp/router/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[128],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/7/mlp/router/bias/0 b/model/params/model/layers/7/mlp/router/bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..6dd49cacf646a1243c336ca528ac627cb30f76a0 Binary files /dev/null and b/model/params/model/layers/7/mlp/router/bias/0 differ diff --git a/model/params/model/layers/7/post_attention_layernorm/kernel/.zarray b/model/params/model/layers/7/post_attention_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..a6b4a137bbd7791e040a35594e3226e4b578df4d --- /dev/null +++ b/model/params/model/layers/7/post_attention_layernorm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/7/post_attention_layernorm/kernel/0 b/model/params/model/layers/7/post_attention_layernorm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..485a84ddc82a4b397c41c3112d19aa716eee883a Binary files /dev/null and b/model/params/model/layers/7/post_attention_layernorm/kernel/0 differ diff --git a/model/params/model/layers/7/self_attn/k_proj/bias/.zarray b/model/params/model/layers/7/self_attn/k_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..82cda9219c678aba952afd8b130fd0430cff4770 --- /dev/null +++ b/model/params/model/layers/7/self_attn/k_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[512],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/7/self_attn/k_proj/bias/0 b/model/params/model/layers/7/self_attn/k_proj/bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..a7be141c25d5282d2d2d0c3ed8ea25371293b837 Binary files /dev/null and b/model/params/model/layers/7/self_attn/k_proj/bias/0 differ diff --git a/model/params/model/layers/7/self_attn/k_proj/kernel/.zarray b/model/params/model/layers/7/self_attn/k_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..5d1373376dc0d30da62c2050a6251ab55d995f8f --- /dev/null +++ b/model/params/model/layers/7/self_attn/k_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[720,512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880,512],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/7/self_attn/o_proj/bias/.zarray b/model/params/model/layers/7/self_attn/o_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..a6b4a137bbd7791e040a35594e3226e4b578df4d --- /dev/null +++ b/model/params/model/layers/7/self_attn/o_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/7/self_attn/o_proj/bias/0 b/model/params/model/layers/7/self_attn/o_proj/bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..37630bd17546c1f651190dfb1c57770384d3550c Binary files /dev/null and b/model/params/model/layers/7/self_attn/o_proj/bias/0 differ diff --git a/model/params/model/layers/7/self_attn/q_proj/bias/0 b/model/params/model/layers/7/self_attn/q_proj/bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..292ac8b86838c80f81c5b9876df3bde812f1ca0e Binary files /dev/null and b/model/params/model/layers/7/self_attn/q_proj/bias/0 differ diff --git a/model/params/model/layers/7/self_attn/q_proj/kernel/.zarray b/model/params/model/layers/7/self_attn/q_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..52c1634030f7b3cfbbf99130324b654685a16f94 --- /dev/null +++ b/model/params/model/layers/7/self_attn/q_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[720,4096],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880,4096],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/7/self_attn/sinks/.zarray b/model/params/model/layers/7/self_attn/sinks/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..75fb6ffa552a3ab35caf4dd9123b81fce2aa7eaa --- /dev/null +++ b/model/params/model/layers/7/self_attn/sinks/.zarray @@ -0,0 +1 @@ +{"chunks":[64],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[64],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/7/self_attn/sinks/0 b/model/params/model/layers/7/self_attn/sinks/0 new file mode 100644 index 0000000000000000000000000000000000000000..8da351779edaa4cd3e22158dccb45ab944a6d346 Binary files /dev/null and b/model/params/model/layers/7/self_attn/sinks/0 differ diff --git a/model/params/model/layers/7/self_attn/v_proj/bias/.zarray b/model/params/model/layers/7/self_attn/v_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..82cda9219c678aba952afd8b130fd0430cff4770 --- /dev/null +++ b/model/params/model/layers/7/self_attn/v_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[512],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/7/self_attn/v_proj/bias/0 b/model/params/model/layers/7/self_attn/v_proj/bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..e66e6f8be5871b2a689b4489a5e31a1e51777c6b Binary files /dev/null and b/model/params/model/layers/7/self_attn/v_proj/bias/0 differ diff --git a/model/params/model/layers/7/self_attn/v_proj/kernel/.zarray b/model/params/model/layers/7/self_attn/v_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..5d1373376dc0d30da62c2050a6251ab55d995f8f --- /dev/null +++ b/model/params/model/layers/7/self_attn/v_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[720,512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880,512],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/input_layernorm/kernel/.zarray b/model/params/model/layers/8/input_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..a6b4a137bbd7791e040a35594e3226e4b578df4d --- /dev/null +++ b/model/params/model/layers/8/input_layernorm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/input_layernorm/kernel/0 b/model/params/model/layers/8/input_layernorm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..41c8d1af2f062d9d3993e481f6960933a390c538 Binary files /dev/null and b/model/params/model/layers/8/input_layernorm/kernel/0 differ diff --git a/model/params/model/layers/8/mlp/experts/down_proj/bias/.zarray b/model/params/model/layers/8/mlp/experts/down_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..cc9594b998fdead47c5dfb2bc2873564428a9a2c --- /dev/null +++ b/model/params/model/layers/8/mlp/experts/down_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[32,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/mlp/experts/down_proj/kernel/.zarray b/model/params/model/layers/8/mlp/experts/down_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..dea4c25661820bbcbab3394fdafa9a934335bdbf --- /dev/null +++ b/model/params/model/layers/8/mlp/experts/down_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[32,2880,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128,2880,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/mlp/experts/gate_proj/bias/.zarray b/model/params/model/layers/8/mlp/experts/gate_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..cc9594b998fdead47c5dfb2bc2873564428a9a2c --- /dev/null +++ b/model/params/model/layers/8/mlp/experts/gate_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[32,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/mlp/experts/gate_proj/kernel/.zarray b/model/params/model/layers/8/mlp/experts/gate_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..dea4c25661820bbcbab3394fdafa9a934335bdbf --- /dev/null +++ b/model/params/model/layers/8/mlp/experts/gate_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[32,2880,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128,2880,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/mlp/experts/up_proj/bias/.zarray b/model/params/model/layers/8/mlp/experts/up_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..cc9594b998fdead47c5dfb2bc2873564428a9a2c --- /dev/null +++ b/model/params/model/layers/8/mlp/experts/up_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[32,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/mlp/experts/up_proj/kernel/.zarray b/model/params/model/layers/8/mlp/experts/up_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..dea4c25661820bbcbab3394fdafa9a934335bdbf --- /dev/null +++ b/model/params/model/layers/8/mlp/experts/up_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[32,2880,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128,2880,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/mlp/router/bias/.zarray b/model/params/model/layers/8/mlp/router/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..3eeba2a45304285824238b9edd4f261c3d5d6f01 --- /dev/null +++ b/model/params/model/layers/8/mlp/router/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[128],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/mlp/router/kernel/.zarray b/model/params/model/layers/8/mlp/router/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..ea8f6491b31bb855173d1db15e72526f2b085881 --- /dev/null +++ b/model/params/model/layers/8/mlp/router/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[2880,128],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880,128],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/post_attention_layernorm/kernel/.zarray b/model/params/model/layers/8/post_attention_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..a6b4a137bbd7791e040a35594e3226e4b578df4d --- /dev/null +++ b/model/params/model/layers/8/post_attention_layernorm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/post_attention_layernorm/kernel/0 b/model/params/model/layers/8/post_attention_layernorm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..a229b6fc2ae163f3926be81a14e33edf24fcb96e Binary files /dev/null and b/model/params/model/layers/8/post_attention_layernorm/kernel/0 differ diff --git a/model/params/model/layers/8/self_attn/k_proj/bias/.zarray b/model/params/model/layers/8/self_attn/k_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..82cda9219c678aba952afd8b130fd0430cff4770 --- /dev/null +++ b/model/params/model/layers/8/self_attn/k_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[512],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/self_attn/k_proj/bias/0 b/model/params/model/layers/8/self_attn/k_proj/bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..a7be141c25d5282d2d2d0c3ed8ea25371293b837 Binary files /dev/null and b/model/params/model/layers/8/self_attn/k_proj/bias/0 differ diff --git a/model/params/model/layers/8/self_attn/k_proj/kernel/.zarray b/model/params/model/layers/8/self_attn/k_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..5d1373376dc0d30da62c2050a6251ab55d995f8f --- /dev/null +++ b/model/params/model/layers/8/self_attn/k_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[720,512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880,512],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/self_attn/o_proj/bias/.zarray b/model/params/model/layers/8/self_attn/o_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..a6b4a137bbd7791e040a35594e3226e4b578df4d --- /dev/null +++ b/model/params/model/layers/8/self_attn/o_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/self_attn/o_proj/bias/0 b/model/params/model/layers/8/self_attn/o_proj/bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..05d5a05545e47694eca65b709fe214a710305351 Binary files /dev/null and b/model/params/model/layers/8/self_attn/o_proj/bias/0 differ diff --git a/model/params/model/layers/8/self_attn/o_proj/kernel/.zarray b/model/params/model/layers/8/self_attn/o_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..0d72be683eb2b53390d4eb11bf07343ad17f37d7 --- /dev/null +++ b/model/params/model/layers/8/self_attn/o_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[4096,720],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4096,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/self_attn/q_proj/bias/.zarray b/model/params/model/layers/8/self_attn/q_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..6cb36eccfd612cd6d39e72807aaa8527aa28075d --- /dev/null +++ b/model/params/model/layers/8/self_attn/q_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[4096],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[4096],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/self_attn/q_proj/bias/0 b/model/params/model/layers/8/self_attn/q_proj/bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..516533b7ba28368871cfc7ad6bb0c6de6ee7c7b0 Binary files /dev/null and b/model/params/model/layers/8/self_attn/q_proj/bias/0 differ diff --git a/model/params/model/layers/8/self_attn/q_proj/kernel/.zarray b/model/params/model/layers/8/self_attn/q_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..52c1634030f7b3cfbbf99130324b654685a16f94 --- /dev/null +++ b/model/params/model/layers/8/self_attn/q_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[720,4096],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880,4096],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/self_attn/sinks/.zarray b/model/params/model/layers/8/self_attn/sinks/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..75fb6ffa552a3ab35caf4dd9123b81fce2aa7eaa --- /dev/null +++ b/model/params/model/layers/8/self_attn/sinks/.zarray @@ -0,0 +1 @@ +{"chunks":[64],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[64],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/self_attn/sinks/0 b/model/params/model/layers/8/self_attn/sinks/0 new file mode 100644 index 0000000000000000000000000000000000000000..754e0072448d18a78239e1b01e0ec0ed117c4d23 Binary files /dev/null and b/model/params/model/layers/8/self_attn/sinks/0 differ diff --git a/model/params/model/layers/8/self_attn/v_proj/bias/.zarray b/model/params/model/layers/8/self_attn/v_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..82cda9219c678aba952afd8b130fd0430cff4770 --- /dev/null +++ b/model/params/model/layers/8/self_attn/v_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[512],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/8/self_attn/v_proj/bias/0 b/model/params/model/layers/8/self_attn/v_proj/bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..731d59f7bee32aa7ebe733b6877c75cac65e0cca Binary files /dev/null and b/model/params/model/layers/8/self_attn/v_proj/bias/0 differ diff --git a/model/params/model/layers/8/self_attn/v_proj/kernel/.zarray b/model/params/model/layers/8/self_attn/v_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..5d1373376dc0d30da62c2050a6251ab55d995f8f --- /dev/null +++ b/model/params/model/layers/8/self_attn/v_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[720,512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880,512],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/9/input_layernorm/kernel/.zarray b/model/params/model/layers/9/input_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..a6b4a137bbd7791e040a35594e3226e4b578df4d --- /dev/null +++ b/model/params/model/layers/9/input_layernorm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/9/input_layernorm/kernel/0 b/model/params/model/layers/9/input_layernorm/kernel/0 new file mode 100644 index 0000000000000000000000000000000000000000..34f0082fb1696b73ed3759903ff016b6ffae49c9 Binary files /dev/null and b/model/params/model/layers/9/input_layernorm/kernel/0 differ diff --git a/model/params/model/layers/9/mlp/experts/down_proj/bias/.zarray b/model/params/model/layers/9/mlp/experts/down_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..cc9594b998fdead47c5dfb2bc2873564428a9a2c --- /dev/null +++ b/model/params/model/layers/9/mlp/experts/down_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[32,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/9/mlp/experts/down_proj/kernel/.zarray b/model/params/model/layers/9/mlp/experts/down_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..dea4c25661820bbcbab3394fdafa9a934335bdbf --- /dev/null +++ b/model/params/model/layers/9/mlp/experts/down_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[32,2880,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128,2880,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/9/mlp/experts/gate_proj/bias/.zarray b/model/params/model/layers/9/mlp/experts/gate_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..cc9594b998fdead47c5dfb2bc2873564428a9a2c --- /dev/null +++ b/model/params/model/layers/9/mlp/experts/gate_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[32,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/9/mlp/experts/gate_proj/kernel/.zarray b/model/params/model/layers/9/mlp/experts/gate_proj/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..dea4c25661820bbcbab3394fdafa9a934335bdbf --- /dev/null +++ b/model/params/model/layers/9/mlp/experts/gate_proj/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[32,2880,2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128,2880,2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/9/mlp/router/bias/.zarray b/model/params/model/layers/9/mlp/router/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..3eeba2a45304285824238b9edd4f261c3d5d6f01 --- /dev/null +++ b/model/params/model/layers/9/mlp/router/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[128],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[128],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/9/mlp/router/kernel/.zarray b/model/params/model/layers/9/mlp/router/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..ea8f6491b31bb855173d1db15e72526f2b085881 --- /dev/null +++ b/model/params/model/layers/9/mlp/router/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[2880,128],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880,128],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/9/post_attention_layernorm/kernel/.zarray b/model/params/model/layers/9/post_attention_layernorm/kernel/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..a6b4a137bbd7791e040a35594e3226e4b578df4d --- /dev/null +++ b/model/params/model/layers/9/post_attention_layernorm/kernel/.zarray @@ -0,0 +1 @@ +{"chunks":[2880],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[2880],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/9/self_attn/k_proj/bias/.zarray b/model/params/model/layers/9/self_attn/k_proj/bias/.zarray new file mode 100644 index 0000000000000000000000000000000000000000..82cda9219c678aba952afd8b130fd0430cff4770 --- /dev/null +++ b/model/params/model/layers/9/self_attn/k_proj/bias/.zarray @@ -0,0 +1 @@ +{"chunks":[512],"compressor":{"id":"zstd","level":1},"dimension_separator":".","dtype":"bfloat16","fill_value":null,"filters":null,"order":"C","shape":[512],"zarr_format":2} \ No newline at end of file diff --git a/model/params/model/layers/9/self_attn/q_proj/bias/0 b/model/params/model/layers/9/self_attn/q_proj/bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..27e609b2e307d832a8bb8f9e7bccd580f0c2aef9 Binary files /dev/null and b/model/params/model/layers/9/self_attn/q_proj/bias/0 differ diff --git a/model/params/model/layers/9/self_attn/sinks/0 b/model/params/model/layers/9/self_attn/sinks/0 new file mode 100644 index 0000000000000000000000000000000000000000..25914e8489ccc0d7ffdce42b44ee20bec1d2fdb8 Binary files /dev/null and b/model/params/model/layers/9/self_attn/sinks/0 differ diff --git a/model/params/model/layers/9/self_attn/v_proj/bias/0 b/model/params/model/layers/9/self_attn/v_proj/bias/0 new file mode 100644 index 0000000000000000000000000000000000000000..38b765090a48170b6ca0188612de60b357c908f3 Binary files /dev/null and b/model/params/model/layers/9/self_attn/v_proj/bias/0 differ diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..6274cc1bd159aa75de771315558e5cac7dd8bea0 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|startoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|return|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tensorstore_index.json b/tensorstore_index.json new file mode 100644 index 0000000000000000000000000000000000000000..20aeb30b2e0515017b0efd29ec46d4403684217b --- /dev/null +++ b/tensorstore_index.json @@ -0,0 +1,5323 @@ +{ + "format": "tensorstore", + "version": "easydel", + "prefixes": { + "model": [ + { + "path": "model/params/model/embed_tokens/embedding", + "shape": [ + 201088, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/0/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/1/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/2/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/3/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/4/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/5/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/6/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/7/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/8/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/9/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/10/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/11/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/12/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/13/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/14/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/15/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/16/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/17/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/18/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/19/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/20/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/21/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/22/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/23/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/24/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/25/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/26/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/27/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/28/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/29/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/30/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/31/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/32/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/33/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/34/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/mlp/router/bias", + "shape": [ + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/mlp/router/kernel", + "shape": [ + 2880, + 128 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/self_attn/k_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/self_attn/k_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/self_attn/o_proj/bias", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/self_attn/o_proj/kernel", + "shape": [ + 4096, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/self_attn/q_proj/bias", + "shape": [ + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/self_attn/q_proj/kernel", + "shape": [ + 2880, + 4096 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/self_attn/sinks", + "shape": [ + 64 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/self_attn/v_proj/bias", + "shape": [ + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/self_attn/v_proj/kernel", + "shape": [ + 2880, + 512 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/mlp/experts/gate_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/mlp/experts/up_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/mlp/experts/gate_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/mlp/experts/up_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/lm_head/kernel", + "shape": [ + 2880, + 201088 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/input_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/mlp/experts/down_proj/kernel", + "shape": [ + 128, + 2880, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/mlp/experts/down_proj/bias", + "shape": [ + 128, + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/layers/35/post_attention_layernorm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + }, + { + "path": "model/params/model/norm/kernel", + "shape": [ + 2880 + ], + "dtype": "bfloat16" + } + ] + } +} \ No newline at end of file