diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..52373fe24473b1aa44333d318f578ae6bf04b49b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a6f79978e9da607de7f395b2edf637afd94f5178 --- /dev/null +++ b/README.md @@ -0,0 +1,39 @@ +--- +language: +- en +- zh +library_name: mlx +license: mit +pipeline_tag: text-generation +tags: +- mlx +base_model: zai-org/GLM-5.1 +--- + +# catalystsec/GLM-5.1-4bit + +This model [catalystsec/GLM-5.1-4bit](https://huggingface.co/catalystsec/GLM-5.1-4bit) was +converted to MLX format from [zai-org/GLM-5.1](https://huggingface.co/zai-org/GLM-5.1) +using mlx-lm version **0.31.2**. + +## Use with mlx + +```bash +pip install mlx-lm +``` + +```python +from mlx_lm import load, generate + +model, tokenizer = load("catalystsec/GLM-5.1-4bit") + +prompt = "hello" + +if tokenizer.chat_template is not None: + messages = [{"role": "user", "content": prompt}] + prompt = tokenizer.apply_chat_template( + messages, add_generation_prompt=True, return_dict=False, + ) + +response = generate(model, tokenizer, prompt=prompt, verbose=True) +``` diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..0093efaa15b9ee3b0d8799ec64933fe0897b6687 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,117 @@ +[gMASK] +{%- if tools -%} +{%- macro tool_to_json(tool) -%} + {%- set ns_tool = namespace(first=true) -%} + {{ '{' -}} + {%- for k, v in tool.items() -%} + {%- if k != 'defer_loading' and k != 'strict' -%} + {%- if not ns_tool.first -%}{{- ', ' -}}{%- endif -%} + {%- set ns_tool.first = false -%} + "{{ k }}": {{ v | tojson(ensure_ascii=False) }} + {%- endif -%} + {%- endfor -%} + {{- '}' -}} +{%- endmacro -%} +<|system|> +# Tools + +You may call one or more functions to assist with the user query. + +You are provided with function signatures within XML tags: + +{% for tool in tools %} +{%- if 'function' in tool -%} + {%- set tool = tool['function'] -%} +{%- endif -%} +{% if tool.defer_loading is not defined or not tool.defer_loading %} +{{ tool_to_json(tool) }} +{% endif %} +{% endfor %} + + +For each function call, output the function name and arguments within the following XML format: +{function-name}{arg-key-1}{arg-value-1}{arg-key-2}{arg-value-2}...{%- endif -%} +{%- macro visible_text(content) -%} + {%- if content is string -%} + {{- content }} + {%- elif content is iterable and content is not mapping -%} + {%- for item in content -%} + {%- if item is mapping and item.type == 'text' -%} + {{- item.text }} + {%- elif item is string -%} + {{- item }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{- content }} + {%- endif -%} +{%- endmacro -%} +{%- set ns = namespace(last_user_index=-1, thinking_indices='') -%} +{%- for m in messages %} + {%- if m.role == 'user' %} + {%- set ns.last_user_index = loop.index0 -%} + {%- elif m.role == 'assistant' %} + {%- if m.reasoning_content is string %} + {%- set ns.thinking_indices = ns.thinking_indices ~ ',' ~ ns.last_user_index ~ ',' -%} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- set ns.has_thinking = false -%} +{%- for m in messages -%} +{%- if m.role == 'user' -%}<|user|>{{ visible_text(m.content) }}{% set ns.has_thinking = (',' ~ loop.index0 ~ ',') in ns.thinking_indices -%} +{%- elif m.role == 'assistant' -%} +<|assistant|> +{%- set content = visible_text(m.content) %} +{%- if m.reasoning_content is string %} + {%- set reasoning_content = m.reasoning_content %} +{%- elif '' in content %} + {%- set reasoning_content = content.split('')[0].split('')[-1] %} + {%- set content = content.split('')[-1] %} +{%- elif loop.index0 > ns.last_user_index and not (enable_thinking is defined and not enable_thinking) %} + {%- set reasoning_content = '' %} +{%- elif loop.index0 < ns.last_user_index and ns.has_thinking %} + {%- set reasoning_content = '' %} +{%- endif %} +{%- if ((clear_thinking is defined and not clear_thinking) or loop.index0 > ns.last_user_index) and reasoning_content is defined -%} +{{ '' + reasoning_content + ''}} +{%- else -%} +{{ '' }} +{%- endif -%} +{%- if content.strip() -%} +{{ content.strip() }} +{%- endif -%} +{% if m.tool_calls %} +{% for tc in m.tool_calls %} +{%- if tc.function %} + {%- set tc = tc.function %} +{%- endif %} +{{- '' + tc.name -}} +{% set _args = tc.arguments %}{% for k, v in _args.items() %}{{ k }}{{ v | tojson(ensure_ascii=False) if v is not string else v }}{% endfor %}{% endfor %} +{% endif %} +{%- elif m.role == 'tool' -%} +{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|observation|>' -}} +{%- endif %} +{%- if m.content is string -%} + {{- '' + m.content + '' -}} +{%- else -%} + {{- '\n' -}} + {% for tr in m.content %} + {%- for tool in tools -%} + {%- if 'function' in tool -%} + {%- set tool = tool['function'] -%} + {%- endif -%} + {%- if tool.name == tr.name -%} + {{- tool_to_json(tool) + '\n' -}} + {%- endif -%} + {%- endfor -%} + {%- endfor -%} + {{- '' -}} +{% endif -%} +{%- elif m.role == 'system' -%} +<|system|>{{ visible_text(m.content) }} +{%- endif -%} +{%- endfor -%} +{%- if add_generation_prompt -%} + <|assistant|>{{- '' if (enable_thinking is defined and not enable_thinking) else '' -}} +{%- endif -%} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..2d74d335e6ec620c9c2daf3366f2abd7b3827cfd --- /dev/null +++ b/config.json @@ -0,0 +1,69 @@ +{ + "architectures": [ + "GlmMoeDsaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "eos_token_id": [ + 154820, + 154827, + 154829 + ], + "ep_size": 1, + "first_k_dense_replace": 3, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 6144, + "index_head_dim": 128, + "index_n_heads": 32, + "index_topk": 2048, + "indexer_rope_interleave": true, + "initializer_range": 0.02, + "intermediate_size": 12288, + "kv_lora_rank": 512, + "max_position_embeddings": 202752, + "model_type": "glm_moe_dsa", + "moe_intermediate_size": 2048, + "moe_layer_freq": 1, + "n_group": 1, + "n_routed_experts": 256, + "n_shared_experts": 1, + "norm_topk_prob": true, + "num_attention_heads": 64, + "num_experts_per_tok": 8, + "num_hidden_layers": 78, + "num_key_value_heads": 64, + "num_nextn_predict_layers": 1, + "pad_token_id": 154820, + "pretraining_tp": 1, + "q_lora_rank": 2048, + "qk_head_dim": 256, + "qk_nope_head_dim": 192, + "qk_rope_head_dim": 64, + "quantization": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "quantization_config": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "rms_norm_eps": 1e-05, + "rope_interleave": true, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "routed_scaling_factor": 2.5, + "scoring_func": "sigmoid", + "tie_word_embeddings": false, + "topk_group": 1, + "topk_method": "noaux_tc", + "transformers_version": "5.4.0", + "use_cache": true, + "v_head_dim": 256, + "vocab_size": 154880 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..453800a061bdc65b75b9dd99ecc66ede543dac89 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "eos_token_id": [ + 154820, + 154827, + 154829 + ], + "pad_token_id": 154820, + "temperature": 1.0, + "top_p": 0.95, + "transformers_version": "5.4.0" +} diff --git a/model-00003-of-00091.safetensors b/model-00003-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0ee6f6e5145a8de86e25ecd9ef55866c623c862a --- /dev/null +++ b/model-00003-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c79099169ceacbb845091a31ce7c5be43be4d45cbb8db9fcc39e0583320aec15 +size 3947717330 diff --git a/model-00006-of-00091.safetensors b/model-00006-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..16d9e8c6e804f8870ff0dc8ca62bc1ff6ee8462b --- /dev/null +++ b/model-00006-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ed74077c6af97a484a61dedf0616f2310738e62ea8530e52f19d34bbfb36e85 +size 5335155728 diff --git a/model-00009-of-00091.safetensors b/model-00009-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..83238a9f0a295657b208445ad6a7601e5fd0843e --- /dev/null +++ b/model-00009-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a0eb85e4927348720709e5f7a4d934cbac7d75012dd16538a613621fb0f6afc +size 3947717362 diff --git a/model-00010-of-00091.safetensors b/model-00010-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2265e1029bda5cb694bebfe722c3423632531f5c --- /dev/null +++ b/model-00010-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad50882d0ace08f77c4086291c095c2a7b0643e45525e5c95b1fba45c6ef3015 +size 5357003398 diff --git a/model-00011-of-00091.safetensors b/model-00011-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..41a0b3aced05539cfd9fe2d588b24f9d7ba39229 --- /dev/null +++ b/model-00011-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4655079c79dee65cc05f8d35bafe8bbb7f7b94fa26a2ae4d11f56a427c6d7f4c +size 3947717358 diff --git a/model-00012-of-00091.safetensors b/model-00012-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6bb4fe4b26d230c11a00f0e3a63faf26333ff17c --- /dev/null +++ b/model-00012-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1748c370ef34c1cf89571bfaf2f3af7d13c0c87bcea170917ddbdf120b7b54b +size 5335155736 diff --git a/model-00014-of-00091.safetensors b/model-00014-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..72f2342a812bc36855b586afc8657d7e268d3c32 --- /dev/null +++ b/model-00014-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9cfa97eb01846afac0d2974b95b0e23dc6beee6d150b890d10c8837c56c7fc8 +size 5357003402 diff --git a/model-00015-of-00091.safetensors b/model-00015-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ba29fcf5d341c16f7e1205552d2517275600e3bc --- /dev/null +++ b/model-00015-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37a1728be9b6b7ebac95bae34850c24f8eff85e774d29c4f9c1978804edac286 +size 3947717380 diff --git a/model-00020-of-00091.safetensors b/model-00020-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4eaa3d84eb8b2a2aba6027199dfb05f2f4c0c8a0 --- /dev/null +++ b/model-00020-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97dc6e5b6dd98a86699648fa43d4d9d6cc7d6eba7c974f42ec792b47bfe02c49 +size 5357003394 diff --git a/model-00022-of-00091.safetensors b/model-00022-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dded5196a06e1cf4dbbdffffeaeb72873e1413b1 --- /dev/null +++ b/model-00022-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:724537dc4971c8e04927caf6f4b3ff9c93e489cac8217574e63e1385acfc9514 +size 5357003398 diff --git a/model-00023-of-00091.safetensors b/model-00023-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0b8f78bfe35178565b120d107df1900289b02332 --- /dev/null +++ b/model-00023-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6752d168ef6dbe5ae8b6aba82bc18c214e4115dfdcb252cacddcbbb5a88ca864 +size 3947717338 diff --git a/model-00025-of-00091.safetensors b/model-00025-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bd0b549cac720be8651058895ad3624e5162711b --- /dev/null +++ b/model-00025-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a4eabed2d32d3abab15e1cbc312b96328391a1eb74a66b23d6153619da7a841 +size 3847053950 diff --git a/model-00026-of-00091.safetensors b/model-00026-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..937a86eabbab5776b364cf141ec98b4442f7ddd4 --- /dev/null +++ b/model-00026-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9689b3cc2ca5c2d7854bfd41a088ef143a4576bb84634a2dcf8377ca72a355d4 +size 5357003402 diff --git a/model-00027-of-00091.safetensors b/model-00027-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e8b3a60f4d5c1f65feea61e86843428a549232a8 --- /dev/null +++ b/model-00027-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6195d8b43945b7bc6bf311acc41f8b5ead7d313ff66088601e913c4bc43083a4 +size 3947717384 diff --git a/model-00028-of-00091.safetensors b/model-00028-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d017f85d94893725e0aeb4c039f5160ff482dcd8 --- /dev/null +++ b/model-00028-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed9180004097c2508fd8a91a0da68045469fc5f6f96e56aa42d4d435bc3ee376 +size 5357003398 diff --git a/model-00029-of-00091.safetensors b/model-00029-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f2c23e4dbf10dc99368332bd944106de136e6af2 --- /dev/null +++ b/model-00029-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4182cbb09c494d74bc045f000eb094bfc88eb78bb5ac23b644f25eaa5ab75e94 +size 3947717260 diff --git a/model-00031-of-00091.safetensors b/model-00031-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3d38864c47dd13ae10ede89a6b3688730beca6fb --- /dev/null +++ b/model-00031-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e316e7e2e360501bed7af89abd2bc1d38522d0bf2aa1a399f8f607e01f280a4a +size 3847053956 diff --git a/model-00034-of-00091.safetensors b/model-00034-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..85432a7dd440c39994f8fef00e15e89b998a34e5 --- /dev/null +++ b/model-00034-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f20a9ee594a13ef549d83057aa53c7296009b4d89c1708b3de11b101ca89e51 +size 5357003398 diff --git a/model-00038-of-00091.safetensors b/model-00038-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e3c12cf6b8bc497b6bbf9429faa01ccdda2a7861 --- /dev/null +++ b/model-00038-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a8a980afc55899182ddc33dd40060f532fc39bb2043186e98c7f29515d2cbc8 +size 5357003400 diff --git a/model-00041-of-00091.safetensors b/model-00041-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cb1e1aee0b121b74a144d7978abf7ca2caa2ff11 --- /dev/null +++ b/model-00041-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08691dc51962f2d806be37e19b3549f1885206e7a87a5ae13c44940a78b4fd4e +size 3947717182 diff --git a/model-00042-of-00091.safetensors b/model-00042-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f206c258eebc8f8947c6da20a1ab1e5cd1d15e4e --- /dev/null +++ b/model-00042-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f337bb03a1f510b78d214ff91cf742af6db9bda80c6bcea44adc9e55cfbbc7e +size 5335155738 diff --git a/model-00047-of-00091.safetensors b/model-00047-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..196304c354e84bc11da4531c73fca5d27a496193 --- /dev/null +++ b/model-00047-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:677ccc4ba65b17648732af64174525eedef343aa50dd27f1e7ed9516b15612f5 +size 3947717206 diff --git a/model-00048-of-00091.safetensors b/model-00048-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9096997f43d71263cc554f059ec658c71d1cceeb --- /dev/null +++ b/model-00048-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fb320895a964353722432ff36a47f8ba9d5ef24f2183d2215e64c6328454428 +size 5335155736 diff --git a/model-00050-of-00091.safetensors b/model-00050-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..263aabec13ecb67e3848f42b32ca47ac0c6c9586 --- /dev/null +++ b/model-00050-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ffd6a5430cfdcd63205facb17acc710d491151595dc3e67b54780c0cf69ba26 +size 5357003402 diff --git a/model-00051-of-00091.safetensors b/model-00051-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2b6567a3613aded54ee584f3e87e8ad588a3ed00 --- /dev/null +++ b/model-00051-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:169c7dea6e4db05cbe4f8d821edfba36e1dea48ed7878ab148c4adb56aa40e2e +size 3947717380 diff --git a/model-00053-of-00091.safetensors b/model-00053-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..337aeb798c29620c7b9f17e095b206078e58156d --- /dev/null +++ b/model-00053-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:073766ec18761334f5bfec190f0727e2bf289ca7100c50210b29733100ea85de +size 3947717308 diff --git a/model-00054-of-00091.safetensors b/model-00054-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..34985a2bf66013a1c715f0123a23d3ef3e96495a --- /dev/null +++ b/model-00054-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:857def919a5f9b78203469c2b72be9e56fc07f05c71dbd0aa356158b08a60aa8 +size 5335155738 diff --git a/model-00055-of-00091.safetensors b/model-00055-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a2ca02e7f8da83eccafbe0573bfcf9864b350a24 --- /dev/null +++ b/model-00055-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bad2bc81a73b14184b240252d9e4bcce4b6692547a1adc8024d3441d0f5c311 +size 3847053918 diff --git a/model-00061-of-00091.safetensors b/model-00061-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c64102736a7be30aaf0e1fec0c8c7e79c79939a6 --- /dev/null +++ b/model-00061-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83fee6c50228c3b690d89d5dfd043849c575856aeaf3155cdff6426588025f90 +size 3847053956 diff --git a/model-00062-of-00091.safetensors b/model-00062-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..88d532e689b8dbdf9e27e98a9f4ceab77212bfe3 --- /dev/null +++ b/model-00062-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db48505e0c38aac1a6121ac6722d93db4937e525b34ff688835f227485845b53 +size 5357003388 diff --git a/model-00063-of-00091.safetensors b/model-00063-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..202ded1f6a01f6a0bc585d89da897a99dbf16653 --- /dev/null +++ b/model-00063-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb9b5a2383a037a85fff72bb619b4012773f9e18127bd16a32f7baf32d00598d +size 3947717356 diff --git a/model-00066-of-00091.safetensors b/model-00066-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c9d9d7a567b18241fae9b3335515d3ecde4defb9 --- /dev/null +++ b/model-00066-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b4a4aa2f51d29c06bf1cf85c1f84a53372a4ae75f2f23329681001bc39ac442 +size 5335155734 diff --git a/model-00067-of-00091.safetensors b/model-00067-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4d4a57d51a7f72ea010e4dcb5ad476afe99429ff --- /dev/null +++ b/model-00067-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6efc115600b3d9ba770c8d8e3445acdc9d3ef32988b1a0e0e3066856e0294cee +size 3847053928 diff --git a/model-00068-of-00091.safetensors b/model-00068-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..beb60e8c340a744289a784d4f994719900c36d4d --- /dev/null +++ b/model-00068-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:772db26190b6f876bb6becf268f1f52d3333ddaa4043f72dffbd8ac53c10090e +size 5357003402 diff --git a/model-00069-of-00091.safetensors b/model-00069-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6ef5f1b630694c15e63df43dac835605754db579 --- /dev/null +++ b/model-00069-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04cd01c84c5841b69cfcdffe816a5ca0a62d5b9c28afc8796f10d46139ba9524 +size 3947717300 diff --git a/model-00070-of-00091.safetensors b/model-00070-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2fc116b0988587e606bb8c670893bf9360e6cd3a --- /dev/null +++ b/model-00070-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64d57bb63e3a6a43b3bd44ce1c1c9813f4955e3e19bc312d17d1a56a4f942c5d +size 5357003398 diff --git a/model-00073-of-00091.safetensors b/model-00073-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ae62f157312f71d8777cac0dd5645216d85e1f33 --- /dev/null +++ b/model-00073-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b223acb29bbf70320df36d588ca3ce782142ecaba848a2976a1e18fd80598aa8 +size 3847053950 diff --git a/model-00075-of-00091.safetensors b/model-00075-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3afd427158e5d90d226c020ee4f66b0c3896be43 --- /dev/null +++ b/model-00075-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:802476b9c73a79f7b4f0e02e3123c114a851de9319b9276db1919ce4c662ca26 +size 3947717358 diff --git a/model-00083-of-00091.safetensors b/model-00083-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..77df620393d54a50ae65ee316ff7a8b791c72e62 --- /dev/null +++ b/model-00083-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15f916e6cf6582bd059b37c319792cf4197c56d95009ab775ffd75cb288d5d61 +size 3947717200 diff --git a/model-00086-of-00091.safetensors b/model-00086-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..976a9addaebaf2a526838d4846651a13e785a6f5 --- /dev/null +++ b/model-00086-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e9fd527cf28fc0cdd57c9771acc817f3910fbf2b6c8be4e3354517a7a0072be +size 5357003402 diff --git a/model-00089-of-00091.safetensors b/model-00089-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f3eef54554c07333f5e61693106c6410e96fc768 --- /dev/null +++ b/model-00089-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69afc20b3d872474e471187e558c086db797aa688f90b7ef6141b3e6c357c6b7 +size 3947717258 diff --git a/model-00090-of-00091.safetensors b/model-00090-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..173ad124995ef7f6d5c4757d9a0d6e5db4f25a72 --- /dev/null +++ b/model-00090-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea48c7ff47af4d152c5737f653563f0db1f4e96bd08aba1d02eddb99fcfd528b +size 5335155736 diff --git a/model-00091-of-00091.safetensors b/model-00091-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8c192d22958f3865379a590439a91faf7042b4b2 --- /dev/null +++ b/model-00091-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13a9a2a609eff3fbb05e40be6cea16de70caeb2dfb122e04896281c73b66cb17 +size 660347957 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..4a0229bf5b85846b2c642afa92a12c0905cb1236 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,4116 @@ +{ + "metadata": { + "total_size": 418621403136, + "total_parameters": 743911218432 + }, + "weight_map": { + "lm_head.biases": "model-00091-of-00091.safetensors", + "lm_head.scales": "model-00091-of-00091.safetensors", + "lm_head.weight": "model-00091-of-00091.safetensors", + "model.embed_tokens.biases": "model-00001-of-00091.safetensors", + "model.embed_tokens.scales": "model-00001-of-00091.safetensors", + "model.embed_tokens.weight": "model-00001-of-00091.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.down_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.down_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.gate_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.gate_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.up_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.up_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.embed_q.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.embed_q.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.embed_q.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.k_norm.bias": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.k_norm.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.weights_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.weights_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.weights_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.wk.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.wk.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.wk.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.wq_b.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.wq_b.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.wq_b.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.kv_a_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.o_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.o_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.q_a_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.q_a_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.q_a_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.q_a_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.q_b_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.q_b_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.q_b_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.unembed_out.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.unembed_out.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.unembed_out.weight": "model-00001-of-00091.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.down_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.down_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.gate_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.gate_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.up_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.up_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.embed_q.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.embed_q.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.embed_q.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.k_norm.bias": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.k_norm.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.weights_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.weights_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.weights_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.wk.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.wk.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.wk.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.wq_b.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.wq_b.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.wq_b.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.kv_a_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.o_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.o_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.q_a_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.q_a_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.q_a_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.q_a_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.q_b_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.q_b_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.q_b_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.unembed_out.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.unembed_out.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.unembed_out.weight": "model-00001-of-00091.safetensors", + "model.layers.10.input_layernorm.weight": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.gate.e_score_correction_bias": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.gate.weight": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.down_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.down_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.down_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.gate_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.gate_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.gate_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.up_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.up_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.up_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.down_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.down_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.down_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.gate_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.gate_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.gate_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.up_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.up_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.up_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00010-of-00091.safetensors", + "model.layers.10.self_attn.embed_q.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.embed_q.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.embed_q.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.k_norm.bias": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.k_norm.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.weights_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.weights_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.weights_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.wk.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.wk.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.wk.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.wq_b.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.wq_b.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.wq_b.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.kv_a_layernorm.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.kv_a_proj_with_mqa.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.kv_a_proj_with_mqa.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.kv_a_proj_with_mqa.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.o_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.o_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.q_a_layernorm.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.q_a_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.q_a_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.q_a_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.q_b_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.q_b_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.q_b_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.unembed_out.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.unembed_out.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.unembed_out.weight": "model-00009-of-00091.safetensors", + "model.layers.11.input_layernorm.weight": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.gate.e_score_correction_bias": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.gate.weight": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.down_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.down_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.down_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.gate_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.gate_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.gate_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.up_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.up_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.up_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.down_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.down_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.down_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.gate_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.gate_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.gate_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.up_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.up_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.up_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00011-of-00091.safetensors", + "model.layers.11.self_attn.embed_q.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.embed_q.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.embed_q.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.k_norm.bias": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.k_norm.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.weights_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.weights_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.weights_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.wk.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.wk.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.wk.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.wq_b.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.wq_b.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.wq_b.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.kv_a_layernorm.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.kv_a_proj_with_mqa.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.kv_a_proj_with_mqa.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.kv_a_proj_with_mqa.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.o_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.o_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.q_a_layernorm.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.q_a_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.q_a_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.q_a_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.q_b_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.q_b_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.q_b_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.unembed_out.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.unembed_out.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.unembed_out.weight": "model-00010-of-00091.safetensors", + "model.layers.12.input_layernorm.weight": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.gate.e_score_correction_bias": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.gate.weight": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.down_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.down_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.down_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.gate_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.gate_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.gate_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.up_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.up_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.up_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.down_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.down_proj.scales": "model-00012-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.down_proj.weight": "model-00012-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.gate_proj.biases": "model-00012-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.gate_proj.scales": "model-00012-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.gate_proj.weight": "model-00012-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.up_proj.biases": "model-00012-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.up_proj.scales": "model-00012-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.up_proj.weight": "model-00012-of-00091.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00013-of-00091.safetensors", + "model.layers.12.self_attn.embed_q.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.embed_q.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.embed_q.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.k_norm.bias": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.k_norm.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.weights_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.weights_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.weights_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.wk.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.wk.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.wk.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.wq_b.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.wq_b.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.wq_b.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.kv_a_layernorm.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.kv_a_proj_with_mqa.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.kv_a_proj_with_mqa.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.kv_a_proj_with_mqa.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.o_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.o_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.q_a_layernorm.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.q_a_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.q_a_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.q_a_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.q_b_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.q_b_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.q_b_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.unembed_out.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.unembed_out.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.unembed_out.weight": "model-00011-of-00091.safetensors", + "model.layers.13.input_layernorm.weight": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.gate.e_score_correction_bias": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.gate.weight": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.down_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.down_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.down_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.gate_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.gate_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.gate_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.up_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.up_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.up_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.down_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.down_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.down_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.gate_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.gate_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.gate_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.up_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.up_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.up_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00014-of-00091.safetensors", + "model.layers.13.self_attn.embed_q.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.embed_q.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.embed_q.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.k_norm.bias": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.k_norm.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.weights_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.weights_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.weights_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.wk.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.wk.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.wk.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.wq_b.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.wq_b.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.wq_b.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.kv_a_layernorm.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.kv_a_proj_with_mqa.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.kv_a_proj_with_mqa.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.kv_a_proj_with_mqa.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.o_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.o_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.q_a_layernorm.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.q_a_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.q_a_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.q_a_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.q_b_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.q_b_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.q_b_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.unembed_out.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.unembed_out.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.unembed_out.weight": "model-00013-of-00091.safetensors", + "model.layers.14.input_layernorm.weight": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.gate.e_score_correction_bias": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.gate.weight": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.down_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.down_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.down_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.gate_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.gate_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.gate_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.up_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.up_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.up_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.down_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.down_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.down_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.gate_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.gate_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.gate_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.up_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.up_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.up_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00015-of-00091.safetensors", + "model.layers.14.self_attn.embed_q.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.embed_q.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.embed_q.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.k_norm.bias": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.k_norm.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.weights_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.weights_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.weights_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.wk.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.wk.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.wk.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.wq_b.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.wq_b.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.wq_b.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.kv_a_layernorm.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.kv_a_proj_with_mqa.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.kv_a_proj_with_mqa.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.kv_a_proj_with_mqa.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.o_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.o_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.q_a_layernorm.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.q_a_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.q_a_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.q_a_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.q_b_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.q_b_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.q_b_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.unembed_out.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.unembed_out.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.unembed_out.weight": "model-00014-of-00091.safetensors", + "model.layers.15.input_layernorm.weight": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.gate.e_score_correction_bias": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.gate.weight": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.down_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.down_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.down_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.gate_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.gate_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.gate_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.up_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.up_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.up_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.down_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.down_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.down_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.gate_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.gate_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.gate_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.up_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.up_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.up_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00016-of-00091.safetensors", + "model.layers.15.self_attn.embed_q.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.embed_q.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.embed_q.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.k_norm.bias": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.k_norm.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.weights_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.weights_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.weights_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.wk.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.wk.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.wk.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.wq_b.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.wq_b.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.wq_b.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.kv_a_layernorm.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.kv_a_proj_with_mqa.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.kv_a_proj_with_mqa.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.kv_a_proj_with_mqa.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.o_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.o_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.q_a_layernorm.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.q_a_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.q_a_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.q_a_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.q_b_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.q_b_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.q_b_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.unembed_out.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.unembed_out.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.unembed_out.weight": "model-00015-of-00091.safetensors", + "model.layers.16.input_layernorm.weight": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.gate.e_score_correction_bias": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.gate.weight": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.down_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.down_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.down_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.gate_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.gate_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.gate_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.up_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.up_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.up_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.down_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.down_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.down_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.gate_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.gate_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.gate_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.up_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.up_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.up_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00017-of-00091.safetensors", + "model.layers.16.self_attn.embed_q.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.embed_q.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.embed_q.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.k_norm.bias": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.k_norm.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.weights_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.weights_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.weights_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.wk.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.wk.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.wk.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.wq_b.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.wq_b.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.wq_b.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.kv_a_layernorm.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.kv_a_proj_with_mqa.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.kv_a_proj_with_mqa.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.kv_a_proj_with_mqa.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.o_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.o_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.q_a_layernorm.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.q_a_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.q_a_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.q_a_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.q_b_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.q_b_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.q_b_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.unembed_out.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.unembed_out.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.unembed_out.weight": "model-00016-of-00091.safetensors", + "model.layers.17.input_layernorm.weight": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.gate.e_score_correction_bias": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.gate.weight": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.down_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.down_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.down_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.gate_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.gate_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.gate_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.up_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.up_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.up_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.down_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.down_proj.scales": "model-00018-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.down_proj.weight": "model-00018-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.gate_proj.biases": "model-00018-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.gate_proj.scales": "model-00018-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.gate_proj.weight": "model-00018-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.up_proj.biases": "model-00018-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.up_proj.scales": "model-00018-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.up_proj.weight": "model-00018-of-00091.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00019-of-00091.safetensors", + "model.layers.17.self_attn.embed_q.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.embed_q.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.embed_q.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.k_norm.bias": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.k_norm.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.weights_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.weights_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.weights_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.wk.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.wk.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.wk.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.wq_b.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.wq_b.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.wq_b.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.kv_a_layernorm.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.kv_a_proj_with_mqa.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.kv_a_proj_with_mqa.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.kv_a_proj_with_mqa.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.o_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.o_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.q_a_layernorm.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.q_a_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.q_a_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.q_a_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.q_b_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.q_b_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.q_b_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.unembed_out.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.unembed_out.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.unembed_out.weight": "model-00017-of-00091.safetensors", + "model.layers.18.input_layernorm.weight": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.gate.e_score_correction_bias": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.gate.weight": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.down_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.down_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.down_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.gate_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.gate_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.gate_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.up_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.up_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.up_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.down_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.down_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.down_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.gate_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.gate_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.gate_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.up_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.up_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.up_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00020-of-00091.safetensors", + "model.layers.18.self_attn.embed_q.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.embed_q.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.embed_q.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.k_norm.bias": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.k_norm.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.weights_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.weights_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.weights_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.wk.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.wk.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.wk.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.wq_b.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.wq_b.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.wq_b.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.kv_a_layernorm.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.kv_a_proj_with_mqa.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.kv_a_proj_with_mqa.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.kv_a_proj_with_mqa.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.o_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.o_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.q_a_layernorm.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.q_a_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.q_a_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.q_a_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.q_b_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.q_b_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.q_b_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.unembed_out.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.unembed_out.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.unembed_out.weight": "model-00019-of-00091.safetensors", + "model.layers.19.input_layernorm.weight": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.gate.e_score_correction_bias": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.gate.weight": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.down_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.down_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.down_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.gate_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.gate_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.gate_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.up_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.up_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.up_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.down_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.down_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.down_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.gate_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.gate_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.gate_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.up_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.up_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.up_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00021-of-00091.safetensors", + "model.layers.19.self_attn.embed_q.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.embed_q.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.embed_q.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.k_norm.bias": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.k_norm.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.weights_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.weights_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.weights_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.wk.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.wk.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.wk.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.wq_b.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.wq_b.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.wq_b.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.kv_a_layernorm.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.kv_a_proj_with_mqa.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.kv_a_proj_with_mqa.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.kv_a_proj_with_mqa.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.o_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.o_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.q_a_layernorm.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.q_a_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.q_a_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.q_a_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.q_b_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.q_b_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.q_b_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.unembed_out.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.unembed_out.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.unembed_out.weight": "model-00020-of-00091.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.down_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.down_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.gate_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.gate_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.up_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.up_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.embed_q.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.embed_q.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.embed_q.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.k_norm.bias": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.k_norm.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.weights_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.weights_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.weights_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.wk.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.wk.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.wk.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.wq_b.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.wq_b.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.wq_b.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.kv_a_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.o_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.o_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.q_a_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.q_a_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.q_a_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.q_a_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.q_b_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.q_b_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.q_b_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.unembed_out.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.unembed_out.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.unembed_out.weight": "model-00001-of-00091.safetensors", + "model.layers.20.input_layernorm.weight": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.gate.e_score_correction_bias": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.gate.weight": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.down_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.down_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.down_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.gate_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.gate_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.gate_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.up_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.up_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.up_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.down_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.down_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.down_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.gate_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.gate_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.gate_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.up_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.up_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.up_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00022-of-00091.safetensors", + "model.layers.20.self_attn.embed_q.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.embed_q.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.embed_q.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.k_norm.bias": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.k_norm.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.weights_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.weights_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.weights_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.wk.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.wk.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.wk.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.wq_b.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.wq_b.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.wq_b.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.kv_a_layernorm.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.kv_a_proj_with_mqa.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.kv_a_proj_with_mqa.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.kv_a_proj_with_mqa.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.o_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.o_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.q_a_layernorm.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.q_a_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.q_a_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.q_a_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.q_b_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.q_b_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.q_b_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.unembed_out.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.unembed_out.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.unembed_out.weight": "model-00021-of-00091.safetensors", + "model.layers.21.input_layernorm.weight": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.gate.e_score_correction_bias": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.gate.weight": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.down_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.down_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.down_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.gate_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.gate_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.gate_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.up_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.up_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.up_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.down_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.down_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.down_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.gate_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.gate_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.gate_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.up_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.up_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.up_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00023-of-00091.safetensors", + "model.layers.21.self_attn.embed_q.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.embed_q.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.embed_q.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.k_norm.bias": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.k_norm.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.weights_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.weights_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.weights_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.wk.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.wk.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.wk.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.wq_b.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.wq_b.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.wq_b.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.kv_a_layernorm.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.kv_a_proj_with_mqa.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.kv_a_proj_with_mqa.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.kv_a_proj_with_mqa.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.o_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.o_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.q_a_layernorm.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.q_a_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.q_a_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.q_a_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.q_b_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.q_b_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.q_b_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.unembed_out.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.unembed_out.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.unembed_out.weight": "model-00022-of-00091.safetensors", + "model.layers.22.input_layernorm.weight": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.gate.e_score_correction_bias": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.gate.weight": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.down_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.down_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.down_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.gate_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.gate_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.gate_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.up_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.up_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.up_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.down_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.down_proj.scales": "model-00024-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.down_proj.weight": "model-00024-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.gate_proj.biases": "model-00024-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.gate_proj.scales": "model-00024-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.gate_proj.weight": "model-00024-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.up_proj.biases": "model-00024-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.up_proj.scales": "model-00024-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.up_proj.weight": "model-00024-of-00091.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00025-of-00091.safetensors", + "model.layers.22.self_attn.embed_q.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.embed_q.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.embed_q.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.k_norm.bias": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.k_norm.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.weights_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.weights_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.weights_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.wk.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.wk.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.wk.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.wq_b.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.wq_b.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.wq_b.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.kv_a_layernorm.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.kv_a_proj_with_mqa.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.kv_a_proj_with_mqa.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.kv_a_proj_with_mqa.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.o_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.o_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.q_a_layernorm.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.q_a_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.q_a_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.q_a_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.q_b_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.q_b_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.q_b_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.unembed_out.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.unembed_out.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.unembed_out.weight": "model-00023-of-00091.safetensors", + "model.layers.23.input_layernorm.weight": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.gate.e_score_correction_bias": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.gate.weight": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.down_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.down_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.down_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.gate_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.gate_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.gate_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.up_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.up_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.up_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.down_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.down_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.down_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.gate_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.gate_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.gate_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.up_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.up_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.up_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00026-of-00091.safetensors", + "model.layers.23.self_attn.embed_q.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.embed_q.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.embed_q.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.k_norm.bias": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.k_norm.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.weights_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.weights_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.weights_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.wk.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.wk.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.wk.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.wq_b.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.wq_b.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.wq_b.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.kv_a_layernorm.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.kv_a_proj_with_mqa.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.kv_a_proj_with_mqa.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.kv_a_proj_with_mqa.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.o_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.o_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.q_a_layernorm.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.q_a_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.q_a_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.q_a_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.q_b_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.q_b_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.q_b_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.unembed_out.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.unembed_out.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.unembed_out.weight": "model-00025-of-00091.safetensors", + "model.layers.24.input_layernorm.weight": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.gate.e_score_correction_bias": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.gate.weight": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.down_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.down_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.down_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.gate_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.gate_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.gate_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.up_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.up_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.up_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.down_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.down_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.down_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.gate_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.gate_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.gate_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.up_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.up_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.up_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00027-of-00091.safetensors", + "model.layers.24.self_attn.embed_q.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.embed_q.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.embed_q.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.k_norm.bias": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.k_norm.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.weights_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.weights_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.weights_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.wk.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.wk.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.wk.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.wq_b.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.wq_b.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.wq_b.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.kv_a_layernorm.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.kv_a_proj_with_mqa.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.kv_a_proj_with_mqa.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.kv_a_proj_with_mqa.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.o_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.o_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.q_a_layernorm.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.q_a_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.q_a_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.q_a_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.q_b_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.q_b_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.q_b_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.unembed_out.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.unembed_out.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.unembed_out.weight": "model-00026-of-00091.safetensors", + "model.layers.25.input_layernorm.weight": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.gate.e_score_correction_bias": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.gate.weight": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.down_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.down_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.down_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.gate_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.gate_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.gate_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.up_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.up_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.up_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.down_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.down_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.down_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.gate_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.gate_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.gate_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.up_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.up_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.up_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00028-of-00091.safetensors", + "model.layers.25.self_attn.embed_q.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.embed_q.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.embed_q.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.k_norm.bias": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.k_norm.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.weights_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.weights_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.weights_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.wk.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.wk.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.wk.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.wq_b.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.wq_b.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.wq_b.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.kv_a_layernorm.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.kv_a_proj_with_mqa.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.kv_a_proj_with_mqa.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.kv_a_proj_with_mqa.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.o_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.o_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.q_a_layernorm.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.q_a_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.q_a_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.q_a_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.q_b_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.q_b_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.q_b_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.unembed_out.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.unembed_out.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.unembed_out.weight": "model-00027-of-00091.safetensors", + "model.layers.26.input_layernorm.weight": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.gate.e_score_correction_bias": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.gate.weight": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.down_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.down_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.down_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.gate_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.gate_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.gate_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.up_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.up_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.up_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.down_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.down_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.down_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.gate_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.gate_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.gate_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.up_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.up_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.up_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00029-of-00091.safetensors", + "model.layers.26.self_attn.embed_q.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.embed_q.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.embed_q.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.k_norm.bias": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.k_norm.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.weights_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.weights_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.weights_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.wk.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.wk.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.wk.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.wq_b.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.wq_b.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.wq_b.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.kv_a_layernorm.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.kv_a_proj_with_mqa.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.kv_a_proj_with_mqa.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.kv_a_proj_with_mqa.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.o_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.o_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.q_a_layernorm.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.q_a_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.q_a_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.q_a_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.q_b_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.q_b_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.q_b_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.unembed_out.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.unembed_out.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.unembed_out.weight": "model-00028-of-00091.safetensors", + "model.layers.27.input_layernorm.weight": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.gate.e_score_correction_bias": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.gate.weight": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.down_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.down_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.down_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.gate_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.gate_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.gate_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.up_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.up_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.up_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.down_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.down_proj.scales": "model-00030-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.down_proj.weight": "model-00030-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.gate_proj.biases": "model-00030-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.gate_proj.scales": "model-00030-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.gate_proj.weight": "model-00030-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.up_proj.biases": "model-00030-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.up_proj.scales": "model-00030-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.up_proj.weight": "model-00030-of-00091.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00031-of-00091.safetensors", + "model.layers.27.self_attn.embed_q.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.embed_q.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.embed_q.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.k_norm.bias": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.k_norm.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.weights_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.weights_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.weights_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.wk.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.wk.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.wk.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.wq_b.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.wq_b.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.wq_b.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.kv_a_layernorm.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.kv_a_proj_with_mqa.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.kv_a_proj_with_mqa.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.kv_a_proj_with_mqa.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.o_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.o_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.q_a_layernorm.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.q_a_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.q_a_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.q_a_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.q_b_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.q_b_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.q_b_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.unembed_out.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.unembed_out.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.unembed_out.weight": "model-00029-of-00091.safetensors", + "model.layers.28.input_layernorm.weight": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.gate.e_score_correction_bias": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.gate.weight": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.down_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.down_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.down_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.gate_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.gate_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.gate_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.up_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.up_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.up_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.down_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.down_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.down_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.gate_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.gate_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.gate_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.up_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.up_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.up_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00032-of-00091.safetensors", + "model.layers.28.self_attn.embed_q.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.embed_q.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.embed_q.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.k_norm.bias": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.k_norm.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.weights_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.weights_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.weights_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.wk.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.wk.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.wk.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.wq_b.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.wq_b.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.wq_b.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.kv_a_layernorm.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.kv_a_proj_with_mqa.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.kv_a_proj_with_mqa.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.kv_a_proj_with_mqa.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.o_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.o_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.q_a_layernorm.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.q_a_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.q_a_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.q_a_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.q_b_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.q_b_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.q_b_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.unembed_out.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.unembed_out.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.unembed_out.weight": "model-00031-of-00091.safetensors", + "model.layers.29.input_layernorm.weight": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.gate.e_score_correction_bias": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.gate.weight": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.down_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.down_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.down_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.gate_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.gate_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.gate_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.up_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.up_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.up_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.down_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.down_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.down_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.gate_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.gate_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.gate_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.up_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.up_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.up_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00033-of-00091.safetensors", + "model.layers.29.self_attn.embed_q.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.embed_q.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.embed_q.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.k_norm.bias": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.k_norm.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.weights_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.weights_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.weights_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.wk.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.wk.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.wk.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.wq_b.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.wq_b.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.wq_b.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.kv_a_layernorm.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.kv_a_proj_with_mqa.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.kv_a_proj_with_mqa.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.kv_a_proj_with_mqa.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.o_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.o_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.q_a_layernorm.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.q_a_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.q_a_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.q_a_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.q_b_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.q_b_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.q_b_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.unembed_out.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.unembed_out.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.unembed_out.weight": "model-00032-of-00091.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.gate.e_score_correction_bias": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.gate.weight": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.down_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.down_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.down_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.gate_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.gate_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.gate_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.up_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.up_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.up_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.down_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.down_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.down_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.gate_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.gate_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.gate_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.up_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.up_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.up_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00091.safetensors", + "model.layers.3.self_attn.embed_q.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.embed_q.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.embed_q.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.k_norm.bias": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.k_norm.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.weights_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.weights_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.weights_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.wk.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.wk.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.wk.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.wq_b.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.wq_b.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.wq_b.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.kv_a_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.o_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.o_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.q_a_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.q_a_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.q_a_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.q_a_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.q_b_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.q_b_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.q_b_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.unembed_out.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.unembed_out.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.unembed_out.weight": "model-00001-of-00091.safetensors", + "model.layers.30.input_layernorm.weight": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.gate.e_score_correction_bias": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.gate.weight": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.down_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.down_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.down_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.gate_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.gate_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.gate_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.up_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.up_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.up_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.down_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.down_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.down_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.gate_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.gate_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.gate_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.up_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.up_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.up_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00034-of-00091.safetensors", + "model.layers.30.self_attn.embed_q.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.embed_q.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.embed_q.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.k_norm.bias": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.k_norm.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.weights_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.weights_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.weights_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.wk.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.wk.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.wk.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.wq_b.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.wq_b.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.wq_b.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.kv_a_layernorm.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.kv_a_proj_with_mqa.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.kv_a_proj_with_mqa.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.kv_a_proj_with_mqa.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.o_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.o_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.q_a_layernorm.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.q_a_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.q_a_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.q_a_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.q_b_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.q_b_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.q_b_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.unembed_out.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.unembed_out.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.unembed_out.weight": "model-00033-of-00091.safetensors", + "model.layers.31.input_layernorm.weight": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.gate.e_score_correction_bias": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.gate.weight": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.down_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.down_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.down_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.gate_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.gate_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.gate_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.up_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.up_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.up_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.down_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.down_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.down_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.gate_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.gate_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.gate_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.up_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.up_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.up_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00035-of-00091.safetensors", + "model.layers.31.self_attn.embed_q.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.embed_q.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.embed_q.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.k_norm.bias": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.k_norm.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.weights_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.weights_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.weights_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.wk.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.wk.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.wk.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.wq_b.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.wq_b.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.wq_b.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.kv_a_layernorm.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.kv_a_proj_with_mqa.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.kv_a_proj_with_mqa.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.kv_a_proj_with_mqa.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.o_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.o_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.q_a_layernorm.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.q_a_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.q_a_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.q_a_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.q_b_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.q_b_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.q_b_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.unembed_out.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.unembed_out.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.unembed_out.weight": "model-00034-of-00091.safetensors", + "model.layers.32.input_layernorm.weight": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.gate.e_score_correction_bias": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.gate.weight": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.down_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.down_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.down_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.gate_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.gate_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.gate_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.up_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.up_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.up_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.down_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.down_proj.scales": "model-00036-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.down_proj.weight": "model-00036-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.gate_proj.biases": "model-00036-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.gate_proj.scales": "model-00036-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.gate_proj.weight": "model-00036-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.up_proj.biases": "model-00036-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.up_proj.scales": "model-00036-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.up_proj.weight": "model-00036-of-00091.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00037-of-00091.safetensors", + "model.layers.32.self_attn.embed_q.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.embed_q.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.embed_q.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.k_norm.bias": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.k_norm.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.weights_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.weights_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.weights_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.wk.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.wk.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.wk.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.wq_b.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.wq_b.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.wq_b.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.kv_a_layernorm.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.kv_a_proj_with_mqa.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.kv_a_proj_with_mqa.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.kv_a_proj_with_mqa.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.o_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.o_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.q_a_layernorm.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.q_a_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.q_a_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.q_a_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.q_b_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.q_b_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.q_b_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.unembed_out.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.unembed_out.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.unembed_out.weight": "model-00035-of-00091.safetensors", + "model.layers.33.input_layernorm.weight": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.gate.e_score_correction_bias": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.gate.weight": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.down_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.down_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.down_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.gate_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.gate_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.gate_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.up_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.up_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.up_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.down_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.down_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.down_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.gate_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.gate_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.gate_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.up_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.up_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.up_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00038-of-00091.safetensors", + "model.layers.33.self_attn.embed_q.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.embed_q.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.embed_q.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.k_norm.bias": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.k_norm.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.weights_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.weights_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.weights_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.wk.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.wk.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.wk.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.wq_b.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.wq_b.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.wq_b.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.kv_a_layernorm.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.kv_a_proj_with_mqa.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.kv_a_proj_with_mqa.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.kv_a_proj_with_mqa.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.o_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.o_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.q_a_layernorm.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.q_a_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.q_a_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.q_a_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.q_b_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.q_b_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.q_b_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.unembed_out.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.unembed_out.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.unembed_out.weight": "model-00037-of-00091.safetensors", + "model.layers.34.input_layernorm.weight": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.gate.e_score_correction_bias": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.gate.weight": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.down_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.down_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.down_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.gate_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.gate_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.gate_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.up_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.up_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.up_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.down_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.down_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.down_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.gate_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.gate_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.gate_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.up_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.up_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.up_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00039-of-00091.safetensors", + "model.layers.34.self_attn.embed_q.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.embed_q.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.embed_q.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.k_norm.bias": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.k_norm.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.weights_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.weights_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.weights_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.wk.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.wk.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.wk.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.wq_b.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.wq_b.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.wq_b.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.kv_a_layernorm.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.kv_a_proj_with_mqa.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.kv_a_proj_with_mqa.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.kv_a_proj_with_mqa.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.o_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.o_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.q_a_layernorm.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.q_a_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.q_a_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.q_a_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.q_b_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.q_b_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.q_b_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.unembed_out.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.unembed_out.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.unembed_out.weight": "model-00038-of-00091.safetensors", + "model.layers.35.input_layernorm.weight": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.gate.e_score_correction_bias": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.gate.weight": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.down_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.down_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.down_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.gate_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.gate_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.gate_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.up_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.up_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.up_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.down_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.down_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.down_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.gate_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.gate_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.gate_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.up_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.up_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.up_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00040-of-00091.safetensors", + "model.layers.35.self_attn.embed_q.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.embed_q.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.embed_q.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.k_norm.bias": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.k_norm.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.weights_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.weights_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.weights_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.wk.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.wk.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.wk.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.wq_b.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.wq_b.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.wq_b.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.kv_a_layernorm.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.kv_a_proj_with_mqa.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.kv_a_proj_with_mqa.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.kv_a_proj_with_mqa.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.o_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.o_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.q_a_layernorm.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.q_a_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.q_a_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.q_a_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.q_b_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.q_b_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.q_b_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.unembed_out.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.unembed_out.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.unembed_out.weight": "model-00039-of-00091.safetensors", + "model.layers.36.input_layernorm.weight": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.gate.e_score_correction_bias": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.gate.weight": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.down_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.down_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.down_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.gate_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.gate_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.gate_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.up_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.up_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.up_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.down_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.down_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.down_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.gate_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.gate_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.gate_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.up_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.up_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.up_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.36.post_attention_layernorm.weight": "model-00041-of-00091.safetensors", + "model.layers.36.self_attn.embed_q.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.embed_q.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.embed_q.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.k_norm.bias": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.k_norm.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.weights_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.weights_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.weights_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.wk.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.wk.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.wk.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.wq_b.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.wq_b.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.wq_b.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.kv_a_layernorm.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.kv_a_proj_with_mqa.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.kv_a_proj_with_mqa.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.kv_a_proj_with_mqa.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.o_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.o_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.o_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.q_a_layernorm.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.q_a_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.q_a_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.q_a_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.q_b_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.q_b_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.q_b_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.unembed_out.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.unembed_out.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.unembed_out.weight": "model-00040-of-00091.safetensors", + "model.layers.37.input_layernorm.weight": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.gate.e_score_correction_bias": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.gate.weight": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.down_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.down_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.down_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.gate_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.gate_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.gate_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.up_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.up_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.up_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.down_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.down_proj.scales": "model-00042-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.down_proj.weight": "model-00042-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.gate_proj.biases": "model-00042-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.gate_proj.scales": "model-00042-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.gate_proj.weight": "model-00042-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.up_proj.biases": "model-00042-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.up_proj.scales": "model-00042-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.up_proj.weight": "model-00042-of-00091.safetensors", + "model.layers.37.post_attention_layernorm.weight": "model-00043-of-00091.safetensors", + "model.layers.37.self_attn.embed_q.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.embed_q.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.embed_q.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.k_norm.bias": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.k_norm.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.weights_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.weights_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.weights_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.wk.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.wk.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.wk.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.wq_b.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.wq_b.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.wq_b.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.kv_a_layernorm.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.kv_a_proj_with_mqa.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.kv_a_proj_with_mqa.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.kv_a_proj_with_mqa.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.o_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.o_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.o_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.q_a_layernorm.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.q_a_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.q_a_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.q_a_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.q_b_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.q_b_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.q_b_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.unembed_out.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.unembed_out.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.unembed_out.weight": "model-00041-of-00091.safetensors", + "model.layers.38.input_layernorm.weight": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.gate.e_score_correction_bias": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.gate.weight": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.down_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.down_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.down_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.gate_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.gate_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.gate_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.up_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.up_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.up_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.down_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.down_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.down_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.gate_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.gate_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.gate_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.up_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.up_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.up_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.38.post_attention_layernorm.weight": "model-00044-of-00091.safetensors", + "model.layers.38.self_attn.embed_q.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.embed_q.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.embed_q.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.k_norm.bias": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.k_norm.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.weights_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.weights_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.weights_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.wk.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.wk.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.wk.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.wq_b.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.wq_b.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.wq_b.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.kv_a_layernorm.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.kv_a_proj_with_mqa.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.kv_a_proj_with_mqa.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.kv_a_proj_with_mqa.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.o_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.o_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.o_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.q_a_layernorm.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.q_a_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.q_a_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.q_a_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.q_b_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.q_b_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.q_b_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.unembed_out.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.unembed_out.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.unembed_out.weight": "model-00043-of-00091.safetensors", + "model.layers.39.input_layernorm.weight": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.gate.e_score_correction_bias": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.gate.weight": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.down_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.down_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.down_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.gate_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.gate_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.gate_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.up_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.up_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.up_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.down_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.down_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.down_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.gate_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.gate_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.gate_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.up_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.up_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.up_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.39.post_attention_layernorm.weight": "model-00045-of-00091.safetensors", + "model.layers.39.self_attn.embed_q.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.embed_q.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.embed_q.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.k_norm.bias": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.k_norm.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.weights_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.weights_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.weights_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.wk.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.wk.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.wk.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.wq_b.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.wq_b.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.wq_b.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.kv_a_layernorm.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.kv_a_proj_with_mqa.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.kv_a_proj_with_mqa.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.kv_a_proj_with_mqa.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.o_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.o_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.o_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.q_a_layernorm.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.q_a_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.q_a_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.q_a_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.q_b_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.q_b_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.q_b_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.unembed_out.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.unembed_out.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.unembed_out.weight": "model-00044-of-00091.safetensors", + "model.layers.4.input_layernorm.weight": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.gate.e_score_correction_bias": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.gate.weight": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.down_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.down_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.down_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.gate_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.gate_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.gate_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.up_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.up_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.up_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.down_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.down_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.down_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.gate_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.gate_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.gate_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.up_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.up_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.up_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00091.safetensors", + "model.layers.4.self_attn.embed_q.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.embed_q.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.embed_q.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.k_norm.bias": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.k_norm.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.weights_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.weights_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.weights_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.wk.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.wk.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.wk.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.wq_b.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.wq_b.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.wq_b.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.kv_a_layernorm.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.kv_a_proj_with_mqa.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.kv_a_proj_with_mqa.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.kv_a_proj_with_mqa.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.o_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.o_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.q_a_layernorm.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.q_a_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.q_a_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.q_a_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.q_b_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.q_b_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.q_b_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.unembed_out.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.unembed_out.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.unembed_out.weight": "model-00002-of-00091.safetensors", + "model.layers.40.input_layernorm.weight": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.gate.e_score_correction_bias": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.gate.weight": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.down_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.down_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.down_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.gate_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.gate_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.gate_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.up_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.up_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.up_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.down_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.down_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.down_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.gate_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.gate_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.gate_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.up_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.up_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.up_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.40.post_attention_layernorm.weight": "model-00046-of-00091.safetensors", + "model.layers.40.self_attn.embed_q.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.embed_q.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.embed_q.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.k_norm.bias": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.k_norm.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.weights_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.weights_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.weights_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.wk.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.wk.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.wk.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.wq_b.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.wq_b.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.wq_b.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.kv_a_layernorm.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.kv_a_proj_with_mqa.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.kv_a_proj_with_mqa.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.kv_a_proj_with_mqa.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.o_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.o_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.o_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.q_a_layernorm.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.q_a_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.q_a_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.q_a_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.q_b_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.q_b_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.q_b_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.unembed_out.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.unembed_out.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.unembed_out.weight": "model-00045-of-00091.safetensors", + "model.layers.41.input_layernorm.weight": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.gate.e_score_correction_bias": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.gate.weight": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.down_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.down_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.down_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.gate_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.gate_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.gate_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.up_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.up_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.up_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.down_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.down_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.down_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.gate_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.gate_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.gate_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.up_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.up_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.up_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.41.post_attention_layernorm.weight": "model-00047-of-00091.safetensors", + "model.layers.41.self_attn.embed_q.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.embed_q.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.embed_q.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.k_norm.bias": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.k_norm.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.weights_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.weights_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.weights_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.wk.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.wk.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.wk.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.wq_b.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.wq_b.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.wq_b.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.kv_a_layernorm.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.kv_a_proj_with_mqa.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.kv_a_proj_with_mqa.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.kv_a_proj_with_mqa.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.o_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.o_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.o_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.q_a_layernorm.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.q_a_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.q_a_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.q_a_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.q_b_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.q_b_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.q_b_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.unembed_out.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.unembed_out.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.unembed_out.weight": "model-00046-of-00091.safetensors", + "model.layers.42.input_layernorm.weight": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.gate.e_score_correction_bias": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.gate.weight": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.down_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.down_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.down_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.gate_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.gate_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.gate_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.up_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.up_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.up_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.down_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.down_proj.scales": "model-00048-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.down_proj.weight": "model-00048-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.gate_proj.biases": "model-00048-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.gate_proj.scales": "model-00048-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.gate_proj.weight": "model-00048-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.up_proj.biases": "model-00048-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.up_proj.scales": "model-00048-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.up_proj.weight": "model-00048-of-00091.safetensors", + "model.layers.42.post_attention_layernorm.weight": "model-00049-of-00091.safetensors", + "model.layers.42.self_attn.embed_q.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.embed_q.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.embed_q.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.k_norm.bias": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.k_norm.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.weights_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.weights_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.weights_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.wk.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.wk.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.wk.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.wq_b.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.wq_b.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.wq_b.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.kv_a_layernorm.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.kv_a_proj_with_mqa.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.kv_a_proj_with_mqa.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.kv_a_proj_with_mqa.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.o_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.o_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.o_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.q_a_layernorm.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.q_a_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.q_a_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.q_a_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.q_b_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.q_b_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.q_b_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.unembed_out.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.unembed_out.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.unembed_out.weight": "model-00047-of-00091.safetensors", + "model.layers.43.input_layernorm.weight": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.gate.e_score_correction_bias": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.gate.weight": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.down_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.down_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.down_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.gate_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.gate_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.gate_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.up_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.up_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.up_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.down_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.down_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.down_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.gate_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.gate_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.gate_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.up_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.up_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.up_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.43.post_attention_layernorm.weight": "model-00050-of-00091.safetensors", + "model.layers.43.self_attn.embed_q.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.embed_q.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.embed_q.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.k_norm.bias": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.k_norm.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.weights_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.weights_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.weights_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.wk.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.wk.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.wk.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.wq_b.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.wq_b.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.wq_b.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.kv_a_layernorm.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.kv_a_proj_with_mqa.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.kv_a_proj_with_mqa.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.kv_a_proj_with_mqa.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.o_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.o_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.o_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.q_a_layernorm.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.q_a_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.q_a_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.q_a_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.q_b_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.q_b_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.q_b_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.unembed_out.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.unembed_out.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.unembed_out.weight": "model-00049-of-00091.safetensors", + "model.layers.44.input_layernorm.weight": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.gate.e_score_correction_bias": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.gate.weight": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.down_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.down_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.down_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.gate_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.gate_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.gate_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.up_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.up_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.up_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.down_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.down_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.down_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.gate_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.gate_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.gate_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.up_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.up_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.up_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.44.post_attention_layernorm.weight": "model-00051-of-00091.safetensors", + "model.layers.44.self_attn.embed_q.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.embed_q.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.embed_q.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.k_norm.bias": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.k_norm.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.weights_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.weights_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.weights_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.wk.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.wk.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.wk.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.wq_b.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.wq_b.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.wq_b.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.kv_a_layernorm.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.kv_a_proj_with_mqa.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.kv_a_proj_with_mqa.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.kv_a_proj_with_mqa.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.o_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.o_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.o_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.q_a_layernorm.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.q_a_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.q_a_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.q_a_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.q_b_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.q_b_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.q_b_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.unembed_out.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.unembed_out.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.unembed_out.weight": "model-00050-of-00091.safetensors", + "model.layers.45.input_layernorm.weight": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.gate.e_score_correction_bias": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.gate.weight": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.down_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.down_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.down_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.gate_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.gate_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.gate_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.up_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.up_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.up_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.down_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.down_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.down_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.gate_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.gate_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.gate_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.up_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.up_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.up_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.45.post_attention_layernorm.weight": "model-00052-of-00091.safetensors", + "model.layers.45.self_attn.embed_q.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.embed_q.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.embed_q.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.k_norm.bias": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.k_norm.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.weights_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.weights_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.weights_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.wk.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.wk.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.wk.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.wq_b.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.wq_b.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.wq_b.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.kv_a_layernorm.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.kv_a_proj_with_mqa.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.kv_a_proj_with_mqa.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.kv_a_proj_with_mqa.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.o_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.o_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.o_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.q_a_layernorm.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.q_a_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.q_a_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.q_a_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.q_b_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.q_b_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.q_b_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.unembed_out.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.unembed_out.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.unembed_out.weight": "model-00051-of-00091.safetensors", + "model.layers.46.input_layernorm.weight": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.gate.e_score_correction_bias": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.gate.weight": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.down_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.down_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.down_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.gate_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.gate_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.gate_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.up_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.up_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.up_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.down_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.down_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.down_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.gate_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.gate_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.gate_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.up_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.up_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.up_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.46.post_attention_layernorm.weight": "model-00053-of-00091.safetensors", + "model.layers.46.self_attn.embed_q.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.embed_q.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.embed_q.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.k_norm.bias": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.k_norm.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.weights_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.weights_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.weights_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.wk.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.wk.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.wk.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.wq_b.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.wq_b.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.wq_b.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.kv_a_layernorm.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.kv_a_proj_with_mqa.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.kv_a_proj_with_mqa.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.kv_a_proj_with_mqa.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.o_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.o_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.o_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.q_a_layernorm.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.q_a_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.q_a_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.q_a_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.q_b_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.q_b_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.q_b_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.unembed_out.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.unembed_out.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.unembed_out.weight": "model-00052-of-00091.safetensors", + "model.layers.47.input_layernorm.weight": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.gate.e_score_correction_bias": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.gate.weight": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.down_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.down_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.down_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.gate_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.gate_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.gate_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.up_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.up_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.up_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.down_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.down_proj.scales": "model-00054-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.down_proj.weight": "model-00054-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.gate_proj.biases": "model-00054-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.gate_proj.scales": "model-00054-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.gate_proj.weight": "model-00054-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.up_proj.biases": "model-00054-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.up_proj.scales": "model-00054-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.up_proj.weight": "model-00054-of-00091.safetensors", + "model.layers.47.post_attention_layernorm.weight": "model-00055-of-00091.safetensors", + "model.layers.47.self_attn.embed_q.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.embed_q.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.embed_q.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.k_norm.bias": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.k_norm.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.weights_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.weights_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.weights_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.wk.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.wk.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.wk.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.wq_b.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.wq_b.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.wq_b.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.kv_a_layernorm.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.kv_a_proj_with_mqa.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.kv_a_proj_with_mqa.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.kv_a_proj_with_mqa.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.o_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.o_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.o_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.q_a_layernorm.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.q_a_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.q_a_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.q_a_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.q_b_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.q_b_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.q_b_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.unembed_out.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.unembed_out.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.unembed_out.weight": "model-00053-of-00091.safetensors", + "model.layers.48.input_layernorm.weight": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.gate.e_score_correction_bias": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.gate.weight": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.down_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.down_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.down_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.gate_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.gate_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.gate_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.up_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.up_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.up_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.down_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.down_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.down_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.gate_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.gate_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.gate_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.up_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.up_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.up_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.48.post_attention_layernorm.weight": "model-00056-of-00091.safetensors", + "model.layers.48.self_attn.embed_q.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.embed_q.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.embed_q.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.k_norm.bias": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.k_norm.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.weights_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.weights_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.weights_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.wk.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.wk.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.wk.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.wq_b.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.wq_b.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.wq_b.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.kv_a_layernorm.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.kv_a_proj_with_mqa.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.kv_a_proj_with_mqa.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.kv_a_proj_with_mqa.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.o_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.o_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.o_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.q_a_layernorm.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.q_a_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.q_a_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.q_a_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.q_b_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.q_b_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.q_b_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.unembed_out.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.unembed_out.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.unembed_out.weight": "model-00055-of-00091.safetensors", + "model.layers.49.input_layernorm.weight": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.gate.e_score_correction_bias": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.gate.weight": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.down_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.down_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.down_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.gate_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.gate_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.gate_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.up_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.up_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.up_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.down_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.down_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.down_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.gate_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.gate_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.gate_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.up_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.up_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.up_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.49.post_attention_layernorm.weight": "model-00057-of-00091.safetensors", + "model.layers.49.self_attn.embed_q.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.embed_q.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.embed_q.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.k_norm.bias": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.k_norm.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.weights_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.weights_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.weights_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.wk.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.wk.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.wk.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.wq_b.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.wq_b.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.wq_b.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.kv_a_layernorm.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.kv_a_proj_with_mqa.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.kv_a_proj_with_mqa.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.kv_a_proj_with_mqa.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.o_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.o_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.o_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.q_a_layernorm.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.q_a_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.q_a_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.q_a_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.q_b_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.q_b_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.q_b_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.unembed_out.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.unembed_out.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.unembed_out.weight": "model-00056-of-00091.safetensors", + "model.layers.5.input_layernorm.weight": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.gate.e_score_correction_bias": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.gate.weight": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.down_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.down_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.down_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.gate_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.gate_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.gate_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.up_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.up_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.up_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.down_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.down_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.down_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.gate_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.gate_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.gate_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.up_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.up_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.up_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00004-of-00091.safetensors", + "model.layers.5.self_attn.embed_q.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.embed_q.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.embed_q.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.k_norm.bias": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.k_norm.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.weights_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.weights_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.weights_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.wk.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.wk.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.wk.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.wq_b.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.wq_b.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.wq_b.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.kv_a_layernorm.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.kv_a_proj_with_mqa.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.kv_a_proj_with_mqa.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.kv_a_proj_with_mqa.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.o_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.o_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.q_a_layernorm.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.q_a_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.q_a_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.q_a_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.q_b_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.q_b_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.q_b_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.unembed_out.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.unembed_out.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.unembed_out.weight": "model-00003-of-00091.safetensors", + "model.layers.50.input_layernorm.weight": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.gate.e_score_correction_bias": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.gate.weight": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.down_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.down_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.down_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.gate_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.gate_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.gate_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.up_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.up_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.up_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.down_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.down_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.down_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.gate_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.gate_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.gate_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.up_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.up_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.up_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.50.post_attention_layernorm.weight": "model-00058-of-00091.safetensors", + "model.layers.50.self_attn.embed_q.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.embed_q.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.embed_q.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.k_norm.bias": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.k_norm.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.weights_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.weights_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.weights_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.wk.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.wk.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.wk.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.wq_b.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.wq_b.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.wq_b.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.kv_a_layernorm.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.kv_a_proj_with_mqa.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.kv_a_proj_with_mqa.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.kv_a_proj_with_mqa.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.o_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.o_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.o_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.q_a_layernorm.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.q_a_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.q_a_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.q_a_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.q_b_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.q_b_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.q_b_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.unembed_out.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.unembed_out.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.unembed_out.weight": "model-00057-of-00091.safetensors", + "model.layers.51.input_layernorm.weight": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.gate.e_score_correction_bias": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.gate.weight": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.down_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.down_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.down_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.gate_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.gate_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.gate_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.up_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.up_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.up_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.down_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.down_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.down_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.gate_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.gate_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.gate_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.up_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.up_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.up_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.51.post_attention_layernorm.weight": "model-00059-of-00091.safetensors", + "model.layers.51.self_attn.embed_q.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.embed_q.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.embed_q.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.k_norm.bias": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.k_norm.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.weights_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.weights_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.weights_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.wk.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.wk.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.wk.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.wq_b.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.wq_b.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.wq_b.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.kv_a_layernorm.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.kv_a_proj_with_mqa.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.kv_a_proj_with_mqa.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.kv_a_proj_with_mqa.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.o_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.o_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.o_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.q_a_layernorm.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.q_a_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.q_a_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.q_a_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.q_b_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.q_b_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.q_b_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.unembed_out.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.unembed_out.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.unembed_out.weight": "model-00058-of-00091.safetensors", + "model.layers.52.input_layernorm.weight": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.gate.e_score_correction_bias": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.gate.weight": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.down_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.down_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.down_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.gate_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.gate_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.gate_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.up_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.up_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.up_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.down_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.down_proj.scales": "model-00060-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.down_proj.weight": "model-00060-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.gate_proj.biases": "model-00060-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.gate_proj.scales": "model-00060-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.gate_proj.weight": "model-00060-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.up_proj.biases": "model-00060-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.up_proj.scales": "model-00060-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.up_proj.weight": "model-00060-of-00091.safetensors", + "model.layers.52.post_attention_layernorm.weight": "model-00061-of-00091.safetensors", + "model.layers.52.self_attn.embed_q.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.embed_q.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.embed_q.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.k_norm.bias": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.k_norm.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.weights_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.weights_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.weights_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.wk.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.wk.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.wk.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.wq_b.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.wq_b.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.wq_b.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.kv_a_layernorm.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.kv_a_proj_with_mqa.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.kv_a_proj_with_mqa.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.kv_a_proj_with_mqa.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.o_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.o_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.o_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.q_a_layernorm.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.q_a_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.q_a_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.q_a_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.q_b_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.q_b_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.q_b_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.unembed_out.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.unembed_out.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.unembed_out.weight": "model-00059-of-00091.safetensors", + "model.layers.53.input_layernorm.weight": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.gate.e_score_correction_bias": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.gate.weight": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.down_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.down_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.down_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.gate_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.gate_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.gate_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.up_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.up_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.up_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.down_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.down_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.down_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.gate_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.gate_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.gate_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.up_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.up_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.up_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.53.post_attention_layernorm.weight": "model-00062-of-00091.safetensors", + "model.layers.53.self_attn.embed_q.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.embed_q.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.embed_q.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.k_norm.bias": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.k_norm.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.weights_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.weights_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.weights_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.wk.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.wk.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.wk.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.wq_b.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.wq_b.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.wq_b.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.kv_a_layernorm.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.kv_a_proj_with_mqa.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.kv_a_proj_with_mqa.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.kv_a_proj_with_mqa.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.o_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.o_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.o_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.q_a_layernorm.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.q_a_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.q_a_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.q_a_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.q_b_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.q_b_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.q_b_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.unembed_out.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.unembed_out.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.unembed_out.weight": "model-00061-of-00091.safetensors", + "model.layers.54.input_layernorm.weight": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.gate.e_score_correction_bias": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.gate.weight": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.down_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.down_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.down_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.gate_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.gate_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.gate_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.up_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.up_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.up_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.down_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.down_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.down_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.gate_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.gate_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.gate_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.up_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.up_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.up_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.54.post_attention_layernorm.weight": "model-00063-of-00091.safetensors", + "model.layers.54.self_attn.embed_q.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.embed_q.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.embed_q.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.k_norm.bias": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.k_norm.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.weights_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.weights_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.weights_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.wk.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.wk.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.wk.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.wq_b.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.wq_b.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.wq_b.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.kv_a_layernorm.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.kv_a_proj_with_mqa.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.kv_a_proj_with_mqa.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.kv_a_proj_with_mqa.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.o_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.o_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.o_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.q_a_layernorm.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.q_a_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.q_a_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.q_a_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.q_b_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.q_b_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.q_b_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.unembed_out.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.unembed_out.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.unembed_out.weight": "model-00062-of-00091.safetensors", + "model.layers.55.input_layernorm.weight": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.gate.e_score_correction_bias": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.gate.weight": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.down_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.down_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.down_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.gate_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.gate_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.gate_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.up_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.up_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.up_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.down_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.down_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.down_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.gate_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.gate_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.gate_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.up_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.up_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.up_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.55.post_attention_layernorm.weight": "model-00064-of-00091.safetensors", + "model.layers.55.self_attn.embed_q.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.embed_q.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.embed_q.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.k_norm.bias": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.k_norm.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.weights_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.weights_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.weights_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.wk.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.wk.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.wk.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.wq_b.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.wq_b.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.wq_b.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.kv_a_layernorm.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.kv_a_proj_with_mqa.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.kv_a_proj_with_mqa.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.kv_a_proj_with_mqa.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.o_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.o_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.o_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.q_a_layernorm.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.q_a_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.q_a_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.q_a_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.q_b_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.q_b_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.q_b_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.unembed_out.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.unembed_out.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.unembed_out.weight": "model-00063-of-00091.safetensors", + "model.layers.56.input_layernorm.weight": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.gate.e_score_correction_bias": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.gate.weight": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.down_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.down_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.down_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.gate_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.gate_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.gate_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.up_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.up_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.up_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.down_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.down_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.down_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.gate_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.gate_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.gate_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.up_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.up_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.up_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.56.post_attention_layernorm.weight": "model-00065-of-00091.safetensors", + "model.layers.56.self_attn.embed_q.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.embed_q.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.embed_q.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.k_norm.bias": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.k_norm.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.weights_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.weights_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.weights_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.wk.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.wk.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.wk.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.wq_b.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.wq_b.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.wq_b.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.kv_a_layernorm.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.kv_a_proj_with_mqa.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.kv_a_proj_with_mqa.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.kv_a_proj_with_mqa.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.o_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.o_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.o_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.q_a_layernorm.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.q_a_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.q_a_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.q_a_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.q_b_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.q_b_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.q_b_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.unembed_out.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.unembed_out.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.unembed_out.weight": "model-00064-of-00091.safetensors", + "model.layers.57.input_layernorm.weight": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.gate.e_score_correction_bias": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.gate.weight": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.down_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.down_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.down_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.gate_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.gate_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.gate_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.up_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.up_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.up_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.down_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.down_proj.scales": "model-00066-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.down_proj.weight": "model-00066-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.gate_proj.biases": "model-00066-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.gate_proj.scales": "model-00066-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.gate_proj.weight": "model-00066-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.up_proj.biases": "model-00066-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.up_proj.scales": "model-00066-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.up_proj.weight": "model-00066-of-00091.safetensors", + "model.layers.57.post_attention_layernorm.weight": "model-00067-of-00091.safetensors", + "model.layers.57.self_attn.embed_q.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.embed_q.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.embed_q.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.k_norm.bias": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.k_norm.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.weights_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.weights_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.weights_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.wk.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.wk.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.wk.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.wq_b.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.wq_b.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.wq_b.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.kv_a_layernorm.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.kv_a_proj_with_mqa.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.kv_a_proj_with_mqa.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.kv_a_proj_with_mqa.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.o_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.o_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.o_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.q_a_layernorm.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.q_a_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.q_a_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.q_a_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.q_b_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.q_b_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.q_b_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.unembed_out.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.unembed_out.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.unembed_out.weight": "model-00065-of-00091.safetensors", + "model.layers.58.input_layernorm.weight": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.gate.e_score_correction_bias": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.gate.weight": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.down_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.down_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.down_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.gate_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.gate_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.gate_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.up_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.up_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.up_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.down_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.down_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.down_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.gate_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.gate_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.gate_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.up_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.up_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.up_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.58.post_attention_layernorm.weight": "model-00068-of-00091.safetensors", + "model.layers.58.self_attn.embed_q.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.embed_q.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.embed_q.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.k_norm.bias": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.k_norm.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.weights_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.weights_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.weights_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.wk.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.wk.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.wk.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.wq_b.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.wq_b.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.wq_b.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.kv_a_layernorm.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.kv_a_proj_with_mqa.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.kv_a_proj_with_mqa.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.kv_a_proj_with_mqa.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.o_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.o_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.o_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.q_a_layernorm.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.q_a_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.q_a_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.q_a_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.q_b_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.q_b_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.q_b_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.unembed_out.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.unembed_out.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.unembed_out.weight": "model-00067-of-00091.safetensors", + "model.layers.59.input_layernorm.weight": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.gate.e_score_correction_bias": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.gate.weight": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.down_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.down_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.down_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.gate_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.gate_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.gate_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.up_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.up_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.up_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.down_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.down_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.down_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.gate_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.gate_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.gate_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.up_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.up_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.up_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.59.post_attention_layernorm.weight": "model-00069-of-00091.safetensors", + "model.layers.59.self_attn.embed_q.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.embed_q.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.embed_q.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.k_norm.bias": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.k_norm.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.weights_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.weights_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.weights_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.wk.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.wk.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.wk.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.wq_b.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.wq_b.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.wq_b.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.kv_a_layernorm.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.kv_a_proj_with_mqa.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.kv_a_proj_with_mqa.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.kv_a_proj_with_mqa.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.o_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.o_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.o_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.q_a_layernorm.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.q_a_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.q_a_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.q_a_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.q_b_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.q_b_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.q_b_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.unembed_out.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.unembed_out.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.unembed_out.weight": "model-00068-of-00091.safetensors", + "model.layers.6.input_layernorm.weight": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.gate.e_score_correction_bias": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.gate.weight": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.down_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.down_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.down_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.gate_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.gate_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.gate_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.up_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.up_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.up_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.down_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.down_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.down_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.gate_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.gate_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.gate_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.up_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.up_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.up_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00005-of-00091.safetensors", + "model.layers.6.self_attn.embed_q.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.embed_q.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.embed_q.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.k_norm.bias": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.k_norm.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.weights_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.weights_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.weights_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.wk.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.wk.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.wk.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.wq_b.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.wq_b.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.wq_b.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.kv_a_layernorm.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.kv_a_proj_with_mqa.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.kv_a_proj_with_mqa.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.kv_a_proj_with_mqa.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.o_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.o_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.q_a_layernorm.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.q_a_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.q_a_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.q_a_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.q_b_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.q_b_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.q_b_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.unembed_out.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.unembed_out.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.unembed_out.weight": "model-00004-of-00091.safetensors", + "model.layers.60.input_layernorm.weight": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.gate.e_score_correction_bias": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.gate.weight": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.down_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.down_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.down_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.gate_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.gate_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.gate_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.up_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.up_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.up_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.down_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.down_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.down_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.gate_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.gate_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.gate_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.up_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.up_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.up_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.60.post_attention_layernorm.weight": "model-00070-of-00091.safetensors", + "model.layers.60.self_attn.embed_q.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.embed_q.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.embed_q.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.k_norm.bias": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.k_norm.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.weights_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.weights_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.weights_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.wk.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.wk.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.wk.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.wq_b.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.wq_b.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.wq_b.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.kv_a_layernorm.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.kv_a_proj_with_mqa.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.kv_a_proj_with_mqa.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.kv_a_proj_with_mqa.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.o_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.o_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.o_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.q_a_layernorm.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.q_a_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.q_a_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.q_a_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.q_b_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.q_b_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.q_b_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.unembed_out.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.unembed_out.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.unembed_out.weight": "model-00069-of-00091.safetensors", + "model.layers.61.input_layernorm.weight": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.gate.e_score_correction_bias": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.gate.weight": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.down_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.down_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.down_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.gate_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.gate_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.gate_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.up_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.up_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.up_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.down_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.down_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.down_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.gate_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.gate_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.gate_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.up_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.up_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.up_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.61.post_attention_layernorm.weight": "model-00071-of-00091.safetensors", + "model.layers.61.self_attn.embed_q.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.embed_q.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.embed_q.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.k_norm.bias": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.k_norm.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.weights_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.weights_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.weights_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.wk.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.wk.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.wk.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.wq_b.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.wq_b.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.wq_b.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.kv_a_layernorm.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.kv_a_proj_with_mqa.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.kv_a_proj_with_mqa.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.kv_a_proj_with_mqa.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.o_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.o_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.o_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.q_a_layernorm.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.q_a_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.q_a_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.q_a_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.q_b_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.q_b_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.q_b_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.unembed_out.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.unembed_out.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.unembed_out.weight": "model-00070-of-00091.safetensors", + "model.layers.62.input_layernorm.weight": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.gate.e_score_correction_bias": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.gate.weight": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.down_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.down_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.down_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.gate_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.gate_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.gate_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.up_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.up_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.up_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.down_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.down_proj.scales": "model-00072-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.down_proj.weight": "model-00072-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.gate_proj.biases": "model-00072-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.gate_proj.scales": "model-00072-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.gate_proj.weight": "model-00072-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.up_proj.biases": "model-00072-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.up_proj.scales": "model-00072-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.up_proj.weight": "model-00072-of-00091.safetensors", + "model.layers.62.post_attention_layernorm.weight": "model-00073-of-00091.safetensors", + "model.layers.62.self_attn.embed_q.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.embed_q.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.embed_q.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.k_norm.bias": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.k_norm.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.weights_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.weights_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.weights_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.wk.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.wk.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.wk.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.wq_b.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.wq_b.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.wq_b.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.kv_a_layernorm.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.kv_a_proj_with_mqa.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.kv_a_proj_with_mqa.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.kv_a_proj_with_mqa.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.o_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.o_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.o_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.q_a_layernorm.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.q_a_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.q_a_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.q_a_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.q_b_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.q_b_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.q_b_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.unembed_out.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.unembed_out.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.unembed_out.weight": "model-00071-of-00091.safetensors", + "model.layers.63.input_layernorm.weight": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.gate.e_score_correction_bias": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.gate.weight": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.down_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.down_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.down_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.gate_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.gate_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.gate_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.up_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.up_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.up_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.down_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.down_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.down_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.gate_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.gate_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.gate_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.up_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.up_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.up_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.63.post_attention_layernorm.weight": "model-00074-of-00091.safetensors", + "model.layers.63.self_attn.embed_q.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.embed_q.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.embed_q.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.k_norm.bias": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.k_norm.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.weights_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.weights_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.weights_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.wk.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.wk.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.wk.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.wq_b.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.wq_b.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.wq_b.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.kv_a_layernorm.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.kv_a_proj_with_mqa.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.kv_a_proj_with_mqa.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.kv_a_proj_with_mqa.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.o_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.o_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.o_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.q_a_layernorm.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.q_a_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.q_a_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.q_a_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.q_b_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.q_b_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.q_b_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.unembed_out.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.unembed_out.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.unembed_out.weight": "model-00073-of-00091.safetensors", + "model.layers.64.input_layernorm.weight": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.gate.e_score_correction_bias": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.gate.weight": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.down_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.down_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.down_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.gate_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.gate_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.gate_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.up_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.up_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.up_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.down_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.down_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.down_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.gate_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.gate_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.gate_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.up_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.up_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.up_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.64.post_attention_layernorm.weight": "model-00075-of-00091.safetensors", + "model.layers.64.self_attn.embed_q.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.embed_q.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.embed_q.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.k_norm.bias": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.k_norm.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.weights_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.weights_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.weights_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.wk.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.wk.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.wk.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.wq_b.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.wq_b.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.wq_b.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.kv_a_layernorm.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.kv_a_proj_with_mqa.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.kv_a_proj_with_mqa.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.kv_a_proj_with_mqa.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.o_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.o_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.o_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.q_a_layernorm.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.q_a_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.q_a_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.q_a_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.q_b_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.q_b_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.q_b_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.unembed_out.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.unembed_out.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.unembed_out.weight": "model-00074-of-00091.safetensors", + "model.layers.65.input_layernorm.weight": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.gate.e_score_correction_bias": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.gate.weight": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.down_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.down_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.down_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.gate_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.gate_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.gate_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.up_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.up_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.up_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.down_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.down_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.down_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.gate_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.gate_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.gate_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.up_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.up_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.up_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.65.post_attention_layernorm.weight": "model-00076-of-00091.safetensors", + "model.layers.65.self_attn.embed_q.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.embed_q.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.embed_q.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.k_norm.bias": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.k_norm.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.weights_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.weights_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.weights_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.wk.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.wk.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.wk.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.wq_b.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.wq_b.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.wq_b.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.kv_a_layernorm.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.kv_a_proj_with_mqa.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.kv_a_proj_with_mqa.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.kv_a_proj_with_mqa.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.o_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.o_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.o_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.q_a_layernorm.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.q_a_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.q_a_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.q_a_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.q_b_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.q_b_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.q_b_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.unembed_out.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.unembed_out.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.unembed_out.weight": "model-00075-of-00091.safetensors", + "model.layers.66.input_layernorm.weight": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.gate.e_score_correction_bias": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.gate.weight": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.down_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.down_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.down_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.gate_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.gate_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.gate_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.up_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.up_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.up_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.down_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.down_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.down_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.gate_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.gate_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.gate_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.up_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.up_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.up_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.66.post_attention_layernorm.weight": "model-00077-of-00091.safetensors", + "model.layers.66.self_attn.embed_q.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.embed_q.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.embed_q.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.k_norm.bias": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.k_norm.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.weights_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.weights_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.weights_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.wk.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.wk.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.wk.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.wq_b.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.wq_b.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.wq_b.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.kv_a_layernorm.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.kv_a_proj_with_mqa.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.kv_a_proj_with_mqa.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.kv_a_proj_with_mqa.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.o_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.o_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.o_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.q_a_layernorm.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.q_a_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.q_a_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.q_a_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.q_b_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.q_b_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.q_b_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.unembed_out.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.unembed_out.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.unembed_out.weight": "model-00076-of-00091.safetensors", + "model.layers.67.input_layernorm.weight": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.gate.e_score_correction_bias": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.gate.weight": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.down_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.down_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.down_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.gate_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.gate_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.gate_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.up_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.up_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.up_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.down_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.down_proj.scales": "model-00078-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.down_proj.weight": "model-00078-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.gate_proj.biases": "model-00078-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.gate_proj.scales": "model-00078-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.gate_proj.weight": "model-00078-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.up_proj.biases": "model-00078-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.up_proj.scales": "model-00078-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.up_proj.weight": "model-00078-of-00091.safetensors", + "model.layers.67.post_attention_layernorm.weight": "model-00079-of-00091.safetensors", + "model.layers.67.self_attn.embed_q.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.embed_q.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.embed_q.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.k_norm.bias": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.k_norm.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.weights_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.weights_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.weights_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.wk.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.wk.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.wk.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.wq_b.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.wq_b.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.wq_b.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.kv_a_layernorm.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.kv_a_proj_with_mqa.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.kv_a_proj_with_mqa.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.kv_a_proj_with_mqa.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.o_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.o_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.o_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.q_a_layernorm.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.q_a_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.q_a_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.q_a_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.q_b_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.q_b_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.q_b_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.unembed_out.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.unembed_out.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.unembed_out.weight": "model-00077-of-00091.safetensors", + "model.layers.68.input_layernorm.weight": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.gate.e_score_correction_bias": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.gate.weight": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.down_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.down_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.down_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.gate_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.gate_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.gate_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.up_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.up_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.up_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.down_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.down_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.down_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.gate_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.gate_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.gate_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.up_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.up_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.up_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.68.post_attention_layernorm.weight": "model-00080-of-00091.safetensors", + "model.layers.68.self_attn.embed_q.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.embed_q.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.embed_q.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.k_norm.bias": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.k_norm.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.weights_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.weights_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.weights_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.wk.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.wk.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.wk.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.wq_b.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.wq_b.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.wq_b.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.kv_a_layernorm.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.kv_a_proj_with_mqa.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.kv_a_proj_with_mqa.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.kv_a_proj_with_mqa.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.o_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.o_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.o_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.q_a_layernorm.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.q_a_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.q_a_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.q_a_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.q_b_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.q_b_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.q_b_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.unembed_out.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.unembed_out.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.unembed_out.weight": "model-00079-of-00091.safetensors", + "model.layers.69.input_layernorm.weight": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.gate.e_score_correction_bias": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.gate.weight": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.down_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.down_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.down_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.gate_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.gate_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.gate_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.up_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.up_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.up_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.down_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.down_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.down_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.gate_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.gate_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.gate_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.up_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.up_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.up_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.69.post_attention_layernorm.weight": "model-00081-of-00091.safetensors", + "model.layers.69.self_attn.embed_q.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.embed_q.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.embed_q.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.k_norm.bias": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.k_norm.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.weights_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.weights_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.weights_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.wk.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.wk.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.wk.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.wq_b.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.wq_b.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.wq_b.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.kv_a_layernorm.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.kv_a_proj_with_mqa.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.kv_a_proj_with_mqa.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.kv_a_proj_with_mqa.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.o_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.o_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.o_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.q_a_layernorm.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.q_a_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.q_a_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.q_a_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.q_b_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.q_b_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.q_b_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.unembed_out.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.unembed_out.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.unembed_out.weight": "model-00080-of-00091.safetensors", + "model.layers.7.input_layernorm.weight": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.gate.e_score_correction_bias": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.gate.weight": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.down_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.down_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.down_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.gate_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.gate_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.gate_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.up_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.up_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.up_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.down_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.down_proj.scales": "model-00006-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.down_proj.weight": "model-00006-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.gate_proj.biases": "model-00006-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.gate_proj.scales": "model-00006-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.gate_proj.weight": "model-00006-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.up_proj.biases": "model-00006-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.up_proj.scales": "model-00006-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.up_proj.weight": "model-00006-of-00091.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00007-of-00091.safetensors", + "model.layers.7.self_attn.embed_q.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.embed_q.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.embed_q.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.k_norm.bias": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.k_norm.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.weights_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.weights_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.weights_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.wk.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.wk.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.wk.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.wq_b.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.wq_b.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.wq_b.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.kv_a_layernorm.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.kv_a_proj_with_mqa.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.kv_a_proj_with_mqa.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.kv_a_proj_with_mqa.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.o_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.o_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.q_a_layernorm.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.q_a_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.q_a_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.q_a_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.q_b_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.q_b_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.q_b_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.unembed_out.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.unembed_out.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.unembed_out.weight": "model-00005-of-00091.safetensors", + "model.layers.70.input_layernorm.weight": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.gate.e_score_correction_bias": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.gate.weight": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.down_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.down_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.down_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.gate_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.gate_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.gate_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.up_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.up_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.up_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.down_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.down_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.down_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.gate_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.gate_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.gate_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.up_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.up_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.up_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.70.post_attention_layernorm.weight": "model-00082-of-00091.safetensors", + "model.layers.70.self_attn.embed_q.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.embed_q.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.embed_q.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.k_norm.bias": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.k_norm.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.weights_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.weights_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.weights_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.wk.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.wk.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.wk.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.wq_b.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.wq_b.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.wq_b.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.kv_a_layernorm.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.kv_a_proj_with_mqa.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.kv_a_proj_with_mqa.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.kv_a_proj_with_mqa.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.o_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.o_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.o_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.q_a_layernorm.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.q_a_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.q_a_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.q_a_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.q_b_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.q_b_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.q_b_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.unembed_out.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.unembed_out.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.unembed_out.weight": "model-00081-of-00091.safetensors", + "model.layers.71.input_layernorm.weight": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.gate.e_score_correction_bias": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.gate.weight": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.down_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.down_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.down_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.gate_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.gate_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.gate_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.up_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.up_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.up_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.down_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.down_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.down_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.gate_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.gate_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.gate_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.up_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.up_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.up_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.71.post_attention_layernorm.weight": "model-00083-of-00091.safetensors", + "model.layers.71.self_attn.embed_q.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.embed_q.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.embed_q.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.k_norm.bias": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.k_norm.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.weights_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.weights_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.weights_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.wk.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.wk.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.wk.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.wq_b.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.wq_b.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.wq_b.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.kv_a_layernorm.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.kv_a_proj_with_mqa.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.kv_a_proj_with_mqa.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.kv_a_proj_with_mqa.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.o_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.o_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.o_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.q_a_layernorm.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.q_a_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.q_a_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.q_a_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.q_b_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.q_b_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.q_b_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.unembed_out.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.unembed_out.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.unembed_out.weight": "model-00082-of-00091.safetensors", + "model.layers.72.input_layernorm.weight": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.gate.e_score_correction_bias": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.gate.weight": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.down_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.down_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.down_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.gate_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.gate_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.gate_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.up_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.up_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.up_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.down_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.down_proj.scales": "model-00084-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.down_proj.weight": "model-00084-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.gate_proj.biases": "model-00084-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.gate_proj.scales": "model-00084-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.gate_proj.weight": "model-00084-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.up_proj.biases": "model-00084-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.up_proj.scales": "model-00084-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.up_proj.weight": "model-00084-of-00091.safetensors", + "model.layers.72.post_attention_layernorm.weight": "model-00085-of-00091.safetensors", + "model.layers.72.self_attn.embed_q.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.embed_q.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.embed_q.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.k_norm.bias": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.k_norm.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.weights_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.weights_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.weights_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.wk.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.wk.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.wk.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.wq_b.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.wq_b.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.wq_b.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.kv_a_layernorm.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.kv_a_proj_with_mqa.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.kv_a_proj_with_mqa.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.kv_a_proj_with_mqa.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.o_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.o_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.o_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.q_a_layernorm.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.q_a_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.q_a_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.q_a_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.q_b_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.q_b_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.q_b_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.unembed_out.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.unembed_out.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.unembed_out.weight": "model-00083-of-00091.safetensors", + "model.layers.73.input_layernorm.weight": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.gate.e_score_correction_bias": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.gate.weight": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.down_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.down_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.down_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.gate_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.gate_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.gate_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.up_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.up_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.up_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.down_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.down_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.down_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.gate_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.gate_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.gate_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.up_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.up_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.up_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.73.post_attention_layernorm.weight": "model-00086-of-00091.safetensors", + "model.layers.73.self_attn.embed_q.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.embed_q.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.embed_q.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.k_norm.bias": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.k_norm.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.weights_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.weights_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.weights_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.wk.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.wk.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.wk.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.wq_b.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.wq_b.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.wq_b.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.kv_a_layernorm.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.kv_a_proj_with_mqa.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.kv_a_proj_with_mqa.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.kv_a_proj_with_mqa.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.o_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.o_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.o_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.q_a_layernorm.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.q_a_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.q_a_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.q_a_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.q_b_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.q_b_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.q_b_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.unembed_out.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.unembed_out.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.unembed_out.weight": "model-00085-of-00091.safetensors", + "model.layers.74.input_layernorm.weight": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.gate.e_score_correction_bias": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.gate.weight": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.down_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.down_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.down_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.gate_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.gate_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.gate_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.up_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.up_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.up_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.down_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.down_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.down_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.gate_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.gate_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.gate_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.up_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.up_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.up_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.74.post_attention_layernorm.weight": "model-00087-of-00091.safetensors", + "model.layers.74.self_attn.embed_q.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.embed_q.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.embed_q.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.k_norm.bias": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.k_norm.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.weights_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.weights_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.weights_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.wk.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.wk.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.wk.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.wq_b.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.wq_b.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.wq_b.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.kv_a_layernorm.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.kv_a_proj_with_mqa.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.kv_a_proj_with_mqa.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.kv_a_proj_with_mqa.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.o_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.o_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.o_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.q_a_layernorm.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.q_a_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.q_a_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.q_a_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.q_b_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.q_b_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.q_b_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.unembed_out.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.unembed_out.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.unembed_out.weight": "model-00086-of-00091.safetensors", + "model.layers.75.input_layernorm.weight": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.gate.e_score_correction_bias": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.gate.weight": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.down_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.down_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.down_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.gate_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.gate_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.gate_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.up_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.up_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.up_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.down_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.down_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.down_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.gate_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.gate_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.gate_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.up_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.up_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.up_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.75.post_attention_layernorm.weight": "model-00088-of-00091.safetensors", + "model.layers.75.self_attn.embed_q.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.embed_q.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.embed_q.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.k_norm.bias": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.k_norm.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.weights_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.weights_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.weights_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.wk.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.wk.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.wk.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.wq_b.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.wq_b.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.wq_b.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.kv_a_layernorm.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.kv_a_proj_with_mqa.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.kv_a_proj_with_mqa.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.kv_a_proj_with_mqa.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.o_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.o_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.o_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.q_a_layernorm.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.q_a_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.q_a_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.q_a_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.q_b_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.q_b_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.q_b_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.unembed_out.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.unembed_out.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.unembed_out.weight": "model-00087-of-00091.safetensors", + "model.layers.76.input_layernorm.weight": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.gate.e_score_correction_bias": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.gate.weight": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.down_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.down_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.down_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.gate_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.gate_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.gate_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.up_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.up_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.up_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.down_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.down_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.down_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.gate_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.gate_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.gate_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.up_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.up_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.up_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.76.post_attention_layernorm.weight": "model-00089-of-00091.safetensors", + "model.layers.76.self_attn.embed_q.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.embed_q.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.embed_q.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.k_norm.bias": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.k_norm.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.weights_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.weights_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.weights_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.wk.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.wk.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.wk.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.wq_b.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.wq_b.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.wq_b.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.kv_a_layernorm.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.kv_a_proj_with_mqa.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.kv_a_proj_with_mqa.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.kv_a_proj_with_mqa.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.o_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.o_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.o_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.q_a_layernorm.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.q_a_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.q_a_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.q_a_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.q_b_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.q_b_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.q_b_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.unembed_out.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.unembed_out.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.unembed_out.weight": "model-00088-of-00091.safetensors", + "model.layers.77.input_layernorm.weight": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.gate.e_score_correction_bias": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.gate.weight": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.down_proj.biases": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.down_proj.scales": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.down_proj.weight": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.gate_proj.biases": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.gate_proj.scales": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.gate_proj.weight": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.up_proj.biases": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.up_proj.scales": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.up_proj.weight": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.down_proj.biases": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.down_proj.scales": "model-00090-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.down_proj.weight": "model-00090-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.gate_proj.biases": "model-00090-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.gate_proj.scales": "model-00090-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.gate_proj.weight": "model-00090-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.up_proj.biases": "model-00090-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.up_proj.scales": "model-00090-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.up_proj.weight": "model-00090-of-00091.safetensors", + "model.layers.77.post_attention_layernorm.weight": "model-00091-of-00091.safetensors", + "model.layers.77.self_attn.embed_q.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.embed_q.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.embed_q.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.k_norm.bias": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.k_norm.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.weights_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.weights_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.weights_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.wk.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.wk.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.wk.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.wq_b.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.wq_b.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.wq_b.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.kv_a_layernorm.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.kv_a_proj_with_mqa.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.kv_a_proj_with_mqa.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.kv_a_proj_with_mqa.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.o_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.o_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.o_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.q_a_layernorm.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.q_a_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.q_a_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.q_a_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.q_b_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.q_b_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.q_b_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.unembed_out.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.unembed_out.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.unembed_out.weight": "model-00089-of-00091.safetensors", + "model.layers.8.input_layernorm.weight": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.gate.e_score_correction_bias": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.gate.weight": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.down_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.down_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.down_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.gate_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.gate_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.gate_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.up_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.up_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.up_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.down_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.down_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.down_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.gate_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.gate_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.gate_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.up_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.up_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.up_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00008-of-00091.safetensors", + "model.layers.8.self_attn.embed_q.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.embed_q.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.embed_q.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.k_norm.bias": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.k_norm.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.weights_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.weights_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.weights_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.wk.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.wk.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.wk.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.wq_b.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.wq_b.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.wq_b.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.kv_a_layernorm.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.kv_a_proj_with_mqa.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.kv_a_proj_with_mqa.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.kv_a_proj_with_mqa.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.o_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.o_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.q_a_layernorm.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.q_a_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.q_a_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.q_a_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.q_b_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.q_b_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.q_b_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.unembed_out.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.unembed_out.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.unembed_out.weight": "model-00007-of-00091.safetensors", + "model.layers.9.input_layernorm.weight": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.gate.e_score_correction_bias": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.gate.weight": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.down_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.down_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.down_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.gate_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.gate_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.gate_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.up_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.up_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.up_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.down_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.down_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.down_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.gate_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.gate_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.up_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.up_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.up_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00009-of-00091.safetensors", + "model.layers.9.self_attn.embed_q.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.embed_q.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.embed_q.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.k_norm.bias": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.k_norm.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.weights_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.weights_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.weights_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.wk.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.wk.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.wk.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.wq_b.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.wq_b.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.wq_b.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.kv_a_layernorm.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.kv_a_proj_with_mqa.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.kv_a_proj_with_mqa.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.kv_a_proj_with_mqa.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.o_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.o_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.q_a_layernorm.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.q_a_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.q_a_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.q_a_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.q_b_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.q_b_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.q_b_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.unembed_out.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.unembed_out.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.unembed_out.weight": "model-00008-of-00091.safetensors", + "model.norm.weight": "model-00091-of-00091.safetensors" + } +} \ No newline at end of file diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..aba40197a4cdb5607f4ab7a05fb0a4ee8054fd6d --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19e773648cb4e65de8660ea6365e10acca112d42a854923df93db4a6f333a82d +size 20217442 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6aa53776c9f7ac98333a470b78a5b732d5343d15 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,34 @@ +{ + "backend": "tokenizers", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "<|endoftext|>", + "extra_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>", + "<|begin_of_audio|>", + "<|end_of_audio|>", + "<|begin_of_transcription|>", + "<|end_of_transcription|>" + ], + "is_local": true, + "model_max_length": 202752, + "model_specific_special_tokens": {}, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "TokenizersBackend", + "tool_parser_type": "glm47" +}