diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..52373fe24473b1aa44333d318f578ae6bf04b49b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3ecf40c44e884df08236436d658de99e75f8b888 --- /dev/null +++ b/README.md @@ -0,0 +1,39 @@ +--- +language: +- en +- zh +library_name: mlx +license: mit +pipeline_tag: text-generation +tags: +- mlx +base_model: zai-org/GLM-5 +--- + +# mlx-community/GLM-5-4bit + +This model [mlx-community/GLM-5-4bit](https://huggingface.co/mlx-community/GLM-5-4bit) was +converted to MLX format from [zai-org/GLM-5](https://huggingface.co/zai-org/GLM-5) +using mlx-lm version **0.30.7**. + +## Use with mlx + +```bash +pip install mlx-lm +``` + +```python +from mlx_lm import load, generate + +model, tokenizer = load("mlx-community/GLM-5-4bit") + +prompt = "hello" + +if tokenizer.chat_template is not None: + messages = [{"role": "user", "content": prompt}] + prompt = tokenizer.apply_chat_template( + messages, add_generation_prompt=True, return_dict=False, + ) + +response = generate(model, tokenizer, prompt=prompt, verbose=True) +``` diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..2ab98ef068d62829d17c5ade1827b9f013fa2bbf --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,86 @@ +[gMASK] +{%- if tools -%} +<|system|> +# Tools + +You may call one or more functions to assist with the user query. + +You are provided with function signatures within XML tags: + +{% for tool in tools %} +{{ tool | tojson(ensure_ascii=False) }} +{% endfor %} + + +For each function call, output the function name and arguments within the following XML format: +{function-name}{arg-key-1}{arg-value-1}{arg-key-2}{arg-value-2}...{%- endif -%} +{%- macro visible_text(content) -%} + {%- if content is string -%} + {{- content }} + {%- elif content is iterable and content is not mapping -%} + {%- for item in content -%} + {%- if item is mapping and item.type == 'text' -%} + {{- item.text }} + {%- elif item is string -%} + {{- item }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{- content }} + {%- endif -%} +{%- endmacro -%} +{%- set ns = namespace(last_user_index=-1) %} +{%- for m in messages %} + {%- if m.role == 'user' %} + {% set ns.last_user_index = loop.index0 -%} + {%- endif %} +{%- endfor %} +{% for m in messages %} +{%- if m.role == 'user' -%}<|user|>{{ visible_text(m.content) }} +{%- elif m.role == 'assistant' -%} +<|assistant|> +{%- set reasoning_content = '' %} +{%- set content = visible_text(m.content) %} +{%- if m.reasoning_content is string %} + {%- set reasoning_content = m.reasoning_content %} +{%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} +{%- endif %} +{%- if ((clear_thinking is defined and not clear_thinking) or loop.index0 > ns.last_user_index) and reasoning_content -%} +{{ '' + reasoning_content.strip() + ''}} +{%- else -%} +{{ '' }} +{%- endif -%} +{%- if content.strip() -%} +{{ content.strip() }} +{%- endif -%} +{% if m.tool_calls %} +{% for tc in m.tool_calls %} +{%- if tc.function %} + {%- set tc = tc.function %} +{%- endif %} +{{- '' + tc.name -}} +{% set _args = tc.arguments %}{% for k, v in _args.items() %}{{ k }}{{ v | tojson(ensure_ascii=False) if v is not string else v }}{% endfor %}{% endfor %} +{% endif %} +{%- elif m.role == 'tool' -%} +{%- if m.content is string -%} +{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|observation|>' }} +{%- endif %} +{{- '' }} +{{- m.content }} +{{- '' }} +{%- else -%} +<|observation|>{% for tr in m.content %} +{{ tr.output if tr.output is defined else tr }}{% endfor -%} +{% endif -%} +{%- elif m.role == 'system' -%} +<|system|>{{ visible_text(m.content) }} +{%- endif -%} +{%- endfor -%} +{%- if add_generation_prompt -%} + <|assistant|>{{- '' if (enable_thinking is defined and not enable_thinking) else '' -}} +{%- endif -%} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..88cab08e5ff175d9aa1d40ec25a071508e2afd4f --- /dev/null +++ b/config.json @@ -0,0 +1,69 @@ +{ + "architectures": [ + "GlmMoeDsaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "dtype": "bfloat16", + "eos_token_id": [ + 154820, + 154827, + 154829 + ], + "ep_size": 1, + "first_k_dense_replace": 3, + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 6144, + "index_head_dim": 128, + "index_n_heads": 32, + "index_topk": 2048, + "indexer_rope_interleave": true, + "initializer_range": 0.02, + "intermediate_size": 12288, + "kv_lora_rank": 512, + "max_position_embeddings": 202752, + "model_type": "glm_moe_dsa", + "moe_intermediate_size": 2048, + "moe_layer_freq": 1, + "n_group": 1, + "n_routed_experts": 256, + "n_shared_experts": 1, + "norm_topk_prob": true, + "num_attention_heads": 64, + "num_experts_per_tok": 8, + "num_hidden_layers": 78, + "num_key_value_heads": 64, + "num_nextn_predict_layers": 1, + "pad_token_id": 154820, + "pretraining_tp": 1, + "q_lora_rank": 2048, + "qk_head_dim": 256, + "qk_nope_head_dim": 192, + "qk_rope_head_dim": 64, + "quantization": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "quantization_config": { + "group_size": 64, + "bits": 4, + "mode": "affine" + }, + "rms_norm_eps": 1e-05, + "rope_interleave": true, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "routed_scaling_factor": 2.5, + "scoring_func": "sigmoid", + "tie_word_embeddings": false, + "topk_group": 1, + "topk_method": "noaux_tc", + "transformers_version": "5.0.2.dev0", + "use_cache": true, + "v_head_dim": 256, + "vocab_size": 154880 +} \ No newline at end of file diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..640e99c64d2f17d76e2f1f13af219fb369e1004e --- /dev/null +++ b/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "eos_token_id": [ + 154820, + 154827, + 154829 + ], + "pad_token_id": 154820, + "temperature": 1.0, + "top_p": 0.95, + "transformers_version": "5.0.2.dev0" +} diff --git a/model-00003-of-00091.safetensors b/model-00003-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2bb0985ef2b72f2691a2b0f7866dbd0e570177b4 --- /dev/null +++ b/model-00003-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a55c45ab728c2df37e29b224ca4b6be3892a06333bc15eb36986da18d2141ab2 +size 3947717330 diff --git a/model-00006-of-00091.safetensors b/model-00006-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..335e0fa1b96d38075b92134197eb3289de4d2854 --- /dev/null +++ b/model-00006-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3776116d2028d3fa2e117817dfda73d8e712e1cf3f6a53b508a6bc76fdeedcd0 +size 5335155728 diff --git a/model-00009-of-00091.safetensors b/model-00009-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ab9bd8df0638776240a2d6fc7964d05b7d6c50dd --- /dev/null +++ b/model-00009-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e1fc6ef3ebe2b78258df5756bc297c521dc177e70b7a85af2838908ba1cdf3f +size 3947717362 diff --git a/model-00010-of-00091.safetensors b/model-00010-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..347d30ba2b71d71fe60e630e4179c3ff5df95d16 --- /dev/null +++ b/model-00010-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c18b7dc96903545754ac99a9ee6daf3904f2ba36c0dbacbf3204be804e408662 +size 5357003398 diff --git a/model-00011-of-00091.safetensors b/model-00011-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f8dd4af57684213ba821d3918c0e4f2ef719f6e1 --- /dev/null +++ b/model-00011-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:936c350081f4c836782d37e6c6cac1442d058a4a4c7d16f43c13d76a4644cf6a +size 3947717358 diff --git a/model-00012-of-00091.safetensors b/model-00012-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9ea4a1f3e1286dcc1a05b7b23f7b37f62dfebbec --- /dev/null +++ b/model-00012-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8936f1cc1e0397f9fdbd5bede886019820b275aab7db10b4a7f1f1e3de6945e4 +size 5335155736 diff --git a/model-00014-of-00091.safetensors b/model-00014-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5ad2030982caff4c754d6d4d883ee5e56e7dea68 --- /dev/null +++ b/model-00014-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5df851ee107e59e8480c957256868fd706cd684f7ac19fd39e57bd1662a22718 +size 5357003402 diff --git a/model-00015-of-00091.safetensors b/model-00015-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..329c64463131820873c70cf3c16f06d55d5ba2dc --- /dev/null +++ b/model-00015-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a359215a77ae3a4609a737271ad7ce3190f45a8de034cb5b4757b2e1ab331d9 +size 3947717380 diff --git a/model-00017-of-00091.safetensors b/model-00017-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e8cf40817a630c6e257859f545743feff897d614 --- /dev/null +++ b/model-00017-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1089db8e0a259ae289aa491d1c8f67750d3afb2929e77279728054051ee68fc +size 3947717212 diff --git a/model-00020-of-00091.safetensors b/model-00020-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..40f3f558a0bf097c38de3e86e08ffbe95a552f79 --- /dev/null +++ b/model-00020-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffa7b0c2214d4a257fa077d23ba839bb37778c3ff8df321f3046c370655454df +size 5357003394 diff --git a/model-00023-of-00091.safetensors b/model-00023-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ee6808533a291b48b6b46678e99a309e907a4438 --- /dev/null +++ b/model-00023-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:883a7a2c1949d29b6f9a99acc6633beb3b690b29e59d0f996e6281a336e30fdc +size 3947717338 diff --git a/model-00025-of-00091.safetensors b/model-00025-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4b8691a2f845b0fe9af5f3ca1f52f9ac76db1a1d --- /dev/null +++ b/model-00025-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8a2ff1aa7170e8e4623d833b19c1824caf584829448546fb5fdec22ecb6876c +size 3847053950 diff --git a/model-00026-of-00091.safetensors b/model-00026-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3cca3628cafd555a0a23f961a9a93088acf0aa51 --- /dev/null +++ b/model-00026-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59b34d890d599d0584f3ba7af8cd7a37fd1f4198560b434f2109761181631cc7 +size 5357003402 diff --git a/model-00027-of-00091.safetensors b/model-00027-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4ac9c12d8ad233e0d7783f09f81452d2fb9a8d8b --- /dev/null +++ b/model-00027-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22831893475d906001f8a6590dce2e6bfa07d5bf859a54c5319fad9a0b5329f1 +size 3947717384 diff --git a/model-00028-of-00091.safetensors b/model-00028-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9eb1b769776567ab1b56a979e73241360dc7cbd1 --- /dev/null +++ b/model-00028-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9395756aea64958f2560f2ef88d1f5d7ed81217f2f7b6e66a9f5d7b382a9e128 +size 5357003398 diff --git a/model-00029-of-00091.safetensors b/model-00029-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..958ec1585b030c51ceca24b46da6400ff89306dd --- /dev/null +++ b/model-00029-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5cccfb21344335cfa5d48a106ece94b4b73e319a14a7a457eccf125ffbceedf +size 3947717260 diff --git a/model-00031-of-00091.safetensors b/model-00031-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..958c304295bc19209b5f4fc9801a644316144dac --- /dev/null +++ b/model-00031-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6a2a72031aa99128ae31064595c9cc79d8b78b8e1c8227d60b2ac78d318b869 +size 3847053956 diff --git a/model-00032-of-00091.safetensors b/model-00032-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3ea60613d9ea5fd1ae142bf0850b1d2f3d1ad742 --- /dev/null +++ b/model-00032-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f004db6da20a37967215070885af018131fd9ea983b73de181b47929ca36c187 +size 5357003402 diff --git a/model-00034-of-00091.safetensors b/model-00034-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f52223b44a2968fd187dba114da29454996b9c3a --- /dev/null +++ b/model-00034-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e80e4e24d30f8ebdc6dbe1af0f2f6f804031b2998a411c59e201fcf0e022b5f2 +size 5357003398 diff --git a/model-00038-of-00091.safetensors b/model-00038-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e9b0e8795ec7171cebf5a36318d92ba670592bb7 --- /dev/null +++ b/model-00038-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18872e40dc6c1e106530bbb7aad77f3d1a3c1d3a01eb8dfeba23d59174419783 +size 5357003400 diff --git a/model-00041-of-00091.safetensors b/model-00041-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d749fd78a6094d5806cddbf85520b3c503b2c43c --- /dev/null +++ b/model-00041-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:651ac755d7ffc36bc70856f9cfc37eb43a52344735523c71861959c64751373e +size 3947717182 diff --git a/model-00042-of-00091.safetensors b/model-00042-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6e06ab9d60b062734ffae8e9db59f95c81f8f54b --- /dev/null +++ b/model-00042-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cda63172e00ade852e858109807ce20e6b836554217347bf2ed839430a72729e +size 5335155738 diff --git a/model-00047-of-00091.safetensors b/model-00047-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..09fe9dccc9ad684946c6ee4e4c84b5e0e3a24d36 --- /dev/null +++ b/model-00047-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efbd7523e35bfac87004db629f293e8ef1802d91589f2a0ba98eca3e9cbf15a1 +size 3947717206 diff --git a/model-00048-of-00091.safetensors b/model-00048-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dfddf04c2adb262695141a331a957d8604dd7300 --- /dev/null +++ b/model-00048-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90c7ba175da56606cbfd4fe33fa7ad9565455582700dd7a815626ec5a465453b +size 5335155736 diff --git a/model-00050-of-00091.safetensors b/model-00050-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a7bed0aa04af27461d9321514e61b31c82e52c89 --- /dev/null +++ b/model-00050-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12cc3d368b1e8119fbde2464eb4e719405b98fa2fc82ef547cef3b83f6fadbe9 +size 5357003402 diff --git a/model-00051-of-00091.safetensors b/model-00051-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5b28f63636e8a281f914faaff313dca14a340b56 --- /dev/null +++ b/model-00051-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b35d0c685d734f8de676699663498fb7c978d591ff5b18e6dac814869f3620be +size 3947717380 diff --git a/model-00054-of-00091.safetensors b/model-00054-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..67add36d21d1d8c81b051596a81e668f3fefaa10 --- /dev/null +++ b/model-00054-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:847442e71b2a275cffbff518ae385a068b7bab71e819e2087d4ff2b6327d8e2d +size 5335155738 diff --git a/model-00055-of-00091.safetensors b/model-00055-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4a092620eacbf3e9280b074dd24d7938b8cfc810 --- /dev/null +++ b/model-00055-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c8baa28d54d8dd1d840055bfcbe31c083b65c1d8db15e95304ccd12de5de36e +size 3847053918 diff --git a/model-00059-of-00091.safetensors b/model-00059-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..30c5a270e6b8228a10cce4f9454e6d188839a5f2 --- /dev/null +++ b/model-00059-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0698314dfdb74f4d23dc0a0a330ced642d28817ca9c545b0a900d1f725a8656 +size 3947717312 diff --git a/model-00061-of-00091.safetensors b/model-00061-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bf6b897fcf10d03b2572d73aa8b21fb4f0f358ef --- /dev/null +++ b/model-00061-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4e4546d206bedbd88ed4e853d9b11ccc08dc286b578ec644ab9528fa8f59d13 +size 3847053956 diff --git a/model-00062-of-00091.safetensors b/model-00062-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a88932c73d31439956ec8aec18f642fe11c5e3fc --- /dev/null +++ b/model-00062-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52dd9f48ef1aea85f688696058396cea05a50406e0650145cba56edfeab931c6 +size 5357003388 diff --git a/model-00063-of-00091.safetensors b/model-00063-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..47c3fe8f15fecaf59c133bd44c5a4793f304a5ac --- /dev/null +++ b/model-00063-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a8a1b4da64015451006b963ea2885875e4cca23243d599ad6e146dd94316174 +size 3947717356 diff --git a/model-00066-of-00091.safetensors b/model-00066-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c5458e7ad3757eab3391581356fbc4b2688eef58 --- /dev/null +++ b/model-00066-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:635e986fdb0e4d494b6ec4e4518f55e900145243d5d15a57c51790d420838178 +size 5335155734 diff --git a/model-00067-of-00091.safetensors b/model-00067-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4e1f30abff6f5005a1c8fa723b6f0cf386b7bc67 --- /dev/null +++ b/model-00067-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41fc25ba10d0ee929ece8057831918327c47acc8246f7711752da04702fea927 +size 3847053928 diff --git a/model-00068-of-00091.safetensors b/model-00068-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1988dc4754f3717c0c0de831792aa64bd2b10da2 --- /dev/null +++ b/model-00068-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9135d8e84d7990a8004e439f1ae5bec031f5ec50c2cd6cf1f81bd4ce4daa4904 +size 5357003402 diff --git a/model-00069-of-00091.safetensors b/model-00069-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..33962a9e53e21c7d02eaaa6e6f8578e5b0fe573d --- /dev/null +++ b/model-00069-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51d7dbd508806905b224cbcf5d00ee6a329676c92c38822e38cca045faf9959b +size 3947717300 diff --git a/model-00070-of-00091.safetensors b/model-00070-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2a03012612f0c193be28706790a87ef67b9520d3 --- /dev/null +++ b/model-00070-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4316b86d40a45eecf9719eb53cce515fc7b9da579d7b85519f6bc9ef4debc1c8 +size 5357003398 diff --git a/model-00075-of-00091.safetensors b/model-00075-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a38827836bc4f9208084874d0efd960d3121f25e --- /dev/null +++ b/model-00075-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1b42e970ff93c1ef3f768c2640c69fd454d37ae08e1f186e074edab37c5e961 +size 3947717358 diff --git a/model-00080-of-00091.safetensors b/model-00080-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4b8dd661b5994fdd1fe63a0e41bf2b2f9c3f2963 --- /dev/null +++ b/model-00080-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6a5128f9c32fa572d1ad93aa5dceb81a9a66e6276bfdadff38e07e3cfedcfa0 +size 5357003400 diff --git a/model-00083-of-00091.safetensors b/model-00083-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2a535309292bca4c4259823ef6ea811c22ffc04f --- /dev/null +++ b/model-00083-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a0a9e9e3733b8938cf232e25154cfe8ae16de446756df41750a0873606d0770 +size 3947717200 diff --git a/model-00086-of-00091.safetensors b/model-00086-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4752373cfca82789170dc33d18a832543b13b6ad --- /dev/null +++ b/model-00086-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fded92bfa1c5b0b2709ed49c6406a88e9508217b8437edcad24b01077ff68d0e +size 5357003402 diff --git a/model-00089-of-00091.safetensors b/model-00089-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3ab37fa79e8346434ac5c0a3f720be78fec28494 --- /dev/null +++ b/model-00089-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:542a764f9b91514fe67074aa197a42c1185193a5ee6760cd1df3ec6d43a6529d +size 3947717258 diff --git a/model-00091-of-00091.safetensors b/model-00091-of-00091.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9ef4990a67d7ae016c05378b12b24152c073ba0d --- /dev/null +++ b/model-00091-of-00091.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e05b3469f93f0ec9d4341052fa58e45c1c7ecf55f34f712cfa8cb276356b5090 +size 660347957 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..4a0229bf5b85846b2c642afa92a12c0905cb1236 --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,4116 @@ +{ + "metadata": { + "total_size": 418621403136, + "total_parameters": 743911218432 + }, + "weight_map": { + "lm_head.biases": "model-00091-of-00091.safetensors", + "lm_head.scales": "model-00091-of-00091.safetensors", + "lm_head.weight": "model-00091-of-00091.safetensors", + "model.embed_tokens.biases": "model-00001-of-00091.safetensors", + "model.embed_tokens.scales": "model-00001-of-00091.safetensors", + "model.embed_tokens.weight": "model-00001-of-00091.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.down_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.down_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.gate_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.gate_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.up_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.up_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.embed_q.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.embed_q.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.embed_q.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.k_norm.bias": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.k_norm.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.weights_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.weights_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.weights_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.wk.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.wk.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.wk.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.wq_b.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.wq_b.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.indexer.wq_b.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.kv_a_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.o_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.o_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.q_a_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.q_a_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.q_a_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.q_a_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.q_b_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.q_b_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.q_b_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.unembed_out.biases": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.unembed_out.scales": "model-00001-of-00091.safetensors", + "model.layers.0.self_attn.unembed_out.weight": "model-00001-of-00091.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.down_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.down_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.gate_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.gate_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.up_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.up_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.embed_q.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.embed_q.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.embed_q.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.k_norm.bias": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.k_norm.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.weights_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.weights_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.weights_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.wk.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.wk.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.wk.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.wq_b.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.wq_b.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.indexer.wq_b.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.kv_a_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.o_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.o_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.q_a_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.q_a_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.q_a_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.q_a_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.q_b_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.q_b_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.q_b_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.unembed_out.biases": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.unembed_out.scales": "model-00001-of-00091.safetensors", + "model.layers.1.self_attn.unembed_out.weight": "model-00001-of-00091.safetensors", + "model.layers.10.input_layernorm.weight": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.gate.e_score_correction_bias": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.gate.weight": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.down_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.down_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.down_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.gate_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.gate_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.gate_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.up_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.up_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.shared_experts.up_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.down_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.down_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.down_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.gate_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.gate_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.gate_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.up_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.up_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.10.mlp.switch_mlp.up_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00010-of-00091.safetensors", + "model.layers.10.self_attn.embed_q.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.embed_q.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.embed_q.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.k_norm.bias": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.k_norm.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.weights_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.weights_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.weights_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.wk.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.wk.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.wk.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.wq_b.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.wq_b.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.indexer.wq_b.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.kv_a_layernorm.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.kv_a_proj_with_mqa.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.kv_a_proj_with_mqa.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.kv_a_proj_with_mqa.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.o_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.o_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.q_a_layernorm.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.q_a_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.q_a_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.q_a_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.q_b_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.q_b_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.q_b_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.unembed_out.biases": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.unembed_out.scales": "model-00009-of-00091.safetensors", + "model.layers.10.self_attn.unembed_out.weight": "model-00009-of-00091.safetensors", + "model.layers.11.input_layernorm.weight": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.gate.e_score_correction_bias": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.gate.weight": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.down_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.down_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.down_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.gate_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.gate_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.gate_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.up_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.up_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.shared_experts.up_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.down_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.down_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.down_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.gate_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.gate_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.gate_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.up_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.up_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.11.mlp.switch_mlp.up_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00011-of-00091.safetensors", + "model.layers.11.self_attn.embed_q.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.embed_q.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.embed_q.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.k_norm.bias": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.k_norm.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.weights_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.weights_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.weights_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.wk.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.wk.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.wk.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.wq_b.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.wq_b.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.indexer.wq_b.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.kv_a_layernorm.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.kv_a_proj_with_mqa.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.kv_a_proj_with_mqa.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.kv_a_proj_with_mqa.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.o_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.o_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.q_a_layernorm.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.q_a_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.q_a_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.q_a_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.q_b_proj.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.q_b_proj.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.q_b_proj.weight": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.unembed_out.biases": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.unembed_out.scales": "model-00010-of-00091.safetensors", + "model.layers.11.self_attn.unembed_out.weight": "model-00010-of-00091.safetensors", + "model.layers.12.input_layernorm.weight": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.gate.e_score_correction_bias": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.gate.weight": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.down_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.down_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.down_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.gate_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.gate_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.gate_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.up_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.up_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.shared_experts.up_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.down_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.down_proj.scales": "model-00012-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.down_proj.weight": "model-00012-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.gate_proj.biases": "model-00012-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.gate_proj.scales": "model-00012-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.gate_proj.weight": "model-00012-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.up_proj.biases": "model-00012-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.up_proj.scales": "model-00012-of-00091.safetensors", + "model.layers.12.mlp.switch_mlp.up_proj.weight": "model-00012-of-00091.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00013-of-00091.safetensors", + "model.layers.12.self_attn.embed_q.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.embed_q.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.embed_q.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.k_norm.bias": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.k_norm.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.weights_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.weights_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.weights_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.wk.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.wk.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.wk.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.wq_b.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.wq_b.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.indexer.wq_b.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.kv_a_layernorm.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.kv_a_proj_with_mqa.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.kv_a_proj_with_mqa.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.kv_a_proj_with_mqa.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.o_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.o_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.q_a_layernorm.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.q_a_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.q_a_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.q_a_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.q_b_proj.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.q_b_proj.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.q_b_proj.weight": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.unembed_out.biases": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.unembed_out.scales": "model-00011-of-00091.safetensors", + "model.layers.12.self_attn.unembed_out.weight": "model-00011-of-00091.safetensors", + "model.layers.13.input_layernorm.weight": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.gate.e_score_correction_bias": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.gate.weight": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.down_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.down_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.down_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.gate_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.gate_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.gate_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.up_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.up_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.shared_experts.up_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.down_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.down_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.down_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.gate_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.gate_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.gate_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.up_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.up_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.13.mlp.switch_mlp.up_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00014-of-00091.safetensors", + "model.layers.13.self_attn.embed_q.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.embed_q.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.embed_q.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.k_norm.bias": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.k_norm.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.weights_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.weights_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.weights_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.wk.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.wk.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.wk.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.wq_b.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.wq_b.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.indexer.wq_b.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.kv_a_layernorm.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.kv_a_proj_with_mqa.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.kv_a_proj_with_mqa.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.kv_a_proj_with_mqa.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.o_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.o_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.q_a_layernorm.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.q_a_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.q_a_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.q_a_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.q_b_proj.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.q_b_proj.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.q_b_proj.weight": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.unembed_out.biases": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.unembed_out.scales": "model-00013-of-00091.safetensors", + "model.layers.13.self_attn.unembed_out.weight": "model-00013-of-00091.safetensors", + "model.layers.14.input_layernorm.weight": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.gate.e_score_correction_bias": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.gate.weight": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.down_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.down_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.down_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.gate_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.gate_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.gate_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.up_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.up_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.shared_experts.up_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.down_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.down_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.down_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.gate_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.gate_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.gate_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.up_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.up_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.14.mlp.switch_mlp.up_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00015-of-00091.safetensors", + "model.layers.14.self_attn.embed_q.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.embed_q.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.embed_q.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.k_norm.bias": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.k_norm.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.weights_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.weights_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.weights_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.wk.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.wk.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.wk.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.wq_b.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.wq_b.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.indexer.wq_b.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.kv_a_layernorm.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.kv_a_proj_with_mqa.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.kv_a_proj_with_mqa.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.kv_a_proj_with_mqa.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.o_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.o_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.q_a_layernorm.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.q_a_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.q_a_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.q_a_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.q_b_proj.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.q_b_proj.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.q_b_proj.weight": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.unembed_out.biases": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.unembed_out.scales": "model-00014-of-00091.safetensors", + "model.layers.14.self_attn.unembed_out.weight": "model-00014-of-00091.safetensors", + "model.layers.15.input_layernorm.weight": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.gate.e_score_correction_bias": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.gate.weight": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.down_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.down_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.down_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.gate_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.gate_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.gate_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.up_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.up_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.shared_experts.up_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.down_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.down_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.down_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.gate_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.gate_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.gate_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.up_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.up_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.15.mlp.switch_mlp.up_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00016-of-00091.safetensors", + "model.layers.15.self_attn.embed_q.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.embed_q.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.embed_q.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.k_norm.bias": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.k_norm.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.weights_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.weights_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.weights_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.wk.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.wk.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.wk.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.wq_b.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.wq_b.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.indexer.wq_b.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.kv_a_layernorm.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.kv_a_proj_with_mqa.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.kv_a_proj_with_mqa.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.kv_a_proj_with_mqa.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.o_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.o_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.q_a_layernorm.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.q_a_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.q_a_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.q_a_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.q_b_proj.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.q_b_proj.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.q_b_proj.weight": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.unembed_out.biases": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.unembed_out.scales": "model-00015-of-00091.safetensors", + "model.layers.15.self_attn.unembed_out.weight": "model-00015-of-00091.safetensors", + "model.layers.16.input_layernorm.weight": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.gate.e_score_correction_bias": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.gate.weight": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.down_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.down_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.down_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.gate_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.gate_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.gate_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.up_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.up_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.shared_experts.up_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.down_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.down_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.down_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.gate_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.gate_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.gate_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.up_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.up_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.16.mlp.switch_mlp.up_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00017-of-00091.safetensors", + "model.layers.16.self_attn.embed_q.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.embed_q.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.embed_q.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.k_norm.bias": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.k_norm.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.weights_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.weights_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.weights_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.wk.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.wk.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.wk.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.wq_b.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.wq_b.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.indexer.wq_b.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.kv_a_layernorm.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.kv_a_proj_with_mqa.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.kv_a_proj_with_mqa.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.kv_a_proj_with_mqa.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.o_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.o_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.q_a_layernorm.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.q_a_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.q_a_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.q_a_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.q_b_proj.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.q_b_proj.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.q_b_proj.weight": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.unembed_out.biases": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.unembed_out.scales": "model-00016-of-00091.safetensors", + "model.layers.16.self_attn.unembed_out.weight": "model-00016-of-00091.safetensors", + "model.layers.17.input_layernorm.weight": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.gate.e_score_correction_bias": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.gate.weight": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.down_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.down_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.down_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.gate_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.gate_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.gate_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.up_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.up_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.shared_experts.up_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.down_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.down_proj.scales": "model-00018-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.down_proj.weight": "model-00018-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.gate_proj.biases": "model-00018-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.gate_proj.scales": "model-00018-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.gate_proj.weight": "model-00018-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.up_proj.biases": "model-00018-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.up_proj.scales": "model-00018-of-00091.safetensors", + "model.layers.17.mlp.switch_mlp.up_proj.weight": "model-00018-of-00091.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00019-of-00091.safetensors", + "model.layers.17.self_attn.embed_q.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.embed_q.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.embed_q.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.k_norm.bias": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.k_norm.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.weights_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.weights_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.weights_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.wk.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.wk.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.wk.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.wq_b.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.wq_b.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.indexer.wq_b.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.kv_a_layernorm.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.kv_a_proj_with_mqa.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.kv_a_proj_with_mqa.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.kv_a_proj_with_mqa.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.o_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.o_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.q_a_layernorm.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.q_a_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.q_a_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.q_a_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.q_b_proj.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.q_b_proj.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.q_b_proj.weight": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.unembed_out.biases": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.unembed_out.scales": "model-00017-of-00091.safetensors", + "model.layers.17.self_attn.unembed_out.weight": "model-00017-of-00091.safetensors", + "model.layers.18.input_layernorm.weight": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.gate.e_score_correction_bias": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.gate.weight": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.down_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.down_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.down_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.gate_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.gate_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.gate_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.up_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.up_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.shared_experts.up_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.down_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.down_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.down_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.gate_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.gate_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.gate_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.up_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.up_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.18.mlp.switch_mlp.up_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00020-of-00091.safetensors", + "model.layers.18.self_attn.embed_q.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.embed_q.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.embed_q.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.k_norm.bias": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.k_norm.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.weights_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.weights_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.weights_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.wk.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.wk.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.wk.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.wq_b.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.wq_b.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.indexer.wq_b.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.kv_a_layernorm.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.kv_a_proj_with_mqa.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.kv_a_proj_with_mqa.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.kv_a_proj_with_mqa.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.o_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.o_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.q_a_layernorm.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.q_a_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.q_a_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.q_a_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.q_b_proj.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.q_b_proj.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.q_b_proj.weight": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.unembed_out.biases": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.unembed_out.scales": "model-00019-of-00091.safetensors", + "model.layers.18.self_attn.unembed_out.weight": "model-00019-of-00091.safetensors", + "model.layers.19.input_layernorm.weight": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.gate.e_score_correction_bias": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.gate.weight": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.down_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.down_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.down_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.gate_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.gate_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.gate_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.up_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.up_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.shared_experts.up_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.down_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.down_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.down_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.gate_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.gate_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.gate_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.up_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.up_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.19.mlp.switch_mlp.up_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00021-of-00091.safetensors", + "model.layers.19.self_attn.embed_q.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.embed_q.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.embed_q.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.k_norm.bias": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.k_norm.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.weights_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.weights_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.weights_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.wk.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.wk.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.wk.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.wq_b.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.wq_b.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.indexer.wq_b.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.kv_a_layernorm.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.kv_a_proj_with_mqa.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.kv_a_proj_with_mqa.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.kv_a_proj_with_mqa.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.o_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.o_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.q_a_layernorm.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.q_a_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.q_a_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.q_a_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.q_b_proj.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.q_b_proj.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.q_b_proj.weight": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.unembed_out.biases": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.unembed_out.scales": "model-00020-of-00091.safetensors", + "model.layers.19.self_attn.unembed_out.weight": "model-00020-of-00091.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.down_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.down_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.gate_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.gate_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.up_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.up_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.embed_q.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.embed_q.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.embed_q.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.k_norm.bias": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.k_norm.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.weights_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.weights_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.weights_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.wk.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.wk.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.wk.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.wq_b.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.wq_b.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.indexer.wq_b.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.kv_a_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.o_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.o_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.q_a_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.q_a_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.q_a_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.q_a_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.q_b_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.q_b_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.q_b_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.unembed_out.biases": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.unembed_out.scales": "model-00001-of-00091.safetensors", + "model.layers.2.self_attn.unembed_out.weight": "model-00001-of-00091.safetensors", + "model.layers.20.input_layernorm.weight": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.gate.e_score_correction_bias": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.gate.weight": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.down_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.down_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.down_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.gate_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.gate_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.gate_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.up_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.up_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.shared_experts.up_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.down_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.down_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.down_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.gate_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.gate_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.gate_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.up_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.up_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.20.mlp.switch_mlp.up_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00022-of-00091.safetensors", + "model.layers.20.self_attn.embed_q.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.embed_q.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.embed_q.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.k_norm.bias": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.k_norm.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.weights_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.weights_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.weights_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.wk.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.wk.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.wk.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.wq_b.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.wq_b.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.indexer.wq_b.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.kv_a_layernorm.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.kv_a_proj_with_mqa.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.kv_a_proj_with_mqa.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.kv_a_proj_with_mqa.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.o_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.o_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.q_a_layernorm.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.q_a_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.q_a_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.q_a_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.q_b_proj.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.q_b_proj.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.q_b_proj.weight": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.unembed_out.biases": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.unembed_out.scales": "model-00021-of-00091.safetensors", + "model.layers.20.self_attn.unembed_out.weight": "model-00021-of-00091.safetensors", + "model.layers.21.input_layernorm.weight": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.gate.e_score_correction_bias": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.gate.weight": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.down_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.down_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.down_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.gate_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.gate_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.gate_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.up_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.up_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.shared_experts.up_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.down_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.down_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.down_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.gate_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.gate_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.gate_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.up_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.up_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.21.mlp.switch_mlp.up_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00023-of-00091.safetensors", + "model.layers.21.self_attn.embed_q.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.embed_q.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.embed_q.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.k_norm.bias": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.k_norm.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.weights_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.weights_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.weights_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.wk.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.wk.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.wk.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.wq_b.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.wq_b.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.indexer.wq_b.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.kv_a_layernorm.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.kv_a_proj_with_mqa.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.kv_a_proj_with_mqa.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.kv_a_proj_with_mqa.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.o_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.o_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.q_a_layernorm.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.q_a_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.q_a_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.q_a_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.q_b_proj.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.q_b_proj.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.q_b_proj.weight": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.unembed_out.biases": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.unembed_out.scales": "model-00022-of-00091.safetensors", + "model.layers.21.self_attn.unembed_out.weight": "model-00022-of-00091.safetensors", + "model.layers.22.input_layernorm.weight": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.gate.e_score_correction_bias": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.gate.weight": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.down_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.down_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.down_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.gate_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.gate_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.gate_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.up_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.up_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.shared_experts.up_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.down_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.down_proj.scales": "model-00024-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.down_proj.weight": "model-00024-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.gate_proj.biases": "model-00024-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.gate_proj.scales": "model-00024-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.gate_proj.weight": "model-00024-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.up_proj.biases": "model-00024-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.up_proj.scales": "model-00024-of-00091.safetensors", + "model.layers.22.mlp.switch_mlp.up_proj.weight": "model-00024-of-00091.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00025-of-00091.safetensors", + "model.layers.22.self_attn.embed_q.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.embed_q.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.embed_q.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.k_norm.bias": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.k_norm.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.weights_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.weights_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.weights_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.wk.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.wk.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.wk.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.wq_b.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.wq_b.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.indexer.wq_b.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.kv_a_layernorm.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.kv_a_proj_with_mqa.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.kv_a_proj_with_mqa.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.kv_a_proj_with_mqa.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.o_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.o_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.q_a_layernorm.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.q_a_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.q_a_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.q_a_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.q_b_proj.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.q_b_proj.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.q_b_proj.weight": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.unembed_out.biases": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.unembed_out.scales": "model-00023-of-00091.safetensors", + "model.layers.22.self_attn.unembed_out.weight": "model-00023-of-00091.safetensors", + "model.layers.23.input_layernorm.weight": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.gate.e_score_correction_bias": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.gate.weight": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.down_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.down_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.down_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.gate_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.gate_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.gate_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.up_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.up_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.shared_experts.up_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.down_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.down_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.down_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.gate_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.gate_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.gate_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.up_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.up_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.23.mlp.switch_mlp.up_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00026-of-00091.safetensors", + "model.layers.23.self_attn.embed_q.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.embed_q.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.embed_q.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.k_norm.bias": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.k_norm.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.weights_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.weights_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.weights_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.wk.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.wk.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.wk.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.wq_b.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.wq_b.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.indexer.wq_b.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.kv_a_layernorm.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.kv_a_proj_with_mqa.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.kv_a_proj_with_mqa.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.kv_a_proj_with_mqa.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.o_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.o_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.q_a_layernorm.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.q_a_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.q_a_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.q_a_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.q_b_proj.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.q_b_proj.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.q_b_proj.weight": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.unembed_out.biases": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.unembed_out.scales": "model-00025-of-00091.safetensors", + "model.layers.23.self_attn.unembed_out.weight": "model-00025-of-00091.safetensors", + "model.layers.24.input_layernorm.weight": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.gate.e_score_correction_bias": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.gate.weight": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.down_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.down_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.down_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.gate_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.gate_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.gate_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.up_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.up_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.shared_experts.up_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.down_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.down_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.down_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.gate_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.gate_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.gate_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.up_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.up_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.24.mlp.switch_mlp.up_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00027-of-00091.safetensors", + "model.layers.24.self_attn.embed_q.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.embed_q.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.embed_q.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.k_norm.bias": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.k_norm.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.weights_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.weights_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.weights_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.wk.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.wk.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.wk.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.wq_b.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.wq_b.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.indexer.wq_b.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.kv_a_layernorm.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.kv_a_proj_with_mqa.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.kv_a_proj_with_mqa.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.kv_a_proj_with_mqa.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.o_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.o_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.q_a_layernorm.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.q_a_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.q_a_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.q_a_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.q_b_proj.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.q_b_proj.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.q_b_proj.weight": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.unembed_out.biases": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.unembed_out.scales": "model-00026-of-00091.safetensors", + "model.layers.24.self_attn.unembed_out.weight": "model-00026-of-00091.safetensors", + "model.layers.25.input_layernorm.weight": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.gate.e_score_correction_bias": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.gate.weight": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.down_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.down_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.down_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.gate_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.gate_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.gate_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.up_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.up_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.shared_experts.up_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.down_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.down_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.down_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.gate_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.gate_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.gate_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.up_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.up_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.25.mlp.switch_mlp.up_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00028-of-00091.safetensors", + "model.layers.25.self_attn.embed_q.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.embed_q.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.embed_q.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.k_norm.bias": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.k_norm.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.weights_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.weights_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.weights_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.wk.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.wk.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.wk.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.wq_b.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.wq_b.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.indexer.wq_b.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.kv_a_layernorm.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.kv_a_proj_with_mqa.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.kv_a_proj_with_mqa.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.kv_a_proj_with_mqa.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.o_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.o_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.q_a_layernorm.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.q_a_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.q_a_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.q_a_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.q_b_proj.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.q_b_proj.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.q_b_proj.weight": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.unembed_out.biases": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.unembed_out.scales": "model-00027-of-00091.safetensors", + "model.layers.25.self_attn.unembed_out.weight": "model-00027-of-00091.safetensors", + "model.layers.26.input_layernorm.weight": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.gate.e_score_correction_bias": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.gate.weight": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.down_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.down_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.down_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.gate_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.gate_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.gate_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.up_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.up_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.shared_experts.up_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.down_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.down_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.down_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.gate_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.gate_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.gate_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.up_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.up_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.26.mlp.switch_mlp.up_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00029-of-00091.safetensors", + "model.layers.26.self_attn.embed_q.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.embed_q.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.embed_q.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.k_norm.bias": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.k_norm.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.weights_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.weights_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.weights_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.wk.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.wk.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.wk.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.wq_b.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.wq_b.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.indexer.wq_b.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.kv_a_layernorm.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.kv_a_proj_with_mqa.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.kv_a_proj_with_mqa.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.kv_a_proj_with_mqa.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.o_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.o_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.q_a_layernorm.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.q_a_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.q_a_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.q_a_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.q_b_proj.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.q_b_proj.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.q_b_proj.weight": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.unembed_out.biases": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.unembed_out.scales": "model-00028-of-00091.safetensors", + "model.layers.26.self_attn.unembed_out.weight": "model-00028-of-00091.safetensors", + "model.layers.27.input_layernorm.weight": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.gate.e_score_correction_bias": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.gate.weight": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.down_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.down_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.down_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.gate_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.gate_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.gate_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.up_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.up_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.shared_experts.up_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.down_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.down_proj.scales": "model-00030-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.down_proj.weight": "model-00030-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.gate_proj.biases": "model-00030-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.gate_proj.scales": "model-00030-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.gate_proj.weight": "model-00030-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.up_proj.biases": "model-00030-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.up_proj.scales": "model-00030-of-00091.safetensors", + "model.layers.27.mlp.switch_mlp.up_proj.weight": "model-00030-of-00091.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00031-of-00091.safetensors", + "model.layers.27.self_attn.embed_q.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.embed_q.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.embed_q.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.k_norm.bias": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.k_norm.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.weights_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.weights_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.weights_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.wk.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.wk.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.wk.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.wq_b.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.wq_b.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.indexer.wq_b.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.kv_a_layernorm.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.kv_a_proj_with_mqa.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.kv_a_proj_with_mqa.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.kv_a_proj_with_mqa.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.o_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.o_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.q_a_layernorm.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.q_a_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.q_a_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.q_a_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.q_b_proj.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.q_b_proj.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.q_b_proj.weight": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.unembed_out.biases": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.unembed_out.scales": "model-00029-of-00091.safetensors", + "model.layers.27.self_attn.unembed_out.weight": "model-00029-of-00091.safetensors", + "model.layers.28.input_layernorm.weight": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.gate.e_score_correction_bias": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.gate.weight": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.down_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.down_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.down_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.gate_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.gate_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.gate_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.up_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.up_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.shared_experts.up_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.down_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.down_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.down_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.gate_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.gate_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.gate_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.up_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.up_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.28.mlp.switch_mlp.up_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00032-of-00091.safetensors", + "model.layers.28.self_attn.embed_q.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.embed_q.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.embed_q.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.k_norm.bias": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.k_norm.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.weights_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.weights_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.weights_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.wk.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.wk.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.wk.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.wq_b.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.wq_b.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.indexer.wq_b.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.kv_a_layernorm.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.kv_a_proj_with_mqa.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.kv_a_proj_with_mqa.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.kv_a_proj_with_mqa.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.o_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.o_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.q_a_layernorm.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.q_a_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.q_a_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.q_a_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.q_b_proj.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.q_b_proj.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.q_b_proj.weight": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.unembed_out.biases": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.unembed_out.scales": "model-00031-of-00091.safetensors", + "model.layers.28.self_attn.unembed_out.weight": "model-00031-of-00091.safetensors", + "model.layers.29.input_layernorm.weight": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.gate.e_score_correction_bias": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.gate.weight": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.down_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.down_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.down_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.gate_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.gate_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.gate_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.up_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.up_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.shared_experts.up_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.down_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.down_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.down_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.gate_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.gate_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.gate_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.up_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.up_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.29.mlp.switch_mlp.up_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00033-of-00091.safetensors", + "model.layers.29.self_attn.embed_q.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.embed_q.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.embed_q.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.k_norm.bias": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.k_norm.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.weights_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.weights_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.weights_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.wk.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.wk.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.wk.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.wq_b.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.wq_b.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.indexer.wq_b.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.kv_a_layernorm.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.kv_a_proj_with_mqa.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.kv_a_proj_with_mqa.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.kv_a_proj_with_mqa.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.o_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.o_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.q_a_layernorm.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.q_a_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.q_a_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.q_a_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.q_b_proj.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.q_b_proj.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.q_b_proj.weight": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.unembed_out.biases": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.unembed_out.scales": "model-00032-of-00091.safetensors", + "model.layers.29.self_attn.unembed_out.weight": "model-00032-of-00091.safetensors", + "model.layers.3.input_layernorm.weight": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.gate.e_score_correction_bias": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.gate.weight": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.down_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.down_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.down_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.gate_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.gate_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.gate_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.up_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.up_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.shared_experts.up_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.down_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.down_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.down_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.gate_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.gate_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.gate_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.up_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.up_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.3.mlp.switch_mlp.up_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00091.safetensors", + "model.layers.3.self_attn.embed_q.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.embed_q.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.embed_q.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.k_norm.bias": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.k_norm.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.weights_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.weights_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.weights_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.wk.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.wk.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.wk.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.wq_b.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.wq_b.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.indexer.wq_b.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.kv_a_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.kv_a_proj_with_mqa.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.kv_a_proj_with_mqa.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.o_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.o_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.q_a_layernorm.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.q_a_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.q_a_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.q_a_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.q_b_proj.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.q_b_proj.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.q_b_proj.weight": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.unembed_out.biases": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.unembed_out.scales": "model-00001-of-00091.safetensors", + "model.layers.3.self_attn.unembed_out.weight": "model-00001-of-00091.safetensors", + "model.layers.30.input_layernorm.weight": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.gate.e_score_correction_bias": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.gate.weight": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.down_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.down_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.down_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.gate_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.gate_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.gate_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.up_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.up_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.shared_experts.up_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.down_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.down_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.down_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.gate_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.gate_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.gate_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.up_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.up_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.30.mlp.switch_mlp.up_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00034-of-00091.safetensors", + "model.layers.30.self_attn.embed_q.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.embed_q.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.embed_q.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.k_norm.bias": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.k_norm.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.weights_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.weights_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.weights_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.wk.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.wk.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.wk.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.wq_b.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.wq_b.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.indexer.wq_b.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.kv_a_layernorm.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.kv_a_proj_with_mqa.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.kv_a_proj_with_mqa.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.kv_a_proj_with_mqa.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.o_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.o_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.q_a_layernorm.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.q_a_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.q_a_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.q_a_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.q_b_proj.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.q_b_proj.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.q_b_proj.weight": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.unembed_out.biases": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.unembed_out.scales": "model-00033-of-00091.safetensors", + "model.layers.30.self_attn.unembed_out.weight": "model-00033-of-00091.safetensors", + "model.layers.31.input_layernorm.weight": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.gate.e_score_correction_bias": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.gate.weight": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.down_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.down_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.down_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.gate_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.gate_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.gate_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.up_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.up_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.shared_experts.up_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.down_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.down_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.down_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.gate_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.gate_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.gate_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.up_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.up_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.31.mlp.switch_mlp.up_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00035-of-00091.safetensors", + "model.layers.31.self_attn.embed_q.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.embed_q.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.embed_q.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.k_norm.bias": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.k_norm.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.weights_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.weights_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.weights_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.wk.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.wk.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.wk.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.wq_b.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.wq_b.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.indexer.wq_b.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.kv_a_layernorm.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.kv_a_proj_with_mqa.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.kv_a_proj_with_mqa.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.kv_a_proj_with_mqa.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.o_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.o_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.q_a_layernorm.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.q_a_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.q_a_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.q_a_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.q_b_proj.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.q_b_proj.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.q_b_proj.weight": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.unembed_out.biases": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.unembed_out.scales": "model-00034-of-00091.safetensors", + "model.layers.31.self_attn.unembed_out.weight": "model-00034-of-00091.safetensors", + "model.layers.32.input_layernorm.weight": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.gate.e_score_correction_bias": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.gate.weight": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.down_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.down_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.down_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.gate_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.gate_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.gate_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.up_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.up_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.shared_experts.up_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.down_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.down_proj.scales": "model-00036-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.down_proj.weight": "model-00036-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.gate_proj.biases": "model-00036-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.gate_proj.scales": "model-00036-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.gate_proj.weight": "model-00036-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.up_proj.biases": "model-00036-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.up_proj.scales": "model-00036-of-00091.safetensors", + "model.layers.32.mlp.switch_mlp.up_proj.weight": "model-00036-of-00091.safetensors", + "model.layers.32.post_attention_layernorm.weight": "model-00037-of-00091.safetensors", + "model.layers.32.self_attn.embed_q.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.embed_q.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.embed_q.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.k_norm.bias": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.k_norm.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.weights_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.weights_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.weights_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.wk.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.wk.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.wk.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.wq_b.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.wq_b.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.indexer.wq_b.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.kv_a_layernorm.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.kv_a_proj_with_mqa.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.kv_a_proj_with_mqa.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.kv_a_proj_with_mqa.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.o_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.o_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.o_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.q_a_layernorm.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.q_a_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.q_a_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.q_a_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.q_b_proj.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.q_b_proj.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.q_b_proj.weight": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.unembed_out.biases": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.unembed_out.scales": "model-00035-of-00091.safetensors", + "model.layers.32.self_attn.unembed_out.weight": "model-00035-of-00091.safetensors", + "model.layers.33.input_layernorm.weight": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.gate.e_score_correction_bias": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.gate.weight": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.down_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.down_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.down_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.gate_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.gate_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.gate_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.up_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.up_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.shared_experts.up_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.down_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.down_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.down_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.gate_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.gate_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.gate_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.up_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.up_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.33.mlp.switch_mlp.up_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.33.post_attention_layernorm.weight": "model-00038-of-00091.safetensors", + "model.layers.33.self_attn.embed_q.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.embed_q.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.embed_q.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.k_norm.bias": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.k_norm.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.weights_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.weights_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.weights_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.wk.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.wk.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.wk.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.wq_b.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.wq_b.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.indexer.wq_b.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.kv_a_layernorm.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.kv_a_proj_with_mqa.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.kv_a_proj_with_mqa.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.kv_a_proj_with_mqa.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.o_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.o_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.o_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.q_a_layernorm.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.q_a_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.q_a_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.q_a_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.q_b_proj.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.q_b_proj.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.q_b_proj.weight": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.unembed_out.biases": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.unembed_out.scales": "model-00037-of-00091.safetensors", + "model.layers.33.self_attn.unembed_out.weight": "model-00037-of-00091.safetensors", + "model.layers.34.input_layernorm.weight": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.gate.e_score_correction_bias": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.gate.weight": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.down_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.down_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.down_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.gate_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.gate_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.gate_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.up_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.up_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.shared_experts.up_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.down_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.down_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.down_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.gate_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.gate_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.gate_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.up_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.up_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.34.mlp.switch_mlp.up_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.34.post_attention_layernorm.weight": "model-00039-of-00091.safetensors", + "model.layers.34.self_attn.embed_q.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.embed_q.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.embed_q.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.k_norm.bias": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.k_norm.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.weights_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.weights_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.weights_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.wk.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.wk.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.wk.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.wq_b.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.wq_b.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.indexer.wq_b.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.kv_a_layernorm.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.kv_a_proj_with_mqa.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.kv_a_proj_with_mqa.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.kv_a_proj_with_mqa.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.o_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.o_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.o_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.q_a_layernorm.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.q_a_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.q_a_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.q_a_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.q_b_proj.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.q_b_proj.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.q_b_proj.weight": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.unembed_out.biases": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.unembed_out.scales": "model-00038-of-00091.safetensors", + "model.layers.34.self_attn.unembed_out.weight": "model-00038-of-00091.safetensors", + "model.layers.35.input_layernorm.weight": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.gate.e_score_correction_bias": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.gate.weight": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.down_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.down_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.down_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.gate_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.gate_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.gate_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.up_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.up_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.shared_experts.up_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.down_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.down_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.down_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.gate_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.gate_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.gate_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.up_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.up_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.35.mlp.switch_mlp.up_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.35.post_attention_layernorm.weight": "model-00040-of-00091.safetensors", + "model.layers.35.self_attn.embed_q.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.embed_q.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.embed_q.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.k_norm.bias": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.k_norm.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.weights_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.weights_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.weights_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.wk.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.wk.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.wk.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.wq_b.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.wq_b.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.indexer.wq_b.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.kv_a_layernorm.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.kv_a_proj_with_mqa.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.kv_a_proj_with_mqa.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.kv_a_proj_with_mqa.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.o_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.o_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.o_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.q_a_layernorm.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.q_a_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.q_a_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.q_a_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.q_b_proj.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.q_b_proj.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.q_b_proj.weight": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.unembed_out.biases": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.unembed_out.scales": "model-00039-of-00091.safetensors", + "model.layers.35.self_attn.unembed_out.weight": "model-00039-of-00091.safetensors", + "model.layers.36.input_layernorm.weight": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.gate.e_score_correction_bias": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.gate.weight": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.down_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.down_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.down_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.gate_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.gate_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.gate_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.up_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.up_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.shared_experts.up_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.down_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.down_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.down_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.gate_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.gate_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.gate_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.up_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.up_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.36.mlp.switch_mlp.up_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.36.post_attention_layernorm.weight": "model-00041-of-00091.safetensors", + "model.layers.36.self_attn.embed_q.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.embed_q.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.embed_q.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.k_norm.bias": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.k_norm.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.weights_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.weights_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.weights_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.wk.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.wk.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.wk.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.wq_b.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.wq_b.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.indexer.wq_b.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.kv_a_layernorm.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.kv_a_proj_with_mqa.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.kv_a_proj_with_mqa.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.kv_a_proj_with_mqa.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.o_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.o_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.o_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.q_a_layernorm.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.q_a_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.q_a_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.q_a_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.q_b_proj.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.q_b_proj.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.q_b_proj.weight": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.unembed_out.biases": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.unembed_out.scales": "model-00040-of-00091.safetensors", + "model.layers.36.self_attn.unembed_out.weight": "model-00040-of-00091.safetensors", + "model.layers.37.input_layernorm.weight": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.gate.e_score_correction_bias": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.gate.weight": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.down_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.down_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.down_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.gate_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.gate_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.gate_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.up_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.up_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.shared_experts.up_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.down_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.down_proj.scales": "model-00042-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.down_proj.weight": "model-00042-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.gate_proj.biases": "model-00042-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.gate_proj.scales": "model-00042-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.gate_proj.weight": "model-00042-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.up_proj.biases": "model-00042-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.up_proj.scales": "model-00042-of-00091.safetensors", + "model.layers.37.mlp.switch_mlp.up_proj.weight": "model-00042-of-00091.safetensors", + "model.layers.37.post_attention_layernorm.weight": "model-00043-of-00091.safetensors", + "model.layers.37.self_attn.embed_q.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.embed_q.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.embed_q.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.k_norm.bias": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.k_norm.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.weights_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.weights_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.weights_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.wk.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.wk.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.wk.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.wq_b.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.wq_b.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.indexer.wq_b.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.kv_a_layernorm.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.kv_a_proj_with_mqa.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.kv_a_proj_with_mqa.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.kv_a_proj_with_mqa.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.o_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.o_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.o_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.q_a_layernorm.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.q_a_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.q_a_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.q_a_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.q_b_proj.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.q_b_proj.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.q_b_proj.weight": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.unembed_out.biases": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.unembed_out.scales": "model-00041-of-00091.safetensors", + "model.layers.37.self_attn.unembed_out.weight": "model-00041-of-00091.safetensors", + "model.layers.38.input_layernorm.weight": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.gate.e_score_correction_bias": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.gate.weight": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.down_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.down_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.down_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.gate_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.gate_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.gate_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.up_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.up_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.shared_experts.up_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.down_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.down_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.down_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.gate_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.gate_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.gate_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.up_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.up_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.38.mlp.switch_mlp.up_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.38.post_attention_layernorm.weight": "model-00044-of-00091.safetensors", + "model.layers.38.self_attn.embed_q.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.embed_q.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.embed_q.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.k_norm.bias": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.k_norm.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.weights_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.weights_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.weights_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.wk.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.wk.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.wk.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.wq_b.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.wq_b.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.indexer.wq_b.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.kv_a_layernorm.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.kv_a_proj_with_mqa.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.kv_a_proj_with_mqa.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.kv_a_proj_with_mqa.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.o_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.o_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.o_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.q_a_layernorm.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.q_a_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.q_a_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.q_a_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.q_b_proj.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.q_b_proj.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.q_b_proj.weight": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.unembed_out.biases": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.unembed_out.scales": "model-00043-of-00091.safetensors", + "model.layers.38.self_attn.unembed_out.weight": "model-00043-of-00091.safetensors", + "model.layers.39.input_layernorm.weight": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.gate.e_score_correction_bias": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.gate.weight": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.down_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.down_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.down_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.gate_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.gate_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.gate_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.up_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.up_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.shared_experts.up_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.down_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.down_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.down_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.gate_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.gate_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.gate_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.up_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.up_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.39.mlp.switch_mlp.up_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.39.post_attention_layernorm.weight": "model-00045-of-00091.safetensors", + "model.layers.39.self_attn.embed_q.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.embed_q.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.embed_q.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.k_norm.bias": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.k_norm.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.weights_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.weights_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.weights_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.wk.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.wk.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.wk.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.wq_b.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.wq_b.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.indexer.wq_b.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.kv_a_layernorm.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.kv_a_proj_with_mqa.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.kv_a_proj_with_mqa.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.kv_a_proj_with_mqa.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.o_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.o_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.o_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.q_a_layernorm.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.q_a_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.q_a_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.q_a_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.q_b_proj.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.q_b_proj.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.q_b_proj.weight": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.unembed_out.biases": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.unembed_out.scales": "model-00044-of-00091.safetensors", + "model.layers.39.self_attn.unembed_out.weight": "model-00044-of-00091.safetensors", + "model.layers.4.input_layernorm.weight": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.gate.e_score_correction_bias": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.gate.weight": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.down_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.down_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.down_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.gate_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.gate_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.gate_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.up_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.up_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.shared_experts.up_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.down_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.down_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.down_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.gate_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.gate_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.gate_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.up_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.up_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.4.mlp.switch_mlp.up_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00091.safetensors", + "model.layers.4.self_attn.embed_q.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.embed_q.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.embed_q.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.k_norm.bias": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.k_norm.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.weights_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.weights_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.weights_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.wk.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.wk.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.wk.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.wq_b.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.wq_b.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.indexer.wq_b.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.kv_a_layernorm.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.kv_a_proj_with_mqa.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.kv_a_proj_with_mqa.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.kv_a_proj_with_mqa.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.o_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.o_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.q_a_layernorm.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.q_a_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.q_a_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.q_a_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.q_b_proj.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.q_b_proj.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.q_b_proj.weight": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.unembed_out.biases": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.unembed_out.scales": "model-00002-of-00091.safetensors", + "model.layers.4.self_attn.unembed_out.weight": "model-00002-of-00091.safetensors", + "model.layers.40.input_layernorm.weight": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.gate.e_score_correction_bias": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.gate.weight": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.down_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.down_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.down_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.gate_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.gate_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.gate_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.up_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.up_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.shared_experts.up_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.down_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.down_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.down_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.gate_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.gate_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.gate_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.up_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.up_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.40.mlp.switch_mlp.up_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.40.post_attention_layernorm.weight": "model-00046-of-00091.safetensors", + "model.layers.40.self_attn.embed_q.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.embed_q.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.embed_q.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.k_norm.bias": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.k_norm.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.weights_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.weights_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.weights_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.wk.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.wk.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.wk.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.wq_b.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.wq_b.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.indexer.wq_b.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.kv_a_layernorm.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.kv_a_proj_with_mqa.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.kv_a_proj_with_mqa.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.kv_a_proj_with_mqa.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.o_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.o_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.o_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.q_a_layernorm.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.q_a_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.q_a_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.q_a_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.q_b_proj.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.q_b_proj.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.q_b_proj.weight": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.unembed_out.biases": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.unembed_out.scales": "model-00045-of-00091.safetensors", + "model.layers.40.self_attn.unembed_out.weight": "model-00045-of-00091.safetensors", + "model.layers.41.input_layernorm.weight": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.gate.e_score_correction_bias": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.gate.weight": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.down_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.down_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.down_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.gate_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.gate_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.gate_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.up_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.up_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.shared_experts.up_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.down_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.down_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.down_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.gate_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.gate_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.gate_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.up_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.up_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.41.mlp.switch_mlp.up_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.41.post_attention_layernorm.weight": "model-00047-of-00091.safetensors", + "model.layers.41.self_attn.embed_q.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.embed_q.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.embed_q.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.k_norm.bias": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.k_norm.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.weights_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.weights_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.weights_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.wk.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.wk.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.wk.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.wq_b.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.wq_b.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.indexer.wq_b.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.kv_a_layernorm.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.kv_a_proj_with_mqa.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.kv_a_proj_with_mqa.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.kv_a_proj_with_mqa.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.o_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.o_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.o_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.q_a_layernorm.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.q_a_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.q_a_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.q_a_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.q_b_proj.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.q_b_proj.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.q_b_proj.weight": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.unembed_out.biases": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.unembed_out.scales": "model-00046-of-00091.safetensors", + "model.layers.41.self_attn.unembed_out.weight": "model-00046-of-00091.safetensors", + "model.layers.42.input_layernorm.weight": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.gate.e_score_correction_bias": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.gate.weight": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.down_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.down_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.down_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.gate_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.gate_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.gate_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.up_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.up_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.shared_experts.up_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.down_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.down_proj.scales": "model-00048-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.down_proj.weight": "model-00048-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.gate_proj.biases": "model-00048-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.gate_proj.scales": "model-00048-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.gate_proj.weight": "model-00048-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.up_proj.biases": "model-00048-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.up_proj.scales": "model-00048-of-00091.safetensors", + "model.layers.42.mlp.switch_mlp.up_proj.weight": "model-00048-of-00091.safetensors", + "model.layers.42.post_attention_layernorm.weight": "model-00049-of-00091.safetensors", + "model.layers.42.self_attn.embed_q.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.embed_q.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.embed_q.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.k_norm.bias": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.k_norm.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.weights_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.weights_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.weights_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.wk.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.wk.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.wk.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.wq_b.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.wq_b.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.indexer.wq_b.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.kv_a_layernorm.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.kv_a_proj_with_mqa.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.kv_a_proj_with_mqa.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.kv_a_proj_with_mqa.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.o_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.o_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.o_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.q_a_layernorm.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.q_a_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.q_a_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.q_a_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.q_b_proj.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.q_b_proj.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.q_b_proj.weight": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.unembed_out.biases": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.unembed_out.scales": "model-00047-of-00091.safetensors", + "model.layers.42.self_attn.unembed_out.weight": "model-00047-of-00091.safetensors", + "model.layers.43.input_layernorm.weight": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.gate.e_score_correction_bias": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.gate.weight": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.down_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.down_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.down_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.gate_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.gate_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.gate_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.up_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.up_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.shared_experts.up_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.down_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.down_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.down_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.gate_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.gate_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.gate_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.up_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.up_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.43.mlp.switch_mlp.up_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.43.post_attention_layernorm.weight": "model-00050-of-00091.safetensors", + "model.layers.43.self_attn.embed_q.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.embed_q.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.embed_q.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.k_norm.bias": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.k_norm.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.weights_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.weights_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.weights_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.wk.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.wk.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.wk.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.wq_b.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.wq_b.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.indexer.wq_b.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.kv_a_layernorm.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.kv_a_proj_with_mqa.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.kv_a_proj_with_mqa.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.kv_a_proj_with_mqa.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.o_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.o_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.o_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.q_a_layernorm.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.q_a_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.q_a_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.q_a_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.q_b_proj.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.q_b_proj.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.q_b_proj.weight": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.unembed_out.biases": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.unembed_out.scales": "model-00049-of-00091.safetensors", + "model.layers.43.self_attn.unembed_out.weight": "model-00049-of-00091.safetensors", + "model.layers.44.input_layernorm.weight": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.gate.e_score_correction_bias": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.gate.weight": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.down_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.down_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.down_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.gate_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.gate_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.gate_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.up_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.up_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.shared_experts.up_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.down_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.down_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.down_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.gate_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.gate_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.gate_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.up_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.up_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.44.mlp.switch_mlp.up_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.44.post_attention_layernorm.weight": "model-00051-of-00091.safetensors", + "model.layers.44.self_attn.embed_q.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.embed_q.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.embed_q.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.k_norm.bias": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.k_norm.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.weights_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.weights_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.weights_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.wk.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.wk.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.wk.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.wq_b.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.wq_b.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.indexer.wq_b.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.kv_a_layernorm.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.kv_a_proj_with_mqa.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.kv_a_proj_with_mqa.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.kv_a_proj_with_mqa.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.o_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.o_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.o_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.q_a_layernorm.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.q_a_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.q_a_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.q_a_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.q_b_proj.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.q_b_proj.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.q_b_proj.weight": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.unembed_out.biases": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.unembed_out.scales": "model-00050-of-00091.safetensors", + "model.layers.44.self_attn.unembed_out.weight": "model-00050-of-00091.safetensors", + "model.layers.45.input_layernorm.weight": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.gate.e_score_correction_bias": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.gate.weight": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.down_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.down_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.down_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.gate_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.gate_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.gate_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.up_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.up_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.shared_experts.up_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.down_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.down_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.down_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.gate_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.gate_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.gate_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.up_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.up_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.45.mlp.switch_mlp.up_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.45.post_attention_layernorm.weight": "model-00052-of-00091.safetensors", + "model.layers.45.self_attn.embed_q.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.embed_q.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.embed_q.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.k_norm.bias": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.k_norm.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.weights_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.weights_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.weights_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.wk.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.wk.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.wk.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.wq_b.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.wq_b.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.indexer.wq_b.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.kv_a_layernorm.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.kv_a_proj_with_mqa.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.kv_a_proj_with_mqa.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.kv_a_proj_with_mqa.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.o_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.o_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.o_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.q_a_layernorm.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.q_a_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.q_a_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.q_a_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.q_b_proj.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.q_b_proj.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.q_b_proj.weight": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.unembed_out.biases": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.unembed_out.scales": "model-00051-of-00091.safetensors", + "model.layers.45.self_attn.unembed_out.weight": "model-00051-of-00091.safetensors", + "model.layers.46.input_layernorm.weight": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.gate.e_score_correction_bias": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.gate.weight": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.down_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.down_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.down_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.gate_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.gate_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.gate_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.up_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.up_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.shared_experts.up_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.down_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.down_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.down_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.gate_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.gate_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.gate_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.up_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.up_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.46.mlp.switch_mlp.up_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.46.post_attention_layernorm.weight": "model-00053-of-00091.safetensors", + "model.layers.46.self_attn.embed_q.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.embed_q.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.embed_q.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.k_norm.bias": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.k_norm.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.weights_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.weights_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.weights_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.wk.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.wk.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.wk.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.wq_b.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.wq_b.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.indexer.wq_b.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.kv_a_layernorm.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.kv_a_proj_with_mqa.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.kv_a_proj_with_mqa.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.kv_a_proj_with_mqa.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.o_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.o_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.o_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.q_a_layernorm.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.q_a_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.q_a_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.q_a_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.q_b_proj.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.q_b_proj.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.q_b_proj.weight": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.unembed_out.biases": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.unembed_out.scales": "model-00052-of-00091.safetensors", + "model.layers.46.self_attn.unembed_out.weight": "model-00052-of-00091.safetensors", + "model.layers.47.input_layernorm.weight": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.gate.e_score_correction_bias": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.gate.weight": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.down_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.down_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.down_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.gate_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.gate_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.gate_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.up_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.up_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.shared_experts.up_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.down_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.down_proj.scales": "model-00054-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.down_proj.weight": "model-00054-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.gate_proj.biases": "model-00054-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.gate_proj.scales": "model-00054-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.gate_proj.weight": "model-00054-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.up_proj.biases": "model-00054-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.up_proj.scales": "model-00054-of-00091.safetensors", + "model.layers.47.mlp.switch_mlp.up_proj.weight": "model-00054-of-00091.safetensors", + "model.layers.47.post_attention_layernorm.weight": "model-00055-of-00091.safetensors", + "model.layers.47.self_attn.embed_q.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.embed_q.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.embed_q.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.k_norm.bias": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.k_norm.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.weights_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.weights_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.weights_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.wk.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.wk.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.wk.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.wq_b.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.wq_b.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.indexer.wq_b.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.kv_a_layernorm.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.kv_a_proj_with_mqa.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.kv_a_proj_with_mqa.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.kv_a_proj_with_mqa.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.o_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.o_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.o_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.q_a_layernorm.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.q_a_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.q_a_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.q_a_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.q_b_proj.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.q_b_proj.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.q_b_proj.weight": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.unembed_out.biases": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.unembed_out.scales": "model-00053-of-00091.safetensors", + "model.layers.47.self_attn.unembed_out.weight": "model-00053-of-00091.safetensors", + "model.layers.48.input_layernorm.weight": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.gate.e_score_correction_bias": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.gate.weight": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.down_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.down_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.down_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.gate_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.gate_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.gate_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.up_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.up_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.shared_experts.up_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.down_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.down_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.down_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.gate_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.gate_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.gate_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.up_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.up_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.48.mlp.switch_mlp.up_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.48.post_attention_layernorm.weight": "model-00056-of-00091.safetensors", + "model.layers.48.self_attn.embed_q.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.embed_q.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.embed_q.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.k_norm.bias": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.k_norm.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.weights_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.weights_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.weights_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.wk.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.wk.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.wk.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.wq_b.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.wq_b.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.indexer.wq_b.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.kv_a_layernorm.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.kv_a_proj_with_mqa.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.kv_a_proj_with_mqa.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.kv_a_proj_with_mqa.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.o_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.o_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.o_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.q_a_layernorm.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.q_a_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.q_a_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.q_a_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.q_b_proj.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.q_b_proj.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.q_b_proj.weight": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.unembed_out.biases": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.unembed_out.scales": "model-00055-of-00091.safetensors", + "model.layers.48.self_attn.unembed_out.weight": "model-00055-of-00091.safetensors", + "model.layers.49.input_layernorm.weight": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.gate.e_score_correction_bias": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.gate.weight": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.down_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.down_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.down_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.gate_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.gate_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.gate_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.up_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.up_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.shared_experts.up_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.down_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.down_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.down_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.gate_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.gate_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.gate_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.up_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.up_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.49.mlp.switch_mlp.up_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.49.post_attention_layernorm.weight": "model-00057-of-00091.safetensors", + "model.layers.49.self_attn.embed_q.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.embed_q.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.embed_q.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.k_norm.bias": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.k_norm.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.weights_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.weights_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.weights_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.wk.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.wk.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.wk.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.wq_b.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.wq_b.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.indexer.wq_b.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.kv_a_layernorm.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.kv_a_proj_with_mqa.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.kv_a_proj_with_mqa.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.kv_a_proj_with_mqa.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.o_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.o_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.o_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.q_a_layernorm.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.q_a_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.q_a_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.q_a_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.q_b_proj.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.q_b_proj.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.q_b_proj.weight": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.unembed_out.biases": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.unembed_out.scales": "model-00056-of-00091.safetensors", + "model.layers.49.self_attn.unembed_out.weight": "model-00056-of-00091.safetensors", + "model.layers.5.input_layernorm.weight": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.gate.e_score_correction_bias": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.gate.weight": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.down_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.down_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.down_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.gate_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.gate_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.gate_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.up_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.up_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.shared_experts.up_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.down_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.down_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.down_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.gate_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.gate_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.gate_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.up_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.up_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.5.mlp.switch_mlp.up_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00004-of-00091.safetensors", + "model.layers.5.self_attn.embed_q.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.embed_q.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.embed_q.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.k_norm.bias": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.k_norm.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.weights_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.weights_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.weights_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.wk.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.wk.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.wk.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.wq_b.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.wq_b.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.indexer.wq_b.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.kv_a_layernorm.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.kv_a_proj_with_mqa.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.kv_a_proj_with_mqa.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.kv_a_proj_with_mqa.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.o_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.o_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.q_a_layernorm.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.q_a_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.q_a_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.q_a_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.q_b_proj.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.q_b_proj.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.q_b_proj.weight": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.unembed_out.biases": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.unembed_out.scales": "model-00003-of-00091.safetensors", + "model.layers.5.self_attn.unembed_out.weight": "model-00003-of-00091.safetensors", + "model.layers.50.input_layernorm.weight": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.gate.e_score_correction_bias": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.gate.weight": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.down_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.down_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.down_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.gate_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.gate_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.gate_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.up_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.up_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.shared_experts.up_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.down_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.down_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.down_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.gate_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.gate_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.gate_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.up_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.up_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.50.mlp.switch_mlp.up_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.50.post_attention_layernorm.weight": "model-00058-of-00091.safetensors", + "model.layers.50.self_attn.embed_q.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.embed_q.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.embed_q.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.k_norm.bias": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.k_norm.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.weights_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.weights_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.weights_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.wk.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.wk.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.wk.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.wq_b.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.wq_b.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.indexer.wq_b.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.kv_a_layernorm.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.kv_a_proj_with_mqa.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.kv_a_proj_with_mqa.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.kv_a_proj_with_mqa.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.o_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.o_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.o_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.q_a_layernorm.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.q_a_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.q_a_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.q_a_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.q_b_proj.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.q_b_proj.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.q_b_proj.weight": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.unembed_out.biases": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.unembed_out.scales": "model-00057-of-00091.safetensors", + "model.layers.50.self_attn.unembed_out.weight": "model-00057-of-00091.safetensors", + "model.layers.51.input_layernorm.weight": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.gate.e_score_correction_bias": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.gate.weight": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.down_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.down_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.down_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.gate_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.gate_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.gate_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.up_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.up_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.shared_experts.up_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.down_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.down_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.down_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.gate_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.gate_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.gate_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.up_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.up_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.51.mlp.switch_mlp.up_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.51.post_attention_layernorm.weight": "model-00059-of-00091.safetensors", + "model.layers.51.self_attn.embed_q.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.embed_q.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.embed_q.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.k_norm.bias": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.k_norm.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.weights_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.weights_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.weights_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.wk.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.wk.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.wk.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.wq_b.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.wq_b.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.indexer.wq_b.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.kv_a_layernorm.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.kv_a_proj_with_mqa.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.kv_a_proj_with_mqa.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.kv_a_proj_with_mqa.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.o_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.o_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.o_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.q_a_layernorm.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.q_a_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.q_a_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.q_a_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.q_b_proj.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.q_b_proj.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.q_b_proj.weight": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.unembed_out.biases": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.unembed_out.scales": "model-00058-of-00091.safetensors", + "model.layers.51.self_attn.unembed_out.weight": "model-00058-of-00091.safetensors", + "model.layers.52.input_layernorm.weight": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.gate.e_score_correction_bias": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.gate.weight": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.down_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.down_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.down_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.gate_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.gate_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.gate_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.up_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.up_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.shared_experts.up_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.down_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.down_proj.scales": "model-00060-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.down_proj.weight": "model-00060-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.gate_proj.biases": "model-00060-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.gate_proj.scales": "model-00060-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.gate_proj.weight": "model-00060-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.up_proj.biases": "model-00060-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.up_proj.scales": "model-00060-of-00091.safetensors", + "model.layers.52.mlp.switch_mlp.up_proj.weight": "model-00060-of-00091.safetensors", + "model.layers.52.post_attention_layernorm.weight": "model-00061-of-00091.safetensors", + "model.layers.52.self_attn.embed_q.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.embed_q.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.embed_q.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.k_norm.bias": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.k_norm.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.weights_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.weights_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.weights_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.wk.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.wk.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.wk.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.wq_b.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.wq_b.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.indexer.wq_b.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.kv_a_layernorm.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.kv_a_proj_with_mqa.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.kv_a_proj_with_mqa.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.kv_a_proj_with_mqa.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.o_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.o_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.o_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.q_a_layernorm.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.q_a_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.q_a_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.q_a_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.q_b_proj.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.q_b_proj.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.q_b_proj.weight": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.unembed_out.biases": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.unembed_out.scales": "model-00059-of-00091.safetensors", + "model.layers.52.self_attn.unembed_out.weight": "model-00059-of-00091.safetensors", + "model.layers.53.input_layernorm.weight": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.gate.e_score_correction_bias": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.gate.weight": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.down_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.down_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.down_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.gate_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.gate_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.gate_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.up_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.up_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.shared_experts.up_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.down_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.down_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.down_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.gate_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.gate_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.gate_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.up_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.up_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.53.mlp.switch_mlp.up_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.53.post_attention_layernorm.weight": "model-00062-of-00091.safetensors", + "model.layers.53.self_attn.embed_q.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.embed_q.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.embed_q.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.k_norm.bias": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.k_norm.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.weights_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.weights_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.weights_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.wk.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.wk.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.wk.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.wq_b.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.wq_b.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.indexer.wq_b.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.kv_a_layernorm.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.kv_a_proj_with_mqa.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.kv_a_proj_with_mqa.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.kv_a_proj_with_mqa.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.o_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.o_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.o_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.q_a_layernorm.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.q_a_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.q_a_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.q_a_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.q_b_proj.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.q_b_proj.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.q_b_proj.weight": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.unembed_out.biases": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.unembed_out.scales": "model-00061-of-00091.safetensors", + "model.layers.53.self_attn.unembed_out.weight": "model-00061-of-00091.safetensors", + "model.layers.54.input_layernorm.weight": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.gate.e_score_correction_bias": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.gate.weight": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.down_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.down_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.down_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.gate_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.gate_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.gate_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.up_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.up_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.shared_experts.up_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.down_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.down_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.down_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.gate_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.gate_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.gate_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.up_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.up_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.54.mlp.switch_mlp.up_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.54.post_attention_layernorm.weight": "model-00063-of-00091.safetensors", + "model.layers.54.self_attn.embed_q.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.embed_q.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.embed_q.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.k_norm.bias": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.k_norm.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.weights_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.weights_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.weights_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.wk.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.wk.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.wk.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.wq_b.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.wq_b.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.indexer.wq_b.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.kv_a_layernorm.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.kv_a_proj_with_mqa.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.kv_a_proj_with_mqa.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.kv_a_proj_with_mqa.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.o_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.o_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.o_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.q_a_layernorm.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.q_a_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.q_a_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.q_a_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.q_b_proj.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.q_b_proj.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.q_b_proj.weight": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.unembed_out.biases": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.unembed_out.scales": "model-00062-of-00091.safetensors", + "model.layers.54.self_attn.unembed_out.weight": "model-00062-of-00091.safetensors", + "model.layers.55.input_layernorm.weight": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.gate.e_score_correction_bias": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.gate.weight": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.down_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.down_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.down_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.gate_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.gate_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.gate_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.up_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.up_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.shared_experts.up_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.down_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.down_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.down_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.gate_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.gate_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.gate_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.up_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.up_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.55.mlp.switch_mlp.up_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.55.post_attention_layernorm.weight": "model-00064-of-00091.safetensors", + "model.layers.55.self_attn.embed_q.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.embed_q.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.embed_q.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.k_norm.bias": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.k_norm.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.weights_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.weights_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.weights_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.wk.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.wk.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.wk.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.wq_b.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.wq_b.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.indexer.wq_b.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.kv_a_layernorm.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.kv_a_proj_with_mqa.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.kv_a_proj_with_mqa.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.kv_a_proj_with_mqa.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.o_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.o_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.o_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.q_a_layernorm.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.q_a_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.q_a_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.q_a_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.q_b_proj.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.q_b_proj.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.q_b_proj.weight": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.unembed_out.biases": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.unembed_out.scales": "model-00063-of-00091.safetensors", + "model.layers.55.self_attn.unembed_out.weight": "model-00063-of-00091.safetensors", + "model.layers.56.input_layernorm.weight": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.gate.e_score_correction_bias": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.gate.weight": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.down_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.down_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.down_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.gate_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.gate_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.gate_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.up_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.up_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.shared_experts.up_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.down_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.down_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.down_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.gate_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.gate_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.gate_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.up_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.up_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.56.mlp.switch_mlp.up_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.56.post_attention_layernorm.weight": "model-00065-of-00091.safetensors", + "model.layers.56.self_attn.embed_q.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.embed_q.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.embed_q.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.k_norm.bias": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.k_norm.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.weights_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.weights_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.weights_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.wk.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.wk.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.wk.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.wq_b.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.wq_b.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.indexer.wq_b.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.kv_a_layernorm.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.kv_a_proj_with_mqa.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.kv_a_proj_with_mqa.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.kv_a_proj_with_mqa.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.o_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.o_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.o_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.q_a_layernorm.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.q_a_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.q_a_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.q_a_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.q_b_proj.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.q_b_proj.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.q_b_proj.weight": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.unembed_out.biases": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.unembed_out.scales": "model-00064-of-00091.safetensors", + "model.layers.56.self_attn.unembed_out.weight": "model-00064-of-00091.safetensors", + "model.layers.57.input_layernorm.weight": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.gate.e_score_correction_bias": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.gate.weight": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.down_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.down_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.down_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.gate_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.gate_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.gate_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.up_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.up_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.shared_experts.up_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.down_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.down_proj.scales": "model-00066-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.down_proj.weight": "model-00066-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.gate_proj.biases": "model-00066-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.gate_proj.scales": "model-00066-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.gate_proj.weight": "model-00066-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.up_proj.biases": "model-00066-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.up_proj.scales": "model-00066-of-00091.safetensors", + "model.layers.57.mlp.switch_mlp.up_proj.weight": "model-00066-of-00091.safetensors", + "model.layers.57.post_attention_layernorm.weight": "model-00067-of-00091.safetensors", + "model.layers.57.self_attn.embed_q.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.embed_q.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.embed_q.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.k_norm.bias": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.k_norm.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.weights_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.weights_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.weights_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.wk.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.wk.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.wk.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.wq_b.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.wq_b.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.indexer.wq_b.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.kv_a_layernorm.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.kv_a_proj_with_mqa.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.kv_a_proj_with_mqa.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.kv_a_proj_with_mqa.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.o_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.o_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.o_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.q_a_layernorm.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.q_a_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.q_a_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.q_a_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.q_b_proj.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.q_b_proj.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.q_b_proj.weight": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.unembed_out.biases": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.unembed_out.scales": "model-00065-of-00091.safetensors", + "model.layers.57.self_attn.unembed_out.weight": "model-00065-of-00091.safetensors", + "model.layers.58.input_layernorm.weight": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.gate.e_score_correction_bias": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.gate.weight": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.down_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.down_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.down_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.gate_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.gate_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.gate_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.up_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.up_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.shared_experts.up_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.down_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.down_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.down_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.gate_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.gate_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.gate_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.up_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.up_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.58.mlp.switch_mlp.up_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.58.post_attention_layernorm.weight": "model-00068-of-00091.safetensors", + "model.layers.58.self_attn.embed_q.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.embed_q.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.embed_q.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.k_norm.bias": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.k_norm.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.weights_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.weights_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.weights_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.wk.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.wk.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.wk.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.wq_b.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.wq_b.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.indexer.wq_b.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.kv_a_layernorm.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.kv_a_proj_with_mqa.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.kv_a_proj_with_mqa.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.kv_a_proj_with_mqa.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.o_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.o_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.o_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.q_a_layernorm.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.q_a_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.q_a_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.q_a_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.q_b_proj.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.q_b_proj.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.q_b_proj.weight": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.unembed_out.biases": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.unembed_out.scales": "model-00067-of-00091.safetensors", + "model.layers.58.self_attn.unembed_out.weight": "model-00067-of-00091.safetensors", + "model.layers.59.input_layernorm.weight": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.gate.e_score_correction_bias": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.gate.weight": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.down_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.down_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.down_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.gate_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.gate_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.gate_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.up_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.up_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.shared_experts.up_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.down_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.down_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.down_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.gate_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.gate_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.gate_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.up_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.up_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.59.mlp.switch_mlp.up_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.59.post_attention_layernorm.weight": "model-00069-of-00091.safetensors", + "model.layers.59.self_attn.embed_q.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.embed_q.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.embed_q.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.k_norm.bias": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.k_norm.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.weights_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.weights_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.weights_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.wk.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.wk.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.wk.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.wq_b.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.wq_b.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.indexer.wq_b.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.kv_a_layernorm.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.kv_a_proj_with_mqa.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.kv_a_proj_with_mqa.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.kv_a_proj_with_mqa.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.o_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.o_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.o_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.q_a_layernorm.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.q_a_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.q_a_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.q_a_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.q_b_proj.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.q_b_proj.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.q_b_proj.weight": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.unembed_out.biases": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.unembed_out.scales": "model-00068-of-00091.safetensors", + "model.layers.59.self_attn.unembed_out.weight": "model-00068-of-00091.safetensors", + "model.layers.6.input_layernorm.weight": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.gate.e_score_correction_bias": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.gate.weight": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.down_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.down_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.down_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.gate_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.gate_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.gate_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.up_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.up_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.shared_experts.up_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.down_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.down_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.down_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.gate_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.gate_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.gate_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.up_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.up_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.6.mlp.switch_mlp.up_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00005-of-00091.safetensors", + "model.layers.6.self_attn.embed_q.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.embed_q.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.embed_q.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.k_norm.bias": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.k_norm.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.weights_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.weights_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.weights_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.wk.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.wk.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.wk.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.wq_b.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.wq_b.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.indexer.wq_b.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.kv_a_layernorm.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.kv_a_proj_with_mqa.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.kv_a_proj_with_mqa.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.kv_a_proj_with_mqa.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.o_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.o_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.q_a_layernorm.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.q_a_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.q_a_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.q_a_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.q_b_proj.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.q_b_proj.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.q_b_proj.weight": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.unembed_out.biases": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.unembed_out.scales": "model-00004-of-00091.safetensors", + "model.layers.6.self_attn.unembed_out.weight": "model-00004-of-00091.safetensors", + "model.layers.60.input_layernorm.weight": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.gate.e_score_correction_bias": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.gate.weight": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.down_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.down_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.down_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.gate_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.gate_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.gate_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.up_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.up_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.shared_experts.up_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.down_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.down_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.down_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.gate_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.gate_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.gate_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.up_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.up_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.60.mlp.switch_mlp.up_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.60.post_attention_layernorm.weight": "model-00070-of-00091.safetensors", + "model.layers.60.self_attn.embed_q.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.embed_q.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.embed_q.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.k_norm.bias": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.k_norm.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.weights_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.weights_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.weights_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.wk.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.wk.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.wk.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.wq_b.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.wq_b.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.indexer.wq_b.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.kv_a_layernorm.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.kv_a_proj_with_mqa.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.kv_a_proj_with_mqa.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.kv_a_proj_with_mqa.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.o_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.o_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.o_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.q_a_layernorm.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.q_a_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.q_a_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.q_a_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.q_b_proj.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.q_b_proj.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.q_b_proj.weight": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.unembed_out.biases": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.unembed_out.scales": "model-00069-of-00091.safetensors", + "model.layers.60.self_attn.unembed_out.weight": "model-00069-of-00091.safetensors", + "model.layers.61.input_layernorm.weight": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.gate.e_score_correction_bias": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.gate.weight": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.down_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.down_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.down_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.gate_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.gate_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.gate_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.up_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.up_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.shared_experts.up_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.down_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.down_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.down_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.gate_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.gate_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.gate_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.up_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.up_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.61.mlp.switch_mlp.up_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.61.post_attention_layernorm.weight": "model-00071-of-00091.safetensors", + "model.layers.61.self_attn.embed_q.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.embed_q.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.embed_q.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.k_norm.bias": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.k_norm.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.weights_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.weights_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.weights_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.wk.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.wk.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.wk.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.wq_b.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.wq_b.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.indexer.wq_b.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.kv_a_layernorm.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.kv_a_proj_with_mqa.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.kv_a_proj_with_mqa.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.kv_a_proj_with_mqa.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.o_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.o_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.o_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.q_a_layernorm.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.q_a_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.q_a_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.q_a_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.q_b_proj.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.q_b_proj.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.q_b_proj.weight": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.unembed_out.biases": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.unembed_out.scales": "model-00070-of-00091.safetensors", + "model.layers.61.self_attn.unembed_out.weight": "model-00070-of-00091.safetensors", + "model.layers.62.input_layernorm.weight": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.gate.e_score_correction_bias": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.gate.weight": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.down_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.down_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.down_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.gate_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.gate_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.gate_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.up_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.up_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.shared_experts.up_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.down_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.down_proj.scales": "model-00072-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.down_proj.weight": "model-00072-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.gate_proj.biases": "model-00072-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.gate_proj.scales": "model-00072-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.gate_proj.weight": "model-00072-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.up_proj.biases": "model-00072-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.up_proj.scales": "model-00072-of-00091.safetensors", + "model.layers.62.mlp.switch_mlp.up_proj.weight": "model-00072-of-00091.safetensors", + "model.layers.62.post_attention_layernorm.weight": "model-00073-of-00091.safetensors", + "model.layers.62.self_attn.embed_q.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.embed_q.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.embed_q.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.k_norm.bias": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.k_norm.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.weights_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.weights_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.weights_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.wk.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.wk.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.wk.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.wq_b.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.wq_b.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.indexer.wq_b.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.kv_a_layernorm.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.kv_a_proj_with_mqa.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.kv_a_proj_with_mqa.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.kv_a_proj_with_mqa.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.o_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.o_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.o_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.q_a_layernorm.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.q_a_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.q_a_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.q_a_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.q_b_proj.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.q_b_proj.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.q_b_proj.weight": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.unembed_out.biases": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.unembed_out.scales": "model-00071-of-00091.safetensors", + "model.layers.62.self_attn.unembed_out.weight": "model-00071-of-00091.safetensors", + "model.layers.63.input_layernorm.weight": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.gate.e_score_correction_bias": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.gate.weight": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.down_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.down_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.down_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.gate_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.gate_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.gate_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.up_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.up_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.shared_experts.up_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.down_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.down_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.down_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.gate_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.gate_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.gate_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.up_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.up_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.63.mlp.switch_mlp.up_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.63.post_attention_layernorm.weight": "model-00074-of-00091.safetensors", + "model.layers.63.self_attn.embed_q.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.embed_q.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.embed_q.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.k_norm.bias": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.k_norm.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.weights_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.weights_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.weights_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.wk.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.wk.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.wk.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.wq_b.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.wq_b.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.indexer.wq_b.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.kv_a_layernorm.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.kv_a_proj_with_mqa.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.kv_a_proj_with_mqa.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.kv_a_proj_with_mqa.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.o_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.o_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.o_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.q_a_layernorm.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.q_a_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.q_a_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.q_a_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.q_b_proj.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.q_b_proj.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.q_b_proj.weight": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.unembed_out.biases": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.unembed_out.scales": "model-00073-of-00091.safetensors", + "model.layers.63.self_attn.unembed_out.weight": "model-00073-of-00091.safetensors", + "model.layers.64.input_layernorm.weight": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.gate.e_score_correction_bias": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.gate.weight": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.down_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.down_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.down_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.gate_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.gate_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.gate_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.up_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.up_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.shared_experts.up_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.down_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.down_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.down_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.gate_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.gate_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.gate_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.up_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.up_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.64.mlp.switch_mlp.up_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.64.post_attention_layernorm.weight": "model-00075-of-00091.safetensors", + "model.layers.64.self_attn.embed_q.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.embed_q.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.embed_q.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.k_norm.bias": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.k_norm.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.weights_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.weights_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.weights_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.wk.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.wk.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.wk.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.wq_b.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.wq_b.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.indexer.wq_b.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.kv_a_layernorm.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.kv_a_proj_with_mqa.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.kv_a_proj_with_mqa.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.kv_a_proj_with_mqa.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.o_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.o_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.o_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.q_a_layernorm.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.q_a_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.q_a_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.q_a_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.q_b_proj.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.q_b_proj.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.q_b_proj.weight": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.unembed_out.biases": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.unembed_out.scales": "model-00074-of-00091.safetensors", + "model.layers.64.self_attn.unembed_out.weight": "model-00074-of-00091.safetensors", + "model.layers.65.input_layernorm.weight": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.gate.e_score_correction_bias": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.gate.weight": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.down_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.down_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.down_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.gate_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.gate_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.gate_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.up_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.up_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.shared_experts.up_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.down_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.down_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.down_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.gate_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.gate_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.gate_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.up_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.up_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.65.mlp.switch_mlp.up_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.65.post_attention_layernorm.weight": "model-00076-of-00091.safetensors", + "model.layers.65.self_attn.embed_q.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.embed_q.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.embed_q.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.k_norm.bias": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.k_norm.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.weights_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.weights_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.weights_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.wk.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.wk.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.wk.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.wq_b.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.wq_b.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.indexer.wq_b.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.kv_a_layernorm.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.kv_a_proj_with_mqa.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.kv_a_proj_with_mqa.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.kv_a_proj_with_mqa.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.o_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.o_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.o_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.q_a_layernorm.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.q_a_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.q_a_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.q_a_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.q_b_proj.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.q_b_proj.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.q_b_proj.weight": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.unembed_out.biases": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.unembed_out.scales": "model-00075-of-00091.safetensors", + "model.layers.65.self_attn.unembed_out.weight": "model-00075-of-00091.safetensors", + "model.layers.66.input_layernorm.weight": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.gate.e_score_correction_bias": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.gate.weight": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.down_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.down_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.down_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.gate_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.gate_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.gate_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.up_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.up_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.shared_experts.up_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.down_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.down_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.down_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.gate_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.gate_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.gate_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.up_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.up_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.66.mlp.switch_mlp.up_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.66.post_attention_layernorm.weight": "model-00077-of-00091.safetensors", + "model.layers.66.self_attn.embed_q.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.embed_q.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.embed_q.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.k_norm.bias": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.k_norm.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.weights_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.weights_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.weights_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.wk.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.wk.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.wk.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.wq_b.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.wq_b.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.indexer.wq_b.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.kv_a_layernorm.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.kv_a_proj_with_mqa.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.kv_a_proj_with_mqa.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.kv_a_proj_with_mqa.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.o_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.o_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.o_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.q_a_layernorm.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.q_a_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.q_a_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.q_a_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.q_b_proj.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.q_b_proj.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.q_b_proj.weight": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.unembed_out.biases": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.unembed_out.scales": "model-00076-of-00091.safetensors", + "model.layers.66.self_attn.unembed_out.weight": "model-00076-of-00091.safetensors", + "model.layers.67.input_layernorm.weight": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.gate.e_score_correction_bias": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.gate.weight": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.down_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.down_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.down_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.gate_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.gate_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.gate_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.up_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.up_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.shared_experts.up_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.down_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.down_proj.scales": "model-00078-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.down_proj.weight": "model-00078-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.gate_proj.biases": "model-00078-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.gate_proj.scales": "model-00078-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.gate_proj.weight": "model-00078-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.up_proj.biases": "model-00078-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.up_proj.scales": "model-00078-of-00091.safetensors", + "model.layers.67.mlp.switch_mlp.up_proj.weight": "model-00078-of-00091.safetensors", + "model.layers.67.post_attention_layernorm.weight": "model-00079-of-00091.safetensors", + "model.layers.67.self_attn.embed_q.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.embed_q.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.embed_q.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.k_norm.bias": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.k_norm.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.weights_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.weights_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.weights_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.wk.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.wk.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.wk.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.wq_b.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.wq_b.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.indexer.wq_b.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.kv_a_layernorm.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.kv_a_proj_with_mqa.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.kv_a_proj_with_mqa.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.kv_a_proj_with_mqa.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.o_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.o_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.o_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.q_a_layernorm.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.q_a_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.q_a_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.q_a_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.q_b_proj.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.q_b_proj.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.q_b_proj.weight": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.unembed_out.biases": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.unembed_out.scales": "model-00077-of-00091.safetensors", + "model.layers.67.self_attn.unembed_out.weight": "model-00077-of-00091.safetensors", + "model.layers.68.input_layernorm.weight": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.gate.e_score_correction_bias": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.gate.weight": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.down_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.down_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.down_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.gate_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.gate_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.gate_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.up_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.up_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.shared_experts.up_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.down_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.down_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.down_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.gate_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.gate_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.gate_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.up_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.up_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.68.mlp.switch_mlp.up_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.68.post_attention_layernorm.weight": "model-00080-of-00091.safetensors", + "model.layers.68.self_attn.embed_q.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.embed_q.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.embed_q.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.k_norm.bias": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.k_norm.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.weights_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.weights_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.weights_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.wk.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.wk.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.wk.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.wq_b.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.wq_b.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.indexer.wq_b.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.kv_a_layernorm.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.kv_a_proj_with_mqa.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.kv_a_proj_with_mqa.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.kv_a_proj_with_mqa.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.o_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.o_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.o_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.q_a_layernorm.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.q_a_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.q_a_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.q_a_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.q_b_proj.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.q_b_proj.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.q_b_proj.weight": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.unembed_out.biases": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.unembed_out.scales": "model-00079-of-00091.safetensors", + "model.layers.68.self_attn.unembed_out.weight": "model-00079-of-00091.safetensors", + "model.layers.69.input_layernorm.weight": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.gate.e_score_correction_bias": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.gate.weight": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.down_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.down_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.down_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.gate_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.gate_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.gate_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.up_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.up_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.shared_experts.up_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.down_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.down_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.down_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.gate_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.gate_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.gate_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.up_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.up_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.69.mlp.switch_mlp.up_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.69.post_attention_layernorm.weight": "model-00081-of-00091.safetensors", + "model.layers.69.self_attn.embed_q.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.embed_q.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.embed_q.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.k_norm.bias": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.k_norm.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.weights_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.weights_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.weights_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.wk.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.wk.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.wk.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.wq_b.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.wq_b.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.indexer.wq_b.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.kv_a_layernorm.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.kv_a_proj_with_mqa.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.kv_a_proj_with_mqa.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.kv_a_proj_with_mqa.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.o_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.o_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.o_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.q_a_layernorm.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.q_a_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.q_a_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.q_a_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.q_b_proj.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.q_b_proj.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.q_b_proj.weight": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.unembed_out.biases": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.unembed_out.scales": "model-00080-of-00091.safetensors", + "model.layers.69.self_attn.unembed_out.weight": "model-00080-of-00091.safetensors", + "model.layers.7.input_layernorm.weight": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.gate.e_score_correction_bias": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.gate.weight": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.down_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.down_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.down_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.gate_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.gate_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.gate_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.up_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.up_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.shared_experts.up_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.down_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.down_proj.scales": "model-00006-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.down_proj.weight": "model-00006-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.gate_proj.biases": "model-00006-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.gate_proj.scales": "model-00006-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.gate_proj.weight": "model-00006-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.up_proj.biases": "model-00006-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.up_proj.scales": "model-00006-of-00091.safetensors", + "model.layers.7.mlp.switch_mlp.up_proj.weight": "model-00006-of-00091.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00007-of-00091.safetensors", + "model.layers.7.self_attn.embed_q.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.embed_q.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.embed_q.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.k_norm.bias": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.k_norm.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.weights_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.weights_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.weights_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.wk.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.wk.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.wk.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.wq_b.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.wq_b.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.indexer.wq_b.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.kv_a_layernorm.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.kv_a_proj_with_mqa.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.kv_a_proj_with_mqa.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.kv_a_proj_with_mqa.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.o_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.o_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.q_a_layernorm.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.q_a_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.q_a_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.q_a_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.q_b_proj.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.q_b_proj.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.q_b_proj.weight": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.unembed_out.biases": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.unembed_out.scales": "model-00005-of-00091.safetensors", + "model.layers.7.self_attn.unembed_out.weight": "model-00005-of-00091.safetensors", + "model.layers.70.input_layernorm.weight": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.gate.e_score_correction_bias": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.gate.weight": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.down_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.down_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.down_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.gate_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.gate_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.gate_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.up_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.up_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.shared_experts.up_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.down_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.down_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.down_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.gate_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.gate_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.gate_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.up_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.up_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.70.mlp.switch_mlp.up_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.70.post_attention_layernorm.weight": "model-00082-of-00091.safetensors", + "model.layers.70.self_attn.embed_q.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.embed_q.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.embed_q.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.k_norm.bias": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.k_norm.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.weights_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.weights_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.weights_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.wk.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.wk.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.wk.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.wq_b.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.wq_b.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.indexer.wq_b.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.kv_a_layernorm.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.kv_a_proj_with_mqa.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.kv_a_proj_with_mqa.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.kv_a_proj_with_mqa.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.o_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.o_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.o_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.q_a_layernorm.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.q_a_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.q_a_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.q_a_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.q_b_proj.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.q_b_proj.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.q_b_proj.weight": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.unembed_out.biases": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.unembed_out.scales": "model-00081-of-00091.safetensors", + "model.layers.70.self_attn.unembed_out.weight": "model-00081-of-00091.safetensors", + "model.layers.71.input_layernorm.weight": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.gate.e_score_correction_bias": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.gate.weight": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.down_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.down_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.down_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.gate_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.gate_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.gate_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.up_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.up_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.shared_experts.up_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.down_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.down_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.down_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.gate_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.gate_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.gate_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.up_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.up_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.71.mlp.switch_mlp.up_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.71.post_attention_layernorm.weight": "model-00083-of-00091.safetensors", + "model.layers.71.self_attn.embed_q.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.embed_q.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.embed_q.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.k_norm.bias": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.k_norm.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.weights_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.weights_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.weights_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.wk.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.wk.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.wk.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.wq_b.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.wq_b.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.indexer.wq_b.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.kv_a_layernorm.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.kv_a_proj_with_mqa.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.kv_a_proj_with_mqa.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.kv_a_proj_with_mqa.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.o_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.o_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.o_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.q_a_layernorm.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.q_a_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.q_a_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.q_a_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.q_b_proj.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.q_b_proj.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.q_b_proj.weight": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.unembed_out.biases": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.unembed_out.scales": "model-00082-of-00091.safetensors", + "model.layers.71.self_attn.unembed_out.weight": "model-00082-of-00091.safetensors", + "model.layers.72.input_layernorm.weight": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.gate.e_score_correction_bias": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.gate.weight": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.down_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.down_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.down_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.gate_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.gate_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.gate_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.up_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.up_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.shared_experts.up_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.down_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.down_proj.scales": "model-00084-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.down_proj.weight": "model-00084-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.gate_proj.biases": "model-00084-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.gate_proj.scales": "model-00084-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.gate_proj.weight": "model-00084-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.up_proj.biases": "model-00084-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.up_proj.scales": "model-00084-of-00091.safetensors", + "model.layers.72.mlp.switch_mlp.up_proj.weight": "model-00084-of-00091.safetensors", + "model.layers.72.post_attention_layernorm.weight": "model-00085-of-00091.safetensors", + "model.layers.72.self_attn.embed_q.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.embed_q.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.embed_q.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.k_norm.bias": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.k_norm.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.weights_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.weights_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.weights_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.wk.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.wk.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.wk.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.wq_b.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.wq_b.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.indexer.wq_b.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.kv_a_layernorm.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.kv_a_proj_with_mqa.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.kv_a_proj_with_mqa.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.kv_a_proj_with_mqa.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.o_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.o_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.o_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.q_a_layernorm.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.q_a_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.q_a_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.q_a_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.q_b_proj.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.q_b_proj.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.q_b_proj.weight": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.unembed_out.biases": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.unembed_out.scales": "model-00083-of-00091.safetensors", + "model.layers.72.self_attn.unembed_out.weight": "model-00083-of-00091.safetensors", + "model.layers.73.input_layernorm.weight": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.gate.e_score_correction_bias": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.gate.weight": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.down_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.down_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.down_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.gate_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.gate_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.gate_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.up_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.up_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.shared_experts.up_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.down_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.down_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.down_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.gate_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.gate_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.gate_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.up_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.up_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.73.mlp.switch_mlp.up_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.73.post_attention_layernorm.weight": "model-00086-of-00091.safetensors", + "model.layers.73.self_attn.embed_q.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.embed_q.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.embed_q.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.k_norm.bias": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.k_norm.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.weights_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.weights_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.weights_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.wk.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.wk.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.wk.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.wq_b.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.wq_b.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.indexer.wq_b.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.kv_a_layernorm.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.kv_a_proj_with_mqa.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.kv_a_proj_with_mqa.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.kv_a_proj_with_mqa.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.o_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.o_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.o_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.q_a_layernorm.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.q_a_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.q_a_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.q_a_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.q_b_proj.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.q_b_proj.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.q_b_proj.weight": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.unembed_out.biases": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.unembed_out.scales": "model-00085-of-00091.safetensors", + "model.layers.73.self_attn.unembed_out.weight": "model-00085-of-00091.safetensors", + "model.layers.74.input_layernorm.weight": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.gate.e_score_correction_bias": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.gate.weight": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.down_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.down_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.down_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.gate_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.gate_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.gate_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.up_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.up_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.shared_experts.up_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.down_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.down_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.down_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.gate_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.gate_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.gate_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.up_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.up_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.74.mlp.switch_mlp.up_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.74.post_attention_layernorm.weight": "model-00087-of-00091.safetensors", + "model.layers.74.self_attn.embed_q.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.embed_q.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.embed_q.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.k_norm.bias": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.k_norm.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.weights_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.weights_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.weights_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.wk.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.wk.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.wk.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.wq_b.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.wq_b.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.indexer.wq_b.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.kv_a_layernorm.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.kv_a_proj_with_mqa.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.kv_a_proj_with_mqa.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.kv_a_proj_with_mqa.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.o_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.o_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.o_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.q_a_layernorm.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.q_a_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.q_a_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.q_a_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.q_b_proj.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.q_b_proj.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.q_b_proj.weight": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.unembed_out.biases": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.unembed_out.scales": "model-00086-of-00091.safetensors", + "model.layers.74.self_attn.unembed_out.weight": "model-00086-of-00091.safetensors", + "model.layers.75.input_layernorm.weight": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.gate.e_score_correction_bias": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.gate.weight": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.down_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.down_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.down_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.gate_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.gate_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.gate_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.up_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.up_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.shared_experts.up_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.down_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.down_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.down_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.gate_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.gate_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.gate_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.up_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.up_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.75.mlp.switch_mlp.up_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.75.post_attention_layernorm.weight": "model-00088-of-00091.safetensors", + "model.layers.75.self_attn.embed_q.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.embed_q.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.embed_q.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.k_norm.bias": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.k_norm.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.weights_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.weights_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.weights_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.wk.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.wk.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.wk.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.wq_b.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.wq_b.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.indexer.wq_b.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.kv_a_layernorm.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.kv_a_proj_with_mqa.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.kv_a_proj_with_mqa.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.kv_a_proj_with_mqa.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.o_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.o_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.o_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.q_a_layernorm.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.q_a_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.q_a_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.q_a_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.q_b_proj.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.q_b_proj.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.q_b_proj.weight": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.unembed_out.biases": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.unembed_out.scales": "model-00087-of-00091.safetensors", + "model.layers.75.self_attn.unembed_out.weight": "model-00087-of-00091.safetensors", + "model.layers.76.input_layernorm.weight": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.gate.e_score_correction_bias": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.gate.weight": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.down_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.down_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.down_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.gate_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.gate_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.gate_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.up_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.up_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.shared_experts.up_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.down_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.down_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.down_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.gate_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.gate_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.gate_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.up_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.up_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.76.mlp.switch_mlp.up_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.76.post_attention_layernorm.weight": "model-00089-of-00091.safetensors", + "model.layers.76.self_attn.embed_q.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.embed_q.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.embed_q.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.k_norm.bias": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.k_norm.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.weights_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.weights_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.weights_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.wk.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.wk.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.wk.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.wq_b.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.wq_b.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.indexer.wq_b.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.kv_a_layernorm.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.kv_a_proj_with_mqa.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.kv_a_proj_with_mqa.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.kv_a_proj_with_mqa.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.o_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.o_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.o_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.q_a_layernorm.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.q_a_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.q_a_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.q_a_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.q_b_proj.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.q_b_proj.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.q_b_proj.weight": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.unembed_out.biases": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.unembed_out.scales": "model-00088-of-00091.safetensors", + "model.layers.76.self_attn.unembed_out.weight": "model-00088-of-00091.safetensors", + "model.layers.77.input_layernorm.weight": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.gate.e_score_correction_bias": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.gate.weight": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.down_proj.biases": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.down_proj.scales": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.down_proj.weight": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.gate_proj.biases": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.gate_proj.scales": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.gate_proj.weight": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.up_proj.biases": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.up_proj.scales": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.shared_experts.up_proj.weight": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.down_proj.biases": "model-00091-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.down_proj.scales": "model-00090-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.down_proj.weight": "model-00090-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.gate_proj.biases": "model-00090-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.gate_proj.scales": "model-00090-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.gate_proj.weight": "model-00090-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.up_proj.biases": "model-00090-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.up_proj.scales": "model-00090-of-00091.safetensors", + "model.layers.77.mlp.switch_mlp.up_proj.weight": "model-00090-of-00091.safetensors", + "model.layers.77.post_attention_layernorm.weight": "model-00091-of-00091.safetensors", + "model.layers.77.self_attn.embed_q.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.embed_q.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.embed_q.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.k_norm.bias": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.k_norm.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.weights_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.weights_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.weights_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.wk.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.wk.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.wk.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.wq_b.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.wq_b.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.indexer.wq_b.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.kv_a_layernorm.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.kv_a_proj_with_mqa.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.kv_a_proj_with_mqa.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.kv_a_proj_with_mqa.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.o_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.o_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.o_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.q_a_layernorm.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.q_a_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.q_a_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.q_a_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.q_b_proj.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.q_b_proj.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.q_b_proj.weight": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.unembed_out.biases": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.unembed_out.scales": "model-00089-of-00091.safetensors", + "model.layers.77.self_attn.unembed_out.weight": "model-00089-of-00091.safetensors", + "model.layers.8.input_layernorm.weight": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.gate.e_score_correction_bias": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.gate.weight": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.down_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.down_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.down_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.gate_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.gate_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.gate_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.up_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.up_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.shared_experts.up_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.down_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.down_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.down_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.gate_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.gate_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.gate_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.up_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.up_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.8.mlp.switch_mlp.up_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00008-of-00091.safetensors", + "model.layers.8.self_attn.embed_q.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.embed_q.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.embed_q.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.k_norm.bias": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.k_norm.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.weights_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.weights_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.weights_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.wk.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.wk.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.wk.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.wq_b.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.wq_b.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.indexer.wq_b.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.kv_a_layernorm.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.kv_a_proj_with_mqa.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.kv_a_proj_with_mqa.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.kv_a_proj_with_mqa.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.o_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.o_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.q_a_layernorm.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.q_a_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.q_a_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.q_a_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.q_b_proj.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.q_b_proj.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.q_b_proj.weight": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.unembed_out.biases": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.unembed_out.scales": "model-00007-of-00091.safetensors", + "model.layers.8.self_attn.unembed_out.weight": "model-00007-of-00091.safetensors", + "model.layers.9.input_layernorm.weight": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.gate.e_score_correction_bias": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.gate.weight": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.down_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.down_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.down_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.gate_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.gate_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.gate_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.up_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.up_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.shared_experts.up_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.down_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.down_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.down_proj.weight": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.gate_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.gate_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.gate_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.up_proj.biases": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.up_proj.scales": "model-00009-of-00091.safetensors", + "model.layers.9.mlp.switch_mlp.up_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00009-of-00091.safetensors", + "model.layers.9.self_attn.embed_q.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.embed_q.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.embed_q.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.k_norm.bias": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.k_norm.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.weights_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.weights_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.weights_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.wk.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.wk.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.wk.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.wq_b.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.wq_b.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.indexer.wq_b.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.kv_a_layernorm.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.kv_a_proj_with_mqa.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.kv_a_proj_with_mqa.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.kv_a_proj_with_mqa.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.o_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.o_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.q_a_layernorm.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.q_a_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.q_a_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.q_a_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.q_b_proj.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.q_b_proj.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.q_b_proj.weight": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.unembed_out.biases": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.unembed_out.scales": "model-00008-of-00091.safetensors", + "model.layers.9.self_attn.unembed_out.weight": "model-00008-of-00091.safetensors", + "model.norm.weight": "model-00091-of-00091.safetensors" + } +} \ No newline at end of file diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..aba40197a4cdb5607f4ab7a05fb0a4ee8054fd6d --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19e773648cb4e65de8660ea6365e10acca112d42a854923df93db4a6f333a82d +size 20217442 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6aa53776c9f7ac98333a470b78a5b732d5343d15 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,34 @@ +{ + "backend": "tokenizers", + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "<|endoftext|>", + "extra_special_tokens": [ + "<|endoftext|>", + "[MASK]", + "[gMASK]", + "[sMASK]", + "", + "", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + "<|begin_of_image|>", + "<|end_of_image|>", + "<|begin_of_video|>", + "<|end_of_video|>", + "<|begin_of_audio|>", + "<|end_of_audio|>", + "<|begin_of_transcription|>", + "<|end_of_transcription|>" + ], + "is_local": true, + "model_max_length": 202752, + "model_specific_special_tokens": {}, + "pad_token": "<|endoftext|>", + "padding_side": "left", + "remove_space": false, + "tokenizer_class": "TokenizersBackend", + "tool_parser_type": "glm47" +}