diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..409a715bd27354ec2a345d2e4dc2ccfc307839f0
Binary files /dev/null and b/.DS_Store differ
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b59aba0b8844f4adb4ead13ae902acaffe6fcb93
--- /dev/null
+++ b/README.md
@@ -0,0 +1,17 @@
+---
+license: mit
+library_name: mlx
+pipeline_tag: text-generation
+tags:
+- transformers
+- mlx
+base_model: meituan-longcat/LongCat-Flash-Chat
+---
+
+This model [finding1/LongCat-Flash-Chat-MLX-5.5bpw](https://huggingface.co/finding1/LongCat-Flash-Chat-MLX-5.5bpw) was
+converted to MLX format from [meituan-longcat/LongCat-Flash-Chat](https://huggingface.co/meituan-longcat/LongCat-Flash-Chat)
+using mlx-lm version **0.27.1** by running
+`mlx_lm.convert --quantize --q-bits 5 --mlx-path MLX-8.5bpw --hf-path meituan-longcat/LongCat-Flash-Chat`.
+The command initially crashed with a `KeyError`; the workaround was
+[adding `"model_type": "longcat_flash",` to the downloaded `config.json`](https://github.com/ml-explore/mlx-lm/issues/433#issuecomment-3262138687)
+and then running the command again.
diff --git a/chat_template.jinja b/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..cbca5a5732d77b19acaa3db53e54367069ab78e5
--- /dev/null
+++ b/chat_template.jinja
@@ -0,0 +1,138 @@
+{%- set tool_choice = tool_choice | default('auto') %}
+{%- set ns = namespace(rounds = 0, tool_types = [], last_query_index = -1) %}
+
+{%- if tools and tool_choice != 'none' %}
+ {{- "# Tools
+" }}
+ {{- "You have access to the following tools:
+
+" }}
+ {%- for tool in tools %}
+ {%- if tool.type in ['code_interpreter', 'function'] %}
+ {%- if tool.type not in ns.tool_types %}
+ {%- set ns.tool_types = ns.tool_types + [tool.type] %}
+ {{- "## Tool namespace: " ~ tool.type ~ "
+
+" }}
+ {%- endif %}
+ {%- if tool.type == 'code_interpreter' %}
+ {%- set tool = {"type":"code_interpreter","function":{"name":"code_interpreter_preview","description":"The code will be executed in a stateful Jupyter notebook sandbox environment, only supports local computation, data processing, and file operations.
+Code sandbox environment (network isolated) Any external network requests or online API calls are prohibited.
+If online functionality is needed, please use other permitted tools.
+Code will respond with the output of the execution or time out after 60.0 seconds. ","parameters":{"type":"object","properties":{"language":{"type":"string","description":"The programming language of the code to be executed. Available values: python (Default), java, go, js, ts, c, c++."},"code":{"type":"string","description":"Python code to be executed must not include the following:
+- Importing network libraries such as requests, httplib, etc.
+- Any form of HTTP requests.
+- External API calls.
+- Network port operations. Example: ```python
+import pandas as pd
+pd.DataFrame({'A':[1,2]})
+```"},"timeout":{"type":"number","description":"The maximum execution time of the code, in seconds. Default is 60.0."}},"required":["code"]}}} %}
+ {%- endif %}
+ {{- "### Tool name: " + tool.function.name + "
+
+" }}
+ {{- "Description: " + tool.function.description + "
+
+" }}
+ {{- "InputSchema:
+" + tool.function.parameters | tojson(indent=2) + "
+
+" }}
+ {%- endif %}
+ {%- endfor %}
+ {{- '**Note**: For each function call, return a json object with function name and arguments within XML tags as follows:
+
+{"name": , "arguments": }
+
+' }}
+ {{- 'When multiple functions need to be called simultaneously, each function call should be wrapped in its own tag and placed consecutively. For example:
+
+{"name": , "arguments": }
+
+{"name": , "arguments": }
+
+
+' }}
+ {{- "# Messages
+" }}
+
+ {%- for idx in range(messages|length - 1) %}
+ {%- set msg = messages[idx] %}
+ {%- if msg.role == 'assistant' and not msg.tool_calls %}
+ {%- set ns.last_query_index = idx %}
+ {%- endif %}
+ {%- endfor%}
+{%- endif %}
+
+{%- for msg in messages %}
+ {%- if msg.role == "system" %}
+ {{- "SYSTEM:" + msg.content }}
+ {%- elif msg.role == "user" %}
+ {%- if loop.first %}
+ {{- "[Round " ~ (ns.rounds) ~ "] USER:" }}
+ {%- else %}
+ {{- " [Round " ~ (ns.rounds) ~ "] USER:"}}
+ {%- endif %}
+ {%- set ns.rounds = ns.rounds + 1 %}
+ {%- if msg["files"] %}
+ {{- '
+' ~ msg.files | tojson(indent=2) ~ '
+' }}
+ {%- endif %}
+ {{- msg.content }}
+ {%- elif msg.role == "assistant" %}
+ {{- " ASSISTANT:" }}
+ {%- if enable_thinking == true and msg.reasoning_content and ns.tool_types != [] and loop.index0 > ns.last_query_index %}
+ {{- "
+
+" ~ msg.reasoning_content ~ "
+
+" }}
+ {%- endif %}
+ {%- if msg.content%}
+ {{- msg.content }}
+ {%- endif %}
+ {%- if msg.tool_calls %}
+ {%- for tool_call in msg.tool_calls -%}
+ {{- "
+" -}}
+ {%- if tool_call.function.arguments is string -%}
+ {"name": "{{ tool_call.function.name}}", "arguments": {{tool_call.function.arguments}}}
+ {%- else -%}
+ {"name": "{{ tool_call.function.name}}", "arguments": {{tool_call.function.arguments | tojson}}}
+ {%- endif -%}
+ {{- "
+" }}
+ {%- endfor %}
+ {%- endif %}
+ {{- "" -}}
+ {%- elif msg.role == "tool" %}
+ {{- " TOOL:" -}}
+ {%- if msg.name -%}
+ {"name": {{msg.name | tojson}}, "content": {{msg.content | tojson}}}
+ {%- else -%}
+ {"content": {{msg.content | tojson}}}
+ {%- endif -%}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {%- if enable_thinking == true %}
+ {{- " /think_on" }}
+ {%- if thinking_budget %}
+ {%- if thinking_budget < 1024 %}
+ {%- set thinking_budget = 1024 %}
+ {%- endif%}
+ {{- "
+thinking_budget: < " ~ thinking_budget ~ "."}}
+ {%- endif %}
+ {{- " ASSISTANT:
+"}}
+ {%- elif enable_thinking == false %}
+ {{- " /think_off ASSISTANT:
+
+
+" }}
+ {%- else %}
+ {{- " ASSISTANT:" }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ac223893839685c1e3b02d280fd8e16ae894edb5
--- /dev/null
+++ b/config.json
@@ -0,0 +1,272 @@
+{
+ "architectures": [
+ "LongcatFlashForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "attention_method": "MLA",
+ "auto_map": {
+ "AutoConfig": "configuration_longcat_flash.LongcatFlashConfig",
+ "AutoModel": "modeling_longcat_flash.LongcatFlashModel",
+ "AutoModelForCausalLM": "modeling_longcat_flash.LongcatFlashForCausalLM"
+ },
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "expert_ffn_hidden_size": 2048,
+ "ffn_hidden_size": 12288,
+ "hidden_size": 6144,
+ "kv_lora_rank": 512,
+ "max_position_embeddings": 131072,
+ "mla_scale_kv_lora": true,
+ "mla_scale_q_lora": true,
+ "model_type": "longcat_flash",
+ "moe_topk": 12,
+ "n_routed_experts": 512,
+ "num_attention_heads": 64,
+ "num_layers": 28,
+ "q_lora_rank": 1536,
+ "qk_nope_head_dim": 128,
+ "qk_rope_head_dim": 64,
+ "quantization": {
+ "group_size": 64,
+ "bits": 5,
+ "mode": "affine",
+ "model.layers.0.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.1.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.2.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.3.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.4.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.5.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.6.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.7.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.8.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.9.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.10.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.11.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.12.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.13.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.14.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.15.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.16.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.17.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.18.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.19.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.20.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.21.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.22.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.23.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.24.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.25.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.26.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.27.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ }
+ },
+ "quantization_config": {
+ "group_size": 64,
+ "bits": 5,
+ "mode": "affine",
+ "model.layers.0.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.1.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.2.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.3.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.4.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.5.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.6.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.7.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.8.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.9.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.10.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.11.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.12.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.13.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.14.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.15.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.16.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.17.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.18.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.19.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.20.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.21.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.22.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.23.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.24.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.25.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.26.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ },
+ "model.layers.27.mlp.router.classifier": {
+ "group_size": 64,
+ "bits": 8
+ }
+ },
+ "rms_norm_eps": 1e-05,
+ "rope_theta": 10000000.0,
+ "routed_scaling_factor": 6.0,
+ "use_cache": true,
+ "v_head_dim": 128,
+ "vocab_size": 131072,
+ "zero_expert_num": 256,
+ "zero_expert_type": "identity"
+}
\ No newline at end of file
diff --git a/configuration_longcat_flash.py b/configuration_longcat_flash.py
new file mode 100644
index 0000000000000000000000000000000000000000..e70b239db05a4f4d35edd503b56086043f51b221
--- /dev/null
+++ b/configuration_longcat_flash.py
@@ -0,0 +1,216 @@
+
+"""LongcatFlash model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+
+
+LONGCAT_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class LongcatFlashConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`LongcatFlashModel`]. It is used to instantiate an LongcatFlash
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the LongcatFlash.
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 131072):
+ Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
+ `inputs_ids` passed when calling [`LongcatFlashModel`]
+ hidden_size (`int`, *optional*, defaults to 7168):
+ Dimension of the hidden representations.
+ ffn_hidden_size (`int`, *optional*, defaults to 18432):
+ Dimension of the MLP representations.
+ expert_ffn_hidden_size (`int`, *optional*, defaults to 2048):
+ Dimension of the MoE representations.
+ num_layers (`int`, *optional*, defaults to 61):
+ Number of hidden layers in the Transformer decoder.
+ num_attention_heads (`int`, *optional*, defaults to 128):
+ Number of attention heads for each attention layer in the Transformer decoder.
+ num_key_value_heads (`int`, *optional*, defaults to 128):
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+ `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+ by meanpooling all the original heads within that group. For more details checkout [this
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+ `num_attention_heads`.
+ n_routed_experts (`int`, *optional*, defaults to 256):
+ Number of routed experts.
+        routed_scaling_factor (`float`, *optional*, defaults to 1):
+            Scaling factor for routed experts.
+ kv_lora_rank (`int`, *optional*, defaults to 512):
+ Rank of the LoRA matrices for key and value projections.
+ q_lora_rank (`int`, *optional*, defaults to 1536):
+ Rank of the LoRA matrices for query projections.
+ qk_rope_head_dim (`int`, *optional*, defaults to 64):
+ Dimension of the query/key heads that use rotary position embeddings.
+ v_head_dim (`int`, *optional*, defaults to 128):
+ Dimension of the value heads.
+ qk_nope_head_dim (`int`, *optional*, defaults to 128):
+ Dimension of the query/key heads that don't use rotary position embeddings.
+        norm_topk_prob (`bool`, *optional*, defaults to `False`):
+ Whether to normalize the weights of the routed experts.
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+ The non-linear activation function (function or string) in the decoder.
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
+ The maximum sequence length that this model might ever be used with.
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+ The epsilon used by the rms normalization layers.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
+ relevant if `config.is_decoder=True`.
+ pad_token_id (`int`, *optional*):
+ Padding token id.
+ bos_token_id (`int`, *optional*, defaults to 0):
+ Beginning of stream token id.
+ eos_token_id (`int`, *optional*, defaults to 1):
+ End of stream token id.
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+ Whether to tie weight embeddings
+ rope_theta (`float`, *optional*, defaults to 10000.0):
+ The base period of the RoPE embeddings.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ attention_method (`str`, *optional*, defaults to `"MLA"`):
+ The attention method to use.
+ initializer_range (`float`, *optional*, defaults to 0.006):
+ The initializer range for the model.
+ router_bias (`bool`, *optional*, defaults to `False`):
+ Whether to use a bias in the router.
+ zero_expert_num (`int`, *optional*, defaults to `None`):
+ The number of zero experts to use.
+ zero_expert_type (`str`, *optional*, defaults to `None`):
+ The type of zero expert to use.
+
+ ```python
+ >>> from transformers import LongcatFlashModel, LongcatFlashConfig
+
+ >>> # Initializing a LongcatFlash style configuration
+ >>> configuration = LongcatFlashConfig()
+
+    >>> model = LongcatFlashModel(configuration)
+    >>> configuration = model.config
+ ```"""
+
+ model_type = "longcat_flash"
+ keys_to_ignore_at_inference = ["past_key_values"]
+ base_model_tp_plan = {
+ "layers.*.self_attn.k_proj": "colwise",
+ "layers.*.self_attn.v_proj": "colwise",
+ "layers.*.self_attn.o_proj": "rowwise",
+ "layers.*.mlp.experts.*.gate_proj": "local_colwise",
+ "layers.*.mlp.experts.*.up_proj": "local_colwise",
+ "layers.*.mlp.experts.*.down_proj": "local_rowwise",
+ "layers.*.mlps.*.gate_proj": "local_colwise",
+ "layers.*.mlps.*.up_proj": "local_colwise",
+ "layers.*.mlps.*.down_proj": "local_rowwise",
+ }
+ base_model_pp_plan = {
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+ "norm": (["hidden_states"], ["hidden_states"]),
+ }
+
+ def __init__(
+ self,
+ vocab_size=131072,
+ hidden_size=7168,
+ ffn_hidden_size=18432,
+ expert_ffn_hidden_size=2048,
+ num_layers=61,
+ num_attention_heads=128,
+ num_key_value_heads=None,
+ n_routed_experts=256,
+ routed_scaling_factor=1,
+ kv_lora_rank=512,
+ q_lora_rank=1536,
+ qk_rope_head_dim=64,
+ v_head_dim=128,
+ qk_nope_head_dim=128,
+ mla_scale_q_lora=True,
+ mla_scale_kv_lora=True,
+ moe_topk=8,
+ norm_topk_prob=False,
+ hidden_act="silu",
+ max_position_embeddings=4096,
+ rms_norm_eps=1e-6,
+ use_cache=True,
+ pad_token_id=None,
+ bos_token_id=0,
+ eos_token_id=1,
+ tie_word_embeddings=False,
+ rope_theta=10000.0,
+ attention_bias=False,
+ attention_dropout=0.0,
+ attention_method='MLA',
+ initializer_range=0.006,
+ router_bias=False,
+ zero_expert_num=None,
+ zero_expert_type=None,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.ffn_hidden_size = ffn_hidden_size
+ self.expert_ffn_hidden_size = expert_ffn_hidden_size
+ self.num_layers = num_layers
+ self.num_attention_heads = num_attention_heads
+ self.n_routed_experts = n_routed_experts
+ self.routed_scaling_factor = routed_scaling_factor
+ self.kv_lora_rank = kv_lora_rank
+ self.q_lora_rank = q_lora_rank
+ self.qk_rope_head_dim = qk_rope_head_dim
+ self.v_head_dim = v_head_dim
+ self.qk_nope_head_dim = qk_nope_head_dim
+ self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+ self.moe_topk = moe_topk
+ self.norm_topk_prob = norm_topk_prob
+ self.mla_scale_q_lora = mla_scale_q_lora
+ self.mla_scale_kv_lora = mla_scale_kv_lora
+ self.attention_method = attention_method
+ self.initializer_range = initializer_range
+ self.router_bias = router_bias
+ self.zero_expert_num = zero_expert_num
+ self.zero_expert_type = zero_expert_type
+
+ if self.attention_method == "MLA":
+ self.head_dim = qk_rope_head_dim
+ else:
+            raise ValueError('attention_method should be one of ["MLA"]')
+
+
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+
+ self.num_key_value_heads = num_key_value_heads
+ self.hidden_act = hidden_act
+ self.rms_norm_eps = rms_norm_eps
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.attention_bias = attention_bias
+ self.attention_dropout = attention_dropout
+
+ rope_config_validation(self)
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+ @property
+ def num_hidden_layers(self):
+ return self.num_layers
+
+
+__all__ = ["LongcatFlashConfig"]
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..778f600d04e3f1a148383c1a99c7948be9249bdd
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,7 @@
+{
+ "_from_model_config": true,
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "pad_token_id": 3,
+ "transformers_version": "4.55.0"
+}
\ No newline at end of file
diff --git a/model-00001-of-00085.safetensors b/model-00001-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e671b9285e28451a443ddd4de4f2cbe46cf4b736
--- /dev/null
+++ b/model-00001-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:686da8791145ba82cb3e60158eda0f690cb88d7fcc2dd571d5e614baf17e1bc0
+size 4982833873
diff --git a/model-00002-of-00085.safetensors b/model-00002-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4100cd7a55d9ebee415d154402ea3fa6071e7358
--- /dev/null
+++ b/model-00002-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:972b4be672b881f5172fa546dfbf36dd748228888e146774bfd5e4f89213f633
+size 4429185427
diff --git a/model-00003-of-00085.safetensors b/model-00003-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..474d82c410bbaf2d973e82ab360c08428c359fb7
--- /dev/null
+++ b/model-00003-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4671be71cb1afbddaa40abe8eb34ed78f80816fc2620445f8939eff017f48cfe
+size 4870523118
diff --git a/model-00004-of-00085.safetensors b/model-00004-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a78e43bb5968bb7bf383b64cda12352b90836a00
--- /dev/null
+++ b/model-00004-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ed2285e92b5bda5c005dfa4a5cc8f65d34867852989e3048604c9b43996d78e
+size 4429185433
diff --git a/model-00005-of-00085.safetensors b/model-00005-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ebd08ae40715db1352d34d07b53379d217413464
--- /dev/null
+++ b/model-00005-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:add36338563d089251d50f1debf0880d0b702d9451e0ead759158a01614efe61
+size 4429185427
diff --git a/model-00006-of-00085.safetensors b/model-00006-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..50547f2284b2a65a90e97b50a682e9012aa1b936
--- /dev/null
+++ b/model-00006-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5eb02edd015ead840e64255daf096887116cb73b711f3cc1ba29d2f39ffaa559
+size 4870523134
diff --git a/model-00007-of-00085.safetensors b/model-00007-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8091d5aaef0f158114952e21708e6b322803b484
--- /dev/null
+++ b/model-00007-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fa6aaa8e7613d8860a9cf4025fe5bb70493236bad32fe3c23b75872ec9a5f5a
+size 4429185433
diff --git a/model-00008-of-00085.safetensors b/model-00008-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..838aeb1013648810c72e689714c72b3d7de32bae
--- /dev/null
+++ b/model-00008-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3e63529f96436a06245bb7850ec63fb6104fd6049767df9228a7dc399866bb8a
+size 4429185427
diff --git a/model-00009-of-00085.safetensors b/model-00009-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..51493931527c5294e6f19d3af353c5b86ca298d7
--- /dev/null
+++ b/model-00009-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2afb85377f3d76804626b0afa56f4c4910c05a617ab609f78f45c10632bb9219
+size 4870523102
diff --git a/model-00010-of-00085.safetensors b/model-00010-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ae03846bcd856f57487c7ecf097b83d679437e95
--- /dev/null
+++ b/model-00010-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56408f987c0bab3108f71660c550e2135895c67860e0e9ed012f323f71c8f979
+size 4429185433
diff --git a/model-00011-of-00085.safetensors b/model-00011-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d0179b516b1b7303ee7ec640ff526b84003d61b2
--- /dev/null
+++ b/model-00011-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d7aaf63fab521c746fa00623592c8d6ee2f3f2e27b9b85be11d2f462a8ea979
+size 4429185427
diff --git a/model-00012-of-00085.safetensors b/model-00012-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c9e748c3b656217cd76078103f58d37f5249c1d2
--- /dev/null
+++ b/model-00012-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3d8f063f7926575f539fe41fffa51d78bee8383d0e326da13f0262b3627782f
+size 4870523134
diff --git a/model-00013-of-00085.safetensors b/model-00013-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..57a32b386b12ab617fe76d113c50dd646341502c
--- /dev/null
+++ b/model-00013-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16514ae49100746e4a2d6d282c323d2068a74d74bfa3e061744f7068f212f2d7
+size 4429185433
diff --git a/model-00014-of-00085.safetensors b/model-00014-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d621714f526094718e7c13e76e4decb2f498bdd2
--- /dev/null
+++ b/model-00014-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f1dbe5dd3e816ad853ef42209e70c93446ebcc8df433e5e3512d48ef4c1b442
+size 4429185427
diff --git a/model-00015-of-00085.safetensors b/model-00015-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..01d023fa2c0b9df83e18fb194c31341369ab0fd4
--- /dev/null
+++ b/model-00015-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5def97c2a8dfd370c3fd4ee8810be7d1cc23b13917915c208b548e4c0da44d9a
+size 4870523124
diff --git a/model-00016-of-00085.safetensors b/model-00016-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a5d37a3bdc4ee974e369f54814856124a300749b
--- /dev/null
+++ b/model-00016-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a78e5b9fa0addaa036a181108f924a2ff2d1c21e3d3736c0575fc4ee92ae8611
+size 4429185433
diff --git a/model-00017-of-00085.safetensors b/model-00017-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a17bda674c24f768919c494a518ba3a8e2a7c25a
--- /dev/null
+++ b/model-00017-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c8d72dec11b4883213088d13742dd4951f726a4adb92138492d9122be1bb085
+size 4429185427
diff --git a/model-00018-of-00085.safetensors b/model-00018-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..20ff0e4f7fdadb43849eb39dc28e3747a482b7d7
--- /dev/null
+++ b/model-00018-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4319a2152a8adb68f3423b4a472554069ca50eeda40cd1bc97e4055598899741
+size 4870523122
diff --git a/model-00019-of-00085.safetensors b/model-00019-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6c3f5b67b3b9ab10d5a62882ae22011d5eeb9457
--- /dev/null
+++ b/model-00019-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28382f1e42a534dbc208f243374211973293b58d4275e9b6f0bae0c6003dfea3
+size 4429185433
diff --git a/model-00020-of-00085.safetensors b/model-00020-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3cfc791efc020ec02faa66ebe67936e8584138a2
--- /dev/null
+++ b/model-00020-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca5c87022a8e4a6b76f66d18525c7ca120691e3f232e57bd27197fbb7c895626
+size 4429185427
diff --git a/model-00021-of-00085.safetensors b/model-00021-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9b3fbfdbb9de252d6776607f8cd4f53abf1a8f17
--- /dev/null
+++ b/model-00021-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f8fcbcbdf91a193309655c664b9b8133ee6bc0a7c95daa0b35ff9ef3e71c630
+size 4870523160
diff --git a/model-00022-of-00085.safetensors b/model-00022-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a8c92fb69ff99f7e16b3ceb3af6b41554fe59a88
--- /dev/null
+++ b/model-00022-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42078c61b5d2171b80c371a3c0121b2f4289185225081a8de20644a9593bc86a
+size 4429185433
diff --git a/model-00023-of-00085.safetensors b/model-00023-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0a18ea8360ece7fe3d236e179cfc9e720117cf6b
--- /dev/null
+++ b/model-00023-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:213d1db8a10013b778880d572da09d84f999304d5ce7d7eb6e4ca7e978ab3fc6
+size 4429185427
diff --git a/model-00024-of-00085.safetensors b/model-00024-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2be30bfdfe30b402038aebc73cf2c65be6ef9a6f
--- /dev/null
+++ b/model-00024-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b65ea40b028ef8dab8e7e6fba7a638c4c2ba2c6bcd93286d505dfcd96d58aeda
+size 4870523130
diff --git a/model-00025-of-00085.safetensors b/model-00025-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6afa2f97cdb92b7ef0618439e45e33dbacd4d680
--- /dev/null
+++ b/model-00025-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc1e71f00377512009fc340f9d9c0410eef34625cc4f0fb1d8d39af995c65301
+size 4429185433
diff --git a/model-00026-of-00085.safetensors b/model-00026-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9de89cd52ad190bf185cc1155ff181ab6430ac55
--- /dev/null
+++ b/model-00026-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b7e1f46560f71bdb3472120dd69f577bbcba3c457e44005c2293c1e7605a5ad
+size 4429185427
diff --git a/model-00027-of-00085.safetensors b/model-00027-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..833258191e008ba482f1921d83a19530f1ac74f3
--- /dev/null
+++ b/model-00027-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e90b71a80bf35ad2e55d3e1b3441a2a20751b3613bae67554356592cc20acd8
+size 4870523106
diff --git a/model-00028-of-00085.safetensors b/model-00028-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..cdcc6c5b4903884312aa6657e60ef7e1cfe4056d
--- /dev/null
+++ b/model-00028-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ce1381073be1b372b231572527eb4f2fe294b9b97b482797d3a566eb5f76f54
+size 4429185433
diff --git a/model-00029-of-00085.safetensors b/model-00029-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ef7685b608871e3c78003aa474a5d817f7c0cbc1
--- /dev/null
+++ b/model-00029-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50deb062d16565873202cfb8a0d6a1f819ef8b61d402e16e5d4133acc8e93034
+size 4429185427
diff --git a/model-00030-of-00085.safetensors b/model-00030-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..85422bafd207693b94d565dbdc335f174a2ec35b
--- /dev/null
+++ b/model-00030-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce970ed66c8b85af1e1eeba1c1f2145e453b237901f2ab8626aaa241c0148a3f
+size 4870523128
diff --git a/model-00031-of-00085.safetensors b/model-00031-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3b83b6402635edd8e196863c53e070bd8058aa3f
--- /dev/null
+++ b/model-00031-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0451132a1783b096076dc8ba51b3e06d18fd9b217842de3e5c5647c11ff0d93b
+size 4429185436
diff --git a/model-00032-of-00085.safetensors b/model-00032-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d1b171d08870900d271424d06d1a3118bdc1e271
--- /dev/null
+++ b/model-00032-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa31062615ef88d7532f3a7342c818946aa26a8c567158c34601dce00dee8c87
+size 4429185430
diff --git a/model-00033-of-00085.safetensors b/model-00033-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b1545eebc6c6b2015bb753c342cfb85e870adbe6
--- /dev/null
+++ b/model-00033-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33b97362bd0cde8d3208bdd2e622d26eb0deb3ffce4582d3d1bac7b88cbdbb06
+size 4870523265
diff --git a/model-00034-of-00085.safetensors b/model-00034-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..87c65ef98d0165fa62abf8ab6aeb47487719b111
--- /dev/null
+++ b/model-00034-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ed91549e70382475f6c1038b6a08e492d6a8c0cae2d64a8aa94af3843390fd3
+size 4429185436
diff --git a/model-00035-of-00085.safetensors b/model-00035-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a84d30da18fefd6eb31265433e8be3c000fbd391
--- /dev/null
+++ b/model-00035-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51b21a96df69482c02a42d0a9e632c7cc80d65484d468df8ba37b702c21f434d
+size 4429185430
diff --git a/model-00036-of-00085.safetensors b/model-00036-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..27fde8ad01f5e6c248e3dcb5c2658130377590c9
--- /dev/null
+++ b/model-00036-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db282c7c4d5cf4bd5f76bbd45a462118f7977f380cb941a79f63be6d35c6676c
+size 4870523213
diff --git a/model-00037-of-00085.safetensors b/model-00037-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..62a28bd1cebc9d182d1b3f190f637b495b951096
--- /dev/null
+++ b/model-00037-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e068b3108d11849ab41883beac8b9a5e069648f127975c91809e26da4d3f0bdc
+size 4429185436
diff --git a/model-00038-of-00085.safetensors b/model-00038-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..19d7efd953a8069d4f8d29631e315065651bcbcc
--- /dev/null
+++ b/model-00038-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a5cf2b9f2642ea63a3ad2531850628f35e98506ee9e6bee489ccf204d6936ce
+size 4429185430
diff --git a/model-00039-of-00085.safetensors b/model-00039-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e5dabdb06558e09e6c42a85ca56c7d2e583c9810
--- /dev/null
+++ b/model-00039-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c906905d4337f30f7a6b6d8c82e33369045369a184cda01db2197fdfc9d4347
+size 4870523163
diff --git a/model-00040-of-00085.safetensors b/model-00040-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9efcc9f69eb5844959e1a803219194120dd9fd3f
--- /dev/null
+++ b/model-00040-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6616f0a433580f12e85f6c24023077b515a6a076b6498060a6073a4e30d7c7f9
+size 4429185436
diff --git a/model-00041-of-00085.safetensors b/model-00041-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..56b40852233519beb485f11e7761e70f0c3964bf
--- /dev/null
+++ b/model-00041-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fdbf5620fb091176a7e0c05d76ca82740a5ceb6b11186a7d0773d0c93dd6a9df
+size 4429185430
diff --git a/model-00042-of-00085.safetensors b/model-00042-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..66fa7bf63a1b79cc2c32994f5dbb67a829790c79
--- /dev/null
+++ b/model-00042-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:566ff1861cfad6832b702774d381965b3c70d4cdc5554a077085ce10454ede9e
+size 4870523155
diff --git a/model-00043-of-00085.safetensors b/model-00043-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1b3934a1265622427f92b385fdf18fe2a0047470
--- /dev/null
+++ b/model-00043-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e723c92d28145254f3e964be6a60f25fa8eb497e9a02531252c757b3f6fb0a1a
+size 4429185436
diff --git a/model-00044-of-00085.safetensors b/model-00044-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0eabc3cd9d4d2d169d499643f3b6ddb0538dcd05
--- /dev/null
+++ b/model-00044-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58fcf48f43ed595d16c0cb932f1c129bed18e2a8bbe3ba9b7e6f72313db8852e
+size 4429185430
diff --git a/model-00045-of-00085.safetensors b/model-00045-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5f247bd4d456763b4a73ebacfa2d4234f6ac92cf
--- /dev/null
+++ b/model-00045-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ca7665fce40079e4d5550d1bb855f9a0c2d93fba2e726e107a03a55b058bfca
+size 4870523163
diff --git a/model-00046-of-00085.safetensors b/model-00046-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..98a459eef2a98094ba7f03b84a7d5810f0f9775f
--- /dev/null
+++ b/model-00046-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe0a58b4e5d8e7448ff3e85c4245aa3b328afe71fe5d37b1d7f714370f868385
+size 4429185436
diff --git a/model-00047-of-00085.safetensors b/model-00047-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5adb40ec5fd91dd82fca5b9845d486ee9bc66b03
--- /dev/null
+++ b/model-00047-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6806125b23d639dcc0fd36f8033b9af99d6b9ef2038d89ec91c091fd45afdd75
+size 4429185430
diff --git a/model-00048-of-00085.safetensors b/model-00048-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1745ab7b1f3928f14d0b53fb368865a597153a01
--- /dev/null
+++ b/model-00048-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8db3ec94eb0c5f08d0939e61455aae0cd46cfa6ae765646b9fa0aaa73fc2ab45
+size 4870523181
diff --git a/model-00049-of-00085.safetensors b/model-00049-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..75a2fc64bb3479a24ba795d9a07a35fc234a9a93
--- /dev/null
+++ b/model-00049-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:64bd93934d6bf3ca156123c0ce9fe31ca14dda8cac8665ca383b0008fdc2c7b4
+size 4429185436
diff --git a/model-00050-of-00085.safetensors b/model-00050-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..fbf430d20148ba9d2b3f0e19a071262dfc4a7139
--- /dev/null
+++ b/model-00050-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43e6493408b0659f2c76472f4fb187dc4259e0b3fb2b589455605235c6473bd6
+size 4429185430
diff --git a/model-00051-of-00085.safetensors b/model-00051-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..cd401ca49c2f1289ea711ffd76ee694659f9df8a
--- /dev/null
+++ b/model-00051-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e0bb19c015822a3899a590f40b1f5b15ac3fec7ee89e1fe1870fe7e30bf6bfd
+size 4870523179
diff --git a/model-00052-of-00085.safetensors b/model-00052-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c78c729bc151bc4014f657eac5f2dadb9536e3c3
--- /dev/null
+++ b/model-00052-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46d5043b623b6046ce9e5d1de89ab9376163813b01a6d71dfa53b668369e8372
+size 4429185436
diff --git a/model-00053-of-00085.safetensors b/model-00053-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6ff38b9da722aae437c75fe0800efab5957cdf87
--- /dev/null
+++ b/model-00053-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f4a8f84d1769277042f8b51086398a69ca53ec4da6474248019710a823509591
+size 4429185430
diff --git a/model-00054-of-00085.safetensors b/model-00054-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0f2b9b1a7354f926cdadcb203d3ecefc3fbd6cf9
--- /dev/null
+++ b/model-00054-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c6a30f5d006adf16e15a89fb2271909cd2959a6ba39458df8621479d665f22a
+size 4870523175
diff --git a/model-00055-of-00085.safetensors b/model-00055-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..38dd6099181186229d87deacd04b54b4363abf30
--- /dev/null
+++ b/model-00055-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e07b52372711cffae5b22307a31f6ead7bdb4055d5b40836c7e391357a63e00
+size 4429185436
diff --git a/model-00056-of-00085.safetensors b/model-00056-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..18518048fdd3f8655dee268fb0a66d8d0a58b563
--- /dev/null
+++ b/model-00056-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8224c08fd7f81a8a0be4349d70b7885f584872b35dfd7335a38b8531e804c4c8
+size 4429185430
diff --git a/model-00057-of-00085.safetensors b/model-00057-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3dbdd0455ccc9dcdeccc146d3e86bfdac16dd568
--- /dev/null
+++ b/model-00057-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d1303cf0f29001f6bd51c7aaa63a54ceec626087e5bfe98d2984e301e18eb51
+size 4870523231
diff --git a/model-00058-of-00085.safetensors b/model-00058-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..61dcaa7b1b3125b310c86ebdf6e6af21ffcfe0db
--- /dev/null
+++ b/model-00058-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:144ca4efe6149b18f4aa9140a1c5f66d2295ca0bb19f5abe7417d42809a7ca76
+size 4429185436
diff --git a/model-00059-of-00085.safetensors b/model-00059-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a5ee87f8bfadd7c7eb2a0edfbd20d536bc43bf1e
--- /dev/null
+++ b/model-00059-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d86dda7d3b489e99c1089a01becffb27e1477b0fceaf45862704e4bb0da72c0
+size 4429185430
diff --git a/model-00060-of-00085.safetensors b/model-00060-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d952451e718cf4cadc7b49345a578cb41b1fef6b
--- /dev/null
+++ b/model-00060-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:922c56577b8c6704d77bbbdeb5f70023fe614fa3087e019d20aabee1b8f38ee8
+size 4870523165
diff --git a/model-00061-of-00085.safetensors b/model-00061-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..96d5c00f62e1c7969804d352a5f038841937a492
--- /dev/null
+++ b/model-00061-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59ea192c523d8423b94198725164000528bab48e95680ada48268370a78e55df
+size 4429185436
diff --git a/model-00062-of-00085.safetensors b/model-00062-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0a242c923a0237e25d2db498d87be6f894880572
--- /dev/null
+++ b/model-00062-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec7525dbea6ad5147c2b7d6ca806f4b0fc6a60413d87b5e1c8330fe1cfb0cf2e
+size 4429185430
diff --git a/model-00063-of-00085.safetensors b/model-00063-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..115b29271b2ca45b47ad03bad24e3f586f7feb57
--- /dev/null
+++ b/model-00063-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b347a81833436574fefd6d2ed56e357955112a472048978221310011cb55591b
+size 4870523169
diff --git a/model-00064-of-00085.safetensors b/model-00064-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..edc417e38a7af95e036496437b68dbc7e33d887b
--- /dev/null
+++ b/model-00064-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1dcb5673a6bed23390453aa0467c1818471794b331ea9d2e1824d1f586c9c65
+size 4429185436
diff --git a/model-00065-of-00085.safetensors b/model-00065-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..31d7f03fdb838b90b97157714f965561e7ceb2fb
--- /dev/null
+++ b/model-00065-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:38218cc3597fc1e3697df3ab783f025d953394c6a591d0f69b9a4279b64439c6
+size 4429185430
diff --git a/model-00066-of-00085.safetensors b/model-00066-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6607b014751cee51768a3082a6ee870af5680493
--- /dev/null
+++ b/model-00066-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1bd9c9c6fca2fbecd2c02b82b638cd171618fbe8de7b2553769cb01db5d2ffb5
+size 4870523171
diff --git a/model-00067-of-00085.safetensors b/model-00067-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..53ab25a63b74ecc0f97ba96dd1c5b8cd13587e83
--- /dev/null
+++ b/model-00067-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15f0011adf5075d827c82cd28f7c72a1cb6819d56050d35d40624990d076dd8c
+size 4429185436
diff --git a/model-00068-of-00085.safetensors b/model-00068-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ceb9317d1d6db1ec6c6fea90b409eaada14fbfbb
--- /dev/null
+++ b/model-00068-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f23455851554df078eb1429a7a626317a0ca5712ee2c78c343a855995e25a2a
+size 4429185430
diff --git a/model-00069-of-00085.safetensors b/model-00069-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0253f2deb352b3bc0ec1a82a6b522d8bd7f7b363
--- /dev/null
+++ b/model-00069-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b772902f1d3ec42739af6e9fefd0ce9412eecbb149a0cee1ca4c8ab0365a5ddc
+size 4870523199
diff --git a/model-00070-of-00085.safetensors b/model-00070-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..74b3308f6947d5e93791e5b2a52fe63587546c50
--- /dev/null
+++ b/model-00070-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ed02d3bae19a32d70a829d1c8e1114c57e2e12ee5d8ed2d1577a56617a40efd
+size 4429185436
diff --git a/model-00071-of-00085.safetensors b/model-00071-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b9a615d39e923dd80c4b75b0e1801d91a2c2763f
--- /dev/null
+++ b/model-00071-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4520b10b0d14ab318cd3bfba280aa73369ac5db4cb4320db7efd2300bbfc843
+size 4429185430
diff --git a/model-00072-of-00085.safetensors b/model-00072-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a738ec299ba1165868394fcc81cf17425cb60157
--- /dev/null
+++ b/model-00072-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a476cf46bf2b21a63a9a3cd4b439ccd21c819255e280c4c1dea013c25f753246
+size 4870523227
diff --git a/model-00073-of-00085.safetensors b/model-00073-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..5980a2f853370af7bd42d0ddd2bf203027dc24ec
--- /dev/null
+++ b/model-00073-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7a05539d1723600556f47cbbc06208383dae41f0f5c85291749ded2617514c1
+size 4429185436
diff --git a/model-00074-of-00085.safetensors b/model-00074-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9fa535977d3961238acbe28eb5dee2b625bcac39
--- /dev/null
+++ b/model-00074-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55f1fcf579697fcb8e1e360045db6ba8754a60bb2e1c33a5ed843832d46ddc7a
+size 4429185430
diff --git a/model-00075-of-00085.safetensors b/model-00075-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..96bcdf0ccb1ce7ba243e3b72ee231d9c00f13961
--- /dev/null
+++ b/model-00075-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6e3ec091cbd351e2df37cb97a3558aadfeda193261fcbdd4fc871473a573396
+size 4870523191
diff --git a/model-00076-of-00085.safetensors b/model-00076-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..58ec013800d3483db7c9ccc293460c0ec239d280
--- /dev/null
+++ b/model-00076-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8dc26cc15dd234058f468e827c1bba2400fe8a60b3110e1d75b1d9c558f19fe6
+size 4429185436
diff --git a/model-00077-of-00085.safetensors b/model-00077-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..811c7128451a71f9b5b4d408a74360c94f8779e0
--- /dev/null
+++ b/model-00077-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef735683e8a9bec5605dbe917ca56a3d47657eb9dc14d2a3cd0d76eab1cabab3
+size 4429185430
diff --git a/model-00078-of-00085.safetensors b/model-00078-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..9f1f47746b545b04c540a57f3128617a705d2d6d
--- /dev/null
+++ b/model-00078-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78c219186ae0adbb49722882c15768186e84f75d774d133cb5c95ba5b126935d
+size 4870523169
diff --git a/model-00079-of-00085.safetensors b/model-00079-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..97a0fd6c99019a2af7b0ffdb41386bb52c8976e8
--- /dev/null
+++ b/model-00079-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:689675c912272b7d99ced9c5fb41bdf8d3bdc30938d54cf18434b5a346509bb7
+size 4429185436
diff --git a/model-00080-of-00085.safetensors b/model-00080-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..adcdbd51e837d9c719aa21046963e2611608e4f5
--- /dev/null
+++ b/model-00080-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ceaf5e732db0e9e7a3da5a5a7577ed4a1b822f70966cb9adfcd37fd5b9cb6b7
+size 4429185430
diff --git a/model-00081-of-00085.safetensors b/model-00081-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b7ffbd4d5fddfc498d10a1e2f2ad2bd7b91d64a5
--- /dev/null
+++ b/model-00081-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6985ad1d4058f85131328dd9805949a84f6a79f0d89a67779fe3a866ebb21230
+size 4870523245
diff --git a/model-00082-of-00085.safetensors b/model-00082-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..439796b72dcb022dd92818ddbc3ab0d743345a44
--- /dev/null
+++ b/model-00082-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29c729130da4215dd451ab664aafbf1ec8a4cf7d64d3f1d461a576e8cd481f58
+size 4429185436
diff --git a/model-00083-of-00085.safetensors b/model-00083-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8689246cfc2bf8bdd07b9b8543675ba2d0519b18
--- /dev/null
+++ b/model-00083-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54b3f6ec10d0a738e15aed2f2fd89abf9b8e28775ee85fc953ee28b5f361e3e2
+size 4429185430
diff --git a/model-00084-of-00085.safetensors b/model-00084-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..314602f73d0f7aab6ecad69b186408568f7ccabd
--- /dev/null
+++ b/model-00084-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9822da53a92a9ebfd60fb0337485b177bfebecf7fc95b4cd905a0907c6b207bf
+size 4870535540
diff --git a/model-00085-of-00085.safetensors b/model-00085-of-00085.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e16dda599cb42330a4b1d7388cd434eeb0908ce0
--- /dev/null
+++ b/model-00085-of-00085.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22e7eb52fbb8ef4874cbcbbb5cbb962d0211467566b1c1218d3b66bb5bbf166f
+size 553648430
diff --git a/model.safetensors.index.json b/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..af4a4b7a0725e35eae04de02a8317e8c1ec590b3
--- /dev/null
+++ b/model.safetensors.index.json
@@ -0,0 +1,1947 @@
+{
+ "metadata": {
+ "total_size": 385516109824,
+ "total_parameters": 560664958976
+ },
+ "weight_map": {
+ "lm_head.biases": "model-00085-of-00085.safetensors",
+ "lm_head.scales": "model-00085-of-00085.safetensors",
+ "lm_head.weight": "model-00085-of-00085.safetensors",
+ "model.embed_tokens.biases": "model-00001-of-00085.safetensors",
+ "model.embed_tokens.scales": "model-00001-of-00085.safetensors",
+ "model.embed_tokens.weight": "model-00001-of-00085.safetensors",
+ "model.layers.0.input_layernorm.0.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.input_layernorm.1.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlp.router.classifier.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlp.router.classifier.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlp.router.classifier.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlp.router.e_score_correction_bias": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlp.switch_mlp.down_proj.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlp.switch_mlp.down_proj.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlp.switch_mlp.down_proj.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlp.switch_mlp.gate_proj.biases": "model-00001-of-00085.safetensors",
+ "model.layers.0.mlp.switch_mlp.gate_proj.scales": "model-00001-of-00085.safetensors",
+ "model.layers.0.mlp.switch_mlp.gate_proj.weight": "model-00001-of-00085.safetensors",
+ "model.layers.0.mlp.switch_mlp.up_proj.biases": "model-00002-of-00085.safetensors",
+ "model.layers.0.mlp.switch_mlp.up_proj.scales": "model-00002-of-00085.safetensors",
+ "model.layers.0.mlp.switch_mlp.up_proj.weight": "model-00002-of-00085.safetensors",
+ "model.layers.0.mlps.0.down_proj.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlps.0.down_proj.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlps.0.down_proj.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlps.0.gate_proj.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlps.0.gate_proj.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlps.0.gate_proj.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlps.0.up_proj.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlps.0.up_proj.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlps.0.up_proj.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlps.1.down_proj.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlps.1.down_proj.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlps.1.down_proj.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlps.1.gate_proj.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlps.1.gate_proj.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlps.1.gate_proj.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlps.1.up_proj.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlps.1.up_proj.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.mlps.1.up_proj.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.post_attention_layernorm.0.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.post_attention_layernorm.1.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.0.kv_a_layernorm.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.0.kv_a_proj_with_mqa.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.0.kv_a_proj_with_mqa.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.0.kv_a_proj_with_mqa.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.0.kv_b_proj.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.0.kv_b_proj.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.0.kv_b_proj.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.0.o_proj.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.0.o_proj.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.0.o_proj.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.0.q_a_layernorm.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.0.q_a_proj.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.0.q_a_proj.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.0.q_a_proj.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.0.q_b_proj.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.0.q_b_proj.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.0.q_b_proj.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.1.kv_a_layernorm.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.1.kv_a_proj_with_mqa.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.1.kv_a_proj_with_mqa.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.1.kv_a_proj_with_mqa.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.1.kv_b_proj.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.1.kv_b_proj.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.1.kv_b_proj.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.1.o_proj.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.1.o_proj.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.1.o_proj.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.1.q_a_layernorm.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.1.q_a_proj.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.1.q_a_proj.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.1.q_a_proj.weight": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.1.q_b_proj.biases": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.1.q_b_proj.scales": "model-00003-of-00085.safetensors",
+ "model.layers.0.self_attn.1.q_b_proj.weight": "model-00003-of-00085.safetensors",
+ "model.layers.1.input_layernorm.0.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.input_layernorm.1.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlp.router.classifier.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlp.router.classifier.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlp.router.classifier.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlp.router.e_score_correction_bias": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlp.switch_mlp.down_proj.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlp.switch_mlp.down_proj.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlp.switch_mlp.down_proj.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlp.switch_mlp.gate_proj.biases": "model-00004-of-00085.safetensors",
+ "model.layers.1.mlp.switch_mlp.gate_proj.scales": "model-00004-of-00085.safetensors",
+ "model.layers.1.mlp.switch_mlp.gate_proj.weight": "model-00004-of-00085.safetensors",
+ "model.layers.1.mlp.switch_mlp.up_proj.biases": "model-00005-of-00085.safetensors",
+ "model.layers.1.mlp.switch_mlp.up_proj.scales": "model-00005-of-00085.safetensors",
+ "model.layers.1.mlp.switch_mlp.up_proj.weight": "model-00005-of-00085.safetensors",
+ "model.layers.1.mlps.0.down_proj.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlps.0.down_proj.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlps.0.down_proj.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlps.0.gate_proj.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlps.0.gate_proj.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlps.0.gate_proj.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlps.0.up_proj.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlps.0.up_proj.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlps.0.up_proj.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlps.1.down_proj.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlps.1.down_proj.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlps.1.down_proj.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlps.1.gate_proj.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlps.1.gate_proj.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlps.1.gate_proj.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlps.1.up_proj.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlps.1.up_proj.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.mlps.1.up_proj.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.post_attention_layernorm.0.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.post_attention_layernorm.1.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.0.kv_a_layernorm.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.0.kv_a_proj_with_mqa.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.0.kv_a_proj_with_mqa.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.0.kv_a_proj_with_mqa.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.0.kv_b_proj.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.0.kv_b_proj.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.0.kv_b_proj.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.0.o_proj.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.0.o_proj.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.0.o_proj.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.0.q_a_layernorm.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.0.q_a_proj.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.0.q_a_proj.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.0.q_a_proj.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.0.q_b_proj.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.0.q_b_proj.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.0.q_b_proj.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.1.kv_a_layernorm.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.1.kv_a_proj_with_mqa.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.1.kv_a_proj_with_mqa.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.1.kv_a_proj_with_mqa.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.1.kv_b_proj.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.1.kv_b_proj.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.1.kv_b_proj.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.1.o_proj.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.1.o_proj.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.1.o_proj.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.1.q_a_layernorm.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.1.q_a_proj.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.1.q_a_proj.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.1.q_a_proj.weight": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.1.q_b_proj.biases": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.1.q_b_proj.scales": "model-00006-of-00085.safetensors",
+ "model.layers.1.self_attn.1.q_b_proj.weight": "model-00006-of-00085.safetensors",
+ "model.layers.10.input_layernorm.0.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.input_layernorm.1.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlp.router.classifier.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlp.router.classifier.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlp.router.classifier.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlp.router.e_score_correction_bias": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlp.switch_mlp.down_proj.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlp.switch_mlp.down_proj.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlp.switch_mlp.down_proj.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlp.switch_mlp.gate_proj.biases": "model-00031-of-00085.safetensors",
+ "model.layers.10.mlp.switch_mlp.gate_proj.scales": "model-00031-of-00085.safetensors",
+ "model.layers.10.mlp.switch_mlp.gate_proj.weight": "model-00031-of-00085.safetensors",
+ "model.layers.10.mlp.switch_mlp.up_proj.biases": "model-00032-of-00085.safetensors",
+ "model.layers.10.mlp.switch_mlp.up_proj.scales": "model-00032-of-00085.safetensors",
+ "model.layers.10.mlp.switch_mlp.up_proj.weight": "model-00032-of-00085.safetensors",
+ "model.layers.10.mlps.0.down_proj.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlps.0.down_proj.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlps.0.down_proj.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlps.0.gate_proj.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlps.0.gate_proj.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlps.0.gate_proj.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlps.0.up_proj.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlps.0.up_proj.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlps.0.up_proj.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlps.1.down_proj.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlps.1.down_proj.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlps.1.down_proj.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlps.1.gate_proj.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlps.1.gate_proj.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlps.1.gate_proj.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlps.1.up_proj.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlps.1.up_proj.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.mlps.1.up_proj.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.post_attention_layernorm.0.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.post_attention_layernorm.1.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.0.kv_a_layernorm.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.0.kv_a_proj_with_mqa.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.0.kv_a_proj_with_mqa.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.0.kv_a_proj_with_mqa.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.0.kv_b_proj.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.0.kv_b_proj.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.0.kv_b_proj.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.0.o_proj.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.0.o_proj.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.0.o_proj.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.0.q_a_layernorm.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.0.q_a_proj.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.0.q_a_proj.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.0.q_a_proj.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.0.q_b_proj.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.0.q_b_proj.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.0.q_b_proj.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.1.kv_a_layernorm.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.1.kv_a_proj_with_mqa.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.1.kv_a_proj_with_mqa.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.1.kv_a_proj_with_mqa.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.1.kv_b_proj.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.1.kv_b_proj.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.1.kv_b_proj.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.1.o_proj.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.1.o_proj.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.1.o_proj.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.1.q_a_layernorm.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.1.q_a_proj.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.1.q_a_proj.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.1.q_a_proj.weight": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.1.q_b_proj.biases": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.1.q_b_proj.scales": "model-00033-of-00085.safetensors",
+ "model.layers.10.self_attn.1.q_b_proj.weight": "model-00033-of-00085.safetensors",
+ "model.layers.11.input_layernorm.0.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.input_layernorm.1.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlp.router.classifier.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlp.router.classifier.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlp.router.classifier.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlp.router.e_score_correction_bias": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlp.switch_mlp.down_proj.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlp.switch_mlp.down_proj.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlp.switch_mlp.down_proj.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlp.switch_mlp.gate_proj.biases": "model-00034-of-00085.safetensors",
+ "model.layers.11.mlp.switch_mlp.gate_proj.scales": "model-00034-of-00085.safetensors",
+ "model.layers.11.mlp.switch_mlp.gate_proj.weight": "model-00034-of-00085.safetensors",
+ "model.layers.11.mlp.switch_mlp.up_proj.biases": "model-00035-of-00085.safetensors",
+ "model.layers.11.mlp.switch_mlp.up_proj.scales": "model-00035-of-00085.safetensors",
+ "model.layers.11.mlp.switch_mlp.up_proj.weight": "model-00035-of-00085.safetensors",
+ "model.layers.11.mlps.0.down_proj.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlps.0.down_proj.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlps.0.down_proj.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlps.0.gate_proj.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlps.0.gate_proj.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlps.0.gate_proj.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlps.0.up_proj.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlps.0.up_proj.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlps.0.up_proj.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlps.1.down_proj.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlps.1.down_proj.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlps.1.down_proj.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlps.1.gate_proj.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlps.1.gate_proj.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlps.1.gate_proj.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlps.1.up_proj.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlps.1.up_proj.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.mlps.1.up_proj.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.post_attention_layernorm.0.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.post_attention_layernorm.1.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.0.kv_a_layernorm.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.0.kv_a_proj_with_mqa.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.0.kv_a_proj_with_mqa.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.0.kv_a_proj_with_mqa.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.0.kv_b_proj.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.0.kv_b_proj.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.0.kv_b_proj.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.0.o_proj.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.0.o_proj.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.0.o_proj.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.0.q_a_layernorm.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.0.q_a_proj.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.0.q_a_proj.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.0.q_a_proj.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.0.q_b_proj.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.0.q_b_proj.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.0.q_b_proj.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.1.kv_a_layernorm.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.1.kv_a_proj_with_mqa.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.1.kv_a_proj_with_mqa.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.1.kv_a_proj_with_mqa.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.1.kv_b_proj.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.1.kv_b_proj.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.1.kv_b_proj.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.1.o_proj.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.1.o_proj.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.1.o_proj.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.1.q_a_layernorm.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.1.q_a_proj.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.1.q_a_proj.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.1.q_a_proj.weight": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.1.q_b_proj.biases": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.1.q_b_proj.scales": "model-00036-of-00085.safetensors",
+ "model.layers.11.self_attn.1.q_b_proj.weight": "model-00036-of-00085.safetensors",
+ "model.layers.12.input_layernorm.0.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.input_layernorm.1.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlp.router.classifier.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlp.router.classifier.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlp.router.classifier.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlp.router.e_score_correction_bias": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlp.switch_mlp.down_proj.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlp.switch_mlp.down_proj.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlp.switch_mlp.down_proj.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlp.switch_mlp.gate_proj.biases": "model-00037-of-00085.safetensors",
+ "model.layers.12.mlp.switch_mlp.gate_proj.scales": "model-00037-of-00085.safetensors",
+ "model.layers.12.mlp.switch_mlp.gate_proj.weight": "model-00037-of-00085.safetensors",
+ "model.layers.12.mlp.switch_mlp.up_proj.biases": "model-00038-of-00085.safetensors",
+ "model.layers.12.mlp.switch_mlp.up_proj.scales": "model-00038-of-00085.safetensors",
+ "model.layers.12.mlp.switch_mlp.up_proj.weight": "model-00038-of-00085.safetensors",
+ "model.layers.12.mlps.0.down_proj.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlps.0.down_proj.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlps.0.down_proj.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlps.0.gate_proj.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlps.0.gate_proj.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlps.0.gate_proj.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlps.0.up_proj.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlps.0.up_proj.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlps.0.up_proj.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlps.1.down_proj.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlps.1.down_proj.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlps.1.down_proj.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlps.1.gate_proj.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlps.1.gate_proj.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlps.1.gate_proj.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlps.1.up_proj.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlps.1.up_proj.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.mlps.1.up_proj.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.post_attention_layernorm.0.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.post_attention_layernorm.1.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.0.kv_a_layernorm.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.0.kv_a_proj_with_mqa.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.0.kv_a_proj_with_mqa.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.0.kv_a_proj_with_mqa.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.0.kv_b_proj.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.0.kv_b_proj.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.0.kv_b_proj.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.0.o_proj.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.0.o_proj.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.0.o_proj.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.0.q_a_layernorm.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.0.q_a_proj.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.0.q_a_proj.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.0.q_a_proj.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.0.q_b_proj.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.0.q_b_proj.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.0.q_b_proj.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.1.kv_a_layernorm.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.1.kv_a_proj_with_mqa.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.1.kv_a_proj_with_mqa.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.1.kv_a_proj_with_mqa.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.1.kv_b_proj.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.1.kv_b_proj.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.1.kv_b_proj.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.1.o_proj.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.1.o_proj.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.1.o_proj.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.1.q_a_layernorm.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.1.q_a_proj.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.1.q_a_proj.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.1.q_a_proj.weight": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.1.q_b_proj.biases": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.1.q_b_proj.scales": "model-00039-of-00085.safetensors",
+ "model.layers.12.self_attn.1.q_b_proj.weight": "model-00039-of-00085.safetensors",
+ "model.layers.13.input_layernorm.0.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.input_layernorm.1.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlp.router.classifier.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlp.router.classifier.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlp.router.classifier.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlp.router.e_score_correction_bias": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlp.switch_mlp.down_proj.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlp.switch_mlp.down_proj.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlp.switch_mlp.down_proj.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlp.switch_mlp.gate_proj.biases": "model-00040-of-00085.safetensors",
+ "model.layers.13.mlp.switch_mlp.gate_proj.scales": "model-00040-of-00085.safetensors",
+ "model.layers.13.mlp.switch_mlp.gate_proj.weight": "model-00040-of-00085.safetensors",
+ "model.layers.13.mlp.switch_mlp.up_proj.biases": "model-00041-of-00085.safetensors",
+ "model.layers.13.mlp.switch_mlp.up_proj.scales": "model-00041-of-00085.safetensors",
+ "model.layers.13.mlp.switch_mlp.up_proj.weight": "model-00041-of-00085.safetensors",
+ "model.layers.13.mlps.0.down_proj.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlps.0.down_proj.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlps.0.down_proj.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlps.0.gate_proj.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlps.0.gate_proj.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlps.0.gate_proj.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlps.0.up_proj.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlps.0.up_proj.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlps.0.up_proj.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlps.1.down_proj.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlps.1.down_proj.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlps.1.down_proj.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlps.1.gate_proj.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlps.1.gate_proj.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlps.1.gate_proj.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlps.1.up_proj.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlps.1.up_proj.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.mlps.1.up_proj.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.post_attention_layernorm.0.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.post_attention_layernorm.1.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.0.kv_a_layernorm.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.0.kv_a_proj_with_mqa.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.0.kv_a_proj_with_mqa.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.0.kv_a_proj_with_mqa.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.0.kv_b_proj.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.0.kv_b_proj.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.0.kv_b_proj.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.0.o_proj.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.0.o_proj.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.0.o_proj.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.0.q_a_layernorm.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.0.q_a_proj.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.0.q_a_proj.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.0.q_a_proj.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.0.q_b_proj.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.0.q_b_proj.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.0.q_b_proj.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.1.kv_a_layernorm.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.1.kv_a_proj_with_mqa.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.1.kv_a_proj_with_mqa.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.1.kv_a_proj_with_mqa.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.1.kv_b_proj.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.1.kv_b_proj.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.1.kv_b_proj.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.1.o_proj.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.1.o_proj.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.1.o_proj.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.1.q_a_layernorm.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.1.q_a_proj.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.1.q_a_proj.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.1.q_a_proj.weight": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.1.q_b_proj.biases": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.1.q_b_proj.scales": "model-00042-of-00085.safetensors",
+ "model.layers.13.self_attn.1.q_b_proj.weight": "model-00042-of-00085.safetensors",
+ "model.layers.14.input_layernorm.0.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.input_layernorm.1.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlp.router.classifier.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlp.router.classifier.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlp.router.classifier.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlp.router.e_score_correction_bias": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlp.switch_mlp.down_proj.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlp.switch_mlp.down_proj.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlp.switch_mlp.down_proj.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlp.switch_mlp.gate_proj.biases": "model-00043-of-00085.safetensors",
+ "model.layers.14.mlp.switch_mlp.gate_proj.scales": "model-00043-of-00085.safetensors",
+ "model.layers.14.mlp.switch_mlp.gate_proj.weight": "model-00043-of-00085.safetensors",
+ "model.layers.14.mlp.switch_mlp.up_proj.biases": "model-00044-of-00085.safetensors",
+ "model.layers.14.mlp.switch_mlp.up_proj.scales": "model-00044-of-00085.safetensors",
+ "model.layers.14.mlp.switch_mlp.up_proj.weight": "model-00044-of-00085.safetensors",
+ "model.layers.14.mlps.0.down_proj.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlps.0.down_proj.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlps.0.down_proj.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlps.0.gate_proj.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlps.0.gate_proj.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlps.0.gate_proj.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlps.0.up_proj.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlps.0.up_proj.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlps.0.up_proj.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlps.1.down_proj.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlps.1.down_proj.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlps.1.down_proj.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlps.1.gate_proj.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlps.1.gate_proj.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlps.1.gate_proj.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlps.1.up_proj.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlps.1.up_proj.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.mlps.1.up_proj.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.post_attention_layernorm.0.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.post_attention_layernorm.1.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.0.kv_a_layernorm.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.0.kv_a_proj_with_mqa.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.0.kv_a_proj_with_mqa.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.0.kv_a_proj_with_mqa.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.0.kv_b_proj.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.0.kv_b_proj.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.0.kv_b_proj.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.0.o_proj.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.0.o_proj.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.0.o_proj.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.0.q_a_layernorm.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.0.q_a_proj.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.0.q_a_proj.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.0.q_a_proj.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.0.q_b_proj.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.0.q_b_proj.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.0.q_b_proj.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.1.kv_a_layernorm.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.1.kv_a_proj_with_mqa.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.1.kv_a_proj_with_mqa.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.1.kv_a_proj_with_mqa.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.1.kv_b_proj.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.1.kv_b_proj.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.1.kv_b_proj.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.1.o_proj.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.1.o_proj.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.1.o_proj.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.1.q_a_layernorm.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.1.q_a_proj.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.1.q_a_proj.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.1.q_a_proj.weight": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.1.q_b_proj.biases": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.1.q_b_proj.scales": "model-00045-of-00085.safetensors",
+ "model.layers.14.self_attn.1.q_b_proj.weight": "model-00045-of-00085.safetensors",
+ "model.layers.15.input_layernorm.0.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.input_layernorm.1.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlp.router.classifier.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlp.router.classifier.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlp.router.classifier.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlp.router.e_score_correction_bias": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlp.switch_mlp.down_proj.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlp.switch_mlp.down_proj.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlp.switch_mlp.down_proj.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlp.switch_mlp.gate_proj.biases": "model-00046-of-00085.safetensors",
+ "model.layers.15.mlp.switch_mlp.gate_proj.scales": "model-00046-of-00085.safetensors",
+ "model.layers.15.mlp.switch_mlp.gate_proj.weight": "model-00046-of-00085.safetensors",
+ "model.layers.15.mlp.switch_mlp.up_proj.biases": "model-00047-of-00085.safetensors",
+ "model.layers.15.mlp.switch_mlp.up_proj.scales": "model-00047-of-00085.safetensors",
+ "model.layers.15.mlp.switch_mlp.up_proj.weight": "model-00047-of-00085.safetensors",
+ "model.layers.15.mlps.0.down_proj.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlps.0.down_proj.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlps.0.down_proj.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlps.0.gate_proj.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlps.0.gate_proj.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlps.0.gate_proj.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlps.0.up_proj.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlps.0.up_proj.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlps.0.up_proj.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlps.1.down_proj.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlps.1.down_proj.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlps.1.down_proj.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlps.1.gate_proj.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlps.1.gate_proj.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlps.1.gate_proj.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlps.1.up_proj.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlps.1.up_proj.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.mlps.1.up_proj.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.post_attention_layernorm.0.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.post_attention_layernorm.1.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.0.kv_a_layernorm.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.0.kv_a_proj_with_mqa.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.0.kv_a_proj_with_mqa.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.0.kv_a_proj_with_mqa.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.0.kv_b_proj.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.0.kv_b_proj.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.0.kv_b_proj.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.0.o_proj.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.0.o_proj.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.0.o_proj.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.0.q_a_layernorm.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.0.q_a_proj.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.0.q_a_proj.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.0.q_a_proj.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.0.q_b_proj.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.0.q_b_proj.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.0.q_b_proj.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.1.kv_a_layernorm.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.1.kv_a_proj_with_mqa.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.1.kv_a_proj_with_mqa.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.1.kv_a_proj_with_mqa.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.1.kv_b_proj.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.1.kv_b_proj.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.1.kv_b_proj.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.1.o_proj.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.1.o_proj.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.1.o_proj.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.1.q_a_layernorm.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.1.q_a_proj.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.1.q_a_proj.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.1.q_a_proj.weight": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.1.q_b_proj.biases": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.1.q_b_proj.scales": "model-00048-of-00085.safetensors",
+ "model.layers.15.self_attn.1.q_b_proj.weight": "model-00048-of-00085.safetensors",
+ "model.layers.16.input_layernorm.0.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.input_layernorm.1.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlp.router.classifier.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlp.router.classifier.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlp.router.classifier.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlp.router.e_score_correction_bias": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlp.switch_mlp.down_proj.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlp.switch_mlp.down_proj.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlp.switch_mlp.down_proj.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlp.switch_mlp.gate_proj.biases": "model-00049-of-00085.safetensors",
+ "model.layers.16.mlp.switch_mlp.gate_proj.scales": "model-00049-of-00085.safetensors",
+ "model.layers.16.mlp.switch_mlp.gate_proj.weight": "model-00049-of-00085.safetensors",
+ "model.layers.16.mlp.switch_mlp.up_proj.biases": "model-00050-of-00085.safetensors",
+ "model.layers.16.mlp.switch_mlp.up_proj.scales": "model-00050-of-00085.safetensors",
+ "model.layers.16.mlp.switch_mlp.up_proj.weight": "model-00050-of-00085.safetensors",
+ "model.layers.16.mlps.0.down_proj.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlps.0.down_proj.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlps.0.down_proj.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlps.0.gate_proj.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlps.0.gate_proj.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlps.0.gate_proj.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlps.0.up_proj.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlps.0.up_proj.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlps.0.up_proj.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlps.1.down_proj.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlps.1.down_proj.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlps.1.down_proj.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlps.1.gate_proj.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlps.1.gate_proj.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlps.1.gate_proj.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlps.1.up_proj.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlps.1.up_proj.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.mlps.1.up_proj.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.post_attention_layernorm.0.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.post_attention_layernorm.1.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.0.kv_a_layernorm.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.0.kv_a_proj_with_mqa.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.0.kv_a_proj_with_mqa.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.0.kv_a_proj_with_mqa.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.0.kv_b_proj.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.0.kv_b_proj.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.0.kv_b_proj.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.0.o_proj.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.0.o_proj.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.0.o_proj.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.0.q_a_layernorm.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.0.q_a_proj.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.0.q_a_proj.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.0.q_a_proj.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.0.q_b_proj.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.0.q_b_proj.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.0.q_b_proj.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.1.kv_a_layernorm.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.1.kv_a_proj_with_mqa.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.1.kv_a_proj_with_mqa.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.1.kv_a_proj_with_mqa.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.1.kv_b_proj.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.1.kv_b_proj.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.1.kv_b_proj.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.1.o_proj.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.1.o_proj.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.1.o_proj.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.1.q_a_layernorm.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.1.q_a_proj.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.1.q_a_proj.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.1.q_a_proj.weight": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.1.q_b_proj.biases": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.1.q_b_proj.scales": "model-00051-of-00085.safetensors",
+ "model.layers.16.self_attn.1.q_b_proj.weight": "model-00051-of-00085.safetensors",
+ "model.layers.17.input_layernorm.0.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.input_layernorm.1.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlp.router.classifier.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlp.router.classifier.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlp.router.classifier.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlp.router.e_score_correction_bias": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlp.switch_mlp.down_proj.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlp.switch_mlp.down_proj.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlp.switch_mlp.down_proj.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlp.switch_mlp.gate_proj.biases": "model-00052-of-00085.safetensors",
+ "model.layers.17.mlp.switch_mlp.gate_proj.scales": "model-00052-of-00085.safetensors",
+ "model.layers.17.mlp.switch_mlp.gate_proj.weight": "model-00052-of-00085.safetensors",
+ "model.layers.17.mlp.switch_mlp.up_proj.biases": "model-00053-of-00085.safetensors",
+ "model.layers.17.mlp.switch_mlp.up_proj.scales": "model-00053-of-00085.safetensors",
+ "model.layers.17.mlp.switch_mlp.up_proj.weight": "model-00053-of-00085.safetensors",
+ "model.layers.17.mlps.0.down_proj.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlps.0.down_proj.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlps.0.down_proj.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlps.0.gate_proj.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlps.0.gate_proj.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlps.0.gate_proj.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlps.0.up_proj.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlps.0.up_proj.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlps.0.up_proj.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlps.1.down_proj.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlps.1.down_proj.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlps.1.down_proj.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlps.1.gate_proj.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlps.1.gate_proj.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlps.1.gate_proj.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlps.1.up_proj.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlps.1.up_proj.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.mlps.1.up_proj.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.post_attention_layernorm.0.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.post_attention_layernorm.1.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.0.kv_a_layernorm.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.0.kv_a_proj_with_mqa.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.0.kv_a_proj_with_mqa.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.0.kv_a_proj_with_mqa.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.0.kv_b_proj.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.0.kv_b_proj.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.0.kv_b_proj.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.0.o_proj.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.0.o_proj.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.0.o_proj.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.0.q_a_layernorm.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.0.q_a_proj.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.0.q_a_proj.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.0.q_a_proj.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.0.q_b_proj.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.0.q_b_proj.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.0.q_b_proj.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.1.kv_a_layernorm.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.1.kv_a_proj_with_mqa.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.1.kv_a_proj_with_mqa.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.1.kv_a_proj_with_mqa.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.1.kv_b_proj.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.1.kv_b_proj.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.1.kv_b_proj.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.1.o_proj.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.1.o_proj.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.1.o_proj.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.1.q_a_layernorm.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.1.q_a_proj.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.1.q_a_proj.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.1.q_a_proj.weight": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.1.q_b_proj.biases": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.1.q_b_proj.scales": "model-00054-of-00085.safetensors",
+ "model.layers.17.self_attn.1.q_b_proj.weight": "model-00054-of-00085.safetensors",
+ "model.layers.18.input_layernorm.0.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.input_layernorm.1.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlp.router.classifier.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlp.router.classifier.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlp.router.classifier.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlp.router.e_score_correction_bias": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlp.switch_mlp.down_proj.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlp.switch_mlp.down_proj.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlp.switch_mlp.down_proj.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlp.switch_mlp.gate_proj.biases": "model-00055-of-00085.safetensors",
+ "model.layers.18.mlp.switch_mlp.gate_proj.scales": "model-00055-of-00085.safetensors",
+ "model.layers.18.mlp.switch_mlp.gate_proj.weight": "model-00055-of-00085.safetensors",
+ "model.layers.18.mlp.switch_mlp.up_proj.biases": "model-00056-of-00085.safetensors",
+ "model.layers.18.mlp.switch_mlp.up_proj.scales": "model-00056-of-00085.safetensors",
+ "model.layers.18.mlp.switch_mlp.up_proj.weight": "model-00056-of-00085.safetensors",
+ "model.layers.18.mlps.0.down_proj.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlps.0.down_proj.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlps.0.down_proj.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlps.0.gate_proj.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlps.0.gate_proj.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlps.0.gate_proj.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlps.0.up_proj.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlps.0.up_proj.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlps.0.up_proj.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlps.1.down_proj.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlps.1.down_proj.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlps.1.down_proj.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlps.1.gate_proj.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlps.1.gate_proj.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlps.1.gate_proj.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlps.1.up_proj.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlps.1.up_proj.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.mlps.1.up_proj.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.post_attention_layernorm.0.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.post_attention_layernorm.1.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.0.kv_a_layernorm.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.0.kv_a_proj_with_mqa.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.0.kv_a_proj_with_mqa.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.0.kv_a_proj_with_mqa.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.0.kv_b_proj.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.0.kv_b_proj.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.0.kv_b_proj.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.0.o_proj.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.0.o_proj.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.0.o_proj.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.0.q_a_layernorm.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.0.q_a_proj.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.0.q_a_proj.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.0.q_a_proj.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.0.q_b_proj.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.0.q_b_proj.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.0.q_b_proj.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.1.kv_a_layernorm.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.1.kv_a_proj_with_mqa.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.1.kv_a_proj_with_mqa.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.1.kv_a_proj_with_mqa.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.1.kv_b_proj.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.1.kv_b_proj.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.1.kv_b_proj.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.1.o_proj.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.1.o_proj.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.1.o_proj.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.1.q_a_layernorm.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.1.q_a_proj.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.1.q_a_proj.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.1.q_a_proj.weight": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.1.q_b_proj.biases": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.1.q_b_proj.scales": "model-00057-of-00085.safetensors",
+ "model.layers.18.self_attn.1.q_b_proj.weight": "model-00057-of-00085.safetensors",
+ "model.layers.19.input_layernorm.0.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.input_layernorm.1.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlp.router.classifier.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlp.router.classifier.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlp.router.classifier.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlp.router.e_score_correction_bias": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlp.switch_mlp.down_proj.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlp.switch_mlp.down_proj.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlp.switch_mlp.down_proj.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlp.switch_mlp.gate_proj.biases": "model-00058-of-00085.safetensors",
+ "model.layers.19.mlp.switch_mlp.gate_proj.scales": "model-00058-of-00085.safetensors",
+ "model.layers.19.mlp.switch_mlp.gate_proj.weight": "model-00058-of-00085.safetensors",
+ "model.layers.19.mlp.switch_mlp.up_proj.biases": "model-00059-of-00085.safetensors",
+ "model.layers.19.mlp.switch_mlp.up_proj.scales": "model-00059-of-00085.safetensors",
+ "model.layers.19.mlp.switch_mlp.up_proj.weight": "model-00059-of-00085.safetensors",
+ "model.layers.19.mlps.0.down_proj.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlps.0.down_proj.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlps.0.down_proj.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlps.0.gate_proj.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlps.0.gate_proj.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlps.0.gate_proj.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlps.0.up_proj.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlps.0.up_proj.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlps.0.up_proj.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlps.1.down_proj.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlps.1.down_proj.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlps.1.down_proj.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlps.1.gate_proj.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlps.1.gate_proj.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlps.1.gate_proj.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlps.1.up_proj.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlps.1.up_proj.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.mlps.1.up_proj.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.post_attention_layernorm.0.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.post_attention_layernorm.1.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.0.kv_a_layernorm.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.0.kv_a_proj_with_mqa.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.0.kv_a_proj_with_mqa.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.0.kv_a_proj_with_mqa.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.0.kv_b_proj.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.0.kv_b_proj.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.0.kv_b_proj.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.0.o_proj.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.0.o_proj.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.0.o_proj.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.0.q_a_layernorm.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.0.q_a_proj.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.0.q_a_proj.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.0.q_a_proj.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.0.q_b_proj.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.0.q_b_proj.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.0.q_b_proj.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.1.kv_a_layernorm.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.1.kv_a_proj_with_mqa.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.1.kv_a_proj_with_mqa.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.1.kv_a_proj_with_mqa.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.1.kv_b_proj.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.1.kv_b_proj.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.1.kv_b_proj.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.1.o_proj.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.1.o_proj.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.1.o_proj.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.1.q_a_layernorm.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.1.q_a_proj.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.1.q_a_proj.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.1.q_a_proj.weight": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.1.q_b_proj.biases": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.1.q_b_proj.scales": "model-00060-of-00085.safetensors",
+ "model.layers.19.self_attn.1.q_b_proj.weight": "model-00060-of-00085.safetensors",
+ "model.layers.2.input_layernorm.0.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.input_layernorm.1.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlp.router.classifier.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlp.router.classifier.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlp.router.classifier.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlp.router.e_score_correction_bias": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlp.switch_mlp.down_proj.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlp.switch_mlp.down_proj.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlp.switch_mlp.down_proj.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlp.switch_mlp.gate_proj.biases": "model-00007-of-00085.safetensors",
+ "model.layers.2.mlp.switch_mlp.gate_proj.scales": "model-00007-of-00085.safetensors",
+ "model.layers.2.mlp.switch_mlp.gate_proj.weight": "model-00007-of-00085.safetensors",
+ "model.layers.2.mlp.switch_mlp.up_proj.biases": "model-00008-of-00085.safetensors",
+ "model.layers.2.mlp.switch_mlp.up_proj.scales": "model-00008-of-00085.safetensors",
+ "model.layers.2.mlp.switch_mlp.up_proj.weight": "model-00008-of-00085.safetensors",
+ "model.layers.2.mlps.0.down_proj.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlps.0.down_proj.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlps.0.down_proj.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlps.0.gate_proj.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlps.0.gate_proj.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlps.0.gate_proj.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlps.0.up_proj.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlps.0.up_proj.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlps.0.up_proj.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlps.1.down_proj.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlps.1.down_proj.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlps.1.down_proj.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlps.1.gate_proj.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlps.1.gate_proj.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlps.1.gate_proj.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlps.1.up_proj.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlps.1.up_proj.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.mlps.1.up_proj.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.post_attention_layernorm.0.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.post_attention_layernorm.1.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.0.kv_a_layernorm.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.0.kv_a_proj_with_mqa.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.0.kv_a_proj_with_mqa.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.0.kv_a_proj_with_mqa.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.0.kv_b_proj.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.0.kv_b_proj.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.0.kv_b_proj.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.0.o_proj.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.0.o_proj.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.0.o_proj.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.0.q_a_layernorm.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.0.q_a_proj.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.0.q_a_proj.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.0.q_a_proj.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.0.q_b_proj.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.0.q_b_proj.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.0.q_b_proj.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.1.kv_a_layernorm.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.1.kv_a_proj_with_mqa.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.1.kv_a_proj_with_mqa.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.1.kv_a_proj_with_mqa.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.1.kv_b_proj.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.1.kv_b_proj.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.1.kv_b_proj.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.1.o_proj.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.1.o_proj.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.1.o_proj.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.1.q_a_layernorm.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.1.q_a_proj.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.1.q_a_proj.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.1.q_a_proj.weight": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.1.q_b_proj.biases": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.1.q_b_proj.scales": "model-00009-of-00085.safetensors",
+ "model.layers.2.self_attn.1.q_b_proj.weight": "model-00009-of-00085.safetensors",
+ "model.layers.20.input_layernorm.0.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.input_layernorm.1.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlp.router.classifier.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlp.router.classifier.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlp.router.classifier.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlp.router.e_score_correction_bias": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlp.switch_mlp.down_proj.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlp.switch_mlp.down_proj.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlp.switch_mlp.down_proj.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlp.switch_mlp.gate_proj.biases": "model-00061-of-00085.safetensors",
+ "model.layers.20.mlp.switch_mlp.gate_proj.scales": "model-00061-of-00085.safetensors",
+ "model.layers.20.mlp.switch_mlp.gate_proj.weight": "model-00061-of-00085.safetensors",
+ "model.layers.20.mlp.switch_mlp.up_proj.biases": "model-00062-of-00085.safetensors",
+ "model.layers.20.mlp.switch_mlp.up_proj.scales": "model-00062-of-00085.safetensors",
+ "model.layers.20.mlp.switch_mlp.up_proj.weight": "model-00062-of-00085.safetensors",
+ "model.layers.20.mlps.0.down_proj.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlps.0.down_proj.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlps.0.down_proj.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlps.0.gate_proj.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlps.0.gate_proj.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlps.0.gate_proj.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlps.0.up_proj.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlps.0.up_proj.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlps.0.up_proj.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlps.1.down_proj.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlps.1.down_proj.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlps.1.down_proj.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlps.1.gate_proj.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlps.1.gate_proj.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlps.1.gate_proj.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlps.1.up_proj.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlps.1.up_proj.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.mlps.1.up_proj.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.post_attention_layernorm.0.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.post_attention_layernorm.1.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.0.kv_a_layernorm.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.0.kv_a_proj_with_mqa.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.0.kv_a_proj_with_mqa.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.0.kv_a_proj_with_mqa.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.0.kv_b_proj.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.0.kv_b_proj.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.0.kv_b_proj.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.0.o_proj.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.0.o_proj.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.0.o_proj.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.0.q_a_layernorm.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.0.q_a_proj.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.0.q_a_proj.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.0.q_a_proj.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.0.q_b_proj.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.0.q_b_proj.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.0.q_b_proj.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.1.kv_a_layernorm.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.1.kv_a_proj_with_mqa.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.1.kv_a_proj_with_mqa.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.1.kv_a_proj_with_mqa.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.1.kv_b_proj.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.1.kv_b_proj.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.1.kv_b_proj.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.1.o_proj.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.1.o_proj.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.1.o_proj.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.1.q_a_layernorm.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.1.q_a_proj.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.1.q_a_proj.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.1.q_a_proj.weight": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.1.q_b_proj.biases": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.1.q_b_proj.scales": "model-00063-of-00085.safetensors",
+ "model.layers.20.self_attn.1.q_b_proj.weight": "model-00063-of-00085.safetensors",
+ "model.layers.21.input_layernorm.0.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.input_layernorm.1.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlp.router.classifier.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlp.router.classifier.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlp.router.classifier.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlp.router.e_score_correction_bias": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlp.switch_mlp.down_proj.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlp.switch_mlp.down_proj.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlp.switch_mlp.down_proj.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlp.switch_mlp.gate_proj.biases": "model-00064-of-00085.safetensors",
+ "model.layers.21.mlp.switch_mlp.gate_proj.scales": "model-00064-of-00085.safetensors",
+ "model.layers.21.mlp.switch_mlp.gate_proj.weight": "model-00064-of-00085.safetensors",
+ "model.layers.21.mlp.switch_mlp.up_proj.biases": "model-00065-of-00085.safetensors",
+ "model.layers.21.mlp.switch_mlp.up_proj.scales": "model-00065-of-00085.safetensors",
+ "model.layers.21.mlp.switch_mlp.up_proj.weight": "model-00065-of-00085.safetensors",
+ "model.layers.21.mlps.0.down_proj.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlps.0.down_proj.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlps.0.down_proj.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlps.0.gate_proj.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlps.0.gate_proj.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlps.0.gate_proj.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlps.0.up_proj.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlps.0.up_proj.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlps.0.up_proj.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlps.1.down_proj.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlps.1.down_proj.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlps.1.down_proj.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlps.1.gate_proj.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlps.1.gate_proj.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlps.1.gate_proj.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlps.1.up_proj.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlps.1.up_proj.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.mlps.1.up_proj.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.post_attention_layernorm.0.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.post_attention_layernorm.1.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.0.kv_a_layernorm.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.0.kv_a_proj_with_mqa.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.0.kv_a_proj_with_mqa.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.0.kv_a_proj_with_mqa.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.0.kv_b_proj.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.0.kv_b_proj.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.0.kv_b_proj.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.0.o_proj.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.0.o_proj.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.0.o_proj.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.0.q_a_layernorm.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.0.q_a_proj.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.0.q_a_proj.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.0.q_a_proj.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.0.q_b_proj.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.0.q_b_proj.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.0.q_b_proj.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.1.kv_a_layernorm.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.1.kv_a_proj_with_mqa.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.1.kv_a_proj_with_mqa.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.1.kv_a_proj_with_mqa.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.1.kv_b_proj.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.1.kv_b_proj.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.1.kv_b_proj.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.1.o_proj.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.1.o_proj.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.1.o_proj.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.1.q_a_layernorm.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.1.q_a_proj.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.1.q_a_proj.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.1.q_a_proj.weight": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.1.q_b_proj.biases": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.1.q_b_proj.scales": "model-00066-of-00085.safetensors",
+ "model.layers.21.self_attn.1.q_b_proj.weight": "model-00066-of-00085.safetensors",
+ "model.layers.22.input_layernorm.0.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.input_layernorm.1.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlp.router.classifier.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlp.router.classifier.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlp.router.classifier.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlp.router.e_score_correction_bias": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlp.switch_mlp.down_proj.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlp.switch_mlp.down_proj.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlp.switch_mlp.down_proj.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlp.switch_mlp.gate_proj.biases": "model-00067-of-00085.safetensors",
+ "model.layers.22.mlp.switch_mlp.gate_proj.scales": "model-00067-of-00085.safetensors",
+ "model.layers.22.mlp.switch_mlp.gate_proj.weight": "model-00067-of-00085.safetensors",
+ "model.layers.22.mlp.switch_mlp.up_proj.biases": "model-00068-of-00085.safetensors",
+ "model.layers.22.mlp.switch_mlp.up_proj.scales": "model-00068-of-00085.safetensors",
+ "model.layers.22.mlp.switch_mlp.up_proj.weight": "model-00068-of-00085.safetensors",
+ "model.layers.22.mlps.0.down_proj.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlps.0.down_proj.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlps.0.down_proj.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlps.0.gate_proj.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlps.0.gate_proj.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlps.0.gate_proj.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlps.0.up_proj.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlps.0.up_proj.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlps.0.up_proj.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlps.1.down_proj.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlps.1.down_proj.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlps.1.down_proj.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlps.1.gate_proj.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlps.1.gate_proj.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlps.1.gate_proj.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlps.1.up_proj.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlps.1.up_proj.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.mlps.1.up_proj.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.post_attention_layernorm.0.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.post_attention_layernorm.1.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.0.kv_a_layernorm.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.0.kv_a_proj_with_mqa.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.0.kv_a_proj_with_mqa.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.0.kv_a_proj_with_mqa.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.0.kv_b_proj.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.0.kv_b_proj.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.0.kv_b_proj.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.0.o_proj.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.0.o_proj.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.0.o_proj.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.0.q_a_layernorm.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.0.q_a_proj.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.0.q_a_proj.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.0.q_a_proj.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.0.q_b_proj.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.0.q_b_proj.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.0.q_b_proj.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.1.kv_a_layernorm.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.1.kv_a_proj_with_mqa.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.1.kv_a_proj_with_mqa.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.1.kv_a_proj_with_mqa.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.1.kv_b_proj.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.1.kv_b_proj.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.1.kv_b_proj.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.1.o_proj.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.1.o_proj.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.1.o_proj.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.1.q_a_layernorm.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.1.q_a_proj.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.1.q_a_proj.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.1.q_a_proj.weight": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.1.q_b_proj.biases": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.1.q_b_proj.scales": "model-00069-of-00085.safetensors",
+ "model.layers.22.self_attn.1.q_b_proj.weight": "model-00069-of-00085.safetensors",
+ "model.layers.23.input_layernorm.0.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.input_layernorm.1.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlp.router.classifier.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlp.router.classifier.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlp.router.classifier.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlp.router.e_score_correction_bias": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlp.switch_mlp.down_proj.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlp.switch_mlp.down_proj.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlp.switch_mlp.down_proj.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlp.switch_mlp.gate_proj.biases": "model-00070-of-00085.safetensors",
+ "model.layers.23.mlp.switch_mlp.gate_proj.scales": "model-00070-of-00085.safetensors",
+ "model.layers.23.mlp.switch_mlp.gate_proj.weight": "model-00070-of-00085.safetensors",
+ "model.layers.23.mlp.switch_mlp.up_proj.biases": "model-00071-of-00085.safetensors",
+ "model.layers.23.mlp.switch_mlp.up_proj.scales": "model-00071-of-00085.safetensors",
+ "model.layers.23.mlp.switch_mlp.up_proj.weight": "model-00071-of-00085.safetensors",
+ "model.layers.23.mlps.0.down_proj.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlps.0.down_proj.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlps.0.down_proj.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlps.0.gate_proj.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlps.0.gate_proj.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlps.0.gate_proj.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlps.0.up_proj.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlps.0.up_proj.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlps.0.up_proj.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlps.1.down_proj.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlps.1.down_proj.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlps.1.down_proj.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlps.1.gate_proj.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlps.1.gate_proj.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlps.1.gate_proj.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlps.1.up_proj.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlps.1.up_proj.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.mlps.1.up_proj.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.post_attention_layernorm.0.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.post_attention_layernorm.1.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.0.kv_a_layernorm.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.0.kv_a_proj_with_mqa.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.0.kv_a_proj_with_mqa.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.0.kv_a_proj_with_mqa.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.0.kv_b_proj.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.0.kv_b_proj.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.0.kv_b_proj.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.0.o_proj.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.0.o_proj.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.0.o_proj.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.0.q_a_layernorm.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.0.q_a_proj.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.0.q_a_proj.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.0.q_a_proj.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.0.q_b_proj.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.0.q_b_proj.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.0.q_b_proj.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.1.kv_a_layernorm.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.1.kv_a_proj_with_mqa.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.1.kv_a_proj_with_mqa.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.1.kv_a_proj_with_mqa.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.1.kv_b_proj.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.1.kv_b_proj.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.1.kv_b_proj.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.1.o_proj.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.1.o_proj.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.1.o_proj.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.1.q_a_layernorm.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.1.q_a_proj.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.1.q_a_proj.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.1.q_a_proj.weight": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.1.q_b_proj.biases": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.1.q_b_proj.scales": "model-00072-of-00085.safetensors",
+ "model.layers.23.self_attn.1.q_b_proj.weight": "model-00072-of-00085.safetensors",
+ "model.layers.24.input_layernorm.0.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.input_layernorm.1.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlp.router.classifier.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlp.router.classifier.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlp.router.classifier.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlp.router.e_score_correction_bias": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlp.switch_mlp.down_proj.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlp.switch_mlp.down_proj.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlp.switch_mlp.down_proj.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlp.switch_mlp.gate_proj.biases": "model-00073-of-00085.safetensors",
+ "model.layers.24.mlp.switch_mlp.gate_proj.scales": "model-00073-of-00085.safetensors",
+ "model.layers.24.mlp.switch_mlp.gate_proj.weight": "model-00073-of-00085.safetensors",
+ "model.layers.24.mlp.switch_mlp.up_proj.biases": "model-00074-of-00085.safetensors",
+ "model.layers.24.mlp.switch_mlp.up_proj.scales": "model-00074-of-00085.safetensors",
+ "model.layers.24.mlp.switch_mlp.up_proj.weight": "model-00074-of-00085.safetensors",
+ "model.layers.24.mlps.0.down_proj.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlps.0.down_proj.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlps.0.down_proj.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlps.0.gate_proj.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlps.0.gate_proj.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlps.0.gate_proj.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlps.0.up_proj.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlps.0.up_proj.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlps.0.up_proj.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlps.1.down_proj.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlps.1.down_proj.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlps.1.down_proj.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlps.1.gate_proj.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlps.1.gate_proj.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlps.1.gate_proj.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlps.1.up_proj.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlps.1.up_proj.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.mlps.1.up_proj.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.post_attention_layernorm.0.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.post_attention_layernorm.1.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.0.kv_a_layernorm.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.0.kv_a_proj_with_mqa.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.0.kv_a_proj_with_mqa.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.0.kv_a_proj_with_mqa.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.0.kv_b_proj.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.0.kv_b_proj.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.0.kv_b_proj.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.0.o_proj.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.0.o_proj.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.0.o_proj.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.0.q_a_layernorm.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.0.q_a_proj.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.0.q_a_proj.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.0.q_a_proj.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.0.q_b_proj.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.0.q_b_proj.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.0.q_b_proj.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.1.kv_a_layernorm.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.1.kv_a_proj_with_mqa.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.1.kv_a_proj_with_mqa.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.1.kv_a_proj_with_mqa.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.1.kv_b_proj.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.1.kv_b_proj.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.1.kv_b_proj.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.1.o_proj.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.1.o_proj.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.1.o_proj.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.1.q_a_layernorm.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.1.q_a_proj.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.1.q_a_proj.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.1.q_a_proj.weight": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.1.q_b_proj.biases": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.1.q_b_proj.scales": "model-00075-of-00085.safetensors",
+ "model.layers.24.self_attn.1.q_b_proj.weight": "model-00075-of-00085.safetensors",
+ "model.layers.25.input_layernorm.0.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.input_layernorm.1.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlp.router.classifier.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlp.router.classifier.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlp.router.classifier.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlp.router.e_score_correction_bias": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlp.switch_mlp.down_proj.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlp.switch_mlp.down_proj.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlp.switch_mlp.down_proj.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlp.switch_mlp.gate_proj.biases": "model-00076-of-00085.safetensors",
+ "model.layers.25.mlp.switch_mlp.gate_proj.scales": "model-00076-of-00085.safetensors",
+ "model.layers.25.mlp.switch_mlp.gate_proj.weight": "model-00076-of-00085.safetensors",
+ "model.layers.25.mlp.switch_mlp.up_proj.biases": "model-00077-of-00085.safetensors",
+ "model.layers.25.mlp.switch_mlp.up_proj.scales": "model-00077-of-00085.safetensors",
+ "model.layers.25.mlp.switch_mlp.up_proj.weight": "model-00077-of-00085.safetensors",
+ "model.layers.25.mlps.0.down_proj.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlps.0.down_proj.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlps.0.down_proj.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlps.0.gate_proj.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlps.0.gate_proj.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlps.0.gate_proj.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlps.0.up_proj.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlps.0.up_proj.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlps.0.up_proj.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlps.1.down_proj.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlps.1.down_proj.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlps.1.down_proj.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlps.1.gate_proj.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlps.1.gate_proj.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlps.1.gate_proj.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlps.1.up_proj.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlps.1.up_proj.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.mlps.1.up_proj.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.post_attention_layernorm.0.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.post_attention_layernorm.1.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.0.kv_a_layernorm.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.0.kv_a_proj_with_mqa.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.0.kv_a_proj_with_mqa.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.0.kv_a_proj_with_mqa.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.0.kv_b_proj.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.0.kv_b_proj.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.0.kv_b_proj.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.0.o_proj.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.0.o_proj.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.0.o_proj.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.0.q_a_layernorm.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.0.q_a_proj.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.0.q_a_proj.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.0.q_a_proj.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.0.q_b_proj.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.0.q_b_proj.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.0.q_b_proj.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.1.kv_a_layernorm.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.1.kv_a_proj_with_mqa.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.1.kv_a_proj_with_mqa.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.1.kv_a_proj_with_mqa.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.1.kv_b_proj.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.1.kv_b_proj.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.1.kv_b_proj.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.1.o_proj.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.1.o_proj.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.1.o_proj.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.1.q_a_layernorm.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.1.q_a_proj.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.1.q_a_proj.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.1.q_a_proj.weight": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.1.q_b_proj.biases": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.1.q_b_proj.scales": "model-00078-of-00085.safetensors",
+ "model.layers.25.self_attn.1.q_b_proj.weight": "model-00078-of-00085.safetensors",
+ "model.layers.26.input_layernorm.0.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.input_layernorm.1.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlp.router.classifier.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlp.router.classifier.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlp.router.classifier.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlp.router.e_score_correction_bias": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlp.switch_mlp.down_proj.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlp.switch_mlp.down_proj.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlp.switch_mlp.down_proj.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlp.switch_mlp.gate_proj.biases": "model-00079-of-00085.safetensors",
+ "model.layers.26.mlp.switch_mlp.gate_proj.scales": "model-00079-of-00085.safetensors",
+ "model.layers.26.mlp.switch_mlp.gate_proj.weight": "model-00079-of-00085.safetensors",
+ "model.layers.26.mlp.switch_mlp.up_proj.biases": "model-00080-of-00085.safetensors",
+ "model.layers.26.mlp.switch_mlp.up_proj.scales": "model-00080-of-00085.safetensors",
+ "model.layers.26.mlp.switch_mlp.up_proj.weight": "model-00080-of-00085.safetensors",
+ "model.layers.26.mlps.0.down_proj.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlps.0.down_proj.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlps.0.down_proj.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlps.0.gate_proj.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlps.0.gate_proj.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlps.0.gate_proj.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlps.0.up_proj.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlps.0.up_proj.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlps.0.up_proj.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlps.1.down_proj.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlps.1.down_proj.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlps.1.down_proj.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlps.1.gate_proj.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlps.1.gate_proj.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlps.1.gate_proj.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlps.1.up_proj.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlps.1.up_proj.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.mlps.1.up_proj.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.post_attention_layernorm.0.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.post_attention_layernorm.1.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.0.kv_a_layernorm.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.0.kv_a_proj_with_mqa.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.0.kv_a_proj_with_mqa.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.0.kv_a_proj_with_mqa.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.0.kv_b_proj.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.0.kv_b_proj.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.0.kv_b_proj.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.0.o_proj.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.0.o_proj.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.0.o_proj.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.0.q_a_layernorm.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.0.q_a_proj.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.0.q_a_proj.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.0.q_a_proj.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.0.q_b_proj.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.0.q_b_proj.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.0.q_b_proj.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.1.kv_a_layernorm.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.1.kv_a_proj_with_mqa.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.1.kv_a_proj_with_mqa.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.1.kv_a_proj_with_mqa.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.1.kv_b_proj.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.1.kv_b_proj.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.1.kv_b_proj.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.1.o_proj.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.1.o_proj.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.1.o_proj.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.1.q_a_layernorm.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.1.q_a_proj.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.1.q_a_proj.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.1.q_a_proj.weight": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.1.q_b_proj.biases": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.1.q_b_proj.scales": "model-00081-of-00085.safetensors",
+ "model.layers.26.self_attn.1.q_b_proj.weight": "model-00081-of-00085.safetensors",
+ "model.layers.27.input_layernorm.0.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.input_layernorm.1.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlp.router.classifier.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlp.router.classifier.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlp.router.classifier.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlp.router.e_score_correction_bias": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlp.switch_mlp.down_proj.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlp.switch_mlp.down_proj.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlp.switch_mlp.down_proj.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlp.switch_mlp.gate_proj.biases": "model-00082-of-00085.safetensors",
+ "model.layers.27.mlp.switch_mlp.gate_proj.scales": "model-00082-of-00085.safetensors",
+ "model.layers.27.mlp.switch_mlp.gate_proj.weight": "model-00082-of-00085.safetensors",
+ "model.layers.27.mlp.switch_mlp.up_proj.biases": "model-00083-of-00085.safetensors",
+ "model.layers.27.mlp.switch_mlp.up_proj.scales": "model-00083-of-00085.safetensors",
+ "model.layers.27.mlp.switch_mlp.up_proj.weight": "model-00083-of-00085.safetensors",
+ "model.layers.27.mlps.0.down_proj.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlps.0.down_proj.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlps.0.down_proj.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlps.0.gate_proj.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlps.0.gate_proj.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlps.0.gate_proj.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlps.0.up_proj.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlps.0.up_proj.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlps.0.up_proj.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlps.1.down_proj.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlps.1.down_proj.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlps.1.down_proj.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlps.1.gate_proj.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlps.1.gate_proj.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlps.1.gate_proj.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlps.1.up_proj.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlps.1.up_proj.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.mlps.1.up_proj.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.post_attention_layernorm.0.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.post_attention_layernorm.1.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.0.kv_a_layernorm.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.0.kv_a_proj_with_mqa.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.0.kv_a_proj_with_mqa.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.0.kv_a_proj_with_mqa.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.0.kv_b_proj.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.0.kv_b_proj.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.0.kv_b_proj.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.0.o_proj.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.0.o_proj.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.0.o_proj.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.0.q_a_layernorm.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.0.q_a_proj.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.0.q_a_proj.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.0.q_a_proj.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.0.q_b_proj.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.0.q_b_proj.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.0.q_b_proj.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.1.kv_a_layernorm.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.1.kv_a_proj_with_mqa.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.1.kv_a_proj_with_mqa.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.1.kv_a_proj_with_mqa.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.1.kv_b_proj.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.1.kv_b_proj.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.1.kv_b_proj.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.1.o_proj.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.1.o_proj.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.1.o_proj.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.1.q_a_layernorm.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.1.q_a_proj.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.1.q_a_proj.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.1.q_a_proj.weight": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.1.q_b_proj.biases": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.1.q_b_proj.scales": "model-00084-of-00085.safetensors",
+ "model.layers.27.self_attn.1.q_b_proj.weight": "model-00084-of-00085.safetensors",
+ "model.layers.3.input_layernorm.0.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.input_layernorm.1.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlp.router.classifier.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlp.router.classifier.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlp.router.classifier.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlp.router.e_score_correction_bias": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlp.switch_mlp.down_proj.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlp.switch_mlp.down_proj.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlp.switch_mlp.down_proj.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlp.switch_mlp.gate_proj.biases": "model-00010-of-00085.safetensors",
+ "model.layers.3.mlp.switch_mlp.gate_proj.scales": "model-00010-of-00085.safetensors",
+ "model.layers.3.mlp.switch_mlp.gate_proj.weight": "model-00010-of-00085.safetensors",
+ "model.layers.3.mlp.switch_mlp.up_proj.biases": "model-00011-of-00085.safetensors",
+ "model.layers.3.mlp.switch_mlp.up_proj.scales": "model-00011-of-00085.safetensors",
+ "model.layers.3.mlp.switch_mlp.up_proj.weight": "model-00011-of-00085.safetensors",
+ "model.layers.3.mlps.0.down_proj.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlps.0.down_proj.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlps.0.down_proj.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlps.0.gate_proj.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlps.0.gate_proj.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlps.0.gate_proj.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlps.0.up_proj.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlps.0.up_proj.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlps.0.up_proj.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlps.1.down_proj.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlps.1.down_proj.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlps.1.down_proj.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlps.1.gate_proj.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlps.1.gate_proj.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlps.1.gate_proj.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlps.1.up_proj.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlps.1.up_proj.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.mlps.1.up_proj.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.post_attention_layernorm.0.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.post_attention_layernorm.1.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.0.kv_a_layernorm.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.0.kv_a_proj_with_mqa.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.0.kv_a_proj_with_mqa.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.0.kv_a_proj_with_mqa.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.0.kv_b_proj.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.0.kv_b_proj.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.0.kv_b_proj.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.0.o_proj.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.0.o_proj.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.0.o_proj.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.0.q_a_layernorm.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.0.q_a_proj.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.0.q_a_proj.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.0.q_a_proj.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.0.q_b_proj.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.0.q_b_proj.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.0.q_b_proj.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.1.kv_a_layernorm.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.1.kv_a_proj_with_mqa.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.1.kv_a_proj_with_mqa.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.1.kv_a_proj_with_mqa.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.1.kv_b_proj.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.1.kv_b_proj.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.1.kv_b_proj.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.1.o_proj.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.1.o_proj.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.1.o_proj.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.1.q_a_layernorm.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.1.q_a_proj.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.1.q_a_proj.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.1.q_a_proj.weight": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.1.q_b_proj.biases": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.1.q_b_proj.scales": "model-00012-of-00085.safetensors",
+ "model.layers.3.self_attn.1.q_b_proj.weight": "model-00012-of-00085.safetensors",
+ "model.layers.4.input_layernorm.0.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.input_layernorm.1.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlp.router.classifier.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlp.router.classifier.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlp.router.classifier.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlp.router.e_score_correction_bias": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlp.switch_mlp.down_proj.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlp.switch_mlp.down_proj.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlp.switch_mlp.down_proj.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlp.switch_mlp.gate_proj.biases": "model-00013-of-00085.safetensors",
+ "model.layers.4.mlp.switch_mlp.gate_proj.scales": "model-00013-of-00085.safetensors",
+ "model.layers.4.mlp.switch_mlp.gate_proj.weight": "model-00013-of-00085.safetensors",
+ "model.layers.4.mlp.switch_mlp.up_proj.biases": "model-00014-of-00085.safetensors",
+ "model.layers.4.mlp.switch_mlp.up_proj.scales": "model-00014-of-00085.safetensors",
+ "model.layers.4.mlp.switch_mlp.up_proj.weight": "model-00014-of-00085.safetensors",
+ "model.layers.4.mlps.0.down_proj.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlps.0.down_proj.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlps.0.down_proj.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlps.0.gate_proj.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlps.0.gate_proj.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlps.0.gate_proj.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlps.0.up_proj.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlps.0.up_proj.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlps.0.up_proj.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlps.1.down_proj.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlps.1.down_proj.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlps.1.down_proj.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlps.1.gate_proj.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlps.1.gate_proj.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlps.1.gate_proj.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlps.1.up_proj.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlps.1.up_proj.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.mlps.1.up_proj.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.post_attention_layernorm.0.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.post_attention_layernorm.1.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.0.kv_a_layernorm.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.0.kv_a_proj_with_mqa.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.0.kv_a_proj_with_mqa.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.0.kv_a_proj_with_mqa.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.0.kv_b_proj.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.0.kv_b_proj.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.0.kv_b_proj.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.0.o_proj.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.0.o_proj.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.0.o_proj.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.0.q_a_layernorm.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.0.q_a_proj.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.0.q_a_proj.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.0.q_a_proj.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.0.q_b_proj.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.0.q_b_proj.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.0.q_b_proj.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.1.kv_a_layernorm.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.1.kv_a_proj_with_mqa.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.1.kv_a_proj_with_mqa.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.1.kv_a_proj_with_mqa.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.1.kv_b_proj.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.1.kv_b_proj.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.1.kv_b_proj.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.1.o_proj.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.1.o_proj.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.1.o_proj.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.1.q_a_layernorm.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.1.q_a_proj.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.1.q_a_proj.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.1.q_a_proj.weight": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.1.q_b_proj.biases": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.1.q_b_proj.scales": "model-00015-of-00085.safetensors",
+ "model.layers.4.self_attn.1.q_b_proj.weight": "model-00015-of-00085.safetensors",
+ "model.layers.5.input_layernorm.0.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.input_layernorm.1.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlp.router.classifier.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlp.router.classifier.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlp.router.classifier.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlp.router.e_score_correction_bias": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlp.switch_mlp.down_proj.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlp.switch_mlp.down_proj.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlp.switch_mlp.down_proj.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlp.switch_mlp.gate_proj.biases": "model-00016-of-00085.safetensors",
+ "model.layers.5.mlp.switch_mlp.gate_proj.scales": "model-00016-of-00085.safetensors",
+ "model.layers.5.mlp.switch_mlp.gate_proj.weight": "model-00016-of-00085.safetensors",
+ "model.layers.5.mlp.switch_mlp.up_proj.biases": "model-00017-of-00085.safetensors",
+ "model.layers.5.mlp.switch_mlp.up_proj.scales": "model-00017-of-00085.safetensors",
+ "model.layers.5.mlp.switch_mlp.up_proj.weight": "model-00017-of-00085.safetensors",
+ "model.layers.5.mlps.0.down_proj.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlps.0.down_proj.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlps.0.down_proj.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlps.0.gate_proj.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlps.0.gate_proj.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlps.0.gate_proj.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlps.0.up_proj.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlps.0.up_proj.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlps.0.up_proj.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlps.1.down_proj.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlps.1.down_proj.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlps.1.down_proj.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlps.1.gate_proj.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlps.1.gate_proj.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlps.1.gate_proj.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlps.1.up_proj.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlps.1.up_proj.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.mlps.1.up_proj.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.post_attention_layernorm.0.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.post_attention_layernorm.1.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.0.kv_a_layernorm.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.0.kv_a_proj_with_mqa.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.0.kv_a_proj_with_mqa.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.0.kv_a_proj_with_mqa.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.0.kv_b_proj.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.0.kv_b_proj.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.0.kv_b_proj.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.0.o_proj.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.0.o_proj.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.0.o_proj.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.0.q_a_layernorm.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.0.q_a_proj.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.0.q_a_proj.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.0.q_a_proj.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.0.q_b_proj.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.0.q_b_proj.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.0.q_b_proj.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.1.kv_a_layernorm.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.1.kv_a_proj_with_mqa.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.1.kv_a_proj_with_mqa.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.1.kv_a_proj_with_mqa.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.1.kv_b_proj.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.1.kv_b_proj.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.1.kv_b_proj.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.1.o_proj.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.1.o_proj.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.1.o_proj.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.1.q_a_layernorm.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.1.q_a_proj.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.1.q_a_proj.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.1.q_a_proj.weight": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.1.q_b_proj.biases": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.1.q_b_proj.scales": "model-00018-of-00085.safetensors",
+ "model.layers.5.self_attn.1.q_b_proj.weight": "model-00018-of-00085.safetensors",
+ "model.layers.6.input_layernorm.0.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.input_layernorm.1.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlp.router.classifier.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlp.router.classifier.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlp.router.classifier.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlp.router.e_score_correction_bias": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlp.switch_mlp.down_proj.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlp.switch_mlp.down_proj.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlp.switch_mlp.down_proj.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlp.switch_mlp.gate_proj.biases": "model-00019-of-00085.safetensors",
+ "model.layers.6.mlp.switch_mlp.gate_proj.scales": "model-00019-of-00085.safetensors",
+ "model.layers.6.mlp.switch_mlp.gate_proj.weight": "model-00019-of-00085.safetensors",
+ "model.layers.6.mlp.switch_mlp.up_proj.biases": "model-00020-of-00085.safetensors",
+ "model.layers.6.mlp.switch_mlp.up_proj.scales": "model-00020-of-00085.safetensors",
+ "model.layers.6.mlp.switch_mlp.up_proj.weight": "model-00020-of-00085.safetensors",
+ "model.layers.6.mlps.0.down_proj.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlps.0.down_proj.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlps.0.down_proj.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlps.0.gate_proj.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlps.0.gate_proj.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlps.0.gate_proj.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlps.0.up_proj.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlps.0.up_proj.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlps.0.up_proj.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlps.1.down_proj.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlps.1.down_proj.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlps.1.down_proj.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlps.1.gate_proj.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlps.1.gate_proj.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlps.1.gate_proj.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlps.1.up_proj.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlps.1.up_proj.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.mlps.1.up_proj.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.post_attention_layernorm.0.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.post_attention_layernorm.1.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.0.kv_a_layernorm.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.0.kv_a_proj_with_mqa.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.0.kv_a_proj_with_mqa.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.0.kv_a_proj_with_mqa.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.0.kv_b_proj.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.0.kv_b_proj.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.0.kv_b_proj.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.0.o_proj.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.0.o_proj.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.0.o_proj.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.0.q_a_layernorm.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.0.q_a_proj.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.0.q_a_proj.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.0.q_a_proj.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.0.q_b_proj.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.0.q_b_proj.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.0.q_b_proj.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.1.kv_a_layernorm.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.1.kv_a_proj_with_mqa.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.1.kv_a_proj_with_mqa.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.1.kv_a_proj_with_mqa.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.1.kv_b_proj.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.1.kv_b_proj.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.1.kv_b_proj.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.1.o_proj.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.1.o_proj.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.1.o_proj.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.1.q_a_layernorm.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.1.q_a_proj.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.1.q_a_proj.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.1.q_a_proj.weight": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.1.q_b_proj.biases": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.1.q_b_proj.scales": "model-00021-of-00085.safetensors",
+ "model.layers.6.self_attn.1.q_b_proj.weight": "model-00021-of-00085.safetensors",
+ "model.layers.7.input_layernorm.0.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.input_layernorm.1.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlp.router.classifier.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlp.router.classifier.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlp.router.classifier.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlp.router.e_score_correction_bias": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlp.switch_mlp.down_proj.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlp.switch_mlp.down_proj.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlp.switch_mlp.down_proj.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlp.switch_mlp.gate_proj.biases": "model-00022-of-00085.safetensors",
+ "model.layers.7.mlp.switch_mlp.gate_proj.scales": "model-00022-of-00085.safetensors",
+ "model.layers.7.mlp.switch_mlp.gate_proj.weight": "model-00022-of-00085.safetensors",
+ "model.layers.7.mlp.switch_mlp.up_proj.biases": "model-00023-of-00085.safetensors",
+ "model.layers.7.mlp.switch_mlp.up_proj.scales": "model-00023-of-00085.safetensors",
+ "model.layers.7.mlp.switch_mlp.up_proj.weight": "model-00023-of-00085.safetensors",
+ "model.layers.7.mlps.0.down_proj.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlps.0.down_proj.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlps.0.down_proj.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlps.0.gate_proj.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlps.0.gate_proj.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlps.0.gate_proj.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlps.0.up_proj.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlps.0.up_proj.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlps.0.up_proj.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlps.1.down_proj.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlps.1.down_proj.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlps.1.down_proj.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlps.1.gate_proj.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlps.1.gate_proj.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlps.1.gate_proj.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlps.1.up_proj.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlps.1.up_proj.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.mlps.1.up_proj.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.post_attention_layernorm.0.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.post_attention_layernorm.1.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.0.kv_a_layernorm.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.0.kv_a_proj_with_mqa.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.0.kv_a_proj_with_mqa.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.0.kv_a_proj_with_mqa.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.0.kv_b_proj.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.0.kv_b_proj.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.0.kv_b_proj.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.0.o_proj.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.0.o_proj.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.0.o_proj.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.0.q_a_layernorm.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.0.q_a_proj.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.0.q_a_proj.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.0.q_a_proj.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.0.q_b_proj.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.0.q_b_proj.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.0.q_b_proj.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.1.kv_a_layernorm.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.1.kv_a_proj_with_mqa.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.1.kv_a_proj_with_mqa.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.1.kv_a_proj_with_mqa.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.1.kv_b_proj.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.1.kv_b_proj.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.1.kv_b_proj.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.1.o_proj.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.1.o_proj.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.1.o_proj.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.1.q_a_layernorm.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.1.q_a_proj.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.1.q_a_proj.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.1.q_a_proj.weight": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.1.q_b_proj.biases": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.1.q_b_proj.scales": "model-00024-of-00085.safetensors",
+ "model.layers.7.self_attn.1.q_b_proj.weight": "model-00024-of-00085.safetensors",
+ "model.layers.8.input_layernorm.0.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.input_layernorm.1.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlp.router.classifier.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlp.router.classifier.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlp.router.classifier.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlp.router.e_score_correction_bias": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlp.switch_mlp.down_proj.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlp.switch_mlp.down_proj.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlp.switch_mlp.down_proj.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlp.switch_mlp.gate_proj.biases": "model-00025-of-00085.safetensors",
+ "model.layers.8.mlp.switch_mlp.gate_proj.scales": "model-00025-of-00085.safetensors",
+ "model.layers.8.mlp.switch_mlp.gate_proj.weight": "model-00025-of-00085.safetensors",
+ "model.layers.8.mlp.switch_mlp.up_proj.biases": "model-00026-of-00085.safetensors",
+ "model.layers.8.mlp.switch_mlp.up_proj.scales": "model-00026-of-00085.safetensors",
+ "model.layers.8.mlp.switch_mlp.up_proj.weight": "model-00026-of-00085.safetensors",
+ "model.layers.8.mlps.0.down_proj.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlps.0.down_proj.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlps.0.down_proj.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlps.0.gate_proj.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlps.0.gate_proj.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlps.0.gate_proj.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlps.0.up_proj.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlps.0.up_proj.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlps.0.up_proj.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlps.1.down_proj.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlps.1.down_proj.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlps.1.down_proj.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlps.1.gate_proj.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlps.1.gate_proj.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlps.1.gate_proj.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlps.1.up_proj.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlps.1.up_proj.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.mlps.1.up_proj.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.post_attention_layernorm.0.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.post_attention_layernorm.1.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.0.kv_a_layernorm.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.0.kv_a_proj_with_mqa.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.0.kv_a_proj_with_mqa.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.0.kv_a_proj_with_mqa.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.0.kv_b_proj.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.0.kv_b_proj.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.0.kv_b_proj.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.0.o_proj.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.0.o_proj.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.0.o_proj.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.0.q_a_layernorm.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.0.q_a_proj.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.0.q_a_proj.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.0.q_a_proj.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.0.q_b_proj.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.0.q_b_proj.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.0.q_b_proj.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.1.kv_a_layernorm.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.1.kv_a_proj_with_mqa.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.1.kv_a_proj_with_mqa.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.1.kv_a_proj_with_mqa.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.1.kv_b_proj.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.1.kv_b_proj.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.1.kv_b_proj.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.1.o_proj.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.1.o_proj.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.1.o_proj.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.1.q_a_layernorm.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.1.q_a_proj.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.1.q_a_proj.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.1.q_a_proj.weight": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.1.q_b_proj.biases": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.1.q_b_proj.scales": "model-00027-of-00085.safetensors",
+ "model.layers.8.self_attn.1.q_b_proj.weight": "model-00027-of-00085.safetensors",
+ "model.layers.9.input_layernorm.0.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.input_layernorm.1.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlp.router.classifier.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlp.router.classifier.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlp.router.classifier.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlp.router.e_score_correction_bias": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlp.switch_mlp.down_proj.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlp.switch_mlp.down_proj.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlp.switch_mlp.down_proj.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlp.switch_mlp.gate_proj.biases": "model-00028-of-00085.safetensors",
+ "model.layers.9.mlp.switch_mlp.gate_proj.scales": "model-00028-of-00085.safetensors",
+ "model.layers.9.mlp.switch_mlp.gate_proj.weight": "model-00028-of-00085.safetensors",
+ "model.layers.9.mlp.switch_mlp.up_proj.biases": "model-00029-of-00085.safetensors",
+ "model.layers.9.mlp.switch_mlp.up_proj.scales": "model-00029-of-00085.safetensors",
+ "model.layers.9.mlp.switch_mlp.up_proj.weight": "model-00029-of-00085.safetensors",
+ "model.layers.9.mlps.0.down_proj.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlps.0.down_proj.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlps.0.down_proj.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlps.0.gate_proj.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlps.0.gate_proj.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlps.0.gate_proj.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlps.0.up_proj.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlps.0.up_proj.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlps.0.up_proj.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlps.1.down_proj.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlps.1.down_proj.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlps.1.down_proj.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlps.1.gate_proj.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlps.1.gate_proj.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlps.1.gate_proj.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlps.1.up_proj.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlps.1.up_proj.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.mlps.1.up_proj.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.post_attention_layernorm.0.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.post_attention_layernorm.1.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.0.kv_a_layernorm.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.0.kv_a_proj_with_mqa.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.0.kv_a_proj_with_mqa.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.0.kv_a_proj_with_mqa.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.0.kv_b_proj.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.0.kv_b_proj.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.0.kv_b_proj.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.0.o_proj.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.0.o_proj.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.0.o_proj.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.0.q_a_layernorm.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.0.q_a_proj.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.0.q_a_proj.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.0.q_a_proj.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.0.q_b_proj.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.0.q_b_proj.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.0.q_b_proj.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.1.kv_a_layernorm.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.1.kv_a_proj_with_mqa.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.1.kv_a_proj_with_mqa.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.1.kv_a_proj_with_mqa.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.1.kv_b_proj.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.1.kv_b_proj.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.1.kv_b_proj.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.1.o_proj.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.1.o_proj.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.1.o_proj.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.1.q_a_layernorm.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.1.q_a_proj.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.1.q_a_proj.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.1.q_a_proj.weight": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.1.q_b_proj.biases": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.1.q_b_proj.scales": "model-00030-of-00085.safetensors",
+ "model.layers.9.self_attn.1.q_b_proj.weight": "model-00030-of-00085.safetensors",
+ "model.norm.weight": "model-00084-of-00085.safetensors"
+ }
+}
\ No newline at end of file
diff --git a/modeling_longcat_flash.py b/modeling_longcat_flash.py
new file mode 100644
index 0000000000000000000000000000000000000000..a05a4a388bfc4d5751548b89c2809f9259f85d64
--- /dev/null
+++ b/modeling_longcat_flash.py
@@ -0,0 +1,648 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2025 Meituan
+# This code is licensed under the MIT License, for details, see the ./LICENSE file.
+
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.generation import GenerationMixin
+from transformers.integrations import use_kernel_forward_from_hub
+from transformers.masking_utils import create_causal_mask
+from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
+from transformers.modeling_layers import GradientCheckpointingLayer
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from transformers.processing_utils import Unpack
+from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple
+from transformers.utils.generic import check_model_inputs
+from .configuration_longcat_flash import LongcatFlashConfig
+
+
@use_kernel_forward_from_hub("RMSNorm")
class LongcatFlashRMSNorm(nn.Module):
    """Root-mean-square layer norm (T5-style): no mean subtraction, no bias."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        LongcatFlashRMSNorm is equivalent to T5LayerNorm.
        """
        super().__init__()
        # Learnable per-channel gain; the normalization itself is parameter-free.
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        original_dtype = hidden_states.dtype
        # Compute statistics in float32 for numerical stability, then cast back.
        upcast = hidden_states.to(torch.float32)
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalized = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalized.to(original_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
class LongcatFlashRotaryEmbedding(nn.Module):
    """Precomputes RoPE inverse frequencies and emits per-position (cos, sin) tables."""

    def __init__(self, config: LongcatFlashConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        # Frequency-initialization function for the configured RoPE variant.
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        # Non-persistent: recomputed from config instead of stored in checkpoints.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # Kept so dynamic RoPE variants can restore the unscaled frequencies.
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
    def forward(self, x, position_ids):
        # (batch, dim/2, 1) @ (batch, 1, seq) -> per-position rotation angles.
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            # Duplicate the angles so cos/sin cover the full head dimension.
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
class LongcatFlashMLP(nn.Module):
    """Gated (SwiGLU-style) feed-forward block: down(act(gate(x)) * up(x))."""

    def __init__(self, config, hidden_size=None, intermediate_size=None):
        super().__init__()
        self.config = config
        # Fall back to the model-wide sizes unless the caller overrides them
        # (MoE experts pass a smaller expert_ffn_hidden_size).
        self.hidden_size = hidden_size if hidden_size is not None else config.hidden_size
        self.intermediate_size = intermediate_size if intermediate_size is not None else config.ffn_hidden_size

        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(gated)
+
+
class LongcatFlashTopkRouter(nn.Module):
    """Top-k expert router with an additive score-correction bias.

    Scores are a float32 softmax over all experts (real experts plus optional
    "zero experts"). Top-k selection uses bias-corrected scores, while the
    returned weights come from the uncorrected softmax.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.top_k = config.moe_topk
        # Zero experts extend the routing space without adding expert parameters.
        zero_experts = config.zero_expert_num
        self.n_routed_experts = (
            config.n_routed_experts if zero_experts is None else config.n_routed_experts + zero_experts
        )
        self.routed_scaling_factor = config.routed_scaling_factor
        self.norm_topk_prob = config.norm_topk_prob
        self.router_bias = config.router_bias

        self.classifier = nn.Linear(config.hidden_size, self.n_routed_experts, bias=self.router_bias)
        # Correction bias is a buffer (not trained by the optimizer here).
        self.register_buffer("e_score_correction_bias", torch.zeros((self.n_routed_experts)))

    @torch.no_grad()
    def get_topk_indices(self, scores):
        # Selection (but not weighting) is shifted by the correction bias.
        corrected = scores.view(-1, self.n_routed_experts) + self.e_score_correction_bias.unsqueeze(0)
        return torch.topk(corrected, k=self.top_k, dim=-1, sorted=False)[1]

    def forward(self, hidden_states):
        flat_states = hidden_states.view(-1, self.config.hidden_size)
        # NOTE(review): the classifier's bias (when router_bias=True) is never
        # applied here — F.linear is called with the weight only; confirm intended.
        router_logits = F.linear(flat_states.type(torch.float32), self.classifier.weight.type(torch.float32))
        scores = router_logits.softmax(dim=-1)
        topk_indices = self.get_topk_indices(scores)
        topk_weights = scores.gather(1, topk_indices)
        if self.norm_topk_prob:
            # Renormalize the selected weights so they sum to one per token.
            topk_weights = topk_weights / (topk_weights.sum(dim=-1, keepdim=True) + 1e-20)
        return topk_indices, topk_weights * self.routed_scaling_factor
+
+
class LongcatFlashMoE(nn.Module):
    """
    Mixture-of-experts FFN: a top-k router dispatches each token to routed
    experts, with optional parameter-free "zero experts" appended after the
    real ones (identity pass-through when zero_expert_type == "identity").
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.experts = nn.ModuleList(
            [
                LongcatFlashMLP(config, intermediate_size=config.expert_ffn_hidden_size)
                for _ in range(config.n_routed_experts)
            ]
        )
        self.router = LongcatFlashTopkRouter(config)
        self.zero_expert_num = config.zero_expert_num
        self.zero_expert_type = config.zero_expert_type

    def moe(self, hidden_states: torch.Tensor, topk_indices: torch.Tensor, topk_weights: torch.Tensor):
        # Accumulate in the router-weight dtype (float32 — the router computes
        # weights in float32), then cast back to the input dtype at the end.
        final_hidden_states = torch.zeros_like(hidden_states, dtype=topk_weights.dtype)
        total_experts = len(self.experts) if self.zero_expert_num is None else len(self.experts) + self.zero_expert_num

        # expert_mask[e, token, slot] == 1 iff `token` routed its `slot`-th pick to expert `e`.
        expert_mask = torch.nn.functional.one_hot(topk_indices, num_classes=total_experts)
        expert_mask = expert_mask.permute(2, 0, 1)

        for expert_idx in range(total_experts):
            # Indices >= len(self.experts) refer to zero experts (no module).
            expert = self.experts[expert_idx] if expert_idx < len(self.experts) else None
            mask = expert_mask[expert_idx]
            token_indices, weight_indices = torch.where(mask)

            if token_indices.numel() > 0:
                expert_weights = topk_weights[token_indices, weight_indices]
                expert_input = hidden_states[token_indices]

                if self.zero_expert_num is None or expert_idx < len(self.experts):
                    expert_output = expert(expert_input)
                elif self.zero_expert_type == "identity":
                    # Zero expert: contributes the (weighted) input unchanged.
                    expert_output = expert_input
                else:
                    raise ValueError("Unknown condition")

                weighted_output = expert_output * expert_weights.unsqueeze(-1)
                # Scatter-add each expert's weighted output back to its tokens.
                final_hidden_states.index_add_(0, token_indices, weighted_output)

        return final_hidden_states.type(hidden_states.dtype)

    def forward(self, hidden_states):
        orig_shape = hidden_states.shape
        topk_indices, topk_weights = self.router(hidden_states)
        # Flatten (batch, seq, hidden) -> (tokens, hidden) for dispatch.
        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
        hidden_states = self.moe(hidden_states, topk_indices, topk_weights).view(*orig_shape)
        return hidden_states
+
+
def rotate_half(x):
    """Rotates half the hidden dims of the input: halves (a, b) become (-b, a)."""
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    return torch.cat((-second, first), dim=-1)
+
+
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat each key/value head n_rep times along the head axis — the equivalent
    of torch.repeat_interleave(x, dim=1, repeats=n_rep). Shape goes from
    (batch, num_key_value_heads, seqlen, head_dim) to
    (batch, num_key_value_heads * n_rep, seqlen, head_dim).
    """
    if n_rep == 1:
        return hidden_states
    batch, num_kv_heads, seq_len, head_dim = hidden_states.shape
    # expand creates a broadcast view; reshape materializes the repeated heads.
    expanded = hidden_states[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, seq_len, head_dim)
    return expanded.reshape(batch, num_kv_heads * n_rep, seq_len, head_dim)
+
+
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    """Reference (non-fused) scaled-dot-product attention with GQA head repetition."""
    # Broadcast KV heads up to the number of query heads.
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    scores = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        # Slice the mask to the key length to support cached decoding.
        scores = scores + attention_mask[:, :, :, : key_states.shape[-2]]

    # Softmax in float32 for stability, then cast back to the query dtype.
    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
    probs = nn.functional.dropout(probs, p=dropout, training=module.training)
    context = torch.matmul(probs, value_states)
    # (batch, heads, seq, dim) -> (batch, seq, heads, dim) for the output proj.
    context = context.transpose(1, 2).contiguous()

    return context, probs
+
+
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, use_mla=False):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension along which cos/sin are unsqueezed so they broadcast
            against q and k: use 1 for (batch, heads, seq, head_dim) tensors and
            2 for (batch, seq, heads, head_dim) tensors.
        use_mla (`bool`, *optional*, defaults to False):
            When True, de-interleave the rope dims of q and k (pairs -> evens
            then odds) before rotation, matching the MLA layout.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated
        using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    if use_mla:
        # Reorder interleaved rope dims so the half-rotation below applies.
        batch, heads, seq_len, dim = q.shape
        q = q.view(batch, heads, seq_len, dim // 2, 2).transpose(4, 3).reshape(batch, heads, seq_len, dim)

        batch, heads, seq_len, dim = k.shape
        k = k.view(batch, heads, seq_len, dim // 2, 2).transpose(4, 3).reshape(batch, heads, seq_len, dim)

    rotated_q = (q * cos) + (rotate_half(q) * sin)
    rotated_k = (k * cos) + (rotate_half(k) * sin)
    return rotated_q, rotated_k
+
+
class LongcatFlashMLA(nn.Module):
    """Multi-head Latent Attention (modified from Deepseek MLA).

    Queries and keys/values are projected through low-rank bottlenecks
    (q_lora_rank / kv_lora_rank); each head's dimension is split into a
    non-rotary part (qk_nope_head_dim) and a rotary part (qk_rope_head_dim)
    that carries RoPE. The rope key is produced once (MQA-style) and expanded
    across heads.
    """

    def __init__(self, config: LongcatFlashConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.attention_dropout = config.attention_dropout
        self.num_heads = config.num_attention_heads
        self.rope_theta = config.rope_theta
        self.q_lora_rank = config.q_lora_rank
        self.qk_rope_head_dim = config.qk_rope_head_dim
        self.kv_lora_rank = config.kv_lora_rank
        self.v_head_dim = config.v_head_dim
        self.qk_nope_head_dim = config.qk_nope_head_dim
        self.qk_head_dim = config.qk_head_dim

        self.is_causal = True
        if self.q_lora_rank is None:
            # Full-rank query projection when no query bottleneck is configured.
            self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.qk_head_dim, bias=False)
        else:
            self.q_a_proj = nn.Linear(config.hidden_size, config.q_lora_rank, bias=config.attention_bias)
            self.q_a_layernorm = LongcatFlashRMSNorm(config.q_lora_rank)
            self.q_b_proj = nn.Linear(config.q_lora_rank, self.num_heads * self.qk_head_dim, bias=False)

        # Joint projection producing the compressed KV latent plus the shared rope key.
        self.kv_a_proj_with_mqa = nn.Linear(
            config.hidden_size,
            self.kv_lora_rank + self.qk_rope_head_dim,
            bias=config.attention_bias,
        )
        self.kv_a_layernorm = LongcatFlashRMSNorm(self.kv_lora_rank)
        self.kv_b_proj = nn.Linear(
            self.kv_lora_rank,
            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
            bias=False,
        )

        self.o_proj = nn.Linear(
            self.num_heads * self.v_head_dim,
            config.hidden_size,
            bias=config.attention_bias,
        )

        # Fix: always define the scale attributes. forward() tests them against
        # None, so leaving them unset when the config flags are disabled would
        # raise AttributeError on the first forward pass.
        self.mla_scale_q_lora = (config.hidden_size / self.q_lora_rank) ** 0.5 if config.mla_scale_q_lora else None
        self.mla_scale_kv_lora = (config.hidden_size / self.kv_lora_rank) ** 0.5 if config.mla_scale_kv_lora else None
        self.scaling = self.qk_head_dim ** (-0.5)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        """Compute MLA attention; returns (attn_output, attn_weights)."""
        batch_size, seq_length = hidden_states.shape[:-1]
        query_shape = (batch_size, seq_length, -1, self.qk_head_dim)
        key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim)

        # Fix: honor the q_lora_rank=None configuration (direct q_proj path);
        # the original unconditionally used the low-rank projections, which
        # would fail when only q_proj was constructed in __init__.
        if self.q_lora_rank is None:
            q_states = self.q_proj(hidden_states).view(query_shape).transpose(1, 2)
        else:
            q_states = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))).view(query_shape).transpose(1, 2)
        q_pass, q_rot = torch.split(q_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)

        # apply q_lora scaling
        if self.mla_scale_q_lora is not None:
            q_pass = q_pass * self.mla_scale_q_lora
            q_rot = q_rot * self.mla_scale_q_lora

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        k_pass, k_rot = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
        k_pass = self.kv_a_layernorm(k_pass)

        # apply kv_lora scaling
        if self.mla_scale_kv_lora is not None:
            k_pass = k_pass * self.mla_scale_kv_lora

        k_pass = self.kv_b_proj(k_pass).view(key_shape).transpose(1, 2)
        k_pass, value_states = torch.split(k_pass, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)

        # The rope key is shared across heads (MQA-style) until expanded below.
        k_rot = k_rot.view(batch_size, 1, seq_length, self.qk_rope_head_dim)

        cos, sin = position_embeddings
        q_rot, k_rot = apply_rotary_pos_emb(q_rot, k_rot, cos, sin, use_mla=True)
        k_rot = k_rot.expand(*k_pass.shape[:-1], -1)

        query_states = torch.cat((q_pass, q_rot), dim=-1)
        key_states = torch.cat((k_pass, k_rot), dim=-1)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # flash_attention_2 needs equal K/V head dims; pad V and slice the
        # padding back off after the kernel runs.
        if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
            value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim])

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
            attn_output = attn_output[:, :, :, : self.v_head_dim]

        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights
+
+
def create_attention_block(class_name, *args, **kwargs):
    """Instantiate the attention implementation registered under `class_name`."""
    attention_mapping = {"MLA": LongcatFlashMLA}

    if class_name not in attention_mapping:
        raise ValueError(f"No class found for name: {class_name}")

    return attention_mapping[class_name](*args, **kwargs)
+
+
class LongcatFlashDecoderLayer(GradientCheckpointingLayer):
    """Shortcut-connected decoder layer: two attention + dense-FFN sub-blocks,
    with one shared MoE branch computed after the first sub-block's norm and
    added back after the second sub-block."""

    def __init__(self, config: LongcatFlashConfig, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        # Shared MoE branch (the "shortcut" expert path).
        self.mlp = LongcatFlashMoE(config)

        self_attn = []
        mlps = []
        input_layernorm = []
        post_attention_layernorm = []
        for i in range(2):
            # Each sub-block gets its own attention; cache slots are indexed
            # layer_idx * 2 + i so the two sub-blocks use distinct KV entries.
            self_attn.append(
                create_attention_block(config.attention_method, config=config, layer_idx=layer_idx * 2 + i)
            )
            mlps.append(LongcatFlashMLP(config))
            input_layernorm.append(LongcatFlashRMSNorm(config.hidden_size, eps=config.rms_norm_eps))
            post_attention_layernorm.append(LongcatFlashRMSNorm(config.hidden_size, eps=config.rms_norm_eps))

        self.self_attn = nn.ModuleList(self_attn)
        self.mlps = nn.ModuleList(mlps)
        self.input_layernorm = nn.ModuleList(input_layernorm)
        self.post_attention_layernorm = nn.ModuleList(post_attention_layernorm)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        # Two pre-norm residual sub-blocks per layer.
        for i in range(2):
            residual = hidden_states

            hidden_states = self.input_layernorm[i](hidden_states)

            hidden_states, _ = self.self_attn[i](
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )
            hidden_states = residual + hidden_states

            residual = hidden_states
            hidden_states = self.post_attention_layernorm[i](hidden_states)

            if i == 0:
                # MoE runs on the first sub-block's normalized activations and
                # is merged back only after the second sub-block completes.
                shortcut_mlp_output = self.mlp(hidden_states)  # shortcut output (MoE output)

            hidden_states = self.mlps[i](hidden_states)
            hidden_states = residual + hidden_states
            if i == 1:
                hidden_states = hidden_states + shortcut_mlp_output

        return hidden_states
+
+
@auto_docstring
class LongcatFlashPreTrainedModel(PreTrainedModel):
    """Base class wiring LongCat-Flash into the transformers loading/dispatch machinery."""

    config: LongcatFlashConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    # Keep a full decoder layer on one device when sharding with accelerate.
    _no_split_modules = ["LongcatFlashDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True
    _supports_attention_backend = True
    # Module classes whose outputs are collected for hidden_states/attentions.
    _can_record_outputs = {
        "hidden_states": LongcatFlashDecoderLayer,
        "attentions": LongcatFlashMLA,
    }
+
+
@auto_docstring
class LongcatFlashModel(LongcatFlashPreTrainedModel):
    """Bare LongCat-Flash decoder stack: embeddings -> decoder layers -> final RMSNorm."""

    # Checkpoint tensors under model.mtp.* (multi-token prediction) are ignored on load.
    _keys_to_ignore_on_load_unexpected = [r"model\.mtp.*"]

    def __init__(self, config: LongcatFlashConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [LongcatFlashDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = LongcatFlashRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = LongcatFlashRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        # Exactly one of input_ids / inputs_embeds must be provided.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            # Positions continue from however many tokens are already cached.
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position: torch.Tensor = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds
        # cos/sin tables are computed once and shared by every layer.
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )
+
+
@auto_docstring
class LongcatFlashForCausalLM(LongcatFlashPreTrainedModel, GenerationMixin):
    """LongCat-Flash decoder with a tied language-modeling head on top."""

    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
    # Checkpoint tensors under model.mtp.* (multi-token prediction) are ignored on load.
    _keys_to_ignore_on_load_unexpected = [r"model\.mtp.*"]

    def __init__(self, config):
        super().__init__(config)
        self.model = LongcatFlashModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongcatFlashForCausalLM

        >>> model = LongcatFlashForCausalLM.from_pretrained("meituan-longcat/LongCat-Flash-Chat")
        >>> tokenizer = AutoTokenizer.from_pretrained("meituan-longcat/LongCat-Flash-Chat")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
+
+
# Public export surface for `from modeling_longcat_flash import *`.
__all__ = ["LongcatFlashPreTrainedModel", "LongcatFlashModel", "LongcatFlashForCausalLM"]
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..365debacc452f0526c29fba6aca892fc55d17435
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..0823129c151ceba44d9e3942d8ad9617c93bcfb9
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,655525 @@
+{
+ "version": "1.0",
+ "truncation": null,
+ "padding": null,
+ "added_tokens": [
+ {
+ "id": 0,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 1,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 2,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 3,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 4,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 5,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 6,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 7,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 8,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 9,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 10,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 11,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 12,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 13,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 14,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 15,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 16,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 17,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 18,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 19,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 20,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 21,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 22,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 23,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 24,
+ "content": "<|image_placeholder|>",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 25,
+ "content": "<|url_placeholder|>",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 26,
+ "content": "<|hyperlink_placeholder|>",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 27,
+ "content": "<|table_placeholder|>",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 28,
+ "content": "<|equation_placeholder|>",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 29,
+ "content": "<|code_placeholder|>",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 30,
+ "content": "<|reference_placeholder|>",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 31,
+ "content": "<|endoftext|>",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 32,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 33,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 34,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 35,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 36,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": false
+ },
+ {
+ "id": 37,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": false
+ },
+ {
+ "id": 38,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": false
+ },
+ {
+ "id": 39,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": false
+ },
+ {
+ "id": 40,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": false
+ },
+ {
+ "id": 41,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": false
+ },
+ {
+ "id": 42,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": false
+ },
+ {
+ "id": 43,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": false
+ },
+ {
+ "id": 44,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 45,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 46,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 47,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 48,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 49,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 50,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 51,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 52,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 53,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 54,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 55,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 56,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 57,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 58,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 59,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 60,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 61,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 62,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 63,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 64,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 65,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 66,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 67,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 68,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 69,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 70,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 71,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 72,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 73,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 74,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 75,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 76,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 77,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 78,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 79,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 80,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 81,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 82,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 83,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 84,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 85,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 86,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 87,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 88,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 89,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 90,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 91,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 92,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 93,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 94,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 95,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 96,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 97,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 98,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 99,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 100,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 101,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 102,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 103,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 104,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 105,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 106,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 107,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 108,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 109,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 110,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 111,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 112,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 113,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 114,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 115,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 116,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 117,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 118,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 119,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 120,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 121,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 122,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 123,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 124,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 125,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 126,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 127,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 128,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 129,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 130,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 131,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 132,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 133,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 134,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 135,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 136,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 137,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 138,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 139,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 140,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 141,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 142,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 143,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 144,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 145,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 146,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 147,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 148,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 149,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 150,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 151,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 152,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 153,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 154,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 155,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 156,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 157,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 158,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 159,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 160,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 161,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 162,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 163,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 164,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 165,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 166,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 167,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 168,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 169,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 170,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 171,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 172,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 173,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 174,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 175,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 176,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 177,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 178,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 179,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 180,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 181,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 182,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 183,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 184,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 185,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 186,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 187,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 188,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 189,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 190,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 191,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 192,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 193,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 194,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 195,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 196,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 197,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 198,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 199,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 200,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 201,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 202,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 203,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 204,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 205,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 206,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 207,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 208,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 209,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 210,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 211,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 212,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 213,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 214,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 215,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 216,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 217,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 218,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 219,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 220,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 221,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 222,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ },
+ {
+ "id": 223,
+ "content": "",
+ "single_word": false,
+ "lstrip": false,
+ "rstrip": false,
+ "normalized": false,
+ "special": true
+ }
+ ],
+ "normalizer": null,
+ "pre_tokenizer": {
+ "type": "Sequence",
+ "pretokenizers": [
+ {
+ "type": "Split",
+ "pattern": {
+ "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
+ },
+ "behavior": "Isolated",
+ "invert": false
+ },
+ {
+ "type": "Split",
+ "pattern": {
+ "Regex": " ?[!-/:-~‘-‟ -。《》「」【】]+"
+ },
+ "behavior": "Isolated",
+ "invert": false
+ },
+ {
+ "type": "Split",
+ "pattern": {
+ "Regex": "[一-龥ࠀ-一가-]+"
+ },
+ "behavior": "Isolated",
+ "invert": false
+ },
+ {
+ "type": "ByteLevel",
+ "add_prefix_space": false,
+ "trim_offsets": true,
+ "use_regex": false
+ }
+ ]
+ },
+ "post_processor": {
+ "type": "ByteLevel",
+ "add_prefix_space": true,
+ "trim_offsets": false,
+ "use_regex": true
+ },
+ "decoder": {
+ "type": "ByteLevel",
+ "add_prefix_space": true,
+ "trim_offsets": true,
+ "use_regex": true
+ },
+ "model": {
+ "type": "BPE",
+ "dropout": null,
+ "unk_token": null,
+ "continuing_subword_prefix": null,
+ "end_of_word_suffix": null,
+ "fuse_unk": false,
+ "byte_fallback": false,
+ "ignore_merges": false,
+ "vocab": {
+ "": 0,
+ "": 1,
+ "": 2,
+ "": 3,
+ "": 4,
+ "": 5,
+ "": 6,
+ "": 7,
+ "": 8,
+ "": 9,
+ "": 10,
+ "": 11,
+ "": 12,
+ "": 13,
+ "": 14,
+ "": 15,
+ "": 16,
+ "": 17,
+ "": 18,
+ "": 19,
+ "": 20,
+ "": 21,
+ "": 22,
+ "": 23,
+ "<|image_placeholder|>": 24,
+ "<|url_placeholder|>": 25,
+ "<|hyperlink_placeholder|>": 26,
+ "<|table_placeholder|>": 27,
+ "<|equation_placeholder|>": 28,
+ "<|code_placeholder|>": 29,
+ "<|reference_placeholder|>": 30,
+ "<|endoftext|>": 31,
+ "": 32,
+ "": 33,
+ "": 34,
+ "": 35,
+ "": 36,
+ "": 37,
+ "": 38,
+ "": 39,
+ "": 40,
+ "": 41,
+ "": 42,
+ "": 43,
+ "": 44,
+ "": 45,
+ "": 46,
+ "": 47,
+ "": 48,
+ "": 49,
+ "": 50,
+ "": 51,
+ "": 52,
+ "": 53,
+ "": 54,
+ "": 55,
+ "": 56,
+ "": 57,
+ "": 58,
+ "": 59,
+ "": 60,
+ "": 61,
+ "": 62,
+ "": 63,
+ "": 64,
+ "": 65,
+ "