diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..52373fe24473b1aa44333d318f578ae6bf04b49b 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/chat_template.jinja b/chat_template.jinja
new file mode 100644
index 0000000000000000000000000000000000000000..7b86dfc5818f72a8dfd6b1bac4c8964fbf8728ec
--- /dev/null
+++ b/chat_template.jinja
@@ -0,0 +1,98 @@
+{#- NOTE(review): the <role>, <tools>, <tool_call>, <tool_response> and <think> literals in this template were restored from context; confirm them against the tokenizer vocabulary before release. -#}
+{#- Leading SYSTEM turn: emitted when the first message is a non-empty system message, or when tools are supplied. -#}
+{%- if messages[0].role == 'system' %}
+    {%- if messages[0].content != '' %}
+        {{- '<role>SYSTEM</role>\n' + messages[0].content + '\n\n' }}
+    {%- elif tools %}
+        {{- '<role>SYSTEM</role>\n' }}
+    {%- endif %}
+{%- elif tools %}
+    {{- '<role>SYSTEM</role>\n' }}
+{%- endif %}
+{%- if tools %}
+    {{- "# Tools\nYou may call one or more functions to assist with the user query.\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>\n\n" }}
+{%- endif %}
+{#- Walk the conversation backwards to find the last user message that is not a wrapped tool response; its position is kept in ns.last_query_index. -#}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {#- `content` is the message text, or '' when message.content is not a string; the assistant and tool branches below use it. -#}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if message.role == "user" %}
+        {{- '<role>HUMAN</role>\n' + message.content + '<|role_end|>\n\n' }}
+    {%- elif message.role == "system" and not loop.first %}
+        {{- '<role>SYSTEM</role>\n' + message.content + '<|role_end|>\n\n' }}
+    {%- elif message.role == "assistant" %}
+        {#- Reasoning comes from message.reasoning_content when that is a string; otherwise it is split out of `content` around the think tags. -#}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- if message.reasoning_content !='' %}
+                {%- set reasoning_content = message.reasoning_content %}
+            {%- endif %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {#- Reasoning is re-emitted only for assistant turns after the last user query; earlier turns render the answer text alone. -#}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if reasoning_content != '' %}
+                {{- '<role>ASSISTANT</role>\n' + '<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<role>ASSISTANT</role>\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<role>ASSISTANT</role>\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|role_end|>\n\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Consecutive tool messages share one OBSERVATION turn: the header is written before the first and the role end after the last. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<role>OBSERVATION</role>' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|role_end|>\n\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<role>ASSISTANT</role>' }}
+{%- endif %}
diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..65fd0f1fb06d3ee8df4e8b196ed8e2560600b5cd
--- /dev/null
+++ b/config.json
@@ -0,0 +1,116 @@
+{
+ "architectures": [
+ "BailingMoeV2_5ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "auto_map": {
+ "AutoConfig": "configuration_bailing_moe_v2_5.BailingMoeV2_5Config",
+ "AutoModel": "modeling_bailing_moe_v2_5.BailingMoeV2_5Model",
+ "AutoModelForCausalLM": "modeling_bailing_moe_v2_5.BailingMoeV2_5ForCausalLM"
+ },
+ "embedding_dropout": 0.0,
+ "eos_token_id": 156892,
+ "first_k_dense_replace": 4,
+ "group_norm_size": 8,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 8192,
+ "initializer_range": 0.02,
+ "intermediate_size": 18432,
+ "kv_lora_rank": 512,
+ "layer_group_size": 8,
+ "linear_silu": false,
+ "max_position_embeddings": 131072,
+ "max_window_layers": 20,
+ "model_type": "bailing_hybrid",
+ "moe_intermediate_size": 2048,
+ "moe_router_enable_expert_bias": true,
+ "moe_shared_expert_intermediate_size": 2048,
+ "mtp_loss_scaling_factor": 0,
+ "n_group": 8,
+ "num_attention_heads": 64,
+ "num_experts": 256,
+ "num_experts_per_tok": 8,
+ "num_hidden_layers": 80,
+ "num_key_value_heads": 64,
+ "num_kv_heads_for_linear_attn": 64,
+ "num_nextn_predict_layers": 0,
+ "num_shared_experts": 1,
+ "output_dropout": 0.0,
+ "output_router_logits": false,
+ "pad_token_id": 156892,
+ "partial_rotary_factor": 0.5,
+ "q_lora_rank": 1536,
+ "qk_head_dim": 192,
+ "qk_nope_head_dim": 128,
+ "qk_rope_head_dim": 64,
+ "quantization_config": {
+ "config_groups": {
+ "FP8_DYNAMIC": {
+ "format": "float-quantized",
+ "input_activations": {
+ "actorder": null,
+ "block_structure": null,
+ "dynamic": true,
+ "group_size": null,
+ "num_bits": 8,
+ "observer": null,
+ "observer_kwargs": {},
+ "strategy": "token",
+ "symmetric": true,
+ "type": "float"
+ },
+ "output_activations": null,
+ "targets": [
+ "Linear"
+ ],
+ "weights": {
+ "actorder": null,
+ "block_structure": null,
+ "dynamic": false,
+ "group_size": null,
+ "num_bits": 8,
+ "observer": "static_minmax",
+ "observer_kwargs": {},
+ "strategy": "channel",
+ "symmetric": true,
+ "type": "float"
+ }
+ }
+ },
+ "format": "float-quantized",
+ "global_compression_ratio": null,
+ "ignore": [
+ "re:.*mlp.gate$",
+ "lm_head",
+ "model.word_embeddings"
+ ],
+ "kv_cache_scheme": null,
+ "quant_method": "compressed-tensors",
+ "quantization_status": "compressed",
+ "sparsity_config": {},
+ "transform_config": {},
+ "version": "0.13.0"
+ },
+ "rms_norm_eps": 1e-06,
+ "rope_interleave": true,
+ "rope_scaling": null,
+ "rope_theta": 6000000,
+ "rotary_dim": 64,
+ "routed_scaling_factor": 2.5,
+ "router_dtype": "fp32",
+ "score_function": "sigmoid",
+ "scoring_func": "sigmoid",
+ "seq_aux": true,
+ "tie_word_embeddings": false,
+ "topk_group": 4,
+ "topk_method": "noaux_tc",
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.56.2",
+ "use_bias": false,
+ "use_cache": true,
+ "use_qk_norm": true,
+ "use_qkv_bias": false,
+ "v_head_dim": 128,
+ "vocab_size": 157184
+}
\ No newline at end of file
diff --git a/configuration_bailing_moe_v2_5.py b/configuration_bailing_moe_v2_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdad8862dd020e89fb5d4a08a76186314f18d3d0
--- /dev/null
+++ b/configuration_bailing_moe_v2_5.py
@@ -0,0 +1,135 @@
+"""Bailing MoE V2.5 model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class BailingMoeV2_5Config(PretrainedConfig):
+    """Configuration for the Bailing MoE V2.5 model.
+
+    Most constructor arguments are stored unchanged on the instance under the
+    same name. Two attributes are computed: ``head_dim`` falls back to
+    ``hidden_size // num_attention_heads`` when the argument is falsy, and
+    ``qk_head_dim`` is ``qk_nope_head_dim + qk_rope_head_dim``.
+    ``pad_token_id``, ``eos_token_id``, ``tie_word_embeddings`` and any extra
+    keyword arguments are passed on to ``PretrainedConfig.__init__``.
+    """
+
+    # Same value as the "model_type" entry of the accompanying config.json, so
+    # the class and the file it is loaded from agree on the model type.
+    model_type = "bailing_hybrid"
+
+    def __init__(
+        self,
+        vocab_size=157184,
+        hidden_size=2048,
+        intermediate_size=5120,
+        num_hidden_layers=20,
+        num_attention_heads=16,
+        num_key_value_heads=4,
+        hidden_act="silu",
+        use_qkv_bias=False,  # bailing only
+        use_bias=False,  # bailing only
+        rms_norm_eps=1e-06,
+        tie_word_embeddings=False,  # PretrainedConfig key; the default value is overridden here.
+        embedding_dropout=0.0,
+        attention_dropout=0.0,
+        output_dropout=0.0,
+        initializer_range=0.02,
+        max_position_embeddings=32768,
+        rope_theta=600000.0,
+        use_cache=True,
+        max_window_layers=20,
+        rope_scaling=None,
+        pad_token_id=156892,
+        eos_token_id=156892,
+        num_experts=256,
+        num_shared_experts=1,
+        num_experts_per_tok=8,
+        n_group=8,
+        topk_group=4,
+        moe_intermediate_size=512,
+        first_k_dense_replace=1,
+        head_dim=128,
+        output_router_logits=False,
+        use_qk_norm=True,
+        num_nextn_predict_layers=0,
+        mtp_loss_scaling_factor=0,
+        moe_router_enable_expert_bias=True,
+        routed_scaling_factor=1.0,
+        layer_group_size=5,
+        group_norm_size=4,
+        linear_silu=False,
+        kv_lora_rank=512,
+        q_lora_rank=None,
+        qk_rope_head_dim=64,
+        v_head_dim=128,
+        qk_nope_head_dim=128,
+        rope_interleave=True,
+        partial_rotary_factor=0.5,
+        score_function="sigmoid",
+        scoring_func="sigmoid",
+        seq_aux=True,
+        topk_method="noaux_tc",
+        router_dtype="fp32",
+        **kwargs,
+    ):
+        self.num_hidden_layers = num_hidden_layers
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.use_qkv_bias = use_qkv_bias
+        self.use_bias = use_bias
+        self.rms_norm_eps = rms_norm_eps
+        self.embedding_dropout = embedding_dropout
+        self.attention_dropout = attention_dropout
+        self.output_dropout = output_dropout
+        self.num_nextn_predict_layers = num_nextn_predict_layers
+        self.mtp_loss_scaling_factor = mtp_loss_scaling_factor
+        self.initializer_range = initializer_range
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_theta = rope_theta
+        self.use_cache = use_cache
+        self.max_window_layers = max_window_layers
+        # A falsy head_dim (None or 0) falls back to hidden_size split evenly across the attention heads.
+        self.head_dim = head_dim or self.hidden_size // self.num_attention_heads
+        self.rope_scaling = rope_scaling
+        self.use_qk_norm = use_qk_norm
+        self.moe_router_enable_expert_bias = moe_router_enable_expert_bias
+        self.routed_scaling_factor = routed_scaling_factor
+
+        # MoE configs
+        self.num_experts = num_experts
+        self.num_shared_experts = num_shared_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.moe_intermediate_size = moe_intermediate_size
+        self.first_k_dense_replace = first_k_dense_replace
+        self.output_router_logits = output_router_logits
+
+        # Linear configs
+        self.layer_group_size = layer_group_size
+        self.group_norm_size = group_norm_size
+        self.linear_silu = linear_silu
+        # mla
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+
+        self.score_function = score_function
+        self.scoring_func = scoring_func
+        self.seq_aux = seq_aux
+        self.topk_method = topk_method
+        self.v_head_dim = v_head_dim
+        self.qk_nope_head_dim = qk_nope_head_dim
+        # Computed from its two components rather than accepted as an argument.
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.rope_interleave = rope_interleave
+        self.router_dtype = router_dtype
+        self.partial_rotary_factor = partial_rotary_factor
+        super().__init__(
+            pad_token_id=pad_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
+        )
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a7b4625d03f7764803aa7a36169554286ff2aa67
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,7 @@
+{
+ "bos_token_id": 156891,
+ "eos_token_id": [
+ 156895
+ ],
+ "pad_token_id": 156892
+}
diff --git a/model-00005-of-00160.safetensors b/model-00005-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..50441193ade87b2d2d9597a7da26ce65d32890b3
--- /dev/null
+++ b/model-00005-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d39cd3a0fb0770f48ff4db5c988948b368e73cfa6c4960a8e78ac9b62933c0a
+size 3222846072
diff --git a/model-00007-of-00160.safetensors b/model-00007-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..42bfadf5eb594c3cc00578dc84bc5eb444c8f665
--- /dev/null
+++ b/model-00007-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7cfae59a6508f0a8caea39a73e8ed76a18733fbaf42f453ec4802324950fcc13
+size 3222846072
diff --git a/model-00008-of-00160.safetensors b/model-00008-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3f1e8027890faf4ed2eeac8c670f24f1af49ec00
--- /dev/null
+++ b/model-00008-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a2e9e62b56f0d18a063ca56e8af4d2e1bdc159784ef09f8744b1c7ebbeafb01a
+size 7782723480
diff --git a/model-00021-of-00160.safetensors b/model-00021-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..aefb3b1674213ab39b18f438d107767677fdbed3
--- /dev/null
+++ b/model-00021-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10940e895d2abc27ab29a2dc385feb52a8b5bb9e75e385254b73b27a5f65dd1a
+size 6445693304
diff --git a/model-00026-of-00160.safetensors b/model-00026-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3ab1a17bd3334eaeb95d0351a3cdca75fff99b88
--- /dev/null
+++ b/model-00026-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d2f767b47708e477468db23e3f927a0e560d0ae6fa446a81dbdc03794f0b7f6
+size 6445692536
diff --git a/model-00028-of-00160.safetensors b/model-00028-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..dd872574673750886f3325204e8a3bbfc86493ca
--- /dev/null
+++ b/model-00028-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7bc80a508a028915aaed252321c37a9028d061be9c2e1f1347dfbb3b0e2efb31
+size 6445693304
diff --git a/model-00034-of-00160.safetensors b/model-00034-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..48e469e156ef37e2da379fef9ab1bf0a71d598a6
--- /dev/null
+++ b/model-00034-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1caa3fc796bb7c198baf8ea322a52e1341bb3610cb837440175cbeb0da8d3dd3
+size 6445692536
diff --git a/model-00038-of-00160.safetensors b/model-00038-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4864cc18a83776c6ff908ed219aee9ab586449dc
--- /dev/null
+++ b/model-00038-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ec1ea895a35c1acd00cf51379519857aabf627d56ad356b46f7a7d01e717e4f
+size 6445693304
diff --git a/model-00042-of-00160.safetensors b/model-00042-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..0fc291ab684091948d562c58d6d5855ac73b30b2
--- /dev/null
+++ b/model-00042-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b28157c1d6ca687c789d7de7d34d28a07a793377969b8b097478b9ee2743547
+size 6445692536
diff --git a/model-00058-of-00160.safetensors b/model-00058-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ad7d7f70e69e832124247c4d525abca462f09c81
--- /dev/null
+++ b/model-00058-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ecdd3054a2f1b4da2bc116403ec00884a7df12b26a5fd3f8d133c04f36ddc0f0
+size 6445692536
diff --git a/model-00060-of-00160.safetensors b/model-00060-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..fbf9872d40575c78cfc9725ae3c41a5e9395c4cd
--- /dev/null
+++ b/model-00060-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea65650b2c2fb2a82c9c40d8dbbd8a90d838c5cb1a1067e637e6b5756bf00fd7
+size 6445693304
diff --git a/model-00067-of-00160.safetensors b/model-00067-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..41d615fdd095808650b272ea8f52b57fbd5a8272
--- /dev/null
+++ b/model-00067-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92f1c288f4680eef6e1204b556f8dc462b6ab31acc528a25c375f2221b86fec4
+size 6445693208
diff --git a/model-00075-of-00160.safetensors b/model-00075-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d8e805d5eca3bbef04aafa9fa67a47117bd9c047
--- /dev/null
+++ b/model-00075-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:371f087c6100873a58c9a5d58a05b9ec75d0a783b498b80e3535786957eff987
+size 6445693208
diff --git a/model-00076-of-00160.safetensors b/model-00076-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..630e5dcc64153f8a980d3a62f371ce2e0bb52cd0
--- /dev/null
+++ b/model-00076-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ed09860c9f696e3e4687b879fc71e7f185a3d4cdc46df4922e5d9c7df01985a
+size 6445693304
diff --git a/model-00080-of-00160.safetensors b/model-00080-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b9dbfc8f593cb6bc33d29d0bd1340b64edcadb71
--- /dev/null
+++ b/model-00080-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83e763627c9246187528523ca0e66095fc5aca8399e87f7486beafdf5a87c02e
+size 8006611688
diff --git a/model-00082-of-00160.safetensors b/model-00082-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..38254955d6533df7be842f149d503ddd08f092b0
--- /dev/null
+++ b/model-00082-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28804cda65aace4020370c58c40d1a74ed33063d926c039fe420caec81ad539b
+size 6445692536
diff --git a/model-00084-of-00160.safetensors b/model-00084-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..486f2d891c8b5ca7296c7a5f0cc50f448506e080
--- /dev/null
+++ b/model-00084-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:591f2715213c242469f895e5d7fbc48778ac25c3d70aa7045ca54c30e4d2f1a4
+size 6445693304
diff --git a/model-00085-of-00160.safetensors b/model-00085-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8ea79365b84a13058d05585d08cc31aaa71df834
--- /dev/null
+++ b/model-00085-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97342c501c2debf215d8a19572a16ca3ee72c932f547b38c848552c01f06d9e5
+size 6445693304
diff --git a/model-00092-of-00160.safetensors b/model-00092-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c1a274a0e9673be4cb413bf47eda78c8cedfde08
--- /dev/null
+++ b/model-00092-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7ab102571fe6971c2316b6649e82335a8b1bba18f8b8ce37a9382850789eb30
+size 6445693304
diff --git a/model-00097-of-00160.safetensors b/model-00097-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..907d5e9b9008f7fd777bd24bb876bef080c6a482
--- /dev/null
+++ b/model-00097-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:533e6c2493cba48765847610eeb81d5797e85756c66222676d553250f910cd35
+size 6445692536
diff --git a/model-00098-of-00160.safetensors b/model-00098-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..cf1e72a2e5fed9d50447d1624f8ce08ce6747d00
--- /dev/null
+++ b/model-00098-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3877304be86bbb8ab02699a25505c2d41f37677852f313883239191f1d387f15
+size 6445692536
diff --git a/model-00105-of-00160.safetensors b/model-00105-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..1d3f8faa6aa5fd1dc33198db83acb358e1db2b8e
--- /dev/null
+++ b/model-00105-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a216635fd5af018cf7be931ce488d5ea013ed7a2700771f5ef9185de99dc3cdb
+size 6445692536
diff --git a/model-00108-of-00160.safetensors b/model-00108-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..d61496691366bf818c2edfe12dde46120f5ceda8
--- /dev/null
+++ b/model-00108-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0f6c1b34b703cb0a5832dd8f847b87fc8335d4e7791f0fae6c72a57677162a9
+size 6445693304
diff --git a/model-00113-of-00160.safetensors b/model-00113-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c3a2281da8cc3b9c7d8b9dac4009797874571dc9
--- /dev/null
+++ b/model-00113-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98d25ae89d30c32b9c3048af66cefd77d03e0b6e5480292279a196663df70642
+size 6445692536
diff --git a/model-00115-of-00160.safetensors b/model-00115-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e8c9472fc8dc97eb1abc04b02f76759023734581
--- /dev/null
+++ b/model-00115-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39689310d9f61af4f6f69051afb3fb44476339fd05515973637faca748b861d0
+size 6445693208
diff --git a/model-00119-of-00160.safetensors b/model-00119-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..85177f65c5cea986ef34f70fbd9d49e43133ab03
--- /dev/null
+++ b/model-00119-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b273d8ad701e874ed21884a605ef964a318a1c35cf51f2a5a1536369dd21deaf
+size 6445693304
diff --git a/model-00120-of-00160.safetensors b/model-00120-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..38096b927f17251645aede823996cbbdf4593861
--- /dev/null
+++ b/model-00120-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7a4aa14dc7e3837b0fa62fd9e1b84fb77ab00f8b33bf4e6d4e96fa5793c0c22
+size 7782724328
diff --git a/model-00122-of-00160.safetensors b/model-00122-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..fe9708e50ec156240b78564ab9c5aa4febb52b3f
--- /dev/null
+++ b/model-00122-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:47d67ae7c9cf2c2ee923cd754419a2a2d7f377e8e783b09deab2583622150944
+size 6445692536
diff --git a/model-00125-of-00160.safetensors b/model-00125-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..98289e03dbe36db85204a23823492f21bff2696f
--- /dev/null
+++ b/model-00125-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5ff95e3c317b81c61c7aa43d598de0c992ddad63e9da18915ae5c8a8185c0e1
+size 6445693304
diff --git a/model-00130-of-00160.safetensors b/model-00130-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6ffb176a25f0739b66ae5a93e2fca1ac8c9bf711
--- /dev/null
+++ b/model-00130-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97aecca9f4cb3e927f7d5ea22ad0b982c42d8027614d56cc2cf08bf2c295dc7f
+size 6445692536
diff --git a/model-00131-of-00160.safetensors b/model-00131-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a3208690bacf7dbc4b3621e7b49cab080b3e9385
--- /dev/null
+++ b/model-00131-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2835a770c72a9c3d97e59306fd3b0c1d4af6172093ee6ee7c6e6b09b5352c5f
+size 6445693208
diff --git a/model-00132-of-00160.safetensors b/model-00132-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..778d944a340802c79dce2ce658941756ca1ef4bc
--- /dev/null
+++ b/model-00132-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b50533c0bc0d16a03f547c282242bb8b862501d93c95415ec5e27178ec167281
+size 6445693304
diff --git a/model-00134-of-00160.safetensors b/model-00134-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..10865547598f123dd3c674d2133eb026ea0f2631
--- /dev/null
+++ b/model-00134-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c322e5ae3d2161998220747b66759ad67586a62d3dd70565466bf4ddacd3669a
+size 6445693304
diff --git a/model-00137-of-00160.safetensors b/model-00137-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..afee6ae38de18eb6aa44f6a159d9e0e9e8c308b4
--- /dev/null
+++ b/model-00137-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3109cd8ec4995d6fdf3c34a0cecbd1e4c4dc6935aacf9fa79fb8be80761c952a
+size 6445692536
diff --git a/model-00139-of-00160.safetensors b/model-00139-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..8b49f29fd45f98a5c4c346cfae581273f1df6fab
--- /dev/null
+++ b/model-00139-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c76b362b053f9c828501041f84da08c71659ab33526d08cb82eabdbf9e3248ff
+size 6445693208
diff --git a/model-00142-of-00160.safetensors b/model-00142-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..2822d1ed78d4fd395eeced55a57d855ede14d71a
--- /dev/null
+++ b/model-00142-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2991b749715ac19912caea5d084bc226b83247e95e611e0a49eb03217e3fda18
+size 6445693304
diff --git a/model-00143-of-00160.safetensors b/model-00143-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..34208f08c5b6a02f2da894d9b2f5b5b8e4596b63
--- /dev/null
+++ b/model-00143-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1bef39884a61d1d7555a1f012a4a99381fa0685cf26960e93cc267e8d05fbe08
+size 6445693304
diff --git a/model-00145-of-00160.safetensors b/model-00145-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..c539d4372f9a7e7d53d0c85869961a471b4d2812
--- /dev/null
+++ b/model-00145-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:baf52f2f5fc766501b4a20e9b5e272e7f52f4d27e1b486015923969f8d1a3142
+size 6445692536
diff --git a/model-00146-of-00160.safetensors b/model-00146-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..df6aadf8b71c2c6beb118f808a7526cee32d5b7c
--- /dev/null
+++ b/model-00146-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f270ba9dc96d0d96ac42f314ba94d9f114431e852a9d27a647f5ca49d404a3f0
+size 6445692536
diff --git a/model-00152-of-00160.safetensors b/model-00152-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..6810cddce55b2819b56950b96ce9140f91dae524
--- /dev/null
+++ b/model-00152-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09a9e96f8f43c7865b830ee158a19744ccb3e10995dd25c92046eb179ae3532d
+size 6354738944
diff --git a/model-00157-of-00160.safetensors b/model-00157-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..41c51244e3cf0416bcc1b45151e18389f05c8cdf
--- /dev/null
+++ b/model-00157-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc934686b32eb8f1ab86e2091e3204ab5321dca1a4a5138811d08a0a8e6769fe
+size 3222846456
diff --git a/model-00158-of-00160.safetensors b/model-00158-of-00160.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..a37fe4792d1430ebb458d37e252dbb4c1ec68c49
--- /dev/null
+++ b/model-00158-of-00160.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b4bc5cfc410e8a77a39956a3ddd451c2cf5b4a42cca3fdd8a4e6749aa5a39a3
+size 3222846456
diff --git a/modeling_bailing_moe_v2_5.py b/modeling_bailing_moe_v2_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..57d714e326a5779358a1fefc8e938e3d790601a6
--- /dev/null
+++ b/modeling_bailing_moe_v2_5.py
@@ -0,0 +1,1603 @@
+# coding=utf-8
+# Copyright 2025 Antgroup and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch BailingMoE model."""
+
+import math
+import warnings
+from typing import List, Optional, Tuple, Union, Callable
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_attn_mask_utils import (
+ AttentionMaskConverter,
+ _prepare_4d_attention_mask,
+ _prepare_4d_causal_attention_mask,
+ _prepare_4d_causal_attention_mask_for_sdpa,
+)
+from transformers.modeling_outputs import MoeModelOutputWithPast
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from transformers.modeling_utils import PreTrainedModel
+from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
+from transformers.utils import (
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+from transformers.utils.import_utils import is_torch_fx_available
+from .configuration_bailing_moe_v2_5 import BailingMoeV2_5Config
+from transformers.generation.utils import GenerationMixin
+from dataclasses import dataclass
+from transformers.utils import ModelOutput
+from transformers import DynamicLayer
+from transformers.processing_utils import Unpack
+from transformers.utils import TransformersKwargs
+from transformers.utils.deprecation import deprecate_kwarg
+from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
+
+from fla.ops.simple_gla.fused_recurrent import fused_recurrent_simple_gla
+from fla.ops.simple_gla.chunk import chunk_simple_gla
+
+
+# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
+# It means that the function will not be traced through and simply appear as a node in the graph.
+# The wrapping is only attempted when `is_torch_fx_available()` reports that torch.fx can be used.
+# NOTE(review): the `is_torch_greater_or_equal_than_1_13` check presumably only guards the explicit
+# `import torch.fx` needed on older torch releases — confirm against the upstream template.
+if is_torch_fx_available():
+ if not is_torch_greater_or_equal_than_1_13:
+ import torch.fx
+
+ _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
+
+
+# Module-level logger, named after this module.
+logger = logging.get_logger(__name__)
+
+# Name of the configuration class as a string.
+# NOTE(review): presumably consumed by the docstring decorators imported above — confirm.
+_CONFIG_FOR_DOC = "BailingMoeV2_5Config"
+
+
+def roll_tensor(tensor, shifts=-1, dims=-1, fill_value=0):
+    """Shift `tensor` along one dimension and overwrite the wrapped-around slots.
+
+    `torch.roll` is circular: elements pushed off one end re-enter at the other. This helper
+    replaces those re-entered elements with `fill_value`, turning the roll into a plain shift.
+
+    Args:
+        tensor (`torch.Tensor`): Input tensor; it is not modified.
+        shifts (`int`, defaults to -1): Number of places to shift. Negative values shift towards
+            index 0 (vacating the tail), positive values shift towards the end (vacating the head).
+        dims (`int`, defaults to -1): Dimension along which to shift.
+        fill_value (scalar, defaults to 0): Value written into the vacated slots.
+
+    Returns:
+        `tuple(torch.Tensor, torch.Tensor)`: the shifted tensor and the sum of all its elements.
+    """
+    rolled_tensor = torch.roll(tensor, shifts=shifts, dims=dims)
+    dim_size = rolled_tensor.size(dims)
+    # Every position that received a wrapped-around element must be blanked: the last `|shifts|`
+    # positions for a negative shift, the first `shifts` positions for a positive one. Shifting by
+    # the full dimension size or more leaves nothing of the original in place.
+    vacated = min(abs(shifts), dim_size)
+    if vacated > 0:
+        start = 0 if shifts > 0 else dim_size - vacated
+        rolled_tensor.narrow(dims, start, vacated).fill_(fill_value)
+    return rolled_tensor, rolled_tensor.sum()
+
+
+@dataclass
+class MoEV2_5CausalLMOutputWithPast(ModelOutput):
+ """
+ Base class for causal language model (or autoregressive) outputs as well as Mixture of Expert's router hidden
+ states terms, to train a MoE model.
+ Args:
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Language modeling loss (for next-token prediction).
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+ past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+ `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ z_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
+ z_loss for the sparse modules.
+ aux_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
+ aux_loss for the sparse modules.
+ router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
+ Router logits of the encoder model, useful to compute the auxiliary loss and the z_loss for the sparse
+ modules.
+ mtp_loss (`torch.FloatTensor`, *optional*):
+ NOTE(review): undocumented in the original; presumably the loss of the multi-token-prediction (MTP)
+ head(s) — confirm against the code that populates this field.
+ mtp_logits (`tuple(torch.FloatTensor)`, *optional*):
+ NOTE(review): undocumented in the original; presumably the logits produced by the MTP head(s) —
+ confirm against the code that populates this field.
+ """
+
+ # Every field defaults to None, so an instance can be built with any subset of them.
+ loss: Optional[torch.FloatTensor] = None
+ logits: Optional[torch.FloatTensor] = None
+ past_key_values: Optional[Cache] = None
+ hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+ attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+ z_loss: Optional[torch.FloatTensor] = None
+ aux_loss: Optional[torch.FloatTensor] = None
+ router_logits: Optional[tuple[torch.FloatTensor]] = None
+ mtp_loss: Optional[torch.FloatTensor] = None
+ mtp_logits: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+class MoeV2_5ModelOutputWithPast(MoeModelOutputWithPast):
+ """`MoeModelOutputWithPast` carrying one extra attribute, `mtp_hidden_states`.
+
+ NOTE(review): `mtp` presumably stands for multi-token prediction — confirm against the model code
+ that fills this attribute.
+ """
+
+ def __init__(self, mtp_hidden_states=None, **kwargs):
+ # All other fields are forwarded to the parent constructor as keyword arguments; the extra
+ # value is then attached as a plain instance attribute.
+ super().__init__(**kwargs)
+ self.mtp_hidden_states = mtp_hidden_states
+
+
+def _get_unpad_data(attention_mask):
+    """Derive the bookkeeping needed to strip padding from a batch.
+
+    Args:
+        attention_mask (`torch.Tensor`): Mask whose non-zero entries mark real tokens; it is
+            summed over its last dimension, so each row is treated as one sequence.
+
+    Returns:
+        `tuple`:
+            - indices (`torch.Tensor`): positions of the non-zero entries in the flattened mask.
+            - cu_seqlens (`torch.Tensor`, int32): cumulative sequence lengths with a leading 0,
+              i.e. one more entry than there are sequences.
+            - max_seqlen_in_batch (`int`): length of the longest sequence.
+    """
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    # Left-pad the running total with a single 0 so that consecutive entries delimit each sequence.
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """Deprecated shim: warn, then delegate to `_prepare_4d_attention_mask`.
+
+    Args:
+        mask (`torch.Tensor`): Attention mask, forwarded unchanged.
+        dtype (`torch.dtype`): Target dtype, forwarded unchanged.
+        tgt_len (`int`, *optional*): Target length, forwarded unchanged.
+
+    Returns:
+        Whatever `_prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)` returns.
+    """
+    # The warning must name *this* function as the deprecated one and the replacement as the target.
+    warnings.warn(
+        "Calling `transformers.models.BailingMoeV2_5.modeling_BailingMoeV2_5._expand_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask` instead."
+    )
+    return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
+
+
+def _make_causal_mask(
+    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+    """Deprecated shim: warn, then delegate to `AttentionMaskConverter._make_causal_mask`.
+
+    Args:
+        input_ids_shape (`torch.Size`): Shape of the input ids, forwarded unchanged.
+        dtype (`torch.dtype`): Mask dtype, forwarded unchanged.
+        device (`torch.device`): Mask device, forwarded unchanged.
+        past_key_values_length (`int`, defaults to 0): Cached length, forwarded unchanged.
+
+    Returns:
+        Whatever `AttentionMaskConverter._make_causal_mask` returns for these arguments.
+    """
+    # `AttentionMaskConverter` is imported from `transformers.modeling_attn_mask_utils`, so that is
+    # the location the message has to point users to.
+    warnings.warn(
+        "Calling `transformers.models.BailingMoeV2_5.modeling_BailingMoeV2_5._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils.AttentionMaskConverter._make_causal_mask` instead."
+    )
+    return AttentionMaskConverter._make_causal_mask(
+        input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length
+    )
+
+
+class BailingMoeV2_5RMSNorm(nn.Module):
+    """Root-mean-square normalisation over the last dimension with a learnable per-channel scale.
+
+    No mean is subtracted and there is no bias term.
+    """
+
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Args:
+            hidden_size: Size of the last dimension; the scale `weight` starts as all ones.
+            eps: Constant added to the mean square before taking the reciprocal square root.
+        """
+        super().__init__()
+        self.variance_epsilon = eps
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+
+    def forward(self, hidden_states):
+        """Normalise `hidden_states` in float32, cast back to the input dtype, then apply `weight`."""
+        orig_dtype = hidden_states.dtype
+        as_fp32 = hidden_states.to(torch.float32)
+        mean_square = as_fp32.pow(2).mean(-1, keepdim=True)
+        normalised = as_fp32 * torch.rsqrt(mean_square + self.variance_epsilon)
+        # The cast back happens before the multiplication by `weight`.
+        return self.weight * normalised.to(orig_dtype)
+
+
+class BailingMoeV2_5GroupRMSNorm(nn.Module):
+    """RMS normalisation applied independently to consecutive groups of the last dimension.
+
+    The last dimension is split into `group_norm_size` groups of `hidden_size // group_norm_size`
+    channels each; every group is normalised by its own root mean square. A learnable scale of
+    size `hidden_size` is applied afterwards.
+    """
+
+    def __init__(self, hidden_size, group_norm_size, eps=1e-6):
+        """
+        Args:
+            hidden_size: Size of the last dimension of the inputs.
+            group_norm_size: Number of groups the last dimension is split into; must divide
+                `hidden_size`.
+            eps: Constant added to each group's mean square before the reciprocal square root.
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.group_norm_size = group_norm_size
+        assert hidden_size % group_norm_size == 0, "hidden_size must be divisible by group_norm_size"
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        """Normalise each group in float32, restore dtype and shape, then apply `weight`."""
+        orig_dtype = hidden_states.dtype
+        orig_shape = hidden_states.size()
+        groups = self.group_norm_size
+        grouped_shape = orig_shape[:-1] + (groups, orig_shape[-1] // groups)
+
+        grouped = hidden_states.view(grouped_shape).to(torch.float32)
+        mean_square = grouped.pow(2).mean(-1, keepdim=True)
+        grouped = grouped * torch.rsqrt(mean_square + self.variance_epsilon)
+
+        restored = grouped.to(orig_dtype).view(orig_shape)
+        return self.weight * restored
+
+
+# Register the RMS norm class in transformers' shared list of layer-norm classes.
+# NOTE(review): `BailingMoeV2_5GroupRMSNorm` is not registered here — confirm whether that is intended.
+ALL_LAYERNORM_LAYERS.append(BailingMoeV2_5RMSNorm)
+
+
+class BailingMoeV2_5RotaryEmbedding(nn.Module):
+ """Computes the rotary position embedding `cos` / `sin` tensors for given position ids.
+
+ The RoPE variant is read from `config.rope_scaling` (key `rope_type`, falling back to the legacy
+ key `type`); when the config has no scaling entry the `"default"` variant is used. The matching
+ initialiser from `ROPE_INIT_FUNCTIONS` supplies `inv_freq` and `attention_scaling`.
+ """
+
+ def __init__(self, config: BailingMoeV2_5Config, device=None):
+ super().__init__()
+ # BC: "rope_type" was originally "type"
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+ else:
+ self.rope_type = "default"
+ # Both values start at the configured maximum position count.
+ # NOTE(review): presumably consulted by `dynamic_rope_update` when sequences grow — confirm.
+ self.max_seq_len_cached = config.max_position_embeddings
+ self.original_max_seq_len = config.max_position_embeddings
+
+ self.config = config
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+ # Non-persistent buffer: it follows the module across devices but is not written to the state dict.
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+ # Keeps a reference to the initial frequencies alongside the (replaceable) buffer.
+ self.original_inv_freq = self.inv_freq
+
+ @torch.no_grad()
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
+ def forward(self, x, position_ids):
+ """Return `(cos, sin)` for `position_ids`, cast to `x.dtype`.
+
+ `x` is only consulted for its device and dtype.
+ NOTE(review): `position_ids` is indexed as a 2-D tensor, presumably `(batch, seq_len)` — confirm.
+ """
+ # One copy of the inverse frequencies per batch row, as a column vector for the matmul below.
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+ position_ids_expanded = position_ids[:, None, :].float()
+
+ # "mps" (and non-string device types) fall back to "cpu" for the autocast context.
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
+ # Outer product of frequencies and positions, then positions moved ahead of frequencies.
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+ # The angles are duplicated so the last dimension is twice the number of frequencies.
+ emb = torch.cat((freqs, freqs), dim=-1)
+ cos = emb.cos() * self.attention_scaling
+ sin = emb.sin() * self.attention_scaling
+
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Swap the two halves of the last dimension, negating the half that moves to the front.
+
+    For a last dimension `[a, b]` (split at `size // 2`) the result is `[-b, a]`.
+    """
+    width = x.shape[-1]
+    split_at = width // 2
+    front = x.narrow(-1, 0, split_at)
+    back = x.narrow(-1, split_at, width - split_at)
+    return torch.cat((-back, front), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
+    """Rotate the leading channels of the query and key tensors with rotary position embeddings.
+
+    Only the first `cos.shape[-1]` channels of `q` and `k` are rotated; any remaining channels are
+    passed through untouched and re-attached behind the rotated ones.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension inserted into `cos` and `sin` so that they broadcast against `q` and `k`.
+            Use 1 when `q`/`k` are laid out as [batch_size, heads, seq_len, head_dim] and 2 when
+            they are laid out as [batch_size, seq_len, heads, head_dim].
+    Returns:
+        `tuple(torch.Tensor)` comprising the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos_b = cos.unsqueeze(unsqueeze_dim)
+    sin_b = sin.unsqueeze(unsqueeze_dim)
+
+    # The width of the embedding decides how many channels take part in the rotation.
+    n_rot = cos_b.shape[-1]
+
+    def _rotate_leading(t):
+        head, tail = t[..., :n_rot], t[..., n_rot:]
+        rotated = (head * cos_b) + (rotate_half(head) * sin_b)
+        return torch.cat([rotated, tail], dim=-1)
+
+    return _rotate_leading(q), _rotate_leading(k)
+
+
+class BailingMoeV2_5MLP(nn.Module):
+    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`.
+
+    All three projections are bias-free linear layers; the activation is looked up in `ACT2FN`
+    by `config.hidden_act`.
+    """
+
+    def __init__(self, config: BailingMoeV2_5Config, intermediate_size: int):
+        """
+        Args:
+            config: Model configuration; `hidden_size` and `hidden_act` are read from it.
+            intermediate_size: Width of the gated hidden layer.
+        """
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = intermediate_size
+
+        model_dim, inner_dim = self.hidden_size, self.intermediate_size
+        self.gate_proj = nn.Linear(model_dim, inner_dim, bias=False)
+        self.up_proj = nn.Linear(model_dim, inner_dim, bias=False)
+        self.down_proj = nn.Linear(inner_dim, model_dim, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        """Apply the gated feed-forward transformation to `x`."""
+        gate = self.act_fn(self.gate_proj(x))
+        return self.down_proj(gate * self.up_proj(x))
+
+
+class BailingMoeV2_5Gate(nn.Module):
+    """Sigmoid router that selects `num_experts_per_tok` experts per token via group-limited top-k.
+
+    Experts are partitioned into `n_group` equally sized groups. For every token only the
+    `topk_group` best groups stay eligible, and the final experts are picked among them.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        # Routing hyper-parameters.
+        self.top_k = config.num_experts_per_tok
+        self.num_experts = config.num_experts
+        self.n_group = config.n_group
+        self.topk_group = config.topk_group
+
+        # Router projection: one row of `hidden_size` weights per expert.
+        self.gating_dim = config.hidden_size
+        self.weight = nn.Parameter(torch.empty((self.num_experts, self.gating_dim)))
+        self.routed_scaling_factor = config.routed_scaling_factor
+
+        # Per-expert bias added to the scores for expert *selection* only (see `forward`).
+        self.register_buffer("expert_bias", torch.zeros((self.num_experts)))
+        self.reset_parameters()
+
+    def reset_parameters(self) -> None:
+        """(Re-)initialise the router weight with Kaiming-uniform, `a = sqrt(5)`."""
+        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+
+    def group_limited_topk(
+        self,
+        scores: torch.Tensor,
+    ):
+        """Pick the `top_k` experts per token, restricted to the `topk_group` best groups.
+
+        A group's score is the sum of its two highest expert scores. Experts outside the kept
+        groups are masked with `-inf` before the final top-k.
+
+        Args:
+            scores: 2-D tensor with one row per token and one column per expert.
+
+        Returns:
+            `(values, indices)` of the selected experts, each with `top_k` columns.
+        """
+        token_count, _ = scores.size()
+        experts_per_group = self.num_experts // self.n_group
+
+        # Rank the groups and remember which ones survive for each token.
+        per_group = scores.view(token_count, self.n_group, -1)
+        best_two, _ = per_group.topk(2, dim=-1)
+        group_scores = best_two.sum(dim=-1)
+        _, kept_groups = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)
+        group_mask = torch.zeros_like(group_scores)
+        group_mask.scatter_(1, kept_groups, 1)
+
+        # Broadcast the group decision to every expert of the group.
+        expert_mask = group_mask.unsqueeze(-1).expand(token_count, self.n_group, experts_per_group)
+        expert_mask = expert_mask.reshape(token_count, -1)
+
+        eligible_scores = scores.masked_fill(~expert_mask.bool(), float('-inf'))
+        probs, top_indices = torch.topk(eligible_scores, k=self.top_k, dim=-1)
+        return probs, top_indices
+
+    def forward(self, hidden_states):
+        """Route every token.
+
+        Returns:
+            `(topk_idx, topk_weight, logits)`: the selected expert ids, their mixing weights
+            (normalised to sum to one when `top_k > 1`, then multiplied by
+            `routed_scaling_factor`), and the raw float32 router logits for all experts.
+        """
+        # Flatten all leading dimensions into a single token axis; routing runs in float32.
+        tokens = hidden_states.view(-1, hidden_states.shape[-1])
+        logits = F.linear(tokens.type(torch.float32), self.weight.type(torch.float32))
+        scores = torch.sigmoid(logits.float()).type_as(logits)
+
+        # The bias steers which experts are chosen ...
+        _, topk_idx = self.group_limited_topk(scores + self.expert_bias)
+
+        # ... but the mixing weights come from the unbiased scores.
+        chosen = torch.gather(scores, dim=1, index=topk_idx).type_as(logits)
+        if self.top_k > 1:
+            topk_weight = chosen / (chosen.sum(dim=-1, keepdim=True) + 1e-20)
+        else:
+            topk_weight = chosen
+        topk_weight = topk_weight * self.routed_scaling_factor
+
+        return topk_idx, topk_weight, logits
+
+
+class BailingMoeV2_5SparseMoeBlock(nn.Module):
+ """
+ A mixed expert module containing shared experts.
+
+ Each token is sent to the experts chosen by `BailingMoeV2_5Gate`; the expert outputs are combined
+ with the gate's weights. When `config.num_shared_experts` is set, the output of one additional MLP
+ (applied to every token) is added on top.
+ """
+
+ def __init__(self, config: BailingMoeV2_5Config):
+ super().__init__()
+ self.config = config
+ self.num_experts_per_tok = config.num_experts_per_tok
+ self._setup_experts()
+ self.gate = BailingMoeV2_5Gate(config)
+ if config.num_shared_experts is not None:
+ # The shared experts are a single MLP whose width is `num_shared_experts` times the routed expert width.
+ self.shared_experts = BailingMoeV2_5MLP(
+ config=config, intermediate_size=config.moe_intermediate_size * config.num_shared_experts
+ )
+
+ def _setup_experts(self):
+ """Create `config.num_experts` routed expert MLPs of width `config.moe_intermediate_size`."""
+ self.experts = nn.ModuleList(
+ [
+ BailingMoeV2_5MLP(config=self.config, intermediate_size=self.config.moe_intermediate_size)
+ for _ in range(self.config.num_experts)
+ ]
+ )
+
+ def forward(self, hidden_states):
+ """Run the block on `hidden_states` of shape `(bsz, seq_len, h)`.
+
+ Returns:
+ `(y, (router_logits, topk_idx))` where `y` has the input's shape and the two routing tensors
+ are reshaped to `(bsz, seq_len, -1)`.
+ """
+ identity = hidden_states
+ bsz, seq_len, h = hidden_states.shape
+ topk_idx, topk_weight, router_logits = self.gate(hidden_states)
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+ flat_topk_idx = topk_idx.view(-1)
+ if self.training:
+ # Duplicate each token once per selected expert so that row j lines up with flat_topk_idx[j].
+ hidden_states = hidden_states.repeat_interleave(self.num_experts_per_tok, dim=0)
+ y = torch.empty_like(hidden_states)
+ # Every expert processes the rows assigned to it; together the experts fill all rows of `y`.
+ for i, expert in enumerate(self.experts):
+ y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i])
+ # Regroup the rows per token and take the gate-weighted sum over the selected experts.
+ y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
+ y = y.to(hidden_states.dtype).view(bsz, seq_len, h)
+ else:
+ y = self.moe_infer(hidden_states, topk_idx, topk_weight).view(bsz, seq_len, h)
+ if self.config.num_shared_experts is not None:
+ y = y + self.shared_experts(identity)
+ return y, (router_logits.view(bsz, seq_len, -1), topk_idx.view(bsz, seq_len, -1))
+
+ # Runs without gradient tracking, so this path is only used when the module is not in training mode.
+ @torch.no_grad()
+ def moe_infer(self, x, topk_ids, topk_weight):
+ """Inference-time dispatch: sort the token/expert slots by expert and run each expert once.
+
+ Args:
+ x: Flattened tokens, one row per token.
+ topk_ids: Selected expert ids, one row per token and one column per selected expert.
+ topk_weight: Mixing weights with the same layout as `topk_ids`.
+
+ Returns:
+ The weighted sum of the selected experts' outputs, one row per token.
+ """
+ # Count how many tokens were routed to each expert.
+ cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
+ cnts.scatter_(1, topk_ids, 1)
+ tokens_per_expert = cnts.sum(dim=0)
+ # Order the flattened (token, slot) pairs by expert id; integer division by the number of
+ # slots per token recovers the token row of each pair.
+ idxs = topk_ids.view(-1).argsort()
+ sorted_tokens = x[idxs // topk_ids.shape[1]]
+ # The counts are moved to host memory because they drive the Python loop below.
+ tokens_per_expert = tokens_per_expert.cpu().numpy()
+ outputs = []
+ start_idx = 0
+ for i, num_tokens in enumerate(tokens_per_expert):
+ end_idx = start_idx + num_tokens
+ if num_tokens == 0:
+ continue
+ expert = self.experts[i]
+ # Consecutive slice of the sorted tokens that belongs to expert `i`.
+ tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
+ expert_out = expert(tokens_for_this_expert)
+ outputs.append(expert_out.to(x.device))
+ start_idx = end_idx
+
+ outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
+ # Undo the sort so that row j again corresponds to flattened slot j of `topk_ids`.
+ new_x = torch.empty_like(outs)
+ new_x[idxs] = outs
+ # Weighted sum over each token's selected experts, computed in the weights' dtype and cast back.
+ final_out = (
+ new_x.view(*topk_ids.shape, -1)
+ .type(topk_weight.dtype)
+ .mul_(topk_weight.unsqueeze(dim=-1))
+ .sum(dim=1)
+ .type(new_x.dtype)
+ )
+ return final_out
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int, head_first: bool = True) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times, keeping the copies of one head adjacent (the same
+    ordering `torch.repeat_interleave` produces along the head dimension).
+
+    With `head_first=True` the input is (batch, num_key_value_heads, seqlen, head_dim) and the output
+    is (batch, num_key_value_heads * n_rep, seqlen, head_dim). With `head_first=False` the head and
+    sequence dimensions are swapped in both input and output. The input is returned as is when
+    `n_rep == 1`.
+    """
+    if n_rep == 1:
+        return hidden_states
+
+    if not head_first:
+        batch, slen, kv_heads, head_dim = hidden_states.shape
+        tiled = hidden_states.unsqueeze(3).expand(batch, slen, kv_heads, n_rep, head_dim)
+        return tiled.reshape(batch, slen, kv_heads * n_rep, head_dim)
+
+    batch, kv_heads, slen, head_dim = hidden_states.shape
+    tiled = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, slen, head_dim)
+    return tiled.reshape(batch, kv_heads * n_rep, slen, head_dim)
+
+
+def repeat_kv2(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    Repeat every key/value head `n_rep` times along dimension 1, matching
+    `torch.repeat_interleave(x, dim=1, repeats=n_rep)`: (batch, num_key_value_heads, seqlen, head_dim)
+    becomes (batch, num_key_value_heads * n_rep, seqlen, head_dim).
+
+    The input must be 4-D; it is returned unchanged when `n_rep == 1`.
+    """
+    # Unpacking first keeps the 4-D requirement in force even on the `n_rep == 1` fast path.
+    bsz, kv_heads, seq_len, dim = hidden_states.shape
+    if n_rep != 1:
+        tiled = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
+        return tiled.reshape(bsz, kv_heads * n_rep, seq_len, dim)
+    return hidden_states
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    """Plain softmax attention built from matmuls.
+
+    Args:
+        module: Owning attention module; `num_key_value_groups` and `training` are read from it.
+        query, key, value: Tensors with heads on dimension 1 and sequence on dimension 2.
+        attention_mask: Optional additive mask; it is cropped on its last dimension to the key length.
+        scaling: Factor applied to the raw query-key scores.
+        dropout: Dropout probability for the attention weights (active only while training).
+        **kwargs: Accepted for interface compatibility and ignored.
+
+    Returns:
+        `(attn_output, attn_weights)`; `attn_output` has heads and sequence swapped back, i.e.
+        sequence on dimension 1 and heads on dimension 2, and is contiguous.
+    """
+    groups = module.num_key_value_groups
+    expanded_keys = repeat_kv2(key, groups)
+    expanded_values = repeat_kv2(value, groups)
+
+    scores = torch.matmul(query, expanded_keys.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        scores = scores + attention_mask[:, :, :, : expanded_keys.shape[-2]]
+
+    # The softmax runs in float32 and is cast back to the query dtype afterwards.
+    probs = F.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
+    probs = F.dropout(probs, p=dropout, training=module.training)
+
+    context = torch.matmul(probs, expanded_values).transpose(1, 2).contiguous()
+    return context, probs
+
+
+def apply_rotary_pos_emb_interleave(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    r"""
+    Applies Rotary Position Embedding to query and key tensors whose channels are stored interleaved.
+
+    The last dimension of `q` and `k` is first regrouped so that all even-indexed channels come
+    before all odd-indexed ones; the regular half-rotation is then applied to the result.
+
+    TODO: use the original frequency computation instead, which would avoid the
+    view + transpose + reshape. This is not optimized!
+
+    Args:
+        q (`torch.Tensor`): The query tensor, 4-D.
+        k (`torch.Tensor`): The key tensor, 4-D.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Accepted for interface compatibility; this function does not use it.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            Dimension inserted into `cos` and `sin` so that they broadcast against `q` and `k`.
+            Use 1 when `q`/`k` are laid out as [batch_size, heads, seq_len, head_dim] and 2 when
+            they are laid out as [batch_size, seq_len, heads, head_dim].
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos_b = cos.unsqueeze(unsqueeze_dim)
+    sin_b = sin.unsqueeze(unsqueeze_dim)
+
+    def _deinterleave(t):
+        # [x0, y0, x1, y1, ...] -> [x0, x1, ..., y0, y1, ...] on the last dimension.
+        d0, d1, d2, width = t.shape
+        return t.view(d0, d1, d2, width // 2, 2).transpose(4, 3).reshape(d0, d1, d2, width)
+
+    q_flat = _deinterleave(q)
+    k_flat = _deinterleave(k)
+
+    q_embed = (q_flat * cos_b) + (rotate_half(q_flat) * sin_b)
+    k_embed = (k_flat * cos_b) + (rotate_half(k_flat) * sin_b)
+    return q_embed, k_embed
+
+
+class BailingMoeV2_5MLARotaryEmbedding(nn.Module):
+ """Computes the rotary position embedding `cos` / `sin` tensors for given position ids.
+
+ The RoPE variant is read from `config.rope_scaling` when that attribute is a dict (key
+ `rope_type`, falling back to the legacy key `type`); otherwise the `"default"` variant is used.
+ The matching initialiser from `ROPE_INIT_FUNCTIONS` supplies `inv_freq` and `attention_scaling`.
+ """
+
+ inv_freq: torch.Tensor # fix linting for `register_buffer`
+
+ def __init__(self, config: BailingMoeV2_5Config, device=None):
+ super().__init__()
+ # BC: "rope_type" was originally "type"
+ if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+ else:
+ self.rope_type = "default"
+ # Both values start at the configured maximum position count.
+ # NOTE(review): presumably consulted by `dynamic_rope_update` when sequences grow — confirm.
+ self.max_seq_len_cached = config.max_position_embeddings
+ self.original_max_seq_len = config.max_position_embeddings
+
+ self.config = config
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+ # Non-persistent buffer: it follows the module across devices but is not written to the state dict.
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+ # Keeps a reference to the initial frequencies alongside the (replaceable) buffer.
+ self.original_inv_freq = self.inv_freq
+
+ @torch.no_grad()
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
+ def forward(self, x, position_ids):
+ """Return `(cos, sin)` for `position_ids`, cast to `x.dtype`.
+
+ `x` is only consulted for its device and dtype.
+ NOTE(review): `position_ids` is indexed as a 2-D tensor, presumably `(batch, seq_len)` — confirm.
+ """
+ # One copy of the inverse frequencies per batch row, as a column vector for the matmul below.
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+ position_ids_expanded = position_ids[:, None, :].float()
+
+ # "mps" (and non-string device types) fall back to "cpu" for the autocast context.
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
+ # Outer product of frequencies and positions, then positions moved ahead of frequencies.
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+ # The angles are duplicated so the last dimension is twice the number of frequencies.
+ emb = torch.cat((freqs, freqs), dim=-1)
+ cos = emb.cos() * self.attention_scaling
+ sin = emb.sin() * self.attention_scaling
+
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def yarn_get_mscale(scale=1, mscale=1):
+    """Return the attention magnitude correction `0.1 * mscale * ln(scale) + 1`.
+
+    Scales of 1 or less need no correction and yield exactly 1.0.
+    """
+    needs_correction = scale > 1
+    return 0.1 * mscale * math.log(scale) + 1.0 if needs_correction else 1.0
+
+
+class BailingMoeV2_5MultiLatentAttention(nn.Module):
+    """Softmax attention with low-rank ("latent") query and key/value projections.
+
+    Queries come either from a single `q_proj` or, when `config.q_lora_rank` is set, from
+    `q_a_proj -> q_a_layernorm -> q_b_proj`. Keys and values are produced by compressing the hidden
+    states with `kv_a_proj_with_mqa` into a `kv_lora_rank`-wide latent plus a `qk_rope_head_dim`-wide
+    rotary key part. The latent is normalised and expanded per head by `kv_b_proj`; the rotary key
+    part is shared by all heads. Every query/key head is the concatenation of a non-rotary part
+    (`qk_nope_head_dim`) and a rotary part (`qk_rope_head_dim`).
+    """
+
+    def __init__(self, config: BailingMoeV2_5Config, layer_idx: int):
+        """
+        Args:
+            config: Model configuration providing the head counts, ranks and head dimensions.
+            layer_idx: Index of this layer; used as the slot when updating the key/value cache.
+        """
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.attention_dropout = config.attention_dropout
+        self.num_heads = config.num_attention_heads
+        self.rope_theta = config.rope_theta
+        self.q_lora_rank = config.q_lora_rank
+        self.qk_rope_head_dim = config.qk_rope_head_dim
+        self.kv_lora_rank = config.kv_lora_rank
+        self.v_head_dim = config.v_head_dim
+        self.qk_nope_head_dim = config.qk_nope_head_dim
+        self.qk_head_dim = config.qk_head_dim
+
+        self.is_causal = True
+        if self.q_lora_rank is None:
+            self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.qk_head_dim, bias=False)
+        else:
+            self.q_a_proj = nn.Linear(config.hidden_size, config.q_lora_rank, bias=config.use_qkv_bias)
+            self.q_a_layernorm = BailingMoeV2_5RMSNorm(config.q_lora_rank)
+            self.q_b_proj = nn.Linear(config.q_lora_rank, self.num_heads * self.qk_head_dim, bias=False)
+
+        self.kv_a_proj_with_mqa = nn.Linear(
+            config.hidden_size,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            bias=config.use_qkv_bias,
+        )
+        self.kv_a_layernorm = BailingMoeV2_5RMSNorm(self.kv_lora_rank)
+        self.kv_b_proj = nn.Linear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+        )
+
+        self.dense = nn.Linear(
+            self.num_heads * self.v_head_dim,
+            config.hidden_size,
+            bias=config.use_qkv_bias,
+        )
+
+        self.scaling = self.qk_head_dim ** (-0.5)
+        if self.config.rope_scaling is not None:
+            mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
+            if mscale_all_dim:
+                # "factor" is only needed for the mscale correction, so it is only looked up here;
+                # a scaling config without `mscale_all_dim` no longer has to provide it.
+                scaling_factor = self.config.rope_scaling["factor"]
+                mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
+                self.scaling = self.scaling * mscale * mscale
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Run attention over `hidden_states`.
+
+        Args:
+            hidden_states: Input of shape `(batch_size, seq_length, hidden_size)`.
+            position_embeddings: `(cos, sin)` pair applied to the rotary parts of queries and keys.
+            attention_mask: Optional additive mask handed to the attention kernel.
+            past_key_values: Optional cache; when given, it is updated with this layer's keys and
+                values and the returned (full) keys and values are attended over.
+            cache_position: Forwarded to the cache update.
+            **kwargs: Forwarded to the attention kernel.
+
+        Returns:
+            `(attn_output, attn_weights, past_key_values)`, with `attn_output` projected back to
+            `hidden_size` by `dense`.
+
+        Raises:
+            NotImplementedError: If `config.rope_interleave` is false; only the interleaved rotary
+                layout is enabled for this layer.
+        """
+        batch_size, seq_length = hidden_states.shape[:-1]
+        query_shape = (batch_size, seq_length, -1, self.qk_head_dim)
+        key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim)
+
+        if self.q_lora_rank is None:
+            q_states = self.q_proj(hidden_states)
+        else:
+            q_states = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
+        q_states = q_states.view(query_shape).transpose(1, 2)
+        q_pass, q_rot = torch.split(q_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+
+        # Compress once, then split into the latent (expanded per head below) and the rotary key part.
+        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
+        k_pass, k_rot = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
+
+        k_pass = self.kv_b_proj(self.kv_a_layernorm(k_pass)).view(key_shape).transpose(1, 2)
+        k_pass, value_states = torch.split(k_pass, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
+
+        # The rotary key part has a single head; it is broadcast to all heads after the rotation.
+        k_rot = k_rot.view(batch_size, 1, seq_length, self.qk_rope_head_dim)
+
+        cos, sin = position_embeddings
+        if self.config.rope_interleave:  # support using interleaved weights for efficiency
+            q_rot, k_rot = apply_rotary_pos_emb_interleave(q_rot, k_rot, cos, sin)
+        else:
+            # The original code aborted this branch with a deliberate `1 / 0`. It stays disabled,
+            # but now fails with an error that says why.
+            raise NotImplementedError(
+                f"{self.__class__.__name__} only supports `config.rope_interleave=True`; "
+                "the non-interleaved rotary embedding path is disabled."
+            )
+        k_rot = k_rot.expand(*k_pass.shape[:-1], -1)
+
+        query_states = torch.cat((q_pass, q_rot), dim=-1)
+        key_states = torch.cat((k_pass, k_rot), dim=-1)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # When flash_attention_2 is configured, values are zero-padded up to the query/key head size
+        # before the attention call and cropped back to `v_head_dim` afterwards.
+        if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
+            value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim])
+
+        # The eager kernel is used regardless of `config._attn_implementation`.
+        attention_interface: Callable = eager_attention_forward
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
+            attn_output = attn_output[:, :, :, : self.v_head_dim]
+
+        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
+        attn_output = self.dense(attn_output)
+        return attn_output, attn_weights, past_key_values
+
+
+class BailingMoeV2_5LinearAttention(nn.Module):
+ """
+ BailingMoeAttention implements a linear attention mechanism based on Lightning Attention-2
+ (https://arxiv.org/abs/2401.04658) with efficient computation using flash-linear-attention operators.
+
+ The implementation leverages optimized kernels from the flash-linear-attention library
+ (https://github.com/fla-org/flash-linear-attention) for maximum performance.
+ """
+
+ def __init__(self, config: BailingMoeV2_5Config, layer_idx: Optional[int] = None):
+ super().__init__()
+ self.config = config
+ self.layer_idx = layer_idx
+ if layer_idx is None:
+ logger.warning_once(
+ f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
+ "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+ "when creating this class."
+ )
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = config.head_dim or self.hidden_size // self.num_heads
+ self.num_key_value_heads = config.num_attention_heads
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+ partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
+ self.rope_dim = int(self.head_dim * partial_rotary_factor)
+
+ self.use_qk_norm = getattr(config, "use_qk_norm", False)
+ self.rms_norm_eps = getattr(config, "rms_norm_eps", 1e-5)
+ self.mode = 'chunk'
+
+ self.query_key_value = nn.Linear(
+ self.hidden_size,
+ (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim,
+ bias=config.use_qkv_bias,
+ )
+
+ if self.config.use_qk_norm:
+ self.query_layernorm = BailingMoeV2_5RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+ self.key_layernorm = BailingMoeV2_5RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+ self.rotary_emb = BailingMoeV2_5RotaryEmbedding(config=config)
+
+ self.dense = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.use_bias)
+
+ self.g_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+ self.g_norm = BailingMoeV2_5GroupRMSNorm(
+ self.num_heads * self.head_dim, group_norm_size=config.group_norm_size, eps=self.rms_norm_eps
+ )
+ slope = -BailingMoeV2_5LinearAttention.build_slope_tensor(self.num_heads) * (
+ 1 - (self.layer_idx - 1) / (self.config.num_hidden_layers - 1) + 1e-5
+ )
+ self.register_buffer('slope', slope, persistent=False)
+
+ self.lightning_attn_ops = {'chunk': chunk_simple_gla, 'fused_recurrent': fused_recurrent_simple_gla}
+
+ @staticmethod
+ def build_slope_tensor(n_attention_heads: int):
+ """
+ Build a tensor of slopes for Lightning Attention-2 as described in the paper:
+ "Lightning Attention-2: A Free Lunch for Handling Unlimited Sequence Lengths in Large Language Models"
+ (https://arxiv.org/abs/2401.04658)
+
+ This function computes the slope values that control the decay rate of attention scores
+ based on the number of attention heads. The slopes are designed to have specific
+ mathematical properties that work optimally when the number of heads is a power of 2.
+
+ For non-power-of-2 head counts, a workaround is implemented to maintain similar properties.
+
+ Args:
+ n_attention_heads (int): Number of attention heads in the model
+
+ Returns:
+ torch.Tensor: A tensor of shape [n_attention_heads] containing the computed slopes
+
+ Note:
+ Code copied from: https://github.com/OpenNLPLab/lightning-attention/blob/d15c38529bbd5c2c82b44ddda3cac885825aa873/lightning_attn/utils/utils.py#L6
+ """
+
+ def get_slopes(n):
+ def get_slopes_power_of_2(n):
+ start = 2 ** (-(2 ** -(math.log2(n) - 3)))
+ ratio = start
+ return [start * ratio**i for i in range(n)]
+
+ if math.log2(n).is_integer():
+ return get_slopes_power_of_2(
+ n
+ ) # In the paper, we only train models that have 2^a heads for some a. This function has
+ else: # some good properties that only occur when the input is a power of 2. To maintain that even
+ closest_power_of_2 = 2 ** math.floor(
+ math.log2(n)
+ ) # when the number of heads is not a power of 2, we use this workaround.
+ return (
+ get_slopes_power_of_2(closest_power_of_2)
+ + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
+ )
+
+ slopes = torch.tensor(get_slopes(n_attention_heads), dtype=torch.float)
+ return slopes
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """
+        Run linear attention over `hidden_states`, optionally carrying a recurrent state in `past_key_value`.
+
+        Args:
+            hidden_states: input of shape `(batch, seq_len, hidden)` (unpacked into three dims below).
+            attention_mask: optional 2D padding mask `[batch_size, seq_len]`; any other rank fails the assertion.
+            position_ids: accepted for interface compatibility; not read by this method.
+            past_key_value: optional `Cache`; the recurrent state of this layer is kept in
+                `past_key_value.layers[self.layer_idx].keys`.
+            output_attentions: must be False; attention weights cannot be returned.
+            use_cache: whether the kernel should return its final state and whether that state is stored.
+            position_embeddings: `(cos, sin)` pair. Despite the `None` default it is unpacked
+                unconditionally, so callers must provide it.
+            **kwargs: ignored.
+
+        Returns:
+            `(output, None, past_key_value)`; the middle slot stands in for attention weights.
+        """
+        if attention_mask is not None:
+            assert len(attention_mask.shape) == 2, (
+                "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
+                "for padding purposes (0 indicating padding). "
+                "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
+            )
+
+        # launching the triton kernel for just one token will actually be slower
+        # (inputs of up to 64 tokens use the 'fused_recurrent' op, longer ones use self.mode)
+        mode = 'fused_recurrent' if hidden_states.shape[1] <= 64 else self.mode
+
+        # Currently output_attentions can only be False, returning attention weights is not supported
+        assert (
+            not output_attentions
+        ), "output_attentions can only be False, returning attention weights is not supported"
+
+        bsz, q_len, _ = hidden_states.size()
+        device = hidden_states.device
+
+        # Fused QKV projection, then split along the head axis into query / key / value heads.
+        qkv = self.query_key_value(hidden_states)
+        qkv = qkv.view(bsz, q_len, self.num_heads + 2 * self.num_key_value_heads, self.head_dim)
+        query_states, key_states, value_states = qkv.split(
+            [self.num_heads, self.num_key_value_heads, self.num_key_value_heads], dim=-2
+        )
+        if self.config.use_qk_norm:
+            query_states = self.query_layernorm(query_states)
+            key_states = self.key_layernorm(key_states)
+
+        # Heads live on dim 2 here, hence unsqueeze_dim=2 for the rotary embedding.
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, unsqueeze_dim=2)
+
+        if self.num_key_value_groups > 1:
+            # [bsz, q_len, n_kv_heads, head_dim] -> [bsz, q_len, n_heads, head_dim]
+            key_states = repeat_kv(key_states, self.num_key_value_groups, head_first=False)
+            value_states = repeat_kv(value_states, self.num_key_value_groups, head_first=False)
+
+        # Look up the recurrent state left by a previous call, if any.
+        recurrent_state = None
+        if past_key_value is not None and isinstance(past_key_value, Cache):
+            # ensure the cache list is long enough
+            while len(past_key_value.layers) <= self.layer_idx:
+                past_key_value.layers.append(DynamicLayer())
+
+            if past_key_value.layers[self.layer_idx].keys is not None:
+                recurrent_state = past_key_value.layers[self.layer_idx].keys
+                # ensure recurrent_state is on the same device as hidden_states
+                if recurrent_state.device != hidden_states.device:
+                    recurrent_state = recurrent_state.to(device).contiguous()
+
+        if recurrent_state is None:
+            # dealing with left-padding
+            # Only done on the first cached call: padded positions get their values multiplied
+            # by the 0 entries of the mask. `mul_` works in place on the split view of `qkv`.
+            if attention_mask is not None and use_cache:
+                value_states = value_states.mul_(attention_mask[:, -q_len:, None, None])
+
+        # `g` is the per-head slope buffer broadcast over batch and sequence.
+        o, recurrent_state = self.lightning_attn_ops[mode](
+            q=query_states,
+            k=key_states,
+            v=value_states,
+            g=self.slope[None, None, :].expand(bsz, q_len, self.num_heads),
+            initial_state=recurrent_state,
+            output_final_state=use_cache,
+        )
+
+        # Merge heads, normalise, apply the sigmoid output gate, then project back to hidden size.
+        o = o.reshape(bsz, q_len, -1)
+        o = self.g_norm(o)
+        g_proj = self.g_proj(hidden_states)
+        # sigmoid_ overwrites g_proj in place; g_proj is not used afterwards.
+        o = o * torch.sigmoid_(g_proj)
+        o = self.dense(o)
+
+        if use_cache and past_key_value is not None and isinstance(past_key_value, Cache):
+            # Store the state on the device of the first cache layer that already holds keys,
+            # falling back to the state's own device when no layer has been filled yet.
+            target_device = None
+            for cache in past_key_value.layers:
+                if cache.keys is not None:
+                    target_device = cache.keys.device
+                    break
+            if target_device is None:
+                target_device = recurrent_state.device
+
+            # move to target device
+            if recurrent_state.device != target_device:
+                recurrent_state = recurrent_state.to(target_device)
+
+            # The recurrent state reuses the `keys` slot of the cache layer; `values` is not touched here.
+            past_key_value.layers[self.layer_idx].keys = recurrent_state
+
+        return o, None, past_key_value
+
+
+class BailingMoeV2_5MTPLayer(nn.Module):
+ def __init__(self, config: BailingMoeV2_5Config, layer_idx: int):
+ super().__init__()
+ self.layer_idx = layer_idx
+ self.input_layernorm = BailingMoeV2_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.enorm = BailingMoeV2_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ self.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False)
+ self.post_attention_layernorm = BailingMoeV2_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.attention = BailingMoeV2_5MultiLatentAttention(config=config, layer_idx=layer_idx)
+ self.mlp = BailingMoeV2_5SparseMoeBlock(config)
+
+ self.hnorm = BailingMoeV2_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.final_layernorm = BailingMoeV2_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ def forward(
+ self,
+ input_embeds,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ output_router_logits: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
+ **kwargs,
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ input_embeds = self.enorm(input_embeds)
+ hidden_states = self.hnorm(hidden_states)
+ hidden_states = self.eh_proj(torch.cat([input_embeds, hidden_states], dim=-1))
+ residual = hidden_states
+
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Self Attention
+ hidden_states, self_attn_weights, present_key_value = self.attention(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ position_embeddings=position_embeddings,
+ use_cache=use_cache,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ if isinstance(hidden_states, tuple):
+ hidden_states, router_logits = hidden_states
+ else:
+ router_logits = None
+ hidden_states = residual + hidden_states.to(residual.device)
+ hidden_states = self.final_layernorm(hidden_states)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ if output_router_logits:
+ outputs += (router_logits,)
+
+ return outputs
+
+
+class BailingMoeV2_5DecoderLayer(nn.Module):
+ def __init__(self, config: BailingMoeV2_5Config, layer_idx: int):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.layer_idx = layer_idx
+ self.attention_layer_type = (
+ "attention"
+ if (layer_idx + 1) % config.layer_group_size == 0
+ or layer_idx >= config.num_hidden_layers // config.layer_group_size * config.layer_group_size
+ else "linear_attention"
+ )
+
+ if self.attention_layer_type == "attention":
+ self.attention = BailingMoeV2_5MultiLatentAttention(config=config, layer_idx=layer_idx)
+ else:
+ self.attention = BailingMoeV2_5LinearAttention(config=config, layer_idx=layer_idx)
+
+ self.mlp = (
+ BailingMoeV2_5SparseMoeBlock(config)
+ if (config.num_experts is not None and layer_idx >= config.first_k_dense_replace)
+ else BailingMoeV2_5MLP(config=config, intermediate_size=config.intermediate_size)
+ )
+ self.input_layernorm = BailingMoeV2_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.post_attention_layernorm = BailingMoeV2_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = False,
+ output_router_logits: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
+ position_embeddings_mla: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
+ **kwargs,
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`, *optional*):
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+ query_sequence_length, key_sequence_length)` if default attention is used.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.n_positions - 1]`.
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*):
+ cached past key and value projection states
+ output_attentions (`bool`, *optional*):
+ Whether to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_router_logits (`bool`, *optional*):
+ Whether or not to return the logits of all the routers. They are useful for computing the router loss,
+ and should not be returned during inference.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ """
+ residual = hidden_states
+
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Self Attention
+ if self.attention_layer_type == "attention":
+ hidden_states, self_attn_weights, present_key_value = self.attention(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_value,
+ use_cache=use_cache,
+ cache_position=cache_position, #
+ position_embeddings=position_embeddings_mla, #
+ **kwargs,
+ )
+ else:
+ batch_size, seq_len = hidden_states.shape[0], hidden_states.shape[1]
+ device = hidden_states.device
+
+ if attention_mask is None:
+ # if attention_mask is None, create a full mask
+ attention_mask = torch.ones((batch_size, seq_len), dtype=torch.int32, device=device)
+ elif attention_mask.dim() == 4 and attention_mask.shape[1] == 1:
+ attention_mask = attention_mask[:, 0, -1, :].to(torch.int32)
+ attention_mask = (attention_mask > -1e4).to(torch.int32)
+ elif attention_mask.dim() == 2:
+ attention_mask = attention_mask.to(torch.int32)
+ else:
+ raise ValueError(f"Unsupported mask dimension: {attention_mask.shape}")
+
+ hidden_states, self_attn_weights, present_key_value = self.attention(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ past_key_value=past_key_value,
+ position_ids=position_ids,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ position_embeddings=position_embeddings,
+ )
+
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ if isinstance(hidden_states, tuple):
+ hidden_states, router_logits = hidden_states
+ else:
+ router_logits = None
+ hidden_states = residual + hidden_states.to(residual.device)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ if output_router_logits:
+ outputs += (router_logits,)
+
+ return outputs
+
+
+BAILINGMOEV2_5_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+ Parameters:
+ config ([`BailingMoeV2_5Config`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+ "The bare BailingMoeV2_5 Model outputting raw hidden-states without any specific head on top.",
+ BAILINGMOEV2_5_START_DOCSTRING,
+)
+class BailingMoeV2_5PreTrainedModel(PreTrainedModel):
+ config_class = BailingMoeV2_5Config
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["BailingMoeV2_5DecoderLayer"]
+ _skip_keys_device_placement = "past_key_values"
+ _supports_flash_attn_2 = True
+ _supports_sdpa = True
+ _supports_cache_class = True
+
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+
+BAILINGMOEV2_5_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+ [What are attention masks?](../glossary#attention-mask)
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
+ `past_key_values`).
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+ information on the default strategy.
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.n_positions - 1]`.
+ [What are position IDs?](../glossary#position-ids)
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+ Two formats are allowed:
+ - a [`~cache_utils.Cache`] instance;
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+ cache format.
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+ legacy cache format will be returned.
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+ of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare BailingMoeV2_5 Model outputting raw hidden-states without any specific head on top.",
+ BAILINGMOEV2_5_START_DOCSTRING,
+)
+class BailingMoeV2_5Model(BailingMoeV2_5PreTrainedModel):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`BailingMoeV2_5DecoderLayer`]
+ Args:
+ config: BailingMoeV2_5Config
+ """
+
+ def __init__(self, config: BailingMoeV2_5Config):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+ self.num_nextn_predict_layers = config.num_nextn_predict_layers
+
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+ self.layers = []
+ for layer_idx in range(config.num_hidden_layers + config.num_nextn_predict_layers):
+ layer_cls = BailingMoeV2_5DecoderLayer if layer_idx < config.num_hidden_layers else BailingMoeV2_5MTPLayer
+ self.layers.append(layer_cls(config, layer_idx))
+
+ self.layers = nn.ModuleList(self.layers)
+
+ self._use_sdpa = config._attn_implementation == "sdpa"
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+ self.norm = BailingMoeV2_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.rotary_emb = BailingMoeV2_5RotaryEmbedding(config=config)
+ self.rotary_emb_mla = BailingMoeV2_5MLARotaryEmbedding(config=config)
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.word_embeddings = value
+
+ @add_start_docstrings_to_model_forward(BAILINGMOEV2_5_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_router_logits: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[Tuple, MoeV2_5ModelOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ output_router_logits = (
+ output_router_logits if output_router_logits is not None else self.config.output_router_logits
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape[:2]
+ elif inputs_embeds is not None:
+ batch_size, seq_length = inputs_embeds.shape[:2]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`transformers."
+ )
+ use_cache = False
+
+ if use_cache and past_key_values is None:
+ past_key_values = DynamicCache()
+
+ if inputs_embeds is None:
+ inputs_embeds = self.word_embeddings(input_ids)
+
+ if cache_position is None:
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+ cache_position: torch.Tensor = torch.arange(
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+ )
+
+ if position_ids is None:
+ position_ids = cache_position.unsqueeze(0)
+
+ softmax_attention_layer_id = self.config.layer_group_size - 1
+ past_seen_tokens = (
+ past_key_values.get_seq_length(layer_idx=softmax_attention_layer_id) if past_key_values is not None else 0
+ )
+
+ if position_ids is None:
+ position_ids = torch.arange(
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+ )
+ position_ids = position_ids.unsqueeze(0)
+
+ if self._use_flash_attention_2:
+ # 2d mask is passed through the layers
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+ elif self._use_sdpa and not output_attentions:
+ # output_attentions=True can not be supported when using SDPA, and we fall back on
+ # the manual implementation that requires a 4D causal mask in all cases.
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+ attention_mask,
+ (batch_size, seq_length),
+ inputs_embeds,
+ past_seen_tokens,
+ )
+ else:
+ # 4d mask is passed through the layers
+ attention_mask = _prepare_4d_causal_attention_mask(
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_seen_tokens
+ )
+
+ # embed positions
+ hidden_states = inputs_embeds
+
+ # create position embeddings to be shared across the decoder layers
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
+ position_embeddings_mla = self.rotary_emb_mla(hidden_states, position_ids)
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ all_router_logits = () if output_router_logits else None
+ next_decoder_cache = None
+ layers = self.layers[: -self.num_nextn_predict_layers] if self.num_nextn_predict_layers > 0 else self.layers
+ mtp_layers = self.layers[-self.num_nextn_predict_layers :] if self.num_nextn_predict_layers > 0 else None
+
+ # tptest miss causal_mask = create_causal_mask(
+
+ for decoder_layer in layers:
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ decoder_layer.__call__,
+ hidden_states,
+ attention_mask,
+ position_ids,
+ past_key_values,
+ cache_position,
+ output_attentions,
+ output_router_logits,
+ use_cache,
+ position_embeddings,
+ position_embeddings_mla,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_values,
+ cache_position=cache_position,
+ output_attentions=output_attentions,
+ output_router_logits=output_router_logits,
+ use_cache=use_cache,
+ position_embeddings=position_embeddings,
+ position_embeddings_mla=position_embeddings_mla,
+ )
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ if output_router_logits and layer_outputs[-1] is not None:
+ all_router_logits += (layer_outputs[-1],)
+
+ hidden_states = self.norm(hidden_states)
+ main_hidden_states = hidden_states
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (main_hidden_states,)
+
+ mtp_hidden_states = None
+
+ if mtp_layers:
+ for decoder_layer in mtp_layers:
+ input_ids, _ = roll_tensor(input_ids, shifts=-1, dims=-1)
+ inputs_embeds = self.word_embeddings(input_ids)
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ decoder_layer.__call__,
+ inputs_embeds,
+ hidden_states,
+ attention_mask,
+ position_ids,
+ past_key_values,
+ output_attentions,
+ output_router_logits,
+ use_cache,
+ position_embeddings,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ inputs_embeds,
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_value=past_key_values,
+ output_attentions=output_attentions,
+ output_router_logits=output_router_logits,
+ use_cache=use_cache,
+ position_embeddings=position_embeddings,
+ )
+ if mtp_hidden_states is None:
+ mtp_hidden_states = []
+ hidden_states = layer_outputs[0]
+ mtp_hidden_states.append(hidden_states)
+
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ if use_cache:
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ if output_router_logits and layer_outputs[-1] is not None:
+ all_router_logits += (layer_outputs[-1],)
+
+ next_cache = None
+ if use_cache:
+ next_cache = next_decoder_cache
+ if not return_dict:
+ return tuple(
+ v
+ for v in [main_hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
+ if v is not None
+ )
+ return MoeV2_5ModelOutputWithPast(
+ last_hidden_state=main_hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ mtp_hidden_states=mtp_hidden_states,
+ attentions=all_self_attns,
+ router_logits=all_router_logits,
+ )
+
+
+class BailingMoeV2_5ForCausalLM(BailingMoeV2_5PreTrainedModel, GenerationMixin):
+ _tied_weights_keys = ["lm_head.weight"]
+
+ def __init__(self, config: BailingMoeV2_5Config):
+ super().__init__(config)
+ self.model = BailingMoeV2_5Model(config)
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+ self.num_nextn_predict_layers = config.num_nextn_predict_layers
+ self.mtp_loss_scaling_factor = config.mtp_loss_scaling_factor
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.model.word_embeddings = value
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ def get_decoder(self):
+ return self.model
+
+ @add_start_docstrings_to_model_forward(BAILINGMOEV2_5_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=MoEV2_5CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_router_logits: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[Tuple, MoEV2_5CausalLMOutputWithPast]:
+ r"""
+ Args:
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ Returns:
+ Example:
+ ```python
+ >>> from transformers import AutoTokenizer
+ >>> model = BailingMoeV2_5ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
+ >>> # Generate
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ output_router_logits = (
+ output_router_logits if output_router_logits is not None else self.config.output_router_logits
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ output_router_logits=output_router_logits,
+ return_dict=return_dict,
+ **kwargs,
+ )
+
+ loss = None
+ all_mtp_loss = None
+ aux_loss = None
+ hidden_states = outputs[0]
+ logits = self.lm_head(hidden_states)
+ logits = logits.float()
+
+ if labels is not None:
+ loss = self.loss_function(logits, labels, self.config.vocab_size, **kwargs)
+
+ all_mtp_logits = None
+ if self.num_nextn_predict_layers > 0:
+ mtp_hidden_states = outputs.mtp_hidden_states
+ shift_labels_mtp = None
+ for i in range(self.num_nextn_predict_layers):
+ mtp_hidden_states = mtp_hidden_states[i]
+ mtp_logits = self.lm_head(mtp_hidden_states).float()
+ if all_mtp_logits is None:
+ all_mtp_logits = []
+ all_mtp_logits.append(mtp_logits)
+ if labels is not None:
+ if shift_labels_mtp is None:
+ shift_labels_mtp = labels.clone()
+ shift_labels_mtp, _ = roll_tensor(shift_labels_mtp, shifts=-1, dims=-1, fill_value=-100)
+ mtp_logits_ = mtp_logits.view(-1, self.config.vocab_size)
+ mtp_loss = self.loss_function(
+ mtp_logits_, shift_labels_mtp.to(mtp_logits_.device).view(-1), self.config.vocab_size, **kwargs
+ )
+ if loss is not None:
+ loss += self.mtp_loss_scaling_factor * mtp_loss
+ else:
+ loss = self.mtp_loss_scaling_factor * mtp_loss
+
+ if all_mtp_loss is None:
+ all_mtp_loss = []
+ all_mtp_loss.append(mtp_loss)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ if output_router_logits:
+ output = (aux_loss,) + output
+ return (loss,) + output if loss is not None else output
+
+ return MoEV2_5CausalLMOutputWithPast(
+ loss=loss,
+ mtp_loss=all_mtp_loss,
+ aux_loss=aux_loss,
+ logits=logits,
+ mtp_logits=all_mtp_logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ router_logits=outputs.router_logits,
+ )
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..4c93b526a98839e853dca99127d8202d23bf5f85
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,30 @@
+{
+ "bos_token": {
+ "content": "<|startoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "cls_token": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+}
diff --git a/tokenizer.json b/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..e2e3b3ae9d33db880d47397059cc4f8f02010473
--- /dev/null
+++ b/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ce9d2d10f1d6da7b2439bc9655e51a00a8c5970f7dd015ae8407ca3962199f4
+size 12205770
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d78abecfa1f92cdb38eac6e16069da72b1e4bc4a
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,2114 @@
+{
+ "add_bos_token": false,
+ "add_eos_token": false,
+ "added_tokens_decoder": {
+ "156891": {
+ "content": "<|startoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156892": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156893": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156894": {
+ "content": "[gMASK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156895": {
+ "content": "<|role_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156896": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "156897": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "156898": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "156899": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "156900": {
+ "content": "<|fim_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156901": {
+ "content": "<|fim_hole|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156902": {
+ "content": "<|fim_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156903": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "156904": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "156905": {
+ "content": "<|reserved_token_10|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156906": {
+ "content": "<|reserved_token_11|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156907": {
+ "content": "<|reserved_token_12|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156908": {
+ "content": "<|reserved_token_13|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156909": {
+ "content": "<|reserved_token_14|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156910": {
+ "content": "<|reserved_token_15|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156911": {
+ "content": "<|reserved_token_16|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156912": {
+ "content": "<|reserved_token_17|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156913": {
+ "content": "<|reserved_token_18|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156914": {
+ "content": "<|reserved_token_19|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156915": {
+ "content": "<|reserved_token_20|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156916": {
+ "content": "<|reserved_token_21|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156917": {
+ "content": "<|reserved_token_22|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156918": {
+ "content": "<|reserved_token_23|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156919": {
+ "content": "<|reserved_token_24|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156920": {
+ "content": "<|reserved_token_25|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156921": {
+ "content": "<|reserved_token_26|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156922": {
+ "content": "<|reserved_token_27|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156923": {
+ "content": "<|reserved_token_28|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156924": {
+ "content": "<|reserved_token_29|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156925": {
+ "content": "<|reserved_token_30|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156926": {
+ "content": "<|reserved_token_31|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156927": {
+ "content": "<|reserved_token_32|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156928": {
+ "content": "<|reserved_token_33|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156929": {
+ "content": "<|reserved_token_34|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156930": {
+ "content": "<|reserved_token_35|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156931": {
+ "content": "<|reserved_token_36|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156932": {
+ "content": "<|reserved_token_37|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156933": {
+ "content": "<|reserved_token_38|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156934": {
+ "content": "<|reserved_token_39|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156935": {
+ "content": "<|reserved_token_40|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156936": {
+ "content": "<|reserved_token_41|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156937": {
+ "content": "<|reserved_token_42|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156938": {
+ "content": "<|reserved_token_43|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156939": {
+ "content": "<|reserved_token_44|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156940": {
+ "content": "<|reserved_token_45|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156941": {
+ "content": "<|reserved_token_46|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156942": {
+ "content": "<|reserved_token_47|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156943": {
+ "content": "<|reserved_token_48|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156944": {
+ "content": "<|reserved_token_49|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156945": {
+ "content": "<|reserved_token_50|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156946": {
+ "content": "<|reserved_token_51|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156947": {
+ "content": "<|reserved_token_52|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156948": {
+ "content": "<|reserved_token_53|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156949": {
+ "content": "<|reserved_token_54|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156950": {
+ "content": "<|reserved_token_55|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156951": {
+ "content": "<|reserved_token_56|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156952": {
+ "content": "<|reserved_token_57|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156953": {
+ "content": "<|reserved_token_58|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156954": {
+ "content": "<|reserved_token_59|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156955": {
+ "content": "<|reserved_token_60|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156956": {
+ "content": "<|reserved_token_61|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156957": {
+ "content": "<|reserved_token_62|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156958": {
+ "content": "<|reserved_token_63|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156959": {
+ "content": "<|reserved_token_64|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156960": {
+ "content": "<|reserved_token_65|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156961": {
+ "content": "<|reserved_token_66|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156962": {
+ "content": "<|reserved_token_67|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156963": {
+ "content": "<|reserved_token_68|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156964": {
+ "content": "<|reserved_token_69|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156965": {
+ "content": "<|reserved_token_70|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156966": {
+ "content": "<|reserved_token_71|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156967": {
+ "content": "<|reserved_token_72|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156968": {
+ "content": "<|reserved_token_73|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156969": {
+ "content": "<|reserved_token_74|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156970": {
+ "content": "<|reserved_token_75|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156971": {
+ "content": "<|reserved_token_76|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156972": {
+ "content": "<|reserved_token_77|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156973": {
+ "content": "<|reserved_token_78|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156974": {
+ "content": "<|reserved_token_79|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156975": {
+ "content": "<|reserved_token_80|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156976": {
+ "content": "<|reserved_token_81|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156977": {
+ "content": "<|reserved_token_82|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156978": {
+ "content": "<|reserved_token_83|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156979": {
+ "content": "<|reserved_token_84|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156980": {
+ "content": "<|reserved_token_85|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156981": {
+ "content": "<|reserved_token_86|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156982": {
+ "content": "<|reserved_token_87|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156983": {
+ "content": "<|reserved_token_88|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156984": {
+ "content": "<|reserved_token_89|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156985": {
+ "content": "<|reserved_token_90|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156986": {
+ "content": "<|reserved_token_91|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156987": {
+ "content": "<|reserved_token_92|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156988": {
+ "content": "<|reserved_token_93|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156989": {
+ "content": "<|reserved_token_94|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156990": {
+ "content": "<|reserved_token_95|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156991": {
+ "content": "<|reserved_token_96|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156992": {
+ "content": "<|reserved_token_97|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156993": {
+ "content": "<|reserved_token_98|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156994": {
+ "content": "<|reserved_token_99|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156995": {
+ "content": "<|reserved_token_100|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156996": {
+ "content": "<|reserved_token_101|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156997": {
+ "content": "<|reserved_token_102|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156998": {
+ "content": "<|reserved_token_103|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "156999": {
+ "content": "<|reserved_token_104|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157000": {
+ "content": "<|reserved_token_105|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157001": {
+ "content": "<|reserved_token_106|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157002": {
+ "content": "<|reserved_token_107|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157003": {
+ "content": "<|reserved_token_108|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157004": {
+ "content": "<|reserved_token_109|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157005": {
+ "content": "<|reserved_token_110|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157006": {
+ "content": "<|reserved_token_111|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157007": {
+ "content": "<|reserved_token_112|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157008": {
+ "content": "<|reserved_token_113|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157009": {
+ "content": "<|reserved_token_114|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157010": {
+ "content": "<|reserved_token_115|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157011": {
+ "content": "<|reserved_token_116|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157012": {
+ "content": "<|reserved_token_117|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157013": {
+ "content": "<|reserved_token_118|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157014": {
+ "content": "<|reserved_token_119|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157015": {
+ "content": "<|reserved_token_120|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157016": {
+ "content": "<|reserved_token_121|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157017": {
+ "content": "<|reserved_token_122|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157018": {
+ "content": "<|reserved_token_123|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157019": {
+ "content": "<|reserved_token_124|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157020": {
+ "content": "<|reserved_token_125|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157021": {
+ "content": "<|reserved_token_126|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157022": {
+ "content": "<|reserved_token_127|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157023": {
+ "content": "<|reserved_token_128|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157024": {
+ "content": "<|reserved_token_129|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157025": {
+ "content": "<|reserved_token_130|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157026": {
+ "content": "<|reserved_token_131|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157027": {
+ "content": "<|reserved_token_132|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157028": {
+ "content": "<|reserved_token_133|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157029": {
+ "content": "<|reserved_token_134|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157030": {
+ "content": "<|reserved_token_135|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157031": {
+ "content": "<|reserved_token_136|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157032": {
+ "content": "<|reserved_token_137|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157033": {
+ "content": "<|reserved_token_138|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157034": {
+ "content": "<|reserved_token_139|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157035": {
+ "content": "<|reserved_token_140|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157036": {
+ "content": "<|reserved_token_141|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157037": {
+ "content": "<|reserved_token_142|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157038": {
+ "content": "<|reserved_token_143|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157039": {
+ "content": "<|reserved_token_144|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157040": {
+ "content": "<|reserved_token_145|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157041": {
+ "content": "<|reserved_token_146|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157042": {
+ "content": "<|reserved_token_147|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157043": {
+ "content": "<|reserved_token_148|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157044": {
+ "content": "<|reserved_token_149|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157045": {
+ "content": "<|reserved_token_150|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157046": {
+ "content": "<|reserved_token_151|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157047": {
+ "content": "<|reserved_token_152|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157048": {
+ "content": "<|reserved_token_153|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157049": {
+ "content": "<|reserved_token_154|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157050": {
+ "content": "<|reserved_token_155|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157051": {
+ "content": "<|reserved_token_156|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157052": {
+ "content": "<|reserved_token_157|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157053": {
+ "content": "<|reserved_token_158|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157054": {
+ "content": "<|reserved_token_159|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157055": {
+ "content": "<|reserved_token_160|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157056": {
+ "content": "<|reserved_token_161|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157057": {
+ "content": "<|reserved_token_162|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157058": {
+ "content": "<|reserved_token_163|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157059": {
+ "content": "<|reserved_token_164|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157060": {
+ "content": "<|reserved_token_165|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157061": {
+ "content": "<|reserved_token_166|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157062": {
+ "content": "<|reserved_token_167|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157063": {
+ "content": "<|reserved_token_168|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157064": {
+ "content": "<|reserved_token_169|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157065": {
+ "content": "<|reserved_token_170|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157066": {
+ "content": "<|reserved_token_171|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157067": {
+ "content": "<|reserved_token_172|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157068": {
+ "content": "<|reserved_token_173|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157069": {
+ "content": "<|reserved_token_174|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157070": {
+ "content": "<|reserved_token_175|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157071": {
+ "content": "<|reserved_token_176|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157072": {
+ "content": "<|reserved_token_177|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157073": {
+ "content": "<|reserved_token_178|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157074": {
+ "content": "<|reserved_token_179|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157075": {
+ "content": "<|reserved_token_180|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157076": {
+ "content": "<|reserved_token_181|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157077": {
+ "content": "<|reserved_token_182|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157078": {
+ "content": "<|reserved_token_183|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157079": {
+ "content": "<|reserved_token_184|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157080": {
+ "content": "<|reserved_token_185|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157081": {
+ "content": "<|reserved_token_186|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157082": {
+ "content": "<|reserved_token_187|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157083": {
+ "content": "<|reserved_token_188|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157084": {
+ "content": "<|reserved_token_189|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157085": {
+ "content": "<|reserved_token_190|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157086": {
+ "content": "<|reserved_token_191|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157087": {
+ "content": "<|reserved_token_192|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157088": {
+ "content": "<|reserved_token_193|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157089": {
+ "content": "<|reserved_token_194|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157090": {
+ "content": "<|reserved_token_195|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157091": {
+ "content": "<|reserved_token_196|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157092": {
+ "content": "<|reserved_token_197|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157093": {
+ "content": "<|reserved_token_198|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157094": {
+ "content": "<|reserved_token_199|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157095": {
+ "content": "<|reserved_token_200|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157096": {
+ "content": "<|reserved_token_201|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157097": {
+ "content": "<|reserved_token_202|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157098": {
+ "content": "<|reserved_token_203|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157099": {
+ "content": "<|reserved_token_204|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157100": {
+ "content": "<|reserved_token_205|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157101": {
+ "content": "<|reserved_token_206|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157102": {
+ "content": "<|reserved_token_207|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157103": {
+ "content": "<|reserved_token_208|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157104": {
+ "content": "<|reserved_token_209|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157105": {
+ "content": "<|reserved_token_210|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157106": {
+ "content": "<|reserved_token_211|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157107": {
+ "content": "<|reserved_token_212|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157108": {
+ "content": "<|reserved_token_213|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157109": {
+ "content": "<|reserved_token_214|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157110": {
+ "content": "<|reserved_token_215|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157111": {
+ "content": "<|reserved_token_216|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157112": {
+ "content": "<|reserved_token_217|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157113": {
+ "content": "<|reserved_token_218|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157114": {
+ "content": "<|reserved_token_219|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157115": {
+ "content": "<|reserved_token_220|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157116": {
+ "content": "<|reserved_token_221|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157117": {
+ "content": "<|reserved_token_222|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157118": {
+ "content": "<|reserved_token_223|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157119": {
+ "content": "<|reserved_token_224|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157120": {
+ "content": "<|reserved_token_225|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157121": {
+ "content": "<|reserved_token_226|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157122": {
+ "content": "<|reserved_token_227|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157123": {
+ "content": "<|reserved_token_228|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157124": {
+ "content": "<|reserved_token_229|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157125": {
+ "content": "<|reserved_token_230|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157126": {
+ "content": "<|reserved_token_231|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157127": {
+ "content": "<|reserved_token_232|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157128": {
+ "content": "<|reserved_token_233|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157129": {
+ "content": "<|reserved_token_234|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157130": {
+ "content": "<|reserved_token_235|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157131": {
+ "content": "<|reserved_token_236|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157132": {
+ "content": "<|reserved_token_237|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157133": {
+ "content": "<|reserved_token_238|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157134": {
+ "content": "<|reserved_token_239|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157135": {
+ "content": "<|reserved_token_240|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157136": {
+ "content": "<|reserved_token_241|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157137": {
+ "content": "<|reserved_token_242|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157138": {
+ "content": "<|reserved_token_243|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157139": {
+ "content": "<|reserved_token_244|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157140": {
+ "content": "<|reserved_token_245|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157141": {
+ "content": "<|reserved_token_246|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157142": {
+ "content": "<|reserved_token_247|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157143": {
+ "content": "<|reserved_token_248|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157144": {
+ "content": "<|reserved_token_249|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157145": {
+ "content": "<|reserved_token_250|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157146": {
+ "content": "<|reserved_token_251|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157147": {
+ "content": "<|reserved_token_252|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157148": {
+ "content": "<|reserved_token_253|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157149": {
+ "content": "<|reserved_token_254|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157150": {
+ "content": "<|reserved_token_255|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157151": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "157152": {
+ "content": "",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<|startoftext|>",
+ "clean_up_tokenization_spaces": false,
+ "cls_token": "[CLS]",
+ "eos_token": "<|role_end|>",
+ "extra_special_tokens": {},
+ "fast_tokenizer": true,
+ "gmask_token": "[gMASK]",
+ "merges_file": null,
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<|endoftext|>",
+ "tokenizer_class": "PreTrainedTokenizerFast",
+ "trust_remote_code": true
+}