Zephyr271828 committed on
Commit
e4cea77
·
verified ·
1 Parent(s): 7c47b2e

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: transformers
4
+ ---
5
+
6
+ # SDAR
7
+
8
+ <div align="center">
9
+ <img src="https://raw.githubusercontent.com/JetAstra/SDAR/main/assets/SDAR_doc_head.png">
10
+
11
+
12
+ <div>&nbsp;</div>
13
+
14
+ [💻Github Repo](https://github.com/JetAstra/SDAR) • [🤗Model Collections](https://huggingface.co/collections/JetLM/sdar-689b1b6d392a4eeb2664f8ff)
15
+
16
+ </div>
17
+
18
+ # Introduction
19
+
20
+ **SDAR** (**S**ynergy of **D**iffusion and **A**uto**R**egression) model is a new large language model that integrates autoregressive (AR) and discrete diffusion modeling strategies. It combines the efficient training paradigm of AR models with the highly parallel inference capability of diffusion models, while delivering performance fully on par with SOTA open-source AR models. At the same time, SDAR sets a new benchmark as the most powerful diffusion language model to date. We highlight three major conclusions from our study:
21
+
22
+ > [!IMPORTANT]
23
+ > Take-home message
24
+ >
25
+ > - **Balanced Efficiency:** SDAR unifies the **efficient training** of AR models with the **parallel inference** of diffusion, achieving both fast training and inference.
26
+ > - **Fair Comparisons:** In rigorously controlled experiments, SDAR achieves **on-par general task performance** with strong AR baselines, ensuring credibility and reproducibility.
27
+ > - **Superior Learning Efficiency:** On complex scientific reasoning tasks (e.g., GPQA, ChemBench, Physics), SDAR shows **clear gains over AR models** of the same scale, approaching or even exceeding leading closed-source systems.
28
+
29
+ # Performance
30
+
31
+ ### SDAR vs. Qwen
32
+
33
+ For **SDAR** models, inference hyperparameters are set to: `block_length = 4`, `denoising_steps = 4`, greedy decoding.
34
+
35
+ For **Qwen3-1.7B-AR-SFT** and **Qwen3-30B-AR-SFT**, we use *greedy decoding*; the results for the base models **Qwen3-1.7B-Base** and **Qwen3-30B-Base** are taken from the [Qwen3 Technical Report](https://arxiv.org/abs/2505.09388).
36
+
37
+ <p align="center">
38
+ <img src="https://raw.githubusercontent.com/JetAstra/SDAR/main/assets/table1.png" style="max-width:80%; height:auto;">
39
+ </p>
40
+
41
+ ### SDAR-Sci vs. AR Baseline
42
+
43
+ This table presents a **controlled comparison** between AR and SDAR under the same backbone and dataset settings.
44
+ The results are averaged over 8 runs for GPQA, and over 32 runs each for AIME 2024, AIME 2025, and LiveMathBench.
45
+
46
+ <p align="center">
47
+ <img src="https://raw.githubusercontent.com/JetAstra/SDAR/main/assets/table2.png" style="max-width:80%; height:auto;">
48
+ </p>
49
+
50
+ #### SDAR-Sci vs. Other Models
51
+
52
+ This table positions **SDAR-30B-A3B-Sci(sample)** against leading open-source and closed-source LLMs.
53
+ Scores for external models are sourced from the [InternLM/Intern-S1](https://github.com/InternLM/Intern-S1) repository.
54
+
55
+ <p align="center">
56
+ <img src="https://raw.githubusercontent.com/JetAstra/SDAR/main/assets/table3.png" style="max-width:80%; height:auto;">
57
+ </p>
added_tokens.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<MASK>": 151669,
6
+ "<think>": 151667,
7
+ "<tool_call>": 151657,
8
+ "<tool_response>": 151665,
9
+ "<|box_end|>": 151649,
10
+ "<|box_start|>": 151648,
11
+ "<|endoftext|>": 151643,
12
+ "<|file_sep|>": 151664,
13
+ "<|fim_middle|>": 151660,
14
+ "<|fim_pad|>": 151662,
15
+ "<|fim_prefix|>": 151659,
16
+ "<|fim_suffix|>": 151661,
17
+ "<|im_end|>": 151645,
18
+ "<|im_start|>": 151644,
19
+ "<|image_pad|>": 151655,
20
+ "<|object_ref_end|>": 151647,
21
+ "<|object_ref_start|>": 151646,
22
+ "<|quad_end|>": 151651,
23
+ "<|quad_start|>": 151650,
24
+ "<|repo_name|>": 151663,
25
+ "<|video_pad|>": 151656,
26
+ "<|vision_end|>": 151653,
27
+ "<|vision_pad|>": 151654,
28
+ "<|vision_start|>": 151652
29
+ }
chat_template.jinja ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
27
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
28
+ {%- elif message.role == "assistant" %}
29
+ {%- set content = message.content %}
30
+ {%- set reasoning_content = '' %}
31
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
32
+ {%- set reasoning_content = message.reasoning_content %}
33
+ {%- else %}
34
+ {%- if '</think>' in message.content %}
35
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
36
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
37
+ {%- endif %}
38
+ {%- endif %}
39
+ {%- if loop.index0 > ns.last_query_index %}
40
+ {%- if loop.last or (not loop.last and reasoning_content) %}
41
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
42
+ {%- else %}
43
+ {{- '<|im_start|>' + message.role + '\n' + content }}
44
+ {%- endif %}
45
+ {%- else %}
46
+ {{- '<|im_start|>' + message.role + '\n' + content }}
47
+ {%- endif %}
48
+ {%- if message.tool_calls %}
49
+ {%- for tool_call in message.tool_calls %}
50
+ {%- if (loop.first and content) or (not loop.first) %}
51
+ {{- '\n' }}
52
+ {%- endif %}
53
+ {%- if tool_call.function %}
54
+ {%- set tool_call = tool_call.function %}
55
+ {%- endif %}
56
+ {{- '<tool_call>\n{"name": "' }}
57
+ {{- tool_call.name }}
58
+ {{- '", "arguments": ' }}
59
+ {%- if tool_call.arguments is string %}
60
+ {{- tool_call.arguments }}
61
+ {%- else %}
62
+ {{- tool_call.arguments | tojson }}
63
+ {%- endif %}
64
+ {{- '}\n</tool_call>' }}
65
+ {%- endfor %}
66
+ {%- endif %}
67
+ {{- '<|im_end|>\n' }}
68
+ {%- elif message.role == "tool" %}
69
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
70
+ {{- '<|im_start|>user' }}
71
+ {%- endif %}
72
+ {{- '\n<tool_response>\n' }}
73
+ {{- message.content }}
74
+ {{- '\n</tool_response>' }}
75
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
76
+ {{- '<|im_end|>\n' }}
77
+ {%- endif %}
78
+ {%- endif %}
79
+ {%- endfor %}
80
+ {%- if add_generation_prompt %}
81
+ {{- '<|im_start|>assistant\n' }}
82
+ {%- if enable_thinking is defined and enable_thinking is false %}
83
+ {{- '<think>\n\n</think>\n\n' }}
84
+ {%- endif %}
85
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SDARMTPForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_sdar_mtp.SDARMTPConfig",
7
+ "AutoModel": "modeling_sdar_mtp.SDARMTPModel",
8
+ "AutoModelForCausalLM": "modeling_sdar_mtp.SDARMTPForCausalLM"
9
+ },
10
+ "attention_bias": false,
11
+ "attention_dropout": 0.0,
12
+ "attn_implementation": "flex_attention",
13
+ "bos_token_id": 151643,
14
+ "debug": false,
15
+ "eos_token_id": 151643,
16
+ "block_size": 4,
17
+ "mask_token_id": 151669,
18
+ "ep_size": 1,
19
+ "fuse_cross_entropy": true,
20
+ "head_dim": 128,
21
+ "hidden_act": "silu",
22
+ "hidden_size": 2560,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 9728,
25
+ "max_position_embeddings": 32768,
26
+ "max_window_layers": 36,
27
+ "micro_forward": false,
28
+ "model_type": "sdar",
29
+ "num_attention_heads": 32,
30
+ "num_hidden_layers": 36,
31
+ "num_nextn_predict_layers": 1,
32
+ "num_key_value_heads": 8,
33
+ "rms_norm_eps": 1e-06,
34
+ "rope_scaling": null,
35
+ "rope_theta": 1000000,
36
+ "skip_checkpoint": false,
37
+ "sliding_window": null,
38
+ "tie_word_embeddings": false,
39
+ "torch_dtype": "bfloat16",
40
+ "transformers_version": "4.52.4",
41
+ "use_cache": false,
42
+ "use_deepep": false,
43
+ "use_sliding_window": false,
44
+ "vocab_size": 151936
45
+ }
configuration_sdar_mtp.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """SDAR model configuration"""
16
+
17
+ from transformers.configuration_utils import PretrainedConfig
18
+ from transformers.modeling_rope_utils import rope_config_validation
19
+ from transformers.utils import logging
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
+ class SDARMTPConfig(PretrainedConfig):
26
+ r"""
27
+ This is the configuration class to store the configuration of a [`SDARModel`]. It is used to instantiate a
28
+ SDAR model according to the specified arguments, defining the model architecture. Instantiating a configuration
29
+ with the defaults will yield a similar configuration to that of
30
+ SDAR-1.7B [DiffuOpen/SDAR-1.7B-Chat](https://huggingface.co/DiffuOpen/SDAR-1.7B-Chat/).
31
+
32
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
33
+ documentation from [`PretrainedConfig`] for more information.
34
+
35
+
36
+ Args:
37
+ vocab_size (`int`, *optional*, defaults to 151936):
38
+ Vocabulary size of the SDAR model. Defines the number of different tokens that can be represented by the
39
+ `inputs_ids` passed when calling [`SDARModel`]
40
+ hidden_size (`int`, *optional*, defaults to 4096):
41
+ Dimension of the hidden representations.
42
+ intermediate_size (`int`, *optional*, defaults to 22016):
43
+ Dimension of the MLP representations.
44
+ num_hidden_layers (`int`, *optional*, defaults to 36):
45
+ Number of hidden layers in the target model.
46
+ num_nextn_predict_layers (`int`, *optional*, defaults to 1):
47
+ Number of hidden layers in the MTP module.
48
+ num_attention_heads (`int`, *optional*, defaults to 32):
49
+ Number of attention heads for each attention layer in the Transformer encoder.
50
+ num_key_value_heads (`int`, *optional*, defaults to 32):
51
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
52
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
53
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
54
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
55
+ by mean-pooling all the original heads within that group. For more details, check out [this
56
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
57
+ head_dim (`int`, *optional*, defaults to 128):
58
+ The attention head dimension.
59
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
60
+ The non-linear activation function (function or string) in the decoder.
61
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
62
+ The maximum sequence length that this model might ever be used with.
63
+ initializer_range (`float`, *optional*, defaults to 0.02):
64
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
65
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
66
+ The epsilon used by the rms normalization layers.
67
+ use_cache (`bool`, *optional*, defaults to `True`):
68
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
69
+ relevant if `config.is_decoder=True`.
70
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
71
+ Whether the model's input and output word embeddings should be tied.
72
+ rope_theta (`float`, *optional*, defaults to 10000.0):
73
+ The base period of the RoPE embeddings.
74
+ rope_scaling (`Dict`, *optional*):
75
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
76
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
77
+ accordingly.
78
+ Expected contents:
79
+ `rope_type` (`str`):
80
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
81
+ 'llama3'], with 'default' being the original RoPE implementation.
82
+ `factor` (`float`, *optional*):
83
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
84
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
85
+ original maximum pre-trained length.
86
+ `original_max_position_embeddings` (`int`, *optional*):
87
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
88
+ pretraining.
89
+ `attention_factor` (`float`, *optional*):
90
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
91
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
92
+ `factor` field to infer the suggested value.
93
+ `beta_fast` (`float`, *optional*):
94
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
95
+ ramp function. If unspecified, it defaults to 32.
96
+ `beta_slow` (`float`, *optional*):
97
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
98
+ ramp function. If unspecified, it defaults to 1.
99
+ `short_factor` (`List[float]`, *optional*):
100
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
101
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
102
+ size divided by the number of attention heads divided by 2
103
+ `long_factor` (`List[float]`, *optional*):
104
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (>
105
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
106
+ size divided by the number of attention heads divided by 2
107
+ `low_freq_factor` (`float`, *optional*):
108
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
109
+ `high_freq_factor` (`float`, *optional*):
110
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
111
+ attention_bias (`bool`, *optional*, defaults to `False`):
112
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
113
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
114
+ Whether to use sliding window attention.
115
+ sliding_window (`int`, *optional*, defaults to 4096):
116
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
117
+ max_window_layers (`int`, *optional*, defaults to 28):
118
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
119
+ attention_dropout (`float`, *optional*, defaults to 0.0):
120
+ The dropout ratio for the attention probabilities.
121
+
122
+ ```python
123
+ >>> from transformers import SDARModel, SDARConfig
124
+
125
+ >>> # Initializing a SDAR style configuration
126
+ >>> configuration = SDARConfig()
127
+
128
+ >>> # Initializing a model from the SDAR-8B style configuration
129
+ >>> model = SDARModel(configuration)
130
+
131
+ >>> # Accessing the model configuration
132
+ >>> configuration = model.config
133
+ ```"""
134
+
135
+ model_type = "sdar"
136
+ keys_to_ignore_at_inference = ["past_key_values"]
137
+
138
+ # Default tensor parallel plan for base model `SDAR`
139
+ base_model_tp_plan = {
140
+ "layers.*.self_attn.q_proj": "colwise",
141
+ "layers.*.self_attn.k_proj": "colwise",
142
+ "layers.*.self_attn.v_proj": "colwise",
143
+ "layers.*.self_attn.o_proj": "rowwise",
144
+ "layers.*.mlp.gate_proj": "colwise",
145
+ "layers.*.mlp.up_proj": "colwise",
146
+ "layers.*.mlp.down_proj": "rowwise",
147
+ }
148
+ base_model_pp_plan = {
149
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
150
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
151
+ "norm": (["hidden_states"], ["hidden_states"]),
152
+ }
153
+
154
+ def __init__(
155
+ self,
156
+ vocab_size=151936,
157
+ hidden_size=4096,
158
+ intermediate_size=22016,
159
+ num_hidden_layers=36,
160
+ num_nextn_predict_layers=1,
161
+ num_attention_heads=32,
162
+ num_key_value_heads=32,
163
+ head_dim=128,
164
+ hidden_act="silu",
165
+ max_position_embeddings=32768,
166
+ initializer_range=0.02,
167
+ rms_norm_eps=1e-6,
168
+ use_cache=True,
169
+ tie_word_embeddings=False,
170
+ rope_theta=10000.0,
171
+ rope_scaling=None,
172
+ attention_bias=False,
173
+ use_sliding_window=False,
174
+ sliding_window=4096,
175
+ max_window_layers=28,
176
+ attention_dropout=0.0,
177
+ **kwargs,
178
+ ):
179
+ self.vocab_size = vocab_size
180
+ self.max_position_embeddings = max_position_embeddings
181
+ self.hidden_size = hidden_size
182
+ self.intermediate_size = intermediate_size
183
+ self.num_hidden_layers = num_hidden_layers
184
+ self.num_nextn_predict_layers = num_nextn_predict_layers
185
+ self.num_attention_heads = num_attention_heads
186
+ self.use_sliding_window = use_sliding_window
187
+ self.sliding_window = sliding_window # we check `use_sliding_window` in the modeling code
188
+ self.max_window_layers = max_window_layers
189
+
190
+ # for backward compatibility
191
+ if num_key_value_heads is None:
192
+ num_key_value_heads = num_attention_heads
193
+
194
+ self.num_key_value_heads = num_key_value_heads
195
+ self.head_dim = head_dim
196
+ self.hidden_act = hidden_act
197
+ self.initializer_range = initializer_range
198
+ self.rms_norm_eps = rms_norm_eps
199
+ self.use_cache = use_cache
200
+ self.rope_theta = rope_theta
201
+ self.rope_scaling = rope_scaling
202
+ self.attention_bias = attention_bias
203
+ self.attention_dropout = attention_dropout
204
+ # Validate the correctness of rotary position embeddings parameters
205
+ # BC: if there is a 'type' field, move it to 'rope_type'.
206
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
207
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
208
+ rope_config_validation(self)
209
+
210
+ super().__init__(
211
+ tie_word_embeddings=tie_word_embeddings,
212
+ **kwargs,
213
+ )
214
+
215
+
216
+ __all__ = ["SDARMTPConfig"]
fused_linear_diffusion_cross_entropy.py ADDED
@@ -0,0 +1,682 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Code adapted from
4
+ # https://github.com/fla-org/flash-linear-attention/blob/main/fla/modules/fused_linear_cross_entropy.py
5
+ # Implementation of element-wise division of cross entropy loss
6
+
7
+
8
+ # Code adapted from
9
+ # https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/fused_linear_cross_entropy.py
10
+
11
+ from functools import partial
12
+ from typing import Optional, Tuple
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.nn.functional as F
17
+ import triton
18
+ import triton.language as tl
19
+ from torch.distributed import DeviceMesh
20
+ from torch.distributed.tensor import DTensor, Replicate, Shard, distribute_module
21
+ from torch.distributed.tensor.parallel import ParallelStyle
22
+
23
+ # The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576
24
+ # https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
25
+ # However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
26
+ # The optimal maximum block size depends on your hardware, your kernel, and your dtype
27
+ MAX_FUSED_SIZE = 65536 // 2
28
+
29
+
30
+ @triton.heuristics({
31
+ 'HAS_SCALE': lambda args: args['scale'] is not None
32
+ })
33
+ @triton.autotune(
34
+ configs=[
35
+ triton.Config({}, num_warps=num_warps)
36
+ for num_warps in [1, 2, 4, 8, 16, 32]
37
+ ],
38
+ key=['D']
39
+ )
40
+ @triton.jit
41
+ def logsumexp_fwd_kernel(
42
+ x,
43
+ z,
44
+ scale,
45
+ D: tl.constexpr,
46
+ B: tl.constexpr,
47
+ HAS_SCALE: tl.constexpr
48
+ ):
49
+ i_n, i_d = tl.program_id(0).to(tl.int64), tl.program_id(1).to(tl.int64)
50
+ o_d = i_d * B + tl.arange(0, B)
51
+ m_d = o_d < D
52
+
53
+ b_x = tl.load(x + i_n * D + o_d, mask=m_d, other=-float('inf'))
54
+ if HAS_SCALE:
55
+ b_x = b_x * scale
56
+ b_m = tl.max(b_x, 0)
57
+ b_z = tl.log(tl.sum(tl.exp(b_x - b_m), 0)) + b_m
58
+ tl.store(z + i_n * tl.cdiv(D, B) + i_d, b_z)
59
+
60
+
61
+ def logsumexp_fwd(
62
+ x,
63
+ scale: Optional[float] = None,
64
+ dtype: Optional[torch.dtype] = None
65
+ ):
66
+ r"""
67
+ Compute the logsumexp of the input tensor over the last dimension.
68
+
69
+ Args:
70
+ x (Tensor):
71
+ The input tensor of any shape.
72
+ scale (Optional[float]):
73
+ The scale applied to the input tensor. Default: `None`.
74
+ dtype (Optional[torch.dtype]):
75
+ The data type of the output tensor. Default: `None`.
76
+ Returns:
77
+ Tensor: The logsumexp of the input tensor.
78
+ """
79
+
80
+ shape = x.shape
81
+ x = x.view(-1, shape[-1])
82
+ N, D = x.shape
83
+ B = min(triton.next_power_of_2(D), 64 * 1024)
84
+ ND = triton.cdiv(D, B)
85
+
86
+ z = x.new_empty(N, ND, dtype=torch.float)
87
+ logsumexp_fwd_kernel[(N, ND)](
88
+ x=x,
89
+ z=z,
90
+ scale=scale,
91
+ D=D,
92
+ B=B
93
+ )
94
+ z = z.logsumexp(-1).view(*shape[:-1])
95
+ if dtype is not None and dtype != torch.float:
96
+ z = z.to(dtype)
97
+ return z
98
+
99
+ @triton.jit
100
+ def cross_entropy_kernel(
101
+ logits,
102
+ lse,
103
+ target,
104
+ p_mask,
105
+ loss,
106
+ total,
107
+ ignore_index,
108
+ label_smoothing: tl.constexpr,
109
+ logit_scale: tl.constexpr,
110
+ reduction: tl.constexpr,
111
+ V: tl.constexpr,
112
+ BV: tl.constexpr
113
+ ):
114
+ """
115
+ This kernel computes both cross entropy loss and the gradient of the input.
116
+ We only consider hard label + mean reduction for now.
117
+ Please refer to https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html for the math.
118
+
119
+ Args:
120
+ logits:
121
+ Pointer to logits tensor.
122
+ lse:
123
+ Pointer to logsumexp tensor.
124
+ target: Pointer to target tensor.
125
+ loss:
126
+ Pointer to tensor to store the loss.
127
+ V (int):
128
+ The number of columns in the input tensor.
129
+ total (int):
130
+ The number of non-ignored targets (entries of `target` not equal to `ignore_index`).
131
+ ignore_index (int):
132
+ The index to ignore in the target.
133
+ label_smoothing (float):
134
+ The amount of smoothing when computing the loss, where 0.0 means no smoothing.
135
+ reduction (str):
136
+ The string for the reduction to apply
137
+ BV (int):
138
+ The block size for vocab.
139
+ """
140
+
141
+ # https://github.com/triton-lang/triton/issues/1058
142
+ # If B*T*V is too large, i_n * stride will overflow out of int32, so we convert to int64
143
+ i_n = tl.program_id(0).to(tl.int64)
144
+ NV = tl.cdiv(V, BV)
145
+
146
+ # 1. Load target first because if the target is ignore_index, we can return right away
147
+ b_y = tl.load(target + i_n)
148
+ # load p_mask
149
+ b_p_mask = tl.load(p_mask + i_n)
150
+
151
+ # 2. locate the start index
152
+ logits += i_n * V
153
+
154
+ if b_y == ignore_index:
155
+ # set all x as 0
156
+ for i in range(0, V, BV):
157
+ o_v = i + tl.arange(0, BV)
158
+ tl.store(logits + o_v, 0.0, mask=o_v < V)
159
+ return
160
+
161
+ # Online softmax: 2 loads + 1 store (compared with 3 loads + 1 store for the safe softmax)
162
+ # Refer to Algorithm 3 in the paper: https://arxiv.org/pdf/1805.02867
163
+
164
+ # 3. [Online softmax] first pass: compute logsumexp
165
+ # we did this in another kernel
166
+ b_l = tl.load(logits + b_y) * logit_scale
167
+ b_lse = tl.load(lse + i_n)
168
+
169
+ # 4. Calculate the loss
170
+ # loss = lse - logits_l
171
+ # celoss = -log(q_y) = -log(softmax(x_y))
172
+ b_loss = (b_lse - b_l) / b_p_mask # Diffusion Scaled '1/t'
173
+
174
+ # Label smoothing is a general case of normal cross entropy
175
+ # See the full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issue-2503665310
176
+ b_z = 0.0
177
+ eps = label_smoothing / V
178
+
179
+ # We need tl.debug_barrier() as mentioned in
180
+ # https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/ops/cross_entropy.py#L34
181
+ tl.debug_barrier()
182
+
183
+ # 5. [Online Softmax] Second pass: compute gradients
184
+ # For 'mean' reduction, gradients are normalized by number of non-ignored elements
185
+ # dx_y = (softmax(x_y) - 1) / N
186
+ # dx_i = softmax(x_i) / N, i != y
187
+ # For label smoothing:
188
+ # dx_i = (softmax(x_i) - label_smoothing / V) / N, i != y
189
+ # dx_y = (softmax(x_y) - label_smoothing / V - (1 - label_smoothing)) / N
190
+ # = dx_i - (1 - label_smoothing) / N
191
+ for iv in range(0, NV):
192
+ o_v = iv * BV + tl.arange(0, BV)
193
+ b_logits = tl.load(logits + o_v, mask=o_v < V, other=float('-inf')) * logit_scale
194
+ if label_smoothing > 0:
195
+ # scale X beforehand to avoid overflow
196
+ b_z += tl.sum(tl.where(o_v < V, -eps * b_logits, 0.0))
197
+ b_p = (tl.exp(b_logits - b_lse) - eps) * logit_scale
198
+ b_p /= b_p_mask # modified: also scale the gradient by 1 / p_mask
199
+ if reduction == "mean":
200
+ b_p = b_p / total
201
+ tl.store(logits + o_v, b_p, mask=o_v < V)
202
+
203
+ tl.debug_barrier()
204
+
205
+ # Original loss = H(q, p), with label smoothing regularization = H(q', p) and (label_smoothing / V) = eps
206
+ # H(q', p) = (1 - label_smoothing) * H(q, p) + label_smoothing * H(u, p)
207
+ # = (1 - label_smoothing) * H(q, p) + eps * sum(logsoftmax(x_i))
208
+ # By using m (global max of xi) and d (sum of e^(xi-m)), we can simplify as:
209
+ # = (1 - label_smoothing) * H(q, p) + (-sum(x_i * eps) + label_smoothing * (m + logd))
210
+ # Refer to H(q', p) in section 7 of the paper:
211
+ # https://arxiv.org/pdf/1512.00567
212
+ # pytorch:
213
+ # https://github.com/pytorch/pytorch/blob/2981534f54d49fa3a9755c9b0855e7929c2527f0/aten/src/ATen/native/LossNLL.cpp#L516
214
+ # See full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issuecomment-2333753087
215
+ if label_smoothing > 0:
216
+ b_loss = b_loss * (1 - label_smoothing) + (b_z + label_smoothing * b_lse)
217
+
218
+ # 6. Specially handle the i==y case where `dx_y = (softmax(x_y) - (1 - label_smoothing)) / N`
219
+ b_l = tl.load(logits + b_y)
220
+
221
+ # Normalize the loss by the number of non-ignored elements if reduction is "mean"
222
+ if reduction == 'mean':
223
+ b_loss = b_loss / total
224
+ # b_l += (label_smoothing - 1) / total * logit_scale
225
+ # b_l has already been divided by b_p_mask and total
226
+ b_l += (label_smoothing - 1) / b_p_mask / total * logit_scale
227
+ else:
228
+ # b_l += (label_smoothing - 1) * logit_scale
229
+ b_l += (label_smoothing - 1) / b_p_mask * logit_scale
230
+
231
+ tl.store(loss + i_n, b_loss)
232
+ tl.store(logits + b_y, b_l)
233
+
234
+
235
+ @triton.jit
236
+ def elementwise_mul_kernel(
237
+ x,
238
+ g,
239
+ N: tl.constexpr,
240
+ B: tl.constexpr
241
+ ):
242
+ """
243
+ This function multiplies each element of the tensor pointed by x with the value pointed by g.
244
+ The multiplication is performed in-place on the tensor pointed by x.
245
+
246
+ Parameters:
247
+ x:
248
+ Pointer to the input tensor.
249
+ g:
250
+ Pointer to the gradient output value.
251
+ N (int):
252
+ The number of columns in the input tensor.
253
+ B (int):
254
+ The block size for Triton operations.
255
+ """
256
+
257
+ # Get the program ID and convert it to int64 to avoid overflow
258
+ i_x = tl.program_id(0).to(tl.int64)
259
+ o_x = i_x * B + tl.arange(0, B)
260
+
261
+ # Load the gradient output value
262
+ b_g = tl.load(g)
263
+ b_x = tl.load(x + o_x, mask=o_x < N)
264
+ tl.store(x + o_x, b_x * b_g, mask=o_x < N)
265
+
266
+
267
def fused_linear_cross_entropy_forward(
    x: torch.Tensor,
    target: torch.LongTensor,
    weight: torch.Tensor,
    bias: torch.Tensor = None,
    p_mask: torch.Tensor = None,
    ignore_index: int = -100,
    label_smoothing: float = 0.0,
    logit_scale: float = 1.0,
    num_chunks: int = 8,
    reduction: str = "mean"
):
    """
    Chunked linear projection + cross entropy that also produces the gradients.

    The rows of `x` are processed chunk by chunk so that only a `[C, V]` slice of the logits
    is materialized at a time. For every chunk the logits are computed with `F.linear`, their
    log-sum-exp is taken in fp32, and `cross_entropy_kernel` writes the per-row loss while
    replacing the logits in place with their gradient. The gradients w.r.t. `x`, `weight`
    and `bias` are then derived from that in-place result.

    Args:
        x (torch.Tensor): [N, H] input rows.
        target (torch.LongTensor): [N] class indices.
        weight (torch.Tensor): [V, H] projection weight.
        bias (Optional[torch.Tensor]): [V] projection bias, or None.
        p_mask (Optional[torch.Tensor]): [N] per-row values handed to the kernel.
            When None, a tensor of ones is used.
        ignore_index (int): rows whose target equals this value are excluded from `total`.
        label_smoothing (float): forwarded to the kernel.
        logit_scale (float): forwarded to the log-sum-exp and to the kernel.
        num_chunks (int): upper bound on the initial number of chunks.
        reduction (str): forwarded to the kernel ('mean' | 'sum').

    Returns:
        Tuple `(loss, dx, dw, db)`: the sum of the per-row losses, the gradient for `x`
        ([N, H]), the gradient for `weight` cast back to `weight`'s dtype ([V, H]), and the
        gradient for `bias` cast back to `bias`'s dtype ([V]) or None when `bias` is None.
    """
    device = x.device
    # inputs have shape: [N, H]
    # materialized activations will have shape: [N, V]
    # the increase in memory = [N, V]
    # reduction can be achieved by partitioning the number of tokens N into smaller chunks.

    # ideally, we would like to achieve the same memory consumption as [N, H],
    # so the expected chunk size should be:
    # NC = ceil(V / H)
    # C = ceil(N / NC)
    # for ex: N = 4096*4, V = 32000, H = 4096 ==> NC = 8, C = ceil(N / NC) = 2048
    N, H, V = *x.shape, weight.shape[0]
    BV = min(MAX_FUSED_SIZE, triton.next_power_of_2(V))
    # TODO: in real cases, we may need to limit the number of chunks NC to
    # ensure the precisions of accumulated gradients
    NC = min(num_chunks, triton.cdiv(V, H))
    if N > 0:
        C = triton.next_power_of_2(triton.cdiv(N, NC))
        NC = triton.cdiv(N, C)
    else:
        # Empty input: skip the chunk loop entirely instead of dividing by a zero chunk size.
        C, NC = 0, 0

    # `p_mask` is optional in the signature; fall back to a neutral all-ones mask so the
    # per-chunk slicing below always has a tensor to work with.
    if p_mask is None:
        p_mask = torch.ones(N, device=device, dtype=torch.float)

    # [N, H]
    dx = torch.zeros_like(x, device=device)
    # [V, H]
    dw = torch.zeros_like(weight, device=device, dtype=torch.float) if weight is not None else None
    # [V]
    db = torch.zeros_like(bias, device=device, dtype=torch.float) if bias is not None else None
    # [N]
    loss = torch.zeros(N, device=device, dtype=torch.float)

    # number of rows whose target is not `ignore_index`
    total = target.ne(ignore_index).sum().item()

    for ic in range(NC):
        start, end = ic * C, min((ic + 1) * C, N)
        # [C, H]
        c_x = x[start:end]
        # when doing matmul, use the original precision
        # [C, V]
        c_logits = F.linear(c_x, weight, bias)
        c_target = target[start:end]
        c_p_mask = p_mask[start:end]
        # [C]
        # keep lse in fp32 to maintain precision
        c_lse = logsumexp_fwd(c_logits, scale=logit_scale, dtype=torch.float)

        # unreduced loss (a view into `loss`, so the kernel fills `loss` directly)
        c_loss = loss[start:end]

        # Here we calculate the gradient of c_logits in place so we can save memory.
        cross_entropy_kernel[(c_logits.shape[0],)](
            logits=c_logits,
            lse=c_lse,
            target=c_target,
            p_mask=c_p_mask,
            loss=c_loss,
            total=total,
            ignore_index=ignore_index,
            label_smoothing=label_smoothing,
            logit_scale=logit_scale,
            reduction=reduction,
            V=V,
            BV=BV,
            num_warps=32
        )

        # gradient of logits is computed in-place by the above triton kernel and is of shape: C x V
        # thus dx should be of shape: C x H
        dx[start:end] = torch.mm(c_logits, weight)

        # keep dw in fp32 to maintain precision
        if weight is not None:
            dw += c_logits.t() @ c_x

        if bias is not None:
            torch.add(input=db, other=c_logits.sum(0), out=db)

    loss = loss.sum()
    if dw is not None:
        dw = dw.to(weight)
    if db is not None:
        db = db.to(bias)
    return loss, dx, dw, db
359
+
360
+
361
def fused_linear_cross_entropy_backward(
    do: torch.Tensor,
    dx: torch.Tensor,
    dw: torch.Tensor,
    db: torch.Tensor
):
    """
    Scale the precomputed gradients by the upstream gradient `do`.

    `dx`, `dw` and `db` are multiplied in place by the value stored in `do` using
    `elementwise_mul_kernel`. `dw` and `db` may be None, in which case they are skipped.
    The same three objects are returned.
    """
    # If cross entropy is the last layer, do is 1.0. Skip the mul to save time
    if not torch.ne(do, torch.tensor(1.0, device=do.device)):
        return dx, dw, db

    # We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place
    # for gradient storage and backward multiple times causes anomalies with PyTorch but not with Triton.
    num_rows, hidden = dx.shape
    # the tile size is derived from dx's hidden size and reused for every buffer
    tile = min(MAX_FUSED_SIZE, triton.next_power_of_2(hidden))

    def _scale_in_place(buf, numel):
        # one program per tile of `tile` elements
        elementwise_mul_kernel[(triton.cdiv(numel, tile),)](
            x=buf,
            g=do,
            N=numel,
            B=tile,
            num_warps=32,
        )

    _scale_in_place(dx, num_rows * hidden)

    # handle dw
    if dw is not None:
        vocab, w_cols = dw.shape
        _scale_in_place(dw, vocab * w_cols)

    if db is not None:
        _scale_in_place(db, db.shape[0])

    return dx, dw, db
403
+
404
+
405
class FusedLinearCrossEntropyFunction(torch.autograd.Function):
    """Autograd wrapper whose forward pass already computes the gradients it will return."""

    @staticmethod
    def forward(
        ctx,
        x: torch.Tensor,
        target: torch.LongTensor,
        weight: torch.Tensor,
        bias: torch.Tensor = None,
        p_mask: torch.Tensor = None,
        ignore_index: int = -100,
        label_smoothing: float = 0.0,
        logit_scale: float = 1.0,
        num_chunks: int = 8,
        reduction: str = "mean"
    ):
        """
        Fused final linear layer + cross-entropy loss.
        Reference: https://github.com/mgmalek/efficient_cross_entropy

        The loss and the gradients w.r.t. `x`, `weight` and `bias` are all produced by
        `fused_linear_cross_entropy_forward` during this call; only those gradients are
        saved on `ctx`, so neither `x` nor `target` has to be kept for the backward pass.

        x (torch.Tensor): [batch_size * seq_len, hidden_size]
        target (torch.LongTensor): [batch_size * seq_len]
            where each value is in [0, vocab_size).
        weight (torch.Tensor): [vocab_size, hidden_size]
            where `vocab_size` is the number of classes.
        bias (Optional[torch.Tensor]): [vocab_size]
            where `vocab_size` is the number of classes.
        p_mask (torch.Tensor): [batch_size * seq_len]
            Same shape as `target`.
        ignore_index:
            the index to ignore in the target.
        label_smoothing:
            the amount of smoothing when computing the loss, where 0.0 means no smoothing.
        logit_scale:
            A scaling factor applied to the logits. Default: 1.0
        num_chunks:
            The number of chunks to split the input tensor into for processing.
            Default: 8
        reduction:
            Specifies the reduction to apply to the output: 'mean' | 'sum'.
            Default: 'mean'.
        """
        loss, dx, dw, db = fused_linear_cross_entropy_forward(
            x=x,
            target=target,
            weight=weight,
            bias=bias,
            p_mask=p_mask,
            ignore_index=ignore_index,
            label_smoothing=label_smoothing,
            logit_scale=logit_scale,
            num_chunks=num_chunks,
            reduction=reduction,
        )
        # stash the ready-made gradients for backward
        saved_dw = dw.detach() if weight is not None else None
        saved_db = db.detach() if bias is not None else None
        ctx.save_for_backward(dx.detach(), saved_dw, saved_db)
        return loss

    @staticmethod
    def backward(ctx, do):
        """Scale the saved gradients by `do` and return one entry per `forward` argument."""
        saved_dx, saved_dw, saved_db = ctx.saved_tensors
        dx, dw, db = fused_linear_cross_entropy_backward(do, saved_dx, saved_dw, saved_db)
        # `forward` takes 10 arguments after `ctx`: x, target, weight, bias, then six
        # further arguments (p_mask and the hyper-parameters) that receive no gradient.
        return (dx, None, dw, db) + (None,) * 6
482
+
483
+
484
def fused_linear_cross_entropy_loss(
    x: torch.Tensor,
    target: torch.LongTensor,
    weight: torch.Tensor,
    bias: torch.Tensor = None,
    p_mask: torch.Tensor = None,
    ignore_index: int = -100,
    label_smoothing: float = 0.0,
    logit_scale: float = 1.0,
    num_chunks: int = 8,
    reduction: str = "mean"
) -> torch.Tensor:
    """
    Functional entry point for the fused linear + cross-entropy loss.

    All arguments are forwarded positionally, in declaration order, to
    `FusedLinearCrossEntropyFunction.apply`.

    Args:
        x (torch.Tensor): [batch_size * seq_len, hidden_size]
        target (torch.LongTensor): [batch_size * seq_len]
            where each value is in [0, vocab_size).
        weight (torch.Tensor): [vocab_size, hidden_size]
            where `vocab_size` is the number of classes.
        bias (Optional[torch.Tensor]): [vocab_size]
            where `vocab_size` is the number of classes.
        p_mask (torch.Tensor): [batch_size * seq_len]
            Its shape should be the same as `target`.
        ignore_index: int.
            If target == ignore_index, the loss is set to 0.0.
        label_smoothing: float
            The amount of smoothing when computing the loss, where 0.0 means no smoothing.
        logit_scale: float
            A scaling factor applied to the logits. Default: 1.0
        num_chunks: int
            The number of chunks to split the input tensor into for processing.
            This can help optimize memory usage and computation speed.
            Default: 8
        reduction:
            Specifies the reduction to apply to the output: 'mean' | 'sum'.
            'mean': the weighted mean of the output is taken,
            'sum': the output will be summed.
            Default: 'mean'.
    Returns:
        The single loss tensor produced by `FusedLinearCrossEntropyFunction` (the per-token
        losses are already summed inside the forward pass, so this is not a per-sample vector).
    """
    return FusedLinearCrossEntropyFunction.apply(
        x,
        target,
        weight,
        bias,
        p_mask,
        ignore_index,
        label_smoothing,
        logit_scale,
        num_chunks,
        reduction
    )
536
+
537
+
538
+ class FusedLinearDiffusionCrossEntropyLoss(nn.Module):
539
+
540
+ def __init__(
541
+ self,
542
+ ignore_index: int = -100,
543
+ label_smoothing: float = 0.0,
544
+ logit_scale: float = 1.0,
545
+ num_chunks: int = 8,
546
+ reduction: str = "mean"
547
+ ):
548
+ """
549
+ Args:
550
+ ignore_index: int.
551
+ If target == ignore_index, the loss is set to 0.0.
552
+ label_smoothing: float
553
+ logit_scale: float
554
+ A scaling factor applied to the logits. Default: 1.0
555
+ num_chunks: int
556
+ The number of chunks to split the input tensor into for processing.
557
+ This can help optimize memory usage and computation speed.
558
+ Default: 8
559
+ reduction:
560
+ Specifies the reduction to apply to the output: 'mean' | 'sum'.
561
+ 'mean': the weighted mean of the output is taken,
562
+ 'sum': the output will be summed.
563
+ Default: 'mean'.
564
+ """
565
+ super().__init__()
566
+
567
+ assert reduction in ["mean", "sum"], f"reduction: {reduction} is not supported"
568
+
569
+ self.ignore_index = ignore_index
570
+ self.label_smoothing = label_smoothing
571
+ self.logit_scale = logit_scale
572
+ self.num_chunks = num_chunks
573
+ self.reduction = reduction
574
+
575
+ @torch.compiler.disable
576
+ def forward(
577
+ self,
578
+ x: torch.Tensor,
579
+ target: torch.LongTensor,
580
+ weight: torch.Tensor,
581
+ bias: Optional[torch.Tensor] = None,
582
+ p_mask: torch.Tensor = None
583
+ ):
584
+ """
585
+ Args:
586
+ x (torch.Tensor): [batch_size, seq_len, hidden_size]
587
+ target (torch.LongTensor): [batch_size, seq_len]
588
+ where each value is in [0, V).
589
+ weight (torch.Tensor): [vocab_size, hidden_size]
590
+ where `vocab_size` is the number of classes.
591
+ bias (Optional[torch.Tensor]): [vocab_size]
592
+ where `vocab_size` is the number of classes.
593
+ p_mask(torch.Tensor): [batch_size, seq_len]
594
+ Its shape is same as target.
595
+ Shape: (1, packed_length) when varlen attn is used.
596
+ Returns:
597
+ loss
598
+
599
+ TODO:
600
+ follow https://github.com/ML-GSAI/LLaDA/blob/main/GUIDELINES.md#pre-training
601
+ ```py
602
+ unreduced_loss /= p_mask
603
+ ```
604
+ Scale the values of `unreduced_loss at different positions
605
+ """
606
+ if p_mask is None:
607
+ p_mask = torch.ones_like(target, dtype=torch.float, device=x.device)
608
+
609
+ x = x.contiguous().view(-1, x.shape[-1])
610
+ target = target.contiguous().view(-1)
611
+ weight = weight.contiguous()
612
+ bias = bias.contiguous() if bias else None
613
+ p_mask = p_mask.contiguous().view(-1)
614
+ l, d = x.shape
615
+ assert l == target.shape[0] == p_mask.shape[0], f"{x.shape=}, {target.shape=}, {p_mask.shape=}"
616
+
617
+ loss = fused_linear_cross_entropy_loss(
618
+ x,
619
+ target,
620
+ weight=weight,
621
+ bias=bias,
622
+ p_mask=p_mask,
623
+ ignore_index=self.ignore_index,
624
+ label_smoothing=self.label_smoothing,
625
+ logit_scale=self.logit_scale,
626
+ num_chunks=self.num_chunks,
627
+ reduction=self.reduction
628
+ )
629
+ return loss
630
+
631
+
632
+ class LinearLossParallel(ParallelStyle):
633
+ def __init__(
634
+ self,
635
+ *,
636
+ sequence_dim: int = 1,
637
+ use_local_output: bool = False,
638
+ ):
639
+ super().__init__()
640
+
641
+ self.sequence_sharding = (Shard(sequence_dim),)
642
+ self.use_local_output = use_local_output
643
+
644
+ @staticmethod
645
+ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh):
646
+ x, target, weight, bias = inputs
647
+
648
+ if not isinstance(x, DTensor):
649
+ # assume the input passed in already sharded on the sequence dim and create the DTensor
650
+ x = DTensor.from_local(x, device_mesh, sequence_sharding)
651
+ if x.placements != sequence_sharding:
652
+ x = x.redistribute(placements=sequence_sharding, async_op=True)
653
+ if not isinstance(target, DTensor):
654
+ target = DTensor.from_local(target, device_mesh, [Replicate()])
655
+ if target.placements != sequence_sharding:
656
+ target = target.redistribute(placements=sequence_sharding, async_op=True)
657
+
658
+ if not isinstance(weight, DTensor):
659
+ weight = DTensor.from_local(weight, device_mesh, [Replicate()])
660
+ if weight.placements != [Replicate()]:
661
+ # we replicate the weight/bias in FLCE
662
+ weight = weight.redistribute(placements=[Replicate()], async_op=True)
663
+
664
+ if bias is not None and not isinstance(bias, DTensor):
665
+ bias = DTensor.from_local(bias, device_mesh, [Replicate()])
666
+ if bias is not None and bias.placements != [Replicate()]:
667
+ bias = bias.redistribute(placements=[Replicate()], async_op=True)
668
+
669
+ return x.to_local(), target.to_local(), weight.to_local(), bias.to_local() if bias is not None else bias
670
+
671
+ @staticmethod
672
+ def _prepare_output_fn(use_local_output, mod, outputs, device_mesh):
673
+ return outputs.to_local() if use_local_output else outputs
674
+
675
+ def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
676
+ return distribute_module(
677
+ module,
678
+ device_mesh,
679
+ partition_fn=None,
680
+ input_fn=partial(self._prepare_input_fn, self.sequence_sharding),
681
+ output_fn=partial(self._prepare_output_fn, self.use_local_output)
682
+ )
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.6,
10
+ "top_k": 20,
11
+ "top_p": 0.95,
12
+ "transformers_version": "4.51.0"
13
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46babb47657b7f43bf059f0368b9159bfa4a6b99aa1fee5da47c1fea73b1d3c1
3
+ size 4967215360
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd369bed554afeb4f50747950da85d2915fafc7de0f1289d5b53ad2b44abbb02
3
+ size 3855679144
model.safetensors.index.json ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 8822848512
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00002-of-00002.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
16
+ "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
18
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
19
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
20
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
21
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
23
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
26
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
28
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
29
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
30
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
31
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
32
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
33
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
36
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
37
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
38
+ "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
40
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
41
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
42
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
43
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
44
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
45
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
46
+ "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
47
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
48
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
49
+ "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
50
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
51
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
52
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
53
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
54
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
55
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
56
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
57
+ "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
58
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
59
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
60
+ "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
61
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
62
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
63
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
64
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
65
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
66
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
67
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
68
+ "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
69
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
70
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
71
+ "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
72
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
73
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
74
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
75
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
76
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
77
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
78
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
79
+ "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
80
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
81
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
82
+ "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
83
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
84
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
85
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
86
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
87
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
88
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
89
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
90
+ "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
91
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
92
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
93
+ "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
94
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
95
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
96
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
97
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
98
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
99
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
100
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
101
+ "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
102
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
103
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
104
+ "model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
105
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
106
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
107
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
108
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
109
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
110
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
111
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
112
+ "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
113
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
114
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
115
+ "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
116
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
117
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
118
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
119
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
120
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
121
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
122
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
123
+ "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
124
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
125
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
126
+ "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
127
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
128
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
129
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
130
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
131
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
132
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
133
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
134
+ "model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
135
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
136
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
137
+ "model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
138
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
139
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
140
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
141
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
142
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
143
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
144
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
145
+ "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
146
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
147
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
148
+ "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
149
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
150
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
151
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
152
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
153
+ "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
154
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
155
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
156
+ "model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
157
+ "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
158
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
159
+ "model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
160
+ "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
161
+ "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
162
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
163
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
164
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
165
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
166
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
167
+ "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
168
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
169
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
170
+ "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
171
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
172
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
173
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
174
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
175
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
176
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
177
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
178
+ "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
179
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
180
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
181
+ "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
182
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
183
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
184
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
185
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
186
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
187
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
188
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
189
+ "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
190
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
191
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
192
+ "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
193
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
194
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
195
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
196
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
197
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
198
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
199
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
200
+ "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
201
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
202
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
203
+ "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
204
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
205
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
206
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
207
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
208
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
209
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
210
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
211
+ "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
212
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
213
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
214
+ "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
215
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
216
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
217
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
218
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
219
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
220
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
221
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
222
+ "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
223
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
224
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
225
+ "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
226
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
227
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
228
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
229
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
230
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
231
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
232
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
233
+ "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
234
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
235
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
236
+ "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
237
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
238
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
239
+ "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
240
+ "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
241
+ "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
242
+ "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
243
+ "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
244
+ "model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
245
+ "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
246
+ "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
247
+ "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
248
+ "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
249
+ "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
250
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
251
+ "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
252
+ "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
253
+ "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
254
+ "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
255
+ "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
256
+ "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
257
+ "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
258
+ "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
259
+ "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
260
+ "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
261
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
262
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
263
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
264
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
265
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
266
+ "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
267
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
268
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
269
+ "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
270
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
271
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
272
+ "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
273
+ "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
274
+ "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
275
+ "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
276
+ "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
277
+ "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
278
+ "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
279
+ "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
280
+ "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
281
+ "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
282
+ "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
283
+ "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
284
+ "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
285
+ "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
286
+ "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
287
+ "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
288
+ "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
289
+ "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
290
+ "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
291
+ "model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
292
+ "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
293
+ "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
294
+ "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
295
+ "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
296
+ "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
297
+ "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
298
+ "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
299
+ "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
300
+ "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
301
+ "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
302
+ "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
303
+ "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
304
+ "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
305
+ "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
306
+ "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
307
+ "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
308
+ "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
309
+ "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
310
+ "model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
311
+ "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
312
+ "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
313
+ "model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
314
+ "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
315
+ "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
316
+ "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
317
+ "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
318
+ "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
319
+ "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
320
+ "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
321
+ "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
322
+ "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
323
+ "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
324
+ "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
325
+ "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
326
+ "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
327
+ "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
328
+ "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
329
+ "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
330
+ "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
331
+ "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
332
+ "model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
333
+ "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
334
+ "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
335
+ "model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
336
+ "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
337
+ "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
338
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
339
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
340
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
341
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
342
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
343
+ "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
344
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
345
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
346
+ "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
347
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
348
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
349
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
350
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
351
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
352
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
353
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
354
+ "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
355
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
356
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
357
+ "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
358
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
359
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
360
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
361
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
362
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
363
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
364
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
365
+ "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
366
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
367
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
368
+ "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
369
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
370
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
371
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
372
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
373
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
374
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
375
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
376
+ "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
377
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
378
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
379
+ "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
380
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
381
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
382
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
383
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
384
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
385
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
386
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
387
+ "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
388
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
389
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
390
+ "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
391
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
392
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
393
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
394
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
395
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
396
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
397
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
398
+ "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
399
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
400
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
401
+ "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
402
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
403
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
404
+ "model.norm.weight": "model-00002-of-00002.safetensors"
405
+ }
406
+ }
modeling_sdar_mtp.py ADDED
@@ -0,0 +1,1788 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is modified based on https://github.com/huggingface/transformers/blob/v4.52.4/src/transformers/models/qwen3/modeling_qwen3.py.
2
+ #
3
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
4
+ # This file was automatically generated from src/transformers/models/qwen3/modular_qwen3.py.
5
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
6
+ # the file from the modular. If any change should be done, please apply the change to the
7
+ # modular_qwen3.py file directly. One of our CI enforces this.
8
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
9
+ # coding=utf-8
10
+ # Copyright 2025 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
11
+ #
12
+ # Licensed under the Apache License, Version 2.0 (the "License");
13
+ # you may not use this file except in compliance with the License.
14
+ # You may obtain a copy of the License at
15
+ #
16
+ # http://www.apache.org/licenses/LICENSE-2.0
17
+ #
18
+ # Unless required by applicable law or agreed to in writing, software
19
+ # distributed under the License is distributed on an "AS IS" BASIS,
20
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21
+ # See the License for the specific language governing permissions and
22
+ # limitations under the License.
23
+
24
+ from typing import Callable, Optional, Tuple, Union, List
25
+
26
+ import torch
27
+ from torch import nn
28
+ from einops import rearrange
29
+
30
+ from transformers.activations import ACT2FN
31
+ from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
32
+ from transformers.generation import GenerationMixin
33
+ from transformers.integrations import use_kernel_forward_from_hub
34
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
35
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
36
+ from transformers.modeling_layers import GradientCheckpointingLayer
37
+ from transformers.modeling_outputs import (
38
+ BaseModelOutputWithPast,
39
+ CausalLMOutputWithPast,
40
+ QuestionAnsweringModelOutput,
41
+ SequenceClassifierOutputWithPast,
42
+ TokenClassifierOutput,
43
+ )
44
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
45
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
46
+ from transformers.processing_utils import Unpack
47
+ from transformers.utils import LossKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging
48
+ from .configuration_sdar_mtp import SDARMTPConfig
49
+ from .fused_linear_diffusion_cross_entropy import FusedLinearDiffusionCrossEntropyLoss
50
+
51
+ from flash_attn.ops.triton.layer_norm import rms_norm_fn as flash_rms_norm
52
+
53
+ import torch.nn.functional as F
54
+ try:
55
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
56
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
57
+ except:
58
+ pass
59
+
60
+ try:
61
+ from liger_kernel.ops.swiglu import LigerSiLUMulFunction # noqa: F401
62
+ liger_kernel_is_available = True
63
+ except ImportError:
64
+ liger_kernel_is_available = False
65
+
66
+
67
+ if is_torch_flex_attn_available():
68
+ from torch.nn.attention.flex_attention import BlockMask, create_block_mask, flex_attention
69
+ from transformers.integrations.flex_attention import make_flex_block_causal_mask
70
+
71
+
72
+ logger = logging.get_logger(__name__)
73
+
74
+
75
def modify_padded_position_ids_2d(position_ids: torch.LongTensor) -> torch.LongTensor:
    """
    Renumber the trailing padding of every row of a batch of packed position ids.

    Rows are processed independently with vectorized tensor operations.
    Within a row, every column after the last non-zero entry is treated as
    padding and is overwritten with 0, 1, 2, ... counted from the first
    padding column. A row without any non-zero entry is treated as padding
    in its entirety. Columns up to and including the last non-zero entry are
    returned unchanged.

    Args:
        position_ids: 2D tensor of shape (batch_size, sequence_length).

    Returns:
        A new tensor of shape (batch_size, sequence_length) with the padding
        region renumbered.

    Raises:
        ValueError: If `position_ids` is not 2-dimensional.
    """
    if position_ids.dim() != 2:
        raise ValueError(f"Input tensor must be 2D, but got {position_ids.dim()} dimensions.")

    num_rows, num_cols = position_ids.shape

    # Column index of every element, broadcast over the batch dimension.
    columns = torch.arange(
        num_cols, device=position_ids.device, dtype=position_ids.dtype
    ).expand(num_rows, -1)

    is_nonzero = position_ids.ne(0)

    # Largest column index holding a non-zero value (0 when the row has none).
    last_nonzero = torch.where(is_nonzero, columns, torch.zeros_like(columns)).max(dim=1).values

    # Padding begins right after the last non-zero entry, or at column 0 for
    # rows that contain only zeros.
    pad_start = torch.where(
        is_nonzero.any(dim=1), last_nonzero + 1, torch.zeros_like(last_nonzero)
    ).unsqueeze(1)

    # Inside the padding region the new value is the offset from pad_start.
    return torch.where(columns >= pad_start, columns - pad_start, position_ids)
106
+
107
+
108
def calculate_token_nums(position_ids: torch.Tensor):
    """
    Compute the length of every packed sequence in each row of a batch.

    A new sequence is considered to start at every position whose id is 0,
    so the lengths are the gaps between consecutive zero positions, with the
    row length acting as the final boundary.

    Args:
        position_ids (torch.Tensor): 2D tensor of shape
            (batch_size, sequence_length), e.g.
            tensor([[0,1,2,3,4,0,1,2,3,4,5,0,1,2,3,0,0,0]]).

    Returns:
        list[torch.Tensor]: one 1-D integer tensor per batch row holding the
        sequence lengths of that row, e.g. [tensor([5, 6, 4, 1, 1, 1])].

    Raises:
        ValueError: If `position_ids` is not 2-dimensional.
    """
    if position_ids.dim() != 2:
        raise ValueError(f"输入必须是 2D Tensor,但得到了 {position_ids.dim()}D")

    def _lengths_of_row(row):
        # Indices where a sequence starts (position id == 0), as a 1-D tensor.
        starts = torch.nonzero(row == 0).flatten()
        # The row length closes the last sequence; build it on the row's
        # device with the same dtype as the start indices.
        end = torch.tensor([row.shape[0]], device=row.device, dtype=starts.dtype)
        # Differences between consecutive boundaries are the lengths.
        return torch.diff(torch.cat([starts, end]))

    # The number of sequences differs per row (ragged), so rows are handled
    # one at a time; the work inside each row is vectorized.
    return [_lengths_of_row(row) for row in position_ids]
153
+
154
+
155
def forward_add_noise_packed(
    inputs_ids: torch.Tensor,
    num_tokens_list: List[torch.Tensor],
    prompt_mask: torch.Tensor,
    mask_id: int,
    eps: float = 1e-3,
    max_tries: int = 10,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Add masking noise to a batch of packed token-id sequences.

    Every logical sample packed inside a batch row gets its own random mask
    rate. Tokens are replaced by `mask_id` with that rate, except at
    positions flagged in `prompt_mask`, which are never masked.

    Args:
        inputs_ids (torch.Tensor):
            Token ids of shape (bsz, total_tokens).
        num_tokens_list (List[torch.Tensor]):
            List of length bsz. Entry i is a 1-D integer tensor with the
            length of each logical sample packed in row i, e.g.
            [tensor([len1, len2]), tensor([len3, len4, len5])]. Each entry
            must sum to total_tokens.
        prompt_mask (torch.Tensor):
            Boolean tensor of shape (bsz, total_tokens); True marks prompt
            positions that must not be masked.
        mask_id (int):
            Token id written at masked positions.
        eps (float):
            Lower bound of the sampled mask rate, so the rate is never 0.
        max_tries (int):
            Maximum number of sampling attempts per batch row to obtain at
            least one masked non-prompt token. Values below 1 are treated as
            1 so that per-token mask rates always exist.

    Returns:
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
            - noisy_input_ids: token ids after masking, shape
              (bsz, total_tokens).
            - final_masked_indices: boolean tensor of shape
              (bsz, total_tokens) marking the positions that were masked.
            - p_masks: 1-D tensor with the mask rate of every masked
              position, ordered as produced by boolean indexing with
              final_masked_indices.
    """
    bsz, total_tokens = inputs_ids.shape
    device = inputs_ids.device

    # Input consistency checks.
    assert len(num_tokens_list) == bsz, f"num_tokens_list 的长度 ({len(num_tokens_list)}) 必须等于 bsz ({bsz})"
    assert prompt_mask.shape == (bsz, total_tokens), f"prompt_mask 形状不匹配, 期望 {(bsz, total_tokens)}, 得到 {prompt_mask.shape}"

    # With max_tries <= 0 the sampling loop would never run, leaving no
    # per-token mask rates to concatenate below; always sample at least once.
    num_attempts = max(max_tries, 1)

    noisy_ids_list = []
    final_masked_indices_list = []
    p_masks_per_token_list = []

    # Rows may pack a different number of samples, so iterate over the batch.
    for i in range(bsz):
        current_ids = inputs_ids[i:i + 1]  # shape: (1, total_tokens)
        current_num_tokens = num_tokens_list[i]
        current_prompt_mask = prompt_mask[i:i + 1]  # shape: (1, total_tokens)

        num_samples_in_item = len(current_num_tokens)
        # The per-sample lengths of this row must cover the whole row.
        assert total_tokens == torch.sum(current_num_tokens), \
            f"批次项 {i} 的 num_tokens 之和 ({torch.sum(current_num_tokens)}) 与 total_tokens ({total_tokens}) 不匹配"

        eligible_for_masking = ~current_prompt_mask

        # Nothing can be masked: keep the ids as they are and record eps as
        # the (unused) mask rate so shapes stay consistent for the final cat.
        if not eligible_for_masking.any():
            noisy_ids_list.append(current_ids)
            final_masked_indices_list.append(torch.zeros_like(current_prompt_mask, dtype=torch.bool))
            p_masks_per_token_list.append(torch.full((1, total_tokens), eps, device=device, dtype=torch.float))
            continue

        # Resample until at least one eligible token is masked.
        for _ in range(num_attempts):
            # One independent mask rate per logical sample, in [eps, 1).
            t = torch.rand(num_samples_in_item, device=device)
            p_mask_per_sample = (1 - eps) * t + eps

            # Broadcast each sample's rate to all of its tokens.
            p_mask_per_token = torch.repeat_interleave(
                p_mask_per_sample, current_num_tokens
            ).unsqueeze(0)  # shape: (1, total_tokens)

            # Bernoulli draw per token, then drop prompt positions.
            masked_indices = torch.rand_like(p_mask_per_token) < p_mask_per_token
            final_masked_indices_item = masked_indices & eligible_for_masking

            if final_masked_indices_item.any():
                break

        # Still nothing masked after all attempts: force-mask one eligible
        # position chosen uniformly at random. At least one eligible position
        # exists because the all-prompt case was handled above.
        if not final_masked_indices_item.any():
            eligible_indices = torch.nonzero(eligible_for_masking.squeeze(0), as_tuple=True)[0]
            random_choice = torch.randint(0, len(eligible_indices), (1,)).item()
            final_masked_indices_item[0, eligible_indices[random_choice]] = True

        # Write mask_id at the masked positions.
        noisy_ids_item = torch.where(final_masked_indices_item, mask_id, current_ids)

        noisy_ids_list.append(noisy_ids_item)
        final_masked_indices_list.append(final_masked_indices_item)
        p_masks_per_token_list.append(p_mask_per_token)

    # Reassemble the per-row results into batch tensors.
    noisy_input_ids = torch.cat(noisy_ids_list, dim=0)
    final_masked_indices = torch.cat(final_masked_indices_list, dim=0)
    p_mask_full = torch.cat(p_masks_per_token_list, dim=0)

    # Keep only the mask rates of positions that were actually masked.
    p_masks = p_mask_full[final_masked_indices]

    return noisy_input_ids, final_masked_indices, p_masks
284
+
285
+
286
def block_diff_mask(b, h, q_idx, kv_idx, block_size=None, n=None):
    """
    Build the block-diffusion attention mask used for training.

    Indices below `n` belong to the noised half (xt) and indices at or above
    `n` belong to the clean half (x0). The result is the union of three
    masks:
    - **Block Diagonal Mask (M_BD)**: query and key are in the same half and
      in the same block.
    - **Offset Block Causal Mask (M_OBC)**: an xt query attends to x0 keys
      of strictly earlier blocks.
    - **Block Causal Mask (M_BC)**: an x0 query attends to x0 keys of the
      same or earlier blocks.

    Args:
        b, h: Batch and head indices (ignored for mask logic).
        q_idx, kv_idx: Query and key index tensors (broadcast against each
            other).
        block_size: Number of tokens per block.
        n: Offset at which the x0 half starts.

    Returns:
        A boolean attention mask, True where attention is allowed.
    """
    # Which half each index falls in: True -> x0 (clean), False -> xt (noised).
    q_is_x0 = q_idx >= n
    kv_is_x0 = kv_idx >= n

    # Block index inside the respective half (x0 indices are shifted by n).
    q_block = torch.where(q_is_x0, q_idx - n, q_idx) // block_size
    kv_block = torch.where(kv_is_x0, kv_idx - n, kv_idx) // block_size

    # 1. M_BD: same half, same block.
    same_block_same_half = (q_is_x0 == kv_is_x0) & (q_block == kv_block)

    # 2. M_OBC: xt query -> x0 keys of strictly earlier blocks.
    xt_to_earlier_x0 = ~q_is_x0 & kv_is_x0 & (q_block > kv_block)

    # 3. M_BC: x0 query -> x0 keys of the same or earlier blocks.
    x0_to_x0_causal = q_is_x0 & kv_is_x0 & (q_block >= kv_block)

    # 4. Union of the three masks.
    return same_block_same_half | xt_to_earlier_x0 | x0_to_x0_causal
328
+
329
+
330
def block_attn_mask(num_tokens, block_size, device):
    """
    Build a batched block-diffusion attention mask for packed samples.

    Args:
        num_tokens: One sequence of sample lengths per batch item.
        block_size: Block size forwarded to ``block_diff_mask``.
        device: Device on which the index tensors are created.

    Returns:
        A tensor obtained by stacking, along dim 0, one block-diagonal mask per
        batch item. Each sample of length ``num`` contributes a
        ``(2 * num, 2 * num)`` sub-mask (the index ranges are
        ``arange(num * 2)``). ``torch.stack`` requires every batch item to end
        up with the same total size.
    """
    per_item_masks = []
    for sample_lengths in num_tokens:
        sub_masks = [
            block_diff_mask(
                b=None,
                h=None,
                q_idx=torch.arange(length * 2, device=device)[:, None],
                kv_idx=torch.arange(length * 2, device=device)[None, :],
                block_size=block_size,
                n=length,
            )
            for length in sample_lengths
        ]
        # Samples packed in the same item must not see each other.
        per_item_masks.append(torch.block_diag(*sub_masks))
    return torch.stack(per_item_masks, dim=0)
348
+
349
+
350
def top_k_logits(logits, k):
    """
    Keep the ``k`` largest logits along the last dimension, set the rest to -inf.

    Args:
        logits: Tensor whose last dimension holds the candidate scores.
        k: Number of candidates to keep. ``k <= 0`` disables filtering and the
            input tensor is returned as-is. Values larger than the last
            dimension are clamped to it (``torch.topk`` would otherwise raise).

    Returns:
        A tensor of the same shape as ``logits``; entries strictly below the
        k-th largest value of their row are replaced with ``-inf``.
    """
    if k <= 0:
        return logits
    # Guard against k > vocab size, which torch.topk rejects.
    k = min(k, logits.size(-1))
    top_values, _ = torch.topk(logits, k)
    threshold = top_values[..., -1, None]
    return torch.where(logits < threshold, torch.full_like(logits, float('-inf')), logits)
357
+
358
+
359
def top_p_logits(logits, p):
    """
    Nucleus (top-p) filtering along the last dimension.

    Candidates are sorted by descending logit; every candidate that comes
    after the cumulative softmax probability has already exceeded ``p`` is set
    to ``-inf``. The candidate that crosses the threshold is kept, and the
    highest-scoring candidate is always kept.

    Args:
        logits: Tensor whose last dimension holds the candidate scores.
        p: Cumulative-probability threshold.

    Returns:
        A new tensor of the same shape with filtered entries set to ``-inf``.
    """
    ordered_logits, order = torch.sort(logits, descending=True)
    cumulative = torch.cumsum(F.softmax(ordered_logits, dim=-1), dim=-1)
    exceeded = cumulative > p

    # Shift right by one so the first candidate past the threshold survives;
    # the leading column is always False (never drop the top candidate).
    keep_first = torch.zeros_like(exceeded[..., :1])
    drop_sorted = torch.cat([keep_first, exceeded[..., :-1]], dim=-1)

    # Map the decision from sorted order back to vocabulary order.
    drop = torch.zeros_like(logits, dtype=torch.bool).scatter(-1, order, drop_sorted)
    return logits.masked_fill(drop, float('-inf'))
369
+
370
+
371
def sample_with_temperature_topk_topp(logits, temperature=1.0, top_k=0, top_p=1.0):
    """
    Sample one token per position after temperature / top-k / top-p filtering.

    Args:
        logits: Tensor of shape ``(..., vocab)``.
        temperature: Divisor applied to the logits when different from 1.0.
        top_k: When > 0, ``top_k_logits`` is applied.
        top_p: When < 1.0, ``top_p_logits`` is applied.

    Returns:
        ``(token, token_prob)``: the sampled token ids and the probability the
        filtered distribution assigned to them, both with the leading shape of
        ``logits`` (everything except the vocab dimension).
    """
    leading_shape = logits.shape[:-1]
    # Flatten all leading dims: torch.multinomial expects a 2-D input.
    flat_logits = logits.reshape(-1, logits.shape[-1])

    if temperature != 1.0:
        flat_logits = flat_logits / temperature
    if top_k > 0:
        flat_logits = top_k_logits(flat_logits, top_k)
    if top_p < 1.0:
        flat_logits = top_p_logits(flat_logits, top_p)

    probs = F.softmax(flat_logits, dim=-1)
    assert probs.dim() == 2
    picked = torch.multinomial(probs, num_samples=1)
    picked_prob = probs.gather(-1, picked)

    return picked.view(*leading_shape), picked_prob.view(*leading_shape)
389
+
390
+
391
def get_num_transfer_tokens(block_length, steps):
    """
    Split ``block_length`` tokens as evenly as possible across ``steps`` steps.

    Args:
        block_length: Total number of tokens to distribute.
        steps: Number of steps.

    Returns:
        An int64 tensor of length ``steps`` summing to ``block_length``; the
        first ``block_length % steps`` entries are one larger than the rest.
    """
    per_step, leftover = divmod(block_length, steps)
    schedule = torch.full((steps,), per_step, dtype=torch.int64)
    # Hand the remainder out to the earliest steps.
    schedule[:leftover] += 1
    return schedule
397
+
398
+
399
@torch.compile(fullgraph=True, mode="max-autotune-no-cudagraphs")
def fused_flex_attention(query, key, value, attention_mask, **kwargs):
    """
    Compiled wrapper around ``flex_attention``.

    ``attention_mask`` is forwarded as the ``block_mask`` argument; any extra
    keyword arguments are passed through unchanged. The return value is
    whatever ``flex_attention`` returns for those arguments.
    """
    result = flex_attention(query, key, value, block_mask=attention_mask, **kwargs)
    return result
402
+
403
+
404
@use_kernel_forward_from_hub("RMSNorm")
class SDARRMSNorm(nn.Module):
    """RMS normalisation layer with a learnable per-channel scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        Args:
            hidden_size: Size of the normalised (last) dimension; also the
                shape of the learnable ``weight``, initialised to ones.
            eps: Epsilon forwarded to the normalisation kernel.
        """
        super().__init__()
        self.variance_epsilon = eps
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, hidden_states):
        """Normalise ``hidden_states`` by delegating to ``flash_rms_norm``."""
        # Reference (non-fused) formulation that used to live here as dead code:
        #   x = hidden_states.to(torch.float32)
        #   variance = x.pow(2).mean(-1, keepdim=True)
        #   x = x * torch.rsqrt(variance + self.variance_epsilon)
        #   return self.weight * x.to(input_dtype)
        normed = flash_rms_norm(
            hidden_states,
            weight=self.weight,
            bias=None,
            eps=self.variance_epsilon,
        )
        return normed

    def extra_repr(self):
        """Show the weight shape and epsilon in the module repr."""
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
428
+
429
+
430
class SDARMLP(nn.Module):
    """Gated feed-forward block: ``down_proj(act(gate_proj(x)) * up_proj(x))``."""

    def __init__(self, config):
        """
        Args:
            config: Must provide ``hidden_size``, ``intermediate_size`` and
                ``hidden_act`` (a key of ``ACT2FN``).
        """
        super().__init__()
        self.config = config
        hidden, inner = config.hidden_size, config.intermediate_size
        self.hidden_size = hidden
        self.intermediate_size = inner
        # All three projections are bias-free.
        self.gate_proj = nn.Linear(hidden, inner, bias=False)
        self.up_proj = nn.Linear(hidden, inner, bias=False)
        self.down_proj = nn.Linear(inner, hidden, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Apply the gated MLP to ``x`` and return the projected result."""
        gate = self.gate_proj(x)
        up = self.up_proj(x)
        # NOTE(review): `liger_kernel_is_available` is only tested for
        # truthiness here; confirm it is a flag and not a callable.
        if liger_kernel_is_available:
            gated = LigerSiLUMulFunction.apply(gate, up)
        else:
            gated = self.act_fn(gate) * up
        return self.down_proj(gated)
451
+
452
+
453
def rotate_half(x):
    """Split the last dimension in two halves ``(a, b)`` and return ``(-b, a)``."""
    mid = x.shape[-1] // 2
    front, back = x[..., :mid], x[..., mid:]
    return torch.cat((-back, front), dim=-1)
458
+
459
+
460
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Rotate query and key tensors with rotary position embeddings.

    Args:
        q (`torch.Tensor`): Query tensor.
        k (`torch.Tensor`): Key tensor.
        cos (`torch.Tensor`): Cosine table of the rotary embedding.
        sin (`torch.Tensor`): Sine table of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Not used by this function.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension inserted into `cos` and `sin` so that they broadcast
            against `q` and `k`. With `q`/`k` laid out as
            [batch, heads, seq, head_dim] use 1; with
            [batch, seq, heads, head_dim] use 2.

    Returns:
        `tuple(torch.Tensor)`: the rotated query and key tensors.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)

    def _rotate(t):
        # Standard RoPE combination: t * cos + rotate_half(t) * sin.
        return (t * cos_b) + (rotate_half(t) * sin_b)

    return _rotate(q), _rotate(k)
485
+
486
+
487
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head ``n_rep`` times along the head dimension.

    Equivalent to ``torch.repeat_interleave(x, dim=1, repeats=n_rep)``: the
    input of shape (batch, num_key_value_heads, seqlen, head_dim) becomes
    (batch, num_key_value_heads * n_rep, seqlen, head_dim). With
    ``n_rep == 1`` the input tensor itself is returned.
    """
    batch, kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis after the head axis, broadcast it, then fold it in.
    expanded = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, seq_len, head_dim)
    return expanded.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
498
+
499
+
500
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    """
    Plain matmul + softmax attention.

    Args:
        module: Provides ``num_key_value_groups`` (repeat factor for the
            key/value heads) and ``training`` (enables dropout).
        query, key, value: Attention inputs; key/value heads are repeated with
            ``repeat_kv`` before use.
        attention_mask: Optional additive mask; it is sliced on its last
            dimension to the key length and added to the scores.
        scaling: Multiplier applied to the raw scores.
        dropout: Dropout probability on the attention weights.
        **kwargs: Accepted and ignored.

    Returns:
        ``(attn_output, attn_weights)`` where ``attn_output`` has dims 1 and 2
        transposed (and is contiguous) relative to the matmul result.
    """
    groups = module.num_key_value_groups
    full_keys = repeat_kv(key, groups)
    full_values = repeat_kv(value, groups)

    scores = torch.matmul(query, full_keys.transpose(2, 3)) * scaling
    if attention_mask is not None:
        scores = scores + attention_mask[:, :, :, : full_keys.shape[-2]]

    # Softmax is computed in float32 and cast back to the query dtype.
    weights = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
    weights = nn.functional.dropout(weights, p=dropout, training=module.training)

    context = torch.matmul(weights, full_values).transpose(1, 2).contiguous()
    return context, weights
526
+
527
+
528
class SDARAttention(nn.Module):
    """
    Multi-head attention with grouped key/value heads, per-head RMSNorm on the
    query and key projections, and rotary position embeddings.

    ``forward`` dispatches to one of three kernels:
    - training: ``fused_flex_attention`` with the supplied mask as block mask;
    - inference with an all-True mask: ``flash_attn_func`` (non-causal);
    - inference otherwise (including no mask): ``F.scaled_dot_product_attention``.
    """

    def __init__(self, config: SDARMTPConfig, layer_idx: int):
        """
        Args:
            config: Model configuration (sizes, bias flag, dropout,
                sliding-window settings, RMSNorm epsilon).
            layer_idx: Index of the layer this module belongs to; used to
                address the key/value cache.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(
            config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        self.hidden_size = config.hidden_size
        self.num_attention_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads

        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )
        # The norms act on the head dimension only, so they are applied after
        # the projections have been reshaped to (..., heads, head_dim).
        self.q_norm = SDARRMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = SDARRMSNorm(self.head_dim, eps=config.rms_norm_eps)

        # The sliding window is only kept for layers at or beyond
        # `max_window_layers` when the config enables it.
        self.sliding_window = config.sliding_window
        if not (
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
        ):
            self.sliding_window = None

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Args:
            hidden_states: Input activations; the last dimension is projected
                and reshaped into heads.
            position_embeddings: ``(cos, sin)`` tables for the rotary embedding.
            attention_mask: In training it is forwarded as the flex-attention
                block mask. In inference it is cast to bool; ``None`` means no
                masking.
            past_key_value: Optional key/value cache.
            cache_position: Accepted for interface compatibility; not used here.
            **kwargs: ``store_kv`` (bool) selects whether the new keys/values
                are written into the cache (True) or only concatenated after
                the cached ones for this call (False). Other entries are
                ignored.

        Returns:
            ``(attn_output, attn_weights)``. ``attn_weights`` is the
            log-sum-exp returned by flex attention in training and ``None`` in
            inference.
        """
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_norm(self.q_proj(
            hidden_states).view(hidden_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(
            hidden_states).view(hidden_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(
            hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin)

        store_kv = kwargs.get("store_kv", False)
        if past_key_value is not None and store_kv:
            # Append the new keys/values to the cache and use the full history.
            key_states, value_states = past_key_value.update(
                key_states, value_states, self.layer_idx)
        elif past_key_value is not None and len(past_key_value) > self.layer_idx:
            # Read-only use of the cache: prepend cached keys/values for this
            # call without storing the new ones.
            past_key_states, past_value_states = past_key_value[self.layer_idx]
            key_states = torch.cat(
                [past_key_states, key_states], dim=-2)
            value_states = torch.cat(
                [past_value_states, value_states], dim=-2)

        if self.training:
            attn_output, attn_weights = fused_flex_attention(
                query=query_states,
                key=key_states,
                value=value_states,
                attention_mask=attention_mask,
                enable_gqa=True,
                scale=self.scaling,
                return_lse=True
            )
            attn_weights = attn_weights.to(
                value_states.dtype) if attn_weights is not None else None
            attn_output = rearrange(attn_output, 'b h l d -> b l (h d)')
        else:
            attention_mask = attention_mask.bool() if attention_mask is not None else None
            attn_weights = None
            # `torch.all(None)` raises, so a missing mask must be checked
            # before the all-True test; it is served by the SDPA path, which
            # accepts `attn_mask=None`.
            if attention_mask is not None and torch.all(attention_mask):  # decoding
                query_states = query_states.transpose(1, 2)
                key_states = key_states.transpose(1, 2)
                value_states = value_states.transpose(1, 2)
                attn_output = flash_attn_func(
                    query_states,
                    key_states,
                    value_states,
                    causal=False,
                    softmax_scale=self.scaling
                )
                attn_output = rearrange(attn_output, 'b l h d -> b l (h d)')
            else:  # prefilling
                attn_output = F.scaled_dot_product_attention(
                    query=query_states,
                    key=key_states,
                    value=value_states,
                    attn_mask=attention_mask,
                    is_causal=False,
                    scale=self.scaling,
                    enable_gqa=True
                )
                attn_output = rearrange(attn_output, 'b h l d -> b l (h d)')
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights
647
+
648
+
649
class SDARDecoderLayer(GradientCheckpointingLayer):
    """Pre-norm transformer block: self-attention followed by a gated MLP."""

    def __init__(self, config: SDARMTPConfig, layer_idx: int):
        """
        Args:
            config: Model configuration.
            layer_idx: Index of this layer, forwarded to the attention module.
        """
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = SDARAttention(config=config, layer_idx=layer_idx)
        self.mlp = SDARMLP(config)
        self.input_layernorm = SDARRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = SDARRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # Warn when a sliding window is configured for a backend other than
        # flash_attention_2.
        window_without_flash = config.sliding_window and config._attn_implementation != "flash_attention_2"
        if window_without_flash:
            logger.warning_once(
                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
                "unexpected results may be encountered."
            )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        store_kv: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        # necessary, but kept here for BC
        position_embeddings: Optional[Tuple[torch.Tensor,
                                            torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Run one decoder block.

        All arguments besides ``hidden_states`` are passed through to the
        attention module as keyword arguments.

        Returns:
            ``(hidden_states,)``, or ``(hidden_states, attn_weights)`` when
            ``output_attentions`` is truthy.
        """
        # --- attention sub-block (residual around norm + attention) ---
        skip = hidden_states
        attn_out, self_attn_weights = self.self_attn(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            store_kv=store_kv,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = skip + attn_out

        # --- feed-forward sub-block (residual around norm + MLP) ---
        skip = hidden_states
        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = skip + mlp_out

        if output_attentions:
            return (hidden_states, self_attn_weights)
        return (hidden_states,)
711
+
712
+
713
@auto_docstring
class SDARMTPPreTrainedModel(PreTrainedModel):
    """Base class holding the shared configuration flags and weight init."""

    config_class = SDARMTPConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["SDARDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        """
        Initialise a single sub-module in place.

        Linear and embedding weights are drawn from a normal distribution with
        standard deviation ``config.initializer_range``; linear biases and the
        embedding padding row are zeroed; RMSNorm weights are set to one.
        Other module types are left untouched.
        """
        init_std = self.config.initializer_range

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, SDARRMSNorm):
            module.weight.data.fill_(1.0)
740
+
741
+
742
class SDARRotaryEmbedding(nn.Module):
    """Produces the ``(cos, sin)`` tables used by rotary position embeddings."""

    def __init__(self, config: SDARMTPConfig, device=None):
        """
        Args:
            config: Model configuration; ``rope_scaling`` (if set) selects the
                RoPE variant and ``max_position_embeddings`` seeds the cached
                sequence lengths.
            device: Device forwarded to the RoPE init function.
        """
        super().__init__()
        # BC: the key "rope_type" was originally called "type".
        rope_scaling = getattr(config, "rope_scaling", None)
        if rope_scaling is not None:
            self.rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        # Non-persistent: the buffer is not written to the state dict.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    # power user: used with advanced RoPE types (e.g. dynamic rope)
    @dynamic_rope_update
    def forward(self, x, position_ids):
        """
        Args:
            x: Tensor used only for its device and dtype.
            position_ids: Position indices; the first dimension is the batch.

        Returns:
            ``(cos, sin)`` cast to ``x.dtype``; both are scaled by
            ``attention_scaling``.
        """
        batch = position_ids.shape[0]
        inv_freq = self.inv_freq[None, :, None].float().expand(batch, -1, 1).to(x.device)
        positions = position_ids[:, None, :].float()

        dev = x.device.type
        if isinstance(dev, str) and dev != "mps":
            device_type = dev
        else:
            device_type = "cpu"

        # Autocast is disabled so the angle computation stays in float32.
        with torch.autocast(device_type=device_type, enabled=False):
            angles = (inv_freq.float() @ positions.float()).transpose(1, 2)
            table = torch.cat((angles, angles), dim=-1)
            cos = table.cos() * self.attention_scaling
            sin = table.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
780
+
781
@auto_docstring
class SDARModel(SDARMTPPreTrainedModel):
    """
    Decoder stack: token embedding, ``config.num_hidden_layers`` decoder
    layers, and a final RMSNorm. The attention mask given to ``forward`` is
    handed to the layers unchanged (``_update_causal_mask`` is not invoked by
    ``forward``).
    """

    def __init__(self, config: SDARMTPConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [SDARDecoderLayer(config, layer_idx)
             for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = SDARRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = SDARRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module."""
        self.embed_tokens = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        store_kv: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        # Fall back to the config for unspecified output / cache flags.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
        if not isinstance(past_key_values, (type(None), Cache)):
            raise ValueError(
                "The `past_key_values` should be either a `Cache` object or `None`.")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            # Positions continue from whatever the cache already holds.
            past_seen_tokens = past_key_values.get_seq_length(
            ) if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # The attention mask is intentionally passed through untouched:
        # causal_mask = self._update_causal_mask(
        #     attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        # )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                store_kv=store_kv,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **flash_attn_kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        """
        Convert ``attention_mask`` into the form expected by the configured
        attention backend.

        - ``flash_attention_2``: returns the mask only if it contains a 0,
          otherwise ``None``; raises on right padding when a cache is used.
        - ``flex_attention``: a square 2-D tensor is wrapped into a block mask
          via ``create_block_mask``; anything else must already be a
          ``BlockMask`` and is returned as-is.
        - otherwise: builds a 4-D additive causal mask (or returns ``None``
          when SDPA can ignore it).
        """
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and past_key_values is not None:
                is_padding_right = attention_mask[:, -
                                                  1].sum().item() != input_tensor.size()[0]
                if is_padding_right:
                    raise ValueError(
                        "You are attempting to perform batched generation with padding_side='right'"
                        " this may lead to unexpected behaviour for Flash Attention version of Qwen3. Make sure to "
                        " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
                    )
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                seq_len_q, seq_len_kv = attention_mask.shape
                assert seq_len_q == seq_len_kv, f"got {attention_mask.shape=}"
                # Bind the dense mask to its own name: the mask function is
                # stored inside the returned BlockMask, and closing over
                # `attention_mask` would make it index the BlockMask itself
                # once that name is rebound below.
                dense_mask = attention_mask
                attention_mask = create_block_mask(
                    # 2d bool tensor, shape: [2*seqlen, 2*seqlen]
                    lambda b, h, q_idx, kv_idx: dense_mask[q_idx, kv_idx],
                    B=None, H=None, Q_LEN=seq_len_q, KV_LEN=seq_len_kv,
                )
            else:
                # Here we pass in flex mask computed externally
                assert isinstance(attention_mask, BlockMask)
            return attention_mask

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length(
        ) if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)
        using_sliding_window_cache = isinstance(
            past_key_values, SlidingWindowCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if (
            self.config._attn_implementation == "sdpa"
            and not (using_static_cache or using_sliding_window_cache)
            and not output_attentions
        ):
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                sliding_window=self.config.sliding_window,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        # SlidingWindowCache or StaticCache
        if using_sliding_window_cache or using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
        # DynamicCache or no cache
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
            config=self.config,
            past_key_values=past_key_values,
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(
                causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        config: SDARMTPConfig,
        past_key_values: Cache,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`int`):
                Batch size.
            config (`SDARMTPConfig`):
                The model's configuration class
            past_key_values (`Cache`):
                The cache class that is being used currently to generate
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            # True where the key position lies after the query position.
            diagonal_attend_mask = torch.arange(target_length, device=cache_position.device) > cache_position.reshape(
                -1, 1
            )
            text_config = config.get_text_config()
            if getattr(text_config, "use_sliding_window", True) and text_config.sliding_window is not None:
                # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
                # the check is needed to verify is current checkpoint was trained with sliding window or not
                if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
                    sliding_attend_mask = torch.arange(target_length, device=cache_position.device) <= (
                        cache_position.reshape(-1, 1) -
                        text_config.sliding_window
                    )
                    diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
            causal_mask *= diagonal_attend_mask
            causal_mask = causal_mask[None, None,
                                      :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                if attention_mask.shape[-1] > target_length:
                    attention_mask = attention_mask[:, :target_length]
                mask_length = attention_mask.shape[-1]
                # Positions that are visible causally (0) but padded (0 in the
                # 2D mask) sum to 0 and get masked out.
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )
        return causal_mask
1077
+
1078
@auto_docstring
class SDARMTPModel(SDARMTPPreTrainedModel):
    # Multi-token-prediction (MTP) module.
    #
    # It fuses the embeddings of `input_ids` with hidden states handed in by the caller (`inputs_embeds`)
    # and runs the result through `config.num_nextn_predict_layers` decoder layers.
    #
    # `embed_tokens` and `lm_head` are NOT created here: both start as `None` and have to be injected with
    # `set_input_embeddings` / `set_output_embeddings` before `forward` is usable.
    #
    # Plain comments are used instead of docstrings on the class, `__init__` and `forward` because
    # `@auto_docstring` is applied to them.

    def __init__(self, config: SDARMTPConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        # The embedding table is injected later (see `set_input_embeddings`), so no parameters are allocated.
        # self.embed_tokens = nn.Embedding(
        #     config.vocab_size, config.hidden_size, self.padding_idx)
        self.embed_tokens = None
        # Independent RMS norms for the token-embedding stream (`enorm`) and the hidden-state stream (`hnorm`).
        self.enorm = SDARRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.hnorm = SDARRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        # Maps the concatenation of both normalised streams (2 * hidden_size) back to hidden_size.
        self.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False)
        self.layers = nn.ModuleList(
            [SDARDecoderLayer(config, layer_idx)
             for layer_idx in range(config.num_nextn_predict_layers)]
        )
        # The output head is injected later as well (see `set_output_embeddings`).
        self.lm_head = None

        self.rotary_emb = SDARRotaryEmbedding(config=config)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the injected input embedding module, or `None` if none has been set yet."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Install the embedding module used by `forward` to embed `input_ids`."""
        self.embed_tokens = value

    def get_output_embeddings(self):
        """Return the injected output head, or `None` if none has been set yet."""
        return self.lm_head

    def set_output_embeddings(self, value):
        """Install the output head. `forward` itself never applies it."""
        self.lm_head = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        store_kv: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        # Contract (differs from a regular decoder):
        #   * BOTH `input_ids` and `inputs_embeds` are mandatory. `input_ids` is embedded with the injected
        #     `embed_tokens`; `inputs_embeds` is the second stream that gets fused with it.
        #   * `attention_mask` is forwarded to the decoder layers untouched (`_update_causal_mask` is
        #     intentionally not called, see the commented-out call below).
        #   * No final norm is applied to the returned hidden states.
        # Raises `ValueError` if the embeddings were not injected, if either input is missing, or if
        # `past_key_values` is neither `None` nor a `Cache`.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if self.embed_tokens is None:
            raise ValueError(
                "You must call the `set_input_embeddings` method to set the input embeddings "
                "before calling the forward method."
            )

        if (input_ids is None) or (inputs_embeds is None):
            raise ValueError(
                "You must specify both input_ids and inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
        if not isinstance(past_key_values, (type(None), Cache)):
            raise ValueError(
                "The `past_key_values` should be either a `Cache` object or `None`.")

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            # Continue numbering right after whatever is already stored in the cache.
            past_seen_tokens = past_key_values.get_seq_length(
            ) if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # causal_mask = self._update_causal_mask(
        #     attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        # )

        # Fuse the two streams: concat along the feature axis, then project back to hidden_size.
        hidden_states = self.eh_proj(
            torch.cat((
                self.enorm(self.embed_tokens(input_ids)),
                self.hnorm(inputs_embeds),
            ), dim=-1)
        )

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        # `self.layers` holds exactly `config.num_nextn_predict_layers` layers, so iterate over all of them.
        # (Slicing by `config.num_hidden_layers`, the depth of the base model, could silently skip layers.)
        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                store_kv=store_kv,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **flash_attn_kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        # hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        """
        Convert `attention_mask` into the form expected by the configured attention implementation.

        Not called by `forward` at the moment (the call there is commented out).

        Returns:
            * flash_attention_2: the 2D mask if it contains a `0.0`, otherwise `None`.
            * flex_attention: a `BlockMask` (built from a square 2D tensor, or passed through unchanged).
            * otherwise: `None` when SDPA can rely on its own causal handling, else a 4D additive mask.

        Raises:
            ValueError: flash_attention_2 with a cache and a mask whose last column is not all ones
                (i.e. right padding).
        """
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and past_key_values is not None:
                # Right padding <=> at least one sequence has a 0 in the last mask column.
                is_padding_right = attention_mask[:, -
                                                  1].sum().item() != input_tensor.size()[0]
                if is_padding_right:
                    raise ValueError(
                        "You are attempting to perform batched generation with padding_side='right'"
                        " this may lead to unexpected behaviour for Flash Attention version of Qwen3. Make sure to "
                        " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
                    )
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                seq_len_q, seq_len_kv = attention_mask.shape
                assert seq_len_q == seq_len_kv, f"got {attention_mask.shape=}"
                # Keep the dense tensor under its own name: the lambda below is late-binding, so closing
                # over `attention_mask` (which is rebound to the BlockMask on this very statement) would
                # make any later `mask_mod` call index the BlockMask instead of the tensor.
                dense_mask = attention_mask
                attention_mask = create_block_mask(
                    # 2d bool tensor, shape: [2*seqlen, 2*seqlen]
                    lambda b, h, q_idx, kv_idx: dense_mask[q_idx, kv_idx],
                    B=None, H=None, Q_LEN=seq_len_q, KV_LEN=seq_len_kv,
                )
            else:
                # Here we pass in flex mask computed externally
                assert isinstance(attention_mask, BlockMask)
            return attention_mask

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length(
        ) if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)
        using_sliding_window_cache = isinstance(
            past_key_values, SlidingWindowCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if (
            self.config._attn_implementation == "sdpa"
            and not (using_static_cache or using_sliding_window_cache)
            and not output_attentions
        ):
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                sliding_window=self.config.sliding_window,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        # SlidingWindowCache or StaticCache
        if using_sliding_window_cache or using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
        # DynamicCache or no cache
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
            config=self.config,
            past_key_values=past_key_values,
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(
                causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        config: SDARMTPConfig,
        past_key_values: Cache,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`int`):
                Batch size.
            config (`SDARMTPConfig`):
                The model's configuration class
            past_key_values (`Cache`):
                The cache class that is being used currently to generate

        Returns:
            `torch.Tensor`: the 4D mask; masked positions hold `torch.finfo(dtype).min`, attended positions hold 0.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            # True where the key index lies strictly after the query's cache position (i.e. must be masked).
            diagonal_attend_mask = torch.arange(target_length, device=cache_position.device) > cache_position.reshape(
                -1, 1
            )
            text_config = config.get_text_config()
            if getattr(text_config, "use_sliding_window", True) and text_config.sliding_window is not None:
                # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
                # the check is needed to verify is current checkpoint was trained with sliding window or not
                if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
                    sliding_attend_mask = torch.arange(target_length, device=cache_position.device) <= (
                        cache_position.reshape(-1, 1) -
                        text_config.sliding_window
                    )
                    diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
            # Multiplying by the boolean mask zeroes every position that may be attended.
            causal_mask *= diagonal_attend_mask
            causal_mask = causal_mask[None, None,
                                      :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                if attention_mask.shape[-1] > target_length:
                    attention_mask = attention_mask[:, :target_length]
                mask_length = attention_mask.shape[-1]
                # A sum of exactly 0 means "causally visible (0) but padded (mask value 0)": mask it out too.
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )
        return causal_mask
1394
+
1395
+
1396
class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs):
    # Pure marker type: the union of both parent keyword sets, with nothing added of its own.
    # It is used to annotate `**kwargs` of the causal-LM `forward`.
    pass
1398
+
1399
def freeze_module(m):
    """
    Put `m` into eval mode and switch off gradients for every parameter it contains.

    `None` is accepted and ignored, so optional sub-modules can be passed unconditionally.
    Always returns `None`; the module is modified in place.
    """
    if m is not None:
        m.eval()
        for weight in m.parameters(recurse=True):
            weight.requires_grad_(False)
1405
+
1406
def show_require_grad(module):
    """Debug helper: print one `<parameter name> | <requires_grad>` line per parameter of `module`."""
    for name, param in module.named_parameters():
        trainable = param.requires_grad
        print(f"{name} | {trainable}")
1409
+
1410
@auto_docstring
class SDARMTPForCausalLM(SDARMTPPreTrainedModel, GenerationMixin):
    # Causal-LM wrapper that stacks an MTP module (`SDARMTPModel`) on top of a frozen base `SDARModel`.
    # The base model and `lm_head` are frozen in `__init__`; the MTP module reuses the base embedding table
    # and `lm_head` by reference, so only the MTP-specific parameters keep `requires_grad=True`.
    #
    # Plain comments are used instead of a class/`__init__` docstring because `@auto_docstring` is applied.
    _tied_weights_keys = [
        "lm_head.weight",
        "mtp_module.lm_head.weight",
        "mtp_module.embed_tokens.weight",
    ]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.model = SDARModel(config)
        self.mtp_module = SDARMTPModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(
            config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()
        # Freeze the backbone and the head: eval mode + requires_grad=False.
        # NOTE(review): a later `.train()` call on this model puts the frozen modules back into train mode
        # (gradients stay disabled) -- confirm that is acceptable for the layers used by `SDARModel`.
        freeze_module(self.model)
        freeze_module(self.lm_head)
        # Share (not copy) the embedding table and the head with the MTP module.
        self.mtp_module.set_input_embeddings(self.model.embed_tokens)
        self.mtp_module.set_output_embeddings(self.lm_head)

    def get_input_embeddings(self):
        """Return the embedding module of the base model."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the embedding module of the base model (the MTP module's reference is not updated)."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head (the MTP module's reference is not updated)."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the base model."""
        self.model = decoder

    def get_decoder(self):
        """Return the base model."""
        return self.model

    def prepare_for_bd_training(self, inputs_ids, position_ids, prompt_mask):
        """
        Build the doubled ("noisy" + "clean") inputs used by the block-diffusion training path.

        For every packed sample, each segment reported by `calculate_token_nums(position_ids)` is laid out
        twice in a row: first its noised copy, then its clean copy. The interleaving is encoded by
        `router_noisy_part`, a boolean row of length `2 * seq_len` that is True on noisy slots.

        Args:
            inputs_ids: token ids, shape `(bsz, seq_len)`.
            position_ids: position ids with the same shape; copied to both the noisy and the clean slots.
            prompt_mask: forwarded unchanged to `forward_add_noise_packed`.

        Returns:
            Tuple `(concat_inputs_ids, concat_position_ids, flex_attention_mask_3d, logits_to_keep_half,
            logits_to_keep, p_mask)`; the two `concat_*` tensors and `logits_to_keep` have shape
            `(bsz, 2 * seq_len)`. `logits_to_keep` is True only on noisy slots selected by
            `logits_to_keep_half`.

        NOTE(review): the exact semantics of `calculate_token_nums`, `forward_add_noise_packed` and
        `block_attn_mask` are defined outside this block -- verify there.
        """
        bsz, seq_len = inputs_ids.shape
        num_tokens = calculate_token_nums(position_ids)  # List[torch.Tensor]
        noisy_inputs_ids, logits_to_keep_half, p_mask = forward_add_noise_packed(
            inputs_ids=inputs_ids,
            num_tokens_list=num_tokens,
            prompt_mask=prompt_mask,
            mask_id=self.config.mask_token_id,
        )
        router_noisy_part_list = []
        for i in range(bsz):
            # Alternating True/False per half-segment (noisy, clean, noisy, clean, ...), then expanded so
            # that every half-segment spans as many positions as the segment has tokens.
            cur_router_noisy_part = (torch.arange(num_tokens[i].shape[0] * 2) % 2 == 0).to(inputs_ids.device)
            cur_router_noisy_part = cur_router_noisy_part.repeat_interleave(num_tokens[i].repeat_interleave(2))
            router_noisy_part_list.append(cur_router_noisy_part)
        router_noisy_part = torch.stack(router_noisy_part_list, dim=0)

        # concated inputs_ids: (bzs, seq_len x 2)
        concat_inputs_ids = inputs_ids.repeat(1, 2)
        # concated logits_to_keep: (bsz, seq_len x 2)
        logits_to_keep = torch.zeros(
            bsz, 2 * seq_len, dtype=torch.bool, device=inputs_ids.device)
        # concated position_ids: (bsz, seq_len x 2)
        concat_position_ids = torch.zeros(
            bsz, 2 * seq_len, dtype=position_ids.dtype, device=position_ids.device)
        for i in range(bsz):
            concat_inputs_ids[i][router_noisy_part[i]] = noisy_inputs_ids[i]
            concat_inputs_ids[i][~router_noisy_part[i]] = inputs_ids[i]

            logits_to_keep[i][router_noisy_part[i]] = logits_to_keep_half[i]

            concat_position_ids[i][router_noisy_part[i]] = position_ids[i]
            concat_position_ids[i][~router_noisy_part[i]] = position_ids[i]

        # create flex_attention mask
        attention_mask = block_attn_mask(num_tokens, self.config.block_size, inputs_ids.device)
        flex_attention_mask_3d = create_block_mask(
            lambda b, h, q_idx, kv_idx: attention_mask[b, q_idx, kv_idx],
            B=attention_mask.size(0), H=None,
            Q_LEN=attention_mask.size(1), KV_LEN=attention_mask.size(2),
        )

        return concat_inputs_ids, concat_position_ids, flex_attention_mask_3d, logits_to_keep_half, logits_to_keep, p_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, SDARForCausalLM

        >>> model = SDARForCausalLM.from_pretrained("DiffuOpen/SDAR-1.7B-Chat")
        >>> tokenizer = AutoTokenizer.from_pretrained("DiffuOpen/SDAR-1.7B-Chat")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        if self.training:
            # ---- Training: block-diffusion objective on the doubled (noisy + clean) sequence ----
            assert inputs_embeds is None, "only support input_ids during training"
            # Fail fast: without labels there is no loss, so do not pay for two forward passes first.
            assert labels is not None, "Labels must be provided for training."
            # Positions labelled -100 are treated as prompt (no loss, see `answer_len` below).
            prompt_mask = labels == -100
            position_ids = modify_padded_position_ids_2d(position_ids)
            concat_inputs_ids, concat_position_ids, flex_attention_mask_3d, logits_to_keep_half, logits_to_keep, p_mask = self.prepare_for_bd_training(input_ids, position_ids, prompt_mask)
            target_outputs: BaseModelOutputWithPast = self.model(
                input_ids=concat_inputs_ids,
                attention_mask=flex_attention_mask_3d,
                position_ids=concat_position_ids,
                past_key_values=past_key_values,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=True,
                cache_position=cache_position,
                **kwargs,
            )
            # The base model's last hidden state is the second input stream of the MTP module.
            concat_inputs_embeds = target_outputs.last_hidden_state
            outputs = self.mtp_module(
                input_ids=concat_inputs_ids,
                inputs_embeds=concat_inputs_embeds,
                attention_mask=flex_attention_mask_3d,
                position_ids=concat_position_ids,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=True,
                cache_position=cache_position,
                **kwargs,
            )
            hidden_states = outputs.last_hidden_state
            # Boolean-mask indexing flattens (bsz, 2 * seq_len, H) down to (num_kept, H).
            hidden_states = hidden_states[logits_to_keep].contiguous()
            answer_len = (labels != -100).sum()
            loss_fct = FusedLinearDiffusionCrossEntropyLoss(reduction='sum')
            # NOTE(review): an older comment here said the loss function returns
            # `(sum_loss, unreduced_loss)`, yet the result is divided as a single value below -- confirm
            # the actual return type of `FusedLinearDiffusionCrossEntropyLoss`.
            loss = loss_fct(
                # conduct `view(-1, V)` inside the function
                x=hidden_states,
                target=labels[logits_to_keep_half].contiguous(),
                weight=self.lm_head.weight,
                bias=self.lm_head.bias,
                p_mask=p_mask,
            )
            # Normalise the summed loss by the number of supervised (non -100) label positions.
            loss = loss / answer_len
            logits = None
        else:
            # ---- Inference / evaluation: base model first, then the MTP module on its hidden states ----
            # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
            # NOTE(review): `use_cache` is only forwarded to the MTP module; the base model falls back to
            # its own default -- confirm this is intended.
            target_outputs: BaseModelOutputWithPast = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=True,
                cache_position=cache_position,
                **kwargs,
            )
            inputs_embeds = target_outputs.last_hidden_state
            # NOTE(review): the base model and the MTP module receive the SAME `past_key_values` object --
            # confirm that their layer indices cannot collide inside the cache.
            outputs: BaseModelOutputWithPast = self.mtp_module(
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                inputs_embeds=inputs_embeds,
                use_cache=use_cache,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                cache_position=cache_position,
                **kwargs,
            )

            hidden_states = outputs.last_hidden_state
            # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
            # (with the default `logits_to_keep=0`, `slice(-0, None)` keeps every position)
            slice_indices = slice(-logits_to_keep,
                                  None) if isinstance(logits_to_keep, int) else logits_to_keep
            hidden_states = hidden_states[:, slice_indices, :].contiguous()
            fuse_linear_and_cross_entropy = self.config.fuse_cross_entropy and self.training
            if fuse_linear_and_cross_entropy:
                # When using fused_linear_ce_loss, we do not compute the whole logits on HBM
                logits = None
            else:
                logits = self.lm_head(hidden_states)

            loss = None
            if labels is not None:
                # FusedLinearCrossEntropyLoss will be implemented by monkey patch when training
                # We don't use it when inferencing
                loss_fct = nn.CrossEntropyLoss()  # nn.CE
                loss = loss_fct(
                    logits.view(-1, self.config.vocab_size), labels.view(-1))

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    @torch.no_grad()
    def block_diffusion_generate(
        self,
        prompt,
        mask_id,
        gen_length=128,
        block_length=8,
        denoising_steps=8,
        temperature=1.0,
        top_k=0,
        top_p=1.0,
        remasking_strategy='low_confidence_dynamic',
        confidence_threshold=0.85,
        eb_threshold=None,
        stopping_criteria_idx=None
    ):
        """
        Generate text block by block: every block starts fully masked and is denoised iteratively.

        The sequence buffer has batch size 1 and a length rounded up to a multiple of `block_length`.
        Blocks that lie entirely inside the prompt are prefilled into the KV cache in one pass; each
        remaining block is denoised for at most `denoising_steps + 1` iterations, and once it contains no
        `mask_id` its keys/values are committed to the cache (`store_kv=True`).

        Args:
            prompt: mapping with an `"input_ids"` tensor; it is written into a `(1, total_length)` buffer.
            mask_id: id of the mask token.
            gen_length: number of tokens to generate after the prompt.
            block_length: tokens per block.
            denoising_steps: steps per block; also passed to `get_num_transfer_tokens`.
            temperature, top_k, top_p: forwarded to `sample_with_temperature_topk_topp`.
            remasking_strategy: `'sequential'`, `'low_confidence_static'`, `'low_confidence_dynamic'` or
                `'entropy_bounded'`.
            confidence_threshold: used by `'low_confidence_dynamic'` only.
            eb_threshold: cumulative-entropy budget; required for `'entropy_bounded'`.
            stopping_criteria_idx: optional iterable of token ids; generation stops after the first block
                whose completion makes any of them appear behind the prompt.

        Returns:
            `torch.LongTensor` of shape `(1, total_length)` holding the prompt followed by the generated
            tokens. Blocks that were never reached (early stop) still contain `mask_id`.

        Raises:
            ValueError: unknown strategy, `'entropy_bounded'` without `eb_threshold`, or `'sequential'`
                called on a block without mask tokens.
        """
        if remasking_strategy == "entropy_bounded" and eb_threshold is None:
            # Previously this ended in an opaque failure inside `torch.tensor(None)`.
            raise ValueError(
                "`eb_threshold` must be provided when remasking_strategy='entropy_bounded'.")

        self.eval()
        input_ids = prompt['input_ids']
        prompt_length = input_ids.shape[1]
        past_key_values = DynamicCache()

        # Round prompt + generation up to a whole number of blocks.
        num_blocks = (prompt_length + gen_length +
                      block_length - 1) // block_length
        total_length = num_blocks * block_length

        # Block-causal mask: a token may attend to its own block and to every earlier block.
        block_mask = torch.tril(torch.ones(
            num_blocks, num_blocks, device=self.device))
        block_diffusion_attention_mask = block_mask.repeat_interleave(block_length, dim=0)\
            .repeat_interleave(block_length, dim=1).unsqueeze(0)
        position_ids = torch.arange(total_length, device=self.device).unsqueeze(0)

        x = torch.full((1, total_length), mask_id,
                       dtype=torch.long, device=self.device)
        x[:, :prompt_length] = input_ids
        # Only blocks lying completely inside the prompt are prefilled; a partially filled block is
        # handled by the decode loop with its prompt tokens already in place.
        prefill_blocks = prompt_length // block_length
        prefill_length = prefill_blocks * block_length

        # Prefill stage
        if prefill_length > 0:
            cur_x = x[:, :prefill_length]
            cur_attn_mask = block_diffusion_attention_mask[:,
                                                           :prefill_length, :prefill_length]
            cur_position_ids = position_ids[:, :prefill_length]
            self(cur_x,
                 attention_mask=cur_attn_mask,
                 position_ids=cur_position_ids,
                 past_key_values=past_key_values,
                 use_cache=True,
                 store_kv=True)

        num_transfer_tokens = get_num_transfer_tokens(
            block_length, denoising_steps)

        # Decode stage
        for num_block in range(prefill_blocks, num_blocks):
            cur_x = x[:, num_block*block_length:(num_block+1)*block_length].clone()
            cur_attn_mask = block_diffusion_attention_mask[
                :, num_block*block_length:(num_block+1)*block_length, :(num_block+1)*block_length
            ]
            cur_position_ids = position_ids[:, num_block *
                                            block_length:(num_block+1)*block_length]
            for step in range(denoising_steps + 1):
                mask_index = (cur_x == mask_id)
                if mask_index.sum() == 0:
                    # Store kv cache
                    self(cur_x,
                         attention_mask=cur_attn_mask,
                         position_ids=cur_position_ids,
                         past_key_values=past_key_values,
                         use_cache=True,
                         store_kv=True)
                    break

                # Denosing
                logits = self(cur_x,
                              attention_mask=cur_attn_mask,
                              position_ids=cur_position_ids,
                              past_key_values=past_key_values,
                              use_cache=True,
                              store_kv=False).logits

                # Sampling
                x0, x0_p = sample_with_temperature_topk_topp(
                    logits,
                    temperature=temperature,
                    top_k=top_k,
                    top_p=top_p
                )

                # Sampling strategy
                if remasking_strategy == 'sequential':
                    # Reveal the next `num_transfer_tokens[step]` positions starting at the first mask.
                    transfer_index = torch.zeros_like(x0, dtype=torch.bool)
                    for j in range(cur_x.shape[0]):
                        if mask_index[j].any():
                            first_mask_index = mask_index[j].nonzero(as_tuple=True)[
                                0].min().item()
                            transfer_index[j, first_mask_index:first_mask_index +
                                           num_transfer_tokens[step]] = True
                        else:
                            raise ValueError(
                                "No mask tokens found in the current block.")

                elif remasking_strategy == 'low_confidence_static':
                    # Reveal the `num_transfer_tokens[step]` most confident masked positions.
                    confidence = torch.where(mask_index, x0_p, -torch.inf)
                    transfer_index = torch.zeros_like(x0, dtype=torch.bool)
                    for j in range(confidence.shape[0]):
                        _, idx = torch.topk(
                            confidence[j], num_transfer_tokens[step])
                        transfer_index[j, idx] = True

                elif remasking_strategy == 'low_confidence_dynamic':
                    # Reveal every masked position above the threshold; fall back to top-k if too few.
                    confidence = torch.where(mask_index, x0_p, -torch.inf)
                    transfer_index = torch.zeros_like(x0, dtype=torch.bool)
                    for j in range(confidence.shape[0]):
                        high_conf_mask = confidence[j] > confidence_threshold
                        num_high_confidence = high_conf_mask.sum()
                        if num_high_confidence >= num_transfer_tokens[step]:
                            transfer_index[j] = high_conf_mask
                        else:
                            _, idx = torch.topk(
                                confidence[j], num_transfer_tokens[step])
                            transfer_index[j, idx] = True
                elif remasking_strategy == "entropy_bounded":
                    # Reveal the lowest-entropy masked positions while their cumulative entropy stays
                    # below `eb_threshold` (always at least one position).
                    # NOTE(review): `x0_p` is reduced over its last dimension here, which is a per-token
                    # entropy only if `x0_p` holds full distributions -- confirm the shape returned by
                    # `sample_with_temperature_topk_topp`.
                    # NOTE(review): this strategy reveals as little as one token per step, so a block may
                    # still contain masks after `denoising_steps + 1` iterations -- confirm the intended
                    # relation between `block_length` and `denoising_steps`.
                    eps = 1e-12
                    entropies = -(x0_p.clamp_min(eps) * (x0_p.clamp_min(eps)).log()).sum(dim=-1)
                    entropies = torch.where(mask_index, entropies, torch.inf)
                    ent_sorted, order = torch.sort(entropies, dim=1, descending=False)
                    cumsum = torch.cumsum(ent_sorted, dim=1)
                    # Fix: this buffer was never initialised in this branch (NameError on first use).
                    transfer_index = torch.zeros_like(x0, dtype=torch.bool)
                    # Fix: match the dtype of `cumsum`; `torch.searchsorted` needs both operands to agree.
                    threshold = torch.tensor(eb_threshold, device=x0_p.device, dtype=cumsum.dtype)
                    for j in range(x0_p.shape[0]):
                        k = torch.searchsorted(cumsum[j], threshold, right=False).item()
                        k = max(1, min(k, int(mask_index[j].sum().item())))
                        selected_token_indices = order[j, :k]
                        transfer_index[j, selected_token_indices] = True

                else:
                    raise ValueError(
                        f"Unknown remasking strategy: {remasking_strategy}")

                # Fix: only still-masked positions may be written. When fewer masks remain than the
                # requested k, `topk` also returns -inf (already decided) positions, which used to
                # overwrite prompt tokens or previously committed tokens.
                transfer_index = transfer_index & mask_index
                cur_x[transfer_index] = x0[transfer_index]

            x[:, num_block*block_length:(num_block+1)*block_length] = cur_x
            if stopping_criteria_idx is not None and any(stop_idx in x[:, prompt_length:] for stop_idx in stopping_criteria_idx):
                break

        return x
1783
+
1784
+ __all__ = [
1785
+ "SDARMTPForCausalLM",
1786
+ "SDARMTPModel",
1787
+ "SDARMTPPreTrainedModel",
1788
+ ]
special_tokens_map.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ "<MASK>"
17
+ ],
18
+ "eos_token": {
19
+ "content": "<|endoftext|>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "pad_token": {
26
+ "content": "<|endoftext|>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ }
32
+ }
tokenization_qwen2.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for Qwen2."""
16
+
17
+ import json
18
+ import os
19
+ import unicodedata
20
+ from functools import lru_cache
21
+ from typing import Optional, Tuple
22
+
23
+ import regex as re
24
+
25
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
26
+ from transformers.utils import logging
27
+
28
+
29
+ logger = logging.get_logger(__name__)
30
+
31
+ VOCAB_FILES_NAMES = {
32
+ "vocab_file": "vocab.json",
33
+ "merges_file": "merges.txt",
34
+ }
35
+
36
+
37
+ MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
38
+
39
+ PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
40
+
41
+
42
@lru_cache()
# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
def bytes_to_unicode():
    """
    Build the reversible byte -> unicode-character table used by byte-level BPE.

    Bytes that are already printable, non-whitespace characters map to themselves. Every other byte
    (control characters, whitespace, ...) is assigned, in ascending byte order, the next free code point
    starting at 256, so that the BPE code never has to deal with whitespace/control characters.

    Working on this table instead of raw unicode keeps the base vocabulary at 256 symbols while still
    being able to encode arbitrary text without UNK tokens.

    Returns:
        `dict[int, str]` with exactly 256 entries (one per byte value); the result is cached.
    """
    printable = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    # Printable bytes keep their own character.
    table = {code: chr(code) for code in printable}
    # Remaining bytes get shifted into the range starting at 2**8, in ascending order.
    shift = 0
    for byte in range(2**8):
        if byte not in table:
            table[byte] = chr(2**8 + shift)
            shift += 1
    return table
66
+
67
+
68
# Adapted from transformers.models.gpt2.tokenization_gpt2.get_pairs
def get_pairs(word):
    """
    Collect every pair of adjacent symbols in a word.

    Args:
        word: a non-empty sequence of symbols (symbols being variable-length strings), e.g. a tuple of
            strings or a plain string.

    Returns:
        `set`: the set of `(left, right)` tuples for each pair of neighbouring symbols; empty when the word
        has a single symbol.

    Raises:
        IndexError: if `word` is empty.
    """
    _ = word[0]  # an empty word is rejected with IndexError
    return set(zip(word, word[1:]))
81
+
82
+
83
class Qwen2Tokenizer(PreTrainedTokenizer):
    """
    Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding.

    As with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    ```python
    >>> from transformers import Qwen2Tokenizer

    >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
    >>> tokenizer("Hello world")["input_ids"]
    [9707, 1879]

    >>> tokenizer(" Hello world")["input_ids"]
    [21927, 1879]
    ```
    This is expected.

    You should not use GPT2Tokenizer instead, because of the different pretokenization rules.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        errors (`str`, *optional*, defaults to `"replace"`):
            Paradigm to follow when decoding bytes to UTF-8. See
            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not applicable for this tokenizer.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not the model should cleanup the spaces that were added when splitting the input text during the
            tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
        split_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not the special tokens should be split during the tokenization process. The default behavior is
            to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then
            `tokenizer.tokenize("<|endoftext|>")` gives `['<|endoftext|>']`. Otherwise, if
            `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will give `['<', '|', 'endo',
            'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        clean_up_tokenization_spaces=False,
        split_special_tokens=False,
        **kwargs,
    ):
        # Qwen vocab does not contain control tokens; added tokens need to be special
        bos_token = (
            AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
            if isinstance(bos_token, str)
            else bos_token
        )
        eos_token = (
            AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
            if isinstance(eos_token, str)
            else eos_token
        )
        unk_token = (
            AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
            if isinstance(unk_token, str)
            else unk_token
        )
        pad_token = (
            AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
            if isinstance(pad_token, str)
            else pad_token
        )

        # token string -> id, and the inverse id -> token string
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        bpe_merges = []
        with open(merges_file, encoding="utf-8") as merges_handle:
            for i, line in enumerate(merges_handle):
                line = line.strip()
                # Skip the optional "#version:" header on the first line and any blank line.
                if (i == 0 and line.startswith("#version:")) or not line:
                    continue
                bpe_merges.append(tuple(line.split()))
        # merge pair -> rank; a lower rank means the merge is applied earlier in `bpe`
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        # NOTE: the cache can grow without bound and will get really large for long running processes
        # (esp. for texts of language that do not use space between word, e.g. Chinese); technically
        # not a memory leak but appears as one.
        # GPT2Tokenizer has the same problem, so let's be consistent.
        self.cache = {}

        self.pat = re.compile(PRETOKENIZE_REGEX)

        if kwargs.get("add_prefix_space", False):
            # `__name__` (with trailing underscores) is required here: inside a class body `__name` is
            # name-mangled to `_Qwen2Tokenizer__name`, which does not exist and raised AttributeError.
            logger.warning_once(
                f"{self.__class__.__name__} does not support `add_prefix_space`, setting it to True has no effect."
            )

        super().__init__(
            errors=errors,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            split_special_tokens=split_special_tokens,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        """Number of entries in the base vocabulary loaded from `vocab_file` (added tokens are not counted)."""
        return len(self.encoder)

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab
    def get_vocab(self):
        """Return a new dict combining the base vocabulary with `self.added_tokens_encoder`."""
        return dict(self.encoder, **self.added_tokens_encoder)

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
    def bpe(self, token):
        """
        Apply the learned BPE merges to one pre-tokenized, byte-encoded token.

        Args:
            token (`str`): the token, already mapped through `self.byte_encoder`.

        Returns:
            `str`: the resulting sub-word symbols joined by single spaces. Results are memoized in `self.cache`;
            a token with no symbol pairs (a single character) is returned unchanged and is not cached.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Pick the adjacent pair with the lowest merge rank; unknown pairs rank as +inf.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # No further occurrence of `first`: copy the remainder and stop scanning.
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize
    def _tokenize(self, text):
        """Tokenize a string."""
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = "".join(
                self.byte_encoder[b] for b in token.encode("utf-8")
            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id
    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index)

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string
    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        text = "".join(tokens)
        # Undo the byte -> unicode mapping, then decode the raw bytes using the configured error policy.
        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        return text

    def decode(
        self,
        token_ids,
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = False,
        spaces_between_special_tokens: bool = False,
        **kwargs,
    ) -> str:
        """
        Decode token ids to text by delegating to the superclass `decode`.

        The override exists only to change defaults: `spaces_between_special_tokens` and
        `clean_up_tokenization_spaces` both default to `False` here. All arguments are forwarded unchanged.
        """
        # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers
        # and cannot be configured elsewhere, but it should default to False for Qwen2Tokenizer
        return super().decode(
            token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            spaces_between_special_tokens=spaces_between_special_tokens,
            **kwargs,
        )

    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the vocabulary (JSON) and the BPE merges (text) into `save_directory`.

        Args:
            save_directory (`str`): an existing directory to write into.
            filename_prefix (`str`, *optional*): when given, prepended to both file names followed by `-`.

        Returns:
            `Tuple[str]`: `(vocab_file, merge_file)` paths. If `save_directory` is not a directory, an error is
            logged and `None` is returned instead.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            # Merges are written in rank order; a gap in the ranks is reported but does not abort the save.
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file

    def prepare_for_tokenization(self, text, **kwargs):
        """Normalize `text` to Unicode NFC and return it together with the untouched `kwargs`."""
        text = unicodedata.normalize("NFC", text)
        return (text, kwargs)
340
+
341
+
342
+ __all__ = ["Qwen2Tokenizer"]
tokenization_qwen2_fast.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for Qwen2."""
16
+
17
+ from typing import Optional, Tuple
18
+
19
+ from transformers.tokenization_utils import AddedToken
20
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
21
+ from transformers.utils import logging
22
+ from .tokenization_qwen2 import Qwen2Tokenizer
23
+
24
+
25
# Module-level logger obtained from the transformers logging utilities.
logger = logging.get_logger(__name__)

# Constructor-argument name -> file name for the vocabulary, the BPE merges and the serialized fast tokenizer.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}

# Maximum input length keyed by checkpoint name (not referenced elsewhere in this module).
MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
35
+
36
+
37
class Qwen2TokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
    Byte-Pair-Encoding.

    As with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
    be encoded differently whether it is at the beginning of the sentence (without space) or not:

    ```python
    >>> from transformers import Qwen2TokenizerFast

    >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
    >>> tokenizer("Hello world")["input_ids"]
    [9707, 1879]

    >>> tokenizer(" Hello world")["input_ids"]
    [21927, 1879]
    ```
    This is expected.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            Path to the vocabulary file.
        merges_file (`str`, *optional*):
            Path to the merges file.
        tokenizer_file (`str`, *optional*):
            Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
            contains everything needed to load the tokenizer.
        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead. Not applicable to this tokenizer.
        bos_token (`str`, *optional*):
            The beginning of sequence token. Not applicable for this tokenizer.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The token used for padding, for example when batching sequences of different lengths.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = Qwen2Tokenizer

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token=None,
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",
        **kwargs,
    ):
        # We need to at least pass vocab_file and merges_file to base class
        # in case a slow tokenizer needs to be initialized; other can be
        # configured through files.
        # following GPT2TokenizerFast, also adding unk_token, bos_token, and eos_token

        def _as_special(token):
            """Wrap a plain string in a special, non-normalized `AddedToken`; pass anything else through."""
            if not isinstance(token, str):
                return token
            return AddedToken(token, lstrip=False, rstrip=False, special=True, normalized=False)

        wrapped = {
            "bos_token": _as_special(bos_token),
            "eos_token": _as_special(eos_token),
            "unk_token": _as_special(unk_token),
            "pad_token": _as_special(pad_token),
        }

        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=wrapped["unk_token"],
            bos_token=wrapped["bos_token"],
            eos_token=wrapped["eos_token"],
            pad_token=wrapped["pad_token"],
            **kwargs,
        )

    # Adapted from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the backend tokenizer's model files into `save_directory`.

        Delegates to `self._tokenizer.model.save`, passing `filename_prefix` as `name`, and returns whatever
        paths that call reports as a tuple.
        """
        return tuple(self._tokenizer.model.save(save_directory, name=filename_prefix))
135
+
136
+
137
+ __all__ = ["Qwen2TokenizerFast"]
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "151645": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "151646": {
31
+ "content": "<|object_ref_start|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|object_ref_end|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "151648": {
47
+ "content": "<|box_start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "151649": {
55
+ "content": "<|box_end|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "151665": {
183
+ "content": "<tool_response>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": false
189
+ },
190
+ "151666": {
191
+ "content": "</tool_response>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": false
197
+ },
198
+ "151667": {
199
+ "content": "<think>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": false
205
+ },
206
+ "151668": {
207
+ "content": "</think>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": false
213
+ },
214
+ "151669": {
215
+ "content": "<|MASK|>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": false
221
+ }
222
+ },
223
+ "additional_special_tokens": [
224
+ "<|im_start|>",
225
+ "<|im_end|>",
226
+ "<|object_ref_start|>",
227
+ "<|object_ref_end|>",
228
+ "<|box_start|>",
229
+ "<|box_end|>",
230
+ "<|quad_start|>",
231
+ "<|quad_end|>",
232
+ "<|vision_start|>",
233
+ "<|vision_end|>",
234
+ "<|vision_pad|>",
235
+ "<|image_pad|>",
236
+ "<|video_pad|>",
237
+ "<|MASK|>"
238
+ ],
239
+ "auto_map": {
240
+ "AutoTokenizer": [
241
+ "tokenization_qwen2.Qwen2Tokenizer",
242
+ null
243
+ ]
244
+ },
245
+ "bos_token": null,
246
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = 
message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
247
+ "clean_up_tokenization_spaces": false,
248
+ "eos_token": "<|endoftext|>",
249
+ "mask_token": "<|MASK|>",
250
+ "errors": "replace",
251
+ "model_max_length": 131072,
252
+ "pad_token": "<|endoftext|>",
253
+ "split_special_tokens": false,
254
+ "tokenizer_class": "Qwen2Tokenizer",
255
+ "unk_token": null
256
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff