joerowell committed on
Commit
5e8a44b
·
verified ·
1 Parent(s): 0b85bdd

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. chat_template.jinja +132 -0
  2. config.json +202 -0
  3. configuration_laguna.py +245 -0
  4. generation_config.json +13 -0
  5. model-00001-of-00089.safetensors +3 -0
  6. model-00002-of-00089.safetensors +3 -0
  7. model-00003-of-00089.safetensors +3 -0
  8. model-00004-of-00089.safetensors +3 -0
  9. model-00005-of-00089.safetensors +3 -0
  10. model-00006-of-00089.safetensors +3 -0
  11. model-00007-of-00089.safetensors +3 -0
  12. model-00008-of-00089.safetensors +3 -0
  13. model-00009-of-00089.safetensors +3 -0
  14. model-00010-of-00089.safetensors +3 -0
  15. model-00011-of-00089.safetensors +3 -0
  16. model-00012-of-00089.safetensors +3 -0
  17. model-00013-of-00089.safetensors +3 -0
  18. model-00014-of-00089.safetensors +3 -0
  19. model-00015-of-00089.safetensors +3 -0
  20. model-00016-of-00089.safetensors +3 -0
  21. model-00017-of-00089.safetensors +3 -0
  22. model-00018-of-00089.safetensors +3 -0
  23. model-00019-of-00089.safetensors +3 -0
  24. model-00020-of-00089.safetensors +3 -0
  25. model-00021-of-00089.safetensors +3 -0
  26. model-00022-of-00089.safetensors +3 -0
  27. model-00023-of-00089.safetensors +3 -0
  28. model-00024-of-00089.safetensors +3 -0
  29. model-00025-of-00089.safetensors +3 -0
  30. model-00026-of-00089.safetensors +3 -0
  31. model-00027-of-00089.safetensors +3 -0
  32. model-00028-of-00089.safetensors +3 -0
  33. model-00029-of-00089.safetensors +3 -0
  34. model-00030-of-00089.safetensors +3 -0
  35. model-00031-of-00089.safetensors +3 -0
  36. model-00032-of-00089.safetensors +3 -0
  37. model-00033-of-00089.safetensors +3 -0
  38. model-00034-of-00089.safetensors +3 -0
  39. model-00035-of-00089.safetensors +3 -0
  40. model-00036-of-00089.safetensors +3 -0
  41. model-00037-of-00089.safetensors +3 -0
  42. model-00038-of-00089.safetensors +3 -0
  43. model-00039-of-00089.safetensors +3 -0
  44. model-00040-of-00089.safetensors +3 -0
  45. model-00041-of-00089.safetensors +3 -0
  46. model.safetensors.index.json +0 -0
  47. modeling_laguna.py +879 -0
  48. special_tokens_map.json +9 -0
  49. tokenizer.json +0 -0
  50. tokenizer_config.json +576 -0
chat_template.jinja ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {#- Copied from laguna_glm_thinking_v4/chat_template.jinja -#}
2
+ {#- Removes prefix that references <think> token, and replaces message.reasoning_content reference with message.reasoning -#}
3
+ {{- "〈|EOS|〉" -}}
4
+ {%- set enable_thinking = enable_thinking | default(false) -%}
5
+ {%- set render_assistant_messages_raw = render_assistant_messages_raw | default(false) -%}
6
+ {%- set add_generation_prompt = add_generation_prompt | default(false) -%}
7
+
8
+ {#- ───── header (system message) ───── -#}
9
+ {%- set system_message = "" -%}
10
+ {%- if messages and messages[0].role == "system" -%}
11
+ {%- set system_message = messages[0].content -%}
12
+ {%- endif -%}
13
+
14
+ {%- if (system_message and system_message.strip()) or tools -%}
15
+ {{- "<system>\n" -}}
16
+
17
+ {%- if system_message and system_message.strip() -%}
18
+ {{- "\n" -}}
19
+ {{- system_message.rstrip() -}}
20
+ {%- endif -%}
21
+
22
+ {%- if tools -%}
23
+ {{- "\n\n### Tools\n\n" -}}
24
+ {%- set ns = namespace(tool_string="You may call functions to assist with the user query.\n"
25
+ ~ "All available function signatures are listed below:\n"
26
+ ~ "<available_tools>\n") -%}
27
+ {%- for tool in tools -%}
28
+ {%- set ns.tool_string = ns.tool_string ~ (tool | tojson) ~ "\n" -%}
29
+ {%- endfor -%}
30
+ {%- if enable_thinking -%}
31
+ {%- set tool_string = ns.tool_string + "</available_tools>\n\n" ~
32
+ "Wrap your thinking in '<think>', '</think>' tags, followed by a function call. For each function call, return an unescaped XML-like object with function name and arguments within '<tool_call>' and '</tool_call>' tags, like here:\n" ~
33
+ "<think> your thoughts here </think>\n" ~
34
+ "<tool_call>function-name\n<arg_key>argument-key</arg_key>\n<arg_value>value-of-argument-key</arg_value>\n" ~
35
+ "</tool_call>" -%}
36
+ {%- else -%}
37
+ {%- set tool_string = ns.tool_string + "</available_tools>\n\n" ~
38
+ "For each function call, return an unescaped XML-like object " ~
39
+ "with function name and arguments within '<tool_call>' and '</tool_call>' tags, like here:\n" ~
40
+ "<tool_call>function-name\n<arg_key>argument-key</arg_key>\n<arg_value>value-of-argument-key</arg_value>\n" ~
41
+ "</tool_call>" -%}
42
+ {%- endif -%}
43
+ {{- tool_string -}}
44
+ {%- endif -%}
45
+
46
+ {{- "\n</system>\n" -}}
47
+ {%- endif -%}
48
+
49
+ {#- ───── main loop ───── -#}
50
+ {%- for message in messages -%}
51
+ {%- set content = message.content if message.content is string else "" -%}
52
+ {%- if message.role == "user" -%}
53
+ {{- "<user>\n" + content + "\n</user>\n" -}}
54
+ {%- elif message.role == "assistant" -%}
55
+ {%- generation -%}
56
+ {{- "<assistant>\n" -}}
57
+ {%- if render_assistant_messages_raw -%}
58
+ {#- Raw mode: prepend the generation prompt token, then dump content verbatim. -#}
59
+ {#- The generation prompt is <think> when enable_thinking, </think> otherwise. -#}
60
+ {#- Only prepend if content doesn't already start with it. -#}
61
+ {%- if enable_thinking -%}
62
+ {%- if not content.startswith('<think>') -%}
63
+ {{- '<think>' -}}
64
+ {%- endif -%}
65
+ {%- else -%}
66
+ {%- if not content.startswith('</think>') -%}
67
+ {{- '</think>' -}}
68
+ {%- endif -%}
69
+ {%- endif -%}
70
+ {{- content -}}
71
+ {#- Append closing tag if content doesn't already end with it. -#}
72
+ {%- if not content.endswith('</assistant>\n') and not content.endswith('</assistant>') -%}
73
+ {{- '\n</assistant>' -}}
74
+ {%- endif -%}
75
+ {{- "\n" -}}
76
+ {%- else -%}
77
+ {#- Extract reasoning content from message.reasoning (vLLM field name) or message.reasoning_content, or from <think> tags -#}
78
+ {%- set reasoning_content = '' %}
79
+ {%- if message.reasoning is string %}
80
+ {%- set reasoning_content = message.reasoning %}
81
+ {%- elif message.reasoning_content is string %}
82
+ {%- set reasoning_content = message.reasoning_content %}
83
+ {%- endif %}
84
+ {#- Always strip <think> tags from content if present to avoid duplication -#}
85
+ {%- if '</think>' in content %}
86
+ {%- if not reasoning_content %}
87
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
88
+ {%- endif %}
89
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
90
+ {%- endif %}
91
+ {#- Display reasoning content for all messages -#}
92
+ {%- if reasoning_content -%}
93
+ {{- '<think>\n' + reasoning_content.strip() + '\n</think>\n' -}}
94
+ {%- else -%}
95
+ {{- '</think>\n' -}}
96
+ {%- endif -%}
97
+ {#- Display main content -#}
98
+ {%- if content.strip() -%}
99
+ {{- content.strip() ~ "\n" -}}
100
+ {%- endif -%}
101
+ {%- if message.tool_calls -%}
102
+ {%- for tool_call in message.tool_calls -%}
103
+ {%- set function_data = tool_call.function -%}
104
+ {{- '<tool_call>' + function_data.name }}
105
+ {% set _args = function_data.arguments %}
106
+ {%- for k, v in _args.items() -%}
107
+ {{- "<arg_key>" ~ k ~ "</arg_key>\n" -}}
108
+ {{- "<arg_value>"}}{{ v | tojson(ensure_ascii=False) if v is not string else v }}{{ "</arg_value>\n" -}}
109
+ {%- endfor -%}
110
+ {{- "</tool_call>\n" -}}
111
+ {%- endfor -%}
112
+ {%- endif -%}
113
+ {{- "</assistant>\n" -}}
114
+ {%- endif -%}
115
+ {%- endgeneration -%}
116
+ {%- elif message.role == "tool" -%}
117
+ {{- "<tool_response>\n" + content + "\n</tool_response>\n" -}}
118
+ {%- elif message.role == "system" and loop.index0 != 0 -%}
119
+ {#- Render additional system messages (skip the first one which is handled separately in the header) -#}
120
+ {{- "<system>\n" + content + "\n</system>\n" -}}
121
+ {%- endif -%}
122
+ {%- endfor -%}
123
+ {#- ───── generation prompt ───── -#}
124
+ {%- if add_generation_prompt -%}
125
+ {{- "<assistant>\n" -}}
126
+ {#- ───── Include reasoning mode directive ───── -#}
127
+ {%- if not enable_thinking %}
128
+ {{- '</think>' -}}
129
+ {%- else %}
130
+ {{- '<think>' -}}
131
+ {%- endif %}
132
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LagunaForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_laguna.LagunaConfig",
7
+ "AutoModelForCausalLM": "modeling_laguna.LagunaForCausalLM"
8
+ },
9
+ "model_type": "laguna",
10
+ "vocab_size": 100352,
11
+ "hidden_size": 4096,
12
+ "intermediate_size": 16384,
13
+ "num_hidden_layers": 70,
14
+ "num_attention_heads": 64,
15
+ "num_key_value_heads": 8,
16
+ "head_dim": 128,
17
+ "max_position_embeddings": 131072,
18
+ "attention_bias": false,
19
+ "attention_dropout": 0.0,
20
+ "rms_norm_eps": 1e-06,
21
+ "num_experts": 256,
22
+ "num_experts_per_tok": 16,
23
+ "moe_intermediate_size": 1024,
24
+ "shared_expert_intermediate_size": 1024,
25
+ "norm_topk_prob": true,
26
+ "router_aux_loss_coef": 0.0,
27
+ "decoder_sparse_step": 1,
28
+ "mlp_only_layers": [
29
+ 0,
30
+ 1,
31
+ 2
32
+ ],
33
+ "bos_token_id": 2,
34
+ "eos_token_id": [
35
+ 2,
36
+ 24
37
+ ],
38
+ "pad_token_id": 9,
39
+ "tie_word_embeddings": false,
40
+ "use_cache": true,
41
+ "torch_dtype": "bfloat16",
42
+ "gating": true,
43
+ "sliding_window": 0,
44
+ "rope_parameters": {
45
+ "full_attention": {
46
+ "rope_theta": 500000.0,
47
+ "rope_type": "yarn",
48
+ "factor": 32.0,
49
+ "original_max_position_embeddings": 4096,
50
+ "beta_slow": 1.0,
51
+ "beta_fast": 64.0,
52
+ "attention_factor": 1.0,
53
+ "partial_rotary_factor": 1.0
54
+ }
55
+ },
56
+ "moe_apply_router_weight_on_input": false,
57
+ "mlp_layer_types": [
58
+ "dense",
59
+ "dense",
60
+ "dense",
61
+ "sparse",
62
+ "sparse",
63
+ "sparse",
64
+ "sparse",
65
+ "sparse",
66
+ "sparse",
67
+ "sparse",
68
+ "sparse",
69
+ "sparse",
70
+ "sparse",
71
+ "sparse",
72
+ "sparse",
73
+ "sparse",
74
+ "sparse",
75
+ "sparse",
76
+ "sparse",
77
+ "sparse",
78
+ "sparse",
79
+ "sparse",
80
+ "sparse",
81
+ "sparse",
82
+ "sparse",
83
+ "sparse",
84
+ "sparse",
85
+ "sparse",
86
+ "sparse",
87
+ "sparse",
88
+ "sparse",
89
+ "sparse",
90
+ "sparse",
91
+ "sparse",
92
+ "sparse",
93
+ "sparse",
94
+ "sparse",
95
+ "sparse",
96
+ "sparse",
97
+ "sparse",
98
+ "sparse",
99
+ "sparse",
100
+ "sparse",
101
+ "sparse",
102
+ "sparse",
103
+ "sparse",
104
+ "sparse",
105
+ "sparse",
106
+ "sparse",
107
+ "sparse",
108
+ "sparse",
109
+ "sparse",
110
+ "sparse",
111
+ "sparse",
112
+ "sparse",
113
+ "sparse",
114
+ "sparse",
115
+ "sparse",
116
+ "sparse",
117
+ "sparse",
118
+ "sparse",
119
+ "sparse",
120
+ "sparse",
121
+ "sparse",
122
+ "sparse",
123
+ "sparse",
124
+ "sparse",
125
+ "sparse",
126
+ "sparse",
127
+ "sparse"
128
+ ],
129
+ "gating_types": [
130
+ "per_element",
131
+ "per_element",
132
+ "per_element",
133
+ "per_element",
134
+ "per_element",
135
+ "per_element",
136
+ "per_element",
137
+ "per_element",
138
+ "per_element",
139
+ "per_element",
140
+ "per_element",
141
+ "per_element",
142
+ "per_element",
143
+ "per_element",
144
+ "per_element",
145
+ "per_element",
146
+ "per_element",
147
+ "per_element",
148
+ "per_element",
149
+ "per_element",
150
+ "per_element",
151
+ "per_element",
152
+ "per_element",
153
+ "per_element",
154
+ "per_element",
155
+ "per_element",
156
+ "per_element",
157
+ "per_element",
158
+ "per_element",
159
+ "per_element",
160
+ "per_element",
161
+ "per_element",
162
+ "per_element",
163
+ "per_element",
164
+ "per_element",
165
+ "per_element",
166
+ "per_element",
167
+ "per_element",
168
+ "per_element",
169
+ "per_element",
170
+ "per_element",
171
+ "per_element",
172
+ "per_element",
173
+ "per_element",
174
+ "per_element",
175
+ "per_element",
176
+ "per_element",
177
+ "per_element",
178
+ "per_element",
179
+ "per_element",
180
+ "per_element",
181
+ "per_element",
182
+ "per_element",
183
+ "per_element",
184
+ "per_element",
185
+ "per_element",
186
+ "per_element",
187
+ "per_element",
188
+ "per_element",
189
+ "per_element",
190
+ "per_element",
191
+ "per_element",
192
+ "per_element",
193
+ "per_element",
194
+ "per_element",
195
+ "per_element",
196
+ "per_element",
197
+ "per_element",
198
+ "per_element",
199
+ "per_element"
200
+ ],
201
+ "moe_routed_scaling_factor": 1.0
202
+ }
configuration_laguna.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Poolside and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from transformers.configuration_utils import PreTrainedConfig
15
+ from transformers.modeling_rope_utils import RopeParameters
16
+ from transformers.utils.import_utils import is_causal_conv1d_available, is_flash_linear_attention_available
17
+
18
+
19
+ class LagunaConfig(PreTrainedConfig):
20
+ r"""
21
+ Configuration class for Laguna model.
22
+
23
+ Laguna is Poolside's MoE architecture with:
24
+ - Attention output gating (softplus gate)
25
+ - Sigmoid routing instead of softmax
26
+ - No QKV bias
27
+ - Explicit head_dim parameter
28
+
29
+ Args:
30
+ head_dim (`int`, *optional*, defaults to 128):
31
+ Dimension of attention heads. Laguna uses explicit head_dim rather than
32
+ computing it from hidden_size // num_attention_heads.
33
+ qkv_bias (`bool`, *optional*, defaults to `False`):
34
+ Whether to add bias to QKV projections. Laguna uses no QKV bias.
35
+ attention_bias (`bool`, *optional*, defaults to `False`):
36
+ Whether to add bias to attention output projection. Laguna uses no attention bias.
37
+ gating (`bool` or `str`, *optional*, defaults to `True`):
38
+ Attention output gating mode. When ``True`` or ``"per-element"`` a g_proj
39
+ linear layer with output size ``num_attention_heads * head_dim`` is added
40
+ and ``attn_output = attn_output * softplus(g_proj(x))``. When ``"per-head"``
41
+ g_proj has output size ``num_attention_heads`` and the gate broadcasts across
42
+ ``head_dim``. When ``False`` no gating is applied.
43
+ partial_rotary_factor (`float`, *optional*):
44
+ Fraction of head_dim to apply rotary embeddings to. When set, this value is
45
+ injected into ``rope_parameters`` (and ``swa_rope_parameters``) if not already
46
+ specified there. When ``None`` the default behaviour of the rope implementation
47
+ is used (typically full rotary).
48
+ num_attention_heads_per_layer (`list[int]`, *optional*):
49
+ Optional per-layer override for ``num_attention_heads``. When provided the list
50
+ length must equal ``num_hidden_layers`` and each entry is the head count used by
51
+ that layer. When ``None`` every layer uses ``num_attention_heads``.
52
+ vocab_size (`int`, *optional*, defaults to 100352):
53
+ Vocabulary size of the Laguna model.
54
+ hidden_size (`int`, *optional*, defaults to 2048):
55
+ Dimension of the hidden representations.
56
+ intermediate_size (`int`, *optional*, defaults to 8192):
57
+ Dimension of the MLP representations for dense layers.
58
+ num_hidden_layers (`int`, *optional*, defaults to 48):
59
+ Number of hidden layers in the Transformer.
60
+ num_attention_heads (`int`, *optional*, defaults to 32):
61
+ Number of attention heads.
62
+ num_key_value_heads (`int`, *optional*, defaults to 8):
63
+ Number of key-value heads for GQA.
64
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
65
+ Maximum sequence length.
66
+ rms_norm_eps (`float`, *optional*, defaults to 1e-6):
67
+ Epsilon for RMSNorm layers.
68
+ sliding_window (`int`, *optional*):
69
+ Sliding window attention size. Used by layers whose type in ``layer_types``
70
+ is ``"sliding_attention"``. When ``None``, all layers use full attention.
71
+ layer_types (`list[str]`, *optional*):
72
+ Per-layer attention type. Each element should be ``"sliding_attention"`` or
73
+ ``"full_attention"``. Length must equal ``num_hidden_layers``. When ``None``,
74
+ all layers default to global attention.
75
+ swa_attention_sink_enabled (`bool`, *optional*, defaults to `False`):
76
+ Whether to enable learnable attention sinks on sliding-window attention layers.
77
+ When enabled, a per-head bias parameter is added that allows the model to attend
78
+ to position 0 even when it falls outside the sliding window.
79
+ swa_rope_parameters (`RopeParameters`, *optional*):
80
+ Separate RoPE configuration for sliding-window attention layers. When ``None``,
81
+ SWA layers use the same RoPE as global attention layers.
82
+ num_experts (`int`, *optional*, defaults to 256):
83
+ Number of routed experts.
84
+ num_experts_per_tok (`int`, *optional*, defaults to 16):
85
+ Number of experts selected per token (top-k).
86
+ moe_intermediate_size (`int`, *optional*, defaults to 1024):
87
+ Intermediate size of routed experts.
88
+ shared_expert_intermediate_size (`int`, *optional*, defaults to 1024):
89
+ Intermediate size of the shared expert.
90
+ norm_topk_prob (`bool`, *optional*, defaults to `True`):
91
+ Whether to normalize top-k routing probabilities.
92
+ decoder_sparse_step (`int`, *optional*, defaults to 1):
93
+ Frequency of MoE layers (1 = every layer is MoE after mlp_only_layers).
94
+ mlp_only_layers (`list[int]`, *optional*, defaults to `[0]`):
95
+ Layer indices that use dense MLP instead of MoE.
96
+ router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
97
+ Auxiliary loss coefficient for load balancing.
98
+ moe_routed_scaling_factor (`float`, *optional*, defaults to 1.0):
99
+ Scalar multiplier applied to the routed-expert output before combining with the
100
+ shared-expert output.
101
+ moe_apply_router_weight_on_input (`bool`, *optional*, defaults to `False`):
102
+ When ``True`` the top-k routing weights are multiplied into each expert's input
103
+ rather than its output. Matches the numerical form used by the trained checkpoint.
104
+ moe_router_logit_softcapping (`float`, *optional*, defaults to 0.0):
105
+ Optional soft-capping value ``c`` applied to router logits as
106
+ ``x = tanh(x / c) * c`` before sigmoid + top-k. Disabled when ``0``.
107
+ rope_parameters (`RopeParameters`, *optional*):
108
+ RoPE configuration. When ``None``, defaults to ``{"rope_type": "default", "rope_theta": 500000.0}``.
109
+ """
110
+
111
+ model_type = "laguna"
112
+ keys_to_ignore_at_inference = ["past_key_values"]
113
+ # PreTrainedConfig in transformers v5 no longer auto-declares these; subclasses
114
+ # opt in by providing class-level annotations with defaults.
115
+ pad_token_id: int | None = None
116
+ bos_token_id: int | None = None
117
+ eos_token_id: int | list[int] | None = None
118
+ base_model_tp_plan = {
119
+ "layers.*.self_attn.q_proj": "colwise",
120
+ "layers.*.self_attn.k_proj": "colwise",
121
+ "layers.*.self_attn.v_proj": "colwise",
122
+ "layers.*.self_attn.g_proj": "colwise", # Laguna-specific gating projection
123
+ "layers.*.self_attn.o_proj": "rowwise",
124
+ "layers.*.mlp.gate_proj": "colwise",
125
+ "layers.*.mlp.up_proj": "colwise",
126
+ "layers.*.mlp.down_proj": "rowwise",
127
+ }
128
+ base_model_pp_plan = {
129
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
130
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
131
+ "norm": (["hidden_states"], ["hidden_states"]),
132
+ }
133
+
134
+ def __init__(
135
+ self,
136
+ vocab_size: int = 100352,
137
+ hidden_size: int = 2048,
138
+ intermediate_size: int = 8192,
139
+ num_hidden_layers: int = 48,
140
+ num_attention_heads: int = 32,
141
+ num_key_value_heads: int = 8,
142
+ head_dim: int = 128,
143
+ qkv_bias: bool = False,
144
+ attention_bias: bool = False,
145
+ gating: bool | str = True,
146
+ hidden_act: str = "silu",
147
+ max_position_embeddings: int = 4096,
148
+ initializer_range: float = 0.02,
149
+ rms_norm_eps: float = 1e-6,
150
+ use_cache: bool = True,
151
+ tie_word_embeddings: bool = False,
152
+ rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
153
+ partial_rotary_factor: float | None = None,
154
+ attention_dropout: float = 0.0,
155
+ sliding_window: int | None = None,
156
+ layer_types: list[str] | None = None,
157
+ num_attention_heads_per_layer: list[int] | None = None,
158
+ swa_attention_sink_enabled: bool = False,
159
+ swa_rope_parameters: RopeParameters | None = None,
160
+ num_experts: int = 256,
161
+ num_experts_per_tok: int = 16,
162
+ moe_intermediate_size: int = 1024,
163
+ shared_expert_intermediate_size: int = 1024,
164
+ norm_topk_prob: bool = True,
165
+ decoder_sparse_step: int = 1,
166
+ mlp_only_layers: list[int] | None = None,
167
+ router_aux_loss_coef: float = 0.001,
168
+ moe_routed_scaling_factor: float = 1.0,
169
+ moe_apply_router_weight_on_input: bool = False,
170
+ moe_router_logit_softcapping: float = 0.0,
171
+ output_router_logits: bool = False,
172
+ **kwargs,
173
+ ):
174
+ # Default mlp_only_layers: first layer is dense (moe_first_k_dense_replace=1)
175
+ if mlp_only_layers is None:
176
+ mlp_only_layers = [0]
177
+
178
+ # Default layer_types: all layers use full attention (Laguna-M). Laguna-XS
179
+ # ships an explicit list with a mix of "full_attention" and "sliding_attention".
180
+ # Downstream mask builders (``create_masks_for_generate``) iterate
181
+ # ``layer_types``, so it must be a list — not left as ``None``.
182
+ if layer_types is None:
183
+ layer_types = ["full_attention"] * num_hidden_layers
184
+
185
+ # Default rope_parameters with Laguna's theta
186
+ if rope_parameters is None:
187
+ rope_parameters = {"rope_type": "default", "rope_theta": 500000.0}
188
+
189
+ # If ``partial_rotary_factor`` is set at the top level, inject it into any
190
+ # rope dict that does not already carry one so the rotary embedding picks
191
+ # it up consistently for both full-attention and SWA layers.
192
+ if partial_rotary_factor is not None:
193
+ if isinstance(rope_parameters, dict) and "partial_rotary_factor" not in rope_parameters:
194
+ rope_parameters = {**rope_parameters, "partial_rotary_factor": partial_rotary_factor}
195
+ if (
196
+ isinstance(swa_rope_parameters, dict)
197
+ and "partial_rotary_factor" not in swa_rope_parameters
198
+ ):
199
+ swa_rope_parameters = {
200
+ **swa_rope_parameters,
201
+ "partial_rotary_factor": partial_rotary_factor,
202
+ }
203
+
204
+ self.vocab_size = vocab_size
205
+ self.hidden_size = hidden_size
206
+ self.intermediate_size = intermediate_size
207
+ self.num_hidden_layers = num_hidden_layers
208
+ self.num_attention_heads = num_attention_heads
209
+ self.num_key_value_heads = num_key_value_heads
210
+ self.head_dim = head_dim
211
+ self.qkv_bias = qkv_bias
212
+ self.attention_bias = attention_bias
213
+ self.gating = gating
214
+ self.hidden_act = hidden_act
215
+ self.max_position_embeddings = max_position_embeddings
216
+ self.initializer_range = initializer_range
217
+ self.rms_norm_eps = rms_norm_eps
218
+ self.use_cache = use_cache
219
+ self.rope_parameters = rope_parameters
220
+ self.partial_rotary_factor = partial_rotary_factor
221
+ self.attention_dropout = attention_dropout
222
+ # Sliding window attention arguments
223
+ self.sliding_window = sliding_window
224
+ self.layer_types = layer_types
225
+ self.num_attention_heads_per_layer = num_attention_heads_per_layer
226
+ self.swa_attention_sink_enabled = swa_attention_sink_enabled
227
+ self.swa_rope_parameters = swa_rope_parameters
228
+ # MoE arguments
229
+ self.num_experts = num_experts
230
+ self.num_experts_per_tok = num_experts_per_tok
231
+ self.moe_intermediate_size = moe_intermediate_size
232
+ self.shared_expert_intermediate_size = shared_expert_intermediate_size
233
+ self.norm_topk_prob = norm_topk_prob
234
+ self.decoder_sparse_step = decoder_sparse_step
235
+ self.mlp_only_layers = mlp_only_layers
236
+ self.router_aux_loss_coef = router_aux_loss_coef
237
+ self.moe_routed_scaling_factor = moe_routed_scaling_factor
238
+ self.moe_apply_router_weight_on_input = moe_apply_router_weight_on_input
239
+ self.moe_router_logit_softcapping = moe_router_logit_softcapping
240
+ self.output_router_logits = output_router_logits
241
+
242
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
243
+
244
+
245
+ __all__ = ["LagunaConfig"]
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 2,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 2,
6
+ 24
7
+ ],
8
+ "max_new_tokens": 4096,
9
+ "pad_token_id": 9,
10
+ "temperature": 1.0,
11
+ "top_p": 1.0,
12
+ "min_p": 0.0
13
+ }
model-00001-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:230bfc34d0a67f0b92d8beb3dcc94f628ebe82d82c1327f203dab9e0e99b121c
3
+ size 5119451664
model-00002-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bbeb3ecbfa97602d8bd9fab4f078ae33df3867d94df529cf12ab875058c5e55
3
+ size 5085993864
model-00003-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b34bd3857c3b6872790482145fe3b2bb81476e81a05cf5b4ebdc6b2d081950b
3
+ size 5117432672
model-00004-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:230f8d942b97162d0633373db2e5d33b73172fa1dda3ce65ec477ba95aa5fd2a
3
+ size 5119410552
model-00005-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a052df48579439f9c2fdd0101d62d6fc6514ac21f7e70d6dfd86b4e06f4c33d
3
+ size 5117126864
model-00006-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:763bb266a50c6303ded30cd823b3adf43f1bcd5ee1909add6df9a0bbe5f75c2a
3
+ size 5117126776
model-00007-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7104c3c1f8046c56d19737b709479a690f22f62d5edc64b90dc13a769c8b5de1
3
+ size 5117126776
model-00008-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a800e72938eb992cd81086b76dcf112bea1243cb4095e4bf8785242d1ace1a81
3
+ size 5117126880
model-00009-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c69db1555b4499353d52b45d324ad419a5fd400fe2c81bb5398286f26f4bdb0
3
+ size 5117126936
model-00010-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:848011466cc779a33ea374925d7d1866842eaa2743cc3e3eaa34795b63c7a3e7
3
+ size 5117126840
model-00011-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60252cde42da7708216f8f8b9e85031c6d5a2e54f3f76baf06833a5a62346b9c
3
+ size 5117126776
model-00012-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bd9e973e6825554a5364330b62f0cc99aecbeab4da8cbaba33174eb09d258a3
3
+ size 5117126776
model-00013-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef3b1a12eb31a3ad70b3bd369bce9e27a483c3e7796d7a3379cf6435f4be5ab6
3
+ size 5117127072
model-00014-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94900c9bacf4bf50cd598d1e3145fd28d0ed610cf7e986895bd786c68a9debec
3
+ size 5117127560
model-00015-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52f0f996d77d7e5fe1a84da4c21df6cb682d7a77f36d22be08dbb0eb8b412fb3
3
+ size 5117127416
model-00016-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:399fd29b77e8b22c2c9a74ab3b6f2475a1535a38af39fb16cc83bde2610a35c1
3
+ size 5117127384
model-00017-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbf889d680b1598107470ba966be9e47593b5cf97c93b7eafe1848bae18f4f81
3
+ size 5117127384
model-00018-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2aab9519c42b1fc59bc072376d26c5a6f1df59ed957c0b9729fdf7b9a56caf20
3
+ size 5117127536
model-00019-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ae508d7070a9506a6765361b69b51198a1a73194dd23de2558ba8167a3aaa52
3
+ size 5117127568
model-00020-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42a9eec6339221f19de1e17287edc9965d9a94b73ccd0ba380743871cc570ee7
3
+ size 5117127384
model-00021-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8bdf3c1be54a60fefb13f9b0d1dec708ca71a13633a8c55bce21886b52a0319
3
+ size 5117127384
model-00022-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b4b6db66886d150c907216b2fdf152a4bd220f24b485e4e0e1599281dde24cb
3
+ size 5117127400
model-00023-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd05b9a592c81a32761a93a075d69c3a49aff1a538c27325044a634d89da906a
3
+ size 5117127544
model-00024-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7df12b9b155563d10e32ab9989171380a16e4859be334c55da4a705053c80a3
3
+ size 5117127544
model-00025-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c0134632cc7f7db2e77f6ec7ae4b953c2ec1af5de4b1040ff2eb31bcb0ab8dd
3
+ size 5117127384
model-00026-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccaef3e08e56d331aa627231e2e0e7bc90b861d5388a6529b6a4604e7afe58a7
3
+ size 5117127384
model-00027-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90d16c2444c867f041049be6e5946d5e786588bcba1bf50394c3d7fa4461163e
3
+ size 5117127424
model-00028-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72083a61686e7f2d52d3579bd60a43788fdfbddf1e6d41e403fee850200d19cb
3
+ size 5117127544
model-00029-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfe01af6c42e2366d0eb8629c40ec2674df839a16f5b81be37ec2cbfc2906a12
3
+ size 5117127520
model-00030-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3aea6291d6c5ccfd143d38f4b8dc2b3f1f3cc76e735ed8a352e1d32f956ddcc2
3
+ size 5117127384
model-00031-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98243efb22ac856c50215c4cea159da7210d24f54be0a31d182f370e542dc15a
3
+ size 5117127384
model-00032-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0e861f320f8aef0c692c28c602fe92e0fd9d57b965b9623cacf3e06cc651f54
3
+ size 5117127440
model-00033-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4e612640547e1691451884b6bbbe64a30c0835d7c9846f14b4042b7280c32ae
3
+ size 5117127544
model-00034-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae308c049b026e630cff7ce1fedd73e522c79c234814b0d4ef0b3cf7d00bbdc4
3
+ size 5117127496
model-00035-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba5bff98b7ea4fe119d8ebb252a18459098bd13853febca12d6efbccbd97213e
3
+ size 5117127384
model-00036-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5142bc53a51a456bf12e38cc4e994b50a03ff28335068f5adfb9430d24e500e
3
+ size 5117127384
model-00037-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9148cd551bb4611c168ed1cb46877e48c8c7f4157f71889e2872b29c1283d5b5
3
+ size 5117127464
model-00038-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecb9aa08e087fef94805bcc2f91f8277fe5b19f3a7b2b01a680d90c7dd0b691e
3
+ size 5117127544
model-00039-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efe972d5de11a0b0f218a52400fc8a325e835b6a4746c7c6a91d4ae4887c422e
3
+ size 5117127480
model-00040-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61b6b73c68a6d678cbf424e8a25d50d1f118fa1ead62351e48de16978e657c47
3
+ size 5117127384
model-00041-of-00089.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1708a52d726d5b3f21d9edb7b2e017adda578e915a2862b46e5aef26bc714b27
3
+ size 5117127384
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_laguna.py ADDED
@@ -0,0 +1,879 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Poolside and the HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from collections.abc import Callable
16
+
17
+ import torch
18
+ import torch.nn.functional as F
19
+ from torch import nn
20
+
21
+ from transformers.activations import ACT2FN
22
+ from transformers.cache_utils import Cache
23
+ from transformers.integrations import use_experts_implementation, use_kernelized_func
24
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
25
+ from transformers.modeling_layers import GradientCheckpointingLayer
26
+ from transformers.modeling_outputs import MoeModelOutputWithPast
27
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
28
+ from transformers.processing_utils import Unpack
29
+ from transformers.utils import auto_docstring, can_return_tuple, is_grouped_mm_available
30
+ from transformers.utils.generic import TransformersKwargs, merge_with_config_defaults
31
+ from transformers.utils.output_capturing import OutputRecorder, capture_outputs
32
+ from transformers.cache_utils import DynamicCache
33
+ from transformers.generation import GenerationMixin
34
+ from transformers.integrations import use_kernel_forward_from_hub
35
+ from transformers.masking_utils import create_causal_mask
36
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
37
+ from transformers.modeling_utils import PreTrainedModel
38
+ from transformers.utils.generic import maybe_autocast
39
+ from .configuration_laguna import LagunaConfig
40
+
41
+ from transformers import initialization as init
42
+ from transformers.masking_utils import create_sliding_window_causal_mask
43
+ from transformers.modeling_outputs import MoeCausalLMOutputWithPast
44
+ from transformers.utils.import_utils import is_causal_conv1d_available, is_flash_linear_attention_available
45
+
46
+
47
@use_kernel_forward_from_hub("RMSNorm")
class LagunaRMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learnable scale."""

    def __init__(self, hidden_size, eps: float = 1e-6) -> None:
        """
        LagunaRMSNorm is equivalent to T5LayerNorm

        Args:
            hidden_size: Size of the normalised (last) dimension; the scale
                vector is initialised to ones of this size.
            eps: Constant added to the mean square before the reciprocal
                square root.
        """
        super().__init__()
        self.variance_epsilon = eps
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Normalise ``hidden_states`` and return it scaled by ``self.weight``."""
        original_dtype = hidden_states.dtype
        # The statistics are always computed in float32, whatever the input dtype.
        upcast = hidden_states.to(torch.float32)
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normalised = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
        # Cast back to the caller's dtype before applying the learnable scale.
        return self.weight * normalised.to(original_dtype)

    def extra_repr(self):
        """Show the weight shape and epsilon in the module's ``repr``."""
        weight_shape = tuple(self.weight.shape)
        return f"{weight_shape}, eps={self.variance_epsilon}"
66
+
67
+
68
class LagunaRotaryEmbedding(nn.Module):
    """Produces the rotary position embedding ``cos`` / ``sin`` tables for given positions."""

    inv_freq: torch.Tensor  # fix linting for `register_buffer`

    def __init__(self, config: LagunaConfig, device=None):
        """Select the RoPE initialiser named by ``config.rope_parameters["rope_type"]``
        and register the resulting inverse frequencies as non-persistent buffers."""
        super().__init__()
        max_positions = config.max_position_embeddings
        self.max_seq_len_cached = max_positions
        self.original_max_seq_len = max_positions

        self.config = config

        self.rope_type = self.config.rope_parameters["rope_type"]
        # "default" is handled locally; any other type is looked up in the shared registry.
        if self.rope_type == "default":
            rope_init_fn: Callable = self.compute_default_rope_parameters
        else:
            rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
        inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # A separate copy of the initial frequencies is kept alongside the live buffer.
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    @staticmethod
    def compute_default_rope_parameters(
        config, device=None, seq_len=None
    ) -> tuple["torch.Tensor", float]:
        """
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        """
        rope_parameters = config.rope_parameters
        base = rope_parameters["rope_theta"]

        # Prefer an explicit ``head_dim``; otherwise derive it from the hidden size.
        head_dim = getattr(config, "head_dim", None)
        if not head_dim:
            head_dim = config.hidden_size // config.num_attention_heads

        # Only the leading ``partial_rotary_factor`` fraction of each head is rotated.
        rotary_fraction = rope_parameters.get("partial_rotary_factor", 1.0)
        dim = int(head_dim * rotary_fraction)

        exponents = torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim
        inv_freq = 1.0 / (base**exponents)
        return inv_freq, 1.0

    @torch.no_grad()
    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
    def forward(self, x, position_ids):
        """Return ``(cos, sin)`` for ``position_ids``, cast to the dtype of ``x``."""
        batch = position_ids.shape[0]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(batch, -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        raw_device_type = x.device.type
        if isinstance(raw_device_type, str) and raw_device_type != "mps":
            device_type = raw_device_type
        else:
            device_type = "cpu"

        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            # The frequency table is duplicated along the last dimension.
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        target_dtype = x.dtype
        return cos.to(dtype=target_dtype), sin.to(dtype=target_dtype)
128
+
129
+
130
class LagunaMLP(nn.Module):
    """Gated feed-forward block: ``down_proj(act_fn(gate_proj(x)) * up_proj(x))``."""

    def __init__(self, config, intermediate_size=None):
        """Build the three bias-free projections.

        Args:
            config: Model configuration; ``hidden_size``, ``intermediate_size``
                and ``hidden_act`` are read from it.
            intermediate_size: Optional override for the inner width; when
                ``None`` the value from ``config`` is used.
        """
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        if intermediate_size is None:
            intermediate_size = config.intermediate_size
        self.intermediate_size = intermediate_size

        outer, inner = self.hidden_size, self.intermediate_size
        self.gate_proj = nn.Linear(outer, inner, bias=False)
        self.up_proj = nn.Linear(outer, inner, bias=False)
        self.down_proj = nn.Linear(inner, outer, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Apply the gated projection to ``x`` and project back to the hidden size."""
        gated = self.act_fn(self.gate_proj(x))
        return self.down_proj(gated * self.up_proj(x))
144
+
145
+
146
class LagunaTopKRouter(nn.Module):
    """Top-k expert router for Laguna that scores experts with a sigmoid, not a softmax.

    Two optional features are supported: tanh soft-capping of the router
    logits, and auxiliary-loss-free load balancing (arXiv:2408.15664). For the
    latter, the per-expert ``e_score_correction_bias`` is added only to the
    scores used for *selecting* experts; the routing weights that are returned
    are gathered from the unbiased scores.

    The bias is kept on the router so accelerate's per-module hooks can
    co-locate it with the gate — placing it on the experts module would cross a
    hook boundary and leave the bias on meta under ``device_map="auto"`` /
    CPU-offload.
    """

    def __init__(self, config):
        super().__init__()
        self.top_k = config.num_experts_per_tok
        self.num_experts = config.num_experts
        self.norm_topk_prob = config.norm_topk_prob
        self.hidden_dim = config.hidden_size
        self.weight = nn.Parameter(torch.zeros(self.num_experts, self.hidden_dim))
        # Starts at zero so that checkpoints which do not ship the bias route
        # exactly as if it were absent. ``_checkpoint_conversion_mapping`` remaps
        # the ``mlp.experts.e_score_correction_bias`` key from vLLM-trained
        # checkpoints onto this attribute.
        self.e_score_correction_bias = nn.Parameter(torch.zeros(config.num_experts), requires_grad=False)
        # A missing, ``None`` or zero setting all collapse to 0.0, i.e. soft-capping disabled.
        softcap = getattr(config, "moe_router_logit_softcapping", 0.0)
        self.router_logit_softcapping = float(softcap or 0.0)

    def forward(
        self,
        hidden_states: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Route every token to ``top_k`` experts.

        Returns:
            ``(router_logits, routing_weights, selected_experts)``: the float
            logits (after optional soft-capping), the weights of the chosen
            experts cast to the input dtype, and the chosen expert indices.
        """
        flat_states = hidden_states.reshape(-1, self.hidden_dim)
        router_logits = F.linear(flat_states, self.weight).float()

        cap = self.router_logit_softcapping
        if cap > 0.0:
            router_logits = torch.tanh(router_logits / cap) * cap

        routing_scores = torch.sigmoid(router_logits)
        # The correction bias influences which experts win, not how much they contribute.
        selection_bias = self.e_score_correction_bias.to(routing_scores.dtype)
        _, selected_experts = torch.topk(routing_scores + selection_bias, self.top_k, dim=-1)

        routing_weights = routing_scores.gather(-1, selected_experts)
        if self.norm_topk_prob:
            routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)

        return router_logits, routing_weights.to(flat_states.dtype), selected_experts
191
+
192
+
193
@use_experts_implementation
class LagunaExperts(nn.Module):
    """Holds all expert weights as fused 3D tensors, one slice per expert."""

    def __init__(self, config):
        super().__init__()
        self.num_experts = config.num_experts
        self.hidden_dim = config.hidden_size
        self.intermediate_dim = config.moe_intermediate_size
        # Gate and up projections share one tensor; ``forward`` splits the result in two.
        gate_up_shape = (self.num_experts, 2 * self.intermediate_dim, self.hidden_dim)
        down_shape = (self.num_experts, self.hidden_dim, self.intermediate_dim)
        self.gate_up_proj = nn.Parameter(torch.empty(*gate_up_shape))
        self.down_proj = nn.Parameter(torch.empty(*down_shape))
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(
        self,
        hidden_states: torch.Tensor,
        top_k_index: torch.Tensor,
        top_k_weights: torch.Tensor,
    ) -> torch.Tensor:
        """Run each selected expert on its tokens and sum the weighted outputs.

        Args:
            hidden_states: Token representations, indexed by token along dim 0.
            top_k_index: Expert indices chosen per token.
            top_k_weights: Routing weights matching ``top_k_index``.

        Returns:
            A tensor shaped like ``hidden_states`` holding the accumulated
            expert outputs.
        """
        final_hidden_states = torch.zeros_like(hidden_states)

        with torch.no_grad():
            # Assuming ``top_k_index`` is (tokens, top_k), the permuted mask is
            # (experts, top_k, tokens) — TODO confirm against the caller.
            expert_mask = F.one_hot(top_k_index, num_classes=self.num_experts).permute(2, 1, 0)
            # Only experts that received at least one token are visited below.
            expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()

        for hit in expert_hit:
            expert_idx = hit[0]
            if expert_idx == self.num_experts:
                continue
            top_k_pos, token_idx = torch.where(expert_mask[expert_idx])
            expert_input = hidden_states[token_idx]

            gate_up = F.linear(expert_input, self.gate_up_proj[expert_idx])
            gate, up = gate_up.chunk(2, dim=-1)
            expert_output = F.linear(self.act_fn(gate) * up, self.down_proj[expert_idx])

            # Scale every token's output by the routing weight of this expert.
            expert_output = expert_output * top_k_weights[token_idx, top_k_pos, None]
            final_hidden_states.index_add_(0, token_idx, expert_output.to(final_hidden_states.dtype))

        return final_hidden_states
231
+
232
+
233
class LagunaSparseMoeBlock(nn.Module):
    """Sparse MoE feed-forward block: sigmoid router + fused routed experts + one shared expert."""

    def __init__(self, config):
        super().__init__()
        self.num_experts = config.num_experts
        scaling = getattr(config, "moe_routed_scaling_factor", 1.0)
        self.routed_scaling_factor = float(scaling)
        # With ``moe_apply_router_weight_on_input=True`` the routing weight would
        # have to scale each expert's input instead of its output. Doing that
        # cleanly with the fused experts kernels (``grouped_mm`` / ``batched_mm``)
        # is future work, so refuse such configs outright rather than let a
        # checkpoint silently diverge from its numerical form.
        if getattr(config, "moe_apply_router_weight_on_input", False):
            raise NotImplementedError(
                "moe_apply_router_weight_on_input=True is not yet supported in the "
                "transformers implementation of Laguna."
            )
        self.gate = LagunaTopKRouter(config)
        self.experts = LagunaExperts(config)
        self.shared_expert = LagunaMLP(config, intermediate_size=config.shared_expert_intermediate_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return the sum of the shared-expert and (scaled) routed-expert outputs,
        reshaped back to ``(batch, sequence, hidden)``."""
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        flat_states = hidden_states.view(-1, hidden_dim)

        shared_output = self.shared_expert(flat_states)
        _, routing_weights, selected_experts = self.gate(flat_states)
        routed_output = self.experts(flat_states, selected_experts, routing_weights)
        # Skip the multiply entirely when the factor is the identity.
        if self.routed_scaling_factor != 1.0:
            routed_output = routed_output * self.routed_scaling_factor

        combined = routed_output + shared_output
        return combined.reshape(batch_size, sequence_length, hidden_dim)
267
+
268
+
269
def rotate_half(x):
    """Swap the two halves of the last dimension, negating the half that moves to the front."""
    half = x.shape[-1] // 2
    front = x[..., :half]
    back = x[..., half:]
    return torch.cat((-back, front), dim=-1)
274
+
275
+
276
# Adapted from transformers.models.glm.modular_glm.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    """Rotate the query and key tensors with Rotary Position Embedding.

    Unlike the GLM original, cos and sin are not interleaved here. Only the
    leading ``cos.shape[-1]`` channels of ``q`` and ``k`` are rotated; any
    remaining channels are passed through unchanged.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Dimension along which cos and sin are unsqueezed so that they broadcast against q and k. For
            example, if cos and sin have the shape [batch_size, seq_len, head_dim] and q and k have the shape
            [batch_size, heads, seq_len, head_dim], use unsqueeze_dim=1. If q and k have the shape
            [batch_size, seq_len, heads, head_dim], use unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    # The width of the cos table decides how many channels take part in the rotation.
    rotary_dim = cos.shape[-1]

    def _rotate_leading_channels(states):
        # Split into the rotated part and the untouched remainder, then re-join.
        rotary_part = states[..., :rotary_dim]
        passthrough = states[..., rotary_dim:]
        rotated = (rotary_part * cos) + (rotate_half(rotary_part) * sin)
        return torch.cat([rotated, passthrough], dim=-1)

    return _rotate_leading_channels(q), _rotate_leading_channels(k)
313
+
314
+
315
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head ``n_rep`` times, matching torch.repeat_interleave(x, dim=1, repeats=n_rep).
    Input is (batch, num_key_value_heads, seqlen, head_dim); output is
    (batch, num_key_value_heads * n_rep, seqlen, head_dim). With ``n_rep == 1`` the input is returned as is.
    """
    batch, kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis right after the head axis, expand it, then fold it into the heads.
    expanded = hidden_states[:, :, None, :, :].expand(batch, kv_heads, n_rep, seq_len, head_dim)
    return expanded.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
325
+
326
+
327
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    """Plain-PyTorch scaled dot-product attention.

    Args:
        module: The calling attention module; ``num_key_value_groups`` and
            ``training`` are read from it.
        query: Query states.
        key: Key states; heads are repeated to match the query heads.
        value: Value states; heads are repeated to match the query heads.
        attention_mask: Optional additive mask applied to the scaled scores.
        scaling: Factor multiplied into the raw query-key scores.
        dropout: Dropout probability applied to the attention weights.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        ``(attn_output, attn_weights)`` where ``attn_output`` has dims 1 and 2
        transposed and is contiguous.
    """
    groups = module.num_key_value_groups
    key_states = repeat_kv(key, groups)
    value_states = repeat_kv(value, groups)

    scores = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        scores = scores + attention_mask

    # Softmax runs in float32 and is cast back to the query dtype afterwards.
    attn_weights = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value_states).transpose(1, 2).contiguous()
    return attn_output, attn_weights
350
+
351
+
352
# Laguna attention is identical to Qwen2MoE attention except:
# - No QKV bias
# - Explicit head_dim from config
# - Output gating: attn_output = attn_output * softplus(g_proj(hidden_states)) (optional)
# - Per-layer sliding window attention with optional attention sinks
@use_kernelized_func(apply_rotary_pos_emb)
class LagunaAttention(nn.Module):
    """Multi-head attention with QK-norm, RoPE, optional softplus output gating
    and an optional per-layer sliding window."""

    def __init__(self, config: LagunaConfig, layer_idx: int, num_heads: int | None = None):
        """Build the projections for layer ``layer_idx``.

        Args:
            config: Model configuration.
            layer_idx: Index of the layer this module belongs to; used for the
                layer-type lookup and for cache updates.
            num_heads: Optional per-layer query head count supplied by the
                decoder layer; defaults to ``config.num_attention_heads``.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = config.head_dim
        if num_heads is None:
            num_heads = config.num_attention_heads
        self.num_heads = num_heads
        self.num_key_value_groups = self.num_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        # Per-layer sliding window (follows Gemma2/Cohere2 convention): a layer is
        # sliding only when ``layer_types`` exists and marks it as such.
        layer_types = getattr(config, "layer_types", None)
        self.is_sliding = layer_types is not None and layer_types[layer_idx] == "sliding_attention"
        self.sliding_window = config.sliding_window if self.is_sliding else None

        # Laguna: no QKV bias, explicit head_dim
        hidden_size = config.hidden_size
        query_width = self.num_heads * config.head_dim
        key_value_width = config.num_key_value_heads * config.head_dim
        self.q_proj = nn.Linear(hidden_size, query_width, bias=False)
        self.k_proj = nn.Linear(hidden_size, key_value_width, bias=False)
        self.v_proj = nn.Linear(hidden_size, key_value_width, bias=False)
        self.o_proj = nn.Linear(query_width, hidden_size, bias=False)

        # Laguna-specific: optional gating projection. ``gating`` may be:
        #   - True / "per-element": one gate per (head, head_dim) channel
        #   - "per-head": one gate per head, broadcast across head_dim
        #   - False: no gating
        gating = getattr(config, "gating", True)
        self.gating = bool(gating)
        self.gate_per_head = gating == "per-head"
        if self.gating:
            gate_width = self.num_heads if self.gate_per_head else query_width
            self.g_proj = nn.Linear(hidden_size, gate_width, bias=False)

        # Attention sinks (learnable per-head bias for SWA layers).
        # NOTE(review): ``sink`` is created here but ``forward`` below never reads
        # it directly — confirm where it is consumed.
        if self.is_sliding and getattr(config, "swa_attention_sink_enabled", False):
            self.sink = nn.Parameter(torch.zeros(self.num_heads))

        # QK normalization (RMSNorm applied per-head after reshape, before RoPE)
        self.q_norm = LagunaRMSNorm(config.head_dim, eps=config.rms_norm_eps)
        self.k_norm = LagunaRMSNorm(config.head_dim, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: torch.Tensor | None,
        past_key_values: Cache | None = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Compute attention for ``hidden_states``.

        Args:
            hidden_states: Input activations; the last dimension is projected.
            position_embeddings: ``(cos, sin)`` tables for RoPE.
            attention_mask: Mask forwarded unchanged to the attention backend.
            past_key_values: Optional cache, updated with this layer's keys and values.
            **kwargs: Extra arguments forwarded to the attention backend.

        Returns:
            ``(attn_output, attn_weights)`` as produced by the backend, with
            ``attn_output`` gated (if enabled) and passed through ``o_proj``.
        """
        input_shape = hidden_states.shape[:-1]
        per_head_shape = (*input_shape, -1, self.head_dim)

        # Project, split into heads, and move the head axis ahead of the sequence axis.
        query_states = self.q_proj(hidden_states).view(per_head_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(per_head_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(per_head_shape).transpose(1, 2)

        # QK normalization (applied per-head before RoPE)
        query_states = self.q_norm(query_states)
        key_states = self.k_norm(key_states)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx)

        # ``attention_mask`` is already the right mask for this layer type:
        # ``LagunaModel.forward`` builds separate full-attention and sliding-attention
        # masks (using ``create_causal_mask`` / ``create_sliding_window_causal_mask``)
        # and the decoder layer passes the matching one in.
        implementation = self.config._attn_implementation
        if implementation == "eager":
            attention_interface: Callable = eager_attention_forward
        else:
            attention_interface = ALL_ATTENTION_FUNCTIONS[implementation]

        # Dropout is only active while training.
        dropout = self.attention_dropout if self.training else 0.0
        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()

        # Laguna-specific: apply gating BEFORE o_proj (optional)
        if self.gating:
            # Softplus is evaluated in float32, then cast to the attention output dtype.
            gate = F.softplus(self.g_proj(hidden_states).float()).to(attn_output.dtype)
            if self.gate_per_head:
                # gate: [..., num_heads]; broadcast across head_dim
                flat_shape = attn_output.shape
                per_head_output = attn_output.view(*flat_shape[:-1], self.num_heads, self.head_dim)
                attn_output = (per_head_output * gate.unsqueeze(-1)).view(flat_shape)
            else:
                attn_output = attn_output * gate

        return self.o_proj(attn_output), attn_weights
473
+
474
class LagunaDecoderLayer(GradientCheckpointingLayer):
    """One Laguna transformer layer: pre-norm gated attention followed by a pre-norm
    MoE or dense MLP, each wrapped in a residual connection."""

    def __init__(self, config: LagunaConfig, layer_idx: int):
        super().__init__()
        # Optional per-layer query head counts override the global setting.
        per_layer_heads = getattr(config, "num_attention_heads_per_layer", None)
        if per_layer_heads is None:
            layer_num_heads = config.num_attention_heads
        else:
            layer_num_heads = per_layer_heads[layer_idx]

        # Layer type drives mask and position-embedding dispatch in ``LagunaModel.forward``.
        layer_types = getattr(config, "layer_types", None)
        self.attention_type = "full_attention" if layer_types is None else layer_types[layer_idx]
        self.self_attn = LagunaAttention(config, layer_idx, num_heads=layer_num_heads)

        # A layer is sparse unless it is listed in ``mlp_only_layers``, there are no
        # experts, or it falls off the ``decoder_sparse_step`` cadence.
        use_sparse_mlp = (
            layer_idx not in config.mlp_only_layers
            and config.num_experts > 0
            and (layer_idx + 1) % config.decoder_sparse_step == 0
        )
        if use_sparse_mlp:
            self.mlp = LagunaSparseMoeBlock(config)
        else:
            self.mlp = LagunaMLP(config, intermediate_size=config.intermediate_size)

        self.input_layernorm = LagunaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LagunaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.hidden_size = config.hidden_size

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool | None = False,
        position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        """Apply the attention and MLP sub-blocks and return the new hidden states.

        All arguments other than ``hidden_states`` are forwarded to the attention
        module; the attention weights it returns are discarded here.
        """
        # Self Attention
        attn_residual = hidden_states
        attn_output, _ = self.self_attn(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = attn_residual + attn_output

        # Fully Connected
        mlp_residual = hidden_states
        mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
        return mlp_residual + mlp_output
528
+
529
+
530
@auto_docstring
class LagunaPreTrainedModel(PreTrainedModel):
    # Shared base for Laguna models: capability flags, output-recording hooks,
    # checkpoint key remapping and weight initialisation.
    config: LagunaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["LagunaDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    # https://huggingface.co/docs/transformers/experts_interface#torchcompile
    _can_compile_fullgraph = is_grouped_mm_available()
    _supports_attention_backend = True
    _can_record_outputs = {
        "router_logits": OutputRecorder(LagunaTopKRouter, index=0),
        "hidden_states": LagunaDecoderLayer,
        "attentions": LagunaAttention,
    }
    # vLLM-trained Laguna checkpoints keep the aux-loss-free routing bias on the
    # experts module (``mlp.experts.e_score_correction_bias``). Here the bias
    # lives on the router so it stays co-located with its consumer across
    # accelerate's per-module hooks; the legacy key is remapped on load.
    _checkpoint_conversion_mapping = {
        r"^(.*)\.mlp\.experts\.e_score_correction_bias$": r"\1.mlp.gate.e_score_correction_bias",
    }

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialise ``module``: run the parent's rules first, then the rules for
        Laguna's bare ``nn.Parameter`` attributes."""
        super()._init_weights(module)
        std = self.config.initializer_range
        # Bare ``nn.Parameter``s are not covered by the parent's generic
        # Linear/Embedding/norm handling, so they get explicit rules here. That
        # keeps the __init__ and from_pretrained(state_dict={}) paths producing
        # identical weights under a fixed seed.
        if isinstance(module, LagunaExperts):
            for fused_weight in (module.gate_up_proj, module.down_proj):
                init.normal_(fused_weight, mean=0.0, std=std)
        elif isinstance(module, LagunaTopKRouter):
            init.normal_(module.weight, mean=0.0, std=std)
            torch.nn.init.zeros_(module.e_score_correction_bias)
        # ``sink`` only exists on attention layers that enabled attention sinks.
        if isinstance(module, LagunaAttention) and hasattr(module, "sink"):
            torch.nn.init.zeros_(module.sink)
574
+
575
+
576
class LagunaModel(LagunaPreTrainedModel):
    # Decoder stack: token embedding, ``config.num_hidden_layers`` decoder layers,
    # a final RMS norm, and one rotary embedding for full-attention layers plus an
    # optional second one for sliding-window layers.

    def __init__(self, config: LagunaConfig):
        import copy

        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [LagunaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = LagunaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # ``LagunaRotaryEmbedding`` follows ``Qwen2MoeRotaryEmbedding``'s flat-shape
        # contract and reads ``config.rope_parameters["rope_type"]`` at the outer
        # level. Laguna may store rope parameters nested by layer type
        # (``{"full_attention": {...}, ...}``); in that case hand the embedding a
        # config clone whose ``rope_parameters`` is the flattened full-attention dict.
        rope_params = getattr(config, "rope_parameters", None)
        full_attention_rope = rope_params.get("full_attention") if isinstance(rope_params, dict) else None
        global_rope_config = config
        if isinstance(full_attention_rope, dict):
            global_rope_config = copy.deepcopy(config)
            global_rope_config.rope_parameters = dict(full_attention_rope)
        self.rotary_emb = LagunaRotaryEmbedding(config=global_rope_config)

        # Separate RoPE for sliding-window attention layers (when configured).
        # ``PreTrainedConfig.standardize_rope_params`` overwrites
        # ``rope_parameters["partial_rotary_factor"]`` with the top-level
        # ``partial_rotary_factor``, so the cloned config's top-level field is set
        # to the SWA value; otherwise the global factor would replace the SWA one.
        swa_rope_params = getattr(config, "swa_rope_parameters", None)
        if swa_rope_params is None:
            self.swa_rotary_emb = None
        else:
            swa_config = copy.deepcopy(config)
            swa_config.rope_parameters = dict(swa_rope_params)
            swa_config.partial_rotary_factor = swa_config.rope_parameters.get("partial_rotary_factor")
            self.swa_rotary_emb = LagunaRotaryEmbedding(config=swa_config)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MoeModelOutputWithPast:
        from ...cache_utils import DynamicCache
        from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask

        # Exactly one of ``input_ids`` / ``inputs_embeds`` must be given: reject
        # both-missing and both-present alike.
        if (input_ids is None) == (inputs_embeds is None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if past_key_values is None and use_cache:
            past_key_values = DynamicCache(config=self.config)

        if position_ids is None:
            # Positions continue from whatever the cache already holds.
            past_seen_tokens = 0 if past_key_values is None else past_key_values.get_seq_length()
            new_positions = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device)
            position_ids = (new_positions + past_seen_tokens).unsqueeze(0)

        layer_types = getattr(self.config, "layer_types", None)
        has_swa = layer_types is not None and "sliding_attention" in layer_types

        # One mask per layer type so each layer is dispatched with the right
        # attention pattern. A dict passed as ``attention_mask`` is used as the
        # ready-made mapping.
        if isinstance(attention_mask, dict):
            causal_mask_mapping = attention_mask
        else:
            mask_kwargs = {
                "config": self.config,
                "inputs_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
            causal_mask_mapping = {"full_attention": create_causal_mask(**mask_kwargs)}
            if has_swa:
                causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)

        hidden_states = inputs_embeds
        global_pe = self.rotary_emb(hidden_states, position_ids)

        # Per-layer-type position embeddings: sliding layers use their own rope
        # when ``swa_rotary_emb`` exists, otherwise they share the global one.
        position_embeddings_mapping = None
        if has_swa:
            if self.swa_rotary_emb is not None:
                swa_pe = self.swa_rotary_emb(hidden_states, position_ids)
            else:
                swa_pe = global_pe
            position_embeddings_mapping = {"full_attention": global_pe, "sliding_attention": swa_pe}

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            attention_type = decoder_layer.attention_type
            if position_embeddings_mapping is None:
                layer_pos_emb = global_pe
            else:
                layer_pos_emb = position_embeddings_mapping[attention_type]

            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask_mapping[attention_type],
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                position_embeddings=layer_pos_emb,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)

        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )
708
+
709
+
710
+ def load_balancing_loss_func(
711
+ gate_logits: torch.Tensor | tuple[torch.Tensor] | None,
712
+ num_experts: int | None = None,
713
+ top_k=2,
714
+ attention_mask: torch.Tensor | None = None,
715
+ ) -> torch.Tensor | int:
716
+ r"""
717
+ Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
718
+
719
+ See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
720
+ function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
721
+ experts is too unbalanced.
722
+
723
+ Args:
724
+ gate_logits:
725
+ Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
726
+ shape [batch_size X sequence_length, num_experts].
727
+ num_experts:
728
+ Number of experts
729
+ top_k:
730
+ The number of experts to route per-token, can be also interpreted as the `top-k` routing
731
+ parameter.
732
+ attention_mask (`torch.Tensor`, *optional*):
733
+ The attention_mask used in forward function
734
+ shape [batch_size X sequence_length] if not None.
735
+
736
+ Returns:
737
+ The auxiliary loss.
738
+ """
739
+ if gate_logits is None or not isinstance(gate_logits, tuple):
740
+ return 0
741
+
742
+ if isinstance(gate_logits, tuple):
743
+ compute_device = gate_logits[0].device
744
+ concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
745
+
746
+ routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
747
+
748
+ _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
749
+
750
+ expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
751
+
752
+ if attention_mask is None:
753
+ # Compute the percentage of tokens routed to each experts
754
+ tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
755
+
756
+ # Compute the average probability of routing to these experts
757
+ router_prob_per_expert = torch.mean(routing_weights, dim=0)
758
+ else:
759
+ batch_size, sequence_length = attention_mask.shape
760
+ num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
761
+
762
+ # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
763
+ expert_attention_mask = (
764
+ attention_mask[None, :, :, None, None]
765
+ .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
766
+ .reshape(-1, top_k, num_experts)
767
+ .to(compute_device)
768
+ )
769
+
770
+ # Compute the percentage of tokens routed to each experts
771
+ tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
772
+ expert_attention_mask, dim=0
773
+ )
774
+
775
+ # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
776
+ router_per_expert_attention_mask = (
777
+ attention_mask[None, :, :, None]
778
+ .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
779
+ .reshape(-1, num_experts)
780
+ .to(compute_device)
781
+ )
782
+
783
+ # Compute the average probability of routing to these experts
784
+ router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
785
+ router_per_expert_attention_mask, dim=0
786
+ )
787
+
788
+ overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
789
+ return overall_loss * num_experts
790
+
791
+
792
@auto_docstring
class LagunaForCausalLM(LagunaPreTrainedModel, GenerationMixin):
    # Causal-LM wrapper: a ``LagunaModel`` backbone plus a bias-free linear
    # ``lm_head``, with an optional router load-balancing auxiliary loss.
    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
    _tp_plan = {"lm_head": "colwise_gather_output"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.model = LagunaModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        # Cached routing hyper-parameters used when computing the auxiliary loss.
        self.router_aux_loss_coef = config.router_aux_loss_coef
        self.num_experts = config.num_experts
        self.num_experts_per_tok = config.num_experts_per_tok

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        past_key_values: Cache | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        use_cache: bool | None = None,
        output_router_logits: bool | None = None,
        logits_to_keep: int | torch.Tensor = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MoeCausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """

        # An explicit argument wins; otherwise fall back to the config default.
        output_router_logits = (
            output_router_logits if output_router_logits is not None else self.config.output_router_logits
        )

        # Run the backbone; ``outputs`` carries the last hidden state, the cache and
        # the optional hidden states / attentions / router logits read below.
        outputs: MoeModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_router_logits=output_router_logits,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        aux_loss = None
        if output_router_logits:
            aux_loss = load_balancing_loss_func(
                outputs.router_logits,
                self.num_experts,
                self.num_experts_per_tok,
                attention_mask,
            )
            if labels is not None:
                # ``load_balancing_loss_func`` returns the plain int ``0`` when no
                # router logits were collected; only tensors have ``.to``.
                aux_term = aux_loss.to(loss.device) if isinstance(aux_loss, torch.Tensor) else aux_loss
                loss += self.router_aux_loss_coef * aux_term  # make sure to reside in the same device

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            router_logits=outputs.router_logits,
        )
877
+
878
+
879
+ __all__ = ["LagunaForCausalLM", "LagunaModel", "LagunaPreTrainedModel"]
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "〈|EOS|〉",
3
+ "cls_token": "〈|CLS|〉",
4
+ "eos_token": "〈|EOS|〉",
5
+ "mask_token": "〈|MASK|〉",
6
+ "pad_token": "〈|PAD|〉",
7
+ "sep_token": "〈|SEP|〉",
8
+ "unk_token": "〈|UNK|〉"
9
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,576 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "〈|UNK|〉",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "〈|CODE_START|〉",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "〈|EOS|〉",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "〈|CODE_END|〉",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "〈|META_START|〉",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "〈|META_END|〉",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "6": {
52
+ "content": "〈|FIM_MIDDLE|〉",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "7": {
60
+ "content": "〈|FIM_SUFFIX|〉",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "8": {
68
+ "content": "〈|SEP|〉",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "9": {
76
+ "content": "〈|PAD|〉",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "10": {
84
+ "content": "〈|CLS|〉",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "11": {
92
+ "content": "〈|FIM_START|〉",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "12": {
100
+ "content": "〈|MASK|〉",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "13": {
108
+ "content": "|◊|",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "14": {
116
+ "content": "〈|",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "15": {
124
+ "content": "|〉",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "16": {
132
+ "content": "〈|/",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "17": {
140
+ "content": "/|〉",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "20": {
148
+ "content": "〈|SPECIAL_1|〉",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "21": {
156
+ "content": "〈|SPECIAL_2|〉",
157
+ "lstrip": false,
158
+ "normalized": false,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": true
162
+ },
163
+ "22": {
164
+ "content": "〈|SPECIAL_3|〉",
165
+ "lstrip": false,
166
+ "normalized": false,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": true
170
+ },
171
+ "27": {
172
+ "content": "〈|SPECIAL_8|〉",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": true
178
+ },
179
+ "28": {
180
+ "content": "〈|SPECIAL_9|〉",
181
+ "lstrip": false,
182
+ "normalized": false,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": true
186
+ },
187
+ "29": {
188
+ "content": "〈|SPECIAL_10|〉",
189
+ "lstrip": false,
190
+ "normalized": false,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": true
194
+ },
195
+ "30": {
196
+ "content": "〈|SPECIAL_11|〉",
197
+ "lstrip": false,
198
+ "normalized": false,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": true
202
+ },
203
+ "31": {
204
+ "content": "〈|SPECIAL_12|〉",
205
+ "lstrip": false,
206
+ "normalized": false,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": true
210
+ },
211
+ "32": {
212
+ "content": "〈|SPECIAL_13|〉",
213
+ "lstrip": false,
214
+ "normalized": false,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": true
218
+ },
219
+ "33": {
220
+ "content": "〈|SPECIAL_14|〉",
221
+ "lstrip": false,
222
+ "normalized": false,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": true
226
+ },
227
+ "34": {
228
+ "content": "〈|SPECIAL_15|〉",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": true
234
+ },
235
+ "35": {
236
+ "content": "〈|SPECIAL_16|〉",
237
+ "lstrip": false,
238
+ "normalized": false,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": true
242
+ },
243
+ "36": {
244
+ "content": "〈|SPECIAL_17|〉",
245
+ "lstrip": false,
246
+ "normalized": false,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": true
250
+ },
251
+ "37": {
252
+ "content": "〈|SPECIAL_18|〉",
253
+ "lstrip": false,
254
+ "normalized": false,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": true
258
+ },
259
+ "38": {
260
+ "content": "〈|SPECIAL_19|〉",
261
+ "lstrip": false,
262
+ "normalized": false,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": true
266
+ },
267
+ "39": {
268
+ "content": "〈|SPECIAL_20|〉",
269
+ "lstrip": false,
270
+ "normalized": false,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": true
274
+ },
275
+ "40": {
276
+ "content": "〈|SPECIAL_21|〉",
277
+ "lstrip": false,
278
+ "normalized": false,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": true
282
+ },
283
+ "41": {
284
+ "content": "〈|SPECIAL_22|〉",
285
+ "lstrip": false,
286
+ "normalized": false,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": true
290
+ },
291
+ "42": {
292
+ "content": "〈|SPECIAL_23|〉",
293
+ "lstrip": false,
294
+ "normalized": false,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": true
298
+ },
299
+ "43": {
300
+ "content": "〈|SPECIAL_24|〉",
301
+ "lstrip": false,
302
+ "normalized": false,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": true
306
+ },
307
+ "44": {
308
+ "content": "〈|SPECIAL_25|〉",
309
+ "lstrip": false,
310
+ "normalized": false,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": true
314
+ },
315
+ "45": {
316
+ "content": "〈|SPECIAL_26|〉",
317
+ "lstrip": false,
318
+ "normalized": false,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": true
322
+ },
323
+ "46": {
324
+ "content": "〈|SPECIAL_27|〉",
325
+ "lstrip": false,
326
+ "normalized": false,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": true
330
+ },
331
+ "47": {
332
+ "content": "〈|SPECIAL_28|〉",
333
+ "lstrip": false,
334
+ "normalized": false,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": true
338
+ },
339
+ "48": {
340
+ "content": "〈|SPECIAL_29|〉",
341
+ "lstrip": false,
342
+ "normalized": false,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": true
346
+ },
347
+ "49": {
348
+ "content": "〈|SPECIAL_30|〉",
349
+ "lstrip": false,
350
+ "normalized": false,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": true
354
+ },
355
+ "50": {
356
+ "content": "〈|SPECIAL_31|〉",
357
+ "lstrip": false,
358
+ "normalized": false,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": true
362
+ },
363
+ "51": {
364
+ "content": "〈|SPECIAL_32|〉",
365
+ "lstrip": false,
366
+ "normalized": false,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": true
370
+ },
371
+ "52": {
372
+ "content": "〈|SPECIAL_33|〉",
373
+ "lstrip": false,
374
+ "normalized": false,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": true
378
+ },
379
+ "53": {
380
+ "content": "〈|SPECIAL_34|〉",
381
+ "lstrip": false,
382
+ "normalized": false,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": true
386
+ },
387
+ "54": {
388
+ "content": "〈|SPECIAL_35|〉",
389
+ "lstrip": false,
390
+ "normalized": false,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": true
394
+ },
395
+ "55": {
396
+ "content": "〈|SPECIAL_36|〉",
397
+ "lstrip": false,
398
+ "normalized": false,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": true
402
+ },
403
+ "56": {
404
+ "content": "〈|SPECIAL_37|〉",
405
+ "lstrip": false,
406
+ "normalized": false,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": true
410
+ },
411
+ "57": {
412
+ "content": "〈|SPECIAL_38|〉",
413
+ "lstrip": false,
414
+ "normalized": false,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": true
418
+ },
419
+ "58": {
420
+ "content": "〈|SPECIAL_39|〉",
421
+ "lstrip": false,
422
+ "normalized": false,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": true
426
+ },
427
+ "59": {
428
+ "content": "〈|SPECIAL_40|〉",
429
+ "lstrip": false,
430
+ "normalized": false,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": true
434
+ },
435
+ "60": {
436
+ "content": "〈|SPECIAL_41|〉",
437
+ "lstrip": false,
438
+ "normalized": false,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": true
442
+ },
443
+ "61": {
444
+ "content": "〈|SPECIAL_42|〉",
445
+ "lstrip": false,
446
+ "normalized": false,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": true
450
+ },
451
+ "62": {
452
+ "content": "〈|SPECIAL_43|〉",
453
+ "lstrip": false,
454
+ "normalized": false,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": true
458
+ },
459
+ "63": {
460
+ "content": "〈|SPECIAL_44|〉",
461
+ "lstrip": false,
462
+ "normalized": false,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": true
466
+ },
467
+ "64": {
468
+ "content": "〈|SPECIAL_45|〉",
469
+ "lstrip": false,
470
+ "normalized": false,
471
+ "rstrip": false,
472
+ "single_word": false,
473
+ "special": true
474
+ },
475
+ "65": {
476
+ "content": "〈|SPECIAL_46|〉",
477
+ "lstrip": false,
478
+ "normalized": false,
479
+ "rstrip": false,
480
+ "single_word": false,
481
+ "special": true
482
+ },
483
+ "66": {
484
+ "content": "〈|SPECIAL_47|〉",
485
+ "lstrip": false,
486
+ "normalized": false,
487
+ "rstrip": false,
488
+ "single_word": false,
489
+ "special": true
490
+ },
491
+ "67": {
492
+ "content": "〈|SPECIAL_48|〉",
493
+ "lstrip": false,
494
+ "normalized": false,
495
+ "rstrip": false,
496
+ "single_word": false,
497
+ "special": true
498
+ },
499
+ "68": {
500
+ "content": "〈|SPECIAL_49|〉",
501
+ "lstrip": false,
502
+ "normalized": false,
503
+ "rstrip": false,
504
+ "single_word": false,
505
+ "special": true
506
+ },
507
+ "69": {
508
+ "content": "〈|SPECIAL_50|〉",
509
+ "lstrip": false,
510
+ "normalized": false,
511
+ "rstrip": false,
512
+ "single_word": false,
513
+ "special": true
514
+ },
515
+ "18": {
516
+ "content": "<think>",
517
+ "single_word": false,
518
+ "lstrip": false,
519
+ "rstrip": false,
520
+ "normalized": false,
521
+ "special": false
522
+ },
523
+ "19": {
524
+ "content": "</think>",
525
+ "single_word": false,
526
+ "lstrip": false,
527
+ "rstrip": false,
528
+ "normalized": false,
529
+ "special": false
530
+ },
531
+ "23": {
532
+ "content": "<assistant>",
533
+ "single_word": false,
534
+ "lstrip": false,
535
+ "rstrip": false,
536
+ "normalized": false,
537
+ "special": false
538
+ },
539
+ "24": {
540
+ "content": "</assistant>",
541
+ "single_word": false,
542
+ "lstrip": false,
543
+ "rstrip": false,
544
+ "normalized": false,
545
+ "special": false
546
+ },
547
+ "25": {
548
+ "content": "<tool_call>",
549
+ "single_word": false,
550
+ "lstrip": false,
551
+ "rstrip": false,
552
+ "normalized": false,
553
+ "special": false
554
+ },
555
+ "26": {
556
+ "content": "</tool_call>",
557
+ "single_word": false,
558
+ "lstrip": false,
559
+ "rstrip": false,
560
+ "normalized": false,
561
+ "special": false
562
+ }
563
+ },
564
+ "bos_token": "〈|EOS|〉",
565
+ "clean_up_tokenization_spaces": false,
566
+ "cls_token": "〈|CLS|〉",
567
+ "eos_token": "〈|EOS|〉",
568
+ "extra_special_tokens": {},
569
+ "mask_token": "〈|MASK|〉",
570
+ "model_max_length": 1000000000000000019884624838656,
571
+ "pad_token": "〈|PAD|〉",
572
+ "sep_token": "〈|SEP|〉",
573
+ "tokenizer_class": "PreTrainedTokenizerFast",
574
+ "unk_token": "〈|UNK|〉",
575
+ "chat_template": "{% include 'chat_template.jinja' %}"
576
+ }