ConicCat committed on
Commit
0776dca
·
verified ·
1 Parent(s): bbf0c64

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: other
4
+ base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1_5
5
+ tags:
6
+ - axolotl
7
+ - base_model:adapter:nvidia/Llama-3_3-Nemotron-Super-49B-v1_5
8
+ - lora
9
+ - transformers
10
+ datasets:
11
+ - ConicCat/GLiMA_Thinking
12
+ - ConicCat/Gutenberg-SFT
13
+ - ConicCat/Condor-SFT-Filtered
14
+ - ConicCat/Ao3_Soft_Refusal
15
+ - ConicCat/VSF
16
+ pipeline_tag: text-generation
17
+ model-index:
18
+ - name: Writer-Stage-1
19
+ results: []
20
+ ---
21
+
22
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
23
+ should probably proofread and complete it, then remove this comment. -->
24
+
25
+ [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
26
+ <details><summary>See axolotl config</summary>
27
+
28
+ axolotl version: `0.16.0.dev0`
29
+ ```yaml
30
+ base_model: nvidia/Llama-3_3-Nemotron-Super-49B-v1_5
31
+
32
+
33
+ load_in_8bit: true
34
+ load_in_4bit: false
35
+
36
+ sequence_len: 5120
37
+ max_sample_length: 5120
38
+
39
+ sample_packing: true
40
+ gradient_checkpointing: true
41
+
42
+ bf16: true
43
+ tf32: true
44
+
45
+ flash_attention: true
46
+ lora_mlp_kernel: false
47
+ lora_qkv_kernel: false
48
+ lora_o_kernel: false
49
+
50
+
51
+ datasets:
52
+ - path: ConicCat/GLiMA_Thinking
53
+ type: chat_template
54
+ roles_to_train: []
55
+ train_on_eos: turn
56
+ message_field_training: train
57
+
58
+ - path: ConicCat/Gutenberg-SFT
59
+ type: chat_template
60
+
61
+ - path: ConicCat/Condor-SFT-Filtered
62
+ split: train[:250]
63
+ type: chat_template
64
+
65
+ - path: ConicCat/Ao3_Soft_Refusal
66
+ type: chat_template
67
+
68
+ - path: ConicCat/VSF
69
+ type: chat_template
70
+
71
+ chat_template_jinja: "{% set bos = \"<|begin_of_text|>\" %}{%- set enable_thinking = false -%}{% set system_start_header = \"<|start_header_id|>\" %}{% set system_end_header = \"<|end_header_id|>\n\n\" %}{% set start_header = \"<|start_header_id|>\" %}{% set end_header = \"<|end_header_id|>\n\n\" %}{% set eot = \"<|eot_id|>\" %}{% set system_token = \"system\" %}{% set user_token = \"user\" %}{% set assistant_token = \"assistant\" %}{% set tool_token = \"tool\" %}{{- bos ~ system_start_header ~ system_token ~ system_end_header -}}{%- if messages[0].role == 'system' and messages[0].content != '' -%}{%- set system_content = messages[0].content -%}{%- if '/no_think' in system_content -%}{%- set system_content = system_content.replace('/no_think', '')|trim -%}{%- set enable_thinking = false -%}{%- elif '/think' in system_content -%}{%- set system_content = system_content.replace('/think', '')|trim -%}{%- set enable_thinking = true -%}{%- endif -%}{{- system_content + '\n\n' -}}{%- endif -%}{%- if tools -%}{{- 'You can use the following tools to assist the user if required:\n<AVAILABLE_TOOLS>[' -}}{%- for tool in tools -%}{{- (tool.function if tool.function is defined else tool) | tojson -}}{{- ', ' if not loop.last else '' -}}{%- endfor -%}{{- ']</AVAILABLE_TOOLS>\n\nIf you decide to call any tool(s), use the following format:\n<TOOLCALL>[{{\"name\": \"tool_name1\", \"arguments\": \"tool_args1\"}}, {{\"name\": \"tool_name2\", \"arguments\": \"tool_args2\"}}]</TOOLCALL>\n\nResponse from tool(s) will be returned in this format:\n<TOOL_RESPONSE>[{{\"response\": \"tool_response1\"}}, {{\"response\": \"tool_response2\"}}]</TOOL_RESPONSE>\n\nBased on the results returned by the tool(s), you can call additional tools if needed, correct tool calls if any errors are found, or just respond with the answer to the user.' 
-}}{%- endif -%}{{- eot -}}{%- for message in messages -%}{%- if message.role == user_token -%}{{- start_header ~ user_token ~ end_header -}}{{ message.content -}}{{ eot -}}{%- elif message.role == assistant_token -%}{%- if '</think>' in message.content -%}{%- set content = message.content.split('</think>')[-1].lstrip() -%}{%- else -%}{%- set content = message.content -%}{%- endif -%}{{- start_header ~ assistant_token ~ end_header -}}{{ content -}}{%- if message.tool_calls -%}{{- '<TOOLCALL>[' -}}{%- for call in message.tool_calls -%}{%- set fn = call.function if call.function is defined else call -%}{{- '{\"name\": \"' + fn.name + '\", \"arguments\": ' -}}{%- if fn.arguments is string -%}{{- fn.arguments -}}{%- else -%}{{- fn.arguments | tojson -}}{%- endif -%}{{- '}' + (', ' if not loop.last else '') -}}{%- endfor -%}{{- ']</TOOLCALL>' -}}{%- endif -%}{{- eot -}}{%- elif message.role == tool_token -%}{%- if loop.first or (messages[loop.index0 - 1].role != tool_token) -%}{{- start_header ~ tool_token ~ end_header -}}{{ '<TOOL_RESPONSE>[' -}}{%- endif -%}{{- message.content -}}{{- ', ' if not loop.last and (messages[loop.index0 + 1].role == tool_token) else '' -}}{%- if loop.last or (messages[loop.index0 + 1].role != tool_token) -%}{{- ']</TOOL_RESPONSE>' -}}{{ eot -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{- start_header ~ assistant_token ~ end_header -}}{%- if not enable_thinking -%}{{- '<think>\n\n</think>\n\n' -}}{%- endif -%}{%- endif -%}"
72
+ trust_remote_code: true
73
+
74
+ adapter: lora
75
+ lora_r: 32
76
+ lora_alpha: 64
77
+ lora_dropout: 0.0
78
+ lora_bias: None
79
+ lora_target_linear: true
80
+ use_tensorboard: true
81
+
82
+ optimizer: paged_adamw_8bit
83
+ learning_rate: 1.25e-5 # = 1e-4 / 8 (the original note said "1e-4 / 4", which would be 2.5e-5)
84
+ loraplus_lr_ratio: 16
85
+
86
+ # Training arguments
87
+ output_dir: ./Writer-Stage-1
88
+ num_epochs: 3
89
+ micro_batch_size: 1
90
+ gradient_accumulation_steps: 16
91
+ save_strategy: 'no'
92
+ warmup_ratio: 0.05
93
+ lr_scheduler: 'constant_with_warmup'
94
+ max_grad_norm: 1
95
+ logging_steps: 1
96
+ seed: 42
97
+ ```
98
+
99
+ </details><br>
100
+
101
+ # Writer-Stage-1
102
+
103
+ This model is a LoRA adapter fine-tuned from [nvidia/Llama-3_3-Nemotron-Super-49B-v1_5](https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1_5) on the ConicCat/GLiMA_Thinking, ConicCat/Gutenberg-SFT, ConicCat/Condor-SFT-Filtered, ConicCat/Ao3_Soft_Refusal, and ConicCat/VSF datasets.
104
+
105
+ ## Model description
106
+
107
+ More information needed
108
+
109
+ ## Intended uses & limitations
110
+
111
+ More information needed
112
+
113
+ ## Training and evaluation data
114
+
115
+ More information needed
116
+
117
+ ## Training procedure
118
+
119
+ ### Training hyperparameters
120
+
121
+ The following hyperparameters were used during training:
122
+ - learning_rate: 1.25e-05
123
+ - train_batch_size: 1
124
+ - eval_batch_size: 1
125
+ - seed: 42
126
+ - gradient_accumulation_steps: 16
127
+ - total_train_batch_size: 16
128
+ - optimizer: Use OptimizerNames.PAGED_ADAMW_8BIT with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
129
+ - lr_scheduler_type: constant_with_warmup
130
+ - lr_scheduler_warmup_steps: 2
131
+ - training_steps: 54
132
+
133
+ ### Training results
134
+
135
+
136
+
137
+ ### Framework versions
138
+
139
+ - PEFT 0.18.1
140
+ - Transformers 5.3.0
141
+ - Pytorch 2.9.1+cu128
142
+ - Datasets 4.5.0
143
+ - Tokenizers 0.22.2
adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": null,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 32,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "o_proj",
33
+ "q_proj",
34
+ "gate_proj",
35
+ "down_proj",
36
+ "k_proj",
37
+ "v_proj",
38
+ "up_proj"
39
+ ],
40
+ "target_parameters": [],
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35aec8e7edeb4728f563221db3318570ae5a184c1fd972dc577cca44c6ddab69
3
+ size 1203621016
block_config.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import json
3
+ import warnings
4
+ from dataclasses import dataclass, MISSING
5
+ from functools import partial
6
+ from typing import Optional, Any
7
+
8
+
9
+ @partial(dataclass, frozen=True, kw_only=True)
10
+ class JsonComparable:
11
+ def to_json(self) -> str:
12
+ return json.dumps(dataclasses.asdict(self))
13
+
14
+ def __eq__(self, other: "JsonComparable") -> bool:
15
+ return self.to_json() == other.to_json()
16
+
17
+ def __hash__(self) -> int:
18
+ return hash(self.to_json())
19
+
20
+ def __lt__(self, other: "JsonComparable") -> bool:
21
+ return self.to_json() < other.to_json()
22
+
23
+
24
@partial(dataclass, frozen=True, kw_only=True)
class SubblockConfig(JsonComparable):
    """Options common to every sub-block configuration.

    ``no_op`` and ``replace_with_linear`` are mutually exclusive; this is
    asserted right after construction.
    """

    no_op: bool = False
    replace_with_linear: bool = False
    sparsify: Optional[list[str]] = None

    def __post_init__(self):
        # At most one of the two replacement modes may be enabled.
        both_modes_enabled = self.no_op and self.replace_with_linear
        assert not both_modes_enabled

    def _force_setattr(self, name: str, value: Any) -> None:
        """
        Assign ``value`` to attribute ``name`` despite ``frozen=True``.
        Intended to be called from __post_init__ only!
        """
        # Goes through ``object`` directly to bypass the frozen-dataclass guard.
        object.__setattr__(self, name, value)
39
+
40
+
41
@partial(dataclass, frozen=True, kw_only=True)
class AttentionConfig(SubblockConfig):
    """Configuration of the attention sub-block.

    When the sub-block is a no-op or replaced by a linear layer, the
    attention-specific fields ``n_heads_in_group``, ``window_length`` and
    ``num_sink_tokens`` are reset to ``None``; otherwise ``n_heads_in_group``
    is required.
    """

    n_heads_in_group: Optional[int] = None
    window_length: Optional[int] = None
    num_sink_tokens: Optional[int] = None
    use_prefill_window_in_sink_attention: bool = False
    unshifted_sink: bool = False

    def __post_init__(self):
        """Normalize irrelevant fields and validate the sink-attention options."""
        # The base class asserts that ``no_op`` and ``replace_with_linear``
        # are not both set, so that check is not repeated here.
        super().__post_init__()

        if self.no_op or self.replace_with_linear:
            # Attention parameters are meaningless without real attention.
            for irrelevant_att in ("n_heads_in_group", "window_length", "num_sink_tokens"):
                self._force_setattr(irrelevant_att, None)
        else:
            assert self.n_heads_in_group is not None

        if self.is_sink:
            assert not (self.unshifted_sink and self.use_prefill_window_in_sink_attention), \
                ("Unshifted sink uses its own kind of explicit masking, not standard window. "
                 "Set use_prefill_window_in_sink_attention to False.")
            assert not (self.num_sink_tokens == 0 and not self.unshifted_sink), \
                "Fake sink attention with 0 sink tokens is only supported with unshifted_sink=True"

    @property
    def prefill_sliding_window(self) -> Optional[int]:
        """Return ``window_length`` when a prefill window applies, else ``None``.

        A window applies when ``window_length`` is set and either this is not
        sink attention or ``use_prefill_window_in_sink_attention`` is true.
        """
        if self.window_length is None:
            return None
        if self.is_sink and not self.use_prefill_window_in_sink_attention:
            return None
        return self.window_length

    @property
    def is_sliding(self) -> bool:
        """Return True when ``prefill_sliding_window`` is not ``None``."""
        return self.prefill_sliding_window is not None

    @property
    def is_sink(self) -> bool:
        """Return True when both ``window_length`` and ``num_sink_tokens`` are set."""
        return self.window_length is not None and self.num_sink_tokens is not None
84
+
85
+
86
@partial(dataclass, frozen=True, kw_only=True)
class FFNConfig(SubblockConfig):
    """Configuration of the feed-forward sub-block.

    ``ffn_mult`` is cleared when the sub-block is a no-op or replaced by a
    linear layer; otherwise it is required and stored rounded to 6 decimals.
    """

    ffn_mult: Optional[float] = None

    def __post_init__(self):
        super().__post_init__()

        ffn_disabled = self.no_op or self.replace_with_linear
        if ffn_disabled:
            # No real FFN, so the multiplier carries no meaning.
            self._force_setattr("ffn_mult", None)
            return

        assert self.ffn_mult is not None
        rounded_mult = round(self.ffn_mult, 6)
        self._force_setattr("ffn_mult", rounded_mult)
97
+
98
+
99
@partial(dataclass, frozen=True, kw_only=True)
class BlockConfig(JsonComparable):
    """Configuration of one block: an attention sub-block and an FFN sub-block."""

    # MISSING as the default leaves both fields without a default value.
    attention: AttentionConfig = MISSING
    ffn: FFNConfig = MISSING

    def __post_init__(self):
        """
        Convert sub-block values given as dicts into their dataclass types.

        Keys that are not fields of the target dataclass are dropped and
        reported with a warning.
        """
        for block_field in dataclasses.fields(self):
            raw_value = getattr(self, block_field.name)
            if not isinstance(raw_value, dict):
                continue

            config_cls = block_field.type
            known_names = {cls_field.name for cls_field in dataclasses.fields(config_cls)}
            unsupported_fields = [key for key in raw_value if key not in known_names]
            if unsupported_fields:
                warnings.warn(f"Removed unsupported fields {unsupported_fields} from {config_cls.__name__}")

            accepted = {key: value for key, value in raw_value.items() if key in known_names}
            # object.__setattr__ is needed because the dataclass is frozen.
            object.__setattr__(self, block_field.name, config_cls(**accepted))
chat_template.jinja ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% set bos = "<|begin_of_text|>" %}{%- set enable_thinking = false -%}{% set system_start_header = "<|start_header_id|>" %}{% set system_end_header = "<|end_header_id|>
2
+
3
+ " %}{% set start_header = "<|start_header_id|>" %}{% set end_header = "<|end_header_id|>
4
+
5
+ " %}{% set eot = "<|eot_id|>" %}{% set system_token = "system" %}{% set user_token = "user" %}{% set assistant_token = "assistant" %}{% set tool_token = "tool" %}{{- bos ~ system_start_header ~ system_token ~ system_end_header -}}{%- if messages[0].role == 'system' and messages[0].content != '' -%}{%- set system_content = messages[0].content -%}{%- if '/no_think' in system_content -%}{%- set system_content = system_content.replace('/no_think', '')|trim -%}{%- set enable_thinking = false -%}{%- elif '/think' in system_content -%}{%- set system_content = system_content.replace('/think', '')|trim -%}{%- set enable_thinking = true -%}{%- endif -%}{{- system_content + '
6
+
7
+ ' -}}{%- endif -%}{%- if tools -%}{{- 'You can use the following tools to assist the user if required:
8
+ <AVAILABLE_TOOLS>[' -}}{%- for tool in tools -%}{{- (tool.function if tool.function is defined else tool) | tojson -}}{{- ', ' if not loop.last else '' -}}{%- endfor -%}{{- ']</AVAILABLE_TOOLS>
9
+
10
+ If you decide to call any tool(s), use the following format:
11
+ <TOOLCALL>[{{"name": "tool_name1", "arguments": "tool_args1"}}, {{"name": "tool_name2", "arguments": "tool_args2"}}]</TOOLCALL>
12
+
13
+ Response from tool(s) will be returned in this format:
14
+ <TOOL_RESPONSE>[{{"response": "tool_response1"}}, {{"response": "tool_response2"}}]</TOOL_RESPONSE>
15
+
16
+ Based on the results returned by the tool(s), you can call additional tools if needed, correct tool calls if any errors are found, or just respond with the answer to the user.' -}}{%- endif -%}{{- eot -}}{%- for message in messages -%}{%- if message.role == user_token -%}{{- start_header ~ user_token ~ end_header -}}{{ message.content -}}{{ eot -}}{%- elif message.role == assistant_token -%}{%- if '</think>' in message.content -%}{%- set content = message.content.split('</think>')[-1].lstrip() -%}{%- else -%}{%- set content = message.content -%}{%- endif -%}{{- start_header ~ assistant_token ~ end_header -}}{{ content -}}{%- if message.tool_calls -%}{{- '<TOOLCALL>[' -}}{%- for call in message.tool_calls -%}{%- set fn = call.function if call.function is defined else call -%}{{- '{"name": "' + fn.name + '", "arguments": ' -}}{%- if fn.arguments is string -%}{{- fn.arguments -}}{%- else -%}{{- fn.arguments | tojson -}}{%- endif -%}{{- '}' + (', ' if not loop.last else '') -}}{%- endfor -%}{{- ']</TOOLCALL>' -}}{%- endif -%}{{- eot -}}{%- elif message.role == tool_token -%}{%- if loop.first or (messages[loop.index0 - 1].role != tool_token) -%}{{- start_header ~ tool_token ~ end_header -}}{{ '<TOOL_RESPONSE>[' -}}{%- endif -%}{{- message.content -}}{{- ', ' if not loop.last and (messages[loop.index0 + 1].role == tool_token) else '' -}}{%- if loop.last or (messages[loop.index0 + 1].role != tool_token) -%}{{- ']</TOOL_RESPONSE>' -}}{{ eot -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{- start_header ~ assistant_token ~ end_header -}}{%- if not enable_thinking -%}{{- '<think>
17
+
18
+ </think>
19
+
20
+ ' -}}{%- endif -%}{%- endif -%}
config.json ADDED
@@ -0,0 +1,1497 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "DeciLMForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_decilm.DeciLMConfig",
9
+ "AutoModelForCausalLM": "modeling_decilm.DeciLMForCausalLM"
10
+ },
11
+ "block_configs": [
12
+ {
13
+ "attention": {
14
+ "n_heads_in_group": 8,
15
+ "no_op": false,
16
+ "num_sink_tokens": null,
17
+ "replace_with_linear": false,
18
+ "sparsify": null,
19
+ "unshifted_sink": false,
20
+ "use_prefill_window_in_sink_attention": false,
21
+ "window_length": null
22
+ },
23
+ "ffn": {
24
+ "ffn_mult": 2.625,
25
+ "no_op": false,
26
+ "replace_with_linear": false,
27
+ "sparsify": null
28
+ }
29
+ },
30
+ {
31
+ "attention": {
32
+ "n_heads_in_group": 8,
33
+ "no_op": false,
34
+ "num_sink_tokens": null,
35
+ "replace_with_linear": false,
36
+ "sparsify": null,
37
+ "unshifted_sink": false,
38
+ "use_prefill_window_in_sink_attention": false,
39
+ "window_length": null
40
+ },
41
+ "ffn": {
42
+ "ffn_mult": 5.25,
43
+ "no_op": false,
44
+ "replace_with_linear": false,
45
+ "sparsify": null
46
+ }
47
+ },
48
+ {
49
+ "attention": {
50
+ "n_heads_in_group": 8,
51
+ "no_op": false,
52
+ "num_sink_tokens": null,
53
+ "replace_with_linear": false,
54
+ "sparsify": null,
55
+ "unshifted_sink": false,
56
+ "use_prefill_window_in_sink_attention": false,
57
+ "window_length": null
58
+ },
59
+ "ffn": {
60
+ "ffn_mult": 5.25,
61
+ "no_op": false,
62
+ "replace_with_linear": false,
63
+ "sparsify": null
64
+ }
65
+ },
66
+ {
67
+ "attention": {
68
+ "n_heads_in_group": 8,
69
+ "no_op": false,
70
+ "num_sink_tokens": null,
71
+ "replace_with_linear": false,
72
+ "sparsify": null,
73
+ "unshifted_sink": false,
74
+ "use_prefill_window_in_sink_attention": false,
75
+ "window_length": null
76
+ },
77
+ "ffn": {
78
+ "ffn_mult": 5.25,
79
+ "no_op": false,
80
+ "replace_with_linear": false,
81
+ "sparsify": null
82
+ }
83
+ },
84
+ {
85
+ "attention": {
86
+ "n_heads_in_group": 8,
87
+ "no_op": false,
88
+ "num_sink_tokens": null,
89
+ "replace_with_linear": false,
90
+ "sparsify": null,
91
+ "unshifted_sink": false,
92
+ "use_prefill_window_in_sink_attention": false,
93
+ "window_length": null
94
+ },
95
+ "ffn": {
96
+ "ffn_mult": 5.25,
97
+ "no_op": false,
98
+ "replace_with_linear": false,
99
+ "sparsify": null
100
+ }
101
+ },
102
+ {
103
+ "attention": {
104
+ "n_heads_in_group": 8,
105
+ "no_op": false,
106
+ "num_sink_tokens": null,
107
+ "replace_with_linear": false,
108
+ "sparsify": null,
109
+ "unshifted_sink": false,
110
+ "use_prefill_window_in_sink_attention": false,
111
+ "window_length": null
112
+ },
113
+ "ffn": {
114
+ "ffn_mult": 5.25,
115
+ "no_op": false,
116
+ "replace_with_linear": false,
117
+ "sparsify": null
118
+ }
119
+ },
120
+ {
121
+ "attention": {
122
+ "n_heads_in_group": null,
123
+ "no_op": true,
124
+ "num_sink_tokens": null,
125
+ "replace_with_linear": false,
126
+ "sparsify": null,
127
+ "unshifted_sink": false,
128
+ "use_prefill_window_in_sink_attention": false,
129
+ "window_length": null
130
+ },
131
+ "ffn": {
132
+ "ffn_mult": 2.625,
133
+ "no_op": false,
134
+ "replace_with_linear": false,
135
+ "sparsify": null
136
+ }
137
+ },
138
+ {
139
+ "attention": {
140
+ "n_heads_in_group": null,
141
+ "no_op": true,
142
+ "num_sink_tokens": null,
143
+ "replace_with_linear": false,
144
+ "sparsify": null,
145
+ "unshifted_sink": false,
146
+ "use_prefill_window_in_sink_attention": false,
147
+ "window_length": null
148
+ },
149
+ "ffn": {
150
+ "ffn_mult": 2.625,
151
+ "no_op": false,
152
+ "replace_with_linear": false,
153
+ "sparsify": null
154
+ }
155
+ },
156
+ {
157
+ "attention": {
158
+ "n_heads_in_group": 8,
159
+ "no_op": false,
160
+ "num_sink_tokens": null,
161
+ "replace_with_linear": false,
162
+ "sparsify": null,
163
+ "unshifted_sink": false,
164
+ "use_prefill_window_in_sink_attention": false,
165
+ "window_length": null
166
+ },
167
+ "ffn": {
168
+ "ffn_mult": 5.25,
169
+ "no_op": false,
170
+ "replace_with_linear": false,
171
+ "sparsify": null
172
+ }
173
+ },
174
+ {
175
+ "attention": {
176
+ "n_heads_in_group": 8,
177
+ "no_op": false,
178
+ "num_sink_tokens": null,
179
+ "replace_with_linear": false,
180
+ "sparsify": null,
181
+ "unshifted_sink": false,
182
+ "use_prefill_window_in_sink_attention": false,
183
+ "window_length": null
184
+ },
185
+ "ffn": {
186
+ "ffn_mult": 5.25,
187
+ "no_op": false,
188
+ "replace_with_linear": false,
189
+ "sparsify": null
190
+ }
191
+ },
192
+ {
193
+ "attention": {
194
+ "n_heads_in_group": 8,
195
+ "no_op": false,
196
+ "num_sink_tokens": null,
197
+ "replace_with_linear": false,
198
+ "sparsify": null,
199
+ "unshifted_sink": false,
200
+ "use_prefill_window_in_sink_attention": false,
201
+ "window_length": null
202
+ },
203
+ "ffn": {
204
+ "ffn_mult": 5.25,
205
+ "no_op": false,
206
+ "replace_with_linear": false,
207
+ "sparsify": null
208
+ }
209
+ },
210
+ {
211
+ "attention": {
212
+ "n_heads_in_group": null,
213
+ "no_op": true,
214
+ "num_sink_tokens": null,
215
+ "replace_with_linear": false,
216
+ "sparsify": null,
217
+ "unshifted_sink": false,
218
+ "use_prefill_window_in_sink_attention": false,
219
+ "window_length": null
220
+ },
221
+ "ffn": {
222
+ "ffn_mult": 3.28125,
223
+ "no_op": false,
224
+ "replace_with_linear": false,
225
+ "sparsify": null
226
+ }
227
+ },
228
+ {
229
+ "attention": {
230
+ "n_heads_in_group": 8,
231
+ "no_op": false,
232
+ "num_sink_tokens": null,
233
+ "replace_with_linear": false,
234
+ "sparsify": null,
235
+ "unshifted_sink": false,
236
+ "use_prefill_window_in_sink_attention": false,
237
+ "window_length": null
238
+ },
239
+ "ffn": {
240
+ "ffn_mult": 5.25,
241
+ "no_op": false,
242
+ "replace_with_linear": false,
243
+ "sparsify": null
244
+ }
245
+ },
246
+ {
247
+ "attention": {
248
+ "n_heads_in_group": 8,
249
+ "no_op": false,
250
+ "num_sink_tokens": null,
251
+ "replace_with_linear": false,
252
+ "sparsify": null,
253
+ "unshifted_sink": false,
254
+ "use_prefill_window_in_sink_attention": false,
255
+ "window_length": null
256
+ },
257
+ "ffn": {
258
+ "ffn_mult": 5.25,
259
+ "no_op": false,
260
+ "replace_with_linear": false,
261
+ "sparsify": null
262
+ }
263
+ },
264
+ {
265
+ "attention": {
266
+ "n_heads_in_group": 8,
267
+ "no_op": false,
268
+ "num_sink_tokens": null,
269
+ "replace_with_linear": false,
270
+ "sparsify": null,
271
+ "unshifted_sink": false,
272
+ "use_prefill_window_in_sink_attention": false,
273
+ "window_length": null
274
+ },
275
+ "ffn": {
276
+ "ffn_mult": 5.25,
277
+ "no_op": false,
278
+ "replace_with_linear": false,
279
+ "sparsify": null
280
+ }
281
+ },
282
+ {
283
+ "attention": {
284
+ "n_heads_in_group": 8,
285
+ "no_op": false,
286
+ "num_sink_tokens": null,
287
+ "replace_with_linear": false,
288
+ "sparsify": null,
289
+ "unshifted_sink": false,
290
+ "use_prefill_window_in_sink_attention": false,
291
+ "window_length": null
292
+ },
293
+ "ffn": {
294
+ "ffn_mult": 5.25,
295
+ "no_op": false,
296
+ "replace_with_linear": false,
297
+ "sparsify": null
298
+ }
299
+ },
300
+ {
301
+ "attention": {
302
+ "n_heads_in_group": 8,
303
+ "no_op": false,
304
+ "num_sink_tokens": null,
305
+ "replace_with_linear": false,
306
+ "sparsify": null,
307
+ "unshifted_sink": false,
308
+ "use_prefill_window_in_sink_attention": false,
309
+ "window_length": null
310
+ },
311
+ "ffn": {
312
+ "ffn_mult": 5.25,
313
+ "no_op": false,
314
+ "replace_with_linear": false,
315
+ "sparsify": null
316
+ }
317
+ },
318
+ {
319
+ "attention": {
320
+ "n_heads_in_group": 8,
321
+ "no_op": false,
322
+ "num_sink_tokens": null,
323
+ "replace_with_linear": false,
324
+ "sparsify": null,
325
+ "unshifted_sink": false,
326
+ "use_prefill_window_in_sink_attention": false,
327
+ "window_length": null
328
+ },
329
+ "ffn": {
330
+ "ffn_mult": 5.25,
331
+ "no_op": false,
332
+ "replace_with_linear": false,
333
+ "sparsify": null
334
+ }
335
+ },
336
+ {
337
+ "attention": {
338
+ "n_heads_in_group": 8,
339
+ "no_op": false,
340
+ "num_sink_tokens": null,
341
+ "replace_with_linear": false,
342
+ "sparsify": null,
343
+ "unshifted_sink": false,
344
+ "use_prefill_window_in_sink_attention": false,
345
+ "window_length": null
346
+ },
347
+ "ffn": {
348
+ "ffn_mult": 5.25,
349
+ "no_op": false,
350
+ "replace_with_linear": false,
351
+ "sparsify": null
352
+ }
353
+ },
354
+ {
355
+ "attention": {
356
+ "n_heads_in_group": 8,
357
+ "no_op": false,
358
+ "num_sink_tokens": null,
359
+ "replace_with_linear": false,
360
+ "sparsify": null,
361
+ "unshifted_sink": false,
362
+ "use_prefill_window_in_sink_attention": false,
363
+ "window_length": null
364
+ },
365
+ "ffn": {
366
+ "ffn_mult": 5.25,
367
+ "no_op": false,
368
+ "replace_with_linear": false,
369
+ "sparsify": null
370
+ }
371
+ },
372
+ {
373
+ "attention": {
374
+ "n_heads_in_group": 8,
375
+ "no_op": false,
376
+ "num_sink_tokens": null,
377
+ "replace_with_linear": false,
378
+ "sparsify": null,
379
+ "unshifted_sink": false,
380
+ "use_prefill_window_in_sink_attention": false,
381
+ "window_length": null
382
+ },
383
+ "ffn": {
384
+ "ffn_mult": 5.25,
385
+ "no_op": false,
386
+ "replace_with_linear": false,
387
+ "sparsify": null
388
+ }
389
+ },
390
+ {
391
+ "attention": {
392
+ "n_heads_in_group": 8,
393
+ "no_op": false,
394
+ "num_sink_tokens": null,
395
+ "replace_with_linear": false,
396
+ "sparsify": null,
397
+ "unshifted_sink": false,
398
+ "use_prefill_window_in_sink_attention": false,
399
+ "window_length": null
400
+ },
401
+ "ffn": {
402
+ "ffn_mult": 5.25,
403
+ "no_op": false,
404
+ "replace_with_linear": false,
405
+ "sparsify": null
406
+ }
407
+ },
408
+ {
409
+ "attention": {
410
+ "n_heads_in_group": 8,
411
+ "no_op": false,
412
+ "num_sink_tokens": null,
413
+ "replace_with_linear": false,
414
+ "sparsify": null,
415
+ "unshifted_sink": false,
416
+ "use_prefill_window_in_sink_attention": false,
417
+ "window_length": null
418
+ },
419
+ "ffn": {
420
+ "ffn_mult": 5.25,
421
+ "no_op": false,
422
+ "replace_with_linear": false,
423
+ "sparsify": null
424
+ }
425
+ },
426
+ {
427
+ "attention": {
428
+ "n_heads_in_group": 8,
429
+ "no_op": false,
430
+ "num_sink_tokens": null,
431
+ "replace_with_linear": false,
432
+ "sparsify": null,
433
+ "unshifted_sink": false,
434
+ "use_prefill_window_in_sink_attention": false,
435
+ "window_length": null
436
+ },
437
+ "ffn": {
438
+ "ffn_mult": 5.25,
439
+ "no_op": false,
440
+ "replace_with_linear": false,
441
+ "sparsify": null
442
+ }
443
+ },
444
+ {
445
+ "attention": {
446
+ "n_heads_in_group": 8,
447
+ "no_op": false,
448
+ "num_sink_tokens": null,
449
+ "replace_with_linear": false,
450
+ "sparsify": null,
451
+ "unshifted_sink": false,
452
+ "use_prefill_window_in_sink_attention": false,
453
+ "window_length": null
454
+ },
455
+ "ffn": {
456
+ "ffn_mult": 5.25,
457
+ "no_op": false,
458
+ "replace_with_linear": false,
459
+ "sparsify": null
460
+ }
461
+ },
462
+ {
463
+ "attention": {
464
+ "n_heads_in_group": 8,
465
+ "no_op": false,
466
+ "num_sink_tokens": null,
467
+ "replace_with_linear": false,
468
+ "sparsify": null,
469
+ "unshifted_sink": false,
470
+ "use_prefill_window_in_sink_attention": false,
471
+ "window_length": null
472
+ },
473
+ "ffn": {
474
+ "ffn_mult": 5.25,
475
+ "no_op": false,
476
+ "replace_with_linear": false,
477
+ "sparsify": null
478
+ }
479
+ },
480
+ {
481
+ "attention": {
482
+ "n_heads_in_group": 8,
483
+ "no_op": false,
484
+ "num_sink_tokens": null,
485
+ "replace_with_linear": false,
486
+ "sparsify": null,
487
+ "unshifted_sink": false,
488
+ "use_prefill_window_in_sink_attention": false,
489
+ "window_length": null
490
+ },
491
+ "ffn": {
492
+ "ffn_mult": 5.25,
493
+ "no_op": false,
494
+ "replace_with_linear": false,
495
+ "sparsify": null
496
+ }
497
+ },
498
+ {
499
+ "attention": {
500
+ "n_heads_in_group": 8,
501
+ "no_op": false,
502
+ "num_sink_tokens": null,
503
+ "replace_with_linear": false,
504
+ "sparsify": null,
505
+ "unshifted_sink": false,
506
+ "use_prefill_window_in_sink_attention": false,
507
+ "window_length": null
508
+ },
509
+ "ffn": {
510
+ "ffn_mult": 5.25,
511
+ "no_op": false,
512
+ "replace_with_linear": false,
513
+ "sparsify": null
514
+ }
515
+ },
516
+ {
517
+ "attention": {
518
+ "n_heads_in_group": 8,
519
+ "no_op": false,
520
+ "num_sink_tokens": null,
521
+ "replace_with_linear": false,
522
+ "sparsify": null,
523
+ "unshifted_sink": false,
524
+ "use_prefill_window_in_sink_attention": false,
525
+ "window_length": null
526
+ },
527
+ "ffn": {
528
+ "ffn_mult": 5.25,
529
+ "no_op": false,
530
+ "replace_with_linear": false,
531
+ "sparsify": null
532
+ }
533
+ },
534
+ {
535
+ "attention": {
536
+ "n_heads_in_group": 8,
537
+ "no_op": false,
538
+ "num_sink_tokens": null,
539
+ "replace_with_linear": false,
540
+ "sparsify": null,
541
+ "unshifted_sink": false,
542
+ "use_prefill_window_in_sink_attention": false,
543
+ "window_length": null
544
+ },
545
+ "ffn": {
546
+ "ffn_mult": 5.25,
547
+ "no_op": false,
548
+ "replace_with_linear": false,
549
+ "sparsify": null
550
+ }
551
+ },
552
+ {
553
+ "attention": {
554
+ "n_heads_in_group": 8,
555
+ "no_op": false,
556
+ "num_sink_tokens": null,
557
+ "replace_with_linear": false,
558
+ "sparsify": null,
559
+ "unshifted_sink": false,
560
+ "use_prefill_window_in_sink_attention": false,
561
+ "window_length": null
562
+ },
563
+ "ffn": {
564
+ "ffn_mult": 5.25,
565
+ "no_op": false,
566
+ "replace_with_linear": false,
567
+ "sparsify": null
568
+ }
569
+ },
570
+ {
571
+ "attention": {
572
+ "n_heads_in_group": 8,
573
+ "no_op": false,
574
+ "num_sink_tokens": null,
575
+ "replace_with_linear": false,
576
+ "sparsify": null,
577
+ "unshifted_sink": false,
578
+ "use_prefill_window_in_sink_attention": false,
579
+ "window_length": null
580
+ },
581
+ "ffn": {
582
+ "ffn_mult": 5.25,
583
+ "no_op": false,
584
+ "replace_with_linear": false,
585
+ "sparsify": null
586
+ }
587
+ },
588
+ {
589
+ "attention": {
590
+ "n_heads_in_group": 8,
591
+ "no_op": false,
592
+ "num_sink_tokens": null,
593
+ "replace_with_linear": false,
594
+ "sparsify": null,
595
+ "unshifted_sink": false,
596
+ "use_prefill_window_in_sink_attention": false,
597
+ "window_length": null
598
+ },
599
+ "ffn": {
600
+ "ffn_mult": 5.25,
601
+ "no_op": false,
602
+ "replace_with_linear": false,
603
+ "sparsify": null
604
+ }
605
+ },
606
+ {
607
+ "attention": {
608
+ "n_heads_in_group": 8,
609
+ "no_op": false,
610
+ "num_sink_tokens": null,
611
+ "replace_with_linear": false,
612
+ "sparsify": null,
613
+ "unshifted_sink": false,
614
+ "use_prefill_window_in_sink_attention": false,
615
+ "window_length": null
616
+ },
617
+ "ffn": {
618
+ "ffn_mult": 5.25,
619
+ "no_op": false,
620
+ "replace_with_linear": false,
621
+ "sparsify": null
622
+ }
623
+ },
624
+ {
625
+ "attention": {
626
+ "n_heads_in_group": 8,
627
+ "no_op": false,
628
+ "num_sink_tokens": null,
629
+ "replace_with_linear": false,
630
+ "sparsify": null,
631
+ "unshifted_sink": false,
632
+ "use_prefill_window_in_sink_attention": false,
633
+ "window_length": null
634
+ },
635
+ "ffn": {
636
+ "ffn_mult": 5.25,
637
+ "no_op": false,
638
+ "replace_with_linear": false,
639
+ "sparsify": null
640
+ }
641
+ },
642
+ {
643
+ "attention": {
644
+ "n_heads_in_group": 8,
645
+ "no_op": false,
646
+ "num_sink_tokens": null,
647
+ "replace_with_linear": false,
648
+ "sparsify": null,
649
+ "unshifted_sink": false,
650
+ "use_prefill_window_in_sink_attention": false,
651
+ "window_length": null
652
+ },
653
+ "ffn": {
654
+ "ffn_mult": 5.25,
655
+ "no_op": false,
656
+ "replace_with_linear": false,
657
+ "sparsify": null
658
+ }
659
+ },
660
+ {
661
+ "attention": {
662
+ "n_heads_in_group": 8,
663
+ "no_op": false,
664
+ "num_sink_tokens": null,
665
+ "replace_with_linear": false,
666
+ "sparsify": null,
667
+ "unshifted_sink": false,
668
+ "use_prefill_window_in_sink_attention": false,
669
+ "window_length": null
670
+ },
671
+ "ffn": {
672
+ "ffn_mult": 5.25,
673
+ "no_op": false,
674
+ "replace_with_linear": false,
675
+ "sparsify": null
676
+ }
677
+ },
678
+ {
679
+ "attention": {
680
+ "n_heads_in_group": 8,
681
+ "no_op": false,
682
+ "num_sink_tokens": null,
683
+ "replace_with_linear": false,
684
+ "sparsify": null,
685
+ "unshifted_sink": false,
686
+ "use_prefill_window_in_sink_attention": false,
687
+ "window_length": null
688
+ },
689
+ "ffn": {
690
+ "ffn_mult": 5.25,
691
+ "no_op": false,
692
+ "replace_with_linear": false,
693
+ "sparsify": null
694
+ }
695
+ },
696
+ {
697
+ "attention": {
698
+ "n_heads_in_group": 8,
699
+ "no_op": false,
700
+ "num_sink_tokens": null,
701
+ "replace_with_linear": false,
702
+ "sparsify": null,
703
+ "unshifted_sink": false,
704
+ "use_prefill_window_in_sink_attention": false,
705
+ "window_length": null
706
+ },
707
+ "ffn": {
708
+ "ffn_mult": 5.25,
709
+ "no_op": false,
710
+ "replace_with_linear": false,
711
+ "sparsify": null
712
+ }
713
+ },
714
+ {
715
+ "attention": {
716
+ "n_heads_in_group": 8,
717
+ "no_op": false,
718
+ "num_sink_tokens": null,
719
+ "replace_with_linear": false,
720
+ "sparsify": null,
721
+ "unshifted_sink": false,
722
+ "use_prefill_window_in_sink_attention": false,
723
+ "window_length": null
724
+ },
725
+ "ffn": {
726
+ "ffn_mult": 5.25,
727
+ "no_op": false,
728
+ "replace_with_linear": false,
729
+ "sparsify": null
730
+ }
731
+ },
732
+ {
733
+ "attention": {
734
+ "n_heads_in_group": 8,
735
+ "no_op": false,
736
+ "num_sink_tokens": null,
737
+ "replace_with_linear": false,
738
+ "sparsify": null,
739
+ "unshifted_sink": false,
740
+ "use_prefill_window_in_sink_attention": false,
741
+ "window_length": null
742
+ },
743
+ "ffn": {
744
+ "ffn_mult": 5.25,
745
+ "no_op": false,
746
+ "replace_with_linear": false,
747
+ "sparsify": null
748
+ }
749
+ },
750
+ {
751
+ "attention": {
752
+ "n_heads_in_group": 8,
753
+ "no_op": false,
754
+ "num_sink_tokens": null,
755
+ "replace_with_linear": false,
756
+ "sparsify": null,
757
+ "unshifted_sink": false,
758
+ "use_prefill_window_in_sink_attention": false,
759
+ "window_length": null
760
+ },
761
+ "ffn": {
762
+ "ffn_mult": 5.25,
763
+ "no_op": false,
764
+ "replace_with_linear": false,
765
+ "sparsify": null
766
+ }
767
+ },
768
+ {
769
+ "attention": {
770
+ "n_heads_in_group": null,
771
+ "no_op": true,
772
+ "num_sink_tokens": null,
773
+ "replace_with_linear": false,
774
+ "sparsify": null,
775
+ "unshifted_sink": false,
776
+ "use_prefill_window_in_sink_attention": false,
777
+ "window_length": null
778
+ },
779
+ "ffn": {
780
+ "ffn_mult": 1.3125,
781
+ "no_op": false,
782
+ "replace_with_linear": false,
783
+ "sparsify": null
784
+ }
785
+ },
786
+ {
787
+ "attention": {
788
+ "n_heads_in_group": null,
789
+ "no_op": true,
790
+ "num_sink_tokens": null,
791
+ "replace_with_linear": false,
792
+ "sparsify": null,
793
+ "unshifted_sink": false,
794
+ "use_prefill_window_in_sink_attention": false,
795
+ "window_length": null
796
+ },
797
+ "ffn": {
798
+ "ffn_mult": 2.625,
799
+ "no_op": false,
800
+ "replace_with_linear": false,
801
+ "sparsify": null
802
+ }
803
+ },
804
+ {
805
+ "attention": {
806
+ "n_heads_in_group": null,
807
+ "no_op": true,
808
+ "num_sink_tokens": null,
809
+ "replace_with_linear": false,
810
+ "sparsify": null,
811
+ "unshifted_sink": false,
812
+ "use_prefill_window_in_sink_attention": false,
813
+ "window_length": null
814
+ },
815
+ "ffn": {
816
+ "ffn_mult": 2.625,
817
+ "no_op": false,
818
+ "replace_with_linear": false,
819
+ "sparsify": null
820
+ }
821
+ },
822
+ {
823
+ "attention": {
824
+ "n_heads_in_group": null,
825
+ "no_op": true,
826
+ "num_sink_tokens": null,
827
+ "replace_with_linear": false,
828
+ "sparsify": null,
829
+ "unshifted_sink": false,
830
+ "use_prefill_window_in_sink_attention": false,
831
+ "window_length": null
832
+ },
833
+ "ffn": {
834
+ "ffn_mult": 1.3125,
835
+ "no_op": false,
836
+ "replace_with_linear": false,
837
+ "sparsify": null
838
+ }
839
+ },
840
+ {
841
+ "attention": {
842
+ "n_heads_in_group": null,
843
+ "no_op": true,
844
+ "num_sink_tokens": null,
845
+ "replace_with_linear": false,
846
+ "sparsify": null,
847
+ "unshifted_sink": false,
848
+ "use_prefill_window_in_sink_attention": false,
849
+ "window_length": null
850
+ },
851
+ "ffn": {
852
+ "ffn_mult": 5.25,
853
+ "no_op": false,
854
+ "replace_with_linear": false,
855
+ "sparsify": null
856
+ }
857
+ },
858
+ {
859
+ "attention": {
860
+ "n_heads_in_group": null,
861
+ "no_op": true,
862
+ "num_sink_tokens": null,
863
+ "replace_with_linear": false,
864
+ "sparsify": null,
865
+ "unshifted_sink": false,
866
+ "use_prefill_window_in_sink_attention": false,
867
+ "window_length": null
868
+ },
869
+ "ffn": {
870
+ "ffn_mult": 1.3125,
871
+ "no_op": false,
872
+ "replace_with_linear": false,
873
+ "sparsify": null
874
+ }
875
+ },
876
+ {
877
+ "attention": {
878
+ "n_heads_in_group": null,
879
+ "no_op": true,
880
+ "num_sink_tokens": null,
881
+ "replace_with_linear": false,
882
+ "sparsify": null,
883
+ "unshifted_sink": false,
884
+ "use_prefill_window_in_sink_attention": false,
885
+ "window_length": null
886
+ },
887
+ "ffn": {
888
+ "ffn_mult": 2.625,
889
+ "no_op": false,
890
+ "replace_with_linear": false,
891
+ "sparsify": null
892
+ }
893
+ },
894
+ {
895
+ "attention": {
896
+ "n_heads_in_group": null,
897
+ "no_op": true,
898
+ "num_sink_tokens": null,
899
+ "replace_with_linear": false,
900
+ "sparsify": null,
901
+ "unshifted_sink": false,
902
+ "use_prefill_window_in_sink_attention": false,
903
+ "window_length": null
904
+ },
905
+ "ffn": {
906
+ "ffn_mult": 1.3125,
907
+ "no_op": false,
908
+ "replace_with_linear": false,
909
+ "sparsify": null
910
+ }
911
+ },
912
+ {
913
+ "attention": {
914
+ "n_heads_in_group": null,
915
+ "no_op": true,
916
+ "num_sink_tokens": null,
917
+ "replace_with_linear": false,
918
+ "sparsify": null,
919
+ "unshifted_sink": false,
920
+ "use_prefill_window_in_sink_attention": false,
921
+ "window_length": null
922
+ },
923
+ "ffn": {
924
+ "ffn_mult": 1.3125,
925
+ "no_op": false,
926
+ "replace_with_linear": false,
927
+ "sparsify": null
928
+ }
929
+ },
930
+ {
931
+ "attention": {
932
+ "n_heads_in_group": null,
933
+ "no_op": true,
934
+ "num_sink_tokens": null,
935
+ "replace_with_linear": false,
936
+ "sparsify": null,
937
+ "unshifted_sink": false,
938
+ "use_prefill_window_in_sink_attention": false,
939
+ "window_length": null
940
+ },
941
+ "ffn": {
942
+ "ffn_mult": 1.3125,
943
+ "no_op": false,
944
+ "replace_with_linear": false,
945
+ "sparsify": null
946
+ }
947
+ },
948
+ {
949
+ "attention": {
950
+ "n_heads_in_group": 8,
951
+ "no_op": false,
952
+ "num_sink_tokens": null,
953
+ "replace_with_linear": false,
954
+ "sparsify": null,
955
+ "unshifted_sink": false,
956
+ "use_prefill_window_in_sink_attention": false,
957
+ "window_length": null
958
+ },
959
+ "ffn": {
960
+ "ffn_mult": 5.25,
961
+ "no_op": false,
962
+ "replace_with_linear": false,
963
+ "sparsify": null
964
+ }
965
+ },
966
+ {
967
+ "attention": {
968
+ "n_heads_in_group": null,
969
+ "no_op": true,
970
+ "num_sink_tokens": null,
971
+ "replace_with_linear": false,
972
+ "sparsify": null,
973
+ "unshifted_sink": false,
974
+ "use_prefill_window_in_sink_attention": false,
975
+ "window_length": null
976
+ },
977
+ "ffn": {
978
+ "ffn_mult": 1.3125,
979
+ "no_op": false,
980
+ "replace_with_linear": false,
981
+ "sparsify": null
982
+ }
983
+ },
984
+ {
985
+ "attention": {
986
+ "n_heads_in_group": null,
987
+ "no_op": true,
988
+ "num_sink_tokens": null,
989
+ "replace_with_linear": false,
990
+ "sparsify": null,
991
+ "unshifted_sink": false,
992
+ "use_prefill_window_in_sink_attention": false,
993
+ "window_length": null
994
+ },
995
+ "ffn": {
996
+ "ffn_mult": 1.0,
997
+ "no_op": false,
998
+ "replace_with_linear": false,
999
+ "sparsify": null
1000
+ }
1001
+ },
1002
+ {
1003
+ "attention": {
1004
+ "n_heads_in_group": null,
1005
+ "no_op": true,
1006
+ "num_sink_tokens": null,
1007
+ "replace_with_linear": false,
1008
+ "sparsify": null,
1009
+ "unshifted_sink": false,
1010
+ "use_prefill_window_in_sink_attention": false,
1011
+ "window_length": null
1012
+ },
1013
+ "ffn": {
1014
+ "ffn_mult": 1.0,
1015
+ "no_op": false,
1016
+ "replace_with_linear": false,
1017
+ "sparsify": null
1018
+ }
1019
+ },
1020
+ {
1021
+ "attention": {
1022
+ "n_heads_in_group": null,
1023
+ "no_op": true,
1024
+ "num_sink_tokens": null,
1025
+ "replace_with_linear": false,
1026
+ "sparsify": null,
1027
+ "unshifted_sink": false,
1028
+ "use_prefill_window_in_sink_attention": false,
1029
+ "window_length": null
1030
+ },
1031
+ "ffn": {
1032
+ "ffn_mult": 1.3125,
1033
+ "no_op": false,
1034
+ "replace_with_linear": false,
1035
+ "sparsify": null
1036
+ }
1037
+ },
1038
+ {
1039
+ "attention": {
1040
+ "n_heads_in_group": null,
1041
+ "no_op": true,
1042
+ "num_sink_tokens": null,
1043
+ "replace_with_linear": false,
1044
+ "sparsify": null,
1045
+ "unshifted_sink": false,
1046
+ "use_prefill_window_in_sink_attention": false,
1047
+ "window_length": null
1048
+ },
1049
+ "ffn": {
1050
+ "ffn_mult": 1.0,
1051
+ "no_op": false,
1052
+ "replace_with_linear": false,
1053
+ "sparsify": null
1054
+ }
1055
+ },
1056
+ {
1057
+ "attention": {
1058
+ "n_heads_in_group": null,
1059
+ "no_op": true,
1060
+ "num_sink_tokens": null,
1061
+ "replace_with_linear": false,
1062
+ "sparsify": null,
1063
+ "unshifted_sink": false,
1064
+ "use_prefill_window_in_sink_attention": false,
1065
+ "window_length": null
1066
+ },
1067
+ "ffn": {
1068
+ "ffn_mult": 1.0,
1069
+ "no_op": false,
1070
+ "replace_with_linear": false,
1071
+ "sparsify": null
1072
+ }
1073
+ },
1074
+ {
1075
+ "attention": {
1076
+ "n_heads_in_group": null,
1077
+ "no_op": true,
1078
+ "num_sink_tokens": null,
1079
+ "replace_with_linear": false,
1080
+ "sparsify": null,
1081
+ "unshifted_sink": false,
1082
+ "use_prefill_window_in_sink_attention": false,
1083
+ "window_length": null
1084
+ },
1085
+ "ffn": {
1086
+ "ffn_mult": 1.0,
1087
+ "no_op": false,
1088
+ "replace_with_linear": false,
1089
+ "sparsify": null
1090
+ }
1091
+ },
1092
+ {
1093
+ "attention": {
1094
+ "n_heads_in_group": null,
1095
+ "no_op": true,
1096
+ "num_sink_tokens": null,
1097
+ "replace_with_linear": false,
1098
+ "sparsify": null,
1099
+ "unshifted_sink": false,
1100
+ "use_prefill_window_in_sink_attention": false,
1101
+ "window_length": null
1102
+ },
1103
+ "ffn": {
1104
+ "ffn_mult": 1.3125,
1105
+ "no_op": false,
1106
+ "replace_with_linear": false,
1107
+ "sparsify": null
1108
+ }
1109
+ },
1110
+ {
1111
+ "attention": {
1112
+ "n_heads_in_group": null,
1113
+ "no_op": true,
1114
+ "num_sink_tokens": null,
1115
+ "replace_with_linear": false,
1116
+ "sparsify": null,
1117
+ "unshifted_sink": false,
1118
+ "use_prefill_window_in_sink_attention": false,
1119
+ "window_length": null
1120
+ },
1121
+ "ffn": {
1122
+ "ffn_mult": 1.3125,
1123
+ "no_op": false,
1124
+ "replace_with_linear": false,
1125
+ "sparsify": null
1126
+ }
1127
+ },
1128
+ {
1129
+ "attention": {
1130
+ "n_heads_in_group": null,
1131
+ "no_op": true,
1132
+ "num_sink_tokens": null,
1133
+ "replace_with_linear": false,
1134
+ "sparsify": null,
1135
+ "unshifted_sink": false,
1136
+ "use_prefill_window_in_sink_attention": false,
1137
+ "window_length": null
1138
+ },
1139
+ "ffn": {
1140
+ "ffn_mult": 0.5,
1141
+ "no_op": false,
1142
+ "replace_with_linear": false,
1143
+ "sparsify": null
1144
+ }
1145
+ },
1146
+ {
1147
+ "attention": {
1148
+ "n_heads_in_group": null,
1149
+ "no_op": true,
1150
+ "num_sink_tokens": null,
1151
+ "replace_with_linear": false,
1152
+ "sparsify": null,
1153
+ "unshifted_sink": false,
1154
+ "use_prefill_window_in_sink_attention": false,
1155
+ "window_length": null
1156
+ },
1157
+ "ffn": {
1158
+ "ffn_mult": 0.5,
1159
+ "no_op": false,
1160
+ "replace_with_linear": false,
1161
+ "sparsify": null
1162
+ }
1163
+ },
1164
+ {
1165
+ "attention": {
1166
+ "n_heads_in_group": null,
1167
+ "no_op": true,
1168
+ "num_sink_tokens": null,
1169
+ "replace_with_linear": false,
1170
+ "sparsify": null,
1171
+ "unshifted_sink": false,
1172
+ "use_prefill_window_in_sink_attention": false,
1173
+ "window_length": null
1174
+ },
1175
+ "ffn": {
1176
+ "ffn_mult": 1.0,
1177
+ "no_op": false,
1178
+ "replace_with_linear": false,
1179
+ "sparsify": null
1180
+ }
1181
+ },
1182
+ {
1183
+ "attention": {
1184
+ "n_heads_in_group": null,
1185
+ "no_op": true,
1186
+ "num_sink_tokens": null,
1187
+ "replace_with_linear": false,
1188
+ "sparsify": null,
1189
+ "unshifted_sink": false,
1190
+ "use_prefill_window_in_sink_attention": false,
1191
+ "window_length": null
1192
+ },
1193
+ "ffn": {
1194
+ "ffn_mult": 1.0,
1195
+ "no_op": false,
1196
+ "replace_with_linear": false,
1197
+ "sparsify": null
1198
+ }
1199
+ },
1200
+ {
1201
+ "attention": {
1202
+ "n_heads_in_group": null,
1203
+ "no_op": true,
1204
+ "num_sink_tokens": null,
1205
+ "replace_with_linear": false,
1206
+ "sparsify": null,
1207
+ "unshifted_sink": false,
1208
+ "use_prefill_window_in_sink_attention": false,
1209
+ "window_length": null
1210
+ },
1211
+ "ffn": {
1212
+ "ffn_mult": 0.5,
1213
+ "no_op": false,
1214
+ "replace_with_linear": false,
1215
+ "sparsify": null
1216
+ }
1217
+ },
1218
+ {
1219
+ "attention": {
1220
+ "n_heads_in_group": null,
1221
+ "no_op": true,
1222
+ "num_sink_tokens": null,
1223
+ "replace_with_linear": false,
1224
+ "sparsify": null,
1225
+ "unshifted_sink": false,
1226
+ "use_prefill_window_in_sink_attention": false,
1227
+ "window_length": null
1228
+ },
1229
+ "ffn": {
1230
+ "ffn_mult": 0.5,
1231
+ "no_op": false,
1232
+ "replace_with_linear": false,
1233
+ "sparsify": null
1234
+ }
1235
+ },
1236
+ {
1237
+ "attention": {
1238
+ "n_heads_in_group": null,
1239
+ "no_op": true,
1240
+ "num_sink_tokens": null,
1241
+ "replace_with_linear": false,
1242
+ "sparsify": null,
1243
+ "unshifted_sink": false,
1244
+ "use_prefill_window_in_sink_attention": false,
1245
+ "window_length": null
1246
+ },
1247
+ "ffn": {
1248
+ "ffn_mult": 1.0,
1249
+ "no_op": false,
1250
+ "replace_with_linear": false,
1251
+ "sparsify": null
1252
+ }
1253
+ },
1254
+ {
1255
+ "attention": {
1256
+ "n_heads_in_group": null,
1257
+ "no_op": true,
1258
+ "num_sink_tokens": null,
1259
+ "replace_with_linear": false,
1260
+ "sparsify": null,
1261
+ "unshifted_sink": false,
1262
+ "use_prefill_window_in_sink_attention": false,
1263
+ "window_length": null
1264
+ },
1265
+ "ffn": {
1266
+ "ffn_mult": 0.5,
1267
+ "no_op": false,
1268
+ "replace_with_linear": false,
1269
+ "sparsify": null
1270
+ }
1271
+ },
1272
+ {
1273
+ "attention": {
1274
+ "n_heads_in_group": null,
1275
+ "no_op": true,
1276
+ "num_sink_tokens": null,
1277
+ "replace_with_linear": false,
1278
+ "sparsify": null,
1279
+ "unshifted_sink": false,
1280
+ "use_prefill_window_in_sink_attention": false,
1281
+ "window_length": null
1282
+ },
1283
+ "ffn": {
1284
+ "ffn_mult": 0.5,
1285
+ "no_op": false,
1286
+ "replace_with_linear": false,
1287
+ "sparsify": null
1288
+ }
1289
+ },
1290
+ {
1291
+ "attention": {
1292
+ "n_heads_in_group": 8,
1293
+ "no_op": false,
1294
+ "num_sink_tokens": null,
1295
+ "replace_with_linear": false,
1296
+ "sparsify": null,
1297
+ "unshifted_sink": false,
1298
+ "use_prefill_window_in_sink_attention": false,
1299
+ "window_length": null
1300
+ },
1301
+ "ffn": {
1302
+ "ffn_mult": 5.25,
1303
+ "no_op": false,
1304
+ "replace_with_linear": false,
1305
+ "sparsify": null
1306
+ }
1307
+ },
1308
+ {
1309
+ "attention": {
1310
+ "n_heads_in_group": 8,
1311
+ "no_op": false,
1312
+ "num_sink_tokens": null,
1313
+ "replace_with_linear": false,
1314
+ "sparsify": null,
1315
+ "unshifted_sink": false,
1316
+ "use_prefill_window_in_sink_attention": false,
1317
+ "window_length": null
1318
+ },
1319
+ "ffn": {
1320
+ "ffn_mult": 5.25,
1321
+ "no_op": false,
1322
+ "replace_with_linear": false,
1323
+ "sparsify": null
1324
+ }
1325
+ },
1326
+ {
1327
+ "attention": {
1328
+ "n_heads_in_group": 8,
1329
+ "no_op": false,
1330
+ "num_sink_tokens": null,
1331
+ "replace_with_linear": false,
1332
+ "sparsify": null,
1333
+ "unshifted_sink": false,
1334
+ "use_prefill_window_in_sink_attention": false,
1335
+ "window_length": null
1336
+ },
1337
+ "ffn": {
1338
+ "ffn_mult": 5.25,
1339
+ "no_op": false,
1340
+ "replace_with_linear": false,
1341
+ "sparsify": null
1342
+ }
1343
+ },
1344
+ {
1345
+ "attention": {
1346
+ "n_heads_in_group": 8,
1347
+ "no_op": false,
1348
+ "num_sink_tokens": null,
1349
+ "replace_with_linear": false,
1350
+ "sparsify": null,
1351
+ "unshifted_sink": false,
1352
+ "use_prefill_window_in_sink_attention": false,
1353
+ "window_length": null
1354
+ },
1355
+ "ffn": {
1356
+ "ffn_mult": 5.25,
1357
+ "no_op": false,
1358
+ "replace_with_linear": false,
1359
+ "sparsify": null
1360
+ }
1361
+ },
1362
+ {
1363
+ "attention": {
1364
+ "n_heads_in_group": 8,
1365
+ "no_op": false,
1366
+ "num_sink_tokens": null,
1367
+ "replace_with_linear": false,
1368
+ "sparsify": null,
1369
+ "unshifted_sink": false,
1370
+ "use_prefill_window_in_sink_attention": false,
1371
+ "window_length": null
1372
+ },
1373
+ "ffn": {
1374
+ "ffn_mult": 5.25,
1375
+ "no_op": false,
1376
+ "replace_with_linear": false,
1377
+ "sparsify": null
1378
+ }
1379
+ },
1380
+ {
1381
+ "attention": {
1382
+ "n_heads_in_group": 8,
1383
+ "no_op": false,
1384
+ "num_sink_tokens": null,
1385
+ "replace_with_linear": false,
1386
+ "sparsify": null,
1387
+ "unshifted_sink": false,
1388
+ "use_prefill_window_in_sink_attention": false,
1389
+ "window_length": null
1390
+ },
1391
+ "ffn": {
1392
+ "ffn_mult": 5.25,
1393
+ "no_op": false,
1394
+ "replace_with_linear": false,
1395
+ "sparsify": null
1396
+ }
1397
+ },
1398
+ {
1399
+ "attention": {
1400
+ "n_heads_in_group": 8,
1401
+ "no_op": false,
1402
+ "num_sink_tokens": null,
1403
+ "replace_with_linear": false,
1404
+ "sparsify": null,
1405
+ "unshifted_sink": false,
1406
+ "use_prefill_window_in_sink_attention": false,
1407
+ "window_length": null
1408
+ },
1409
+ "ffn": {
1410
+ "ffn_mult": 5.25,
1411
+ "no_op": false,
1412
+ "replace_with_linear": false,
1413
+ "sparsify": null
1414
+ }
1415
+ },
1416
+ {
1417
+ "attention": {
1418
+ "n_heads_in_group": 8,
1419
+ "no_op": false,
1420
+ "num_sink_tokens": null,
1421
+ "replace_with_linear": false,
1422
+ "sparsify": null,
1423
+ "unshifted_sink": false,
1424
+ "use_prefill_window_in_sink_attention": false,
1425
+ "window_length": null
1426
+ },
1427
+ "ffn": {
1428
+ "ffn_mult": 5.25,
1429
+ "no_op": false,
1430
+ "replace_with_linear": false,
1431
+ "sparsify": null
1432
+ }
1433
+ },
1434
+ {
1435
+ "attention": {
1436
+ "n_heads_in_group": 8,
1437
+ "no_op": false,
1438
+ "num_sink_tokens": null,
1439
+ "replace_with_linear": false,
1440
+ "sparsify": null,
1441
+ "unshifted_sink": false,
1442
+ "use_prefill_window_in_sink_attention": false,
1443
+ "window_length": null
1444
+ },
1445
+ "ffn": {
1446
+ "ffn_mult": 5.25,
1447
+ "no_op": false,
1448
+ "replace_with_linear": false,
1449
+ "sparsify": null
1450
+ }
1451
+ }
1452
+ ],
1453
+ "bos_token_id": 128000,
1454
+ "dtype": "bfloat16",
1455
+ "eos_token_id": 128009,
1456
+ "hidden_act": "silu",
1457
+ "hidden_size": 8192,
1458
+ "initializer_range": 0.02,
1459
+ "intermediate_size": null,
1460
+ "max_position_embeddings": 131072,
1461
+ "mlp_bias": false,
1462
+ "model_type": "nemotron-nas",
1463
+ "num_attention_heads": 64,
1464
+ "num_hidden_layers": 80,
1465
+ "num_key_value_heads": null,
1466
+ "pad_token_id": null,
1467
+ "pretraining_tp": 1,
1468
+ "quantization_config": {
1469
+ "_load_in_4bit": false,
1470
+ "_load_in_8bit": true,
1471
+ "bnb_4bit_compute_dtype": "float32",
1472
+ "bnb_4bit_quant_storage": "uint8",
1473
+ "bnb_4bit_quant_type": "fp4",
1474
+ "bnb_4bit_use_double_quant": false,
1475
+ "llm_int8_enable_fp32_cpu_offload": false,
1476
+ "llm_int8_has_fp16_weight": false,
1477
+ "llm_int8_skip_modules": null,
1478
+ "llm_int8_threshold": 6.0,
1479
+ "load_in_4bit": false,
1480
+ "load_in_8bit": true,
1481
+ "quant_method": "bitsandbytes"
1482
+ },
1483
+ "rms_norm_eps": 1e-05,
1484
+ "rope_parameters": {
1485
+ "factor": 16.0,
1486
+ "high_freq_factor": 4.0,
1487
+ "low_freq_factor": 1.0,
1488
+ "original_max_position_embeddings": 8192,
1489
+ "rope_theta": 500000.0,
1490
+ "rope_type": "llama3"
1491
+ },
1492
+ "rope_theta": 500000.0,
1493
+ "tie_word_embeddings": false,
1494
+ "transformers_version": "5.3.0",
1495
+ "use_cache": false,
1496
+ "vocab_size": 128256
1497
+ }
configuration_decilm.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Nvidia Corporation. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import dataclasses
17
+ import warnings
18
+ from typing import Dict, Any
19
+
20
+ from transformers.utils import is_flash_attn_2_available
21
+
22
+ from .block_config import BlockConfig
23
+ from .transformers_4_44_2__configuration_llama import LlamaConfig
24
+ from .transformers_4_44_2__modeling_rope_utils import \
25
+ rope_config_validation # fake import to make AutoConfig infer the dependency
26
+
27
+ rope_config_validation # this line is here to make sure that auto-formatting doesn't remove the import
28
+
29
+
30
class DeciLMConfig(LlamaConfig):
    """Llama-style configuration carrying one ``BlockConfig`` per hidden layer.

    On top of the inherited ``LlamaConfig`` fields this class stores
    ``block_configs``, a list describing each layer's attention and FFN
    settings, and serializes it back to plain dicts in :meth:`to_dict`.
    """

    model_type = "nemotron-nas"

    def __init__(
        self,
        block_configs: list[dict] | list[BlockConfig] | None = None,
        **kwargs,
    ):
        """Build the config, choosing an attention implementation on the way.

        Args:
            block_configs: Per-layer block descriptions, given either as
                ``BlockConfig`` instances or as dicts of ``BlockConfig``
                keyword arguments (dict entries are converted). ``None``
                leaves ``self.block_configs`` unset (``None``).
            **kwargs: Forwarded to ``LlamaConfig.__init__``. An
                ``attn_implementation`` entry is consumed here first.

        Raises:
            AssertionError: If ``block_configs`` is given and its length
                differs from ``self.num_hidden_layers``.
        """
        # Default to flash-attention 2 only when the caller expressed no
        # preference and the helper reports it as available.
        attn_implementation = kwargs.pop("attn_implementation", None)
        if attn_implementation is None and is_flash_attn_2_available():
            attn_implementation = "flash_attention_2"

        if block_configs is not None:
            # Convert element-wise instead of peeking at block_configs[0]:
            # an empty list no longer raises IndexError, and a list mixing
            # dicts with BlockConfig instances is handled consistently.
            block_configs = [
                BlockConfig(**conf) if isinstance(conf, dict) else conf
                for conf in block_configs
            ]

            using_unshifted_sink = any(
                conf.attention.unshifted_sink for conf in block_configs
            )
            if using_unshifted_sink and attn_implementation != "eager":
                warnings.warn("Forcing attn_implementation='eager' since some attention layers use unshifted sink")
                attn_implementation = "eager"

        super().__init__(attn_implementation=attn_implementation, **kwargs)

        # Cleared after the parent constructor has run, so whatever value the
        # parent assigned is discarded.
        # NOTE(review): presumably because these sizes are defined per block
        # (e.g. ``ffn_mult`` / ``n_heads_in_group``) -- confirm against BlockConfig.
        self.intermediate_size = None
        self.num_key_value_heads = None

        if block_configs is not None:
            # Kept as an assert so callers still see AssertionError; the
            # message makes a layer-count mismatch diagnosable.
            assert len(block_configs) == self.num_hidden_layers, (
                f"expected {self.num_hidden_layers} block configs "
                f"(num_hidden_layers), got {len(block_configs)}"
            )

        self.block_configs: list[BlockConfig] | None = block_configs

    def to_dict(self) -> Dict[str, Any]:
        """Return the parent's dict form with ``block_configs`` as plain dicts.

        When ``self.block_configs`` is not ``None`` each entry is expanded
        with ``dataclasses.asdict``; otherwise the parent's result is
        returned unchanged.
        """
        self_dict = super().to_dict()
        if self.block_configs is not None:
            self_dict["block_configs"] = [dataclasses.asdict(conf) for conf in self.block_configs]
        return self_dict
debug.log ADDED
@@ -0,0 +1,730 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-03-31 02:46:14,052] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:10906] baseline 0.000GB ()
2
+ [2026-03-31 02:46:14,053] [INFO] [axolotl.cli.config.load_cfg:341] [PID:10906] config:
3
+ {
4
+ "activation_offloading": false,
5
+ "adapter": "lora",
6
+ "axolotl_config_path": "writer.yaml",
7
+ "base_model": "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
8
+ "base_model_config": "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
9
+ "batch_size": 16,
10
+ "bf16": true,
11
+ "capabilities": {
12
+ "bf16": true,
13
+ "compute_capability": "sm_90",
14
+ "fp8": true,
15
+ "n_gpu": 1,
16
+ "n_node": 1,
17
+ "tf32": true
18
+ },
19
+ "chat_template": "jinja",
20
+ "chat_template_jinja": "{% set bos = \"<|begin_of_text|>\" %}{%- set enable_thinking = false -%}{% set system_start_header = \"<|start_header_id|>\" %}{% set system_end_header = \"<|end_header_id|>\n\n\" %}{% set start_header = \"<|start_header_id|>\" %}{% set end_header = \"<|end_header_id|>\n\n\" %}{% set eot = \"<|eot_id|>\" %}{% set system_token = \"system\" %}{% set user_token = \"user\" %}{% set assistant_token = \"assistant\" %}{% set tool_token = \"tool\" %}{{- bos ~ system_start_header ~ system_token ~ system_end_header -}}{%- if messages[0].role == 'system' and messages[0].content != '' -%}{%- set system_content = messages[0].content -%}{%- if '/no_think' in system_content -%}{%- set system_content = system_content.replace('/no_think', '')|trim -%}{%- set enable_thinking = false -%}{%- elif '/think' in system_content -%}{%- set system_content = system_content.replace('/think', '')|trim -%}{%- set enable_thinking = true -%}{%- endif -%}{{- system_content + '\n\n' -}}{%- endif -%}{%- if tools -%}{{- 'You can use the following tools to assist the user if required:\n<AVAILABLE_TOOLS>[' -}}{%- for tool in tools -%}{{- (tool.function if tool.function is defined else tool) | tojson -}}{{- ', ' if not loop.last else '' -}}{%- endfor -%}{{- ']</AVAILABLE_TOOLS>\n\nIf you decide to call any tool(s), use the following format:\n<TOOLCALL>[{{\"name\": \"tool_name1\", \"arguments\": \"tool_args1\"}}, {{\"name\": \"tool_name2\", \"arguments\": \"tool_args2\"}}]</TOOLCALL>\n\nResponse from tool(s) will be returned in this format:\n<TOOL_RESPONSE>[{{\"response\": \"tool_response1\"}}, {{\"response\": \"tool_response2\"}}]</TOOL_RESPONSE>\n\nBased on the results returned by the tool(s), you can call additional tools if needed, correct tool calls if any errors are found, or just respond with the answer to the user.' 
-}}{%- endif -%}{{- eot -}}{%- for message in messages -%}{%- if message.role == user_token -%}{{- start_header ~ user_token ~ end_header -}}{{ message.content -}}{{ eot -}}{%- elif message.role == assistant_token -%}{%- if '</think>' in message.content -%}{%- set content = message.content.split('</think>')[-1].lstrip() -%}{%- else -%}{%- set content = message.content -%}{%- endif -%}{{- start_header ~ assistant_token ~ end_header -}}{{ content -}}{%- if message.tool_calls -%}{{- '<TOOLCALL>[' -}}{%- for call in message.tool_calls -%}{%- set fn = call.function if call.function is defined else call -%}{{- '{\"name\": \"' + fn.name + '\", \"arguments\": ' -}}{%- if fn.arguments is string -%}{{- fn.arguments -}}{%- else -%}{{- fn.arguments | tojson -}}{%- endif -%}{{- '}' + (', ' if not loop.last else '') -}}{%- endfor -%}{{- ']</TOOLCALL>' -}}{%- endif -%}{{- eot -}}{%- elif message.role == tool_token -%}{%- if loop.first or (messages[loop.index0 - 1].role != tool_token) -%}{{- start_header ~ tool_token ~ end_header -}}{{ '<TOOL_RESPONSE>[' -}}{%- endif -%}{{- message.content -}}{{- ', ' if not loop.last and (messages[loop.index0 + 1].role == tool_token) else '' -}}{%- if loop.last or (messages[loop.index0 + 1].role != tool_token) -%}{{- ']</TOOL_RESPONSE>' -}}{{ eot -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{- start_header ~ assistant_token ~ end_header -}}{%- if not enable_thinking -%}{{- '<think>\n\n</think>\n\n' -}}{%- endif -%}{%- endif -%}",
21
+ "context_parallel_size": 1,
22
+ "dataloader_num_workers": 1,
23
+ "dataloader_pin_memory": true,
24
+ "dataloader_prefetch_factor": 256,
25
+ "dataset_num_proc": 8,
26
+ "datasets": [
27
+ {
28
+ "chat_template": "tokenizer_default",
29
+ "message_field_training": "train",
30
+ "message_property_mappings": {
31
+ "content": "content",
32
+ "role": "role"
33
+ },
34
+ "path": "ConicCat/GLiMA_Thinking",
35
+ "roles_to_train": [],
36
+ "train_on_eos": "turn",
37
+ "trust_remote_code": false,
38
+ "type": "chat_template"
39
+ },
40
+ {
41
+ "chat_template": "tokenizer_default",
42
+ "message_property_mappings": {
43
+ "content": "content",
44
+ "role": "role"
45
+ },
46
+ "path": "ConicCat/Gutenberg-SFT",
47
+ "trust_remote_code": false,
48
+ "type": "chat_template"
49
+ },
50
+ {
51
+ "chat_template": "tokenizer_default",
52
+ "message_property_mappings": {
53
+ "content": "content",
54
+ "role": "role"
55
+ },
56
+ "path": "ConicCat/Condor-SFT-Filtered",
57
+ "split": "train[:250]",
58
+ "trust_remote_code": false,
59
+ "type": "chat_template"
60
+ },
61
+ {
62
+ "chat_template": "tokenizer_default",
63
+ "message_property_mappings": {
64
+ "content": "content",
65
+ "role": "role"
66
+ },
67
+ "path": "ConicCat/Ao3_Soft_Refusal",
68
+ "trust_remote_code": false,
69
+ "type": "chat_template"
70
+ },
71
+ {
72
+ "chat_template": "tokenizer_default",
73
+ "message_property_mappings": {
74
+ "content": "content",
75
+ "role": "role"
76
+ },
77
+ "path": "ConicCat/VSF",
78
+ "trust_remote_code": false,
79
+ "type": "chat_template"
80
+ }
81
+ ],
82
+ "ddp": false,
83
+ "device": "cuda:0",
84
+ "device_map": "auto",
85
+ "dion_rank_fraction": 1.0,
86
+ "dion_rank_multiple_of": 1,
87
+ "eaft_alpha": 1.0,
88
+ "eaft_k": 20,
89
+ "env_capabilities": {
90
+ "torch_version": "2.9.1"
91
+ },
92
+ "eval_batch_size": 1,
93
+ "eval_causal_lm_metrics": [
94
+ "sacrebleu",
95
+ "comet",
96
+ "ter",
97
+ "chrf"
98
+ ],
99
+ "eval_max_new_tokens": 128,
100
+ "eval_sample_packing": true,
101
+ "eval_table_size": 0,
102
+ "experimental_skip_move_to_device": true,
103
+ "flash_attention": true,
104
+ "fp16": false,
105
+ "generate_samples": false,
106
+ "generation_do_sample": true,
107
+ "generation_max_new_tokens": 50,
108
+ "generation_prompt_ratio": 0.5,
109
+ "generation_temperature": 0.7,
110
+ "gradient_accumulation_steps": 16,
111
+ "gradient_checkpointing": true,
112
+ "gradient_checkpointing_kwargs": {
113
+ "use_reentrant": true
114
+ },
115
+ "include_tkps": true,
116
+ "is_llama_derived_model": true,
117
+ "layer_offloading": false,
118
+ "learning_rate": 1.25e-05,
119
+ "lisa_layers_attribute": "model.layers",
120
+ "load_best_model_at_end": false,
121
+ "load_in_4bit": false,
122
+ "load_in_8bit": false,
123
+ "local_rank": 0,
124
+ "logging_steps": 1,
125
+ "lora_alpha": 64,
126
+ "lora_dropout": 0.0,
127
+ "lora_mlp_kernel": false,
128
+ "lora_o_kernel": false,
129
+ "lora_qkv_kernel": false,
130
+ "lora_r": 32,
131
+ "lora_target_linear": true,
132
+ "loraplus_lr_embedding": 1e-06,
133
+ "loraplus_lr_ratio": 16.0,
134
+ "lr_scheduler": "constant_with_warmup",
135
+ "max_grad_norm": 1.0,
136
+ "mean_resizing_embeddings": false,
137
+ "merge_method": "memory_efficient",
138
+ "micro_batch_size": 1,
139
+ "model_config_type": "nemotron-nas",
140
+ "num_epochs": 3.0,
141
+ "num_generation_samples": 3,
142
+ "optimizer": "paged_adamw_8bit",
143
+ "otel_metrics_host": "localhost",
144
+ "otel_metrics_port": 8000,
145
+ "output_dir": "./Writer-Stage-1",
146
+ "pad_to_sequence_len": true,
147
+ "pretrain_multipack_attn": true,
148
+ "profiler_steps_start": 0,
149
+ "qlora_sharded_model_loading": false,
150
+ "quantize_moe_experts": false,
151
+ "ray_num_workers": 1,
152
+ "resources_per_worker": {
153
+ "GPU": 1
154
+ },
155
+ "sample_packing": true,
156
+ "sample_packing_bin_size": 200,
157
+ "sample_packing_group_size": 100000,
158
+ "save_only_model": false,
159
+ "save_safetensors": true,
160
+ "save_strategy": "no",
161
+ "seed": 42,
162
+ "sequence_len": 5120,
163
+ "shuffle_before_merging_datasets": false,
164
+ "shuffle_merged_datasets": true,
165
+ "skip_prepare_dataset": false,
166
+ "streaming_multipack_buffer_size": 10000,
167
+ "strict": false,
168
+ "tensor_parallel_size": 1,
169
+ "tf32": true,
170
+ "tiled_mlp_use_original_mlp": true,
171
+ "tokenizer_config": "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
172
+ "tokenizer_save_jinja_files": true,
173
+ "torch_dtype": "torch.bfloat16",
174
+ "train_on_inputs": false,
175
+ "trl": {
176
+ "async_prefetch": false,
177
+ "log_completions": false,
178
+ "mask_truncated_completions": false,
179
+ "ref_model_mixup_alpha": 0.9,
180
+ "ref_model_sync_steps": 64,
181
+ "replay_buffer_size": 0,
182
+ "replay_recompute_logps": true,
183
+ "reroll_max_groups": 1,
184
+ "reroll_start_fraction": 1.0,
185
+ "reward_num_workers": 1,
186
+ "scale_rewards": true,
187
+ "skip_zero_advantage_batches": true,
188
+ "sync_ref_model": false,
189
+ "use_data_producer": false,
190
+ "use_vllm": false,
191
+ "vllm_lora_sync": false,
192
+ "vllm_server_host": "0.0.0.0",
193
+ "vllm_server_port": 8000
194
+ },
195
+ "trust_remote_code": true,
196
+ "use_otel_metrics": false,
197
+ "use_ray": false,
198
+ "use_tensorboard": true,
199
+ "val_set_size": 0.0,
200
+ "vllm": {
201
+ "device": "auto",
202
+ "dtype": "auto",
203
+ "gpu_memory_utilization": 0.9,
204
+ "host": "0.0.0.0",
205
+ "port": 8000
206
+ },
207
+ "warmup_ratio": 0.05,
208
+ "weight_decay": 0.0,
209
+ "world_size": 1
210
+ }
211
+ [2026-03-31 02:46:14,057] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:129] [PID:10906] explicitly setting `eval_sample_packing` to match `sample_packing`
212
+ [2026-03-31 02:46:14,057] [WARNING] [axolotl.utils.schemas.validation.check_sample_packing_without_attention:190] [PID:10906] sample_packing without flash, sdp, xformers, sage, or flex attention does not handle cross sample decontamination.
213
+ [2026-03-31 02:46:14,057] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:239] [PID:10906] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
214
+ [2026-03-31 02:46:14,057] [WARNING] [axolotl.utils.schemas.model.hint_trust_remote_code:103] [PID:10906] `trust_remote_code` is set to true. Please make sure that you reviewed the remote code/model.
215
+ [2026-03-31 02:46:14,759] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:10906] baseline 0.000GB ()
216
+ [2026-03-31 02:46:14,760] [INFO] [axolotl.cli.config.load_cfg:341] [PID:10906] config:
217
+ {
218
+ "activation_offloading": false,
219
+ "adapter": "lora",
220
+ "axolotl_config_path": "writer.yaml",
221
+ "base_model": "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
222
+ "base_model_config": "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
223
+ "batch_size": 16,
224
+ "bf16": true,
225
+ "capabilities": {
226
+ "bf16": true,
227
+ "compute_capability": "sm_90",
228
+ "fp8": true,
229
+ "n_gpu": 1,
230
+ "n_node": 1,
231
+ "tf32": true
232
+ },
233
+ "chat_template": "jinja",
234
+ "chat_template_jinja": "{% set bos = \"<|begin_of_text|>\" %}{%- set enable_thinking = false -%}{% set system_start_header = \"<|start_header_id|>\" %}{% set system_end_header = \"<|end_header_id|>\n\n\" %}{% set start_header = \"<|start_header_id|>\" %}{% set end_header = \"<|end_header_id|>\n\n\" %}{% set eot = \"<|eot_id|>\" %}{% set system_token = \"system\" %}{% set user_token = \"user\" %}{% set assistant_token = \"assistant\" %}{% set tool_token = \"tool\" %}{{- bos ~ system_start_header ~ system_token ~ system_end_header -}}{%- if messages[0].role == 'system' and messages[0].content != '' -%}{%- set system_content = messages[0].content -%}{%- if '/no_think' in system_content -%}{%- set system_content = system_content.replace('/no_think', '')|trim -%}{%- set enable_thinking = false -%}{%- elif '/think' in system_content -%}{%- set system_content = system_content.replace('/think', '')|trim -%}{%- set enable_thinking = true -%}{%- endif -%}{{- system_content + '\n\n' -}}{%- endif -%}{%- if tools -%}{{- 'You can use the following tools to assist the user if required:\n<AVAILABLE_TOOLS>[' -}}{%- for tool in tools -%}{{- (tool.function if tool.function is defined else tool) | tojson -}}{{- ', ' if not loop.last else '' -}}{%- endfor -%}{{- ']</AVAILABLE_TOOLS>\n\nIf you decide to call any tool(s), use the following format:\n<TOOLCALL>[{{\"name\": \"tool_name1\", \"arguments\": \"tool_args1\"}}, {{\"name\": \"tool_name2\", \"arguments\": \"tool_args2\"}}]</TOOLCALL>\n\nResponse from tool(s) will be returned in this format:\n<TOOL_RESPONSE>[{{\"response\": \"tool_response1\"}}, {{\"response\": \"tool_response2\"}}]</TOOL_RESPONSE>\n\nBased on the results returned by the tool(s), you can call additional tools if needed, correct tool calls if any errors are found, or just respond with the answer to the user.' 
-}}{%- endif -%}{{- eot -}}{%- for message in messages -%}{%- if message.role == user_token -%}{{- start_header ~ user_token ~ end_header -}}{{ message.content -}}{{ eot -}}{%- elif message.role == assistant_token -%}{%- if '</think>' in message.content -%}{%- set content = message.content.split('</think>')[-1].lstrip() -%}{%- else -%}{%- set content = message.content -%}{%- endif -%}{{- start_header ~ assistant_token ~ end_header -}}{{ content -}}{%- if message.tool_calls -%}{{- '<TOOLCALL>[' -}}{%- for call in message.tool_calls -%}{%- set fn = call.function if call.function is defined else call -%}{{- '{\"name\": \"' + fn.name + '\", \"arguments\": ' -}}{%- if fn.arguments is string -%}{{- fn.arguments -}}{%- else -%}{{- fn.arguments | tojson -}}{%- endif -%}{{- '}' + (', ' if not loop.last else '') -}}{%- endfor -%}{{- ']</TOOLCALL>' -}}{%- endif -%}{{- eot -}}{%- elif message.role == tool_token -%}{%- if loop.first or (messages[loop.index0 - 1].role != tool_token) -%}{{- start_header ~ tool_token ~ end_header -}}{{ '<TOOL_RESPONSE>[' -}}{%- endif -%}{{- message.content -}}{{- ', ' if not loop.last and (messages[loop.index0 + 1].role == tool_token) else '' -}}{%- if loop.last or (messages[loop.index0 + 1].role != tool_token) -%}{{- ']</TOOL_RESPONSE>' -}}{{ eot -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{- start_header ~ assistant_token ~ end_header -}}{%- if not enable_thinking -%}{{- '<think>\n\n</think>\n\n' -}}{%- endif -%}{%- endif -%}",
235
+ "context_parallel_size": 1,
236
+ "dataloader_num_workers": 1,
237
+ "dataloader_pin_memory": true,
238
+ "dataloader_prefetch_factor": 256,
239
+ "dataset_num_proc": 8,
240
+ "datasets": [
241
+ {
242
+ "chat_template": "tokenizer_default",
243
+ "message_field_training": "train",
244
+ "message_property_mappings": {
245
+ "content": "content",
246
+ "role": "role"
247
+ },
248
+ "path": "ConicCat/GLiMA_Thinking",
249
+ "roles_to_train": [],
250
+ "train_on_eos": "turn",
251
+ "trust_remote_code": false,
252
+ "type": "chat_template"
253
+ },
254
+ {
255
+ "chat_template": "tokenizer_default",
256
+ "message_property_mappings": {
257
+ "content": "content",
258
+ "role": "role"
259
+ },
260
+ "path": "ConicCat/Gutenberg-SFT",
261
+ "trust_remote_code": false,
262
+ "type": "chat_template"
263
+ },
264
+ {
265
+ "chat_template": "tokenizer_default",
266
+ "message_property_mappings": {
267
+ "content": "content",
268
+ "role": "role"
269
+ },
270
+ "path": "ConicCat/Condor-SFT-Filtered",
271
+ "split": "train[:250]",
272
+ "trust_remote_code": false,
273
+ "type": "chat_template"
274
+ },
275
+ {
276
+ "chat_template": "tokenizer_default",
277
+ "message_property_mappings": {
278
+ "content": "content",
279
+ "role": "role"
280
+ },
281
+ "path": "ConicCat/Ao3_Soft_Refusal",
282
+ "trust_remote_code": false,
283
+ "type": "chat_template"
284
+ },
285
+ {
286
+ "chat_template": "tokenizer_default",
287
+ "message_property_mappings": {
288
+ "content": "content",
289
+ "role": "role"
290
+ },
291
+ "path": "ConicCat/VSF",
292
+ "trust_remote_code": false,
293
+ "type": "chat_template"
294
+ }
295
+ ],
296
+ "ddp": false,
297
+ "device": "cuda:0",
298
+ "device_map": "auto",
299
+ "dion_rank_fraction": 1.0,
300
+ "dion_rank_multiple_of": 1,
301
+ "eaft_alpha": 1.0,
302
+ "eaft_k": 20,
303
+ "env_capabilities": {
304
+ "torch_version": "2.9.1"
305
+ },
306
+ "eval_batch_size": 1,
307
+ "eval_causal_lm_metrics": [
308
+ "sacrebleu",
309
+ "comet",
310
+ "ter",
311
+ "chrf"
312
+ ],
313
+ "eval_max_new_tokens": 128,
314
+ "eval_sample_packing": true,
315
+ "eval_table_size": 0,
316
+ "experimental_skip_move_to_device": true,
317
+ "flash_attention": false,
318
+ "fp16": false,
319
+ "generate_samples": false,
320
+ "generation_do_sample": true,
321
+ "generation_max_new_tokens": 50,
322
+ "generation_prompt_ratio": 0.5,
323
+ "generation_temperature": 0.7,
324
+ "gradient_accumulation_steps": 16,
325
+ "gradient_checkpointing": true,
326
+ "gradient_checkpointing_kwargs": {
327
+ "use_reentrant": true
328
+ },
329
+ "include_tkps": true,
330
+ "is_llama_derived_model": true,
331
+ "layer_offloading": false,
332
+ "learning_rate": 1.25e-05,
333
+ "lisa_layers_attribute": "model.layers",
334
+ "load_best_model_at_end": false,
335
+ "load_in_4bit": false,
336
+ "load_in_8bit": false,
337
+ "local_rank": 0,
338
+ "logging_steps": 1,
339
+ "lora_alpha": 64,
340
+ "lora_dropout": 0.0,
341
+ "lora_mlp_kernel": false,
342
+ "lora_o_kernel": false,
343
+ "lora_qkv_kernel": false,
344
+ "lora_r": 32,
345
+ "lora_target_linear": true,
346
+ "loraplus_lr_embedding": 1e-06,
347
+ "loraplus_lr_ratio": 16.0,
348
+ "lr_scheduler": "constant_with_warmup",
349
+ "max_grad_norm": 1.0,
350
+ "mean_resizing_embeddings": false,
351
+ "merge_lora": true,
352
+ "merge_method": "memory_efficient",
353
+ "micro_batch_size": 1,
354
+ "model_config_type": "nemotron-nas",
355
+ "num_epochs": 3.0,
356
+ "num_generation_samples": 3,
357
+ "optimizer": "paged_adamw_8bit",
358
+ "otel_metrics_host": "localhost",
359
+ "otel_metrics_port": 8000,
360
+ "output_dir": "./Writer-Stage-1",
361
+ "pad_to_sequence_len": true,
362
+ "pretrain_multipack_attn": true,
363
+ "profiler_steps_start": 0,
364
+ "qlora_sharded_model_loading": false,
365
+ "quantize_moe_experts": false,
366
+ "ray_num_workers": 1,
367
+ "resources_per_worker": {
368
+ "GPU": 1
369
+ },
370
+ "sample_packing": true,
371
+ "sample_packing_bin_size": 200,
372
+ "sample_packing_group_size": 100000,
373
+ "save_only_model": false,
374
+ "save_safetensors": true,
375
+ "save_strategy": "no",
376
+ "seed": 42,
377
+ "sequence_len": 5120,
378
+ "shuffle_before_merging_datasets": false,
379
+ "shuffle_merged_datasets": true,
380
+ "skip_prepare_dataset": false,
381
+ "streaming_multipack_buffer_size": 10000,
382
+ "strict": false,
383
+ "tensor_parallel_size": 1,
384
+ "tf32": true,
385
+ "tiled_mlp_use_original_mlp": true,
386
+ "tokenizer_config": "nvidia/Llama-3_3-Nemotron-Super-49B-v1_5",
387
+ "tokenizer_save_jinja_files": true,
388
+ "torch_dtype": "torch.bfloat16",
389
+ "train_on_inputs": false,
390
+ "trl": {
391
+ "async_prefetch": false,
392
+ "log_completions": false,
393
+ "mask_truncated_completions": false,
394
+ "ref_model_mixup_alpha": 0.9,
395
+ "ref_model_sync_steps": 64,
396
+ "replay_buffer_size": 0,
397
+ "replay_recompute_logps": true,
398
+ "reroll_max_groups": 1,
399
+ "reroll_start_fraction": 1.0,
400
+ "reward_num_workers": 1,
401
+ "scale_rewards": true,
402
+ "skip_zero_advantage_batches": true,
403
+ "sync_ref_model": false,
404
+ "use_data_producer": false,
405
+ "use_vllm": false,
406
+ "vllm_lora_sync": false,
407
+ "vllm_server_host": "0.0.0.0",
408
+ "vllm_server_port": 8000
409
+ },
410
+ "trust_remote_code": true,
411
+ "use_otel_metrics": false,
412
+ "use_ray": false,
413
+ "use_tensorboard": true,
414
+ "val_set_size": 0.0,
415
+ "vllm": {
416
+ "device": "auto",
417
+ "dtype": "auto",
418
+ "gpu_memory_utilization": 0.9,
419
+ "host": "0.0.0.0",
420
+ "port": 8000
421
+ },
422
+ "warmup_ratio": 0.05,
423
+ "weight_decay": 0.0,
424
+ "world_size": 1
425
+ }
426
+ [2026-03-31 02:46:14,760] [DEBUG] [axolotl.cli.merge_lora.do_merge_lora:32] [PID:10906] Using memory-efficient LoRA merging method...
427
+ [2026-03-31 02:46:14,760] [DEBUG] [axolotl.cli.merge_lora._do_merge_lora_efficient:79] [PID:10906] Using memory-efficient LoRA merging method...
428
+
429
+
430
+
431
+
432
+
433
+ [2026-03-31 02:46:19,620] [DEBUG] [axolotl.cli.utils.lora_merge.merge_lora_sharded_efficient:854] [PID:10906] Loading LoRA weights from Writer-Stage-1/adapter_model.safetensors
434
+ [2026-03-31 02:46:19,633] [DEBUG] [axolotl.cli.utils.lora_merge.merge_lora_sharded_efficient:860] [PID:10906] Keeping LoRA weights on CPU; will move per-tensor during merge
435
+ [2026-03-31 02:46:19,633] [DEBUG] [axolotl.cli.utils.lora_merge.merge_lora_sharded_efficient:866] [PID:10906] Found 21 model shards in /workspace/data/huggingface-cache/hub/models--nvidia--Llama-3_3-Nemotron-Super-49B-v1_5/snapshots/420ba7d28211abf116b8b103ab700d92619daf98
436
+ [2026-03-31 02:46:19,633] [INFO] [axolotl.cli.utils.lora_merge.copy_non_model_files:303] [PID:10906] Copying non-model files to output directory...
437
+ [2026-03-31 02:46:19,633] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying config.json to output
438
+ [2026-03-31 02:46:19,633] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying configuration_decilm.py to output
439
+ [2026-03-31 02:46:19,633] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying transformers_4_44_2__configuration_llama.py to output
440
+ [2026-03-31 02:46:19,634] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying transformers_4_44_2__modeling_rope_utils.py to output
441
+ [2026-03-31 02:46:19,634] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying block_config.py to output
442
+ [2026-03-31 02:46:19,634] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying tokenizer_config.json to output
443
+ [2026-03-31 02:46:19,634] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying tokenizer.json to output
444
+ [2026-03-31 02:46:19,638] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying special_tokens_map.json to output
445
+ [2026-03-31 02:46:19,639] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying modeling_decilm.py to output
446
+ [2026-03-31 02:46:19,639] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying transformers_4_44_2__modeling_outputs.py to output
447
+ [2026-03-31 02:46:19,639] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying transformers_4_44_2__cache_utils.py to output
448
+ [2026-03-31 02:46:19,639] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying transformers_4_44_2__pytorch_utils.py to output
449
+ [2026-03-31 02:46:19,639] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying transformers_4_44_2__activations.py to output
450
+ [2026-03-31 02:46:19,639] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying variable_cache.py to output
451
+ [2026-03-31 02:46:19,639] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying transformers_4_44_2__modeling_flash_attention_utils_backward_compat.py to output
452
+ [2026-03-31 02:46:19,639] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying transformers_4_44_2__modeling_attn_mask_utils.py to output
453
+ [2026-03-31 02:46:19,640] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying generation_config.json to output
454
+ [2026-03-31 02:46:19,640] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying llama_nemotron_toolcall_parser_no_streaming.py to output
455
+ [2026-03-31 02:46:19,640] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying README.md to output
456
+ [2026-03-31 02:46:19,640] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying PRIVACY.md to output
457
+ [2026-03-31 02:46:19,640] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying BIAS.md to output
458
+ [2026-03-31 02:46:19,640] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying .gitattributes to output
459
+ [2026-03-31 02:46:19,640] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying accuracy_chart.png to output
460
+ [2026-03-31 02:46:19,640] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying SAFETY&SECURITY.md to output
461
+ [2026-03-31 02:46:19,640] [DEBUG] [axolotl.cli.utils.lora_merge.copy_non_model_files:324] [PID:10906] Copying EXPLAINABILITY.md to output
462
+
463
+
464
+
465
+ [2026-03-31 02:46:20,696] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.0.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([14336, 32])
466
+ [2026-03-31 02:46:21,426] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.0.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([14336, 32])
467
+ [2026-03-31 02:46:22,225] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.0.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
468
+ [2026-03-31 02:46:22,280] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.0.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
469
+ [2026-03-31 02:46:22,820] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.0.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
470
+ [2026-03-31 02:46:23,341] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.0.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
471
+ [2026-03-31 02:46:23,394] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.1.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
472
+ [2026-03-31 02:46:24,838] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.1.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
473
+ [2026-03-31 02:46:26,250] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.1.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
474
+ [2026-03-31 02:46:27,647] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.1.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
475
+ [2026-03-31 02:46:27,699] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.1.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
476
+ [2026-03-31 02:46:28,154] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.1.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
477
+ [2026-03-31 02:46:28,618] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.1.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
478
+ [2026-03-31 02:46:28,670] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.2.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
479
+ [2026-03-31 02:46:28,722] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.2.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
480
+ [2026-03-31 02:46:29,202] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.2.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
481
+
482
+
483
+ [2026-03-31 02:46:34,816] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.2.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
484
+ [2026-03-31 02:46:36,246] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.2.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
485
+ [2026-03-31 02:46:37,651] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.2.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
486
+ [2026-03-31 02:46:38,131] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.3.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
487
+ [2026-03-31 02:46:39,614] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.3.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
488
+ [2026-03-31 02:46:41,043] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.3.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
489
+ [2026-03-31 02:46:42,447] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.3.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
490
+ [2026-03-31 02:46:42,497] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.3.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
491
+ [2026-03-31 02:46:42,956] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.3.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
492
+ [2026-03-31 02:46:43,452] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.3.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
493
+ [2026-03-31 02:46:43,505] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.4.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
494
+ [2026-03-31 02:46:44,942] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.4.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
495
+ [2026-03-31 02:46:46,427] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.4.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
496
+ [2026-03-31 02:46:47,935] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.4.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
497
+ [2026-03-31 02:46:47,987] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.4.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
498
+ [2026-03-31 02:46:48,458] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.4.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
499
+ [2026-03-31 02:46:48,915] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.4.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
500
+
501
+
502
+ [2026-03-31 02:46:54,686] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.5.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
503
+ [2026-03-31 02:46:56,164] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.5.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
504
+ [2026-03-31 02:46:57,648] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.5.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
505
+ [2026-03-31 02:46:57,687] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.5.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
506
+ [2026-03-31 02:46:58,223] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.5.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
507
+ [2026-03-31 02:46:58,718] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.5.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
508
+ [2026-03-31 02:46:58,764] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.6.mlp.down_proj.weight: torch.Size([32, 14336]), torch.Size([8192, 32])
509
+ [2026-03-31 02:46:59,519] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.6.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([14336, 32])
510
+ [2026-03-31 02:47:00,309] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.6.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([14336, 32])
511
+ [2026-03-31 02:47:01,112] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.7.mlp.down_proj.weight: torch.Size([32, 14336]), torch.Size([8192, 32])
512
+ [2026-03-31 02:47:01,813] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.7.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([14336, 32])
513
+ [2026-03-31 02:47:02,515] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.7.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([14336, 32])
514
+ [2026-03-31 02:47:03,290] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.8.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
515
+ [2026-03-31 02:47:04,711] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.8.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
516
+ [2026-03-31 02:47:06,150] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.8.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
517
+ [2026-03-31 02:47:07,654] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.8.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
518
+ [2026-03-31 02:47:07,706] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.8.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
519
+ [2026-03-31 02:47:08,159] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.8.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
520
+ [2026-03-31 02:47:08,635] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.8.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
521
+ [2026-03-31 02:47:08,687] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.9.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
522
+ [2026-03-31 02:47:08,782] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.9.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
523
+ [2026-03-31 02:47:09,296] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.9.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
524
+
525
+
526
+ [2026-03-31 02:47:14,969] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.10.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
527
+ [2026-03-31 02:47:16,429] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.10.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
528
+ [2026-03-31 02:47:17,853] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.10.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
529
+ [2026-03-31 02:47:17,888] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.10.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
530
+ [2026-03-31 02:47:18,394] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.10.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
531
+ [2026-03-31 02:47:18,937] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.10.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
532
+ [2026-03-31 02:47:18,982] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.11.mlp.down_proj.weight: torch.Size([32, 17920]), torch.Size([8192, 32])
533
+ [2026-03-31 02:47:19,927] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.11.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([17920, 32])
534
+ [2026-03-31 02:47:20,862] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.11.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([17920, 32])
535
+ [2026-03-31 02:47:21,841] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.12.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
536
+ [2026-03-31 02:47:23,267] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.12.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
537
+ [2026-03-31 02:47:23,318] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.12.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
538
+ [2026-03-31 02:47:23,760] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.12.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
539
+ [2026-03-31 02:47:24,253] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.12.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
540
+ [2026-03-31 02:47:24,310] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.9.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
541
+ [2026-03-31 02:47:25,835] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.9.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
542
+ [2026-03-31 02:47:27,317] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.9.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
543
+ [2026-03-31 02:47:28,733] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.9.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
544
+
545
+
546
+ [2026-03-31 02:47:34,842] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.12.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
547
+ [2026-03-31 02:47:36,252] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.13.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
548
+ [2026-03-31 02:47:37,651] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.13.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
549
+ [2026-03-31 02:47:39,044] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.13.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
550
+ [2026-03-31 02:47:40,529] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.13.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
551
+ [2026-03-31 02:47:40,576] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.13.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
552
+ [2026-03-31 02:47:41,056] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.13.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
553
+ [2026-03-31 02:47:41,533] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.13.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
554
+ [2026-03-31 02:47:41,585] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.14.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
555
+ [2026-03-31 02:47:43,009] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.14.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
556
+ [2026-03-31 02:47:44,442] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.14.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
557
+ [2026-03-31 02:47:45,912] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.14.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
558
+ [2026-03-31 02:47:45,965] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.14.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
559
+ [2026-03-31 02:47:46,456] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.14.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
560
+ [2026-03-31 02:47:46,904] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.14.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
561
+ [2026-03-31 02:47:46,961] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.15.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
562
+ [2026-03-31 02:47:47,017] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.15.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
563
+ [2026-03-31 02:47:47,457] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.15.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
564
+ [2026-03-31 02:47:47,951] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.15.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
565
+
566
+
567
+ [2026-03-31 02:47:53,756] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.15.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
568
+ [2026-03-31 02:47:55,227] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.15.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
569
+ [2026-03-31 02:47:56,651] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.16.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
570
+ [2026-03-31 02:47:58,125] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.16.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
571
+ [2026-03-31 02:47:59,605] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.16.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
572
+ [2026-03-31 02:48:01,046] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.16.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
573
+ [2026-03-31 02:48:01,081] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.16.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
574
+ [2026-03-31 02:48:01,556] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.16.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
575
+ [2026-03-31 02:48:02,015] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.16.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
576
+ [2026-03-31 02:48:02,051] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.17.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
577
+ [2026-03-31 02:48:03,496] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.17.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
578
+ [2026-03-31 02:48:04,865] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.17.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
579
+ [2026-03-31 02:48:06,337] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.17.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
580
+ [2026-03-31 02:48:06,373] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.17.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
581
+ [2026-03-31 02:48:06,859] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.17.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
582
+ [2026-03-31 02:48:07,346] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.17.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
583
+ [2026-03-31 02:48:07,382] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.18.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
584
+ [2026-03-31 02:48:07,482] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.18.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
585
+ [2026-03-31 02:48:08,034] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.18.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
586
+
587
+
588
+ [2026-03-31 02:48:14,139] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.18.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
589
+ [2026-03-31 02:48:15,617] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.18.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
590
+ [2026-03-31 02:48:17,054] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.18.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
591
+ [2026-03-31 02:48:17,529] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.19.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
592
+ [2026-03-31 02:48:18,954] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.19.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
593
+ [2026-03-31 02:48:20,434] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.19.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
594
+ [2026-03-31 02:48:21,897] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.19.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
595
+ [2026-03-31 02:48:21,935] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.19.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
596
+ [2026-03-31 02:48:22,358] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.19.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
597
+ [2026-03-31 02:48:22,851] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.19.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
598
+ [2026-03-31 02:48:22,904] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.20.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
599
+ [2026-03-31 02:48:24,335] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.20.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
600
+ [2026-03-31 02:48:25,757] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.20.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
601
+ [2026-03-31 02:48:27,225] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.20.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
602
+ [2026-03-31 02:48:27,282] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.20.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
603
+ [2026-03-31 02:48:27,819] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.20.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
604
+ [2026-03-31 02:48:28,336] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.20.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
605
+
606
+
607
+ [2026-03-31 02:48:34,285] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.21.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
608
+ [2026-03-31 02:48:35,749] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.21.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
609
+ [2026-03-31 02:48:37,242] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.21.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
610
+ [2026-03-31 02:48:37,284] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.21.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
611
+ [2026-03-31 02:48:37,791] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.21.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
612
+ [2026-03-31 02:48:38,257] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.21.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
613
+ [2026-03-31 02:48:38,294] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.22.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
614
+ [2026-03-31 02:48:39,730] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.22.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
615
+ [2026-03-31 02:48:41,243] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.22.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
616
+ [2026-03-31 02:48:42,655] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.22.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
617
+ [2026-03-31 02:48:42,694] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.22.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
618
+ [2026-03-31 02:48:43,157] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.22.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
619
+ [2026-03-31 02:48:43,653] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.22.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
620
+ [2026-03-31 02:48:43,706] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.23.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
621
+ [2026-03-31 02:48:45,154] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.23.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
622
+ [2026-03-31 02:48:46,559] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.23.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
623
+ [2026-03-31 02:48:46,612] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.23.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
624
+ [2026-03-31 02:48:47,125] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.23.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
625
+ [2026-03-31 02:48:47,652] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.23.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
626
+
627
+
628
+ [2026-03-31 02:48:53,245] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.24.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
629
+ [2026-03-31 02:48:54,731] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.24.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
630
+ [2026-03-31 02:48:56,186] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.24.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
631
+ [2026-03-31 02:48:57,635] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.24.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
632
+ [2026-03-31 02:48:57,671] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.24.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
633
+ [2026-03-31 02:48:58,217] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.24.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
634
+ [2026-03-31 02:48:58,732] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.24.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
635
+ [2026-03-31 02:48:58,785] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.25.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
636
+ [2026-03-31 02:49:00,244] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.25.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
637
+ [2026-03-31 02:49:01,692] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.25.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
638
+ [2026-03-31 02:49:03,112] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.25.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
639
+ [2026-03-31 02:49:03,169] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.25.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
640
+ [2026-03-31 02:49:03,656] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.25.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
641
+ [2026-03-31 02:49:04,140] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.25.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
642
+ [2026-03-31 02:49:04,195] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.26.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
643
+ [2026-03-31 02:49:05,648] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.26.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
644
+ [2026-03-31 02:49:05,706] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.26.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
645
+ [2026-03-31 02:49:06,253] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.26.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
646
+ [2026-03-31 02:49:06,752] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.26.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
647
+
648
+
649
+ [2026-03-31 02:49:12,102] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.26.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
650
+ [2026-03-31 02:49:13,563] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.27.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
651
+ [2026-03-31 02:49:15,043] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.27.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
652
+ [2026-03-31 02:49:16,455] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.27.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
653
+ [2026-03-31 02:49:17,848] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.27.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
654
+ [2026-03-31 02:49:17,894] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.27.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
655
+ [2026-03-31 02:49:18,356] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.27.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
656
+ [2026-03-31 02:49:18,837] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.27.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
657
+ [2026-03-31 02:49:18,887] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.28.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
658
+ [2026-03-31 02:49:20,329] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.28.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
659
+ [2026-03-31 02:49:21,814] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.28.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
660
+ [2026-03-31 02:49:23,243] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.28.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
661
+ [2026-03-31 02:49:23,295] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.28.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
662
+ [2026-03-31 02:49:23,830] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.28.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
663
+ [2026-03-31 02:49:24,322] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.28.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
664
+ [2026-03-31 02:49:24,376] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.29.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
665
+ [2026-03-31 02:49:24,429] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.29.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
666
+ [2026-03-31 02:49:24,936] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.29.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
667
+ [2026-03-31 02:49:25,455] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.29.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
668
+
669
+
670
+ [2026-03-31 02:49:31,229] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.29.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
671
+ [2026-03-31 02:49:32,646] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.29.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
672
+ [2026-03-31 02:49:34,113] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.30.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
673
+ [2026-03-31 02:49:35,536] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.30.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
674
+ [2026-03-31 02:49:36,945] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.30.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
675
+ [2026-03-31 02:49:38,428] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.30.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
676
+ [2026-03-31 02:49:38,474] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.30.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
677
+ [2026-03-31 02:49:38,953] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.30.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
678
+ [2026-03-31 02:49:39,444] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.30.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
679
+ [2026-03-31 02:49:39,496] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.31.mlp.down_proj.weight: torch.Size([32, 28672]), torch.Size([8192, 32])
680
+ [2026-03-31 02:49:40,842] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.31.mlp.gate_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
681
+ [2026-03-31 02:49:42,246] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.31.mlp.up_proj.weight: torch.Size([32, 8192]), torch.Size([28672, 32])
682
+ [2026-03-31 02:49:43,648] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.31.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
683
+ [2026-03-31 02:49:43,706] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.31.self_attn.o_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
684
+ [2026-03-31 02:49:44,155] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.31.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
685
+ [2026-03-31 02:49:44,646] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.31.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
686
+ [2026-03-31 02:49:44,701] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.32.self_attn.k_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
687
+ [2026-03-31 02:49:44,781] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.32.self_attn.q_proj.weight: torch.Size([32, 8192]), torch.Size([8192, 32])
688
+ [2026-03-31 02:49:45,256] [DEBUG] [axolotl.cli.utils.lora_merge._merge_tensor_with_lora:411] [PID:10906] Merging LoRA for model.layers.32.self_attn.v_proj.weight: torch.Size([32, 8192]), torch.Size([1024, 32])
689
+
690
+ [2026-03-31 02:49:49,768] [ERROR] [axolotl.telemetry.errors.wrapper:158] [PID:10906] Error captured in telemetry. Run ID: 77193302-fa43-4dfd-ab04-45c91b8c4748
691
+ Traceback (most recent call last):
692
+ File "/root/miniconda3/envs/py3.11/bin/axolotl", line 6, in <module>
693
+ sys.exit(main())
694
+ ^^^^^^
695
+ File "/workspace/axolotl/src/axolotl/cli/main.py", line 347, in main
696
+ cli()
697
+ File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/click/core.py", line 1485, in __call__
698
+ return self.main(*args, **kwargs)
699
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^
700
+ File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/click/core.py", line 1406, in main
701
+ rv = self.invoke(ctx)
702
+ ^^^^^^^^^^^^^^^^
703
+ File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/click/core.py", line 1873, in invoke
704
+ return _process_result(sub_ctx.command.invoke(sub_ctx))
705
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
706
+ File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/click/core.py", line 1269, in invoke
707
+ return ctx.invoke(self.callback, **ctx.params)
708
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
709
+ File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/click/core.py", line 824, in invoke
710
+ return callback(*args, **kwargs)
711
+ ^^^^^^^^^^^^^^^^^^^^^^^^^
712
+ File "/workspace/axolotl/src/axolotl/cli/utils/args.py", line 48, in wrapper
713
+ return func(*args, **filtered_kwargs)
714
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
715
+ File "/workspace/axolotl/src/axolotl/cli/main.py", line 293, in merge_lora
716
+ do_cli(config=config, **kwargs)
717
+ File "/workspace/axolotl/src/axolotl/cli/merge_lora.py", line 169, in do_cli
718
+ do_merge_lora(cfg=parsed_cfg)
719
+ File "/workspace/axolotl/src/axolotl/telemetry/errors.py", line 127, in wrapper
720
+ return func(*args, **kwargs)
721
+ ^^^^^^^^^^^^^^^^^^^^^
722
+ File "/workspace/axolotl/src/axolotl/cli/merge_lora.py", line 33, in do_merge_lora
723
+ _do_merge_lora_efficient(cfg=cfg)
724
+ File "/workspace/axolotl/src/axolotl/cli/merge_lora.py", line 108, in _do_merge_lora_efficient
725
+ merge_lora_sharded_efficient(
726
+ File "/workspace/axolotl/src/axolotl/cli/utils/lora_merge.py", line 940, in merge_lora_sharded_efficient
727
+ safetensors.torch.save_file(
728
+ File "/root/miniconda3/envs/py3.11/lib/python3.11/site-packages/safetensors/torch.py", line 307, in save_file
729
+ serialize_file(_flatten(tensors), filename, metadata=metadata)
730
+ safetensors_rust.SafetensorError: Error while serializing: I/O error: No space left on device (os error 28)
runs/Mar31_01-27-28_b8de28f8ab2a/events.out.tfevents.1774920448.b8de28f8ab2a.3556.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76b91afdc544c9574c1ccca57556de6d448d7ec49328dfe6e6f02bce5d22f2b7
3
+ size 46082
runs/Mar31_01-31-17_b8de28f8ab2a/events.out.tfevents.1774920677.b8de28f8ab2a.6000.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01aafbab852593ccec0eeaee70a3ad2a5a537356fd3d439706fa189c52846c1c
3
+ size 48037
runs/Mar31_01-41-00_b8de28f8ab2a/events.out.tfevents.1774921260.b8de28f8ab2a.9806.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f6e84333aff94e98274813987c957ec4450376a766c375677fceeb038fc0aa2
3
+ size 81795
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
3
+ size 17209920
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|begin_of_text|>",
4
+ "clean_up_tokenization_spaces": true,
5
+ "eos_token": "<|eot_id|>",
6
+ "is_local": false,
7
+ "model_input_names": [
8
+ "input_ids",
9
+ "attention_mask"
10
+ ],
11
+ "model_max_length": 131072,
12
+ "pad_token": "<|eot_id|>",
13
+ "tokenizer_class": "TokenizersBackend"
14
+ }
transformers_4_44_2__configuration_llama.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """LLaMA model configuration"""
21
+
22
+ from transformers.configuration_utils import PretrainedConfig
23
+ from .transformers_4_44_2__modeling_rope_utils import rope_config_validation
24
+
25
+
26
class LlamaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the LLaMA-7B.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`LlamaModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
            Llama 2 up to 4096, CodeLlama up to 16384.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
            understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
            results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly. NOTE: if the dictionary contains the legacy `type` key, a `rope_type` key with the same value
            is written into the dictionary that was passed in.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.

    ```python
    >>> from transformers import LlamaModel, LlamaConfig

    >>> # Initializing a LLaMA llama-7b style configuration
    >>> configuration = LlamaConfig()

    >>> # Initializing a model from the llama-7b style configuration
    >>> model = LlamaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "llama"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        **kwargs,
    ):
        # Store the architecture hyper-parameters as plain attributes.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility: an unspecified KV-head count falls back to the attention-head count
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias

        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        # NOTE: this writes into the `rope_scaling` dict object supplied by the caller; the 'type' key is kept.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        # Validation runs before the base-class initializer, so it only sees the attributes assigned above.
        rope_config_validation(self)

        # The special-token ids, the embedding-tying flag and any remaining kwargs are handled by `PretrainedConfig`.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
transformers_4_44_2__modeling_rope_utils.py ADDED
@@ -0,0 +1,559 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from typing import Optional, Tuple
17
+
18
+ from transformers.configuration_utils import PretrainedConfig
19
+ from transformers.utils import is_torch_available, logging
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
+ if is_torch_available():
26
+ import torch
27
+
28
+
29
def _compute_default_rope_parameters(
    config: Optional["PretrainedConfig"] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies according to the original RoPE implementation
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
            When used, must contain `base` and `dim`.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    Raises:
        ValueError: If both `config` and `rope_kwargs` are provided, or if neither is provided.
    """
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        base = rope_kwargs["base"]
        dim = rope_kwargs["dim"]
    elif config is not None:
        base = config.rope_theta
        partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
        # `head_dim` may be missing or explicitly set to None on the config; in both cases derive it from the
        # hidden size. (`getattr(config, "head_dim", <default>)` alone would return the None and crash below.)
        head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
        dim = int(head_dim * partial_rotary_factor)
    else:
        # Without this branch `base`/`dim` would be unbound and surface as an opaque UnboundLocalError.
        raise ValueError(
            "`_compute_default_rope_parameters` requires either `config` or `**rope_kwargs` (`base`, `dim`)"
        )

    attention_factor = 1.0  # Unused in this type of RoPE

    # Compute the inverse frequencies: one value per pair of rotary dimensions
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
    return inv_freq, attention_factor
69
+
70
+
71
def _compute_linear_scaling_rope_parameters(
    config: Optional[PretrainedConfig] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["torch.Tensor", float]:
    """
    Computes linearly scaled RoPE inverse frequencies. Credits to the Reddit user /u/kaiokendev

    The default inverse frequencies are computed first and then divided by the scaling `factor`.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration; `config.rope_scaling["factor"]` supplies the scaling factor.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. Mutually
            exclusive with `config`; must contain `factor`.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    """
    if rope_kwargs:
        if config is not None:
            raise ValueError(
                "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
                f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
            )
        factor = rope_kwargs["factor"]
    elif config is not None:
        factor = config.rope_scaling["factor"]

    # Start from the unscaled (default) RoPE frequencies
    unscaled = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)
    inv_freq, attention_factor = unscaled

    # Linear scaling was originally applied to the position ids. Since `embs = inv_freq @ position_ids`, dividing
    # the inverse frequencies by the factor is equivalent.
    inv_freq /= factor
    return inv_freq, attention_factor
110
+
111
+
112
def _compute_dynamic_ntk_parameters(
    config: Optional["PretrainedConfig"] = None,
    device: Optional["torch.device"] = None,
    seq_len: Optional[int] = None,
    **rope_kwargs,
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length, used to update the dynamic RoPE at inference time. Values that are `None`
            or not larger than `max_position_embeddings` are replaced by `max_position_embeddings`.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
            When used, must contain `base`, `dim`, `max_position_embeddings` and `factor`.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
    Raises:
        ValueError: If both `config` and `rope_kwargs` are provided, or if neither is provided.
    """
    # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
    if config is not None and len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
            f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
        )
    if len(rope_kwargs) > 0:
        base = rope_kwargs["base"]
        dim = rope_kwargs["dim"]
        max_position_embeddings = rope_kwargs["max_position_embeddings"]
        factor = rope_kwargs["factor"]
    elif config is not None:
        base = config.rope_theta
        partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
        # `head_dim` may be missing or explicitly None on the config; derive it from the hidden size in both cases.
        head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
        dim = int(head_dim * partial_rotary_factor)
        max_position_embeddings = config.max_position_embeddings
        factor = config.rope_scaling["factor"]
    else:
        # Without this branch the locals below would be unbound and surface as an opaque UnboundLocalError.
        raise ValueError(
            "`_compute_dynamic_ntk_parameters` requires either `config` or `**rope_kwargs` "
            "(`base`, `dim`, `max_position_embeddings`, `factor`)"
        )

    attention_factor = 1.0  # Unused in this type of RoPE

    # seq_len: default to max_position_embeddings, e.g. at init time
    seq_len = seq_len if seq_len is not None and seq_len > max_position_embeddings else max_position_embeddings

    # Compute the inverse frequencies: the base grows with the ratio seq_len / max_position_embeddings
    base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
    return inv_freq, attention_factor
161
+
162
+
163
def _compute_yarn_parameters(
    config: "PretrainedConfig", device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with NTK scaling. Please refer to the
    [original paper](https://arxiv.org/abs/2309.00071)
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
            Must be empty for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    Raises:
        ValueError: If any `rope_kwargs` are provided.
    """
    # No need to keep BC with yarn, unreleased when this new pattern was created.
    if len(rope_kwargs) > 0:
        raise ValueError(
            f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}"
        )

    base = config.rope_theta
    partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
    # `head_dim` may be missing or explicitly None on the config; derive it from the hidden size in both cases.
    head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
    dim = int(head_dim * partial_rotary_factor)
    max_position_embeddings = config.max_position_embeddings
    factor = config.rope_scaling["factor"]

    # Sets the attention factor as suggested in the paper
    attention_factor = config.rope_scaling.get("attention_factor")
    if attention_factor is None:
        attention_factor = 0.1 * math.log(factor) + 1.0

    # Optional config options
    # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
    # NOTE: because of the `or`, any falsy value (None, 0) selects the default.
    beta_fast = config.rope_scaling.get("beta_fast") or 32
    beta_slow = config.rope_scaling.get("beta_slow") or 1

    # Compute the inverse frequencies
    def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
        """Inverse dimension formula to find the dimension based on the number of rotations"""
        return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))

    def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):
        """Find dimension range bounds based on rotations"""
        low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
        high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
        return max(low, 0), min(high, dim - 1)

    def linear_ramp_factor(ramp_start, ramp_end, dim):
        """Ramp that is 0 up to `ramp_start`, 1 from `ramp_end` on, and linear in between"""
        if ramp_start == ramp_end:
            ramp_end += 0.001  # Prevent singularity

        linear_func = (torch.arange(dim, dtype=torch.float32) - ramp_start) / (ramp_end - ramp_start)
        ramp_func = torch.clamp(linear_func, 0, 1)
        return ramp_func

    # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
    # to expand the possible context length. In other words, interpolation = apply scaling factor.
    pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim)
    inv_freq_extrapolation = 1.0 / pos_freqs
    inv_freq_interpolation = 1.0 / (factor * pos_freqs)

    low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings)

    # Get n-dimensional rotational scaling corrected for extrapolation
    inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float().to(device)
    inv_freq = (
        inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
        + inv_freq_extrapolation * inv_freq_extrapolation_factor
    )

    return inv_freq, attention_factor
240
+
241
+
242
def _compute_longrope_parameters(
    config: "PretrainedConfig", device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
    """
    Computes the inverse frequencies with LongRoPE scaling. Please refer to the
    [original implementation](https://github.com/microsoft/LongRoPE)
    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. `config.rope_scaling` must contain `long_factor` and `short_factor`, and must
            contain `factor` when the config has no `original_max_position_embeddings` attribute.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
            Must be empty for this type of RoPE.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    Raises:
        ValueError: If any `rope_kwargs` are provided.
    """
    # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
    # No need to keep BC with longrope, unreleased when this new pattern was created.
    if len(rope_kwargs) > 0:
        raise ValueError(
            "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got "
            f"{rope_kwargs}"
        )

    base = config.rope_theta
    partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
    # `head_dim` may be missing or explicitly None on the config; derive it from the hidden size in both cases.
    head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
    dim = int(head_dim * partial_rotary_factor)
    long_factor = config.rope_scaling["long_factor"]
    short_factor = config.rope_scaling["short_factor"]
    factor = config.rope_scaling.get("factor")
    attention_factor = config.rope_scaling.get("attention_factor")

    # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a
    # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
    # values to compute the default attention scaling factor, instead of using `factor`.
    if hasattr(config, "original_max_position_embeddings"):
        max_position_embeddings = config.original_max_position_embeddings
        expanded_max_position_embeddings = config.max_position_embeddings
        factor = expanded_max_position_embeddings / max_position_embeddings
    else:
        max_position_embeddings = config.max_position_embeddings
        expanded_max_position_embeddings = max_position_embeddings * factor

    # Sets the attention factor as suggested in the paper
    if attention_factor is None:
        if factor <= 1.0:
            attention_factor = 1.0
        else:
            attention_factor = math.sqrt(1 + math.log(factor) / math.log(max_position_embeddings))

    # Compute the inverse frequencies -- scaled based on the target sequence length
    if expanded_max_position_embeddings > max_position_embeddings:
        ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device)
    else:
        ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device)
    inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim
    inv_freq = 1.0 / (ext_factors * base**inv_freq_shape)

    return inv_freq, attention_factor
305
+
306
+
307
def _compute_llama3_parameters(
    config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
) -> Tuple["torch.Tensor", float]:
    """
    Computes the llama 3.1 inverse frequencies.

    The default frequencies are split into three bands by wavelength: short wavelengths are kept, long
    wavelengths are divided by `factor`, and the band in between is blended between the two.

    Args:
        config ([`~transformers.PretrainedConfig`]):
            The model configuration. `config.rope_scaling` must provide `factor`, `low_freq_factor`,
            `high_freq_factor` and `original_max_position_embeddings`.
        device (`torch.device`):
            The device to use for initialization of the inverse frequencies.
        seq_len (`int`, *optional*):
            The current sequence length. Unused for this type of RoPE.
        rope_kwargs (`Dict`, *optional*):
            BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
    Returns:
        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
        post-processing scaling factor applied to the computed cos/sin.
    """
    # Start from the default RoPE parameters
    default_params = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)
    inv_freq, attention_factor = default_params

    scaling = config.rope_scaling
    factor = scaling["factor"]  # `8` in the original implementation
    low_freq_factor = scaling["low_freq_factor"]  # `1` in the original implementation
    high_freq_factor = scaling["high_freq_factor"]  # `4` in the original implementation
    old_context_len = scaling["original_max_position_embeddings"]  # `8192` in the original implementation

    # Wavelength thresholds delimiting the three bands
    long_threshold = old_context_len / low_freq_factor
    short_threshold = old_context_len / high_freq_factor

    wavelen = 2 * math.pi / inv_freq

    # Band 1 (wavelen < short_threshold): untouched. Band 2 (wavelen > long_threshold): divided by factor.
    is_long = wavelen > long_threshold
    scaled = torch.where(is_long, inv_freq / factor, inv_freq)

    # Band 3 (everything else): smooth interpolation between the scaled and unscaled value
    smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
    blended = (1 - smooth_factor) * scaled / factor + smooth_factor * scaled
    in_between = ~(wavelen < short_threshold) & ~is_long

    return torch.where(in_between, blended, scaled), attention_factor
348
+
349
+
350
# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters
# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE
# parameterizations, as long as the callable has the same signature.
# Every function registered here takes `(config, device, seq_len=None, **rope_kwargs)` and returns a tuple of
# `(inv_freq, attention_factor)`.
ROPE_INIT_FUNCTIONS = {
    "default": _compute_default_rope_parameters,
    "linear": _compute_linear_scaling_rope_parameters,
    "dynamic": _compute_dynamic_ntk_parameters,
    "yarn": _compute_yarn_parameters,
    "longrope": _compute_longrope_parameters,
    "llama3": _compute_llama3_parameters,
}
361
+
362
+
363
def _check_received_keys(rope_type: str, received_keys: set, required_keys: set, optional_keys: Optional[set] = None):
    """
    Compare the received keys in `config.rope_scaling` against the expected and optional keys

    Args:
        rope_type (`str`):
            The RoPE sub-variant being validated; only used in the error and warning messages.
        received_keys (`set`):
            The keys found in `rope_scaling`. This set is not modified.
        required_keys (`set`):
            The keys that must be present.
        optional_keys (`set`, *optional*):
            The keys that may be present without triggering a warning.
    Raises:
        KeyError: If one or more required keys are missing.
    """
    # Work on a copy: the legacy-key rewrite below must not leak into the caller's set.
    received_keys = set(received_keys)

    # BC: "rope_type" was originally "type" -- let's gracefully handle it
    if "rope_type" not in received_keys and "type" in received_keys:
        received_keys.discard("type")
        received_keys.add("rope_type")

    missing_keys = required_keys - received_keys
    if missing_keys:
        raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}")

    # Anything that is neither required nor optional is reported, but not treated as an error.
    unused_keys = received_keys - required_keys
    if optional_keys is not None:
        unused_keys -= optional_keys
    if unused_keys:
        logger.warning(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}")
380
+
381
+
382
def _validate_default_rope_parameters(config: PretrainedConfig):
    """Check the keys of `config.rope_scaling` for the "default" RoPE type: only "rope_type" is expected."""
    scaling_cfg = config.rope_scaling
    # BC: "rope_type" was originally "type"
    declared_type = scaling_cfg.get("rope_type", scaling_cfg.get("type", None))
    _check_received_keys(declared_type, set(scaling_cfg.keys()), {"rope_type"})
388
+
389
+
390
def _validate_linear_scaling_rope_parameters(config: PretrainedConfig):
    """
    Check `config.rope_scaling` for the "linear" RoPE type: "rope_type" and "factor" are required, and a warning is
    logged when "factor" is not a float >= 1.
    """
    scaling_cfg = config.rope_scaling
    # BC: "rope_type" was originally "type"
    declared_type = scaling_cfg.get("rope_type", scaling_cfg.get("type", None))
    _check_received_keys(declared_type, set(scaling_cfg.keys()), {"rope_type", "factor"})

    factor = scaling_cfg["factor"]
    # `None` is not a float, so it is covered by the isinstance test.
    if not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
400
+
401
+
402
def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig):
    """
    Check `config.rope_scaling` for the "dynamic" RoPE type: "rope_type" and "factor" are required,
    "original_max_position_embeddings" is tolerated, and a warning is logged when "factor" is not a float >= 1.
    """
    scaling_cfg = config.rope_scaling
    # BC: "rope_type" was originally "type"
    declared_type = scaling_cfg.get("rope_type", scaling_cfg.get("type", None))
    # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
    _check_received_keys(
        declared_type,
        set(scaling_cfg.keys()),
        {"rope_type", "factor"},
        {"original_max_position_embeddings"},
    )

    factor = scaling_cfg["factor"]
    # `None` is not a float, so it is covered by the isinstance test.
    if not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
414
+
415
+
416
def _validate_yarn_parameters(config: PretrainedConfig):
    """
    Check `config.rope_scaling` for the "yarn" RoPE type.

    "rope_type" and "factor" are required; "attention_factor", "beta_fast" and "beta_slow" are optional. Apart from
    the missing-key check delegated to `_check_received_keys`, every problem found is only logged as a warning.
    """
    scaling_cfg = config.rope_scaling
    # BC: "rope_type" was originally "type"
    declared_type = scaling_cfg.get("rope_type", scaling_cfg.get("type", None))
    _check_received_keys(
        declared_type,
        set(scaling_cfg.keys()),
        {"rope_type", "factor"},
        {"attention_factor", "beta_fast", "beta_slow"},
    )

    factor = scaling_cfg["factor"]
    # `None` is not a float, so it is covered by the isinstance test.
    if not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

    attention_factor = scaling_cfg.get("attention_factor")
    if attention_factor is not None:
        if not isinstance(attention_factor, float) or attention_factor < 0:
            logger.warning(
                f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
            )

    beta_fast = scaling_cfg.get("beta_fast")
    if not (beta_fast is None or isinstance(beta_fast, float)):
        logger.warning(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}")

    beta_slow = scaling_cfg.get("beta_slow")
    if not (beta_slow is None or isinstance(beta_slow, float)):
        logger.warning(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}")

    # Falsy values (including `None`) fall back to the defaults quoted in the message below.
    effective_fast = beta_fast or 32
    effective_slow = beta_slow or 1
    if effective_fast < effective_slow:
        logger.warning(
            f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} "
            f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)"
        )
445
+
446
+
447
def _validate_longrope_parameters(config: PretrainedConfig):
    """
    Check `config.rope_scaling` for the "longrope" RoPE type.

    "rope_type", "short_factor" and "long_factor" are required; "attention_factor", "factor" and
    "original_max_position_embeddings" are optional. `short_factor` and `long_factor` are expected to be lists of
    numbers of length `dim // 2`, where `dim = int(head_dim * partial_rotary_factor)`.

    Apart from the missing-key check delegated to `_check_received_keys`, every problem found is only logged as a
    warning: malformed values must not make the validator itself crash.

    Args:
        config ([`PretrainedConfig`]):
            Model configuration. Reads `rope_scaling`, `hidden_size`, `num_attention_heads` and, when present,
            `partial_rotary_factor`, `head_dim` and `original_max_position_embeddings`.
    """
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "short_factor", "long_factor"}
    # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
    optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys, optional_keys)

    partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    dim = int(head_dim * partial_rotary_factor)

    for field_name in ("short_factor", "long_factor"):
        values = rope_scaling.get(field_name)
        # The whole conjunction is negated: warn unless the value is a list AND all of its items are numbers.
        # `all(...)` is only evaluated for lists, so a non-iterable value cannot raise here.
        if not (isinstance(values, list) and all(isinstance(x, (int, float)) for x in values)):
            logger.warning(f"`rope_scaling`'s {field_name} field must be a list of numbers, got {values}")
        # Only sized values have a length to compare; unsized ones were already reported above.
        if hasattr(values, "__len__") and len(values) != dim // 2:
            logger.warning(f"`rope_scaling`'s {field_name} field must have length {dim // 2}, got {len(values)}")

    # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over
    # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is
    # unique to longrope (= undesirable)
    if hasattr(config, "original_max_position_embeddings"):
        logger.warning_once(
            "This model has set a `original_max_position_embeddings` field, to be used together with "
            "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling` "
            "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, "
            "as it is compatible with most model architectures."
        )
    else:
        factor = rope_scaling.get("factor")
        if factor is None:
            logger.warning("Missing required keys in `rope_scaling`: 'factor'")
        elif not isinstance(factor, float) or factor < 1.0:
            logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

        # NOTE(review): the original indentation is not visible in this view; this check is assumed to belong to
        # the `else` branch -- confirm against the upstream file.
        attention_factor = rope_scaling.get("attention_factor")
        # `attention_factor` is optional: only inspect it when it was provided, so `None` is never compared to 0.
        if attention_factor is not None:
            if not isinstance(attention_factor, float) or attention_factor < 0:
                logger.warning(
                    f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
                )
494
+
495
+
496
def _validate_llama3_parameters(config: PretrainedConfig):
    """
    Check `config.rope_scaling` for the "llama3" RoPE type.

    "rope_type", "factor", "original_max_position_embeddings", "low_freq_factor" and "high_freq_factor" are all
    required. Apart from the missing-key check delegated to `_check_received_keys`, every problem found is only logged
    as a warning; ordering checks are skipped for values that are not numbers, since those were already reported and
    comparing them would raise a `TypeError`.

    Args:
        config ([`PretrainedConfig`]):
            Model configuration. Reads `rope_scaling` and `max_position_embeddings`.
    """
    rope_scaling = config.rope_scaling
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None))  # BC: "rope_type" was originally "type"
    required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"}
    received_keys = set(rope_scaling.keys())
    _check_received_keys(rope_type, received_keys, required_keys)

    numeric_types = (int, float)

    factor = rope_scaling["factor"]
    if factor is None or not isinstance(factor, float) or factor < 1.0:
        logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")

    low_freq_factor = rope_scaling["low_freq_factor"]
    high_freq_factor = rope_scaling["high_freq_factor"]
    if low_freq_factor is None or not isinstance(low_freq_factor, float):
        logger.warning(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}")
    if high_freq_factor is None or not isinstance(high_freq_factor, float):
        logger.warning(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}")
    # Integers are still compared (they only fail the float check above); `None` and other non-numbers are not.
    if isinstance(low_freq_factor, numeric_types) and isinstance(high_freq_factor, numeric_types):
        if high_freq_factor <= low_freq_factor:
            logger.warning(
                "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor="
                f"{high_freq_factor} and low_freq_factor={low_freq_factor}"
            )

    original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]
    if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int):
        logger.warning(
            "`rope_scaling`'s original_max_position_embeddings field must be an integer, got "
            f"{original_max_position_embeddings}"
        )
    # Same guard as above: only numbers can be ordered against `max_position_embeddings`.
    if isinstance(original_max_position_embeddings, numeric_types):
        if original_max_position_embeddings >= config.max_position_embeddings:
            logger.warning(
                "`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got "
                f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}"
            )
530
+
531
+
532
# Registry of validation routines, keyed by "rope_type". Like `ROPE_INIT_FUNCTIONS`, this mapping can be dynamically
# updated for custom RoPE types.
ROPE_VALIDATION_FUNCTIONS = {
    "default": _validate_default_rope_parameters,
    "linear": _validate_linear_scaling_rope_parameters,
    "dynamic": _validate_dynamic_scaling_rope_parameters,
    "yarn": _validate_yarn_parameters,
    "longrope": _validate_longrope_parameters,
    "llama3": _validate_llama3_parameters,
}
541
+
542
+
543
def rope_config_validation(config: PretrainedConfig):
    """
    Validate the RoPE config arguments, given a `PretrainedConfig` object.

    Nothing is done when the config has no `rope_scaling` attribute or when it is `None`. Otherwise the validation
    function registered in `ROPE_VALIDATION_FUNCTIONS` for the configured RoPE type is called with `config`; if no
    function is registered for that type, a warning is logged instead.
    """
    # `rope_scaling` is not a default parameter in `PretrainedConfig`, hence the getattr.
    scaling_cfg = getattr(config, "rope_scaling", None)
    if scaling_cfg is None:
        return

    # BC: "rope_type" was originally "type"
    rope_type = scaling_cfg.get("rope_type", scaling_cfg.get("type", "default"))
    validator = ROPE_VALIDATION_FUNCTIONS.get(rope_type)
    if validator is None:
        logger.warning(
            f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'"
        )
        return

    validator(config)