ZetangForward commited on
Commit
393ba8c
·
verified ·
1 Parent(s): 90d4408

Synced from ModelScope: LCM_group/moba_qwen3-4b (Auto-fixed license)

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ run-20251217_233019-b3ly702fm741dd93khyrv/backup.swanlab filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
.mdl ADDED
Binary file (46 Bytes). View file
 
.msc ADDED
Binary file (1.93 kB). View file
 
.mv ADDED
@@ -0,0 +1 @@
 
 
1
+ Revision:master,CreatedAt:1768899057
README.md ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ tags: []
4
+
5
+ #model-type:
6
+ ##如 gpt、phi、llama、chatglm、baichuan 等
7
+ #- gpt
8
+
9
+ #domain:
10
+ ##如 nlp、cv、audio、multi-modal
11
+ #- nlp
12
+
13
+ #language:
14
+ ##语言代码列表 https://help.aliyun.com/document_detail/215387.html?spm=a2c4g.11186623.0.0.9f8d7467kni6Aa
15
+ #- cn
16
+
17
+ #metrics:
18
+ ##如 CIDEr、Blue、ROUGE 等
19
+ #- CIDEr
20
+
21
+ #tags:
22
+ ##各种自定义,包括 pretrained、fine-tuned、instruction-tuned、RL-tuned 等训练方法和其他
23
+ #- pretrained
24
+
25
+ #tools:
26
+ ##如 vllm、fastchat、llamacpp、AdaSeq 等
27
+ #- vllm
28
+ ---
29
+ ### 当前模型的贡献者未提供更加详细的模型介绍。模型文件和权重,可浏览“模型文件”页面获取。
30
+ #### 您可以通过如下git clone命令,或者ModelScope SDK来下载模型
31
+
32
+ SDK下载
33
+ ```bash
34
+ #安装ModelScope
35
+ pip install modelscope
36
+ ```
37
+ ```python
38
+ #SDK模型下载
39
+ from modelscope import snapshot_download
40
+ model_dir = snapshot_download('tang031223/moba')
41
+ ```
42
+ Git下载
43
+ ```
44
+ #Git模型下载
45
+ git clone https://www.modelscope.cn/tang031223/moba.git
46
+ ```
47
+
48
+ <p style="color: lightgrey;">如果您是本模型的贡献者,我们邀请您根据<a href="https://modelscope.cn/docs/ModelScope%E6%A8%A1%E5%9E%8B%E6%8E%A5%E5%85%A5%E6%B5%81%E7%A8%8B%E6%A6%82%E8%A7%88" style="color: lightgrey; text-decoration: underline;">模型贡献文档</a>,及时完善模型卡片内容。</p>
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.2249212775528565,
3
+ "num_input_tokens_seen": 262144000,
4
+ "train_loss": 1.4907063425183296,
5
+ "train_runtime": 24201.9893,
6
+ "train_samples_per_second": 0.331,
7
+ "train_steps_per_second": 0.041
8
+ }
config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 2560,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 9728,
14
+ "max_position_embeddings": 262144,
15
+ "max_window_layers": 36,
16
+ "model_type": "qwen3",
17
+ "num_attention_heads": 32,
18
+ "num_hidden_layers": 36,
19
+ "num_key_value_heads": 8,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": {
22
+ "factor": 8.0,
23
+ "original_max_position_embeddings": 40960,
24
+ "rope_type": "yarn",
25
+ "type": "yarn"
26
+ },
27
+ "rope_theta": 1000000,
28
+ "sliding_window": null,
29
+ "tie_word_embeddings": true,
30
+ "torch_dtype": "bfloat16",
31
+ "transformers_version": "4.51.1",
32
+ "use_cache": false,
33
+ "use_sliding_window": false,
34
+ "vocab_size": 151936
35
+ }
generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.6,
10
+ "top_k": 20,
11
+ "top_p": 0.95,
12
+ "transformers_version": "4.51.1"
13
+ }
log_20251217_232021.out ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Traceback (most recent call last):
2
+ File "<frozen runpy>", line 198, in _run_module_as_main
3
+ File "<frozen runpy>", line 88, in _run_code
4
+ File "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/training/moba_train.py", line 214, in <module>
5
+ main()
6
+ File "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/training/moba_train.py", line 48, in main
7
+ script_args, training_args, data_args = parser.parse_args_into_dataclasses()
8
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
9
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/hf_argparser.py", line 358, in parse_args_into_dataclasses
10
+ obj = dtype(**inputs)
11
+ ^^^^^^^^^^^^^^^
12
+ File "<string>", line 164, in __init__
13
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 1761, in __post_init__
14
+ self.device
15
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 2297, in device
16
+ return self._setup_devices
17
+ ^^^^^^^^^^^^^^^^^^^
18
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/utils/generic.py", line 67, in __get__
19
+ cached = self.fget(obj)
20
+ ^^^^^^^^^^^^^^
21
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 2224, in _setup_devices
22
+ self.distributed_state = PartialState(**accelerator_state_kwargs)
23
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
24
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/accelerate/state.py", line 207, in __init__
25
+ raise ImportError(
26
+ ImportError: DeepSpeed is not available => install it using `pip3 install deepspeed` or build it from source
27
+ Traceback (most recent call last):
28
+ File "<frozen runpy>", line 198, in _run_module_as_main
29
+ File "<frozen runpy>", line 88, in _run_code
30
+ File "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/training/moba_train.py", line 214, in <module>
31
+ main()
32
+ File "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/training/moba_train.py", line 48, in main
33
+ script_args, training_args, data_args = parser.parse_args_into_dataclasses()
34
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
35
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/hf_argparser.py", line 358, in parse_args_into_dataclasses
36
+ obj = dtype(**inputs)
37
+ ^^^^^^^^^^^^^^^
38
+ File "<string>", line 164, in __init__
39
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 1761, in __post_init__
40
+ self.device
41
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 2297, in device
42
+ return self._setup_devices
43
+ ^^^^^^^^^^^^^^^^^^^
44
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/utils/generic.py", line 67, in __get__
45
+ cached = self.fget(obj)
46
+ ^^^^^^^^^^^^^^
47
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 2224, in _setup_devices
48
+ self.distributed_state = PartialState(**accelerator_state_kwargs)
49
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
50
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/accelerate/state.py", line 207, in __init__
51
+ raise ImportError(
52
+ ImportError: DeepSpeed is not available => install it using `pip3 install deepspeed` or build it from source
53
+ Traceback (most recent call last):
54
+ File "<frozen runpy>", line 198, in _run_module_as_main
55
+ File "<frozen runpy>", line 88, in _run_code
56
+ File "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/training/moba_train.py", line 214, in <module>
57
+ main()
58
+ File "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/training/moba_train.py", line 48, in main
59
+ script_args, training_args, data_args = parser.parse_args_into_dataclasses()
60
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
61
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/hf_argparser.py", line 358, in parse_args_into_dataclasses
62
+ obj = dtype(**inputs)
63
+ ^^^^^^^^^^^^^^^
64
+ File "<string>", line 164, in __init__
65
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 1761, in __post_init__
66
+ self.device
67
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 2297, in device
68
+ return self._setup_devices
69
+ ^^^^^^^^^^^^^^^^^^^
70
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/utils/generic.py", line 67, in __get__
71
+ cached = self.fget(obj)
72
+ ^^^^^^^^^^^^^^
73
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 2224, in _setup_devices
74
+ self.distributed_state = PartialState(**accelerator_state_kwargs)
75
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
76
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/accelerate/state.py", line 207, in __init__
77
+ raise ImportError(
78
+ ImportError: DeepSpeed is not available => install it using `pip3 install deepspeed` or build it from source
79
+ Traceback (most recent call last):
80
+ File "<frozen runpy>", line 198, in _run_module_as_main
81
+ File "<frozen runpy>", line 88, in _run_code
82
+ File "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/training/moba_train.py", line 214, in <module>
83
+ main()
84
+ File "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/training/moba_train.py", line 48, in main
85
+ script_args, training_args, data_args = parser.parse_args_into_dataclasses()
86
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
87
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/hf_argparser.py", line 358, in parse_args_into_dataclasses
88
+ obj = dtype(**inputs)
89
+ ^^^^^^^^^^^^^^^
90
+ File "<string>", line 164, in __init__
91
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 1761, in __post_init__
92
+ self.device
93
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 2297, in device
94
+ return self._setup_devices
95
+ ^^^^^^^^^^^^^^^^^^^
96
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/utils/generic.py", line 67, in __get__
97
+ cached = self.fget(obj)
98
+ ^^^^^^^^^^^^^^
99
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 2224, in _setup_devices
100
+ self.distributed_state = PartialState(**accelerator_state_kwargs)
101
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
102
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/accelerate/state.py", line 207, in __init__
103
+ raise ImportError(
104
+ ImportError: DeepSpeed is not available => install it using `pip3 install deepspeed` or build it from source
105
+ Traceback (most recent call last):
106
+ File "<frozen runpy>", line 198, in _run_module_as_main
107
+ File "<frozen runpy>", line 88, in _run_code
108
+ File "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/training/moba_train.py", line 214, in <module>
109
+ main()
110
+ File "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/training/moba_train.py", line 48, in main
111
+ script_args, training_args, data_args = parser.parse_args_into_dataclasses()
112
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
113
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/hf_argparser.py", line 358, in parse_args_into_dataclasses
114
+ obj = dtype(**inputs)
115
+ ^^^^^^^^^^^^^^^
116
+ File "<string>", line 164, in __init__
117
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 1761, in __post_init__
118
+ self.device
119
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 2297, in device
120
+ return self._setup_devices
121
+ ^^^^^^^^^^^^^^^^^^^
122
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/utils/generic.py", line 67, in __get__
123
+ cached = self.fget(obj)
124
+ ^^^^^^^^^^^^^^
125
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 2224, in _setup_devices
126
+ self.distributed_state = PartialState(**accelerator_state_kwargs)
127
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
128
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/accelerate/state.py", line 207, in __init__
129
+ raise ImportError(
130
+ ImportError: DeepSpeed is not available => install it using `pip3 install deepspeed` or build it from source
131
+ Traceback (most recent call last):
132
+ File "<frozen runpy>", line 198, in _run_module_as_main
133
+ File "<frozen runpy>", line 88, in _run_code
134
+ File "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/training/moba_train.py", line 214, in <module>
135
+ main()
136
+ File "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/training/moba_train.py", line 48, in main
137
+ script_args, training_args, data_args = parser.parse_args_into_dataclasses()
138
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
139
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/hf_argparser.py", line 358, in parse_args_into_dataclasses
140
+ obj = dtype(**inputs)
141
+ ^^^^^^^^^^^^^^^
142
+ File "<string>", line 164, in __init__
143
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 1761, in __post_init__
144
+ self.device
145
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 2297, in device
146
+ return self._setup_devices
147
+ ^^^^^^^^^^^^^^^^^^^
148
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/utils/generic.py", line 67, in __get__
149
+ cached = self.fget(obj)
150
+ ^^^^^^^^^^^^^^
151
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 2224, in _setup_devices
152
+ self.distributed_state = PartialState(**accelerator_state_kwargs)
153
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
154
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/accelerate/state.py", line 207, in __init__
155
+ raise ImportError(
156
+ ImportError: DeepSpeed is not available => install it using `pip3 install deepspeed` or build it from source
157
+ Traceback (most recent call last):
158
+ File "<frozen runpy>", line 198, in _run_module_as_main
159
+ File "<frozen runpy>", line 88, in _run_code
160
+ File "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/training/moba_train.py", line 214, in <module>
161
+ main()
162
+ File "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/training/moba_train.py", line 48, in main
163
+ script_args, training_args, data_args = parser.parse_args_into_dataclasses()
164
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
165
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/hf_argparser.py", line 358, in parse_args_into_dataclasses
166
+ obj = dtype(**inputs)
167
+ ^^^^^^^^^^^^^^^
168
+ File "<string>", line 164, in __init__
169
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 1761, in __post_init__
170
+ self.device
171
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 2297, in device
172
+ return self._setup_devices
173
+ ^^^^^^^^^^^^^^^^^^^
174
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/utils/generic.py", line 67, in __get__
175
+ cached = self.fget(obj)
176
+ ^^^^^^^^^^^^^^
177
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 2224, in _setup_devices
178
+ self.distributed_state = PartialState(**accelerator_state_kwargs)
179
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
180
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/accelerate/state.py", line 207, in __init__
181
+ raise ImportError(
182
+ ImportError: DeepSpeed is not available => install it using `pip3 install deepspeed` or build it from source
183
+ Traceback (most recent call last):
184
+ File "<frozen runpy>", line 198, in _run_module_as_main
185
+ File "<frozen runpy>", line 88, in _run_code
186
+ File "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/training/moba_train.py", line 214, in <module>
187
+ main()
188
+ File "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/training/moba_train.py", line 48, in main
189
+ script_args, training_args, data_args = parser.parse_args_into_dataclasses()
190
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
191
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/hf_argparser.py", line 358, in parse_args_into_dataclasses
192
+ obj = dtype(**inputs)
193
+ ^^^^^^^^^^^^^^^
194
+ File "<string>", line 164, in __init__
195
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 1761, in __post_init__
196
+ self.device
197
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 2297, in device
198
+ return self._setup_devices
199
+ ^^^^^^^^^^^^^^^^^^^
200
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/utils/generic.py", line 67, in __get__
201
+ cached = self.fget(obj)
202
+ ^^^^^^^^^^^^^^
203
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/transformers/training_args.py", line 2224, in _setup_devices
204
+ self.distributed_state = PartialState(**accelerator_state_kwargs)
205
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
206
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/accelerate/state.py", line 207, in __init__
207
+ raise ImportError(
208
+ ImportError: DeepSpeed is not available => install it using `pip3 install deepspeed` or build it from source
209
+ W1217 23:20:28.243000 432111 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 432131 closing signal SIGTERM
210
+ W1217 23:20:28.244000 432111 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 432132 closing signal SIGTERM
211
+ W1217 23:20:28.244000 432111 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 432134 closing signal SIGTERM
212
+ W1217 23:20:28.244000 432111 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 432136 closing signal SIGTERM
213
+ E1217 23:20:28.509000 432111 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: 1) local_rank: 2 (pid: 432133) of binary: /opt/conda/envs/qqt/bin/python3.11
214
+ Traceback (most recent call last):
215
+ File "/opt/conda/envs/qqt/bin/torchrun", line 7, in <module>
216
+ sys.exit(main())
217
+ ^^^^^^
218
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
219
+ return f(*args, **kwargs)
220
+ ^^^^^^^^^^^^^^^^^^
221
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/torch/distributed/run.py", line 918, in main
222
+ run(args)
223
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/torch/distributed/run.py", line 909, in run
224
+ elastic_launch(
225
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
226
+ return launch_agent(self._config, self._entrypoint, list(args))
227
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
228
+ File "/opt/conda/envs/qqt/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
229
+ raise ChildFailedError(
230
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
231
+ ============================================================
232
+ training.moba_train FAILED
233
+ ------------------------------------------------------------
234
+ Failures:
235
+ [1]:
236
+ time : 2025-12-17_23:20:28
237
+ host : pod-1436390728976789504
238
+ rank : 4 (local_rank: 4)
239
+ exitcode : 1 (pid: 432135)
240
+ error_file: <N/A>
241
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
242
+ [2]:
243
+ time : 2025-12-17_23:20:28
244
+ host : pod-1436390728976789504
245
+ rank : 6 (local_rank: 6)
246
+ exitcode : 1 (pid: 432137)
247
+ error_file: <N/A>
248
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
249
+ [3]:
250
+ time : 2025-12-17_23:20:28
251
+ host : pod-1436390728976789504
252
+ rank : 7 (local_rank: 7)
253
+ exitcode : 1 (pid: 432138)
254
+ error_file: <N/A>
255
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
256
+ ------------------------------------------------------------
257
+ Root Cause (first observed failure):
258
+ [0]:
259
+ time : 2025-12-17_23:20:28
260
+ host : pod-1436390728976789504
261
+ rank : 2 (local_rank: 2)
262
+ exitcode : 1 (pid: 432133)
263
+ error_file: <N/A>
264
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
265
+ ============================================================
log_20251217_232101.out ADDED
The diff for this file is too large to render. See raw diff
 
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d61b9729f5448250e9d65b6d229c42a32874c2b23426e821527baf735f041ec8
3
+ size 4967215360
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60d07fa2fdf2a64b10e1bd519a01468ba775a2df4e41321de0c413da4717a465
3
+ size 3855679144
model.safetensors.index.json ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 8822848512
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00002-of-00002.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
16
+ "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
18
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
19
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
20
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
21
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
23
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
26
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
28
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
29
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
30
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
31
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
32
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
33
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
36
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
37
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
38
+ "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
40
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
41
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
42
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
43
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
44
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
45
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
46
+ "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
47
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
48
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
49
+ "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
50
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
51
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
52
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
53
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
54
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
55
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
56
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
57
+ "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
58
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
59
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
60
+ "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
61
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
62
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
63
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
64
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
65
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
66
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
67
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
68
+ "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
69
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
70
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
71
+ "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
72
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
73
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
74
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
75
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
76
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
77
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
78
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
79
+ "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
80
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
81
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
82
+ "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
83
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
84
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
85
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
86
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
87
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
88
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
89
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
90
+ "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
91
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
92
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
93
+ "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
94
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
95
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
96
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
97
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
98
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
99
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
100
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
101
+ "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
102
+ "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
103
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
104
+ "model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
105
+ "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
106
+ "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
107
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
108
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
109
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
110
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
111
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
112
+ "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
113
+ "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
114
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
115
+ "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
116
+ "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
117
+ "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
118
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
119
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
120
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
121
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
122
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
123
+ "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
124
+ "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
125
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
126
+ "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
127
+ "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
128
+ "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
129
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
130
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
131
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
132
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
133
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
134
+ "model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
135
+ "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
136
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
137
+ "model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
138
+ "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
139
+ "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
140
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
141
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
142
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
143
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
144
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
145
+ "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
146
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
147
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
148
+ "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
149
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
150
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
151
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
152
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
153
+ "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
154
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
155
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
156
+ "model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
157
+ "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
158
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
159
+ "model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
160
+ "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
161
+ "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
162
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
163
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
164
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
165
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
166
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
167
+ "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
168
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
169
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
170
+ "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
171
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
172
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
173
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
174
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
175
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
176
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
177
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
178
+ "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
179
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
180
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
181
+ "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
182
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
183
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
184
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
185
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
186
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
187
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
188
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
189
+ "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
190
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
191
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
192
+ "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
193
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
194
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
195
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
196
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
197
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
198
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
199
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
200
+ "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
201
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
202
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
203
+ "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
204
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
205
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
206
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
207
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
208
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
209
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
210
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
211
+ "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
212
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
213
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
214
+ "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
215
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
216
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
217
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
218
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
219
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
220
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
221
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
222
+ "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
223
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
224
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
225
+ "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
226
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
227
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
228
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
229
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
230
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
231
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
232
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
233
+ "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
234
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
235
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
236
+ "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
237
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
238
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
239
+ "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
240
+ "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
241
+ "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
242
+ "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
243
+ "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
244
+ "model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
245
+ "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
246
+ "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
247
+ "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
248
+ "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
249
+ "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
250
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
251
+ "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
252
+ "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
253
+ "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
254
+ "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
255
+ "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
256
+ "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
257
+ "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
258
+ "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
259
+ "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
260
+ "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
261
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
262
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
263
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
264
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
265
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
266
+ "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
267
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
268
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
269
+ "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
270
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
271
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
272
+ "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
273
+ "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
274
+ "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
275
+ "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
276
+ "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
277
+ "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
278
+ "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
279
+ "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
280
+ "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
281
+ "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
282
+ "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
283
+ "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
284
+ "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
285
+ "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
286
+ "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
287
+ "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
288
+ "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
289
+ "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
290
+ "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
291
+ "model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
292
+ "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
293
+ "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
294
+ "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
295
+ "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
296
+ "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
297
+ "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
298
+ "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
299
+ "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
300
+ "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
301
+ "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
302
+ "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
303
+ "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
304
+ "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
305
+ "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
306
+ "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
307
+ "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
308
+ "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
309
+ "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
310
+ "model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
311
+ "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
312
+ "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
313
+ "model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
314
+ "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
315
+ "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
316
+ "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
317
+ "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
318
+ "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
319
+ "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
320
+ "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
321
+ "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
322
+ "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
323
+ "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
324
+ "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
325
+ "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
326
+ "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
327
+ "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
328
+ "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
329
+ "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
330
+ "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
331
+ "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
332
+ "model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
333
+ "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
334
+ "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
335
+ "model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
336
+ "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
337
+ "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
338
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
339
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
340
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
341
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
342
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
343
+ "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
344
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
345
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
346
+ "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
347
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
348
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
349
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
350
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
351
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
352
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
353
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
354
+ "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
355
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
356
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
357
+ "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
358
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
359
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
+ "model.norm.weight": "model-00002-of-00002.safetensors"
+ }
+ }
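The map above is the tail of `model.safetensors.index.json`: each parameter name is keyed to the shard file that stores it, so a loader can open only the shard it needs. A minimal sketch of reading one tensor through the index, assuming the index and both shards have been downloaded into the current directory (the paths and the `safetensors`/`torch` install are assumptions, not part of this commit):

```python
import json
from safetensors import safe_open

# weight_map: parameter name -> shard file, exactly as in the index above
with open("model.safetensors.index.json") as f:
    weight_map = json.load(f)["weight_map"]

name = "model.norm.weight"  # lives in shard 2 of 2 per the map
with safe_open(weight_map[name], framework="pt") as shard:
    tensor = shard.get_tensor(name)
print(tensor.shape)
```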
run-20251217_233019-b3ly702fm741dd93khyrv/backup.swanlab ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:df0130e9b68d4ce9b61a88fa90bdc3731a76974ed66df4d793436e904dc15b52
+ size 6777820
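`backup.swanlab` above (like `tokenizer.json` and `training_args.bin` further down) is tracked by Git LFS, so the commit stores only this three-line pointer file (spec version, sha256 digest, byte size) while the ~6.8 MB blob lives in LFS storage. A small illustrative parser for the pointer format (`parse_lfs_pointer` is a hypothetical helper, not from this repo):

```python
def parse_lfs_pointer(text: str) -> dict:
    """Split a Git LFS pointer ("key value" per line) into its three fields."""
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    algo, digest = fields["oid"].split(":", 1)
    return {"version": fields["version"], "algo": algo,
            "digest": digest, "size": int(fields["size"])}

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:df0130e9b68d4ce9b61a88fa90bdc3731a76974ed66df4d793436e904dc15b52
size 6777820"""
print(parse_lfs_pointer(pointer)["size"])  # 6777820 bytes (~6.8 MB)
```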
run-20251217_233019-b3ly702fm741dd93khyrv/files/config.yaml ADDED
@@ -0,0 +1,976 @@
+ FRAMEWORK:
+ desc: ''
+ sort: 0
+ value: 🤗transformers
+ _attn_implementation_autoset:
+ desc: ''
+ sort: 74
+ value: true
+ _name_or_path:
+ desc: ''
+ sort: 73
+ value: /workspace/mnt/hf_models/Qwen3-4B
+ accelerator_config:
+ desc: ''
+ sort: 156
+ value:
+ dispatch_batches: null
+ even_batches: true
+ gradient_accumulation_kwargs: null
+ non_blocking: false
+ split_batches: false
+ use_seedable_sampler: true
+ adafactor:
+ desc: ''
+ sort: 161
+ value: false
+ adam_beta1:
+ desc: ''
+ sort: 94
+ value: 0.9
+ adam_beta2:
+ desc: ''
+ sort: 95
+ value: 0.95
+ adam_epsilon:
+ desc: ''
+ sort: 96
+ value: 1.0e-08
+ add_cross_attention:
+ desc: ''
+ sort: 33
+ value: false
+ architectures:
+ desc: ''
+ sort: 60
+ value:
+ - Qwen3ForCausalLM
+ attention_bias:
+ desc: ''
+ sort: 18
+ value: false
+ attention_dropout:
+ desc: ''
+ sort: 19
+ value: 0.0
+ attention_type:
+ desc: ''
+ sort: 226
+ value: moba
+ auto_find_batch_size:
+ desc: ''
+ sort: 189
+ value: false
+ average_tokens_across_devices:
+ desc: ''
+ sort: 205
+ value: false
+ bad_words_ids:
+ desc: ''
+ sort: 50
+ value: null
+ batch_eval_metrics:
+ desc: ''
+ sort: 201
+ value: false
+ begin_suppress_tokens:
+ desc: ''
+ sort: 59
+ value: null
+ bf16:
+ desc: ''
+ sort: 126
+ value: true
+ bf16_full_eval:
+ desc: ''
+ sort: 130
+ value: false
+ bos_token_id:
+ desc: ''
+ sort: 66
+ value: 151643
+ chunk_size_feed_forward:
+ desc: ''
+ sort: 29
+ value: 0
+ context_window_if_toggled:
+ desc: ''
+ sort: 218
+ value: 4096
+ cross_attention_hidden_size:
+ desc: ''
+ sort: 32
+ value: null
+ cuda_empty_cache:
+ desc: ''
+ sort: 208
+ value: true
+ data_seed:
+ desc: ''
+ sort: 123
+ value: null
+ dataloader_drop_last:
+ desc: ''
+ sort: 138
+ value: false
+ dataloader_num_workers:
+ desc: ''
+ sort: 140
+ value: 1
+ dataloader_persistent_workers:
+ desc: ''
+ sort: 169
+ value: false
+ dataloader_pin_memory:
+ desc: ''
+ sort: 168
+ value: true
+ dataloader_prefetch_factor:
+ desc: ''
+ sort: 141
+ value: null
+ ddp_backend:
+ desc: ''
+ sort: 134
+ value: null
+ ddp_broadcast_buffers:
+ desc: ''
+ sort: 167
+ value: null
+ ddp_bucket_cap_mb:
+ desc: ''
+ sort: 166
+ value: null
+ ddp_find_unused_parameters:
+ desc: ''
+ sort: 165
+ value: false
+ ddp_timeout:
+ desc: ''
+ sort: 193
+ value: 1800
+ debug:
+ desc: ''
+ sort: 137
+ value: []
+ decoder_start_token_id:
+ desc: ''
+ sort: 70
+ value: null
+ deepspeed:
+ desc: ''
+ sort: 157
+ value: /workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/deepspeed_stage2.json
+ disable_linear_regularization_term:
+ desc: ''
+ sort: 217
+ value: false
+ disable_tqdm:
+ desc: ''
+ sort: 144
+ value: true
+ diversity_penalty:
+ desc: ''
+ sort: 41
+ value: 0.0
+ do_eval:
+ desc: ''
+ sort: 80
+ value: false
+ do_predict:
+ desc: ''
+ sort: 81
+ value: false
+ do_sample:
+ desc: ''
+ sort: 37
+ value: false
+ do_train:
+ desc: ''
+ sort: 79
+ value: true
+ early_stopping:
+ desc: ''
+ sort: 38
+ value: false
+ enable_ada_sparsity:
+ desc: ''
+ sort: 230
+ value: false
+ enable_layerwise_sparsity:
+ desc: ''
+ sort: 231
+ value: false
+ encoder_no_repeat_ngram_size:
+ desc: ''
+ sort: 49
+ value: 0
+ end_head_sparsity:
+ desc: ''
+ sort: 212
+ value: 0.95
+ eos_token_id:
+ desc: ''
+ sort: 68
+ value: 151645
+ erank_analysis_path:
+ desc: ''
+ sort: 237
+ value: null
+ eval_accumulation_steps:
+ desc: ''
+ sort: 89
+ value: null
+ eval_delay:
+ desc: ''
+ sort: 90
+ value: 0
+ eval_do_concat_batches:
+ desc: ''
+ sort: 183
+ value: true
+ eval_on_start:
+ desc: ''
+ sort: 202
+ value: false
+ eval_steps:
+ desc: ''
+ sort: 139
+ value: null
+ eval_strategy:
+ desc: ''
+ sort: 82
+ value: 'no'
+ eval_use_gather_object:
+ desc: ''
+ sort: 204
+ value: false
+ exponential_decay_length_penalty:
+ desc: ''
+ sort: 57
+ value: null
+ finetuning_task:
+ desc: ''
+ sort: 61
+ value: null
+ forced_bos_token_id:
+ desc: ''
+ sort: 54
+ value: null
+ forced_eos_token_id:
+ desc: ''
+ sort: 55
+ value: null
+ fp16:
+ desc: ''
+ sort: 127
+ value: false
+ fp16_backend:
+ desc: ''
+ sort: 184
+ value: auto
+ fp16_full_eval:
+ desc: ''
+ sort: 131
+ value: false
+ fp16_opt_level:
+ desc: ''
+ sort: 128
+ value: O1
+ freeze_mask_parameters:
+ desc: ''
+ sort: 220
+ value: false
+ freeze_non_mask_parameters:
+ desc: ''
+ sort: 219
+ value: false
+ fsdp:
+ desc: ''
+ sort: 151
+ value: []
+ fsdp_config:
+ desc: ''
+ sort: 153
+ value:
+ min_num_params: 0
+ xla: false
+ xla_fsdp_grad_ckpt: false
+ xla_fsdp_v2: false
+ fsdp_min_num_params:
+ desc: ''
+ sort: 152
+ value: 0
+ fsdp_transformer_layer_cls_to_wrap:
+ desc: ''
+ sort: 155
+ value: null
+ full_determinism:
+ desc: ''
+ sort: 190
+ value: false
+ gradient_accumulation_steps:
+ desc: ''
+ sort: 88
+ value: 1
+ gradient_checkpointing:
+ desc: ''
+ sort: 179
+ value: true
+ gradient_checkpointing_kwargs:
+ desc: ''
+ sort: 180
+ value: null
+ greater_is_better:
+ desc: ''
+ sort: 149
+ value: null
+ group_by_length:
+ desc: ''
+ sort: 162
+ value: false
+ half_precision_backend:
+ desc: ''
+ sort: 129
+ value: auto
+ head_dim:
+ desc: ''
+ sort: 11
+ value: 128
+ hidden_act:
+ desc: ''
+ sort: 12
+ value: silu
+ hidden_size:
+ desc: ''
+ sort: 3
+ value: 2560
+ hub_always_push:
+ desc: ''
+ sort: 178
+ value: false
+ hub_model_id:
+ desc: ''
+ sort: 174
+ value: null
+ hub_private_repo:
+ desc: ''
+ sort: 177
+ value: null
+ hub_strategy:
+ desc: ''
+ sort: 175
+ value: every_save
+ hub_token:
+ desc: ''
+ sort: 176
+ value: <HUB_TOKEN>
+ id2label:
+ desc: ''
+ sort: 62
+ value:
+ '0': LABEL_0
+ '1': LABEL_1
+ ignore_data_skip:
+ desc: ''
+ sort: 150
+ value: false
+ include_for_metrics:
+ desc: ''
+ sort: 182
+ value: []
+ include_inputs_for_metrics:
+ desc: ''
+ sort: 181
+ value: false
+ include_num_input_tokens_seen:
+ desc: ''
+ sort: 198
+ value: false
+ include_tokens_per_second:
+ desc: ''
+ sort: 197
+ value: false
+ initializer_range:
+ desc: ''
+ sort: 13
+ value: 0.02
+ intermediate_size:
+ desc: ''
+ sort: 4
+ value: 9728
+ is_decoder:
+ desc: ''
+ sort: 31
+ value: false
+ is_encoder_decoder:
+ desc: ''
+ sort: 30
+ value: false
+ jit_mode_eval:
+ desc: ''
+ sort: 124
+ value: false
+ label2id:
+ desc: ''
+ sort: 63
+ value:
+ LABEL_0: 0
+ LABEL_1: 1
+ label_names:
+ desc: ''
+ sort: 146
+ value: null
+ label_smoothing_factor:
+ desc: ''
+ sort: 158
+ value: 0.0
+ layerwise_sparsity_max_ratio:
+ desc: ''
+ sort: 234
+ value: 1.0
+ layerwise_sparsity_min_ratio:
+ desc: ''
+ sort: 233
+ value: 0.5
+ layerwise_sparsity_power:
+ desc: ''
+ sort: 235
+ value: 1.0
+ layerwise_sparsity_schedule:
+ desc: ''
+ sort: 232
+ value: high-low-high
+ layerwise_sparsity_weight:
+ desc: ''
+ sort: 236
+ value: 1.0
+ learning_rate:
+ desc: ''
+ sort: 92
+ value: 1.0e-05
+ length_column_name:
+ desc: ''
+ sort: 163
+ value: length
+ length_penalty:
+ desc: ''
+ sort: 47
+ value: 1.0
+ load_best_model_at_end:
+ desc: ''
+ sort: 147
+ value: false
+ load_masks_from:
+ desc: ''
+ sort: 224
+ value: null
+ load_masks_sparsity:
+ desc: ''
+ sort: 225
+ value: null
+ local_rank:
+ desc: ''
+ sort: 133
+ value: 0
+ log_level:
+ desc: ''
+ sort: 104
+ value: info
+ log_level_replica:
+ desc: ''
+ sort: 105
+ value: warning
+ log_on_each_node:
+ desc: ''
+ sort: 106
+ value: true
+ logging_dir:
+ desc: ''
+ sort: 107
+ value: checkpoints/moba_qwen3_Qwen3-4B_bsz8_steps1000_lr1e-5_warmup0.1_/runs/Dec17_23-21-08_pod-1436390728976789504
+ logging_first_step:
+ desc: ''
+ sort: 109
+ value: false
+ logging_nan_inf_filter:
+ desc: ''
+ sort: 111
+ value: true
+ logging_steps:
+ desc: ''
+ sort: 110
+ value: 1.0
+ logging_strategy:
+ desc: ''
+ sort: 108
+ value: steps
+ lr_scheduler_kwargs:
+ desc: ''
+ sort: 101
+ value: {}
+ lr_scheduler_type:
+ desc: ''
+ sort: 100
+ value: cosine
+ mask_learning_rate:
+ desc: ''
+ sort: 213
+ value: 0.001
+ max_grad_norm:
+ desc: ''
+ sort: 97
+ value: 1.0
+ max_length:
+ desc: ''
+ sort: 35
+ value: 20
+ max_position_embeddings:
+ desc: ''
+ sort: 2
+ value: 40960
+ max_steps:
+ desc: ''
+ sort: 99
+ value: 1000
+ max_window_layers:
+ desc: ''
+ sort: 9
+ value: 36
+ metric_for_best_model:
+ desc: ''
+ sort: 148
+ value: null
+ min_length:
+ desc: ''
+ sort: 36
+ value: 0
+ min_lr_ratio:
+ desc: ''
+ sort: 206
+ value: 0.01
+ model_num_parameters:
+ desc: ''
+ sort: 238
+ value: 4022468096
+ model_type:
+ desc: ''
+ sort: 76
+ value: qwen3
+ mp_parameters:
+ desc: ''
+ sort: 188
+ value: ''
+ neftune_noise_alpha:
+ desc: ''
+ sort: 199
+ value: null
+ no_cuda:
+ desc: ''
+ sort: 119
+ value: false
+ no_repeat_ngram_size:
+ desc: ''
+ sort: 48
+ value: 0
+ num_attention_heads:
+ desc: ''
+ sort: 6
+ value: 32
+ num_beam_groups:
+ desc: ''
+ sort: 40
+ value: 1
+ num_beams:
+ desc: ''
+ sort: 39
+ value: 1
+ num_hidden_layers:
+ desc: ''
+ sort: 5
+ value: 36
+ num_key_value_heads:
+ desc: ''
+ sort: 10
+ value: 8
+ num_return_sequences:
+ desc: ''
+ sort: 51
+ value: 1
+ num_train_epochs:
+ desc: ''
+ sort: 98
+ value: 3.0
+ optim:
+ desc: ''
+ sort: 159
+ value: adamw_torch
+ optim_args:
+ desc: ''
+ sort: 160
+ value: null
+ optim_target_modules:
+ desc: ''
+ sort: 200
+ value: null
+ ordered:
+ desc: ''
+ sort: 207
+ value: false
+ output_attentions:
+ desc: ''
+ sort: 22
+ value: false
+ output_dir:
+ desc: ''
+ sort: 77
+ value: checkpoints/moba_qwen3_Qwen3-4B_bsz8_steps1000_lr1e-5_warmup0.1_
+ output_hidden_states:
+ desc: ''
+ sort: 21
+ value: false
+ output_scores:
+ desc: ''
+ sort: 52
+ value: false
+ overwrite_output_dir:
+ desc: ''
+ sort: 78
+ value: false
+ pad_token_id:
+ desc: ''
+ sort: 67
+ value: null
+ past_index:
+ desc: ''
+ sort: 142
+ value: -1
+ per_device_eval_batch_size:
+ desc: ''
+ sort: 85
+ value: 1
+ per_device_train_batch_size:
+ desc: ''
+ sort: 84
+ value: 1
+ per_gpu_eval_batch_size:
+ desc: ''
+ sort: 87
+ value: null
+ per_gpu_train_batch_size:
+ desc: ''
+ sort: 86
+ value: null
+ prediction_loss_only:
+ desc: ''
+ sort: 83
+ value: false
+ prefix:
+ desc: ''
+ sort: 65
+ value: null
+ problem_type:
+ desc: ''
+ sort: 72
+ value: null
+ pruned_heads:
+ desc: ''
+ sort: 27
+ value: {}
+ push_to_hub:
+ desc: ''
+ sort: 172
+ value: false
+ push_to_hub_model_id:
+ desc: ''
+ sort: 185
+ value: null
+ push_to_hub_organization:
+ desc: ''
+ sort: 186
+ value: null
+ push_to_hub_token:
+ desc: ''
+ sort: 187
+ value: <PUSH_TO_HUB_TOKEN>
+ ray_scope:
+ desc: ''
+ sort: 192
+ value: last
+ reg_learning_rate:
+ desc: ''
+ sort: 214
+ value: 0.001
+ remove_invalid_values:
+ desc: ''
+ sort: 56
+ value: false
+ remove_unused_columns:
+ desc: ''
+ sort: 145
+ value: false
+ repetition_penalty:
+ desc: ''
+ sort: 46
+ value: 1.0
+ report_to:
+ desc: ''
+ sort: 164
+ value:
+ - swanlab
+ restore_callback_states_from_checkpoint:
+ desc: ''
+ sort: 118
+ value: false
+ resume_from_checkpoint:
+ desc: ''
+ sort: 173
+ value: null
+ return_dict:
+ desc: ''
+ sort: 20
+ value: true
+ return_dict_in_generate:
+ desc: ''
+ sort: 53
+ value: false
+ rms_norm_eps:
+ desc: ''
+ sort: 14
+ value: 1.0e-06
+ rope_scaling:
+ desc: ''
+ sort: 17
+ value:
+ factor: 4.0
+ original_max_position_embeddings: 40960
+ rope_type: yarn
+ type: yarn
+ rope_theta:
+ desc: ''
+ sort: 16
+ value: 1000000
+ run_name:
+ desc: ''
+ sort: 143
+ value: moba_qwen3_Qwen3-4B_bsz8_steps1000_lr1e-5_warmup0.1_
+ save_on_each_node:
+ desc: ''
+ sort: 116
+ value: false
+ save_only_model:
+ desc: ''
+ sort: 117
+ value: false
+ save_safetensors:
+ desc: ''
+ sort: 115
+ value: true
+ save_steps:
+ desc: ''
+ sort: 113
+ value: 500
+ save_strategy:
+ desc: ''
+ sort: 112
+ value: steps
+ save_total_limit:
+ desc: ''
+ sort: 114
+ value: null
+ seed:
+ desc: ''
+ sort: 122
+ value: 42
+ sep_token_id:
+ desc: ''
+ sort: 69
+ value: null
+ seq_parallel_size:
+ desc: ''
+ sort: 210
+ value: 1
+ sink_size:
+ desc: ''
+ sort: 228
+ value: 128
+ skip_memory_metrics:
+ desc: ''
+ sort: 170
+ value: true
+ sliding_window:
+ desc: ''
+ sort: 8
+ value: null
+ sparsity_warmup_ratio:
+ desc: ''
+ sort: 216
+ value: 0.05
+ start_head_sparsity:
+ desc: ''
+ sort: 211
+ value: 0.0
+ streaming_dataset:
+ desc: ''
+ sort: 209
+ value: true
+ stripe_init_start_with_keep:
+ desc: ''
+ sort: 223
+ value: false
+ stripe_init_width_1:
+ desc: ''
+ sort: 221
+ value: null
+ stripe_init_width_2:
+ desc: ''
+ sort: 222
+ value: null
+ suppress_tokens:
+ desc: ''
+ sort: 58
+ value: null
+ task_specific_params:
+ desc: ''
+ sort: 71
+ value: null
+ temperature:
+ desc: ''
+ sort: 42
+ value: 1.0
+ tf32:
+ desc: ''
+ sort: 132
+ value: null
+ tf_legacy_loss:
+ desc: ''
+ sort: 26
+ value: false
+ tie_encoder_decoder:
+ desc: ''
+ sort: 34
+ value: false
+ tie_word_embeddings:
+ desc: ''
+ sort: 28
+ value: true
+ toggle_type:
+ desc: ''
+ sort: 227
+ value: streaming
+ tokenizer_class:
+ desc: ''
+ sort: 64
+ value: null
+ top_k:
+ desc: ''
+ sort: 43
+ value: 50
+ top_p:
+ desc: ''
+ sort: 44
+ value: 1.0
+ topk_k:
+ desc: ''
+ sort: 229
+ value: 2048
+ torch_compile:
+ desc: ''
+ sort: 194
+ value: false
+ torch_compile_backend:
+ desc: ''
+ sort: 195
+ value: null
+ torch_compile_mode:
+ desc: ''
+ sort: 196
+ value: null
+ torch_dtype:
+ desc: ''
+ sort: 24
+ value: bfloat16
+ torch_empty_cache_steps:
+ desc: ''
+ sort: 91
+ value: null
+ torchdynamo:
+ desc: ''
+ sort: 191
+ value: null
+ torchscript:
+ desc: ''
+ sort: 23
+ value: false
+ tp_size:
+ desc: ''
+ sort: 154
+ value: 0
+ tpu_metrics_debug:
+ desc: ''
+ sort: 136
+ value: false
+ tpu_num_cores:
+ desc: ''
+ sort: 135
+ value: null
+ transformers_version:
+ desc: ''
+ sort: 75
+ value: 4.51.1
+ typical_p:
+ desc: ''
+ sort: 45
+ value: 1.0
+ use_bfloat16:
+ desc: ''
+ sort: 25
+ value: false
+ use_cache:
+ desc: ''
+ sort: 15
+ value: false
+ use_cpu:
+ desc: ''
+ sort: 120
+ value: false
+ use_ipex:
+ desc: ''
+ sort: 125
+ value: false
+ use_legacy_prediction_loop:
+ desc: ''
+ sort: 171
+ value: false
+ use_liger_kernel:
+ desc: ''
+ sort: 203
+ value: false
+ use_mps_device:
+ desc: ''
+ sort: 121
+ value: false
+ use_sliding_window:
+ desc: ''
+ sort: 7
+ value: false
+ vocab_size:
+ desc: ''
+ sort: 1
+ value: 151936
+ warmup_ratio:
+ desc: ''
+ sort: 102
+ value: 0.1
+ warmup_steps:
+ desc: ''
+ sort: 103
+ value: 0
+ warmup_type:
+ desc: ''
+ sort: 215
+ value: linear
+ weight_decay:
+ desc: ''
+ sort: 93
+ value: 0.1
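Two derived numbers are worth checking against this dump. The effective batch size is per_device_train_batch_size (1) × 8 GPUs (the GPU count comes from swanlab-metadata.json below, not from this file) × gradient_accumulation_steps (1) = 8, matching the "bsz8" in run_name. And the yarn rope_scaling factor of 4.0 over original_max_position_embeddings = 40960 targets positions up to 163,840 tokens. A sketch that recomputes both, assuming the `{key: {desc, sort, value}}` layout shown above:

```python
import yaml

# SwanLab stores each hyperparameter as {desc, sort, value}; keep only value.
with open("run-20251217_233019-b3ly702fm741dd93khyrv/files/config.yaml") as f:
    cfg = {k: v["value"] for k, v in yaml.safe_load(f).items()}

num_gpus = 8  # assumption taken from swanlab-metadata.json, not this file
print(cfg["per_device_train_batch_size"] * num_gpus
      * cfg["gradient_accumulation_steps"])        # 8, the "bsz8" in run_name
rope = cfg["rope_scaling"]
print(rope["original_max_position_embeddings"] * rope["factor"])  # 163840.0
```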
run-20251217_233019-b3ly702fm741dd93khyrv/files/requirements.txt ADDED
@@ -0,0 +1,346 @@
+ absl-py==2.3.1
+ accelerate==1.9.0
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.12.14
+ aiosignal==1.4.0
+ airportsdata==20250706
+ annotated-types==0.7.0
+ anthropic==0.61.0
+ anyio==4.9.0
+ argcomplete==3.6.2
+ argon2-cffi==25.1.0
+ argon2-cffi-bindings==25.1.0
+ arrow==1.3.0
+ astor==0.8.1
+ asttokens==3.0.0
+ async-lru==2.0.5
+ attrs==25.3.0
+ autocommand==2.2.2
+ azure-core==1.35.0
+ azure-identity==1.23.1
+ azure-storage-blob==12.26.0
+ azure-storage-file-datalake==12.21.0
+ babel==2.17.0
+ backcall==0.2.0
+ backoff==2.2.1
+ backports.tarfile==1.2.0
+ bcrypt==4.3.0
+ beautifulsoup4==4.13.4
+ blake3==1.0.5
+ bleach==6.2.0
+ blobfile==3.0.0
+ block_sparse_attention_triton==0.1.0
+ block_sparse_attn==0.0.1
+ boto3==1.39.7
+ botocore==1.39.7
+ Brotli==1.1.0
+ cachetools==5.5.2
+ certifi==2025.7.14
+ cffi==1.17.1
+ charset-normalizer==3.4.2
+ circuitbreaker==2.1.3
+ click==8.2.1
+ cloudpickle==3.1.1
+ comm==0.2.3
+ compressed-tensors==0.9.2
+ contourpy==1.3.2
+ cramjam==2.10.0
+ cryptography==44.0.3
+ cuda-bindings==12.9.0
+ cuda-python==12.9.0
+ cupy-cuda12x==13.5.1
+ cycler==0.12.1
+ datasets==2.20.0
+ datatools-py==0.1
+ debugpy==1.8.15
+ decorator==5.2.1
+ decord==0.6.0
+ deepspeed==0.18.3
+ defusedxml==0.7.1
+ depyf==0.18.0
+ dill==0.3.8
+ diskcache==5.6.3
+ distro==1.9.0
+ dnspython==2.7.0
+ docker-pycreds==0.4.0
+ docopt==0.6.2
+ docstring_parser==0.16
+ einops==0.8.1
+ email_validator==2.2.0
+ executing==2.2.0
+ fastapi==0.116.1
+ fastapi-cli==0.0.8
+ fastapi-cloud-cli==0.1.5
+ fastjsonschema==2.21.1
+ fastrlock==0.8.3
+ filelock==3.18.0
+ flash-attn==2.6.3
+ flashinfer-python==0.2.8
+ fonttools==4.59.0
+ fqdn==1.5.1
+ frozenlist==1.7.0
+ fsspec==2024.5.0
+ gguf==0.10.0
+ gitdb==4.0.12
+ GitPython==3.1.44
+ google-api-core==2.25.1
+ google-auth==2.40.3
+ google-cloud-core==2.4.3
+ google-cloud-storage==2.10.0
+ google-crc32c==1.7.1
+ google-resumable-media==2.7.2
+ googleapis-common-protos==1.70.0
+ gql==3.5.3
+ graphql-core==3.2.6
+ h11==0.16.0
+ hf_transfer==0.1.9
+ hf-xet==1.1.5
+ hjson==3.1.0
+ httpcore==1.0.9
+ httptools==0.6.4
+ httpx==0.28.1
+ huggingface-hub==0.34.3
+ idna==3.10
+ importlib_metadata==8.7.0
+ inflect==7.3.1
+ iniconfig==2.1.0
+ inquirerpy==0.3.4
+ interegular==0.3.3
+ ipykernel==6.30.1
+ ipython==8.12.3
+ ipython_pygments_lexers==1.1.1
+ ipywidgets==8.1.7
+ isodate==0.7.2
+ isoduration==20.11.0
+ jaraco.collections==5.1.0
+ jaraco.context==5.3.0
+ jaraco.functools==4.0.1
+ jaraco.text==3.12.1
+ jedi==0.19.2
+ Jinja2==3.1.6
+ jiter==0.10.0
+ jmespath==1.0.1
+ joblib==1.5.1
+ json5==0.12.0
+ jsonpointer==3.0.0
+ jsonschema==4.25.0
+ jsonschema-specifications==2025.4.1
+ jupyter==1.1.1
+ jupyter_client==8.6.3
+ jupyter-console==6.6.3
+ jupyter_core==5.8.1
+ jupyter-events==0.12.0
+ jupyter-lsp==2.2.6
+ jupyter_server==2.16.0
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.4.5
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.27.3
+ jupyterlab_widgets==3.0.15
+ kiwisolver==1.4.8
+ lark==1.2.2
+ litellm==1.75.0
+ llguidance==0.7.30
+ llvmlite==0.44.0
+ lm-format-enforcer==0.10.12
+ lxml==6.0.0
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
+ matplotlib==3.10.3
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ minference==0.1.6.0
+ mistral_common==1.8.3
+ mistune==3.1.3
+ modelscope==1.28.1
+ more-itertools==10.3.0
+ mosaicml-cli==0.5.34
+ mosaicml-streaming==0.8.1
+ mpmath==1.3.0
+ mptools==0.1.0
+ msal==1.32.3
+ msal-extensions==1.3.1
+ msgpack==1.1.1
+ msgspec==0.19.0
+ multidict==6.6.3
+ multiprocess==0.70.16
+ nanobind==2.8.0
+ nbclient==0.10.2
+ nbconvert==7.16.6
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ networkx==3.5
+ ninja==1.11.1.1
+ nltk==3.9.1
+ notebook==7.4.5
+ notebook_shim==0.2.4
+ numba==0.61.0
+ numpy==1.26.4
+ nvidia-cublas-cu12==12.4.5.8
+ nvidia-cuda-cupti-cu12==12.4.127
+ nvidia-cuda-nvrtc-cu12==12.4.127
+ nvidia-cuda-runtime-cu12==12.4.127
+ nvidia-cudnn-cu12==9.1.0.70
+ nvidia-cufft-cu12==11.2.1.3
+ nvidia-cufile-cu12==1.13.1.3
+ nvidia-curand-cu12==10.3.5.147
+ nvidia-cusolver-cu12==11.6.1.9
+ nvidia-cusparse-cu12==12.3.1.170
+ nvidia-cusparselt-cu12==0.6.2
+ nvidia-ml-py==12.575.51
+ nvidia-nccl-cu12==2.21.5
+ nvidia-nvjitlink-cu12==12.4.127
+ nvidia-nvshmem-cu12==3.3.20
+ nvidia-nvtx-cu12==12.4.127
+ nvitop==1.5.2
+ oci==2.155.2
+ openai==1.99.1
+ opencv-python-headless==4.11.0.86
+ orjson==3.11.1
+ outlines==0.1.11
+ outlines_core==0.1.26
+ overrides==7.7.0
+ packaging==24.1
+ pandas==2.3.1
+ pandocfilters==1.5.1
+ paramiko==3.5.1
+ parso==0.8.4
+ partial-json-parser==0.2.1.1.post6
+ peewee==3.18.2
+ pexpect==4.9.0
+ pfzy==0.3.4
+ pickleshare==0.7.5
+ pillow==11.3.0
+ pip==25.3
+ pipreqs==0.5.0
+ platformdirs==4.3.8
+ pluggy==1.6.0
+ prettytable==3.16.0
+ prometheus_client==0.22.1
+ prometheus-fastapi-instrumentator==7.1.0
+ prompt_toolkit==3.0.51
+ propcache==0.3.2
+ proto-plus==1.26.1
+ protobuf==4.25.3
+ psutil==7.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ py-cpuinfo==9.0.0
+ pyarrow==20.0.0
+ pyarrow-hotfix==0.7
+ pyasn1==0.6.1
+ pyasn1_modules==0.4.2
+ pycountry==24.6.1
+ pycparser==2.22
+ pycryptodomex==3.23.0
+ pydantic==2.11.7
+ pydantic_core==2.33.2
+ pydantic-extra-types==2.10.5
+ pyecharts==2.0.8
+ Pygments==2.19.2
+ PyJWT==2.10.1
+ PyNaCl==1.5.0
+ pynvml==12.0.0
+ pyOpenSSL==24.3.0
+ pyparsing==3.2.3
+ pytest==8.4.1
+ python-dateutil==2.9.0
+ python-docx==1.2.0
+ python-dotenv==1.1.1
+ python-json-logger==3.3.0
+ python-multipart==0.0.20
+ python-snappy==0.7.3
+ pytz==2025.2
+ PyYAML==6.0.2
+ pyzmq==27.0.1
+ questionary==2.1.0
+ ray==2.48.0
+ referencing==0.36.2
+ regex==2023.12.25
+ requests==2.32.4
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rfc3987-syntax==1.1.0
+ rich==13.9.4
+ rich-toolkit==0.14.9
+ rignore==0.6.4
+ rouge_score==0.1.2
+ rpds-py==0.26.0
+ rsa==4.9.1
+ ruamel.yaml==0.18.14
+ ruamel.yaml.clib==0.2.12
+ s3transfer==0.13.0
+ safetensors==0.5.3
+ scipy==1.16.1
+ seaborn==0.13.2
+ Send2Trash==1.8.3
+ sentencepiece==0.2.0
+ sentry-sdk==2.33.0
+ setproctitle==1.3.6
+ setuptools==80.9.0
+ sgl-kernel==0.1.4
+ sglang==0.4.6.post5
+ shellingham==1.5.4
+ simple-parsing==0.1.7
+ simplejson==3.20.1
+ six==1.17.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ soundfile==0.13.1
+ soupsieve==2.7
+ sparseattn==0.1.0
+ stack-data==0.6.3
+ starlette==0.47.2
+ swanboard==0.1.8b1
+ swankit==0.2.4
+ swanlab==0.6.8
+ sympy==1.13.1
+ tabulate==0.9.0
+ tensor-parallel==2.0.0
+ terminado==0.18.1
+ tiktoken==0.7.0
+ tinycss2==1.4.0
+ tokenizers==0.21.4
+ tomli==2.0.1
+ torch==2.6.0+cu124
+ torch_memory_saver==0.0.8
+ torchao==0.9.0
+ torchaudio==2.6.0+cu124
+ torchvision==0.21.0+cu124
+ tornado==6.5.1
+ tqdm==4.66.4
+ traitlets==5.14.3
+ transformers==4.51.1
+ triton==3.2.0
+ typeguard==4.3.0
+ typer==0.16.0
+ types-python-dateutil==2.9.0.20250708
+ typing_extensions==4.14.1
+ typing-inspection==0.4.1
+ tzdata==2025.2
+ ujson==5.10.0
+ uri-template==1.3.0
+ urllib3==2.5.0
+ uv==0.7.21
+ uvicorn==0.35.0
+ uvloop==0.21.0
+ validators==0.35.0
+ vllm==0.8.3
+ wandb==0.17.3
+ watchfiles==1.1.0
+ wcwidth==0.2.13
+ webcolors==24.11.1
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ websockets==11.0.3
+ wheel==0.45.1
+ widgetsnbextension==4.0.14
+ wrapt==1.17.2
+ xformers==0.0.29.post2
+ xgrammar==0.1.17
+ xxhash==3.5.0
+ yarg==0.1.9
+ yarl==1.20.1
+ zipp==3.23.0
+ zstandard==0.23.0
+ zstd==1.5.5.1
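The pins above snapshot the training environment (CUDA 12.4 wheels for torch 2.6.0, transformers 4.51.1, deepspeed 0.18.3, flash-attn 2.6.3, swanlab 0.6.8). A quick sketch to spot-check a few of them against whatever environment is currently active:

```python
from importlib.metadata import version, PackageNotFoundError

# Compare a handful of key pins from requirements.txt to the local environment.
for pkg in ("torch", "transformers", "deepspeed", "flash-attn", "swanlab"):
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")
```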
run-20251217_233019-b3ly702fm741dd93khyrv/files/swanlab-metadata.json ADDED
@@ -0,0 +1 @@
+ {"memory": "2016", "cpu": {"brand": "Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz", "cores": 112}, "gpu": {"nvidia": {"driver": "535.104.12", "cores": 8, "type": ["NVIDIA A800-SXM4-80GB", "NVIDIA A800-SXM4-80GB", "NVIDIA A800-SXM4-80GB", "NVIDIA A800-SXM4-80GB", "NVIDIA A800-SXM4-80GB", "NVIDIA A800-SXM4-80GB", "NVIDIA A800-SXM4-80GB", "NVIDIA A800-SXM4-80GB"], "memory": ["80", "80", "80", "80", "80", "80", "80", "80"], "cuda": "12.4", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere", "Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912, 6912, 6912, 6912, 6912]}}, "os": "Linux-4.19.90-2107.6.0.0192.8.oe1.bclinux.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.4 LTS", "hostname": "pod-1436390728976789504", "pid": 432541, "cwd": "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn", "python": "3.11.0", "python_verbose": "3.11.0 | packaged by conda-forge | (main, Jan 14 2023, 12:27:40) [GCC 11.3.0]", "executable": "/opt/conda/envs/qqt/bin/python3.11", "command": "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/training/moba_train.py --report_to swanlab --do_train --model_name_or_path /workspace/mnt/hf_models/Qwen3-4B --tokenizer_name /workspace/mnt/hf_models/Qwen3-4B --run_name moba_qwen3_Qwen3-4B_bsz8_steps1000_lr1e-5_warmup0.1_ --output_dir checkpoints/moba_qwen3_Qwen3-4B_bsz8_steps1000_lr1e-5_warmup0.1_ --config_overrides_json --gradient_accumulation_steps 1 --per_device_train_batch_size 1 --per_device_eval_batch_size 1 --per_device_max_tokens 32768 --bf16 --learning_rate 1e-5 --min_lr_ratio 0.01 --lr_scheduler_type cosine --max_grad_norm 1.0 --adam_beta1 0.9 --adam_beta2 0.95 --weight_decay 0.1 --warmup_ratio 0.1 --optim adamw_torch --logging_steps 1 --log_level info --max_steps 1000 --save_steps 500 --dataloader_num_workers 1 --disable_tqdm true --use_fast_tokenizer false --remove_unused_columns false --ddp_find_unused_parameters false --cuda_empty_cache --tokenized_mds_train /workspace/mnt/qqt/public_data/qwen_mix_sft_64K3 --attention_type moba --deepspeed /workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/deepspeed_stage2.json --gradient_checkpointing", "git_remote": "https://gitee.com/lcm_lab/SparseAttn", "git_info": ["", ""], "swanlab": {"version": "0.6.8", "_monitor": 5, "logdir": "/workspace/mnt/qqt/project/NSA/SparseAttn/sparseattn/checkpoints/moba_qwen3_Qwen3-4B_bsz8_steps1000_lr1e-5_warmup0.1_/run-20251217_233019-b3ly702fm741dd93khyrv"}}
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "eos_token": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
+ {
+ "add_bos_token": false,
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "151643": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151644": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151645": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151646": {
+ "content": "<|object_ref_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151647": {
+ "content": "<|object_ref_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151648": {
+ "content": "<|box_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151649": {
+ "content": "<|box_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151650": {
+ "content": "<|quad_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151651": {
+ "content": "<|quad_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151652": {
+ "content": "<|vision_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151653": {
+ "content": "<|vision_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151654": {
+ "content": "<|vision_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151655": {
+ "content": "<|image_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151656": {
+ "content": "<|video_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151657": {
+ "content": "<tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151658": {
+ "content": "</tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151659": {
+ "content": "<|fim_prefix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151660": {
+ "content": "<|fim_middle|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151661": {
+ "content": "<|fim_suffix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151662": {
+ "content": "<|fim_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151663": {
+ "content": "<|repo_name|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151664": {
+ "content": "<|file_sep|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151665": {
+ "content": "<tool_response>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151666": {
+ "content": "</tool_response>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151667": {
+ "content": "<think>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151668": {
+ "content": "</think>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ }
+ },
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "bos_token": null,
+ "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0].role == 'system' %}\n        {{- messages[0].content + '\\n\\n' }}\n    {%- endif %}\n    {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0].role == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n    {%- set index = (messages|length - 1) - loop.index0 %}\n    {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n        {%- set ns.multi_step_tool = false %}\n        {%- set ns.last_query_index = index %}\n    {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n    {%- if message.content is string %}\n        {%- set content = message.content %}\n    {%- else %}\n        {%- set content = '' %}\n    {%- endif %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n        {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {%- set reasoning_content = '' %}\n        {%- if message.reasoning_content is string %}\n            {%- set reasoning_content = message.reasoning_content %}\n        {%- else %}\n            {%- if '</think>' in content %}\n                {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n                {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n            {%- endif %}\n        {%- endif %}\n        {%- if loop.index0 > ns.last_query_index %}\n            {%- if loop.last or (not loop.last and reasoning_content) %}\n                {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n            {%- else %}\n                {{- '<|im_start|>' + message.role + '\\n' + content }}\n            {%- endif %}\n        {%- else %}\n            {{- '<|im_start|>' + message.role + '\\n' + content }}\n        {%- endif %}\n        {%- if message.tool_calls %}\n            {%- for tool_call in message.tool_calls %}\n                {%- if (loop.first and content) or (not loop.first) %}\n                    {{- '\\n' }}\n                {%- endif %}\n                {%- if tool_call.function %}\n                    {%- set tool_call = tool_call.function %}\n                {%- endif %}\n                {{- '<tool_call>\\n{\"name\": \"' }}\n                {{- tool_call.name }}\n                {{- '\", \"arguments\": ' }}\n                {%- if tool_call.arguments is string %}\n                    {{- tool_call.arguments }}\n                {%- else %}\n                    {{- tool_call.arguments | tojson }}\n                {%- endif %}\n                {{- '}\\n</tool_call>' }}\n            {%- endfor %}\n        {%- endif %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n    {%- if enable_thinking is defined and enable_thinking is false %}\n        {{- '<think>\\n\\n</think>\\n\\n' }}\n    {%- endif %}\n{%- endif %}",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|im_end|>",
+ "errors": "replace",
+ "extra_special_tokens": {},
+ "model_max_length": 131072,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+ }
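The chat_template above is the Qwen3-style ChatML template: it wraps turns in `<|im_start|>`/`<|im_end|>`, renders tool calls and tool responses, and either strips or injects `<think>` blocks depending on the `enable_thinking` template variable. A sketch of rendering a prompt with it, assuming a local copy of this repo (the path is illustrative):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./moba_qwen3-4b")  # hypothetical local path
messages = [{"role": "user", "content": "Hello"}]

# With enable_thinking=False the template appends an empty <think>...</think>
# block after the assistant header, per the final branch of the template above.
text = tok.apply_chat_template(messages, tokenize=False,
                               add_generation_prompt=True,
                               enable_thinking=False)
print(text)
```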
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 0.2249212775528565,
+ "num_input_tokens_seen": 262144000,
+ "train_loss": 1.4907063425183296,
+ "train_runtime": 24201.9893,
+ "train_samples_per_second": 0.331,
+ "train_steps_per_second": 0.041
+ }
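These totals are internally consistent with the launch command recorded in swanlab-metadata.json above: 1000 steps at 1 sequence per device across 8 GPUs with `--per_device_max_tokens 32768` gives 1000 × 8 × 32768 = 262,144,000 = num_input_tokens_seen, while 1000 / 24201.9893 ≈ 0.041 steps/s and 8000 / 24201.9893 ≈ 0.331 samples/s. A one-line check:

```python
# Reproduce the throughput numbers from the step/sequence/token counts above.
steps, seqs_per_step, tokens_per_seq, runtime_s = 1000, 8, 32768, 24201.9893
print(steps * seqs_per_step * tokens_per_seq)              # 262144000
print(round(steps / runtime_s, 3),                         # 0.041 steps/s
      round(steps * seqs_per_step / runtime_s, 3))         # 0.331 samples/s
```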
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:17200aec4291ed92d4b4ca90c01cf19251e0275c0fb72c676135f06fee0303f4
+ size 8376
vocab.json ADDED
The diff for this file is too large to render. See raw diff