p1k0 commited on
Commit
39df6f2
·
verified ·
1 Parent(s): 379170c

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +12 -0
  2. swanlog/.gitignore +1 -0
  3. swanlog/run-20250628_232758-a3b1799d/backup.swanlab +3 -0
  4. swanlog/run-20250628_232758-a3b1799d/files/config.yaml +990 -0
  5. swanlog/run-20250628_232758-a3b1799d/files/requirements.txt +296 -0
  6. swanlog/run-20250628_232758-a3b1799d/files/swanlab-metadata.json +1 -0
  7. swanlog/run-20250628_234855-a3b1799d/backup.swanlab +3 -0
  8. swanlog/run-20250628_234855-a3b1799d/files/config.yaml +990 -0
  9. swanlog/run-20250628_234855-a3b1799d/files/requirements.txt +296 -0
  10. swanlog/run-20250628_234855-a3b1799d/files/swanlab-metadata.json +1 -0
  11. swanlog/run-20250629_082850-a3b1799d/backup.swanlab +3 -0
  12. swanlog/run-20250629_082850-a3b1799d/files/config.yaml +990 -0
  13. swanlog/run-20250629_082850-a3b1799d/files/requirements.txt +296 -0
  14. swanlog/run-20250629_082850-a3b1799d/files/swanlab-metadata.json +1 -0
  15. swanlog/run-20250629_084639-a3b1799d/backup.swanlab +3 -0
  16. swanlog/run-20250629_084639-a3b1799d/files/config.yaml +990 -0
  17. swanlog/run-20250629_084639-a3b1799d/files/requirements.txt +296 -0
  18. swanlog/run-20250629_084639-a3b1799d/files/swanlab-metadata.json +1 -0
  19. swanlog/run-20250629_090551-a3b1799d/backup.swanlab +3 -0
  20. swanlog/run-20250629_090551-a3b1799d/files/config.yaml +990 -0
  21. swanlog/run-20250629_090551-a3b1799d/files/requirements.txt +296 -0
  22. swanlog/run-20250629_090551-a3b1799d/files/swanlab-metadata.json +1 -0
  23. swanlog/run-20250629_092305-a3b1799d/backup.swanlab +3 -0
  24. swanlog/run-20250629_092305-a3b1799d/files/config.yaml +990 -0
  25. swanlog/run-20250629_092305-a3b1799d/files/requirements.txt +296 -0
  26. swanlog/run-20250629_092305-a3b1799d/files/swanlab-metadata.json +1 -0
  27. swanlog/run-20250629_094047-a3b1799d/backup.swanlab +0 -0
  28. swanlog/run-20250629_094047-a3b1799d/files/config.yaml +990 -0
  29. swanlog/run-20250629_094047-a3b1799d/files/requirements.txt +296 -0
  30. swanlog/run-20250629_094047-a3b1799d/files/swanlab-metadata.json +1 -0
  31. swanlog/run-20250629_094305-a3b1799d/backup.swanlab +3 -0
  32. swanlog/run-20250629_094305-a3b1799d/files/config.yaml +990 -0
  33. swanlog/run-20250629_094305-a3b1799d/files/requirements.txt +296 -0
  34. swanlog/run-20250629_094305-a3b1799d/files/swanlab-metadata.json +1 -0
  35. swanlog/run-20250629_101310-a3b1799d/backup.swanlab +3 -0
  36. swanlog/run-20250629_101310-a3b1799d/files/config.yaml +990 -0
  37. swanlog/run-20250629_101310-a3b1799d/files/requirements.txt +296 -0
  38. swanlog/run-20250629_101310-a3b1799d/files/swanlab-metadata.json +1 -0
  39. swanlog/run-20250629_111950-a3b1799d/backup.swanlab +3 -0
  40. swanlog/run-20250629_111950-a3b1799d/files/config.yaml +990 -0
  41. swanlog/run-20250629_111950-a3b1799d/files/requirements.txt +296 -0
  42. swanlog/run-20250629_111950-a3b1799d/files/swanlab-metadata.json +1 -0
  43. swanlog/run-20250629_120036-a3b1799d/backup.swanlab +3 -0
  44. swanlog/run-20250629_120036-a3b1799d/files/config.yaml +1004 -0
  45. swanlog/run-20250629_120036-a3b1799d/files/requirements.txt +296 -0
  46. swanlog/run-20250629_120036-a3b1799d/files/swanlab-metadata.json +1 -0
  47. swanlog/run-20250629_184555-a3b1799d/backup.swanlab +3 -0
  48. swanlog/run-20250629_184555-a3b1799d/files/config.yaml +986 -0
  49. swanlog/run-20250629_184555-a3b1799d/files/requirements.txt +296 -0
  50. swanlog/run-20250629_184555-a3b1799d/files/swanlab-metadata.json +1 -0
.gitattributes CHANGED
@@ -33,3 +33,15 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ swanlog/run-20250628_232758-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text
37
+ swanlog/run-20250628_234855-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text
38
+ swanlog/run-20250629_082850-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text
39
+ swanlog/run-20250629_084639-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text
40
+ swanlog/run-20250629_090551-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text
41
+ swanlog/run-20250629_092305-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text
42
+ swanlog/run-20250629_094305-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text
43
+ swanlog/run-20250629_101310-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text
44
+ swanlog/run-20250629_111950-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text
45
+ swanlog/run-20250629_120036-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text
46
+ swanlog/run-20250629_184555-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text
47
+ swanlog/run-20250629_190303-a3b1799d/backup.swanlab filter=lfs diff=lfs merge=lfs -text
swanlog/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *
swanlog/run-20250628_232758-a3b1799d/backup.swanlab ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48a8adba074b9f59af2ffb69a9df81e9ae8ea09cf3f91acf0aaf0e8ccdd5215a
3
+ size 871438
swanlog/run-20250628_232758-a3b1799d/files/config.yaml ADDED
@@ -0,0 +1,990 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FRAMEWORK:
2
+ desc: ''
3
+ sort: 1
4
+ value: 🤗transformers
5
+ UPPERFRAME:
6
+ desc: ''
7
+ sort: 0
8
+ value: 🐦‍⬛ms-swift
9
+ _attn_implementation_autoset:
10
+ desc: ''
11
+ sort: 74
12
+ value: true
13
+ _name_or_path:
14
+ desc: ''
15
+ sort: 73
16
+ value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct
17
+ acc_steps:
18
+ desc: ''
19
+ sort: 223
20
+ value: 1
21
+ acc_strategy:
22
+ desc: ''
23
+ sort: 213
24
+ value: token
25
+ accelerator_config:
26
+ desc: ''
27
+ sort: 156
28
+ value:
29
+ dispatch_batches: false
30
+ even_batches: true
31
+ gradient_accumulation_kwargs: null
32
+ non_blocking: false
33
+ split_batches: false
34
+ use_seedable_sampler: true
35
+ adafactor:
36
+ desc: ''
37
+ sort: 161
38
+ value: false
39
+ adam_beta1:
40
+ desc: ''
41
+ sort: 94
42
+ value: 0.9
43
+ adam_beta2:
44
+ desc: ''
45
+ sort: 95
46
+ value: 0.95
47
+ adam_epsilon:
48
+ desc: ''
49
+ sort: 96
50
+ value: 1.0e-08
51
+ add_cross_attention:
52
+ desc: ''
53
+ sort: 33
54
+ value: false
55
+ aligner_lr:
56
+ desc: ''
57
+ sort: 216
58
+ value: null
59
+ architectures:
60
+ desc: ''
61
+ sort: 60
62
+ value:
63
+ - LlamaForCausalLM
64
+ attention_bias:
65
+ desc: ''
66
+ sort: 16
67
+ value: false
68
+ attention_dropout:
69
+ desc: ''
70
+ sort: 17
71
+ value: 0.0
72
+ auto_find_batch_size:
73
+ desc: ''
74
+ sort: 189
75
+ value: false
76
+ average_tokens_across_devices:
77
+ desc: ''
78
+ sort: 205
79
+ value: false
80
+ bad_words_ids:
81
+ desc: ''
82
+ sort: 50
83
+ value: null
84
+ batch_eval_metrics:
85
+ desc: ''
86
+ sort: 201
87
+ value: false
88
+ begin_suppress_tokens:
89
+ desc: ''
90
+ sort: 59
91
+ value: null
92
+ bf16:
93
+ desc: ''
94
+ sort: 126
95
+ value: true
96
+ bf16_full_eval:
97
+ desc: ''
98
+ sort: 130
99
+ value: false
100
+ bos_token_id:
101
+ desc: ''
102
+ sort: 66
103
+ value: 128000
104
+ channels:
105
+ desc: ''
106
+ sort: 220
107
+ value: null
108
+ check_model:
109
+ desc: ''
110
+ sort: 212
111
+ value: true
112
+ chunk_size_feed_forward:
113
+ desc: ''
114
+ sort: 29
115
+ value: 0
116
+ cross_attention_hidden_size:
117
+ desc: ''
118
+ sort: 32
119
+ value: null
120
+ data_seed:
121
+ desc: ''
122
+ sort: 123
123
+ value: 42
124
+ dataloader_drop_last:
125
+ desc: ''
126
+ sort: 138
127
+ value: false
128
+ dataloader_num_workers:
129
+ desc: ''
130
+ sort: 140
131
+ value: 4
132
+ dataloader_persistent_workers:
133
+ desc: ''
134
+ sort: 169
135
+ value: false
136
+ dataloader_pin_memory:
137
+ desc: ''
138
+ sort: 168
139
+ value: true
140
+ dataloader_prefetch_factor:
141
+ desc: ''
142
+ sort: 141
143
+ value: 10
144
+ ddp_backend:
145
+ desc: ''
146
+ sort: 134
147
+ value: null
148
+ ddp_broadcast_buffers:
149
+ desc: ''
150
+ sort: 167
151
+ value: null
152
+ ddp_bucket_cap_mb:
153
+ desc: ''
154
+ sort: 166
155
+ value: null
156
+ ddp_find_unused_parameters:
157
+ desc: ''
158
+ sort: 165
159
+ value: null
160
+ ddp_timeout:
161
+ desc: ''
162
+ sort: 193
163
+ value: 18000000
164
+ debug:
165
+ desc: ''
166
+ sort: 137
167
+ value: []
168
+ decoder_start_token_id:
169
+ desc: ''
170
+ sort: 70
171
+ value: null
172
+ deepspeed:
173
+ desc: ''
174
+ sort: 157
175
+ value:
176
+ bf16:
177
+ enabled: auto
178
+ fp16:
179
+ enabled: auto
180
+ hysteresis: 2
181
+ initial_scale_power: 16
182
+ loss_scale: 0
183
+ loss_scale_window: 1000
184
+ min_loss_scale: 1
185
+ gradient_accumulation_steps: auto
186
+ gradient_clipping: auto
187
+ steps_per_print: 2000
188
+ train_batch_size: auto
189
+ train_micro_batch_size_per_gpu: auto
190
+ wall_clock_breakdown: false
191
+ zero_optimization:
192
+ contiguous_gradients: true
193
+ offload_optimizer:
194
+ device: none
195
+ pin_memory: true
196
+ offload_param:
197
+ device: none
198
+ pin_memory: true
199
+ overlap_comm: false
200
+ reduce_bucket_size: auto
201
+ stage: 3
202
+ stage3_gather_16bit_weights_on_model_save: true
203
+ stage3_max_live_parameters: 1000000000.0
204
+ stage3_max_reuse_distance: 1000000000.0
205
+ stage3_param_persistence_threshold: auto
206
+ stage3_prefetch_bucket_size: auto
207
+ sub_group_size: 1000000000.0
208
+ zero_quantized_gradients: false
209
+ zero_quantized_weights: false
210
+ disable_tqdm:
211
+ desc: ''
212
+ sort: 144
213
+ value: false
214
+ diversity_penalty:
215
+ desc: ''
216
+ sort: 41
217
+ value: 0.0
218
+ do_eval:
219
+ desc: ''
220
+ sort: 80
221
+ value: true
222
+ do_predict:
223
+ desc: ''
224
+ sort: 81
225
+ value: false
226
+ do_sample:
227
+ desc: ''
228
+ sort: 37
229
+ value: false
230
+ do_train:
231
+ desc: ''
232
+ sort: 79
233
+ value: false
234
+ early_stopping:
235
+ desc: ''
236
+ sort: 38
237
+ value: false
238
+ encoder_no_repeat_ngram_size:
239
+ desc: ''
240
+ sort: 49
241
+ value: 0
242
+ eos_token_id:
243
+ desc: ''
244
+ sort: 68
245
+ value:
246
+ - 128001
247
+ - 128008
248
+ - 128009
249
+ eval_accumulation_steps:
250
+ desc: ''
251
+ sort: 89
252
+ value: null
253
+ eval_datasets:
254
+ desc: ''
255
+ sort: 225
256
+ value: []
257
+ eval_datasets_args:
258
+ desc: ''
259
+ sort: 227
260
+ value: null
261
+ eval_delay:
262
+ desc: ''
263
+ sort: 90
264
+ value: 0
265
+ eval_do_concat_batches:
266
+ desc: ''
267
+ sort: 183
268
+ value: true
269
+ eval_generation_config:
270
+ desc: ''
271
+ sort: 228
272
+ value: null
273
+ eval_limit:
274
+ desc: ''
275
+ sort: 226
276
+ value: null
277
+ eval_on_start:
278
+ desc: ''
279
+ sort: 202
280
+ value: false
281
+ eval_steps:
282
+ desc: ''
283
+ sort: 139
284
+ value: null
285
+ eval_strategy:
286
+ desc: ''
287
+ sort: 82
288
+ value: epoch
289
+ eval_use_evalscope:
290
+ desc: ''
291
+ sort: 224
292
+ value: false
293
+ eval_use_gather_object:
294
+ desc: ''
295
+ sort: 204
296
+ value: false
297
+ exponential_decay_length_penalty:
298
+ desc: ''
299
+ sort: 57
300
+ value: null
301
+ finetuning_task:
302
+ desc: ''
303
+ sort: 61
304
+ value: null
305
+ forced_bos_token_id:
306
+ desc: ''
307
+ sort: 54
308
+ value: null
309
+ forced_eos_token_id:
310
+ desc: ''
311
+ sort: 55
312
+ value: null
313
+ fp16:
314
+ desc: ''
315
+ sort: 127
316
+ value: false
317
+ fp16_backend:
318
+ desc: ''
319
+ sort: 184
320
+ value: auto
321
+ fp16_full_eval:
322
+ desc: ''
323
+ sort: 131
324
+ value: false
325
+ fp16_opt_level:
326
+ desc: ''
327
+ sort: 128
328
+ value: O1
329
+ fsdp:
330
+ desc: ''
331
+ sort: 151
332
+ value: []
333
+ fsdp_config:
334
+ desc: ''
335
+ sort: 153
336
+ value:
337
+ min_num_params: 0
338
+ xla: false
339
+ xla_fsdp_grad_ckpt: false
340
+ xla_fsdp_v2: false
341
+ fsdp_min_num_params:
342
+ desc: ''
343
+ sort: 152
344
+ value: 0
345
+ fsdp_num:
346
+ desc: ''
347
+ sort: 222
348
+ value: 1
349
+ fsdp_transformer_layer_cls_to_wrap:
350
+ desc: ''
351
+ sort: 155
352
+ value: null
353
+ full_determinism:
354
+ desc: ''
355
+ sort: 190
356
+ value: false
357
+ galore_config:
358
+ desc: ''
359
+ sort: 231
360
+ value: null
361
+ generation_config:
362
+ desc: ''
363
+ sort: 210
364
+ value: null
365
+ generation_max_length:
366
+ desc: ''
367
+ sort: 208
368
+ value: null
369
+ generation_num_beams:
370
+ desc: ''
371
+ sort: 209
372
+ value: null
373
+ gradient_accumulation_steps:
374
+ desc: ''
375
+ sort: 88
376
+ value: 2
377
+ gradient_checkpointing:
378
+ desc: ''
379
+ sort: 179
380
+ value: false
381
+ gradient_checkpointing_kwargs:
382
+ desc: ''
383
+ sort: 180
384
+ value: null
385
+ greater_is_better:
386
+ desc: ''
387
+ sort: 149
388
+ value: false
389
+ group_by_length:
390
+ desc: ''
391
+ sort: 162
392
+ value: false
393
+ half_precision_backend:
394
+ desc: ''
395
+ sort: 129
396
+ value: auto
397
+ head_dim:
398
+ desc: ''
399
+ sort: 19
400
+ value: 128
401
+ hidden_act:
402
+ desc: ''
403
+ sort: 9
404
+ value: silu
405
+ hidden_size:
406
+ desc: ''
407
+ sort: 4
408
+ value: 4096
409
+ hub_always_push:
410
+ desc: ''
411
+ sort: 178
412
+ value: false
413
+ hub_model_id:
414
+ desc: ''
415
+ sort: 174
416
+ value: null
417
+ hub_private_repo:
418
+ desc: ''
419
+ sort: 177
420
+ value: null
421
+ hub_strategy:
422
+ desc: ''
423
+ sort: 175
424
+ value: every_save
425
+ hub_token:
426
+ desc: ''
427
+ sort: 176
428
+ value: <HUB_TOKEN>
429
+ id2label:
430
+ desc: ''
431
+ sort: 62
432
+ value:
433
+ '0': LABEL_0
434
+ '1': LABEL_1
435
+ ignore_data_skip:
436
+ desc: ''
437
+ sort: 150
438
+ value: false
439
+ include_for_metrics:
440
+ desc: ''
441
+ sort: 182
442
+ value: []
443
+ include_inputs_for_metrics:
444
+ desc: ''
445
+ sort: 181
446
+ value: false
447
+ include_num_input_tokens_seen:
448
+ desc: ''
449
+ sort: 198
450
+ value: false
451
+ include_tokens_per_second:
452
+ desc: ''
453
+ sort: 197
454
+ value: false
455
+ initializer_range:
456
+ desc: ''
457
+ sort: 10
458
+ value: 0.02
459
+ intermediate_size:
460
+ desc: ''
461
+ sort: 5
462
+ value: 14336
463
+ is_decoder:
464
+ desc: ''
465
+ sort: 31
466
+ value: false
467
+ is_encoder_decoder:
468
+ desc: ''
469
+ sort: 30
470
+ value: false
471
+ jit_mode_eval:
472
+ desc: ''
473
+ sort: 124
474
+ value: false
475
+ label2id:
476
+ desc: ''
477
+ sort: 63
478
+ value:
479
+ LABEL_0: 0
480
+ LABEL_1: 1
481
+ label_names:
482
+ desc: ''
483
+ sort: 146
484
+ value: null
485
+ label_smoothing_factor:
486
+ desc: ''
487
+ sort: 158
488
+ value: 0.0
489
+ learning_rate:
490
+ desc: ''
491
+ sort: 92
492
+ value: 1.0e-05
493
+ length_column_name:
494
+ desc: ''
495
+ sort: 163
496
+ value: length
497
+ length_penalty:
498
+ desc: ''
499
+ sort: 47
500
+ value: 1.0
501
+ load_best_model_at_end:
502
+ desc: ''
503
+ sort: 147
504
+ value: false
505
+ local_rank:
506
+ desc: ''
507
+ sort: 133
508
+ value: 0
509
+ local_repo_path:
510
+ desc: ''
511
+ sort: 230
512
+ value: null
513
+ log_level:
514
+ desc: ''
515
+ sort: 104
516
+ value: passive
517
+ log_level_replica:
518
+ desc: ''
519
+ sort: 105
520
+ value: warning
521
+ log_on_each_node:
522
+ desc: ''
523
+ sort: 106
524
+ value: true
525
+ logging_dir:
526
+ desc: ''
527
+ sort: 107
528
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1/v1-20250628-232707/runs
529
+ logging_first_step:
530
+ desc: ''
531
+ sort: 109
532
+ value: true
533
+ logging_nan_inf_filter:
534
+ desc: ''
535
+ sort: 111
536
+ value: true
537
+ logging_steps:
538
+ desc: ''
539
+ sort: 110
540
+ value: 1
541
+ logging_strategy:
542
+ desc: ''
543
+ sort: 108
544
+ value: steps
545
+ lr_scheduler_kwargs:
546
+ desc: ''
547
+ sort: 101
548
+ value: null
549
+ lr_scheduler_type:
550
+ desc: ''
551
+ sort: 100
552
+ value: cosine
553
+ max_epochs:
554
+ desc: ''
555
+ sort: 215
556
+ value: null
557
+ max_grad_norm:
558
+ desc: ''
559
+ sort: 97
560
+ value: 1.0
561
+ max_length:
562
+ desc: ''
563
+ sort: 35
564
+ value: 20
565
+ max_position_embeddings:
566
+ desc: ''
567
+ sort: 3
568
+ value: 131072
569
+ max_steps:
570
+ desc: ''
571
+ sort: 99
572
+ value: -1
573
+ metric_for_best_model:
574
+ desc: ''
575
+ sort: 148
576
+ value: loss
577
+ metric_warmup_step:
578
+ desc: ''
579
+ sort: 221
580
+ value: 0
581
+ min_length:
582
+ desc: ''
583
+ sort: 36
584
+ value: 0
585
+ mlp_bias:
586
+ desc: ''
587
+ sort: 18
588
+ value: false
589
+ model_num_parameters:
590
+ desc: ''
591
+ sort: 232
592
+ value: 0
593
+ model_type:
594
+ desc: ''
595
+ sort: 76
596
+ value: llama
597
+ mp_parameters:
598
+ desc: ''
599
+ sort: 188
600
+ value: ''
601
+ neftune_noise_alpha:
602
+ desc: ''
603
+ sort: 199
604
+ value: null
605
+ no_cuda:
606
+ desc: ''
607
+ sort: 119
608
+ value: false
609
+ no_repeat_ngram_size:
610
+ desc: ''
611
+ sort: 48
612
+ value: 0
613
+ num_attention_heads:
614
+ desc: ''
615
+ sort: 7
616
+ value: 32
617
+ num_beam_groups:
618
+ desc: ''
619
+ sort: 40
620
+ value: 1
621
+ num_beams:
622
+ desc: ''
623
+ sort: 39
624
+ value: 1
625
+ num_hidden_layers:
626
+ desc: ''
627
+ sort: 6
628
+ value: 32
629
+ num_key_value_heads:
630
+ desc: ''
631
+ sort: 8
632
+ value: 8
633
+ num_return_sequences:
634
+ desc: ''
635
+ sort: 51
636
+ value: 1
637
+ num_train_epochs:
638
+ desc: ''
639
+ sort: 98
640
+ value: 5.0
641
+ optim:
642
+ desc: ''
643
+ sort: 159
644
+ value: adamw_torch
645
+ optim_args:
646
+ desc: ''
647
+ sort: 160
648
+ value: null
649
+ optim_target_modules:
650
+ desc: ''
651
+ sort: 200
652
+ value: null
653
+ optimizer:
654
+ desc: ''
655
+ sort: 218
656
+ value: null
657
+ output_attentions:
658
+ desc: ''
659
+ sort: 22
660
+ value: false
661
+ output_dir:
662
+ desc: ''
663
+ sort: 77
664
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1/v1-20250628-232707
665
+ output_hidden_states:
666
+ desc: ''
667
+ sort: 21
668
+ value: false
669
+ output_scores:
670
+ desc: ''
671
+ sort: 52
672
+ value: false
673
+ overwrite_output_dir:
674
+ desc: ''
675
+ sort: 78
676
+ value: false
677
+ pad_token_id:
678
+ desc: ''
679
+ sort: 67
680
+ value: 128009
681
+ past_index:
682
+ desc: ''
683
+ sort: 142
684
+ value: -1
685
+ per_device_eval_batch_size:
686
+ desc: ''
687
+ sort: 85
688
+ value: 2
689
+ per_device_train_batch_size:
690
+ desc: ''
691
+ sort: 84
692
+ value: 2
693
+ per_gpu_eval_batch_size:
694
+ desc: ''
695
+ sort: 87
696
+ value: null
697
+ per_gpu_train_batch_size:
698
+ desc: ''
699
+ sort: 86
700
+ value: null
701
+ predict_with_generate:
702
+ desc: ''
703
+ sort: 207
704
+ value: false
705
+ prediction_loss_only:
706
+ desc: ''
707
+ sort: 83
708
+ value: false
709
+ prefix:
710
+ desc: ''
711
+ sort: 65
712
+ value: null
713
+ pretraining_tp:
714
+ desc: ''
715
+ sort: 12
716
+ value: 1
717
+ problem_type:
718
+ desc: ''
719
+ sort: 72
720
+ value: null
721
+ pruned_heads:
722
+ desc: ''
723
+ sort: 27
724
+ value: {}
725
+ push_to_hub:
726
+ desc: ''
727
+ sort: 172
728
+ value: false
729
+ push_to_hub_model_id:
730
+ desc: ''
731
+ sort: 185
732
+ value: null
733
+ push_to_hub_organization:
734
+ desc: ''
735
+ sort: 186
736
+ value: null
737
+ push_to_hub_token:
738
+ desc: ''
739
+ sort: 187
740
+ value: <PUSH_TO_HUB_TOKEN>
741
+ ray_scope:
742
+ desc: ''
743
+ sort: 192
744
+ value: last
745
+ remove_invalid_values:
746
+ desc: ''
747
+ sort: 56
748
+ value: false
749
+ remove_unused_columns:
750
+ desc: ''
751
+ sort: 145
752
+ value: false
753
+ repetition_penalty:
754
+ desc: ''
755
+ sort: 46
756
+ value: 1.0
757
+ report_to:
758
+ desc: ''
759
+ sort: 164
760
+ value:
761
+ - swanlab
762
+ restore_callback_states_from_checkpoint:
763
+ desc: ''
764
+ sort: 118
765
+ value: false
766
+ resume_from_checkpoint:
767
+ desc: ''
768
+ sort: 173
769
+ value: null
770
+ return_dict:
771
+ desc: ''
772
+ sort: 20
773
+ value: true
774
+ return_dict_in_generate:
775
+ desc: ''
776
+ sort: 53
777
+ value: false
778
+ rms_norm_eps:
779
+ desc: ''
780
+ sort: 11
781
+ value: 1.0e-05
782
+ rope_scaling:
783
+ desc: ''
784
+ sort: 15
785
+ value:
786
+ factor: 8.0
787
+ high_freq_factor: 4.0
788
+ low_freq_factor: 1.0
789
+ original_max_position_embeddings: 8192
790
+ rope_type: llama3
791
+ rope_theta:
792
+ desc: ''
793
+ sort: 14
794
+ value: 500000.0
795
+ run_name:
796
+ desc: ''
797
+ sort: 143
798
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1/v1-20250628-232707
799
+ save_on_each_node:
800
+ desc: ''
801
+ sort: 116
802
+ value: false
803
+ save_only_model:
804
+ desc: ''
805
+ sort: 117
806
+ value: false
807
+ save_safetensors:
808
+ desc: ''
809
+ sort: 115
810
+ value: true
811
+ save_steps:
812
+ desc: ''
813
+ sort: 113
814
+ value: 500
815
+ save_strategy:
816
+ desc: ''
817
+ sort: 112
818
+ value: steps
819
+ save_total_limit:
820
+ desc: ''
821
+ sort: 114
822
+ value: 1
823
+ seed:
824
+ desc: ''
825
+ sort: 122
826
+ value: 42
827
+ sep_token_id:
828
+ desc: ''
829
+ sort: 69
830
+ value: null
831
+ skip_memory_metrics:
832
+ desc: ''
833
+ sort: 170
834
+ value: true
835
+ sortish_sampler:
836
+ desc: ''
837
+ sort: 206
838
+ value: false
839
+ suppress_tokens:
840
+ desc: ''
841
+ sort: 58
842
+ value: null
843
+ task_specific_params:
844
+ desc: ''
845
+ sort: 71
846
+ value: null
847
+ temperature:
848
+ desc: ''
849
+ sort: 42
850
+ value: 1.0
851
+ tf32:
852
+ desc: ''
853
+ sort: 132
854
+ value: null
855
+ tf_legacy_loss:
856
+ desc: ''
857
+ sort: 26
858
+ value: false
859
+ tie_encoder_decoder:
860
+ desc: ''
861
+ sort: 34
862
+ value: false
863
+ tie_word_embeddings:
864
+ desc: ''
865
+ sort: 28
866
+ value: false
867
+ tokenizer_class:
868
+ desc: ''
869
+ sort: 64
870
+ value: null
871
+ top_k:
872
+ desc: ''
873
+ sort: 43
874
+ value: 50
875
+ top_p:
876
+ desc: ''
877
+ sort: 44
878
+ value: 1.0
879
+ torch_compile:
880
+ desc: ''
881
+ sort: 194
882
+ value: false
883
+ torch_compile_backend:
884
+ desc: ''
885
+ sort: 195
886
+ value: null
887
+ torch_compile_mode:
888
+ desc: ''
889
+ sort: 196
890
+ value: null
891
+ torch_dtype:
892
+ desc: ''
893
+ sort: 24
894
+ value: bfloat16
895
+ torch_empty_cache_steps:
896
+ desc: ''
897
+ sort: 91
898
+ value: null
899
+ torchdynamo:
900
+ desc: ''
901
+ sort: 191
902
+ value: null
903
+ torchscript:
904
+ desc: ''
905
+ sort: 23
906
+ value: false
907
+ tp_size:
908
+ desc: ''
909
+ sort: 154
910
+ value: 0
911
+ tpu_metrics_debug:
912
+ desc: ''
913
+ sort: 136
914
+ value: false
915
+ tpu_num_cores:
916
+ desc: ''
917
+ sort: 135
918
+ value: null
919
+ train_dataloader_shuffle:
920
+ desc: ''
921
+ sort: 214
922
+ value: true
923
+ train_type:
924
+ desc: ''
925
+ sort: 229
926
+ value: full
927
+ transformers_version:
928
+ desc: ''
929
+ sort: 75
930
+ value: 4.51.3
931
+ typical_p:
932
+ desc: ''
933
+ sort: 45
934
+ value: 1.0
935
+ use_bfloat16:
936
+ desc: ''
937
+ sort: 25
938
+ value: false
939
+ use_cache:
940
+ desc: ''
941
+ sort: 13
942
+ value: false
943
+ use_cpu:
944
+ desc: ''
945
+ sort: 120
946
+ value: false
947
+ use_ipex:
948
+ desc: ''
949
+ sort: 125
950
+ value: false
951
+ use_legacy_prediction_loop:
952
+ desc: ''
953
+ sort: 171
954
+ value: false
955
+ use_liger_kernel:
956
+ desc: ''
957
+ sort: 203
958
+ value: false
959
+ use_logits_to_keep:
960
+ desc: ''
961
+ sort: 219
962
+ value: null
963
+ use_mps_device:
964
+ desc: ''
965
+ sort: 121
966
+ value: false
967
+ vit_gradient_checkpointing:
968
+ desc: ''
969
+ sort: 211
970
+ value: true
971
+ vit_lr:
972
+ desc: ''
973
+ sort: 217
974
+ value: null
975
+ vocab_size:
976
+ desc: ''
977
+ sort: 2
978
+ value: 128256
979
+ warmup_ratio:
980
+ desc: ''
981
+ sort: 102
982
+ value: 0.05
983
+ warmup_steps:
984
+ desc: ''
985
+ sort: 103
986
+ value: 0
987
+ weight_decay:
988
+ desc: ''
989
+ sort: 93
990
+ value: 0.0001
swanlog/run-20250628_232758-a3b1799d/files/requirements.txt ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.2.1
2
+ accelerate==1.6.0
3
+ addict==2.4.0
4
+ aiofiles==24.1.0
5
+ aiohappyeyeballs==2.6.1
6
+ aiohttp==3.11.14
7
+ aiosignal==1.3.2
8
+ airportsdata==20250224
9
+ aliyun-python-sdk-core==2.16.0
10
+ aliyun-python-sdk-kms==2.16.5
11
+ altair==5.5.0
12
+ annotated-types==0.7.0
13
+ antlr4-python3-runtime==4.13.2
14
+ anyio==4.9.0
15
+ astor==0.8.1
16
+ async-timeout==5.0.1
17
+ attrdict==2.0.1
18
+ attrs==25.3.0
19
+ av==14.3.0
20
+ beautifulsoup4==4.13.3
21
+ binpacking==1.5.2
22
+ bitsandbytes==0.45.5
23
+ blake3==1.0.4
24
+ blinker==1.9.0
25
+ boto3==1.38.46
26
+ botocore==1.38.46
27
+ cachetools==5.5.2
28
+ certifi==2025.1.31
29
+ cffi==1.17.1
30
+ charset-normalizer==3.4.1
31
+ click==8.1.8
32
+ cloudpickle==3.1.1
33
+ colorama==0.4.6
34
+ compressed-tensors==0.9.4
35
+ contourpy==1.3.1
36
+ cpm-kernels==1.0.11
37
+ crcmod==1.7
38
+ cryptography==44.0.3
39
+ cupy-cuda12x==13.4.1
40
+ cycler==0.12.1
41
+ dacite==1.9.2
42
+ dashscope==1.23.3
43
+ datasets==3.2.0
44
+ decord==0.6.0
45
+ deepspeed==0.16.5
46
+ Deprecated==1.2.18
47
+ depyf==0.18.0
48
+ dill==0.3.8
49
+ diskcache==5.6.3
50
+ distro==1.9.0
51
+ dnspython==2.7.0
52
+ docker-pycreds==0.4.0
53
+ einops==0.6.1
54
+ einops-exts==0.0.4
55
+ email_validator==2.2.0
56
+ entmax==1.3
57
+ et_xmlfile==2.0.0
58
+ exceptiongroup==1.2.2
59
+ fastapi==0.115.12
60
+ fastapi-cli==0.0.7
61
+ fastrlock==0.8.3
62
+ ffmpy==0.5.0
63
+ filelock==3.18.0
64
+ flash_attn==2.7.4.post1
65
+ fonttools==4.56.0
66
+ frozenlist==1.5.0
67
+ fsspec==2024.9.0
68
+ future==1.0.0
69
+ gdown==5.2.0
70
+ gguf==0.16.3
71
+ gitdb==4.0.12
72
+ GitPython==3.1.44
73
+ googleapis-common-protos==1.70.0
74
+ gradio==5.29.0
75
+ gradio_client==1.10.0
76
+ groovy==0.1.2
77
+ grpcio==1.71.0
78
+ h11==0.16.0
79
+ hf-xet==1.1.2
80
+ hjson==3.1.0
81
+ httpcore==1.0.9
82
+ httptools==0.6.4
83
+ httpx==0.28.1
84
+ huggingface-hub==0.32.2
85
+ idna==3.10
86
+ imageio==2.37.0
87
+ importlib_metadata==8.0.0
88
+ interegular==0.3.3
89
+ jieba==0.42.1
90
+ Jinja2==3.1.6
91
+ jiter==0.9.0
92
+ jmespath==0.10.0
93
+ joblib==1.4.2
94
+ jsonargparse==3.13.1
95
+ jsonschema==4.23.0
96
+ jsonschema-specifications==2024.10.1
97
+ kiwisolver==1.4.8
98
+ lark==1.2.2
99
+ latex2mathml==3.77.0
100
+ latex2sympy2_extended==1.10.1
101
+ lightning-utilities==0.14.3
102
+ linkify-it-py==2.0.3
103
+ llguidance==0.7.19
104
+ llvmlite==0.44.0
105
+ lm-format-enforcer==0.10.11
106
+ lxml==5.4.0
107
+ Markdown==3.7
108
+ markdown-it-py==2.2.0
109
+ markdown2==2.5.3
110
+ MarkupSafe==3.0.2
111
+ math-verify==0.7.0
112
+ matplotlib==3.10.1
113
+ mdit-py-plugins==0.3.3
114
+ mdurl==0.1.2
115
+ mistral_common==1.5.4
116
+ mmcls==0.25.0
117
+ mmcv==2.2.0
118
+ mmcv-full==1.6.2
119
+ mmengine==0.10.7
120
+ mmsegmentation==0.30.0
121
+ model-index==0.1.11
122
+ modelscope==1.25.0
123
+ mpmath==1.3.0
124
+ ms_swift==3.5.0
125
+ msgpack==1.1.0
126
+ msgspec==0.19.0
127
+ multidict==6.2.0
128
+ multiprocess==0.70.16
129
+ narwhals==1.32.0
130
+ nest-asyncio==1.6.0
131
+ networkx==3.4.2
132
+ ninja==1.11.1.4
133
+ nltk==3.9.1
134
+ numba==0.61.2
135
+ numpy==1.26.4
136
+ nvidia-cublas-cu12==12.6.4.1
137
+ nvidia-cuda-cupti-cu12==12.6.80
138
+ nvidia-cuda-nvrtc-cu12==12.6.77
139
+ nvidia-cuda-runtime-cu12==12.6.77
140
+ nvidia-cudnn-cu12==9.5.1.17
141
+ nvidia-cufft-cu12==11.3.0.4
142
+ nvidia-cufile-cu12==1.11.1.6
143
+ nvidia-curand-cu12==10.3.7.77
144
+ nvidia-cusolver-cu12==11.7.1.2
145
+ nvidia-cusparse-cu12==12.5.4.2
146
+ nvidia-cusparselt-cu12==0.6.3
147
+ nvidia-ml-py==12.575.51
148
+ nvidia-nccl-cu12==2.26.2
149
+ nvidia-nvjitlink-cu12==12.6.85
150
+ nvidia-nvtx-cu12==12.6.77
151
+ openai==1.77.0
152
+ opencv-python==4.11.0.86
153
+ opencv-python-headless==4.11.0.86
154
+ opendatalab==0.0.10
155
+ openmim==0.3.9
156
+ openpyxl==3.1.5
157
+ opentelemetry-api==1.26.0
158
+ opentelemetry-exporter-otlp==1.26.0
159
+ opentelemetry-exporter-otlp-proto-common==1.26.0
160
+ opentelemetry-exporter-otlp-proto-grpc==1.26.0
161
+ opentelemetry-exporter-otlp-proto-http==1.26.0
162
+ opentelemetry-proto==1.26.0
163
+ opentelemetry-sdk==1.26.0
164
+ opentelemetry-semantic-conventions==0.47b0
165
+ opentelemetry-semantic-conventions-ai==0.4.6
166
+ openxlab==0.0.11
167
+ ordered-set==4.1.0
168
+ orjson==3.10.16
169
+ oss2==2.19.1
170
+ outlines==0.1.11
171
+ outlines_core==0.1.26
172
+ packaging==24.2
173
+ pandas==2.2.3
174
+ partial-json-parser==0.2.1.1.post5
175
+ peft==0.15.2
176
+ pillow==11.1.0
177
+ pip==25.0
178
+ platformdirs==4.3.7
179
+ portalocker==3.1.1
180
+ prettytable==3.16.0
181
+ prometheus_client==0.21.1
182
+ prometheus-fastapi-instrumentator==7.1.0
183
+ propcache==0.3.1
184
+ protobuf==4.25.7
185
+ psutil==7.0.0
186
+ py-cpuinfo==9.0.0
187
+ pyarrow==19.0.1
188
+ pycocoevalcap==1.2
189
+ pycocotools==2.0.8
190
+ pycountry==24.6.1
191
+ pycparser==2.22
192
+ pycryptodome==3.22.0
193
+ pydantic==2.11.1
194
+ pydantic_core==2.33.0
195
+ pydeck==0.9.1
196
+ pydub==0.25.1
197
+ pyecharts==2.0.8
198
+ Pygments==2.19.1
199
+ pynvml==12.0.0
200
+ pyparsing==3.2.3
201
+ PySocks==1.7.1
202
+ python-dateutil==2.9.0.post0
203
+ python-dotenv==1.1.0
204
+ python-json-logger==3.3.0
205
+ python-multipart==0.0.20
206
+ pytorch-lightning==2.5.1.post0
207
+ pytz==2025.2
208
+ PyYAML==6.0.2
209
+ pyzmq==26.4.0
210
+ qwen-vl-utils==0.0.11
211
+ ray==2.45.0
212
+ referencing==0.36.2
213
+ regex==2024.11.6
214
+ requests==2.32.3
215
+ rich==13.9.4
216
+ rich-toolkit==0.14.5
217
+ rouge==1.0.1
218
+ rpds-py==0.24.0
219
+ ruff==0.11.8
220
+ s3transfer==0.13.0
221
+ sacrebleu==2.5.1
222
+ safehttpx==0.1.6
223
+ safetensors==0.5.3
224
+ scikit-learn==1.6.1
225
+ scipy==1.15.2
226
+ semantic-version==2.10.0
227
+ sentencepiece==0.2.0
228
+ sentry-sdk==2.27.0
229
+ setproctitle==1.3.6
230
+ setuptools==69.5.1
231
+ shellingham==1.5.4
232
+ shortuuid==1.0.13
233
+ simplejson==3.20.1
234
+ six==1.17.0
235
+ smmap==5.0.2
236
+ sniffio==1.3.1
237
+ sortedcontainers==2.4.0
238
+ soupsieve==2.6
239
+ starlette==0.46.1
240
+ streamlit==1.44.0
241
+ streamlit-image-select==0.6.0
242
+ svgwrite==1.4.3
243
+ swankit==0.2.4
244
+ swanlab==0.6.4
245
+ sympy==1.14.0
246
+ tabulate==0.9.0
247
+ tenacity==9.0.0
248
+ tensorboard==2.19.0
249
+ tensorboard-data-server==0.7.2
250
+ tensorboardX==2.6.2.2
251
+ termcolor==2.5.0
252
+ threadpoolctl==3.6.0
253
+ tiktoken==0.9.0
254
+ timm==0.9.12
255
+ tokenizers==0.21.1
256
+ toml==0.10.2
257
+ tomli==2.2.1
258
+ tomlkit==0.13.2
259
+ torch==2.7.0
260
+ torchaudio==2.7.0
261
+ torchmetrics==0.10.3
262
+ torchvision==0.22.0
263
+ tornado==6.4.2
264
+ tqdm==4.67.1
265
+ transformers==4.51.3
266
+ transformers-stream-generator==0.0.5
267
+ triton==3.3.0
268
+ trl==0.17.0
269
+ typer==0.15.3
270
+ typing_extensions==4.13.0
271
+ typing-inspection==0.4.0
272
+ tzdata==2025.2
273
+ uc-micro-py==1.0.3
274
+ unbabel-comet==2.2.6
275
+ urllib3==2.3.0
276
+ uvicorn==0.34.0
277
+ uvloop==0.21.0
278
+ vllm==0.9.0
279
+ wandb==0.20.1
280
+ watchdog==6.0.0
281
+ watchfiles==1.0.5
282
+ wavedrom==2.0.3.post3
283
+ wcwidth==0.2.13
284
+ websocket-client==1.8.0
285
+ websockets==15.0.1
286
+ Werkzeug==3.1.3
287
+ wheel==0.45.1
288
+ wrapt==1.17.2
289
+ xformers==0.0.30
290
+ xgrammar==0.1.19
291
+ xxhash==3.5.0
292
+ yacs==0.1.8
293
+ yapf==0.40.1
294
+ yarl==1.18.3
295
+ zipp==3.21.0
296
+ zstandard==0.23.0
swanlog/run-20250628_232758-a3b1799d/files/swanlab-metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 2878259, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset /mnt/data/users/liamding/data/sft_zh_tox/data/train_data/r1_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 1e-5 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 32768 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_r1 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250628_232758-a3b1799d"}}
swanlog/run-20250628_234855-a3b1799d/backup.swanlab ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45fe04f62801602079a5ec980d332d5832b4cc04800cfde12aad30ccee2eb4c3
3
+ size 871421
swanlog/run-20250628_234855-a3b1799d/files/config.yaml ADDED
@@ -0,0 +1,990 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FRAMEWORK:
2
+ desc: ''
3
+ sort: 1
4
+ value: 🤗transformers
5
+ UPPERFRAME:
6
+ desc: ''
7
+ sort: 0
8
+ value: 🐦‍⬛ms-swift
9
+ _attn_implementation_autoset:
10
+ desc: ''
11
+ sort: 74
12
+ value: true
13
+ _name_or_path:
14
+ desc: ''
15
+ sort: 73
16
+ value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct
17
+ acc_steps:
18
+ desc: ''
19
+ sort: 223
20
+ value: 1
21
+ acc_strategy:
22
+ desc: ''
23
+ sort: 213
24
+ value: token
25
+ accelerator_config:
26
+ desc: ''
27
+ sort: 156
28
+ value:
29
+ dispatch_batches: false
30
+ even_batches: true
31
+ gradient_accumulation_kwargs: null
32
+ non_blocking: false
33
+ split_batches: false
34
+ use_seedable_sampler: true
35
+ adafactor:
36
+ desc: ''
37
+ sort: 161
38
+ value: false
39
+ adam_beta1:
40
+ desc: ''
41
+ sort: 94
42
+ value: 0.9
43
+ adam_beta2:
44
+ desc: ''
45
+ sort: 95
46
+ value: 0.95
47
+ adam_epsilon:
48
+ desc: ''
49
+ sort: 96
50
+ value: 1.0e-08
51
+ add_cross_attention:
52
+ desc: ''
53
+ sort: 33
54
+ value: false
55
+ aligner_lr:
56
+ desc: ''
57
+ sort: 216
58
+ value: null
59
+ architectures:
60
+ desc: ''
61
+ sort: 60
62
+ value:
63
+ - LlamaForCausalLM
64
+ attention_bias:
65
+ desc: ''
66
+ sort: 16
67
+ value: false
68
+ attention_dropout:
69
+ desc: ''
70
+ sort: 17
71
+ value: 0.0
72
+ auto_find_batch_size:
73
+ desc: ''
74
+ sort: 189
75
+ value: false
76
+ average_tokens_across_devices:
77
+ desc: ''
78
+ sort: 205
79
+ value: false
80
+ bad_words_ids:
81
+ desc: ''
82
+ sort: 50
83
+ value: null
84
+ batch_eval_metrics:
85
+ desc: ''
86
+ sort: 201
87
+ value: false
88
+ begin_suppress_tokens:
89
+ desc: ''
90
+ sort: 59
91
+ value: null
92
+ bf16:
93
+ desc: ''
94
+ sort: 126
95
+ value: true
96
+ bf16_full_eval:
97
+ desc: ''
98
+ sort: 130
99
+ value: false
100
+ bos_token_id:
101
+ desc: ''
102
+ sort: 66
103
+ value: 128000
104
+ channels:
105
+ desc: ''
106
+ sort: 220
107
+ value: null
108
+ check_model:
109
+ desc: ''
110
+ sort: 212
111
+ value: true
112
+ chunk_size_feed_forward:
113
+ desc: ''
114
+ sort: 29
115
+ value: 0
116
+ cross_attention_hidden_size:
117
+ desc: ''
118
+ sort: 32
119
+ value: null
120
+ data_seed:
121
+ desc: ''
122
+ sort: 123
123
+ value: 42
124
+ dataloader_drop_last:
125
+ desc: ''
126
+ sort: 138
127
+ value: false
128
+ dataloader_num_workers:
129
+ desc: ''
130
+ sort: 140
131
+ value: 4
132
+ dataloader_persistent_workers:
133
+ desc: ''
134
+ sort: 169
135
+ value: false
136
+ dataloader_pin_memory:
137
+ desc: ''
138
+ sort: 168
139
+ value: true
140
+ dataloader_prefetch_factor:
141
+ desc: ''
142
+ sort: 141
143
+ value: 10
144
+ ddp_backend:
145
+ desc: ''
146
+ sort: 134
147
+ value: null
148
+ ddp_broadcast_buffers:
149
+ desc: ''
150
+ sort: 167
151
+ value: null
152
+ ddp_bucket_cap_mb:
153
+ desc: ''
154
+ sort: 166
155
+ value: null
156
+ ddp_find_unused_parameters:
157
+ desc: ''
158
+ sort: 165
159
+ value: null
160
+ ddp_timeout:
161
+ desc: ''
162
+ sort: 193
163
+ value: 18000000
164
+ debug:
165
+ desc: ''
166
+ sort: 137
167
+ value: []
168
+ decoder_start_token_id:
169
+ desc: ''
170
+ sort: 70
171
+ value: null
172
+ deepspeed:
173
+ desc: ''
174
+ sort: 157
175
+ value:
176
+ bf16:
177
+ enabled: auto
178
+ fp16:
179
+ enabled: auto
180
+ hysteresis: 2
181
+ initial_scale_power: 16
182
+ loss_scale: 0
183
+ loss_scale_window: 1000
184
+ min_loss_scale: 1
185
+ gradient_accumulation_steps: auto
186
+ gradient_clipping: auto
187
+ steps_per_print: 2000
188
+ train_batch_size: auto
189
+ train_micro_batch_size_per_gpu: auto
190
+ wall_clock_breakdown: false
191
+ zero_optimization:
192
+ contiguous_gradients: true
193
+ offload_optimizer:
194
+ device: none
195
+ pin_memory: true
196
+ offload_param:
197
+ device: none
198
+ pin_memory: true
199
+ overlap_comm: false
200
+ reduce_bucket_size: auto
201
+ stage: 3
202
+ stage3_gather_16bit_weights_on_model_save: true
203
+ stage3_max_live_parameters: 1000000000.0
204
+ stage3_max_reuse_distance: 1000000000.0
205
+ stage3_param_persistence_threshold: auto
206
+ stage3_prefetch_bucket_size: auto
207
+ sub_group_size: 1000000000.0
208
+ zero_quantized_gradients: false
209
+ zero_quantized_weights: false
210
+ disable_tqdm:
211
+ desc: ''
212
+ sort: 144
213
+ value: false
214
+ diversity_penalty:
215
+ desc: ''
216
+ sort: 41
217
+ value: 0.0
218
+ do_eval:
219
+ desc: ''
220
+ sort: 80
221
+ value: true
222
+ do_predict:
223
+ desc: ''
224
+ sort: 81
225
+ value: false
226
+ do_sample:
227
+ desc: ''
228
+ sort: 37
229
+ value: false
230
+ do_train:
231
+ desc: ''
232
+ sort: 79
233
+ value: false
234
+ early_stopping:
235
+ desc: ''
236
+ sort: 38
237
+ value: false
238
+ encoder_no_repeat_ngram_size:
239
+ desc: ''
240
+ sort: 49
241
+ value: 0
242
+ eos_token_id:
243
+ desc: ''
244
+ sort: 68
245
+ value:
246
+ - 128001
247
+ - 128008
248
+ - 128009
249
+ eval_accumulation_steps:
250
+ desc: ''
251
+ sort: 89
252
+ value: null
253
+ eval_datasets:
254
+ desc: ''
255
+ sort: 225
256
+ value: []
257
+ eval_datasets_args:
258
+ desc: ''
259
+ sort: 227
260
+ value: null
261
+ eval_delay:
262
+ desc: ''
263
+ sort: 90
264
+ value: 0
265
+ eval_do_concat_batches:
266
+ desc: ''
267
+ sort: 183
268
+ value: true
269
+ eval_generation_config:
270
+ desc: ''
271
+ sort: 228
272
+ value: null
273
+ eval_limit:
274
+ desc: ''
275
+ sort: 226
276
+ value: null
277
+ eval_on_start:
278
+ desc: ''
279
+ sort: 202
280
+ value: false
281
+ eval_steps:
282
+ desc: ''
283
+ sort: 139
284
+ value: null
285
+ eval_strategy:
286
+ desc: ''
287
+ sort: 82
288
+ value: epoch
289
+ eval_use_evalscope:
290
+ desc: ''
291
+ sort: 224
292
+ value: false
293
+ eval_use_gather_object:
294
+ desc: ''
295
+ sort: 204
296
+ value: false
297
+ exponential_decay_length_penalty:
298
+ desc: ''
299
+ sort: 57
300
+ value: null
301
+ finetuning_task:
302
+ desc: ''
303
+ sort: 61
304
+ value: null
305
+ forced_bos_token_id:
306
+ desc: ''
307
+ sort: 54
308
+ value: null
309
+ forced_eos_token_id:
310
+ desc: ''
311
+ sort: 55
312
+ value: null
313
+ fp16:
314
+ desc: ''
315
+ sort: 127
316
+ value: false
317
+ fp16_backend:
318
+ desc: ''
319
+ sort: 184
320
+ value: auto
321
+ fp16_full_eval:
322
+ desc: ''
323
+ sort: 131
324
+ value: false
325
+ fp16_opt_level:
326
+ desc: ''
327
+ sort: 128
328
+ value: O1
329
+ fsdp:
330
+ desc: ''
331
+ sort: 151
332
+ value: []
333
+ fsdp_config:
334
+ desc: ''
335
+ sort: 153
336
+ value:
337
+ min_num_params: 0
338
+ xla: false
339
+ xla_fsdp_grad_ckpt: false
340
+ xla_fsdp_v2: false
341
+ fsdp_min_num_params:
342
+ desc: ''
343
+ sort: 152
344
+ value: 0
345
+ fsdp_num:
346
+ desc: ''
347
+ sort: 222
348
+ value: 1
349
+ fsdp_transformer_layer_cls_to_wrap:
350
+ desc: ''
351
+ sort: 155
352
+ value: null
353
+ full_determinism:
354
+ desc: ''
355
+ sort: 190
356
+ value: false
357
+ galore_config:
358
+ desc: ''
359
+ sort: 231
360
+ value: null
361
+ generation_config:
362
+ desc: ''
363
+ sort: 210
364
+ value: null
365
+ generation_max_length:
366
+ desc: ''
367
+ sort: 208
368
+ value: null
369
+ generation_num_beams:
370
+ desc: ''
371
+ sort: 209
372
+ value: null
373
+ gradient_accumulation_steps:
374
+ desc: ''
375
+ sort: 88
376
+ value: 2
377
+ gradient_checkpointing:
378
+ desc: ''
379
+ sort: 179
380
+ value: false
381
+ gradient_checkpointing_kwargs:
382
+ desc: ''
383
+ sort: 180
384
+ value: null
385
+ greater_is_better:
386
+ desc: ''
387
+ sort: 149
388
+ value: false
389
+ group_by_length:
390
+ desc: ''
391
+ sort: 162
392
+ value: false
393
+ half_precision_backend:
394
+ desc: ''
395
+ sort: 129
396
+ value: auto
397
+ head_dim:
398
+ desc: ''
399
+ sort: 19
400
+ value: 128
401
+ hidden_act:
402
+ desc: ''
403
+ sort: 9
404
+ value: silu
405
+ hidden_size:
406
+ desc: ''
407
+ sort: 4
408
+ value: 4096
409
+ hub_always_push:
410
+ desc: ''
411
+ sort: 178
412
+ value: false
413
+ hub_model_id:
414
+ desc: ''
415
+ sort: 174
416
+ value: null
417
+ hub_private_repo:
418
+ desc: ''
419
+ sort: 177
420
+ value: null
421
+ hub_strategy:
422
+ desc: ''
423
+ sort: 175
424
+ value: every_save
425
+ hub_token:
426
+ desc: ''
427
+ sort: 176
428
+ value: <HUB_TOKEN>
429
+ id2label:
430
+ desc: ''
431
+ sort: 62
432
+ value:
433
+ '0': LABEL_0
434
+ '1': LABEL_1
435
+ ignore_data_skip:
436
+ desc: ''
437
+ sort: 150
438
+ value: false
439
+ include_for_metrics:
440
+ desc: ''
441
+ sort: 182
442
+ value: []
443
+ include_inputs_for_metrics:
444
+ desc: ''
445
+ sort: 181
446
+ value: false
447
+ include_num_input_tokens_seen:
448
+ desc: ''
449
+ sort: 198
450
+ value: false
451
+ include_tokens_per_second:
452
+ desc: ''
453
+ sort: 197
454
+ value: false
455
+ initializer_range:
456
+ desc: ''
457
+ sort: 10
458
+ value: 0.02
459
+ intermediate_size:
460
+ desc: ''
461
+ sort: 5
462
+ value: 14336
463
+ is_decoder:
464
+ desc: ''
465
+ sort: 31
466
+ value: false
467
+ is_encoder_decoder:
468
+ desc: ''
469
+ sort: 30
470
+ value: false
471
+ jit_mode_eval:
472
+ desc: ''
473
+ sort: 124
474
+ value: false
475
+ label2id:
476
+ desc: ''
477
+ sort: 63
478
+ value:
479
+ LABEL_0: 0
480
+ LABEL_1: 1
481
+ label_names:
482
+ desc: ''
483
+ sort: 146
484
+ value: null
485
+ label_smoothing_factor:
486
+ desc: ''
487
+ sort: 158
488
+ value: 0.0
489
+ learning_rate:
490
+ desc: ''
491
+ sort: 92
492
+ value: 1.0e-05
493
+ length_column_name:
494
+ desc: ''
495
+ sort: 163
496
+ value: length
497
+ length_penalty:
498
+ desc: ''
499
+ sort: 47
500
+ value: 1.0
501
+ load_best_model_at_end:
502
+ desc: ''
503
+ sort: 147
504
+ value: false
505
+ local_rank:
506
+ desc: ''
507
+ sort: 133
508
+ value: 0
509
+ local_repo_path:
510
+ desc: ''
511
+ sort: 230
512
+ value: null
513
+ log_level:
514
+ desc: ''
515
+ sort: 104
516
+ value: passive
517
+ log_level_replica:
518
+ desc: ''
519
+ sort: 105
520
+ value: warning
521
+ log_on_each_node:
522
+ desc: ''
523
+ sort: 106
524
+ value: true
525
+ logging_dir:
526
+ desc: ''
527
+ sort: 107
528
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v0-20250628-234806/runs
529
+ logging_first_step:
530
+ desc: ''
531
+ sort: 109
532
+ value: true
533
+ logging_nan_inf_filter:
534
+ desc: ''
535
+ sort: 111
536
+ value: true
537
+ logging_steps:
538
+ desc: ''
539
+ sort: 110
540
+ value: 1
541
+ logging_strategy:
542
+ desc: ''
543
+ sort: 108
544
+ value: steps
545
+ lr_scheduler_kwargs:
546
+ desc: ''
547
+ sort: 101
548
+ value: null
549
+ lr_scheduler_type:
550
+ desc: ''
551
+ sort: 100
552
+ value: cosine
553
+ max_epochs:
554
+ desc: ''
555
+ sort: 215
556
+ value: null
557
+ max_grad_norm:
558
+ desc: ''
559
+ sort: 97
560
+ value: 1.0
561
+ max_length:
562
+ desc: ''
563
+ sort: 35
564
+ value: 20
565
+ max_position_embeddings:
566
+ desc: ''
567
+ sort: 3
568
+ value: 131072
569
+ max_steps:
570
+ desc: ''
571
+ sort: 99
572
+ value: -1
573
+ metric_for_best_model:
574
+ desc: ''
575
+ sort: 148
576
+ value: loss
577
+ metric_warmup_step:
578
+ desc: ''
579
+ sort: 221
580
+ value: 0
581
+ min_length:
582
+ desc: ''
583
+ sort: 36
584
+ value: 0
585
+ mlp_bias:
586
+ desc: ''
587
+ sort: 18
588
+ value: false
589
+ model_num_parameters:
590
+ desc: ''
591
+ sort: 232
592
+ value: 0
593
+ model_type:
594
+ desc: ''
595
+ sort: 76
596
+ value: llama
597
+ mp_parameters:
598
+ desc: ''
599
+ sort: 188
600
+ value: ''
601
+ neftune_noise_alpha:
602
+ desc: ''
603
+ sort: 199
604
+ value: null
605
+ no_cuda:
606
+ desc: ''
607
+ sort: 119
608
+ value: false
609
+ no_repeat_ngram_size:
610
+ desc: ''
611
+ sort: 48
612
+ value: 0
613
+ num_attention_heads:
614
+ desc: ''
615
+ sort: 7
616
+ value: 32
617
+ num_beam_groups:
618
+ desc: ''
619
+ sort: 40
620
+ value: 1
621
+ num_beams:
622
+ desc: ''
623
+ sort: 39
624
+ value: 1
625
+ num_hidden_layers:
626
+ desc: ''
627
+ sort: 6
628
+ value: 32
629
+ num_key_value_heads:
630
+ desc: ''
631
+ sort: 8
632
+ value: 8
633
+ num_return_sequences:
634
+ desc: ''
635
+ sort: 51
636
+ value: 1
637
+ num_train_epochs:
638
+ desc: ''
639
+ sort: 98
640
+ value: 5.0
641
+ optim:
642
+ desc: ''
643
+ sort: 159
644
+ value: adamw_torch
645
+ optim_args:
646
+ desc: ''
647
+ sort: 160
648
+ value: null
649
+ optim_target_modules:
650
+ desc: ''
651
+ sort: 200
652
+ value: null
653
+ optimizer:
654
+ desc: ''
655
+ sort: 218
656
+ value: null
657
+ output_attentions:
658
+ desc: ''
659
+ sort: 22
660
+ value: false
661
+ output_dir:
662
+ desc: ''
663
+ sort: 77
664
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v0-20250628-234806
665
+ output_hidden_states:
666
+ desc: ''
667
+ sort: 21
668
+ value: false
669
+ output_scores:
670
+ desc: ''
671
+ sort: 52
672
+ value: false
673
+ overwrite_output_dir:
674
+ desc: ''
675
+ sort: 78
676
+ value: false
677
+ pad_token_id:
678
+ desc: ''
679
+ sort: 67
680
+ value: 128009
681
+ past_index:
682
+ desc: ''
683
+ sort: 142
684
+ value: -1
685
+ per_device_eval_batch_size:
686
+ desc: ''
687
+ sort: 85
688
+ value: 2
689
+ per_device_train_batch_size:
690
+ desc: ''
691
+ sort: 84
692
+ value: 2
693
+ per_gpu_eval_batch_size:
694
+ desc: ''
695
+ sort: 87
696
+ value: null
697
+ per_gpu_train_batch_size:
698
+ desc: ''
699
+ sort: 86
700
+ value: null
701
+ predict_with_generate:
702
+ desc: ''
703
+ sort: 207
704
+ value: false
705
+ prediction_loss_only:
706
+ desc: ''
707
+ sort: 83
708
+ value: false
709
+ prefix:
710
+ desc: ''
711
+ sort: 65
712
+ value: null
713
+ pretraining_tp:
714
+ desc: ''
715
+ sort: 12
716
+ value: 1
717
+ problem_type:
718
+ desc: ''
719
+ sort: 72
720
+ value: null
721
+ pruned_heads:
722
+ desc: ''
723
+ sort: 27
724
+ value: {}
725
+ push_to_hub:
726
+ desc: ''
727
+ sort: 172
728
+ value: false
729
+ push_to_hub_model_id:
730
+ desc: ''
731
+ sort: 185
732
+ value: null
733
+ push_to_hub_organization:
734
+ desc: ''
735
+ sort: 186
736
+ value: null
737
+ push_to_hub_token:
738
+ desc: ''
739
+ sort: 187
740
+ value: <PUSH_TO_HUB_TOKEN>
741
+ ray_scope:
742
+ desc: ''
743
+ sort: 192
744
+ value: last
745
+ remove_invalid_values:
746
+ desc: ''
747
+ sort: 56
748
+ value: false
749
+ remove_unused_columns:
750
+ desc: ''
751
+ sort: 145
752
+ value: false
753
+ repetition_penalty:
754
+ desc: ''
755
+ sort: 46
756
+ value: 1.0
757
+ report_to:
758
+ desc: ''
759
+ sort: 164
760
+ value:
761
+ - swanlab
762
+ restore_callback_states_from_checkpoint:
763
+ desc: ''
764
+ sort: 118
765
+ value: false
766
+ resume_from_checkpoint:
767
+ desc: ''
768
+ sort: 173
769
+ value: null
770
+ return_dict:
771
+ desc: ''
772
+ sort: 20
773
+ value: true
774
+ return_dict_in_generate:
775
+ desc: ''
776
+ sort: 53
777
+ value: false
778
+ rms_norm_eps:
779
+ desc: ''
780
+ sort: 11
781
+ value: 1.0e-05
782
+ rope_scaling:
783
+ desc: ''
784
+ sort: 15
785
+ value:
786
+ factor: 8.0
787
+ high_freq_factor: 4.0
788
+ low_freq_factor: 1.0
789
+ original_max_position_embeddings: 8192
790
+ rope_type: llama3
791
+ rope_theta:
792
+ desc: ''
793
+ sort: 14
794
+ value: 500000.0
795
+ run_name:
796
+ desc: ''
797
+ sort: 143
798
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v0-20250628-234806
799
+ save_on_each_node:
800
+ desc: ''
801
+ sort: 116
802
+ value: false
803
+ save_only_model:
804
+ desc: ''
805
+ sort: 117
806
+ value: false
807
+ save_safetensors:
808
+ desc: ''
809
+ sort: 115
810
+ value: true
811
+ save_steps:
812
+ desc: ''
813
+ sort: 113
814
+ value: 500
815
+ save_strategy:
816
+ desc: ''
817
+ sort: 112
818
+ value: steps
819
+ save_total_limit:
820
+ desc: ''
821
+ sort: 114
822
+ value: 1
823
+ seed:
824
+ desc: ''
825
+ sort: 122
826
+ value: 42
827
+ sep_token_id:
828
+ desc: ''
829
+ sort: 69
830
+ value: null
831
+ skip_memory_metrics:
832
+ desc: ''
833
+ sort: 170
834
+ value: true
835
+ sortish_sampler:
836
+ desc: ''
837
+ sort: 206
838
+ value: false
839
+ suppress_tokens:
840
+ desc: ''
841
+ sort: 58
842
+ value: null
843
+ task_specific_params:
844
+ desc: ''
845
+ sort: 71
846
+ value: null
847
+ temperature:
848
+ desc: ''
849
+ sort: 42
850
+ value: 1.0
851
+ tf32:
852
+ desc: ''
853
+ sort: 132
854
+ value: null
855
+ tf_legacy_loss:
856
+ desc: ''
857
+ sort: 26
858
+ value: false
859
+ tie_encoder_decoder:
860
+ desc: ''
861
+ sort: 34
862
+ value: false
863
+ tie_word_embeddings:
864
+ desc: ''
865
+ sort: 28
866
+ value: false
867
+ tokenizer_class:
868
+ desc: ''
869
+ sort: 64
870
+ value: null
871
+ top_k:
872
+ desc: ''
873
+ sort: 43
874
+ value: 50
875
+ top_p:
876
+ desc: ''
877
+ sort: 44
878
+ value: 1.0
879
+ torch_compile:
880
+ desc: ''
881
+ sort: 194
882
+ value: false
883
+ torch_compile_backend:
884
+ desc: ''
885
+ sort: 195
886
+ value: null
887
+ torch_compile_mode:
888
+ desc: ''
889
+ sort: 196
890
+ value: null
891
+ torch_dtype:
892
+ desc: ''
893
+ sort: 24
894
+ value: bfloat16
895
+ torch_empty_cache_steps:
896
+ desc: ''
897
+ sort: 91
898
+ value: null
899
+ torchdynamo:
900
+ desc: ''
901
+ sort: 191
902
+ value: null
903
+ torchscript:
904
+ desc: ''
905
+ sort: 23
906
+ value: false
907
+ tp_size:
908
+ desc: ''
909
+ sort: 154
910
+ value: 0
911
+ tpu_metrics_debug:
912
+ desc: ''
913
+ sort: 136
914
+ value: false
915
+ tpu_num_cores:
916
+ desc: ''
917
+ sort: 135
918
+ value: null
919
+ train_dataloader_shuffle:
920
+ desc: ''
921
+ sort: 214
922
+ value: true
923
+ train_type:
924
+ desc: ''
925
+ sort: 229
926
+ value: full
927
+ transformers_version:
928
+ desc: ''
929
+ sort: 75
930
+ value: 4.51.3
931
+ typical_p:
932
+ desc: ''
933
+ sort: 45
934
+ value: 1.0
935
+ use_bfloat16:
936
+ desc: ''
937
+ sort: 25
938
+ value: false
939
+ use_cache:
940
+ desc: ''
941
+ sort: 13
942
+ value: false
943
+ use_cpu:
944
+ desc: ''
945
+ sort: 120
946
+ value: false
947
+ use_ipex:
948
+ desc: ''
949
+ sort: 125
950
+ value: false
951
+ use_legacy_prediction_loop:
952
+ desc: ''
953
+ sort: 171
954
+ value: false
955
+ use_liger_kernel:
956
+ desc: ''
957
+ sort: 203
958
+ value: false
959
+ use_logits_to_keep:
960
+ desc: ''
961
+ sort: 219
962
+ value: null
963
+ use_mps_device:
964
+ desc: ''
965
+ sort: 121
966
+ value: false
967
+ vit_gradient_checkpointing:
968
+ desc: ''
969
+ sort: 211
970
+ value: true
971
+ vit_lr:
972
+ desc: ''
973
+ sort: 217
974
+ value: null
975
+ vocab_size:
976
+ desc: ''
977
+ sort: 2
978
+ value: 128256
979
+ warmup_ratio:
980
+ desc: ''
981
+ sort: 102
982
+ value: 0.05
983
+ warmup_steps:
984
+ desc: ''
985
+ sort: 103
986
+ value: 0
987
+ weight_decay:
988
+ desc: ''
989
+ sort: 93
990
+ value: 2.0e-05
swanlog/run-20250628_234855-a3b1799d/files/requirements.txt ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.2.1
2
+ accelerate==1.6.0
3
+ addict==2.4.0
4
+ aiofiles==24.1.0
5
+ aiohappyeyeballs==2.6.1
6
+ aiohttp==3.11.14
7
+ aiosignal==1.3.2
8
+ airportsdata==20250224
9
+ aliyun-python-sdk-core==2.16.0
10
+ aliyun-python-sdk-kms==2.16.5
11
+ altair==5.5.0
12
+ annotated-types==0.7.0
13
+ antlr4-python3-runtime==4.13.2
14
+ anyio==4.9.0
15
+ astor==0.8.1
16
+ async-timeout==5.0.1
17
+ attrdict==2.0.1
18
+ attrs==25.3.0
19
+ av==14.3.0
20
+ beautifulsoup4==4.13.3
21
+ binpacking==1.5.2
22
+ bitsandbytes==0.45.5
23
+ blake3==1.0.4
24
+ blinker==1.9.0
25
+ boto3==1.38.46
26
+ botocore==1.38.46
27
+ cachetools==5.5.2
28
+ certifi==2025.1.31
29
+ cffi==1.17.1
30
+ charset-normalizer==3.4.1
31
+ click==8.1.8
32
+ cloudpickle==3.1.1
33
+ colorama==0.4.6
34
+ compressed-tensors==0.9.4
35
+ contourpy==1.3.1
36
+ cpm-kernels==1.0.11
37
+ crcmod==1.7
38
+ cryptography==44.0.3
39
+ cupy-cuda12x==13.4.1
40
+ cycler==0.12.1
41
+ dacite==1.9.2
42
+ dashscope==1.23.3
43
+ datasets==3.2.0
44
+ decord==0.6.0
45
+ deepspeed==0.16.5
46
+ Deprecated==1.2.18
47
+ depyf==0.18.0
48
+ dill==0.3.8
49
+ diskcache==5.6.3
50
+ distro==1.9.0
51
+ dnspython==2.7.0
52
+ docker-pycreds==0.4.0
53
+ einops==0.6.1
54
+ einops-exts==0.0.4
55
+ email_validator==2.2.0
56
+ entmax==1.3
57
+ et_xmlfile==2.0.0
58
+ exceptiongroup==1.2.2
59
+ fastapi==0.115.12
60
+ fastapi-cli==0.0.7
61
+ fastrlock==0.8.3
62
+ ffmpy==0.5.0
63
+ filelock==3.18.0
64
+ flash_attn==2.7.4.post1
65
+ fonttools==4.56.0
66
+ frozenlist==1.5.0
67
+ fsspec==2024.9.0
68
+ future==1.0.0
69
+ gdown==5.2.0
70
+ gguf==0.16.3
71
+ gitdb==4.0.12
72
+ GitPython==3.1.44
73
+ googleapis-common-protos==1.70.0
74
+ gradio==5.29.0
75
+ gradio_client==1.10.0
76
+ groovy==0.1.2
77
+ grpcio==1.71.0
78
+ h11==0.16.0
79
+ hf-xet==1.1.2
80
+ hjson==3.1.0
81
+ httpcore==1.0.9
82
+ httptools==0.6.4
83
+ httpx==0.28.1
84
+ huggingface-hub==0.32.2
85
+ idna==3.10
86
+ imageio==2.37.0
87
+ importlib_metadata==8.0.0
88
+ interegular==0.3.3
89
+ jieba==0.42.1
90
+ Jinja2==3.1.6
91
+ jiter==0.9.0
92
+ jmespath==0.10.0
93
+ joblib==1.4.2
94
+ jsonargparse==3.13.1
95
+ jsonschema==4.23.0
96
+ jsonschema-specifications==2024.10.1
97
+ kiwisolver==1.4.8
98
+ lark==1.2.2
99
+ latex2mathml==3.77.0
100
+ latex2sympy2_extended==1.10.1
101
+ lightning-utilities==0.14.3
102
+ linkify-it-py==2.0.3
103
+ llguidance==0.7.19
104
+ llvmlite==0.44.0
105
+ lm-format-enforcer==0.10.11
106
+ lxml==5.4.0
107
+ Markdown==3.7
108
+ markdown-it-py==2.2.0
109
+ markdown2==2.5.3
110
+ MarkupSafe==3.0.2
111
+ math-verify==0.7.0
112
+ matplotlib==3.10.1
113
+ mdit-py-plugins==0.3.3
114
+ mdurl==0.1.2
115
+ mistral_common==1.5.4
116
+ mmcls==0.25.0
117
+ mmcv==2.2.0
118
+ mmcv-full==1.6.2
119
+ mmengine==0.10.7
120
+ mmsegmentation==0.30.0
121
+ model-index==0.1.11
122
+ modelscope==1.25.0
123
+ mpmath==1.3.0
124
+ ms_swift==3.5.0
125
+ msgpack==1.1.0
126
+ msgspec==0.19.0
127
+ multidict==6.2.0
128
+ multiprocess==0.70.16
129
+ narwhals==1.32.0
130
+ nest-asyncio==1.6.0
131
+ networkx==3.4.2
132
+ ninja==1.11.1.4
133
+ nltk==3.9.1
134
+ numba==0.61.2
135
+ numpy==1.26.4
136
+ nvidia-cublas-cu12==12.6.4.1
137
+ nvidia-cuda-cupti-cu12==12.6.80
138
+ nvidia-cuda-nvrtc-cu12==12.6.77
139
+ nvidia-cuda-runtime-cu12==12.6.77
140
+ nvidia-cudnn-cu12==9.5.1.17
141
+ nvidia-cufft-cu12==11.3.0.4
142
+ nvidia-cufile-cu12==1.11.1.6
143
+ nvidia-curand-cu12==10.3.7.77
144
+ nvidia-cusolver-cu12==11.7.1.2
145
+ nvidia-cusparse-cu12==12.5.4.2
146
+ nvidia-cusparselt-cu12==0.6.3
147
+ nvidia-ml-py==12.575.51
148
+ nvidia-nccl-cu12==2.26.2
149
+ nvidia-nvjitlink-cu12==12.6.85
150
+ nvidia-nvtx-cu12==12.6.77
151
+ openai==1.77.0
152
+ opencv-python==4.11.0.86
153
+ opencv-python-headless==4.11.0.86
154
+ opendatalab==0.0.10
155
+ openmim==0.3.9
156
+ openpyxl==3.1.5
157
+ opentelemetry-api==1.26.0
158
+ opentelemetry-exporter-otlp==1.26.0
159
+ opentelemetry-exporter-otlp-proto-common==1.26.0
160
+ opentelemetry-exporter-otlp-proto-grpc==1.26.0
161
+ opentelemetry-exporter-otlp-proto-http==1.26.0
162
+ opentelemetry-proto==1.26.0
163
+ opentelemetry-sdk==1.26.0
164
+ opentelemetry-semantic-conventions==0.47b0
165
+ opentelemetry-semantic-conventions-ai==0.4.6
166
+ openxlab==0.0.11
167
+ ordered-set==4.1.0
168
+ orjson==3.10.16
169
+ oss2==2.19.1
170
+ outlines==0.1.11
171
+ outlines_core==0.1.26
172
+ packaging==24.2
173
+ pandas==2.2.3
174
+ partial-json-parser==0.2.1.1.post5
175
+ peft==0.15.2
176
+ pillow==11.1.0
177
+ pip==25.0
178
+ platformdirs==4.3.7
179
+ portalocker==3.1.1
180
+ prettytable==3.16.0
181
+ prometheus_client==0.21.1
182
+ prometheus-fastapi-instrumentator==7.1.0
183
+ propcache==0.3.1
184
+ protobuf==4.25.7
185
+ psutil==7.0.0
186
+ py-cpuinfo==9.0.0
187
+ pyarrow==19.0.1
188
+ pycocoevalcap==1.2
189
+ pycocotools==2.0.8
190
+ pycountry==24.6.1
191
+ pycparser==2.22
192
+ pycryptodome==3.22.0
193
+ pydantic==2.11.1
194
+ pydantic_core==2.33.0
195
+ pydeck==0.9.1
196
+ pydub==0.25.1
197
+ pyecharts==2.0.8
198
+ Pygments==2.19.1
199
+ pynvml==12.0.0
200
+ pyparsing==3.2.3
201
+ PySocks==1.7.1
202
+ python-dateutil==2.9.0.post0
203
+ python-dotenv==1.1.0
204
+ python-json-logger==3.3.0
205
+ python-multipart==0.0.20
206
+ pytorch-lightning==2.5.1.post0
207
+ pytz==2025.2
208
+ PyYAML==6.0.2
209
+ pyzmq==26.4.0
210
+ qwen-vl-utils==0.0.11
211
+ ray==2.45.0
212
+ referencing==0.36.2
213
+ regex==2024.11.6
214
+ requests==2.32.3
215
+ rich==13.9.4
216
+ rich-toolkit==0.14.5
217
+ rouge==1.0.1
218
+ rpds-py==0.24.0
219
+ ruff==0.11.8
220
+ s3transfer==0.13.0
221
+ sacrebleu==2.5.1
222
+ safehttpx==0.1.6
223
+ safetensors==0.5.3
224
+ scikit-learn==1.6.1
225
+ scipy==1.15.2
226
+ semantic-version==2.10.0
227
+ sentencepiece==0.2.0
228
+ sentry-sdk==2.27.0
229
+ setproctitle==1.3.6
230
+ setuptools==69.5.1
231
+ shellingham==1.5.4
232
+ shortuuid==1.0.13
233
+ simplejson==3.20.1
234
+ six==1.17.0
235
+ smmap==5.0.2
236
+ sniffio==1.3.1
237
+ sortedcontainers==2.4.0
238
+ soupsieve==2.6
239
+ starlette==0.46.1
240
+ streamlit==1.44.0
241
+ streamlit-image-select==0.6.0
242
+ svgwrite==1.4.3
243
+ swankit==0.2.4
244
+ swanlab==0.6.4
245
+ sympy==1.14.0
246
+ tabulate==0.9.0
247
+ tenacity==9.0.0
248
+ tensorboard==2.19.0
249
+ tensorboard-data-server==0.7.2
250
+ tensorboardX==2.6.2.2
251
+ termcolor==2.5.0
252
+ threadpoolctl==3.6.0
253
+ tiktoken==0.9.0
254
+ timm==0.9.12
255
+ tokenizers==0.21.1
256
+ toml==0.10.2
257
+ tomli==2.2.1
258
+ tomlkit==0.13.2
259
+ torch==2.7.0
260
+ torchaudio==2.7.0
261
+ torchmetrics==0.10.3
262
+ torchvision==0.22.0
263
+ tornado==6.4.2
264
+ tqdm==4.67.1
265
+ transformers==4.51.3
266
+ transformers-stream-generator==0.0.5
267
+ triton==3.3.0
268
+ trl==0.17.0
269
+ typer==0.15.3
270
+ typing_extensions==4.13.0
271
+ typing-inspection==0.4.0
272
+ tzdata==2025.2
273
+ uc-micro-py==1.0.3
274
+ unbabel-comet==2.2.6
275
+ urllib3==2.3.0
276
+ uvicorn==0.34.0
277
+ uvloop==0.21.0
278
+ vllm==0.9.0
279
+ wandb==0.20.1
280
+ watchdog==6.0.0
281
+ watchfiles==1.0.5
282
+ wavedrom==2.0.3.post3
283
+ wcwidth==0.2.13
284
+ websocket-client==1.8.0
285
+ websockets==15.0.1
286
+ Werkzeug==3.1.3
287
+ wheel==0.45.1
288
+ wrapt==1.17.2
289
+ xformers==0.0.30
290
+ xgrammar==0.1.19
291
+ xxhash==3.5.0
292
+ yacs==0.1.8
293
+ yapf==0.40.1
294
+ yarl==1.18.3
295
+ zipp==3.21.0
296
+ zstandard==0.23.0
swanlog/run-20250628_234855-a3b1799d/files/swanlab-metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 3395167, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset /mnt/data/users/liamding/data/sft_zh_tox/data/train_data/r1_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 1e-5 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 32768 --weight_decay 2e-5 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_r1_v2 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250628_234855-a3b1799d"}}
swanlog/run-20250629_082850-a3b1799d/backup.swanlab ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca085f4bca3d7e7ee8b97c5bb9167d07e9aa67ebf6bf339ca4700e8022bd4939
3
+ size 804536
swanlog/run-20250629_082850-a3b1799d/files/config.yaml ADDED
@@ -0,0 +1,990 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FRAMEWORK:
2
+ desc: ''
3
+ sort: 1
4
+ value: 🤗transformers
5
+ UPPERFRAME:
6
+ desc: ''
7
+ sort: 0
8
+ value: 🐦‍⬛ms-swift
9
+ _attn_implementation_autoset:
10
+ desc: ''
11
+ sort: 74
12
+ value: true
13
+ _name_or_path:
14
+ desc: ''
15
+ sort: 73
16
+ value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct
17
+ acc_steps:
18
+ desc: ''
19
+ sort: 223
20
+ value: 1
21
+ acc_strategy:
22
+ desc: ''
23
+ sort: 213
24
+ value: token
25
+ accelerator_config:
26
+ desc: ''
27
+ sort: 156
28
+ value:
29
+ dispatch_batches: false
30
+ even_batches: true
31
+ gradient_accumulation_kwargs: null
32
+ non_blocking: false
33
+ split_batches: false
34
+ use_seedable_sampler: true
35
+ adafactor:
36
+ desc: ''
37
+ sort: 161
38
+ value: false
39
+ adam_beta1:
40
+ desc: ''
41
+ sort: 94
42
+ value: 0.9
43
+ adam_beta2:
44
+ desc: ''
45
+ sort: 95
46
+ value: 0.95
47
+ adam_epsilon:
48
+ desc: ''
49
+ sort: 96
50
+ value: 1.0e-08
51
+ add_cross_attention:
52
+ desc: ''
53
+ sort: 33
54
+ value: false
55
+ aligner_lr:
56
+ desc: ''
57
+ sort: 216
58
+ value: null
59
+ architectures:
60
+ desc: ''
61
+ sort: 60
62
+ value:
63
+ - LlamaForCausalLM
64
+ attention_bias:
65
+ desc: ''
66
+ sort: 16
67
+ value: false
68
+ attention_dropout:
69
+ desc: ''
70
+ sort: 17
71
+ value: 0.0
72
+ auto_find_batch_size:
73
+ desc: ''
74
+ sort: 189
75
+ value: false
76
+ average_tokens_across_devices:
77
+ desc: ''
78
+ sort: 205
79
+ value: false
80
+ bad_words_ids:
81
+ desc: ''
82
+ sort: 50
83
+ value: null
84
+ batch_eval_metrics:
85
+ desc: ''
86
+ sort: 201
87
+ value: false
88
+ begin_suppress_tokens:
89
+ desc: ''
90
+ sort: 59
91
+ value: null
92
+ bf16:
93
+ desc: ''
94
+ sort: 126
95
+ value: true
96
+ bf16_full_eval:
97
+ desc: ''
98
+ sort: 130
99
+ value: false
100
+ bos_token_id:
101
+ desc: ''
102
+ sort: 66
103
+ value: 128000
104
+ channels:
105
+ desc: ''
106
+ sort: 220
107
+ value: null
108
+ check_model:
109
+ desc: ''
110
+ sort: 212
111
+ value: true
112
+ chunk_size_feed_forward:
113
+ desc: ''
114
+ sort: 29
115
+ value: 0
116
+ cross_attention_hidden_size:
117
+ desc: ''
118
+ sort: 32
119
+ value: null
120
+ data_seed:
121
+ desc: ''
122
+ sort: 123
123
+ value: 42
124
+ dataloader_drop_last:
125
+ desc: ''
126
+ sort: 138
127
+ value: false
128
+ dataloader_num_workers:
129
+ desc: ''
130
+ sort: 140
131
+ value: 4
132
+ dataloader_persistent_workers:
133
+ desc: ''
134
+ sort: 169
135
+ value: false
136
+ dataloader_pin_memory:
137
+ desc: ''
138
+ sort: 168
139
+ value: true
140
+ dataloader_prefetch_factor:
141
+ desc: ''
142
+ sort: 141
143
+ value: 10
144
+ ddp_backend:
145
+ desc: ''
146
+ sort: 134
147
+ value: null
148
+ ddp_broadcast_buffers:
149
+ desc: ''
150
+ sort: 167
151
+ value: null
152
+ ddp_bucket_cap_mb:
153
+ desc: ''
154
+ sort: 166
155
+ value: null
156
+ ddp_find_unused_parameters:
157
+ desc: ''
158
+ sort: 165
159
+ value: null
160
+ ddp_timeout:
161
+ desc: ''
162
+ sort: 193
163
+ value: 18000000
164
+ debug:
165
+ desc: ''
166
+ sort: 137
167
+ value: []
168
+ decoder_start_token_id:
169
+ desc: ''
170
+ sort: 70
171
+ value: null
172
+ deepspeed:
173
+ desc: ''
174
+ sort: 157
175
+ value:
176
+ bf16:
177
+ enabled: auto
178
+ fp16:
179
+ enabled: auto
180
+ hysteresis: 2
181
+ initial_scale_power: 16
182
+ loss_scale: 0
183
+ loss_scale_window: 1000
184
+ min_loss_scale: 1
185
+ gradient_accumulation_steps: auto
186
+ gradient_clipping: auto
187
+ steps_per_print: 2000
188
+ train_batch_size: auto
189
+ train_micro_batch_size_per_gpu: auto
190
+ wall_clock_breakdown: false
191
+ zero_optimization:
192
+ contiguous_gradients: true
193
+ offload_optimizer:
194
+ device: none
195
+ pin_memory: true
196
+ offload_param:
197
+ device: none
198
+ pin_memory: true
199
+ overlap_comm: false
200
+ reduce_bucket_size: auto
201
+ stage: 3
202
+ stage3_gather_16bit_weights_on_model_save: true
203
+ stage3_max_live_parameters: 1000000000.0
204
+ stage3_max_reuse_distance: 1000000000.0
205
+ stage3_param_persistence_threshold: auto
206
+ stage3_prefetch_bucket_size: auto
207
+ sub_group_size: 1000000000.0
208
+ zero_quantized_gradients: false
209
+ zero_quantized_weights: false
210
+ disable_tqdm:
211
+ desc: ''
212
+ sort: 144
213
+ value: false
214
+ diversity_penalty:
215
+ desc: ''
216
+ sort: 41
217
+ value: 0.0
218
+ do_eval:
219
+ desc: ''
220
+ sort: 80
221
+ value: true
222
+ do_predict:
223
+ desc: ''
224
+ sort: 81
225
+ value: false
226
+ do_sample:
227
+ desc: ''
228
+ sort: 37
229
+ value: false
230
+ do_train:
231
+ desc: ''
232
+ sort: 79
233
+ value: false
234
+ early_stopping:
235
+ desc: ''
236
+ sort: 38
237
+ value: false
238
+ encoder_no_repeat_ngram_size:
239
+ desc: ''
240
+ sort: 49
241
+ value: 0
242
+ eos_token_id:
243
+ desc: ''
244
+ sort: 68
245
+ value:
246
+ - 128001
247
+ - 128008
248
+ - 128009
249
+ eval_accumulation_steps:
250
+ desc: ''
251
+ sort: 89
252
+ value: null
253
+ eval_datasets:
254
+ desc: ''
255
+ sort: 225
256
+ value: []
257
+ eval_datasets_args:
258
+ desc: ''
259
+ sort: 227
260
+ value: null
261
+ eval_delay:
262
+ desc: ''
263
+ sort: 90
264
+ value: 0
265
+ eval_do_concat_batches:
266
+ desc: ''
267
+ sort: 183
268
+ value: true
269
+ eval_generation_config:
270
+ desc: ''
271
+ sort: 228
272
+ value: null
273
+ eval_limit:
274
+ desc: ''
275
+ sort: 226
276
+ value: null
277
+ eval_on_start:
278
+ desc: ''
279
+ sort: 202
280
+ value: false
281
+ eval_steps:
282
+ desc: ''
283
+ sort: 139
284
+ value: null
285
+ eval_strategy:
286
+ desc: ''
287
+ sort: 82
288
+ value: epoch
289
+ eval_use_evalscope:
290
+ desc: ''
291
+ sort: 224
292
+ value: false
293
+ eval_use_gather_object:
294
+ desc: ''
295
+ sort: 204
296
+ value: false
297
+ exponential_decay_length_penalty:
298
+ desc: ''
299
+ sort: 57
300
+ value: null
301
+ finetuning_task:
302
+ desc: ''
303
+ sort: 61
304
+ value: null
305
+ forced_bos_token_id:
306
+ desc: ''
307
+ sort: 54
308
+ value: null
309
+ forced_eos_token_id:
310
+ desc: ''
311
+ sort: 55
312
+ value: null
313
+ fp16:
314
+ desc: ''
315
+ sort: 127
316
+ value: false
317
+ fp16_backend:
318
+ desc: ''
319
+ sort: 184
320
+ value: auto
321
+ fp16_full_eval:
322
+ desc: ''
323
+ sort: 131
324
+ value: false
325
+ fp16_opt_level:
326
+ desc: ''
327
+ sort: 128
328
+ value: O1
329
+ fsdp:
330
+ desc: ''
331
+ sort: 151
332
+ value: []
333
+ fsdp_config:
334
+ desc: ''
335
+ sort: 153
336
+ value:
337
+ min_num_params: 0
338
+ xla: false
339
+ xla_fsdp_grad_ckpt: false
340
+ xla_fsdp_v2: false
341
+ fsdp_min_num_params:
342
+ desc: ''
343
+ sort: 152
344
+ value: 0
345
+ fsdp_num:
346
+ desc: ''
347
+ sort: 222
348
+ value: 1
349
+ fsdp_transformer_layer_cls_to_wrap:
350
+ desc: ''
351
+ sort: 155
352
+ value: null
353
+ full_determinism:
354
+ desc: ''
355
+ sort: 190
356
+ value: false
357
+ galore_config:
358
+ desc: ''
359
+ sort: 231
360
+ value: null
361
+ generation_config:
362
+ desc: ''
363
+ sort: 210
364
+ value: null
365
+ generation_max_length:
366
+ desc: ''
367
+ sort: 208
368
+ value: null
369
+ generation_num_beams:
370
+ desc: ''
371
+ sort: 209
372
+ value: null
373
+ gradient_accumulation_steps:
374
+ desc: ''
375
+ sort: 88
376
+ value: 2
377
+ gradient_checkpointing:
378
+ desc: ''
379
+ sort: 179
380
+ value: false
381
+ gradient_checkpointing_kwargs:
382
+ desc: ''
383
+ sort: 180
384
+ value: null
385
+ greater_is_better:
386
+ desc: ''
387
+ sort: 149
388
+ value: false
389
+ group_by_length:
390
+ desc: ''
391
+ sort: 162
392
+ value: false
393
+ half_precision_backend:
394
+ desc: ''
395
+ sort: 129
396
+ value: auto
397
+ head_dim:
398
+ desc: ''
399
+ sort: 19
400
+ value: 128
401
+ hidden_act:
402
+ desc: ''
403
+ sort: 9
404
+ value: silu
405
+ hidden_size:
406
+ desc: ''
407
+ sort: 4
408
+ value: 4096
409
+ hub_always_push:
410
+ desc: ''
411
+ sort: 178
412
+ value: false
413
+ hub_model_id:
414
+ desc: ''
415
+ sort: 174
416
+ value: null
417
+ hub_private_repo:
418
+ desc: ''
419
+ sort: 177
420
+ value: null
421
+ hub_strategy:
422
+ desc: ''
423
+ sort: 175
424
+ value: every_save
425
+ hub_token:
426
+ desc: ''
427
+ sort: 176
428
+ value: <HUB_TOKEN>
429
+ id2label:
430
+ desc: ''
431
+ sort: 62
432
+ value:
433
+ '0': LABEL_0
434
+ '1': LABEL_1
435
+ ignore_data_skip:
436
+ desc: ''
437
+ sort: 150
438
+ value: false
439
+ include_for_metrics:
440
+ desc: ''
441
+ sort: 182
442
+ value: []
443
+ include_inputs_for_metrics:
444
+ desc: ''
445
+ sort: 181
446
+ value: false
447
+ include_num_input_tokens_seen:
448
+ desc: ''
449
+ sort: 198
450
+ value: false
451
+ include_tokens_per_second:
452
+ desc: ''
453
+ sort: 197
454
+ value: false
455
+ initializer_range:
456
+ desc: ''
457
+ sort: 10
458
+ value: 0.02
459
+ intermediate_size:
460
+ desc: ''
461
+ sort: 5
462
+ value: 14336
463
+ is_decoder:
464
+ desc: ''
465
+ sort: 31
466
+ value: false
467
+ is_encoder_decoder:
468
+ desc: ''
469
+ sort: 30
470
+ value: false
471
+ jit_mode_eval:
472
+ desc: ''
473
+ sort: 124
474
+ value: false
475
+ label2id:
476
+ desc: ''
477
+ sort: 63
478
+ value:
479
+ LABEL_0: 0
480
+ LABEL_1: 1
481
+ label_names:
482
+ desc: ''
483
+ sort: 146
484
+ value: null
485
+ label_smoothing_factor:
486
+ desc: ''
487
+ sort: 158
488
+ value: 0.0
489
+ learning_rate:
490
+ desc: ''
491
+ sort: 92
492
+ value: 1.0e-06
493
+ length_column_name:
494
+ desc: ''
495
+ sort: 163
496
+ value: length
497
+ length_penalty:
498
+ desc: ''
499
+ sort: 47
500
+ value: 1.0
501
+ load_best_model_at_end:
502
+ desc: ''
503
+ sort: 147
504
+ value: false
505
+ local_rank:
506
+ desc: ''
507
+ sort: 133
508
+ value: 0
509
+ local_repo_path:
510
+ desc: ''
511
+ sort: 230
512
+ value: null
513
+ log_level:
514
+ desc: ''
515
+ sort: 104
516
+ value: passive
517
+ log_level_replica:
518
+ desc: ''
519
+ sort: 105
520
+ value: warning
521
+ log_on_each_node:
522
+ desc: ''
523
+ sort: 106
524
+ value: true
525
+ logging_dir:
526
+ desc: ''
527
+ sort: 107
528
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v0-20250629-082750/runs
529
+ logging_first_step:
530
+ desc: ''
531
+ sort: 109
532
+ value: true
533
+ logging_nan_inf_filter:
534
+ desc: ''
535
+ sort: 111
536
+ value: true
537
+ logging_steps:
538
+ desc: ''
539
+ sort: 110
540
+ value: 1
541
+ logging_strategy:
542
+ desc: ''
543
+ sort: 108
544
+ value: steps
545
+ lr_scheduler_kwargs:
546
+ desc: ''
547
+ sort: 101
548
+ value: null
549
+ lr_scheduler_type:
550
+ desc: ''
551
+ sort: 100
552
+ value: cosine
553
+ max_epochs:
554
+ desc: ''
555
+ sort: 215
556
+ value: null
557
+ max_grad_norm:
558
+ desc: ''
559
+ sort: 97
560
+ value: 1.0
561
+ max_length:
562
+ desc: ''
563
+ sort: 35
564
+ value: 20
565
+ max_position_embeddings:
566
+ desc: ''
567
+ sort: 3
568
+ value: 131072
569
+ max_steps:
570
+ desc: ''
571
+ sort: 99
572
+ value: -1
573
+ metric_for_best_model:
574
+ desc: ''
575
+ sort: 148
576
+ value: loss
577
+ metric_warmup_step:
578
+ desc: ''
579
+ sort: 221
580
+ value: 0
581
+ min_length:
582
+ desc: ''
583
+ sort: 36
584
+ value: 0
585
+ mlp_bias:
586
+ desc: ''
587
+ sort: 18
588
+ value: false
589
+ model_num_parameters:
590
+ desc: ''
591
+ sort: 232
592
+ value: 0
593
+ model_type:
594
+ desc: ''
595
+ sort: 76
596
+ value: llama
597
+ mp_parameters:
598
+ desc: ''
599
+ sort: 188
600
+ value: ''
601
+ neftune_noise_alpha:
602
+ desc: ''
603
+ sort: 199
604
+ value: null
605
+ no_cuda:
606
+ desc: ''
607
+ sort: 119
608
+ value: false
609
+ no_repeat_ngram_size:
610
+ desc: ''
611
+ sort: 48
612
+ value: 0
613
+ num_attention_heads:
614
+ desc: ''
615
+ sort: 7
616
+ value: 32
617
+ num_beam_groups:
618
+ desc: ''
619
+ sort: 40
620
+ value: 1
621
+ num_beams:
622
+ desc: ''
623
+ sort: 39
624
+ value: 1
625
+ num_hidden_layers:
626
+ desc: ''
627
+ sort: 6
628
+ value: 32
629
+ num_key_value_heads:
630
+ desc: ''
631
+ sort: 8
632
+ value: 8
633
+ num_return_sequences:
634
+ desc: ''
635
+ sort: 51
636
+ value: 1
637
+ num_train_epochs:
638
+ desc: ''
639
+ sort: 98
640
+ value: 5.0
641
+ optim:
642
+ desc: ''
643
+ sort: 159
644
+ value: adamw_torch
645
+ optim_args:
646
+ desc: ''
647
+ sort: 160
648
+ value: null
649
+ optim_target_modules:
650
+ desc: ''
651
+ sort: 200
652
+ value: null
653
+ optimizer:
654
+ desc: ''
655
+ sort: 218
656
+ value: null
657
+ output_attentions:
658
+ desc: ''
659
+ sort: 22
660
+ value: false
661
+ output_dir:
662
+ desc: ''
663
+ sort: 77
664
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v0-20250629-082750
665
+ output_hidden_states:
666
+ desc: ''
667
+ sort: 21
668
+ value: false
669
+ output_scores:
670
+ desc: ''
671
+ sort: 52
672
+ value: false
673
+ overwrite_output_dir:
674
+ desc: ''
675
+ sort: 78
676
+ value: false
677
+ pad_token_id:
678
+ desc: ''
679
+ sort: 67
680
+ value: 128009
681
+ past_index:
682
+ desc: ''
683
+ sort: 142
684
+ value: -1
685
+ per_device_eval_batch_size:
686
+ desc: ''
687
+ sort: 85
688
+ value: 2
689
+ per_device_train_batch_size:
690
+ desc: ''
691
+ sort: 84
692
+ value: 2
693
+ per_gpu_eval_batch_size:
694
+ desc: ''
695
+ sort: 87
696
+ value: null
697
+ per_gpu_train_batch_size:
698
+ desc: ''
699
+ sort: 86
700
+ value: null
701
+ predict_with_generate:
702
+ desc: ''
703
+ sort: 207
704
+ value: false
705
+ prediction_loss_only:
706
+ desc: ''
707
+ sort: 83
708
+ value: false
709
+ prefix:
710
+ desc: ''
711
+ sort: 65
712
+ value: null
713
+ pretraining_tp:
714
+ desc: ''
715
+ sort: 12
716
+ value: 1
717
+ problem_type:
718
+ desc: ''
719
+ sort: 72
720
+ value: null
721
+ pruned_heads:
722
+ desc: ''
723
+ sort: 27
724
+ value: {}
725
+ push_to_hub:
726
+ desc: ''
727
+ sort: 172
728
+ value: false
729
+ push_to_hub_model_id:
730
+ desc: ''
731
+ sort: 185
732
+ value: null
733
+ push_to_hub_organization:
734
+ desc: ''
735
+ sort: 186
736
+ value: null
737
+ push_to_hub_token:
738
+ desc: ''
739
+ sort: 187
740
+ value: <PUSH_TO_HUB_TOKEN>
741
+ ray_scope:
742
+ desc: ''
743
+ sort: 192
744
+ value: last
745
+ remove_invalid_values:
746
+ desc: ''
747
+ sort: 56
748
+ value: false
749
+ remove_unused_columns:
750
+ desc: ''
751
+ sort: 145
752
+ value: false
753
+ repetition_penalty:
754
+ desc: ''
755
+ sort: 46
756
+ value: 1.0
757
+ report_to:
758
+ desc: ''
759
+ sort: 164
760
+ value:
761
+ - swanlab
762
+ restore_callback_states_from_checkpoint:
763
+ desc: ''
764
+ sort: 118
765
+ value: false
766
+ resume_from_checkpoint:
767
+ desc: ''
768
+ sort: 173
769
+ value: null
770
+ return_dict:
771
+ desc: ''
772
+ sort: 20
773
+ value: true
774
+ return_dict_in_generate:
775
+ desc: ''
776
+ sort: 53
777
+ value: false
778
+ rms_norm_eps:
779
+ desc: ''
780
+ sort: 11
781
+ value: 1.0e-05
782
+ rope_scaling:
783
+ desc: ''
784
+ sort: 15
785
+ value:
786
+ factor: 8.0
787
+ high_freq_factor: 4.0
788
+ low_freq_factor: 1.0
789
+ original_max_position_embeddings: 8192
790
+ rope_type: llama3
791
+ rope_theta:
792
+ desc: ''
793
+ sort: 14
794
+ value: 500000.0
795
+ run_name:
796
+ desc: ''
797
+ sort: 143
798
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v0-20250629-082750
799
+ save_on_each_node:
800
+ desc: ''
801
+ sort: 116
802
+ value: false
803
+ save_only_model:
804
+ desc: ''
805
+ sort: 117
806
+ value: false
807
+ save_safetensors:
808
+ desc: ''
809
+ sort: 115
810
+ value: true
811
+ save_steps:
812
+ desc: ''
813
+ sort: 113
814
+ value: 500
815
+ save_strategy:
816
+ desc: ''
817
+ sort: 112
818
+ value: steps
819
+ save_total_limit:
820
+ desc: ''
821
+ sort: 114
822
+ value: 1
823
+ seed:
824
+ desc: ''
825
+ sort: 122
826
+ value: 42
827
+ sep_token_id:
828
+ desc: ''
829
+ sort: 69
830
+ value: null
831
+ skip_memory_metrics:
832
+ desc: ''
833
+ sort: 170
834
+ value: true
835
+ sortish_sampler:
836
+ desc: ''
837
+ sort: 206
838
+ value: false
839
+ suppress_tokens:
840
+ desc: ''
841
+ sort: 58
842
+ value: null
843
+ task_specific_params:
844
+ desc: ''
845
+ sort: 71
846
+ value: null
847
+ temperature:
848
+ desc: ''
849
+ sort: 42
850
+ value: 1.0
851
+ tf32:
852
+ desc: ''
853
+ sort: 132
854
+ value: null
855
+ tf_legacy_loss:
856
+ desc: ''
857
+ sort: 26
858
+ value: false
859
+ tie_encoder_decoder:
860
+ desc: ''
861
+ sort: 34
862
+ value: false
863
+ tie_word_embeddings:
864
+ desc: ''
865
+ sort: 28
866
+ value: false
867
+ tokenizer_class:
868
+ desc: ''
869
+ sort: 64
870
+ value: null
871
+ top_k:
872
+ desc: ''
873
+ sort: 43
874
+ value: 50
875
+ top_p:
876
+ desc: ''
877
+ sort: 44
878
+ value: 1.0
879
+ torch_compile:
880
+ desc: ''
881
+ sort: 194
882
+ value: false
883
+ torch_compile_backend:
884
+ desc: ''
885
+ sort: 195
886
+ value: null
887
+ torch_compile_mode:
888
+ desc: ''
889
+ sort: 196
890
+ value: null
891
+ torch_dtype:
892
+ desc: ''
893
+ sort: 24
894
+ value: bfloat16
895
+ torch_empty_cache_steps:
896
+ desc: ''
897
+ sort: 91
898
+ value: null
899
+ torchdynamo:
900
+ desc: ''
901
+ sort: 191
902
+ value: null
903
+ torchscript:
904
+ desc: ''
905
+ sort: 23
906
+ value: false
907
+ tp_size:
908
+ desc: ''
909
+ sort: 154
910
+ value: 0
911
+ tpu_metrics_debug:
912
+ desc: ''
913
+ sort: 136
914
+ value: false
915
+ tpu_num_cores:
916
+ desc: ''
917
+ sort: 135
918
+ value: null
919
+ train_dataloader_shuffle:
920
+ desc: ''
921
+ sort: 214
922
+ value: true
923
+ train_type:
924
+ desc: ''
925
+ sort: 229
926
+ value: full
927
+ transformers_version:
928
+ desc: ''
929
+ sort: 75
930
+ value: 4.51.3
931
+ typical_p:
932
+ desc: ''
933
+ sort: 45
934
+ value: 1.0
935
+ use_bfloat16:
936
+ desc: ''
937
+ sort: 25
938
+ value: false
939
+ use_cache:
940
+ desc: ''
941
+ sort: 13
942
+ value: false
943
+ use_cpu:
944
+ desc: ''
945
+ sort: 120
946
+ value: false
947
+ use_ipex:
948
+ desc: ''
949
+ sort: 125
950
+ value: false
951
+ use_legacy_prediction_loop:
952
+ desc: ''
953
+ sort: 171
954
+ value: false
955
+ use_liger_kernel:
956
+ desc: ''
957
+ sort: 203
958
+ value: false
959
+ use_logits_to_keep:
960
+ desc: ''
961
+ sort: 219
962
+ value: null
963
+ use_mps_device:
964
+ desc: ''
965
+ sort: 121
966
+ value: false
967
+ vit_gradient_checkpointing:
968
+ desc: ''
969
+ sort: 211
970
+ value: true
971
+ vit_lr:
972
+ desc: ''
973
+ sort: 217
974
+ value: null
975
+ vocab_size:
976
+ desc: ''
977
+ sort: 2
978
+ value: 128256
979
+ warmup_ratio:
980
+ desc: ''
981
+ sort: 102
982
+ value: 0.05
983
+ warmup_steps:
984
+ desc: ''
985
+ sort: 103
986
+ value: 0
987
+ weight_decay:
988
+ desc: ''
989
+ sort: 93
990
+ value: 0.0001
swanlog/run-20250629_082850-a3b1799d/files/requirements.txt ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.2.1
2
+ accelerate==1.6.0
3
+ addict==2.4.0
4
+ aiofiles==24.1.0
5
+ aiohappyeyeballs==2.6.1
6
+ aiohttp==3.11.14
7
+ aiosignal==1.3.2
8
+ airportsdata==20250224
9
+ aliyun-python-sdk-core==2.16.0
10
+ aliyun-python-sdk-kms==2.16.5
11
+ altair==5.5.0
12
+ annotated-types==0.7.0
13
+ antlr4-python3-runtime==4.13.2
14
+ anyio==4.9.0
15
+ astor==0.8.1
16
+ async-timeout==5.0.1
17
+ attrdict==2.0.1
18
+ attrs==25.3.0
19
+ av==14.3.0
20
+ beautifulsoup4==4.13.3
21
+ binpacking==1.5.2
22
+ bitsandbytes==0.45.5
23
+ blake3==1.0.4
24
+ blinker==1.9.0
25
+ boto3==1.38.46
26
+ botocore==1.38.46
27
+ cachetools==5.5.2
28
+ certifi==2025.1.31
29
+ cffi==1.17.1
30
+ charset-normalizer==3.4.1
31
+ click==8.1.8
32
+ cloudpickle==3.1.1
33
+ colorama==0.4.6
34
+ compressed-tensors==0.9.4
35
+ contourpy==1.3.1
36
+ cpm-kernels==1.0.11
37
+ crcmod==1.7
38
+ cryptography==44.0.3
39
+ cupy-cuda12x==13.4.1
40
+ cycler==0.12.1
41
+ dacite==1.9.2
42
+ dashscope==1.23.3
43
+ datasets==3.2.0
44
+ decord==0.6.0
45
+ deepspeed==0.16.5
46
+ Deprecated==1.2.18
47
+ depyf==0.18.0
48
+ dill==0.3.8
49
+ diskcache==5.6.3
50
+ distro==1.9.0
51
+ dnspython==2.7.0
52
+ docker-pycreds==0.4.0
53
+ einops==0.6.1
54
+ einops-exts==0.0.4
55
+ email_validator==2.2.0
56
+ entmax==1.3
57
+ et_xmlfile==2.0.0
58
+ exceptiongroup==1.2.2
59
+ fastapi==0.115.12
60
+ fastapi-cli==0.0.7
61
+ fastrlock==0.8.3
62
+ ffmpy==0.5.0
63
+ filelock==3.18.0
64
+ flash_attn==2.7.4.post1
65
+ fonttools==4.56.0
66
+ frozenlist==1.5.0
67
+ fsspec==2024.9.0
68
+ future==1.0.0
69
+ gdown==5.2.0
70
+ gguf==0.16.3
71
+ gitdb==4.0.12
72
+ GitPython==3.1.44
73
+ googleapis-common-protos==1.70.0
74
+ gradio==5.29.0
75
+ gradio_client==1.10.0
76
+ groovy==0.1.2
77
+ grpcio==1.71.0
78
+ h11==0.16.0
79
+ hf-xet==1.1.2
80
+ hjson==3.1.0
81
+ httpcore==1.0.9
82
+ httptools==0.6.4
83
+ httpx==0.28.1
84
+ huggingface-hub==0.32.2
85
+ idna==3.10
86
+ imageio==2.37.0
87
+ importlib_metadata==8.0.0
88
+ interegular==0.3.3
89
+ jieba==0.42.1
90
+ Jinja2==3.1.6
91
+ jiter==0.9.0
92
+ jmespath==0.10.0
93
+ joblib==1.4.2
94
+ jsonargparse==3.13.1
95
+ jsonschema==4.23.0
96
+ jsonschema-specifications==2024.10.1
97
+ kiwisolver==1.4.8
98
+ lark==1.2.2
99
+ latex2mathml==3.77.0
100
+ latex2sympy2_extended==1.10.1
101
+ lightning-utilities==0.14.3
102
+ linkify-it-py==2.0.3
103
+ llguidance==0.7.19
104
+ llvmlite==0.44.0
105
+ lm-format-enforcer==0.10.11
106
+ lxml==5.4.0
107
+ Markdown==3.7
108
+ markdown-it-py==2.2.0
109
+ markdown2==2.5.3
110
+ MarkupSafe==3.0.2
111
+ math-verify==0.7.0
112
+ matplotlib==3.10.1
113
+ mdit-py-plugins==0.3.3
114
+ mdurl==0.1.2
115
+ mistral_common==1.5.4
116
+ mmcls==0.25.0
117
+ mmcv==2.2.0
118
+ mmcv-full==1.6.2
119
+ mmengine==0.10.7
120
+ mmsegmentation==0.30.0
121
+ model-index==0.1.11
122
+ modelscope==1.25.0
123
+ mpmath==1.3.0
124
+ ms_swift==3.5.0
125
+ msgpack==1.1.0
126
+ msgspec==0.19.0
127
+ multidict==6.2.0
128
+ multiprocess==0.70.16
129
+ narwhals==1.32.0
130
+ nest-asyncio==1.6.0
131
+ networkx==3.4.2
132
+ ninja==1.11.1.4
133
+ nltk==3.9.1
134
+ numba==0.61.2
135
+ numpy==1.26.4
136
+ nvidia-cublas-cu12==12.6.4.1
137
+ nvidia-cuda-cupti-cu12==12.6.80
138
+ nvidia-cuda-nvrtc-cu12==12.6.77
139
+ nvidia-cuda-runtime-cu12==12.6.77
140
+ nvidia-cudnn-cu12==9.5.1.17
141
+ nvidia-cufft-cu12==11.3.0.4
142
+ nvidia-cufile-cu12==1.11.1.6
143
+ nvidia-curand-cu12==10.3.7.77
144
+ nvidia-cusolver-cu12==11.7.1.2
145
+ nvidia-cusparse-cu12==12.5.4.2
146
+ nvidia-cusparselt-cu12==0.6.3
147
+ nvidia-ml-py==12.575.51
148
+ nvidia-nccl-cu12==2.26.2
149
+ nvidia-nvjitlink-cu12==12.6.85
150
+ nvidia-nvtx-cu12==12.6.77
151
+ openai==1.77.0
152
+ opencv-python==4.11.0.86
153
+ opencv-python-headless==4.11.0.86
154
+ opendatalab==0.0.10
155
+ openmim==0.3.9
156
+ openpyxl==3.1.5
157
+ opentelemetry-api==1.26.0
158
+ opentelemetry-exporter-otlp==1.26.0
159
+ opentelemetry-exporter-otlp-proto-common==1.26.0
160
+ opentelemetry-exporter-otlp-proto-grpc==1.26.0
161
+ opentelemetry-exporter-otlp-proto-http==1.26.0
162
+ opentelemetry-proto==1.26.0
163
+ opentelemetry-sdk==1.26.0
164
+ opentelemetry-semantic-conventions==0.47b0
165
+ opentelemetry-semantic-conventions-ai==0.4.6
166
+ openxlab==0.0.11
167
+ ordered-set==4.1.0
168
+ orjson==3.10.16
169
+ oss2==2.19.1
170
+ outlines==0.1.11
171
+ outlines_core==0.1.26
172
+ packaging==24.2
173
+ pandas==2.2.3
174
+ partial-json-parser==0.2.1.1.post5
175
+ peft==0.15.2
176
+ pillow==11.1.0
177
+ pip==25.0
178
+ platformdirs==4.3.7
179
+ portalocker==3.1.1
180
+ prettytable==3.16.0
181
+ prometheus_client==0.21.1
182
+ prometheus-fastapi-instrumentator==7.1.0
183
+ propcache==0.3.1
184
+ protobuf==4.25.7
185
+ psutil==7.0.0
186
+ py-cpuinfo==9.0.0
187
+ pyarrow==19.0.1
188
+ pycocoevalcap==1.2
189
+ pycocotools==2.0.8
190
+ pycountry==24.6.1
191
+ pycparser==2.22
192
+ pycryptodome==3.22.0
193
+ pydantic==2.11.1
194
+ pydantic_core==2.33.0
195
+ pydeck==0.9.1
196
+ pydub==0.25.1
197
+ pyecharts==2.0.8
198
+ Pygments==2.19.1
199
+ pynvml==12.0.0
200
+ pyparsing==3.2.3
201
+ PySocks==1.7.1
202
+ python-dateutil==2.9.0.post0
203
+ python-dotenv==1.1.0
204
+ python-json-logger==3.3.0
205
+ python-multipart==0.0.20
206
+ pytorch-lightning==2.5.1.post0
207
+ pytz==2025.2
208
+ PyYAML==6.0.2
209
+ pyzmq==26.4.0
210
+ qwen-vl-utils==0.0.11
211
+ ray==2.45.0
212
+ referencing==0.36.2
213
+ regex==2024.11.6
214
+ requests==2.32.3
215
+ rich==13.9.4
216
+ rich-toolkit==0.14.5
217
+ rouge==1.0.1
218
+ rpds-py==0.24.0
219
+ ruff==0.11.8
220
+ s3transfer==0.13.0
221
+ sacrebleu==2.5.1
222
+ safehttpx==0.1.6
223
+ safetensors==0.5.3
224
+ scikit-learn==1.6.1
225
+ scipy==1.15.2
226
+ semantic-version==2.10.0
227
+ sentencepiece==0.2.0
228
+ sentry-sdk==2.27.0
229
+ setproctitle==1.3.6
230
+ setuptools==69.5.1
231
+ shellingham==1.5.4
232
+ shortuuid==1.0.13
233
+ simplejson==3.20.1
234
+ six==1.17.0
235
+ smmap==5.0.2
236
+ sniffio==1.3.1
237
+ sortedcontainers==2.4.0
238
+ soupsieve==2.6
239
+ starlette==0.46.1
240
+ streamlit==1.44.0
241
+ streamlit-image-select==0.6.0
242
+ svgwrite==1.4.3
243
+ swankit==0.2.4
244
+ swanlab==0.6.4
245
+ sympy==1.14.0
246
+ tabulate==0.9.0
247
+ tenacity==9.0.0
248
+ tensorboard==2.19.0
249
+ tensorboard-data-server==0.7.2
250
+ tensorboardX==2.6.2.2
251
+ termcolor==2.5.0
252
+ threadpoolctl==3.6.0
253
+ tiktoken==0.9.0
254
+ timm==0.9.12
255
+ tokenizers==0.21.1
256
+ toml==0.10.2
257
+ tomli==2.2.1
258
+ tomlkit==0.13.2
259
+ torch==2.7.0
260
+ torchaudio==2.7.0
261
+ torchmetrics==0.10.3
262
+ torchvision==0.22.0
263
+ tornado==6.4.2
264
+ tqdm==4.67.1
265
+ transformers==4.51.3
266
+ transformers-stream-generator==0.0.5
267
+ triton==3.3.0
268
+ trl==0.17.0
269
+ typer==0.15.3
270
+ typing_extensions==4.13.0
271
+ typing-inspection==0.4.0
272
+ tzdata==2025.2
273
+ uc-micro-py==1.0.3
274
+ unbabel-comet==2.2.6
275
+ urllib3==2.3.0
276
+ uvicorn==0.34.0
277
+ uvloop==0.21.0
278
+ vllm==0.9.0
279
+ wandb==0.20.1
280
+ watchdog==6.0.0
281
+ watchfiles==1.0.5
282
+ wavedrom==2.0.3.post3
283
+ wcwidth==0.2.13
284
+ websocket-client==1.8.0
285
+ websockets==15.0.1
286
+ Werkzeug==3.1.3
287
+ wheel==0.45.1
288
+ wrapt==1.17.2
289
+ xformers==0.0.30
290
+ xgrammar==0.1.19
291
+ xxhash==3.5.0
292
+ yacs==0.1.8
293
+ yapf==0.40.1
294
+ yarl==1.18.3
295
+ zipp==3.21.0
296
+ zstandard==0.23.0
swanlog/run-20250629_082850-a3b1799d/files/swanlab-metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 763184, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset /mnt/data/users/liamding/data/sft_zh_tox/data/train_data/r1_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 1e-6 --split_dataset_ratio 0.1 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 32768 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_r1_v2 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_082850-a3b1799d"}}
swanlog/run-20250629_084639-a3b1799d/backup.swanlab ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1de4f47ced940649ad542ffc1570817bcfc7f985b9b741022cacfdb83d928a45
3
+ size 877654
swanlog/run-20250629_084639-a3b1799d/files/config.yaml ADDED
@@ -0,0 +1,990 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FRAMEWORK:
2
+ desc: ''
3
+ sort: 1
4
+ value: 🤗transformers
5
+ UPPERFRAME:
6
+ desc: ''
7
+ sort: 0
8
+ value: 🐦‍⬛ms-swift
9
+ _attn_implementation_autoset:
10
+ desc: ''
11
+ sort: 74
12
+ value: true
13
+ _name_or_path:
14
+ desc: ''
15
+ sort: 73
16
+ value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct
17
+ acc_steps:
18
+ desc: ''
19
+ sort: 223
20
+ value: 1
21
+ acc_strategy:
22
+ desc: ''
23
+ sort: 213
24
+ value: token
25
+ accelerator_config:
26
+ desc: ''
27
+ sort: 156
28
+ value:
29
+ dispatch_batches: false
30
+ even_batches: true
31
+ gradient_accumulation_kwargs: null
32
+ non_blocking: false
33
+ split_batches: false
34
+ use_seedable_sampler: true
35
+ adafactor:
36
+ desc: ''
37
+ sort: 161
38
+ value: false
39
+ adam_beta1:
40
+ desc: ''
41
+ sort: 94
42
+ value: 0.9
43
+ adam_beta2:
44
+ desc: ''
45
+ sort: 95
46
+ value: 0.95
47
+ adam_epsilon:
48
+ desc: ''
49
+ sort: 96
50
+ value: 1.0e-08
51
+ add_cross_attention:
52
+ desc: ''
53
+ sort: 33
54
+ value: false
55
+ aligner_lr:
56
+ desc: ''
57
+ sort: 216
58
+ value: null
59
+ architectures:
60
+ desc: ''
61
+ sort: 60
62
+ value:
63
+ - LlamaForCausalLM
64
+ attention_bias:
65
+ desc: ''
66
+ sort: 16
67
+ value: false
68
+ attention_dropout:
69
+ desc: ''
70
+ sort: 17
71
+ value: 0.0
72
+ auto_find_batch_size:
73
+ desc: ''
74
+ sort: 189
75
+ value: false
76
+ average_tokens_across_devices:
77
+ desc: ''
78
+ sort: 205
79
+ value: false
80
+ bad_words_ids:
81
+ desc: ''
82
+ sort: 50
83
+ value: null
84
+ batch_eval_metrics:
85
+ desc: ''
86
+ sort: 201
87
+ value: false
88
+ begin_suppress_tokens:
89
+ desc: ''
90
+ sort: 59
91
+ value: null
92
+ bf16:
93
+ desc: ''
94
+ sort: 126
95
+ value: true
96
+ bf16_full_eval:
97
+ desc: ''
98
+ sort: 130
99
+ value: false
100
+ bos_token_id:
101
+ desc: ''
102
+ sort: 66
103
+ value: 128000
104
+ channels:
105
+ desc: ''
106
+ sort: 220
107
+ value: null
108
+ check_model:
109
+ desc: ''
110
+ sort: 212
111
+ value: true
112
+ chunk_size_feed_forward:
113
+ desc: ''
114
+ sort: 29
115
+ value: 0
116
+ cross_attention_hidden_size:
117
+ desc: ''
118
+ sort: 32
119
+ value: null
120
+ data_seed:
121
+ desc: ''
122
+ sort: 123
123
+ value: 42
124
+ dataloader_drop_last:
125
+ desc: ''
126
+ sort: 138
127
+ value: false
128
+ dataloader_num_workers:
129
+ desc: ''
130
+ sort: 140
131
+ value: 4
132
+ dataloader_persistent_workers:
133
+ desc: ''
134
+ sort: 169
135
+ value: false
136
+ dataloader_pin_memory:
137
+ desc: ''
138
+ sort: 168
139
+ value: true
140
+ dataloader_prefetch_factor:
141
+ desc: ''
142
+ sort: 141
143
+ value: 10
144
+ ddp_backend:
145
+ desc: ''
146
+ sort: 134
147
+ value: null
148
+ ddp_broadcast_buffers:
149
+ desc: ''
150
+ sort: 167
151
+ value: null
152
+ ddp_bucket_cap_mb:
153
+ desc: ''
154
+ sort: 166
155
+ value: null
156
+ ddp_find_unused_parameters:
157
+ desc: ''
158
+ sort: 165
159
+ value: null
160
+ ddp_timeout:
161
+ desc: ''
162
+ sort: 193
163
+ value: 18000000
164
+ debug:
165
+ desc: ''
166
+ sort: 137
167
+ value: []
168
+ decoder_start_token_id:
169
+ desc: ''
170
+ sort: 70
171
+ value: null
172
+ deepspeed:
173
+ desc: ''
174
+ sort: 157
175
+ value:
176
+ bf16:
177
+ enabled: auto
178
+ fp16:
179
+ enabled: auto
180
+ hysteresis: 2
181
+ initial_scale_power: 16
182
+ loss_scale: 0
183
+ loss_scale_window: 1000
184
+ min_loss_scale: 1
185
+ gradient_accumulation_steps: auto
186
+ gradient_clipping: auto
187
+ steps_per_print: 2000
188
+ train_batch_size: auto
189
+ train_micro_batch_size_per_gpu: auto
190
+ wall_clock_breakdown: false
191
+ zero_optimization:
192
+ contiguous_gradients: true
193
+ offload_optimizer:
194
+ device: none
195
+ pin_memory: true
196
+ offload_param:
197
+ device: none
198
+ pin_memory: true
199
+ overlap_comm: false
200
+ reduce_bucket_size: auto
201
+ stage: 3
202
+ stage3_gather_16bit_weights_on_model_save: true
203
+ stage3_max_live_parameters: 1000000000.0
204
+ stage3_max_reuse_distance: 1000000000.0
205
+ stage3_param_persistence_threshold: auto
206
+ stage3_prefetch_bucket_size: auto
207
+ sub_group_size: 1000000000.0
208
+ zero_quantized_gradients: false
209
+ zero_quantized_weights: false
210
+ disable_tqdm:
211
+ desc: ''
212
+ sort: 144
213
+ value: false
214
+ diversity_penalty:
215
+ desc: ''
216
+ sort: 41
217
+ value: 0.0
218
+ do_eval:
219
+ desc: ''
220
+ sort: 80
221
+ value: true
222
+ do_predict:
223
+ desc: ''
224
+ sort: 81
225
+ value: false
226
+ do_sample:
227
+ desc: ''
228
+ sort: 37
229
+ value: false
230
+ do_train:
231
+ desc: ''
232
+ sort: 79
233
+ value: false
234
+ early_stopping:
235
+ desc: ''
236
+ sort: 38
237
+ value: false
238
+ encoder_no_repeat_ngram_size:
239
+ desc: ''
240
+ sort: 49
241
+ value: 0
242
+ eos_token_id:
243
+ desc: ''
244
+ sort: 68
245
+ value:
246
+ - 128001
247
+ - 128008
248
+ - 128009
249
+ eval_accumulation_steps:
250
+ desc: ''
251
+ sort: 89
252
+ value: null
253
+ eval_datasets:
254
+ desc: ''
255
+ sort: 225
256
+ value: []
257
+ eval_datasets_args:
258
+ desc: ''
259
+ sort: 227
260
+ value: null
261
+ eval_delay:
262
+ desc: ''
263
+ sort: 90
264
+ value: 0
265
+ eval_do_concat_batches:
266
+ desc: ''
267
+ sort: 183
268
+ value: true
269
+ eval_generation_config:
270
+ desc: ''
271
+ sort: 228
272
+ value: null
273
+ eval_limit:
274
+ desc: ''
275
+ sort: 226
276
+ value: null
277
+ eval_on_start:
278
+ desc: ''
279
+ sort: 202
280
+ value: false
281
+ eval_steps:
282
+ desc: ''
283
+ sort: 139
284
+ value: null
285
+ eval_strategy:
286
+ desc: ''
287
+ sort: 82
288
+ value: epoch
289
+ eval_use_evalscope:
290
+ desc: ''
291
+ sort: 224
292
+ value: false
293
+ eval_use_gather_object:
294
+ desc: ''
295
+ sort: 204
296
+ value: false
297
+ exponential_decay_length_penalty:
298
+ desc: ''
299
+ sort: 57
300
+ value: null
301
+ finetuning_task:
302
+ desc: ''
303
+ sort: 61
304
+ value: null
305
+ forced_bos_token_id:
306
+ desc: ''
307
+ sort: 54
308
+ value: null
309
+ forced_eos_token_id:
310
+ desc: ''
311
+ sort: 55
312
+ value: null
313
+ fp16:
314
+ desc: ''
315
+ sort: 127
316
+ value: false
317
+ fp16_backend:
318
+ desc: ''
319
+ sort: 184
320
+ value: auto
321
+ fp16_full_eval:
322
+ desc: ''
323
+ sort: 131
324
+ value: false
325
+ fp16_opt_level:
326
+ desc: ''
327
+ sort: 128
328
+ value: O1
329
+ fsdp:
330
+ desc: ''
331
+ sort: 151
332
+ value: []
333
+ fsdp_config:
334
+ desc: ''
335
+ sort: 153
336
+ value:
337
+ min_num_params: 0
338
+ xla: false
339
+ xla_fsdp_grad_ckpt: false
340
+ xla_fsdp_v2: false
341
+ fsdp_min_num_params:
342
+ desc: ''
343
+ sort: 152
344
+ value: 0
345
+ fsdp_num:
346
+ desc: ''
347
+ sort: 222
348
+ value: 1
349
+ fsdp_transformer_layer_cls_to_wrap:
350
+ desc: ''
351
+ sort: 155
352
+ value: null
353
+ full_determinism:
354
+ desc: ''
355
+ sort: 190
356
+ value: false
357
+ galore_config:
358
+ desc: ''
359
+ sort: 231
360
+ value: null
361
+ generation_config:
362
+ desc: ''
363
+ sort: 210
364
+ value: null
365
+ generation_max_length:
366
+ desc: ''
367
+ sort: 208
368
+ value: null
369
+ generation_num_beams:
370
+ desc: ''
371
+ sort: 209
372
+ value: null
373
+ gradient_accumulation_steps:
374
+ desc: ''
375
+ sort: 88
376
+ value: 2
377
+ gradient_checkpointing:
378
+ desc: ''
379
+ sort: 179
380
+ value: false
381
+ gradient_checkpointing_kwargs:
382
+ desc: ''
383
+ sort: 180
384
+ value: null
385
+ greater_is_better:
386
+ desc: ''
387
+ sort: 149
388
+ value: false
389
+ group_by_length:
390
+ desc: ''
391
+ sort: 162
392
+ value: false
393
+ half_precision_backend:
394
+ desc: ''
395
+ sort: 129
396
+ value: auto
397
+ head_dim:
398
+ desc: ''
399
+ sort: 19
400
+ value: 128
401
+ hidden_act:
402
+ desc: ''
403
+ sort: 9
404
+ value: silu
405
+ hidden_size:
406
+ desc: ''
407
+ sort: 4
408
+ value: 4096
409
+ hub_always_push:
410
+ desc: ''
411
+ sort: 178
412
+ value: false
413
+ hub_model_id:
414
+ desc: ''
415
+ sort: 174
416
+ value: null
417
+ hub_private_repo:
418
+ desc: ''
419
+ sort: 177
420
+ value: null
421
+ hub_strategy:
422
+ desc: ''
423
+ sort: 175
424
+ value: every_save
425
+ hub_token:
426
+ desc: ''
427
+ sort: 176
428
+ value: <HUB_TOKEN>
429
+ id2label:
430
+ desc: ''
431
+ sort: 62
432
+ value:
433
+ '0': LABEL_0
434
+ '1': LABEL_1
435
+ ignore_data_skip:
436
+ desc: ''
437
+ sort: 150
438
+ value: false
439
+ include_for_metrics:
440
+ desc: ''
441
+ sort: 182
442
+ value: []
443
+ include_inputs_for_metrics:
444
+ desc: ''
445
+ sort: 181
446
+ value: false
447
+ include_num_input_tokens_seen:
448
+ desc: ''
449
+ sort: 198
450
+ value: false
451
+ include_tokens_per_second:
452
+ desc: ''
453
+ sort: 197
454
+ value: false
455
+ initializer_range:
456
+ desc: ''
457
+ sort: 10
458
+ value: 0.02
459
+ intermediate_size:
460
+ desc: ''
461
+ sort: 5
462
+ value: 14336
463
+ is_decoder:
464
+ desc: ''
465
+ sort: 31
466
+ value: false
467
+ is_encoder_decoder:
468
+ desc: ''
469
+ sort: 30
470
+ value: false
471
+ jit_mode_eval:
472
+ desc: ''
473
+ sort: 124
474
+ value: false
475
+ label2id:
476
+ desc: ''
477
+ sort: 63
478
+ value:
479
+ LABEL_0: 0
480
+ LABEL_1: 1
481
+ label_names:
482
+ desc: ''
483
+ sort: 146
484
+ value: null
485
+ label_smoothing_factor:
486
+ desc: ''
487
+ sort: 158
488
+ value: 0.0
489
+ learning_rate:
490
+ desc: ''
491
+ sort: 92
492
+ value: 1.0e-06
493
+ length_column_name:
494
+ desc: ''
495
+ sort: 163
496
+ value: length
497
+ length_penalty:
498
+ desc: ''
499
+ sort: 47
500
+ value: 1.0
501
+ load_best_model_at_end:
502
+ desc: ''
503
+ sort: 147
504
+ value: false
505
+ local_rank:
506
+ desc: ''
507
+ sort: 133
508
+ value: 0
509
+ local_repo_path:
510
+ desc: ''
511
+ sort: 230
512
+ value: null
513
+ log_level:
514
+ desc: ''
515
+ sort: 104
516
+ value: passive
517
+ log_level_replica:
518
+ desc: ''
519
+ sort: 105
520
+ value: warning
521
+ log_on_each_node:
522
+ desc: ''
523
+ sort: 106
524
+ value: true
525
+ logging_dir:
526
+ desc: ''
527
+ sort: 107
528
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v1-20250629-084541/runs
529
+ logging_first_step:
530
+ desc: ''
531
+ sort: 109
532
+ value: true
533
+ logging_nan_inf_filter:
534
+ desc: ''
535
+ sort: 111
536
+ value: true
537
+ logging_steps:
538
+ desc: ''
539
+ sort: 110
540
+ value: 1
541
+ logging_strategy:
542
+ desc: ''
543
+ sort: 108
544
+ value: steps
545
+ lr_scheduler_kwargs:
546
+ desc: ''
547
+ sort: 101
548
+ value: null
549
+ lr_scheduler_type:
550
+ desc: ''
551
+ sort: 100
552
+ value: cosine
553
+ max_epochs:
554
+ desc: ''
555
+ sort: 215
556
+ value: null
557
+ max_grad_norm:
558
+ desc: ''
559
+ sort: 97
560
+ value: 1.0
561
+ max_length:
562
+ desc: ''
563
+ sort: 35
564
+ value: 20
565
+ max_position_embeddings:
566
+ desc: ''
567
+ sort: 3
568
+ value: 131072
569
+ max_steps:
570
+ desc: ''
571
+ sort: 99
572
+ value: -1
573
+ metric_for_best_model:
574
+ desc: ''
575
+ sort: 148
576
+ value: loss
577
+ metric_warmup_step:
578
+ desc: ''
579
+ sort: 221
580
+ value: 0
581
+ min_length:
582
+ desc: ''
583
+ sort: 36
584
+ value: 0
585
+ mlp_bias:
586
+ desc: ''
587
+ sort: 18
588
+ value: false
589
+ model_num_parameters:
590
+ desc: ''
591
+ sort: 232
592
+ value: 0
593
+ model_type:
594
+ desc: ''
595
+ sort: 76
596
+ value: llama
597
+ mp_parameters:
598
+ desc: ''
599
+ sort: 188
600
+ value: ''
601
+ neftune_noise_alpha:
602
+ desc: ''
603
+ sort: 199
604
+ value: null
605
+ no_cuda:
606
+ desc: ''
607
+ sort: 119
608
+ value: false
609
+ no_repeat_ngram_size:
610
+ desc: ''
611
+ sort: 48
612
+ value: 0
613
+ num_attention_heads:
614
+ desc: ''
615
+ sort: 7
616
+ value: 32
617
+ num_beam_groups:
618
+ desc: ''
619
+ sort: 40
620
+ value: 1
621
+ num_beams:
622
+ desc: ''
623
+ sort: 39
624
+ value: 1
625
+ num_hidden_layers:
626
+ desc: ''
627
+ sort: 6
628
+ value: 32
629
+ num_key_value_heads:
630
+ desc: ''
631
+ sort: 8
632
+ value: 8
633
+ num_return_sequences:
634
+ desc: ''
635
+ sort: 51
636
+ value: 1
637
+ num_train_epochs:
638
+ desc: ''
639
+ sort: 98
640
+ value: 5.0
641
+ optim:
642
+ desc: ''
643
+ sort: 159
644
+ value: adamw_torch
645
+ optim_args:
646
+ desc: ''
647
+ sort: 160
648
+ value: null
649
+ optim_target_modules:
650
+ desc: ''
651
+ sort: 200
652
+ value: null
653
+ optimizer:
654
+ desc: ''
655
+ sort: 218
656
+ value: null
657
+ output_attentions:
658
+ desc: ''
659
+ sort: 22
660
+ value: false
661
+ output_dir:
662
+ desc: ''
663
+ sort: 77
664
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v1-20250629-084541
665
+ output_hidden_states:
666
+ desc: ''
667
+ sort: 21
668
+ value: false
669
+ output_scores:
670
+ desc: ''
671
+ sort: 52
672
+ value: false
673
+ overwrite_output_dir:
674
+ desc: ''
675
+ sort: 78
676
+ value: false
677
+ pad_token_id:
678
+ desc: ''
679
+ sort: 67
680
+ value: 128009
681
+ past_index:
682
+ desc: ''
683
+ sort: 142
684
+ value: -1
685
+ per_device_eval_batch_size:
686
+ desc: ''
687
+ sort: 85
688
+ value: 2
689
+ per_device_train_batch_size:
690
+ desc: ''
691
+ sort: 84
692
+ value: 2
693
+ per_gpu_eval_batch_size:
694
+ desc: ''
695
+ sort: 87
696
+ value: null
697
+ per_gpu_train_batch_size:
698
+ desc: ''
699
+ sort: 86
700
+ value: null
701
+ predict_with_generate:
702
+ desc: ''
703
+ sort: 207
704
+ value: false
705
+ prediction_loss_only:
706
+ desc: ''
707
+ sort: 83
708
+ value: false
709
+ prefix:
710
+ desc: ''
711
+ sort: 65
712
+ value: null
713
+ pretraining_tp:
714
+ desc: ''
715
+ sort: 12
716
+ value: 1
717
+ problem_type:
718
+ desc: ''
719
+ sort: 72
720
+ value: null
721
+ pruned_heads:
722
+ desc: ''
723
+ sort: 27
724
+ value: {}
725
+ push_to_hub:
726
+ desc: ''
727
+ sort: 172
728
+ value: false
729
+ push_to_hub_model_id:
730
+ desc: ''
731
+ sort: 185
732
+ value: null
733
+ push_to_hub_organization:
734
+ desc: ''
735
+ sort: 186
736
+ value: null
737
+ push_to_hub_token:
738
+ desc: ''
739
+ sort: 187
740
+ value: <PUSH_TO_HUB_TOKEN>
741
+ ray_scope:
742
+ desc: ''
743
+ sort: 192
744
+ value: last
745
+ remove_invalid_values:
746
+ desc: ''
747
+ sort: 56
748
+ value: false
749
+ remove_unused_columns:
750
+ desc: ''
751
+ sort: 145
752
+ value: false
753
+ repetition_penalty:
754
+ desc: ''
755
+ sort: 46
756
+ value: 1.0
757
+ report_to:
758
+ desc: ''
759
+ sort: 164
760
+ value:
761
+ - swanlab
762
+ restore_callback_states_from_checkpoint:
763
+ desc: ''
764
+ sort: 118
765
+ value: false
766
+ resume_from_checkpoint:
767
+ desc: ''
768
+ sort: 173
769
+ value: null
770
+ return_dict:
771
+ desc: ''
772
+ sort: 20
773
+ value: true
774
+ return_dict_in_generate:
775
+ desc: ''
776
+ sort: 53
777
+ value: false
778
+ rms_norm_eps:
779
+ desc: ''
780
+ sort: 11
781
+ value: 1.0e-05
782
+ rope_scaling:
783
+ desc: ''
784
+ sort: 15
785
+ value:
786
+ factor: 8.0
787
+ high_freq_factor: 4.0
788
+ low_freq_factor: 1.0
789
+ original_max_position_embeddings: 8192
790
+ rope_type: llama3
791
+ rope_theta:
792
+ desc: ''
793
+ sort: 14
794
+ value: 500000.0
795
+ run_name:
796
+ desc: ''
797
+ sort: 143
798
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_r1_v2/v1-20250629-084541
799
+ save_on_each_node:
800
+ desc: ''
801
+ sort: 116
802
+ value: false
803
+ save_only_model:
804
+ desc: ''
805
+ sort: 117
806
+ value: false
807
+ save_safetensors:
808
+ desc: ''
809
+ sort: 115
810
+ value: true
811
+ save_steps:
812
+ desc: ''
813
+ sort: 113
814
+ value: 500
815
+ save_strategy:
816
+ desc: ''
817
+ sort: 112
818
+ value: steps
819
+ save_total_limit:
820
+ desc: ''
821
+ sort: 114
822
+ value: 1
823
+ seed:
824
+ desc: ''
825
+ sort: 122
826
+ value: 42
827
+ sep_token_id:
828
+ desc: ''
829
+ sort: 69
830
+ value: null
831
+ skip_memory_metrics:
832
+ desc: ''
833
+ sort: 170
834
+ value: true
835
+ sortish_sampler:
836
+ desc: ''
837
+ sort: 206
838
+ value: false
839
+ suppress_tokens:
840
+ desc: ''
841
+ sort: 58
842
+ value: null
843
+ task_specific_params:
844
+ desc: ''
845
+ sort: 71
846
+ value: null
847
+ temperature:
848
+ desc: ''
849
+ sort: 42
850
+ value: 1.0
851
+ tf32:
852
+ desc: ''
853
+ sort: 132
854
+ value: null
855
+ tf_legacy_loss:
856
+ desc: ''
857
+ sort: 26
858
+ value: false
859
+ tie_encoder_decoder:
860
+ desc: ''
861
+ sort: 34
862
+ value: false
863
+ tie_word_embeddings:
864
+ desc: ''
865
+ sort: 28
866
+ value: false
867
+ tokenizer_class:
868
+ desc: ''
869
+ sort: 64
870
+ value: null
871
+ top_k:
872
+ desc: ''
873
+ sort: 43
874
+ value: 50
875
+ top_p:
876
+ desc: ''
877
+ sort: 44
878
+ value: 1.0
879
+ torch_compile:
880
+ desc: ''
881
+ sort: 194
882
+ value: false
883
+ torch_compile_backend:
884
+ desc: ''
885
+ sort: 195
886
+ value: null
887
+ torch_compile_mode:
888
+ desc: ''
889
+ sort: 196
890
+ value: null
891
+ torch_dtype:
892
+ desc: ''
893
+ sort: 24
894
+ value: bfloat16
895
+ torch_empty_cache_steps:
896
+ desc: ''
897
+ sort: 91
898
+ value: null
899
+ torchdynamo:
900
+ desc: ''
901
+ sort: 191
902
+ value: null
903
+ torchscript:
904
+ desc: ''
905
+ sort: 23
906
+ value: false
907
+ tp_size:
908
+ desc: ''
909
+ sort: 154
910
+ value: 0
911
+ tpu_metrics_debug:
912
+ desc: ''
913
+ sort: 136
914
+ value: false
915
+ tpu_num_cores:
916
+ desc: ''
917
+ sort: 135
918
+ value: null
919
+ train_dataloader_shuffle:
920
+ desc: ''
921
+ sort: 214
922
+ value: true
923
+ train_type:
924
+ desc: ''
925
+ sort: 229
926
+ value: full
927
+ transformers_version:
928
+ desc: ''
929
+ sort: 75
930
+ value: 4.51.3
931
+ typical_p:
932
+ desc: ''
933
+ sort: 45
934
+ value: 1.0
935
+ use_bfloat16:
936
+ desc: ''
937
+ sort: 25
938
+ value: false
939
+ use_cache:
940
+ desc: ''
941
+ sort: 13
942
+ value: false
943
+ use_cpu:
944
+ desc: ''
945
+ sort: 120
946
+ value: false
947
+ use_ipex:
948
+ desc: ''
949
+ sort: 125
950
+ value: false
951
+ use_legacy_prediction_loop:
952
+ desc: ''
953
+ sort: 171
954
+ value: false
955
+ use_liger_kernel:
956
+ desc: ''
957
+ sort: 203
958
+ value: false
959
+ use_logits_to_keep:
960
+ desc: ''
961
+ sort: 219
962
+ value: null
963
+ use_mps_device:
964
+ desc: ''
965
+ sort: 121
966
+ value: false
967
+ vit_gradient_checkpointing:
968
+ desc: ''
969
+ sort: 211
970
+ value: true
971
+ vit_lr:
972
+ desc: ''
973
+ sort: 217
974
+ value: null
975
+ vocab_size:
976
+ desc: ''
977
+ sort: 2
978
+ value: 128256
979
+ warmup_ratio:
980
+ desc: ''
981
+ sort: 102
982
+ value: 0.05
983
+ warmup_steps:
984
+ desc: ''
985
+ sort: 103
986
+ value: 0
987
+ weight_decay:
988
+ desc: ''
989
+ sort: 93
990
+ value: 0.0001
swanlog/run-20250629_084639-a3b1799d/files/requirements.txt ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.2.1
2
+ accelerate==1.6.0
3
+ addict==2.4.0
4
+ aiofiles==24.1.0
5
+ aiohappyeyeballs==2.6.1
6
+ aiohttp==3.11.14
7
+ aiosignal==1.3.2
8
+ airportsdata==20250224
9
+ aliyun-python-sdk-core==2.16.0
10
+ aliyun-python-sdk-kms==2.16.5
11
+ altair==5.5.0
12
+ annotated-types==0.7.0
13
+ antlr4-python3-runtime==4.13.2
14
+ anyio==4.9.0
15
+ astor==0.8.1
16
+ async-timeout==5.0.1
17
+ attrdict==2.0.1
18
+ attrs==25.3.0
19
+ av==14.3.0
20
+ beautifulsoup4==4.13.3
21
+ binpacking==1.5.2
22
+ bitsandbytes==0.45.5
23
+ blake3==1.0.4
24
+ blinker==1.9.0
25
+ boto3==1.38.46
26
+ botocore==1.38.46
27
+ cachetools==5.5.2
28
+ certifi==2025.1.31
29
+ cffi==1.17.1
30
+ charset-normalizer==3.4.1
31
+ click==8.1.8
32
+ cloudpickle==3.1.1
33
+ colorama==0.4.6
34
+ compressed-tensors==0.9.4
35
+ contourpy==1.3.1
36
+ cpm-kernels==1.0.11
37
+ crcmod==1.7
38
+ cryptography==44.0.3
39
+ cupy-cuda12x==13.4.1
40
+ cycler==0.12.1
41
+ dacite==1.9.2
42
+ dashscope==1.23.3
43
+ datasets==3.2.0
44
+ decord==0.6.0
45
+ deepspeed==0.16.5
46
+ Deprecated==1.2.18
47
+ depyf==0.18.0
48
+ dill==0.3.8
49
+ diskcache==5.6.3
50
+ distro==1.9.0
51
+ dnspython==2.7.0
52
+ docker-pycreds==0.4.0
53
+ einops==0.6.1
54
+ einops-exts==0.0.4
55
+ email_validator==2.2.0
56
+ entmax==1.3
57
+ et_xmlfile==2.0.0
58
+ exceptiongroup==1.2.2
59
+ fastapi==0.115.12
60
+ fastapi-cli==0.0.7
61
+ fastrlock==0.8.3
62
+ ffmpy==0.5.0
63
+ filelock==3.18.0
64
+ flash_attn==2.7.4.post1
65
+ fonttools==4.56.0
66
+ frozenlist==1.5.0
67
+ fsspec==2024.9.0
68
+ future==1.0.0
69
+ gdown==5.2.0
70
+ gguf==0.16.3
71
+ gitdb==4.0.12
72
+ GitPython==3.1.44
73
+ googleapis-common-protos==1.70.0
74
+ gradio==5.29.0
75
+ gradio_client==1.10.0
76
+ groovy==0.1.2
77
+ grpcio==1.71.0
78
+ h11==0.16.0
79
+ hf-xet==1.1.2
80
+ hjson==3.1.0
81
+ httpcore==1.0.9
82
+ httptools==0.6.4
83
+ httpx==0.28.1
84
+ huggingface-hub==0.32.2
85
+ idna==3.10
86
+ imageio==2.37.0
87
+ importlib_metadata==8.0.0
88
+ interegular==0.3.3
89
+ jieba==0.42.1
90
+ Jinja2==3.1.6
91
+ jiter==0.9.0
92
+ jmespath==0.10.0
93
+ joblib==1.4.2
94
+ jsonargparse==3.13.1
95
+ jsonschema==4.23.0
96
+ jsonschema-specifications==2024.10.1
97
+ kiwisolver==1.4.8
98
+ lark==1.2.2
99
+ latex2mathml==3.77.0
100
+ latex2sympy2_extended==1.10.1
101
+ lightning-utilities==0.14.3
102
+ linkify-it-py==2.0.3
103
+ llguidance==0.7.19
104
+ llvmlite==0.44.0
105
+ lm-format-enforcer==0.10.11
106
+ lxml==5.4.0
107
+ Markdown==3.7
108
+ markdown-it-py==2.2.0
109
+ markdown2==2.5.3
110
+ MarkupSafe==3.0.2
111
+ math-verify==0.7.0
112
+ matplotlib==3.10.1
113
+ mdit-py-plugins==0.3.3
114
+ mdurl==0.1.2
115
+ mistral_common==1.5.4
116
+ mmcls==0.25.0
117
+ mmcv==2.2.0
118
+ mmcv-full==1.6.2
119
+ mmengine==0.10.7
120
+ mmsegmentation==0.30.0
121
+ model-index==0.1.11
122
+ modelscope==1.25.0
123
+ mpmath==1.3.0
124
+ ms_swift==3.5.0
125
+ msgpack==1.1.0
126
+ msgspec==0.19.0
127
+ multidict==6.2.0
128
+ multiprocess==0.70.16
129
+ narwhals==1.32.0
130
+ nest-asyncio==1.6.0
131
+ networkx==3.4.2
132
+ ninja==1.11.1.4
133
+ nltk==3.9.1
134
+ numba==0.61.2
135
+ numpy==1.26.4
136
+ nvidia-cublas-cu12==12.6.4.1
137
+ nvidia-cuda-cupti-cu12==12.6.80
138
+ nvidia-cuda-nvrtc-cu12==12.6.77
139
+ nvidia-cuda-runtime-cu12==12.6.77
140
+ nvidia-cudnn-cu12==9.5.1.17
141
+ nvidia-cufft-cu12==11.3.0.4
142
+ nvidia-cufile-cu12==1.11.1.6
143
+ nvidia-curand-cu12==10.3.7.77
144
+ nvidia-cusolver-cu12==11.7.1.2
145
+ nvidia-cusparse-cu12==12.5.4.2
146
+ nvidia-cusparselt-cu12==0.6.3
147
+ nvidia-ml-py==12.575.51
148
+ nvidia-nccl-cu12==2.26.2
149
+ nvidia-nvjitlink-cu12==12.6.85
150
+ nvidia-nvtx-cu12==12.6.77
151
+ openai==1.77.0
152
+ opencv-python==4.11.0.86
153
+ opencv-python-headless==4.11.0.86
154
+ opendatalab==0.0.10
155
+ openmim==0.3.9
156
+ openpyxl==3.1.5
157
+ opentelemetry-api==1.26.0
158
+ opentelemetry-exporter-otlp==1.26.0
159
+ opentelemetry-exporter-otlp-proto-common==1.26.0
160
+ opentelemetry-exporter-otlp-proto-grpc==1.26.0
161
+ opentelemetry-exporter-otlp-proto-http==1.26.0
162
+ opentelemetry-proto==1.26.0
163
+ opentelemetry-sdk==1.26.0
164
+ opentelemetry-semantic-conventions==0.47b0
165
+ opentelemetry-semantic-conventions-ai==0.4.6
166
+ openxlab==0.0.11
167
+ ordered-set==4.1.0
168
+ orjson==3.10.16
169
+ oss2==2.19.1
170
+ outlines==0.1.11
171
+ outlines_core==0.1.26
172
+ packaging==24.2
173
+ pandas==2.2.3
174
+ partial-json-parser==0.2.1.1.post5
175
+ peft==0.15.2
176
+ pillow==11.1.0
177
+ pip==25.0
178
+ platformdirs==4.3.7
179
+ portalocker==3.1.1
180
+ prettytable==3.16.0
181
+ prometheus_client==0.21.1
182
+ prometheus-fastapi-instrumentator==7.1.0
183
+ propcache==0.3.1
184
+ protobuf==4.25.7
185
+ psutil==7.0.0
186
+ py-cpuinfo==9.0.0
187
+ pyarrow==19.0.1
188
+ pycocoevalcap==1.2
189
+ pycocotools==2.0.8
190
+ pycountry==24.6.1
191
+ pycparser==2.22
192
+ pycryptodome==3.22.0
193
+ pydantic==2.11.1
194
+ pydantic_core==2.33.0
195
+ pydeck==0.9.1
196
+ pydub==0.25.1
197
+ pyecharts==2.0.8
198
+ Pygments==2.19.1
199
+ pynvml==12.0.0
200
+ pyparsing==3.2.3
201
+ PySocks==1.7.1
202
+ python-dateutil==2.9.0.post0
203
+ python-dotenv==1.1.0
204
+ python-json-logger==3.3.0
205
+ python-multipart==0.0.20
206
+ pytorch-lightning==2.5.1.post0
207
+ pytz==2025.2
208
+ PyYAML==6.0.2
209
+ pyzmq==26.4.0
210
+ qwen-vl-utils==0.0.11
211
+ ray==2.45.0
212
+ referencing==0.36.2
213
+ regex==2024.11.6
214
+ requests==2.32.3
215
+ rich==13.9.4
216
+ rich-toolkit==0.14.5
217
+ rouge==1.0.1
218
+ rpds-py==0.24.0
219
+ ruff==0.11.8
220
+ s3transfer==0.13.0
221
+ sacrebleu==2.5.1
222
+ safehttpx==0.1.6
223
+ safetensors==0.5.3
224
+ scikit-learn==1.6.1
225
+ scipy==1.15.2
226
+ semantic-version==2.10.0
227
+ sentencepiece==0.2.0
228
+ sentry-sdk==2.27.0
229
+ setproctitle==1.3.6
230
+ setuptools==69.5.1
231
+ shellingham==1.5.4
232
+ shortuuid==1.0.13
233
+ simplejson==3.20.1
234
+ six==1.17.0
235
+ smmap==5.0.2
236
+ sniffio==1.3.1
237
+ sortedcontainers==2.4.0
238
+ soupsieve==2.6
239
+ starlette==0.46.1
240
+ streamlit==1.44.0
241
+ streamlit-image-select==0.6.0
242
+ svgwrite==1.4.3
243
+ swankit==0.2.4
244
+ swanlab==0.6.4
245
+ sympy==1.14.0
246
+ tabulate==0.9.0
247
+ tenacity==9.0.0
248
+ tensorboard==2.19.0
249
+ tensorboard-data-server==0.7.2
250
+ tensorboardX==2.6.2.2
251
+ termcolor==2.5.0
252
+ threadpoolctl==3.6.0
253
+ tiktoken==0.9.0
254
+ timm==0.9.12
255
+ tokenizers==0.21.1
256
+ toml==0.10.2
257
+ tomli==2.2.1
258
+ tomlkit==0.13.2
259
+ torch==2.7.0
260
+ torchaudio==2.7.0
261
+ torchmetrics==0.10.3
262
+ torchvision==0.22.0
263
+ tornado==6.4.2
264
+ tqdm==4.67.1
265
+ transformers==4.51.3
266
+ transformers-stream-generator==0.0.5
267
+ triton==3.3.0
268
+ trl==0.17.0
269
+ typer==0.15.3
270
+ typing_extensions==4.13.0
271
+ typing-inspection==0.4.0
272
+ tzdata==2025.2
273
+ uc-micro-py==1.0.3
274
+ unbabel-comet==2.2.6
275
+ urllib3==2.3.0
276
+ uvicorn==0.34.0
277
+ uvloop==0.21.0
278
+ vllm==0.9.0
279
+ wandb==0.20.1
280
+ watchdog==6.0.0
281
+ watchfiles==1.0.5
282
+ wavedrom==2.0.3.post3
283
+ wcwidth==0.2.13
284
+ websocket-client==1.8.0
285
+ websockets==15.0.1
286
+ Werkzeug==3.1.3
287
+ wheel==0.45.1
288
+ wrapt==1.17.2
289
+ xformers==0.0.30
290
+ xgrammar==0.1.19
291
+ xxhash==3.5.0
292
+ yacs==0.1.8
293
+ yapf==0.40.1
294
+ yarl==1.18.3
295
+ zipp==3.21.0
296
+ zstandard==0.23.0
swanlog/run-20250629_084639-a3b1799d/files/swanlab-metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 2000666, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset /mnt/data/users/liamding/data/sft_zh_tox/data/train_data/r1_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 1e-6 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 32768 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_r1_v2 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_084639-a3b1799d"}}
swanlog/run-20250629_090551-a3b1799d/backup.swanlab ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95d54e71f7495a22679cfd8ebffd788dba92519665b3f36af1291b916036f38d
3
+ size 825733
swanlog/run-20250629_090551-a3b1799d/files/config.yaml ADDED
@@ -0,0 +1,990 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FRAMEWORK:
2
+ desc: ''
3
+ sort: 1
4
+ value: 🤗transformers
5
+ UPPERFRAME:
6
+ desc: ''
7
+ sort: 0
8
+ value: 🐦‍⬛ms-swift
9
+ _attn_implementation_autoset:
10
+ desc: ''
11
+ sort: 74
12
+ value: true
13
+ _name_or_path:
14
+ desc: ''
15
+ sort: 73
16
+ value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct
17
+ acc_steps:
18
+ desc: ''
19
+ sort: 223
20
+ value: 1
21
+ acc_strategy:
22
+ desc: ''
23
+ sort: 213
24
+ value: token
25
+ accelerator_config:
26
+ desc: ''
27
+ sort: 156
28
+ value:
29
+ dispatch_batches: false
30
+ even_batches: true
31
+ gradient_accumulation_kwargs: null
32
+ non_blocking: false
33
+ split_batches: false
34
+ use_seedable_sampler: true
35
+ adafactor:
36
+ desc: ''
37
+ sort: 161
38
+ value: false
39
+ adam_beta1:
40
+ desc: ''
41
+ sort: 94
42
+ value: 0.9
43
+ adam_beta2:
44
+ desc: ''
45
+ sort: 95
46
+ value: 0.95
47
+ adam_epsilon:
48
+ desc: ''
49
+ sort: 96
50
+ value: 1.0e-08
51
+ add_cross_attention:
52
+ desc: ''
53
+ sort: 33
54
+ value: false
55
+ aligner_lr:
56
+ desc: ''
57
+ sort: 216
58
+ value: null
59
+ architectures:
60
+ desc: ''
61
+ sort: 60
62
+ value:
63
+ - LlamaForCausalLM
64
+ attention_bias:
65
+ desc: ''
66
+ sort: 16
67
+ value: false
68
+ attention_dropout:
69
+ desc: ''
70
+ sort: 17
71
+ value: 0.0
72
+ auto_find_batch_size:
73
+ desc: ''
74
+ sort: 189
75
+ value: false
76
+ average_tokens_across_devices:
77
+ desc: ''
78
+ sort: 205
79
+ value: false
80
+ bad_words_ids:
81
+ desc: ''
82
+ sort: 50
83
+ value: null
84
+ batch_eval_metrics:
85
+ desc: ''
86
+ sort: 201
87
+ value: false
88
+ begin_suppress_tokens:
89
+ desc: ''
90
+ sort: 59
91
+ value: null
92
+ bf16:
93
+ desc: ''
94
+ sort: 126
95
+ value: true
96
+ bf16_full_eval:
97
+ desc: ''
98
+ sort: 130
99
+ value: false
100
+ bos_token_id:
101
+ desc: ''
102
+ sort: 66
103
+ value: 128000
104
+ channels:
105
+ desc: ''
106
+ sort: 220
107
+ value: null
108
+ check_model:
109
+ desc: ''
110
+ sort: 212
111
+ value: true
112
+ chunk_size_feed_forward:
113
+ desc: ''
114
+ sort: 29
115
+ value: 0
116
+ cross_attention_hidden_size:
117
+ desc: ''
118
+ sort: 32
119
+ value: null
120
+ data_seed:
121
+ desc: ''
122
+ sort: 123
123
+ value: 42
124
+ dataloader_drop_last:
125
+ desc: ''
126
+ sort: 138
127
+ value: false
128
+ dataloader_num_workers:
129
+ desc: ''
130
+ sort: 140
131
+ value: 4
132
+ dataloader_persistent_workers:
133
+ desc: ''
134
+ sort: 169
135
+ value: false
136
+ dataloader_pin_memory:
137
+ desc: ''
138
+ sort: 168
139
+ value: true
140
+ dataloader_prefetch_factor:
141
+ desc: ''
142
+ sort: 141
143
+ value: 10
144
+ ddp_backend:
145
+ desc: ''
146
+ sort: 134
147
+ value: null
148
+ ddp_broadcast_buffers:
149
+ desc: ''
150
+ sort: 167
151
+ value: null
152
+ ddp_bucket_cap_mb:
153
+ desc: ''
154
+ sort: 166
155
+ value: null
156
+ ddp_find_unused_parameters:
157
+ desc: ''
158
+ sort: 165
159
+ value: null
160
+ ddp_timeout:
161
+ desc: ''
162
+ sort: 193
163
+ value: 18000000
164
+ debug:
165
+ desc: ''
166
+ sort: 137
167
+ value: []
168
+ decoder_start_token_id:
169
+ desc: ''
170
+ sort: 70
171
+ value: null
172
+ deepspeed:
173
+ desc: ''
174
+ sort: 157
175
+ value:
176
+ bf16:
177
+ enabled: auto
178
+ fp16:
179
+ enabled: auto
180
+ hysteresis: 2
181
+ initial_scale_power: 16
182
+ loss_scale: 0
183
+ loss_scale_window: 1000
184
+ min_loss_scale: 1
185
+ gradient_accumulation_steps: auto
186
+ gradient_clipping: auto
187
+ steps_per_print: 2000
188
+ train_batch_size: auto
189
+ train_micro_batch_size_per_gpu: auto
190
+ wall_clock_breakdown: false
191
+ zero_optimization:
192
+ contiguous_gradients: true
193
+ offload_optimizer:
194
+ device: none
195
+ pin_memory: true
196
+ offload_param:
197
+ device: none
198
+ pin_memory: true
199
+ overlap_comm: false
200
+ reduce_bucket_size: auto
201
+ stage: 3
202
+ stage3_gather_16bit_weights_on_model_save: true
203
+ stage3_max_live_parameters: 1000000000.0
204
+ stage3_max_reuse_distance: 1000000000.0
205
+ stage3_param_persistence_threshold: auto
206
+ stage3_prefetch_bucket_size: auto
207
+ sub_group_size: 1000000000.0
208
+ zero_quantized_gradients: false
209
+ zero_quantized_weights: false
210
+ disable_tqdm:
211
+ desc: ''
212
+ sort: 144
213
+ value: false
214
+ diversity_penalty:
215
+ desc: ''
216
+ sort: 41
217
+ value: 0.0
218
+ do_eval:
219
+ desc: ''
220
+ sort: 80
221
+ value: true
222
+ do_predict:
223
+ desc: ''
224
+ sort: 81
225
+ value: false
226
+ do_sample:
227
+ desc: ''
228
+ sort: 37
229
+ value: false
230
+ do_train:
231
+ desc: ''
232
+ sort: 79
233
+ value: false
234
+ early_stopping:
235
+ desc: ''
236
+ sort: 38
237
+ value: false
238
+ encoder_no_repeat_ngram_size:
239
+ desc: ''
240
+ sort: 49
241
+ value: 0
242
+ eos_token_id:
243
+ desc: ''
244
+ sort: 68
245
+ value:
246
+ - 128001
247
+ - 128008
248
+ - 128009
249
+ eval_accumulation_steps:
250
+ desc: ''
251
+ sort: 89
252
+ value: null
253
+ eval_datasets:
254
+ desc: ''
255
+ sort: 225
256
+ value: []
257
+ eval_datasets_args:
258
+ desc: ''
259
+ sort: 227
260
+ value: null
261
+ eval_delay:
262
+ desc: ''
263
+ sort: 90
264
+ value: 0
265
+ eval_do_concat_batches:
266
+ desc: ''
267
+ sort: 183
268
+ value: true
269
+ eval_generation_config:
270
+ desc: ''
271
+ sort: 228
272
+ value: null
273
+ eval_limit:
274
+ desc: ''
275
+ sort: 226
276
+ value: null
277
+ eval_on_start:
278
+ desc: ''
279
+ sort: 202
280
+ value: false
281
+ eval_steps:
282
+ desc: ''
283
+ sort: 139
284
+ value: null
285
+ eval_strategy:
286
+ desc: ''
287
+ sort: 82
288
+ value: epoch
289
+ eval_use_evalscope:
290
+ desc: ''
291
+ sort: 224
292
+ value: false
293
+ eval_use_gather_object:
294
+ desc: ''
295
+ sort: 204
296
+ value: false
297
+ exponential_decay_length_penalty:
298
+ desc: ''
299
+ sort: 57
300
+ value: null
301
+ finetuning_task:
302
+ desc: ''
303
+ sort: 61
304
+ value: null
305
+ forced_bos_token_id:
306
+ desc: ''
307
+ sort: 54
308
+ value: null
309
+ forced_eos_token_id:
310
+ desc: ''
311
+ sort: 55
312
+ value: null
313
+ fp16:
314
+ desc: ''
315
+ sort: 127
316
+ value: false
317
+ fp16_backend:
318
+ desc: ''
319
+ sort: 184
320
+ value: auto
321
+ fp16_full_eval:
322
+ desc: ''
323
+ sort: 131
324
+ value: false
325
+ fp16_opt_level:
326
+ desc: ''
327
+ sort: 128
328
+ value: O1
329
+ fsdp:
330
+ desc: ''
331
+ sort: 151
332
+ value: []
333
+ fsdp_config:
334
+ desc: ''
335
+ sort: 153
336
+ value:
337
+ min_num_params: 0
338
+ xla: false
339
+ xla_fsdp_grad_ckpt: false
340
+ xla_fsdp_v2: false
341
+ fsdp_min_num_params:
342
+ desc: ''
343
+ sort: 152
344
+ value: 0
345
+ fsdp_num:
346
+ desc: ''
347
+ sort: 222
348
+ value: 1
349
+ fsdp_transformer_layer_cls_to_wrap:
350
+ desc: ''
351
+ sort: 155
352
+ value: null
353
+ full_determinism:
354
+ desc: ''
355
+ sort: 190
356
+ value: false
357
+ galore_config:
358
+ desc: ''
359
+ sort: 231
360
+ value: null
361
+ generation_config:
362
+ desc: ''
363
+ sort: 210
364
+ value: null
365
+ generation_max_length:
366
+ desc: ''
367
+ sort: 208
368
+ value: null
369
+ generation_num_beams:
370
+ desc: ''
371
+ sort: 209
372
+ value: null
373
+ gradient_accumulation_steps:
374
+ desc: ''
375
+ sort: 88
376
+ value: 2
377
+ gradient_checkpointing:
378
+ desc: ''
379
+ sort: 179
380
+ value: false
381
+ gradient_checkpointing_kwargs:
382
+ desc: ''
383
+ sort: 180
384
+ value: null
385
+ greater_is_better:
386
+ desc: ''
387
+ sort: 149
388
+ value: false
389
+ group_by_length:
390
+ desc: ''
391
+ sort: 162
392
+ value: false
393
+ half_precision_backend:
394
+ desc: ''
395
+ sort: 129
396
+ value: auto
397
+ head_dim:
398
+ desc: ''
399
+ sort: 19
400
+ value: 128
401
+ hidden_act:
402
+ desc: ''
403
+ sort: 9
404
+ value: silu
405
+ hidden_size:
406
+ desc: ''
407
+ sort: 4
408
+ value: 4096
409
+ hub_always_push:
410
+ desc: ''
411
+ sort: 178
412
+ value: false
413
+ hub_model_id:
414
+ desc: ''
415
+ sort: 174
416
+ value: null
417
+ hub_private_repo:
418
+ desc: ''
419
+ sort: 177
420
+ value: null
421
+ hub_strategy:
422
+ desc: ''
423
+ sort: 175
424
+ value: every_save
425
+ hub_token:
426
+ desc: ''
427
+ sort: 176
428
+ value: <HUB_TOKEN>
429
+ id2label:
430
+ desc: ''
431
+ sort: 62
432
+ value:
433
+ '0': LABEL_0
434
+ '1': LABEL_1
435
+ ignore_data_skip:
436
+ desc: ''
437
+ sort: 150
438
+ value: false
439
+ include_for_metrics:
440
+ desc: ''
441
+ sort: 182
442
+ value: []
443
+ include_inputs_for_metrics:
444
+ desc: ''
445
+ sort: 181
446
+ value: false
447
+ include_num_input_tokens_seen:
448
+ desc: ''
449
+ sort: 198
450
+ value: false
451
+ include_tokens_per_second:
452
+ desc: ''
453
+ sort: 197
454
+ value: false
455
+ initializer_range:
456
+ desc: ''
457
+ sort: 10
458
+ value: 0.02
459
+ intermediate_size:
460
+ desc: ''
461
+ sort: 5
462
+ value: 14336
463
+ is_decoder:
464
+ desc: ''
465
+ sort: 31
466
+ value: false
467
+ is_encoder_decoder:
468
+ desc: ''
469
+ sort: 30
470
+ value: false
471
+ jit_mode_eval:
472
+ desc: ''
473
+ sort: 124
474
+ value: false
475
+ label2id:
476
+ desc: ''
477
+ sort: 63
478
+ value:
479
+ LABEL_0: 0
480
+ LABEL_1: 1
481
+ label_names:
482
+ desc: ''
483
+ sort: 146
484
+ value: null
485
+ label_smoothing_factor:
486
+ desc: ''
487
+ sort: 158
488
+ value: 0.0
489
+ learning_rate:
490
+ desc: ''
491
+ sort: 92
492
+ value: 1.0e-05
493
+ length_column_name:
494
+ desc: ''
495
+ sort: 163
496
+ value: length
497
+ length_penalty:
498
+ desc: ''
499
+ sort: 47
500
+ value: 1.0
501
+ load_best_model_at_end:
502
+ desc: ''
503
+ sort: 147
504
+ value: false
505
+ local_rank:
506
+ desc: ''
507
+ sort: 133
508
+ value: 0
509
+ local_repo_path:
510
+ desc: ''
511
+ sort: 230
512
+ value: null
513
+ log_level:
514
+ desc: ''
515
+ sort: 104
516
+ value: passive
517
+ log_level_replica:
518
+ desc: ''
519
+ sort: 105
520
+ value: warning
521
+ log_on_each_node:
522
+ desc: ''
523
+ sort: 106
524
+ value: true
525
+ logging_dir:
526
+ desc: ''
527
+ sort: 107
528
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal/v2-20250629-090453/runs
529
+ logging_first_step:
530
+ desc: ''
531
+ sort: 109
532
+ value: true
533
+ logging_nan_inf_filter:
534
+ desc: ''
535
+ sort: 111
536
+ value: true
537
+ logging_steps:
538
+ desc: ''
539
+ sort: 110
540
+ value: 1
541
+ logging_strategy:
542
+ desc: ''
543
+ sort: 108
544
+ value: steps
545
+ lr_scheduler_kwargs:
546
+ desc: ''
547
+ sort: 101
548
+ value: null
549
+ lr_scheduler_type:
550
+ desc: ''
551
+ sort: 100
552
+ value: cosine
553
+ max_epochs:
554
+ desc: ''
555
+ sort: 215
556
+ value: null
557
+ max_grad_norm:
558
+ desc: ''
559
+ sort: 97
560
+ value: 1.0
561
+ max_length:
562
+ desc: ''
563
+ sort: 35
564
+ value: 20
565
+ max_position_embeddings:
566
+ desc: ''
567
+ sort: 3
568
+ value: 131072
569
+ max_steps:
570
+ desc: ''
571
+ sort: 99
572
+ value: -1
573
+ metric_for_best_model:
574
+ desc: ''
575
+ sort: 148
576
+ value: loss
577
+ metric_warmup_step:
578
+ desc: ''
579
+ sort: 221
580
+ value: 0
581
+ min_length:
582
+ desc: ''
583
+ sort: 36
584
+ value: 0
585
+ mlp_bias:
586
+ desc: ''
587
+ sort: 18
588
+ value: false
589
+ model_num_parameters:
590
+ desc: ''
591
+ sort: 232
592
+ value: 0
593
+ model_type:
594
+ desc: ''
595
+ sort: 76
596
+ value: llama
597
+ mp_parameters:
598
+ desc: ''
599
+ sort: 188
600
+ value: ''
601
+ neftune_noise_alpha:
602
+ desc: ''
603
+ sort: 199
604
+ value: null
605
+ no_cuda:
606
+ desc: ''
607
+ sort: 119
608
+ value: false
609
+ no_repeat_ngram_size:
610
+ desc: ''
611
+ sort: 48
612
+ value: 0
613
+ num_attention_heads:
614
+ desc: ''
615
+ sort: 7
616
+ value: 32
617
+ num_beam_groups:
618
+ desc: ''
619
+ sort: 40
620
+ value: 1
621
+ num_beams:
622
+ desc: ''
623
+ sort: 39
624
+ value: 1
625
+ num_hidden_layers:
626
+ desc: ''
627
+ sort: 6
628
+ value: 32
629
+ num_key_value_heads:
630
+ desc: ''
631
+ sort: 8
632
+ value: 8
633
+ num_return_sequences:
634
+ desc: ''
635
+ sort: 51
636
+ value: 1
637
+ num_train_epochs:
638
+ desc: ''
639
+ sort: 98
640
+ value: 5.0
641
+ optim:
642
+ desc: ''
643
+ sort: 159
644
+ value: adamw_torch
645
+ optim_args:
646
+ desc: ''
647
+ sort: 160
648
+ value: null
649
+ optim_target_modules:
650
+ desc: ''
651
+ sort: 200
652
+ value: null
653
+ optimizer:
654
+ desc: ''
655
+ sort: 218
656
+ value: null
657
+ output_attentions:
658
+ desc: ''
659
+ sort: 22
660
+ value: false
661
+ output_dir:
662
+ desc: ''
663
+ sort: 77
664
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal/v2-20250629-090453
665
+ output_hidden_states:
666
+ desc: ''
667
+ sort: 21
668
+ value: false
669
+ output_scores:
670
+ desc: ''
671
+ sort: 52
672
+ value: false
673
+ overwrite_output_dir:
674
+ desc: ''
675
+ sort: 78
676
+ value: false
677
+ pad_token_id:
678
+ desc: ''
679
+ sort: 67
680
+ value: 128009
681
+ past_index:
682
+ desc: ''
683
+ sort: 142
684
+ value: -1
685
+ per_device_eval_batch_size:
686
+ desc: ''
687
+ sort: 85
688
+ value: 2
689
+ per_device_train_batch_size:
690
+ desc: ''
691
+ sort: 84
692
+ value: 2
693
+ per_gpu_eval_batch_size:
694
+ desc: ''
695
+ sort: 87
696
+ value: null
697
+ per_gpu_train_batch_size:
698
+ desc: ''
699
+ sort: 86
700
+ value: null
701
+ predict_with_generate:
702
+ desc: ''
703
+ sort: 207
704
+ value: false
705
+ prediction_loss_only:
706
+ desc: ''
707
+ sort: 83
708
+ value: false
709
+ prefix:
710
+ desc: ''
711
+ sort: 65
712
+ value: null
713
+ pretraining_tp:
714
+ desc: ''
715
+ sort: 12
716
+ value: 1
717
+ problem_type:
718
+ desc: ''
719
+ sort: 72
720
+ value: null
721
+ pruned_heads:
722
+ desc: ''
723
+ sort: 27
724
+ value: {}
725
+ push_to_hub:
726
+ desc: ''
727
+ sort: 172
728
+ value: false
729
+ push_to_hub_model_id:
730
+ desc: ''
731
+ sort: 185
732
+ value: null
733
+ push_to_hub_organization:
734
+ desc: ''
735
+ sort: 186
736
+ value: null
737
+ push_to_hub_token:
738
+ desc: ''
739
+ sort: 187
740
+ value: <PUSH_TO_HUB_TOKEN>
741
+ ray_scope:
742
+ desc: ''
743
+ sort: 192
744
+ value: last
745
+ remove_invalid_values:
746
+ desc: ''
747
+ sort: 56
748
+ value: false
749
+ remove_unused_columns:
750
+ desc: ''
751
+ sort: 145
752
+ value: false
753
+ repetition_penalty:
754
+ desc: ''
755
+ sort: 46
756
+ value: 1.0
757
+ report_to:
758
+ desc: ''
759
+ sort: 164
760
+ value:
761
+ - swanlab
762
+ restore_callback_states_from_checkpoint:
763
+ desc: ''
764
+ sort: 118
765
+ value: false
766
+ resume_from_checkpoint:
767
+ desc: ''
768
+ sort: 173
769
+ value: null
770
+ return_dict:
771
+ desc: ''
772
+ sort: 20
773
+ value: true
774
+ return_dict_in_generate:
775
+ desc: ''
776
+ sort: 53
777
+ value: false
778
+ rms_norm_eps:
779
+ desc: ''
780
+ sort: 11
781
+ value: 1.0e-05
782
+ rope_scaling:
783
+ desc: ''
784
+ sort: 15
785
+ value:
786
+ factor: 8.0
787
+ high_freq_factor: 4.0
788
+ low_freq_factor: 1.0
789
+ original_max_position_embeddings: 8192
790
+ rope_type: llama3
791
+ rope_theta:
792
+ desc: ''
793
+ sort: 14
794
+ value: 500000.0
795
+ run_name:
796
+ desc: ''
797
+ sort: 143
798
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal/v2-20250629-090453
799
+ save_on_each_node:
800
+ desc: ''
801
+ sort: 116
802
+ value: false
803
+ save_only_model:
804
+ desc: ''
805
+ sort: 117
806
+ value: false
807
+ save_safetensors:
808
+ desc: ''
809
+ sort: 115
810
+ value: true
811
+ save_steps:
812
+ desc: ''
813
+ sort: 113
814
+ value: 500
815
+ save_strategy:
816
+ desc: ''
817
+ sort: 112
818
+ value: steps
819
+ save_total_limit:
820
+ desc: ''
821
+ sort: 114
822
+ value: 1
823
+ seed:
824
+ desc: ''
825
+ sort: 122
826
+ value: 42
827
+ sep_token_id:
828
+ desc: ''
829
+ sort: 69
830
+ value: null
831
+ skip_memory_metrics:
832
+ desc: ''
833
+ sort: 170
834
+ value: true
835
+ sortish_sampler:
836
+ desc: ''
837
+ sort: 206
838
+ value: false
839
+ suppress_tokens:
840
+ desc: ''
841
+ sort: 58
842
+ value: null
843
+ task_specific_params:
844
+ desc: ''
845
+ sort: 71
846
+ value: null
847
+ temperature:
848
+ desc: ''
849
+ sort: 42
850
+ value: 1.0
851
+ tf32:
852
+ desc: ''
853
+ sort: 132
854
+ value: null
855
+ tf_legacy_loss:
856
+ desc: ''
857
+ sort: 26
858
+ value: false
859
+ tie_encoder_decoder:
860
+ desc: ''
861
+ sort: 34
862
+ value: false
863
+ tie_word_embeddings:
864
+ desc: ''
865
+ sort: 28
866
+ value: false
867
+ tokenizer_class:
868
+ desc: ''
869
+ sort: 64
870
+ value: null
871
+ top_k:
872
+ desc: ''
873
+ sort: 43
874
+ value: 50
875
+ top_p:
876
+ desc: ''
877
+ sort: 44
878
+ value: 1.0
879
+ torch_compile:
880
+ desc: ''
881
+ sort: 194
882
+ value: false
883
+ torch_compile_backend:
884
+ desc: ''
885
+ sort: 195
886
+ value: null
887
+ torch_compile_mode:
888
+ desc: ''
889
+ sort: 196
890
+ value: null
891
+ torch_dtype:
892
+ desc: ''
893
+ sort: 24
894
+ value: bfloat16
895
+ torch_empty_cache_steps:
896
+ desc: ''
897
+ sort: 91
898
+ value: null
899
+ torchdynamo:
900
+ desc: ''
901
+ sort: 191
902
+ value: null
903
+ torchscript:
904
+ desc: ''
905
+ sort: 23
906
+ value: false
907
+ tp_size:
908
+ desc: ''
909
+ sort: 154
910
+ value: 0
911
+ tpu_metrics_debug:
912
+ desc: ''
913
+ sort: 136
914
+ value: false
915
+ tpu_num_cores:
916
+ desc: ''
917
+ sort: 135
918
+ value: null
919
+ train_dataloader_shuffle:
920
+ desc: ''
921
+ sort: 214
922
+ value: true
923
+ train_type:
924
+ desc: ''
925
+ sort: 229
926
+ value: full
927
+ transformers_version:
928
+ desc: ''
929
+ sort: 75
930
+ value: 4.51.3
931
+ typical_p:
932
+ desc: ''
933
+ sort: 45
934
+ value: 1.0
935
+ use_bfloat16:
936
+ desc: ''
937
+ sort: 25
938
+ value: false
939
+ use_cache:
940
+ desc: ''
941
+ sort: 13
942
+ value: false
943
+ use_cpu:
944
+ desc: ''
945
+ sort: 120
946
+ value: false
947
+ use_ipex:
948
+ desc: ''
949
+ sort: 125
950
+ value: false
951
+ use_legacy_prediction_loop:
952
+ desc: ''
953
+ sort: 171
954
+ value: false
955
+ use_liger_kernel:
956
+ desc: ''
957
+ sort: 203
958
+ value: false
959
+ use_logits_to_keep:
960
+ desc: ''
961
+ sort: 219
962
+ value: null
963
+ use_mps_device:
964
+ desc: ''
965
+ sort: 121
966
+ value: false
967
+ vit_gradient_checkpointing:
968
+ desc: ''
969
+ sort: 211
970
+ value: true
971
+ vit_lr:
972
+ desc: ''
973
+ sort: 217
974
+ value: null
975
+ vocab_size:
976
+ desc: ''
977
+ sort: 2
978
+ value: 128256
979
+ warmup_ratio:
980
+ desc: ''
981
+ sort: 102
982
+ value: 0.05
983
+ warmup_steps:
984
+ desc: ''
985
+ sort: 103
986
+ value: 0
987
+ weight_decay:
988
+ desc: ''
989
+ sort: 93
990
+ value: 0.0001
swanlog/run-20250629_090551-a3b1799d/files/requirements.txt ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.2.1
2
+ accelerate==1.6.0
3
+ addict==2.4.0
4
+ aiofiles==24.1.0
5
+ aiohappyeyeballs==2.6.1
6
+ aiohttp==3.11.14
7
+ aiosignal==1.3.2
8
+ airportsdata==20250224
9
+ aliyun-python-sdk-core==2.16.0
10
+ aliyun-python-sdk-kms==2.16.5
11
+ altair==5.5.0
12
+ annotated-types==0.7.0
13
+ antlr4-python3-runtime==4.13.2
14
+ anyio==4.9.0
15
+ astor==0.8.1
16
+ async-timeout==5.0.1
17
+ attrdict==2.0.1
18
+ attrs==25.3.0
19
+ av==14.3.0
20
+ beautifulsoup4==4.13.3
21
+ binpacking==1.5.2
22
+ bitsandbytes==0.45.5
23
+ blake3==1.0.4
24
+ blinker==1.9.0
25
+ boto3==1.38.46
26
+ botocore==1.38.46
27
+ cachetools==5.5.2
28
+ certifi==2025.1.31
29
+ cffi==1.17.1
30
+ charset-normalizer==3.4.1
31
+ click==8.1.8
32
+ cloudpickle==3.1.1
33
+ colorama==0.4.6
34
+ compressed-tensors==0.9.4
35
+ contourpy==1.3.1
36
+ cpm-kernels==1.0.11
37
+ crcmod==1.7
38
+ cryptography==44.0.3
39
+ cupy-cuda12x==13.4.1
40
+ cycler==0.12.1
41
+ dacite==1.9.2
42
+ dashscope==1.23.3
43
+ datasets==3.2.0
44
+ decord==0.6.0
45
+ deepspeed==0.16.5
46
+ Deprecated==1.2.18
47
+ depyf==0.18.0
48
+ dill==0.3.8
49
+ diskcache==5.6.3
50
+ distro==1.9.0
51
+ dnspython==2.7.0
52
+ docker-pycreds==0.4.0
53
+ einops==0.6.1
54
+ einops-exts==0.0.4
55
+ email_validator==2.2.0
56
+ entmax==1.3
57
+ et_xmlfile==2.0.0
58
+ exceptiongroup==1.2.2
59
+ fastapi==0.115.12
60
+ fastapi-cli==0.0.7
61
+ fastrlock==0.8.3
62
+ ffmpy==0.5.0
63
+ filelock==3.18.0
64
+ flash_attn==2.7.4.post1
65
+ fonttools==4.56.0
66
+ frozenlist==1.5.0
67
+ fsspec==2024.9.0
68
+ future==1.0.0
69
+ gdown==5.2.0
70
+ gguf==0.16.3
71
+ gitdb==4.0.12
72
+ GitPython==3.1.44
73
+ googleapis-common-protos==1.70.0
74
+ gradio==5.29.0
75
+ gradio_client==1.10.0
76
+ groovy==0.1.2
77
+ grpcio==1.71.0
78
+ h11==0.16.0
79
+ hf-xet==1.1.2
80
+ hjson==3.1.0
81
+ httpcore==1.0.9
82
+ httptools==0.6.4
83
+ httpx==0.28.1
84
+ huggingface-hub==0.32.2
85
+ idna==3.10
86
+ imageio==2.37.0
87
+ importlib_metadata==8.0.0
88
+ interegular==0.3.3
89
+ jieba==0.42.1
90
+ Jinja2==3.1.6
91
+ jiter==0.9.0
92
+ jmespath==0.10.0
93
+ joblib==1.4.2
94
+ jsonargparse==3.13.1
95
+ jsonschema==4.23.0
96
+ jsonschema-specifications==2024.10.1
97
+ kiwisolver==1.4.8
98
+ lark==1.2.2
99
+ latex2mathml==3.77.0
100
+ latex2sympy2_extended==1.10.1
101
+ lightning-utilities==0.14.3
102
+ linkify-it-py==2.0.3
103
+ llguidance==0.7.19
104
+ llvmlite==0.44.0
105
+ lm-format-enforcer==0.10.11
106
+ lxml==5.4.0
107
+ Markdown==3.7
108
+ markdown-it-py==2.2.0
109
+ markdown2==2.5.3
110
+ MarkupSafe==3.0.2
111
+ math-verify==0.7.0
112
+ matplotlib==3.10.1
113
+ mdit-py-plugins==0.3.3
114
+ mdurl==0.1.2
115
+ mistral_common==1.5.4
116
+ mmcls==0.25.0
117
+ mmcv==2.2.0
118
+ mmcv-full==1.6.2
119
+ mmengine==0.10.7
120
+ mmsegmentation==0.30.0
121
+ model-index==0.1.11
122
+ modelscope==1.25.0
123
+ mpmath==1.3.0
124
+ ms_swift==3.5.0
125
+ msgpack==1.1.0
126
+ msgspec==0.19.0
127
+ multidict==6.2.0
128
+ multiprocess==0.70.16
129
+ narwhals==1.32.0
130
+ nest-asyncio==1.6.0
131
+ networkx==3.4.2
132
+ ninja==1.11.1.4
133
+ nltk==3.9.1
134
+ numba==0.61.2
135
+ numpy==1.26.4
136
+ nvidia-cublas-cu12==12.6.4.1
137
+ nvidia-cuda-cupti-cu12==12.6.80
138
+ nvidia-cuda-nvrtc-cu12==12.6.77
139
+ nvidia-cuda-runtime-cu12==12.6.77
140
+ nvidia-cudnn-cu12==9.5.1.17
141
+ nvidia-cufft-cu12==11.3.0.4
142
+ nvidia-cufile-cu12==1.11.1.6
143
+ nvidia-curand-cu12==10.3.7.77
144
+ nvidia-cusolver-cu12==11.7.1.2
145
+ nvidia-cusparse-cu12==12.5.4.2
146
+ nvidia-cusparselt-cu12==0.6.3
147
+ nvidia-ml-py==12.575.51
148
+ nvidia-nccl-cu12==2.26.2
149
+ nvidia-nvjitlink-cu12==12.6.85
150
+ nvidia-nvtx-cu12==12.6.77
151
+ openai==1.77.0
152
+ opencv-python==4.11.0.86
153
+ opencv-python-headless==4.11.0.86
154
+ opendatalab==0.0.10
155
+ openmim==0.3.9
156
+ openpyxl==3.1.5
157
+ opentelemetry-api==1.26.0
158
+ opentelemetry-exporter-otlp==1.26.0
159
+ opentelemetry-exporter-otlp-proto-common==1.26.0
160
+ opentelemetry-exporter-otlp-proto-grpc==1.26.0
161
+ opentelemetry-exporter-otlp-proto-http==1.26.0
162
+ opentelemetry-proto==1.26.0
163
+ opentelemetry-sdk==1.26.0
164
+ opentelemetry-semantic-conventions==0.47b0
165
+ opentelemetry-semantic-conventions-ai==0.4.6
166
+ openxlab==0.0.11
167
+ ordered-set==4.1.0
168
+ orjson==3.10.16
169
+ oss2==2.19.1
170
+ outlines==0.1.11
171
+ outlines_core==0.1.26
172
+ packaging==24.2
173
+ pandas==2.2.3
174
+ partial-json-parser==0.2.1.1.post5
175
+ peft==0.15.2
176
+ pillow==11.1.0
177
+ pip==25.0
178
+ platformdirs==4.3.7
179
+ portalocker==3.1.1
180
+ prettytable==3.16.0
181
+ prometheus_client==0.21.1
182
+ prometheus-fastapi-instrumentator==7.1.0
183
+ propcache==0.3.1
184
+ protobuf==4.25.7
185
+ psutil==7.0.0
186
+ py-cpuinfo==9.0.0
187
+ pyarrow==19.0.1
188
+ pycocoevalcap==1.2
189
+ pycocotools==2.0.8
190
+ pycountry==24.6.1
191
+ pycparser==2.22
192
+ pycryptodome==3.22.0
193
+ pydantic==2.11.1
194
+ pydantic_core==2.33.0
195
+ pydeck==0.9.1
196
+ pydub==0.25.1
197
+ pyecharts==2.0.8
198
+ Pygments==2.19.1
199
+ pynvml==12.0.0
200
+ pyparsing==3.2.3
201
+ PySocks==1.7.1
202
+ python-dateutil==2.9.0.post0
203
+ python-dotenv==1.1.0
204
+ python-json-logger==3.3.0
205
+ python-multipart==0.0.20
206
+ pytorch-lightning==2.5.1.post0
207
+ pytz==2025.2
208
+ PyYAML==6.0.2
209
+ pyzmq==26.4.0
210
+ qwen-vl-utils==0.0.11
211
+ ray==2.45.0
212
+ referencing==0.36.2
213
+ regex==2024.11.6
214
+ requests==2.32.3
215
+ rich==13.9.4
216
+ rich-toolkit==0.14.5
217
+ rouge==1.0.1
218
+ rpds-py==0.24.0
219
+ ruff==0.11.8
220
+ s3transfer==0.13.0
221
+ sacrebleu==2.5.1
222
+ safehttpx==0.1.6
223
+ safetensors==0.5.3
224
+ scikit-learn==1.6.1
225
+ scipy==1.15.2
226
+ semantic-version==2.10.0
227
+ sentencepiece==0.2.0
228
+ sentry-sdk==2.27.0
229
+ setproctitle==1.3.6
230
+ setuptools==69.5.1
231
+ shellingham==1.5.4
232
+ shortuuid==1.0.13
233
+ simplejson==3.20.1
234
+ six==1.17.0
235
+ smmap==5.0.2
236
+ sniffio==1.3.1
237
+ sortedcontainers==2.4.0
238
+ soupsieve==2.6
239
+ starlette==0.46.1
240
+ streamlit==1.44.0
241
+ streamlit-image-select==0.6.0
242
+ svgwrite==1.4.3
243
+ swankit==0.2.4
244
+ swanlab==0.6.4
245
+ sympy==1.14.0
246
+ tabulate==0.9.0
247
+ tenacity==9.0.0
248
+ tensorboard==2.19.0
249
+ tensorboard-data-server==0.7.2
250
+ tensorboardX==2.6.2.2
251
+ termcolor==2.5.0
252
+ threadpoolctl==3.6.0
253
+ tiktoken==0.9.0
254
+ timm==0.9.12
255
+ tokenizers==0.21.1
256
+ toml==0.10.2
257
+ tomli==2.2.1
258
+ tomlkit==0.13.2
259
+ torch==2.7.0
260
+ torchaudio==2.7.0
261
+ torchmetrics==0.10.3
262
+ torchvision==0.22.0
263
+ tornado==6.4.2
264
+ tqdm==4.67.1
265
+ transformers==4.51.3
266
+ transformers-stream-generator==0.0.5
267
+ triton==3.3.0
268
+ trl==0.17.0
269
+ typer==0.15.3
270
+ typing_extensions==4.13.0
271
+ typing-inspection==0.4.0
272
+ tzdata==2025.2
273
+ uc-micro-py==1.0.3
274
+ unbabel-comet==2.2.6
275
+ urllib3==2.3.0
276
+ uvicorn==0.34.0
277
+ uvloop==0.21.0
278
+ vllm==0.9.0
279
+ wandb==0.20.1
280
+ watchdog==6.0.0
281
+ watchfiles==1.0.5
282
+ wavedrom==2.0.3.post3
283
+ wcwidth==0.2.13
284
+ websocket-client==1.8.0
285
+ websockets==15.0.1
286
+ Werkzeug==3.1.3
287
+ wheel==0.45.1
288
+ wrapt==1.17.2
289
+ xformers==0.0.30
290
+ xgrammar==0.1.19
291
+ xxhash==3.5.0
292
+ yacs==0.1.8
293
+ yapf==0.40.1
294
+ yarl==1.18.3
295
+ zipp==3.21.0
296
+ zstandard==0.23.0
swanlog/run-20250629_090551-a3b1799d/files/swanlab-metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 3395381, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset data/train_data/normal_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 1e-5 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 1024 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_normal --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_090551-a3b1799d"}}
swanlog/run-20250629_092305-a3b1799d/backup.swanlab ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9946f2b8edee4800761eff6a0659d712d57f79153999431b644f76de3feeb126
3
+ size 825740
swanlog/run-20250629_092305-a3b1799d/files/config.yaml ADDED
@@ -0,0 +1,990 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FRAMEWORK:
2
+ desc: ''
3
+ sort: 1
4
+ value: 🤗transformers
5
+ UPPERFRAME:
6
+ desc: ''
7
+ sort: 0
8
+ value: 🐦‍⬛ms-swift
9
+ _attn_implementation_autoset:
10
+ desc: ''
11
+ sort: 74
12
+ value: true
13
+ _name_or_path:
14
+ desc: ''
15
+ sort: 73
16
+ value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct
17
+ acc_steps:
18
+ desc: ''
19
+ sort: 223
20
+ value: 1
21
+ acc_strategy:
22
+ desc: ''
23
+ sort: 213
24
+ value: token
25
+ accelerator_config:
26
+ desc: ''
27
+ sort: 156
28
+ value:
29
+ dispatch_batches: false
30
+ even_batches: true
31
+ gradient_accumulation_kwargs: null
32
+ non_blocking: false
33
+ split_batches: false
34
+ use_seedable_sampler: true
35
+ adafactor:
36
+ desc: ''
37
+ sort: 161
38
+ value: false
39
+ adam_beta1:
40
+ desc: ''
41
+ sort: 94
42
+ value: 0.9
43
+ adam_beta2:
44
+ desc: ''
45
+ sort: 95
46
+ value: 0.95
47
+ adam_epsilon:
48
+ desc: ''
49
+ sort: 96
50
+ value: 1.0e-08
51
+ add_cross_attention:
52
+ desc: ''
53
+ sort: 33
54
+ value: false
55
+ aligner_lr:
56
+ desc: ''
57
+ sort: 216
58
+ value: null
59
+ architectures:
60
+ desc: ''
61
+ sort: 60
62
+ value:
63
+ - LlamaForCausalLM
64
+ attention_bias:
65
+ desc: ''
66
+ sort: 16
67
+ value: false
68
+ attention_dropout:
69
+ desc: ''
70
+ sort: 17
71
+ value: 0.0
72
+ auto_find_batch_size:
73
+ desc: ''
74
+ sort: 189
75
+ value: false
76
+ average_tokens_across_devices:
77
+ desc: ''
78
+ sort: 205
79
+ value: false
80
+ bad_words_ids:
81
+ desc: ''
82
+ sort: 50
83
+ value: null
84
+ batch_eval_metrics:
85
+ desc: ''
86
+ sort: 201
87
+ value: false
88
+ begin_suppress_tokens:
89
+ desc: ''
90
+ sort: 59
91
+ value: null
92
+ bf16:
93
+ desc: ''
94
+ sort: 126
95
+ value: true
96
+ bf16_full_eval:
97
+ desc: ''
98
+ sort: 130
99
+ value: false
100
+ bos_token_id:
101
+ desc: ''
102
+ sort: 66
103
+ value: 128000
104
+ channels:
105
+ desc: ''
106
+ sort: 220
107
+ value: null
108
+ check_model:
109
+ desc: ''
110
+ sort: 212
111
+ value: true
112
+ chunk_size_feed_forward:
113
+ desc: ''
114
+ sort: 29
115
+ value: 0
116
+ cross_attention_hidden_size:
117
+ desc: ''
118
+ sort: 32
119
+ value: null
120
+ data_seed:
121
+ desc: ''
122
+ sort: 123
123
+ value: 42
124
+ dataloader_drop_last:
125
+ desc: ''
126
+ sort: 138
127
+ value: false
128
+ dataloader_num_workers:
129
+ desc: ''
130
+ sort: 140
131
+ value: 4
132
+ dataloader_persistent_workers:
133
+ desc: ''
134
+ sort: 169
135
+ value: false
136
+ dataloader_pin_memory:
137
+ desc: ''
138
+ sort: 168
139
+ value: true
140
+ dataloader_prefetch_factor:
141
+ desc: ''
142
+ sort: 141
143
+ value: 10
144
+ ddp_backend:
145
+ desc: ''
146
+ sort: 134
147
+ value: null
148
+ ddp_broadcast_buffers:
149
+ desc: ''
150
+ sort: 167
151
+ value: null
152
+ ddp_bucket_cap_mb:
153
+ desc: ''
154
+ sort: 166
155
+ value: null
156
+ ddp_find_unused_parameters:
157
+ desc: ''
158
+ sort: 165
159
+ value: null
160
+ ddp_timeout:
161
+ desc: ''
162
+ sort: 193
163
+ value: 18000000
164
+ debug:
165
+ desc: ''
166
+ sort: 137
167
+ value: []
168
+ decoder_start_token_id:
169
+ desc: ''
170
+ sort: 70
171
+ value: null
172
+ deepspeed:
173
+ desc: ''
174
+ sort: 157
175
+ value:
176
+ bf16:
177
+ enabled: auto
178
+ fp16:
179
+ enabled: auto
180
+ hysteresis: 2
181
+ initial_scale_power: 16
182
+ loss_scale: 0
183
+ loss_scale_window: 1000
184
+ min_loss_scale: 1
185
+ gradient_accumulation_steps: auto
186
+ gradient_clipping: auto
187
+ steps_per_print: 2000
188
+ train_batch_size: auto
189
+ train_micro_batch_size_per_gpu: auto
190
+ wall_clock_breakdown: false
191
+ zero_optimization:
192
+ contiguous_gradients: true
193
+ offload_optimizer:
194
+ device: none
195
+ pin_memory: true
196
+ offload_param:
197
+ device: none
198
+ pin_memory: true
199
+ overlap_comm: false
200
+ reduce_bucket_size: auto
201
+ stage: 3
202
+ stage3_gather_16bit_weights_on_model_save: true
203
+ stage3_max_live_parameters: 1000000000.0
204
+ stage3_max_reuse_distance: 1000000000.0
205
+ stage3_param_persistence_threshold: auto
206
+ stage3_prefetch_bucket_size: auto
207
+ sub_group_size: 1000000000.0
208
+ zero_quantized_gradients: false
209
+ zero_quantized_weights: false
210
+ disable_tqdm:
211
+ desc: ''
212
+ sort: 144
213
+ value: false
214
+ diversity_penalty:
215
+ desc: ''
216
+ sort: 41
217
+ value: 0.0
218
+ do_eval:
219
+ desc: ''
220
+ sort: 80
221
+ value: true
222
+ do_predict:
223
+ desc: ''
224
+ sort: 81
225
+ value: false
226
+ do_sample:
227
+ desc: ''
228
+ sort: 37
229
+ value: false
230
+ do_train:
231
+ desc: ''
232
+ sort: 79
233
+ value: false
234
+ early_stopping:
235
+ desc: ''
236
+ sort: 38
237
+ value: false
238
+ encoder_no_repeat_ngram_size:
239
+ desc: ''
240
+ sort: 49
241
+ value: 0
242
+ eos_token_id:
243
+ desc: ''
244
+ sort: 68
245
+ value:
246
+ - 128001
247
+ - 128008
248
+ - 128009
249
+ eval_accumulation_steps:
250
+ desc: ''
251
+ sort: 89
252
+ value: null
253
+ eval_datasets:
254
+ desc: ''
255
+ sort: 225
256
+ value: []
257
+ eval_datasets_args:
258
+ desc: ''
259
+ sort: 227
260
+ value: null
261
+ eval_delay:
262
+ desc: ''
263
+ sort: 90
264
+ value: 0
265
+ eval_do_concat_batches:
266
+ desc: ''
267
+ sort: 183
268
+ value: true
269
+ eval_generation_config:
270
+ desc: ''
271
+ sort: 228
272
+ value: null
273
+ eval_limit:
274
+ desc: ''
275
+ sort: 226
276
+ value: null
277
+ eval_on_start:
278
+ desc: ''
279
+ sort: 202
280
+ value: false
281
+ eval_steps:
282
+ desc: ''
283
+ sort: 139
284
+ value: null
285
+ eval_strategy:
286
+ desc: ''
287
+ sort: 82
288
+ value: epoch
289
+ eval_use_evalscope:
290
+ desc: ''
291
+ sort: 224
292
+ value: false
293
+ eval_use_gather_object:
294
+ desc: ''
295
+ sort: 204
296
+ value: false
297
+ exponential_decay_length_penalty:
298
+ desc: ''
299
+ sort: 57
300
+ value: null
301
+ finetuning_task:
302
+ desc: ''
303
+ sort: 61
304
+ value: null
305
+ forced_bos_token_id:
306
+ desc: ''
307
+ sort: 54
308
+ value: null
309
+ forced_eos_token_id:
310
+ desc: ''
311
+ sort: 55
312
+ value: null
313
+ fp16:
314
+ desc: ''
315
+ sort: 127
316
+ value: false
317
+ fp16_backend:
318
+ desc: ''
319
+ sort: 184
320
+ value: auto
321
+ fp16_full_eval:
322
+ desc: ''
323
+ sort: 131
324
+ value: false
325
+ fp16_opt_level:
326
+ desc: ''
327
+ sort: 128
328
+ value: O1
329
+ fsdp:
330
+ desc: ''
331
+ sort: 151
332
+ value: []
333
+ fsdp_config:
334
+ desc: ''
335
+ sort: 153
336
+ value:
337
+ min_num_params: 0
338
+ xla: false
339
+ xla_fsdp_grad_ckpt: false
340
+ xla_fsdp_v2: false
341
+ fsdp_min_num_params:
342
+ desc: ''
343
+ sort: 152
344
+ value: 0
345
+ fsdp_num:
346
+ desc: ''
347
+ sort: 222
348
+ value: 1
349
+ fsdp_transformer_layer_cls_to_wrap:
350
+ desc: ''
351
+ sort: 155
352
+ value: null
353
+ full_determinism:
354
+ desc: ''
355
+ sort: 190
356
+ value: false
357
+ galore_config:
358
+ desc: ''
359
+ sort: 231
360
+ value: null
361
+ generation_config:
362
+ desc: ''
363
+ sort: 210
364
+ value: null
365
+ generation_max_length:
366
+ desc: ''
367
+ sort: 208
368
+ value: null
369
+ generation_num_beams:
370
+ desc: ''
371
+ sort: 209
372
+ value: null
373
+ gradient_accumulation_steps:
374
+ desc: ''
375
+ sort: 88
376
+ value: 2
377
+ gradient_checkpointing:
378
+ desc: ''
379
+ sort: 179
380
+ value: false
381
+ gradient_checkpointing_kwargs:
382
+ desc: ''
383
+ sort: 180
384
+ value: null
385
+ greater_is_better:
386
+ desc: ''
387
+ sort: 149
388
+ value: false
389
+ group_by_length:
390
+ desc: ''
391
+ sort: 162
392
+ value: false
393
+ half_precision_backend:
394
+ desc: ''
395
+ sort: 129
396
+ value: auto
397
+ head_dim:
398
+ desc: ''
399
+ sort: 19
400
+ value: 128
401
+ hidden_act:
402
+ desc: ''
403
+ sort: 9
404
+ value: silu
405
+ hidden_size:
406
+ desc: ''
407
+ sort: 4
408
+ value: 4096
409
+ hub_always_push:
410
+ desc: ''
411
+ sort: 178
412
+ value: false
413
+ hub_model_id:
414
+ desc: ''
415
+ sort: 174
416
+ value: null
417
+ hub_private_repo:
418
+ desc: ''
419
+ sort: 177
420
+ value: null
421
+ hub_strategy:
422
+ desc: ''
423
+ sort: 175
424
+ value: every_save
425
+ hub_token:
426
+ desc: ''
427
+ sort: 176
428
+ value: <HUB_TOKEN>
429
+ id2label:
430
+ desc: ''
431
+ sort: 62
432
+ value:
433
+ '0': LABEL_0
434
+ '1': LABEL_1
435
+ ignore_data_skip:
436
+ desc: ''
437
+ sort: 150
438
+ value: false
439
+ include_for_metrics:
440
+ desc: ''
441
+ sort: 182
442
+ value: []
443
+ include_inputs_for_metrics:
444
+ desc: ''
445
+ sort: 181
446
+ value: false
447
+ include_num_input_tokens_seen:
448
+ desc: ''
449
+ sort: 198
450
+ value: false
451
+ include_tokens_per_second:
452
+ desc: ''
453
+ sort: 197
454
+ value: false
455
+ initializer_range:
456
+ desc: ''
457
+ sort: 10
458
+ value: 0.02
459
+ intermediate_size:
460
+ desc: ''
461
+ sort: 5
462
+ value: 14336
463
+ is_decoder:
464
+ desc: ''
465
+ sort: 31
466
+ value: false
467
+ is_encoder_decoder:
468
+ desc: ''
469
+ sort: 30
470
+ value: false
471
+ jit_mode_eval:
472
+ desc: ''
473
+ sort: 124
474
+ value: false
475
+ label2id:
476
+ desc: ''
477
+ sort: 63
478
+ value:
479
+ LABEL_0: 0
480
+ LABEL_1: 1
481
+ label_names:
482
+ desc: ''
483
+ sort: 146
484
+ value: null
485
+ label_smoothing_factor:
486
+ desc: ''
487
+ sort: 158
488
+ value: 0.0
489
+ learning_rate:
490
+ desc: ''
491
+ sort: 92
492
+ value: 5.0e-06
493
+ length_column_name:
494
+ desc: ''
495
+ sort: 163
496
+ value: length
497
+ length_penalty:
498
+ desc: ''
499
+ sort: 47
500
+ value: 1.0
501
+ load_best_model_at_end:
502
+ desc: ''
503
+ sort: 147
504
+ value: false
505
+ local_rank:
506
+ desc: ''
507
+ sort: 133
508
+ value: 0
509
+ local_repo_path:
510
+ desc: ''
511
+ sort: 230
512
+ value: null
513
+ log_level:
514
+ desc: ''
515
+ sort: 104
516
+ value: passive
517
+ log_level_replica:
518
+ desc: ''
519
+ sort: 105
520
+ value: warning
521
+ log_on_each_node:
522
+ desc: ''
523
+ sort: 106
524
+ value: true
525
+ logging_dir:
526
+ desc: ''
527
+ sort: 107
528
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v2/v0-20250629-092206/runs
529
+ logging_first_step:
530
+ desc: ''
531
+ sort: 109
532
+ value: true
533
+ logging_nan_inf_filter:
534
+ desc: ''
535
+ sort: 111
536
+ value: true
537
+ logging_steps:
538
+ desc: ''
539
+ sort: 110
540
+ value: 1
541
+ logging_strategy:
542
+ desc: ''
543
+ sort: 108
544
+ value: steps
545
+ lr_scheduler_kwargs:
546
+ desc: ''
547
+ sort: 101
548
+ value: null
549
+ lr_scheduler_type:
550
+ desc: ''
551
+ sort: 100
552
+ value: cosine
553
+ max_epochs:
554
+ desc: ''
555
+ sort: 215
556
+ value: null
557
+ max_grad_norm:
558
+ desc: ''
559
+ sort: 97
560
+ value: 1.0
561
+ max_length:
562
+ desc: ''
563
+ sort: 35
564
+ value: 20
565
+ max_position_embeddings:
566
+ desc: ''
567
+ sort: 3
568
+ value: 131072
569
+ max_steps:
570
+ desc: ''
571
+ sort: 99
572
+ value: -1
573
+ metric_for_best_model:
574
+ desc: ''
575
+ sort: 148
576
+ value: loss
577
+ metric_warmup_step:
578
+ desc: ''
579
+ sort: 221
580
+ value: 0
581
+ min_length:
582
+ desc: ''
583
+ sort: 36
584
+ value: 0
585
+ mlp_bias:
586
+ desc: ''
587
+ sort: 18
588
+ value: false
589
+ model_num_parameters:
590
+ desc: ''
591
+ sort: 232
592
+ value: 0
593
+ model_type:
594
+ desc: ''
595
+ sort: 76
596
+ value: llama
597
+ mp_parameters:
598
+ desc: ''
599
+ sort: 188
600
+ value: ''
601
+ neftune_noise_alpha:
602
+ desc: ''
603
+ sort: 199
604
+ value: null
605
+ no_cuda:
606
+ desc: ''
607
+ sort: 119
608
+ value: false
609
+ no_repeat_ngram_size:
610
+ desc: ''
611
+ sort: 48
612
+ value: 0
613
+ num_attention_heads:
614
+ desc: ''
615
+ sort: 7
616
+ value: 32
617
+ num_beam_groups:
618
+ desc: ''
619
+ sort: 40
620
+ value: 1
621
+ num_beams:
622
+ desc: ''
623
+ sort: 39
624
+ value: 1
625
+ num_hidden_layers:
626
+ desc: ''
627
+ sort: 6
628
+ value: 32
629
+ num_key_value_heads:
630
+ desc: ''
631
+ sort: 8
632
+ value: 8
633
+ num_return_sequences:
634
+ desc: ''
635
+ sort: 51
636
+ value: 1
637
+ num_train_epochs:
638
+ desc: ''
639
+ sort: 98
640
+ value: 5.0
641
+ optim:
642
+ desc: ''
643
+ sort: 159
644
+ value: adamw_torch
645
+ optim_args:
646
+ desc: ''
647
+ sort: 160
648
+ value: null
649
+ optim_target_modules:
650
+ desc: ''
651
+ sort: 200
652
+ value: null
653
+ optimizer:
654
+ desc: ''
655
+ sort: 218
656
+ value: null
657
+ output_attentions:
658
+ desc: ''
659
+ sort: 22
660
+ value: false
661
+ output_dir:
662
+ desc: ''
663
+ sort: 77
664
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v2/v0-20250629-092206
665
+ output_hidden_states:
666
+ desc: ''
667
+ sort: 21
668
+ value: false
669
+ output_scores:
670
+ desc: ''
671
+ sort: 52
672
+ value: false
673
+ overwrite_output_dir:
674
+ desc: ''
675
+ sort: 78
676
+ value: false
677
+ pad_token_id:
678
+ desc: ''
679
+ sort: 67
680
+ value: 128009
681
+ past_index:
682
+ desc: ''
683
+ sort: 142
684
+ value: -1
685
+ per_device_eval_batch_size:
686
+ desc: ''
687
+ sort: 85
688
+ value: 2
689
+ per_device_train_batch_size:
690
+ desc: ''
691
+ sort: 84
692
+ value: 2
693
+ per_gpu_eval_batch_size:
694
+ desc: ''
695
+ sort: 87
696
+ value: null
697
+ per_gpu_train_batch_size:
698
+ desc: ''
699
+ sort: 86
700
+ value: null
701
+ predict_with_generate:
702
+ desc: ''
703
+ sort: 207
704
+ value: false
705
+ prediction_loss_only:
706
+ desc: ''
707
+ sort: 83
708
+ value: false
709
+ prefix:
710
+ desc: ''
711
+ sort: 65
712
+ value: null
713
+ pretraining_tp:
714
+ desc: ''
715
+ sort: 12
716
+ value: 1
717
+ problem_type:
718
+ desc: ''
719
+ sort: 72
720
+ value: null
721
+ pruned_heads:
722
+ desc: ''
723
+ sort: 27
724
+ value: {}
725
+ push_to_hub:
726
+ desc: ''
727
+ sort: 172
728
+ value: false
729
+ push_to_hub_model_id:
730
+ desc: ''
731
+ sort: 185
732
+ value: null
733
+ push_to_hub_organization:
734
+ desc: ''
735
+ sort: 186
736
+ value: null
737
+ push_to_hub_token:
738
+ desc: ''
739
+ sort: 187
740
+ value: <PUSH_TO_HUB_TOKEN>
741
+ ray_scope:
742
+ desc: ''
743
+ sort: 192
744
+ value: last
745
+ remove_invalid_values:
746
+ desc: ''
747
+ sort: 56
748
+ value: false
749
+ remove_unused_columns:
750
+ desc: ''
751
+ sort: 145
752
+ value: false
753
+ repetition_penalty:
754
+ desc: ''
755
+ sort: 46
756
+ value: 1.0
757
+ report_to:
758
+ desc: ''
759
+ sort: 164
760
+ value:
761
+ - swanlab
762
+ restore_callback_states_from_checkpoint:
763
+ desc: ''
764
+ sort: 118
765
+ value: false
766
+ resume_from_checkpoint:
767
+ desc: ''
768
+ sort: 173
769
+ value: null
770
+ return_dict:
771
+ desc: ''
772
+ sort: 20
773
+ value: true
774
+ return_dict_in_generate:
775
+ desc: ''
776
+ sort: 53
777
+ value: false
778
+ rms_norm_eps:
779
+ desc: ''
780
+ sort: 11
781
+ value: 1.0e-05
782
+ rope_scaling:
783
+ desc: ''
784
+ sort: 15
785
+ value:
786
+ factor: 8.0
787
+ high_freq_factor: 4.0
788
+ low_freq_factor: 1.0
789
+ original_max_position_embeddings: 8192
790
+ rope_type: llama3
791
+ rope_theta:
792
+ desc: ''
793
+ sort: 14
794
+ value: 500000.0
795
+ run_name:
796
+ desc: ''
797
+ sort: 143
798
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v2/v0-20250629-092206
799
+ save_on_each_node:
800
+ desc: ''
801
+ sort: 116
802
+ value: false
803
+ save_only_model:
804
+ desc: ''
805
+ sort: 117
806
+ value: false
807
+ save_safetensors:
808
+ desc: ''
809
+ sort: 115
810
+ value: true
811
+ save_steps:
812
+ desc: ''
813
+ sort: 113
814
+ value: 500
815
+ save_strategy:
816
+ desc: ''
817
+ sort: 112
818
+ value: steps
819
+ save_total_limit:
820
+ desc: ''
821
+ sort: 114
822
+ value: 1
823
+ seed:
824
+ desc: ''
825
+ sort: 122
826
+ value: 42
827
+ sep_token_id:
828
+ desc: ''
829
+ sort: 69
830
+ value: null
831
+ skip_memory_metrics:
832
+ desc: ''
833
+ sort: 170
834
+ value: true
835
+ sortish_sampler:
836
+ desc: ''
837
+ sort: 206
838
+ value: false
839
+ suppress_tokens:
840
+ desc: ''
841
+ sort: 58
842
+ value: null
843
+ task_specific_params:
844
+ desc: ''
845
+ sort: 71
846
+ value: null
847
+ temperature:
848
+ desc: ''
849
+ sort: 42
850
+ value: 1.0
851
+ tf32:
852
+ desc: ''
853
+ sort: 132
854
+ value: null
855
+ tf_legacy_loss:
856
+ desc: ''
857
+ sort: 26
858
+ value: false
859
+ tie_encoder_decoder:
860
+ desc: ''
861
+ sort: 34
862
+ value: false
863
+ tie_word_embeddings:
864
+ desc: ''
865
+ sort: 28
866
+ value: false
867
+ tokenizer_class:
868
+ desc: ''
869
+ sort: 64
870
+ value: null
871
+ top_k:
872
+ desc: ''
873
+ sort: 43
874
+ value: 50
875
+ top_p:
876
+ desc: ''
877
+ sort: 44
878
+ value: 1.0
879
+ torch_compile:
880
+ desc: ''
881
+ sort: 194
882
+ value: false
883
+ torch_compile_backend:
884
+ desc: ''
885
+ sort: 195
886
+ value: null
887
+ torch_compile_mode:
888
+ desc: ''
889
+ sort: 196
890
+ value: null
891
+ torch_dtype:
892
+ desc: ''
893
+ sort: 24
894
+ value: bfloat16
895
+ torch_empty_cache_steps:
896
+ desc: ''
897
+ sort: 91
898
+ value: null
899
+ torchdynamo:
900
+ desc: ''
901
+ sort: 191
902
+ value: null
903
+ torchscript:
904
+ desc: ''
905
+ sort: 23
906
+ value: false
907
+ tp_size:
908
+ desc: ''
909
+ sort: 154
910
+ value: 0
911
+ tpu_metrics_debug:
912
+ desc: ''
913
+ sort: 136
914
+ value: false
915
+ tpu_num_cores:
916
+ desc: ''
917
+ sort: 135
918
+ value: null
919
+ train_dataloader_shuffle:
920
+ desc: ''
921
+ sort: 214
922
+ value: true
923
+ train_type:
924
+ desc: ''
925
+ sort: 229
926
+ value: full
927
+ transformers_version:
928
+ desc: ''
929
+ sort: 75
930
+ value: 4.51.3
931
+ typical_p:
932
+ desc: ''
933
+ sort: 45
934
+ value: 1.0
935
+ use_bfloat16:
936
+ desc: ''
937
+ sort: 25
938
+ value: false
939
+ use_cache:
940
+ desc: ''
941
+ sort: 13
942
+ value: false
943
+ use_cpu:
944
+ desc: ''
945
+ sort: 120
946
+ value: false
947
+ use_ipex:
948
+ desc: ''
949
+ sort: 125
950
+ value: false
951
+ use_legacy_prediction_loop:
952
+ desc: ''
953
+ sort: 171
954
+ value: false
955
+ use_liger_kernel:
956
+ desc: ''
957
+ sort: 203
958
+ value: false
959
+ use_logits_to_keep:
960
+ desc: ''
961
+ sort: 219
962
+ value: null
963
+ use_mps_device:
964
+ desc: ''
965
+ sort: 121
966
+ value: false
967
+ vit_gradient_checkpointing:
968
+ desc: ''
969
+ sort: 211
970
+ value: true
971
+ vit_lr:
972
+ desc: ''
973
+ sort: 217
974
+ value: null
975
+ vocab_size:
976
+ desc: ''
977
+ sort: 2
978
+ value: 128256
979
+ warmup_ratio:
980
+ desc: ''
981
+ sort: 102
982
+ value: 0.05
983
+ warmup_steps:
984
+ desc: ''
985
+ sort: 103
986
+ value: 0
987
+ weight_decay:
988
+ desc: ''
989
+ sort: 93
990
+ value: 0.0001
swanlog/run-20250629_092305-a3b1799d/files/requirements.txt ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.2.1
2
+ accelerate==1.6.0
3
+ addict==2.4.0
4
+ aiofiles==24.1.0
5
+ aiohappyeyeballs==2.6.1
6
+ aiohttp==3.11.14
7
+ aiosignal==1.3.2
8
+ airportsdata==20250224
9
+ aliyun-python-sdk-core==2.16.0
10
+ aliyun-python-sdk-kms==2.16.5
11
+ altair==5.5.0
12
+ annotated-types==0.7.0
13
+ antlr4-python3-runtime==4.13.2
14
+ anyio==4.9.0
15
+ astor==0.8.1
16
+ async-timeout==5.0.1
17
+ attrdict==2.0.1
18
+ attrs==25.3.0
19
+ av==14.3.0
20
+ beautifulsoup4==4.13.3
21
+ binpacking==1.5.2
22
+ bitsandbytes==0.45.5
23
+ blake3==1.0.4
24
+ blinker==1.9.0
25
+ boto3==1.38.46
26
+ botocore==1.38.46
27
+ cachetools==5.5.2
28
+ certifi==2025.1.31
29
+ cffi==1.17.1
30
+ charset-normalizer==3.4.1
31
+ click==8.1.8
32
+ cloudpickle==3.1.1
33
+ colorama==0.4.6
34
+ compressed-tensors==0.9.4
35
+ contourpy==1.3.1
36
+ cpm-kernels==1.0.11
37
+ crcmod==1.7
38
+ cryptography==44.0.3
39
+ cupy-cuda12x==13.4.1
40
+ cycler==0.12.1
41
+ dacite==1.9.2
42
+ dashscope==1.23.3
43
+ datasets==3.2.0
44
+ decord==0.6.0
45
+ deepspeed==0.16.5
46
+ Deprecated==1.2.18
47
+ depyf==0.18.0
48
+ dill==0.3.8
49
+ diskcache==5.6.3
50
+ distro==1.9.0
51
+ dnspython==2.7.0
52
+ docker-pycreds==0.4.0
53
+ einops==0.6.1
54
+ einops-exts==0.0.4
55
+ email_validator==2.2.0
56
+ entmax==1.3
57
+ et_xmlfile==2.0.0
58
+ exceptiongroup==1.2.2
59
+ fastapi==0.115.12
60
+ fastapi-cli==0.0.7
61
+ fastrlock==0.8.3
62
+ ffmpy==0.5.0
63
+ filelock==3.18.0
64
+ flash_attn==2.7.4.post1
65
+ fonttools==4.56.0
66
+ frozenlist==1.5.0
67
+ fsspec==2024.9.0
68
+ future==1.0.0
69
+ gdown==5.2.0
70
+ gguf==0.16.3
71
+ gitdb==4.0.12
72
+ GitPython==3.1.44
73
+ googleapis-common-protos==1.70.0
74
+ gradio==5.29.0
75
+ gradio_client==1.10.0
76
+ groovy==0.1.2
77
+ grpcio==1.71.0
78
+ h11==0.16.0
79
+ hf-xet==1.1.2
80
+ hjson==3.1.0
81
+ httpcore==1.0.9
82
+ httptools==0.6.4
83
+ httpx==0.28.1
84
+ huggingface-hub==0.32.2
85
+ idna==3.10
86
+ imageio==2.37.0
87
+ importlib_metadata==8.0.0
88
+ interegular==0.3.3
89
+ jieba==0.42.1
90
+ Jinja2==3.1.6
91
+ jiter==0.9.0
92
+ jmespath==0.10.0
93
+ joblib==1.4.2
94
+ jsonargparse==3.13.1
95
+ jsonschema==4.23.0
96
+ jsonschema-specifications==2024.10.1
97
+ kiwisolver==1.4.8
98
+ lark==1.2.2
99
+ latex2mathml==3.77.0
100
+ latex2sympy2_extended==1.10.1
101
+ lightning-utilities==0.14.3
102
+ linkify-it-py==2.0.3
103
+ llguidance==0.7.19
104
+ llvmlite==0.44.0
105
+ lm-format-enforcer==0.10.11
106
+ lxml==5.4.0
107
+ Markdown==3.7
108
+ markdown-it-py==2.2.0
109
+ markdown2==2.5.3
110
+ MarkupSafe==3.0.2
111
+ math-verify==0.7.0
112
+ matplotlib==3.10.1
113
+ mdit-py-plugins==0.3.3
114
+ mdurl==0.1.2
115
+ mistral_common==1.5.4
116
+ mmcls==0.25.0
117
+ mmcv==2.2.0
118
+ mmcv-full==1.6.2
119
+ mmengine==0.10.7
120
+ mmsegmentation==0.30.0
121
+ model-index==0.1.11
122
+ modelscope==1.25.0
123
+ mpmath==1.3.0
124
+ ms_swift==3.5.0
125
+ msgpack==1.1.0
126
+ msgspec==0.19.0
127
+ multidict==6.2.0
128
+ multiprocess==0.70.16
129
+ narwhals==1.32.0
130
+ nest-asyncio==1.6.0
131
+ networkx==3.4.2
132
+ ninja==1.11.1.4
133
+ nltk==3.9.1
134
+ numba==0.61.2
135
+ numpy==1.26.4
136
+ nvidia-cublas-cu12==12.6.4.1
137
+ nvidia-cuda-cupti-cu12==12.6.80
138
+ nvidia-cuda-nvrtc-cu12==12.6.77
139
+ nvidia-cuda-runtime-cu12==12.6.77
140
+ nvidia-cudnn-cu12==9.5.1.17
141
+ nvidia-cufft-cu12==11.3.0.4
142
+ nvidia-cufile-cu12==1.11.1.6
143
+ nvidia-curand-cu12==10.3.7.77
144
+ nvidia-cusolver-cu12==11.7.1.2
145
+ nvidia-cusparse-cu12==12.5.4.2
146
+ nvidia-cusparselt-cu12==0.6.3
147
+ nvidia-ml-py==12.575.51
148
+ nvidia-nccl-cu12==2.26.2
149
+ nvidia-nvjitlink-cu12==12.6.85
150
+ nvidia-nvtx-cu12==12.6.77
151
+ openai==1.77.0
152
+ opencv-python==4.11.0.86
153
+ opencv-python-headless==4.11.0.86
154
+ opendatalab==0.0.10
155
+ openmim==0.3.9
156
+ openpyxl==3.1.5
157
+ opentelemetry-api==1.26.0
158
+ opentelemetry-exporter-otlp==1.26.0
159
+ opentelemetry-exporter-otlp-proto-common==1.26.0
160
+ opentelemetry-exporter-otlp-proto-grpc==1.26.0
161
+ opentelemetry-exporter-otlp-proto-http==1.26.0
162
+ opentelemetry-proto==1.26.0
163
+ opentelemetry-sdk==1.26.0
164
+ opentelemetry-semantic-conventions==0.47b0
165
+ opentelemetry-semantic-conventions-ai==0.4.6
166
+ openxlab==0.0.11
167
+ ordered-set==4.1.0
168
+ orjson==3.10.16
169
+ oss2==2.19.1
170
+ outlines==0.1.11
171
+ outlines_core==0.1.26
172
+ packaging==24.2
173
+ pandas==2.2.3
174
+ partial-json-parser==0.2.1.1.post5
175
+ peft==0.15.2
176
+ pillow==11.1.0
177
+ pip==25.0
178
+ platformdirs==4.3.7
179
+ portalocker==3.1.1
180
+ prettytable==3.16.0
181
+ prometheus_client==0.21.1
182
+ prometheus-fastapi-instrumentator==7.1.0
183
+ propcache==0.3.1
184
+ protobuf==4.25.7
185
+ psutil==7.0.0
186
+ py-cpuinfo==9.0.0
187
+ pyarrow==19.0.1
188
+ pycocoevalcap==1.2
189
+ pycocotools==2.0.8
190
+ pycountry==24.6.1
191
+ pycparser==2.22
192
+ pycryptodome==3.22.0
193
+ pydantic==2.11.1
194
+ pydantic_core==2.33.0
195
+ pydeck==0.9.1
196
+ pydub==0.25.1
197
+ pyecharts==2.0.8
198
+ Pygments==2.19.1
199
+ pynvml==12.0.0
200
+ pyparsing==3.2.3
201
+ PySocks==1.7.1
202
+ python-dateutil==2.9.0.post0
203
+ python-dotenv==1.1.0
204
+ python-json-logger==3.3.0
205
+ python-multipart==0.0.20
206
+ pytorch-lightning==2.5.1.post0
207
+ pytz==2025.2
208
+ PyYAML==6.0.2
209
+ pyzmq==26.4.0
210
+ qwen-vl-utils==0.0.11
211
+ ray==2.45.0
212
+ referencing==0.36.2
213
+ regex==2024.11.6
214
+ requests==2.32.3
215
+ rich==13.9.4
216
+ rich-toolkit==0.14.5
217
+ rouge==1.0.1
218
+ rpds-py==0.24.0
219
+ ruff==0.11.8
220
+ s3transfer==0.13.0
221
+ sacrebleu==2.5.1
222
+ safehttpx==0.1.6
223
+ safetensors==0.5.3
224
+ scikit-learn==1.6.1
225
+ scipy==1.15.2
226
+ semantic-version==2.10.0
227
+ sentencepiece==0.2.0
228
+ sentry-sdk==2.27.0
229
+ setproctitle==1.3.6
230
+ setuptools==69.5.1
231
+ shellingham==1.5.4
232
+ shortuuid==1.0.13
233
+ simplejson==3.20.1
234
+ six==1.17.0
235
+ smmap==5.0.2
236
+ sniffio==1.3.1
237
+ sortedcontainers==2.4.0
238
+ soupsieve==2.6
239
+ starlette==0.46.1
240
+ streamlit==1.44.0
241
+ streamlit-image-select==0.6.0
242
+ svgwrite==1.4.3
243
+ swankit==0.2.4
244
+ swanlab==0.6.4
245
+ sympy==1.14.0
246
+ tabulate==0.9.0
247
+ tenacity==9.0.0
248
+ tensorboard==2.19.0
249
+ tensorboard-data-server==0.7.2
250
+ tensorboardX==2.6.2.2
251
+ termcolor==2.5.0
252
+ threadpoolctl==3.6.0
253
+ tiktoken==0.9.0
254
+ timm==0.9.12
255
+ tokenizers==0.21.1
256
+ toml==0.10.2
257
+ tomli==2.2.1
258
+ tomlkit==0.13.2
259
+ torch==2.7.0
260
+ torchaudio==2.7.0
261
+ torchmetrics==0.10.3
262
+ torchvision==0.22.0
263
+ tornado==6.4.2
264
+ tqdm==4.67.1
265
+ transformers==4.51.3
266
+ transformers-stream-generator==0.0.5
267
+ triton==3.3.0
268
+ trl==0.17.0
269
+ typer==0.15.3
270
+ typing_extensions==4.13.0
271
+ typing-inspection==0.4.0
272
+ tzdata==2025.2
273
+ uc-micro-py==1.0.3
274
+ unbabel-comet==2.2.6
275
+ urllib3==2.3.0
276
+ uvicorn==0.34.0
277
+ uvloop==0.21.0
278
+ vllm==0.9.0
279
+ wandb==0.20.1
280
+ watchdog==6.0.0
281
+ watchfiles==1.0.5
282
+ wavedrom==2.0.3.post3
283
+ wcwidth==0.2.13
284
+ websocket-client==1.8.0
285
+ websockets==15.0.1
286
+ Werkzeug==3.1.3
287
+ wheel==0.45.1
288
+ wrapt==1.17.2
289
+ xformers==0.0.30
290
+ xgrammar==0.1.19
291
+ xxhash==3.5.0
292
+ yacs==0.1.8
293
+ yapf==0.40.1
294
+ yarl==1.18.3
295
+ zipp==3.21.0
296
+ zstandard==0.23.0
swanlog/run-20250629_092305-a3b1799d/files/swanlab-metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 344856, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset data/train_data/normal_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 5e-6 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 1024 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_normal_v2 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_092305-a3b1799d"}}
swanlog/run-20250629_094047-a3b1799d/backup.swanlab ADDED
Binary file (34.9 kB). View file
 
swanlog/run-20250629_094047-a3b1799d/files/config.yaml ADDED
@@ -0,0 +1,990 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FRAMEWORK:
2
+ desc: ''
3
+ sort: 1
4
+ value: 🤗transformers
5
+ UPPERFRAME:
6
+ desc: ''
7
+ sort: 0
8
+ value: 🐦‍⬛ms-swift
9
+ _attn_implementation_autoset:
10
+ desc: ''
11
+ sort: 74
12
+ value: true
13
+ _name_or_path:
14
+ desc: ''
15
+ sort: 73
16
+ value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct
17
+ acc_steps:
18
+ desc: ''
19
+ sort: 223
20
+ value: 1
21
+ acc_strategy:
22
+ desc: ''
23
+ sort: 213
24
+ value: token
25
+ accelerator_config:
26
+ desc: ''
27
+ sort: 156
28
+ value:
29
+ dispatch_batches: false
30
+ even_batches: true
31
+ gradient_accumulation_kwargs: null
32
+ non_blocking: false
33
+ split_batches: false
34
+ use_seedable_sampler: true
35
+ adafactor:
36
+ desc: ''
37
+ sort: 161
38
+ value: false
39
+ adam_beta1:
40
+ desc: ''
41
+ sort: 94
42
+ value: 0.9
43
+ adam_beta2:
44
+ desc: ''
45
+ sort: 95
46
+ value: 0.95
47
+ adam_epsilon:
48
+ desc: ''
49
+ sort: 96
50
+ value: 1.0e-08
51
+ add_cross_attention:
52
+ desc: ''
53
+ sort: 33
54
+ value: false
55
+ aligner_lr:
56
+ desc: ''
57
+ sort: 216
58
+ value: null
59
+ architectures:
60
+ desc: ''
61
+ sort: 60
62
+ value:
63
+ - LlamaForCausalLM
64
+ attention_bias:
65
+ desc: ''
66
+ sort: 16
67
+ value: false
68
+ attention_dropout:
69
+ desc: ''
70
+ sort: 17
71
+ value: 0.0
72
+ auto_find_batch_size:
73
+ desc: ''
74
+ sort: 189
75
+ value: false
76
+ average_tokens_across_devices:
77
+ desc: ''
78
+ sort: 205
79
+ value: false
80
+ bad_words_ids:
81
+ desc: ''
82
+ sort: 50
83
+ value: null
84
+ batch_eval_metrics:
85
+ desc: ''
86
+ sort: 201
87
+ value: false
88
+ begin_suppress_tokens:
89
+ desc: ''
90
+ sort: 59
91
+ value: null
92
+ bf16:
93
+ desc: ''
94
+ sort: 126
95
+ value: true
96
+ bf16_full_eval:
97
+ desc: ''
98
+ sort: 130
99
+ value: false
100
+ bos_token_id:
101
+ desc: ''
102
+ sort: 66
103
+ value: 128000
104
+ channels:
105
+ desc: ''
106
+ sort: 220
107
+ value: null
108
+ check_model:
109
+ desc: ''
110
+ sort: 212
111
+ value: true
112
+ chunk_size_feed_forward:
113
+ desc: ''
114
+ sort: 29
115
+ value: 0
116
+ cross_attention_hidden_size:
117
+ desc: ''
118
+ sort: 32
119
+ value: null
120
+ data_seed:
121
+ desc: ''
122
+ sort: 123
123
+ value: 42
124
+ dataloader_drop_last:
125
+ desc: ''
126
+ sort: 138
127
+ value: false
128
+ dataloader_num_workers:
129
+ desc: ''
130
+ sort: 140
131
+ value: 4
132
+ dataloader_persistent_workers:
133
+ desc: ''
134
+ sort: 169
135
+ value: false
136
+ dataloader_pin_memory:
137
+ desc: ''
138
+ sort: 168
139
+ value: true
140
+ dataloader_prefetch_factor:
141
+ desc: ''
142
+ sort: 141
143
+ value: 10
144
+ ddp_backend:
145
+ desc: ''
146
+ sort: 134
147
+ value: null
148
+ ddp_broadcast_buffers:
149
+ desc: ''
150
+ sort: 167
151
+ value: null
152
+ ddp_bucket_cap_mb:
153
+ desc: ''
154
+ sort: 166
155
+ value: null
156
+ ddp_find_unused_parameters:
157
+ desc: ''
158
+ sort: 165
159
+ value: null
160
+ ddp_timeout:
161
+ desc: ''
162
+ sort: 193
163
+ value: 18000000
164
+ debug:
165
+ desc: ''
166
+ sort: 137
167
+ value: []
168
+ decoder_start_token_id:
169
+ desc: ''
170
+ sort: 70
171
+ value: null
172
+ deepspeed:
173
+ desc: ''
174
+ sort: 157
175
+ value:
176
+ bf16:
177
+ enabled: auto
178
+ fp16:
179
+ enabled: auto
180
+ hysteresis: 2
181
+ initial_scale_power: 16
182
+ loss_scale: 0
183
+ loss_scale_window: 1000
184
+ min_loss_scale: 1
185
+ gradient_accumulation_steps: auto
186
+ gradient_clipping: auto
187
+ steps_per_print: 2000
188
+ train_batch_size: auto
189
+ train_micro_batch_size_per_gpu: auto
190
+ wall_clock_breakdown: false
191
+ zero_optimization:
192
+ contiguous_gradients: true
193
+ offload_optimizer:
194
+ device: none
195
+ pin_memory: true
196
+ offload_param:
197
+ device: none
198
+ pin_memory: true
199
+ overlap_comm: false
200
+ reduce_bucket_size: auto
201
+ stage: 3
202
+ stage3_gather_16bit_weights_on_model_save: true
203
+ stage3_max_live_parameters: 1000000000.0
204
+ stage3_max_reuse_distance: 1000000000.0
205
+ stage3_param_persistence_threshold: auto
206
+ stage3_prefetch_bucket_size: auto
207
+ sub_group_size: 1000000000.0
208
+ zero_quantized_gradients: false
209
+ zero_quantized_weights: false
210
+ disable_tqdm:
211
+ desc: ''
212
+ sort: 144
213
+ value: false
214
+ diversity_penalty:
215
+ desc: ''
216
+ sort: 41
217
+ value: 0.0
218
+ do_eval:
219
+ desc: ''
220
+ sort: 80
221
+ value: true
222
+ do_predict:
223
+ desc: ''
224
+ sort: 81
225
+ value: false
226
+ do_sample:
227
+ desc: ''
228
+ sort: 37
229
+ value: false
230
+ do_train:
231
+ desc: ''
232
+ sort: 79
233
+ value: false
234
+ early_stopping:
235
+ desc: ''
236
+ sort: 38
237
+ value: false
238
+ encoder_no_repeat_ngram_size:
239
+ desc: ''
240
+ sort: 49
241
+ value: 0
242
+ eos_token_id:
243
+ desc: ''
244
+ sort: 68
245
+ value:
246
+ - 128001
247
+ - 128008
248
+ - 128009
249
+ eval_accumulation_steps:
250
+ desc: ''
251
+ sort: 89
252
+ value: null
253
+ eval_datasets:
254
+ desc: ''
255
+ sort: 225
256
+ value: []
257
+ eval_datasets_args:
258
+ desc: ''
259
+ sort: 227
260
+ value: null
261
+ eval_delay:
262
+ desc: ''
263
+ sort: 90
264
+ value: 0
265
+ eval_do_concat_batches:
266
+ desc: ''
267
+ sort: 183
268
+ value: true
269
+ eval_generation_config:
270
+ desc: ''
271
+ sort: 228
272
+ value: null
273
+ eval_limit:
274
+ desc: ''
275
+ sort: 226
276
+ value: null
277
+ eval_on_start:
278
+ desc: ''
279
+ sort: 202
280
+ value: false
281
+ eval_steps:
282
+ desc: ''
283
+ sort: 139
284
+ value: null
285
+ eval_strategy:
286
+ desc: ''
287
+ sort: 82
288
+ value: epoch
289
+ eval_use_evalscope:
290
+ desc: ''
291
+ sort: 224
292
+ value: false
293
+ eval_use_gather_object:
294
+ desc: ''
295
+ sort: 204
296
+ value: false
297
+ exponential_decay_length_penalty:
298
+ desc: ''
299
+ sort: 57
300
+ value: null
301
+ finetuning_task:
302
+ desc: ''
303
+ sort: 61
304
+ value: null
305
+ forced_bos_token_id:
306
+ desc: ''
307
+ sort: 54
308
+ value: null
309
+ forced_eos_token_id:
310
+ desc: ''
311
+ sort: 55
312
+ value: null
313
+ fp16:
314
+ desc: ''
315
+ sort: 127
316
+ value: false
317
+ fp16_backend:
318
+ desc: ''
319
+ sort: 184
320
+ value: auto
321
+ fp16_full_eval:
322
+ desc: ''
323
+ sort: 131
324
+ value: false
325
+ fp16_opt_level:
326
+ desc: ''
327
+ sort: 128
328
+ value: O1
329
+ fsdp:
330
+ desc: ''
331
+ sort: 151
332
+ value: []
333
+ fsdp_config:
334
+ desc: ''
335
+ sort: 153
336
+ value:
337
+ min_num_params: 0
338
+ xla: false
339
+ xla_fsdp_grad_ckpt: false
340
+ xla_fsdp_v2: false
341
+ fsdp_min_num_params:
342
+ desc: ''
343
+ sort: 152
344
+ value: 0
345
+ fsdp_num:
346
+ desc: ''
347
+ sort: 222
348
+ value: 1
349
+ fsdp_transformer_layer_cls_to_wrap:
350
+ desc: ''
351
+ sort: 155
352
+ value: null
353
+ full_determinism:
354
+ desc: ''
355
+ sort: 190
356
+ value: false
357
+ galore_config:
358
+ desc: ''
359
+ sort: 231
360
+ value: null
361
+ generation_config:
362
+ desc: ''
363
+ sort: 210
364
+ value: null
365
+ generation_max_length:
366
+ desc: ''
367
+ sort: 208
368
+ value: null
369
+ generation_num_beams:
370
+ desc: ''
371
+ sort: 209
372
+ value: null
373
+ gradient_accumulation_steps:
374
+ desc: ''
375
+ sort: 88
376
+ value: 2
377
+ gradient_checkpointing:
378
+ desc: ''
379
+ sort: 179
380
+ value: false
381
+ gradient_checkpointing_kwargs:
382
+ desc: ''
383
+ sort: 180
384
+ value: null
385
+ greater_is_better:
386
+ desc: ''
387
+ sort: 149
388
+ value: false
389
+ group_by_length:
390
+ desc: ''
391
+ sort: 162
392
+ value: false
393
+ half_precision_backend:
394
+ desc: ''
395
+ sort: 129
396
+ value: auto
397
+ head_dim:
398
+ desc: ''
399
+ sort: 19
400
+ value: 128
401
+ hidden_act:
402
+ desc: ''
403
+ sort: 9
404
+ value: silu
405
+ hidden_size:
406
+ desc: ''
407
+ sort: 4
408
+ value: 4096
409
+ hub_always_push:
410
+ desc: ''
411
+ sort: 178
412
+ value: false
413
+ hub_model_id:
414
+ desc: ''
415
+ sort: 174
416
+ value: null
417
+ hub_private_repo:
418
+ desc: ''
419
+ sort: 177
420
+ value: null
421
+ hub_strategy:
422
+ desc: ''
423
+ sort: 175
424
+ value: every_save
425
+ hub_token:
426
+ desc: ''
427
+ sort: 176
428
+ value: <HUB_TOKEN>
429
+ id2label:
430
+ desc: ''
431
+ sort: 62
432
+ value:
433
+ '0': LABEL_0
434
+ '1': LABEL_1
435
+ ignore_data_skip:
436
+ desc: ''
437
+ sort: 150
438
+ value: false
439
+ include_for_metrics:
440
+ desc: ''
441
+ sort: 182
442
+ value: []
443
+ include_inputs_for_metrics:
444
+ desc: ''
445
+ sort: 181
446
+ value: false
447
+ include_num_input_tokens_seen:
448
+ desc: ''
449
+ sort: 198
450
+ value: false
451
+ include_tokens_per_second:
452
+ desc: ''
453
+ sort: 197
454
+ value: false
455
+ initializer_range:
456
+ desc: ''
457
+ sort: 10
458
+ value: 0.02
459
+ intermediate_size:
460
+ desc: ''
461
+ sort: 5
462
+ value: 14336
463
+ is_decoder:
464
+ desc: ''
465
+ sort: 31
466
+ value: false
467
+ is_encoder_decoder:
468
+ desc: ''
469
+ sort: 30
470
+ value: false
471
+ jit_mode_eval:
472
+ desc: ''
473
+ sort: 124
474
+ value: false
475
+ label2id:
476
+ desc: ''
477
+ sort: 63
478
+ value:
479
+ LABEL_0: 0
480
+ LABEL_1: 1
481
+ label_names:
482
+ desc: ''
483
+ sort: 146
484
+ value: null
485
+ label_smoothing_factor:
486
+ desc: ''
487
+ sort: 158
488
+ value: 0.0
489
+ learning_rate:
490
+ desc: ''
491
+ sort: 92
492
+ value: 2.0e-06
493
+ length_column_name:
494
+ desc: ''
495
+ sort: 163
496
+ value: length
497
+ length_penalty:
498
+ desc: ''
499
+ sort: 47
500
+ value: 1.0
501
+ load_best_model_at_end:
502
+ desc: ''
503
+ sort: 147
504
+ value: false
505
+ local_rank:
506
+ desc: ''
507
+ sort: 133
508
+ value: 0
509
+ local_repo_path:
510
+ desc: ''
511
+ sort: 230
512
+ value: null
513
+ log_level:
514
+ desc: ''
515
+ sort: 104
516
+ value: passive
517
+ log_level_replica:
518
+ desc: ''
519
+ sort: 105
520
+ value: warning
521
+ log_on_each_node:
522
+ desc: ''
523
+ sort: 106
524
+ value: true
525
+ logging_dir:
526
+ desc: ''
527
+ sort: 107
528
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v0-20250629-093950/runs
529
+ logging_first_step:
530
+ desc: ''
531
+ sort: 109
532
+ value: true
533
+ logging_nan_inf_filter:
534
+ desc: ''
535
+ sort: 111
536
+ value: true
537
+ logging_steps:
538
+ desc: ''
539
+ sort: 110
540
+ value: 1
541
+ logging_strategy:
542
+ desc: ''
543
+ sort: 108
544
+ value: steps
545
+ lr_scheduler_kwargs:
546
+ desc: ''
547
+ sort: 101
548
+ value: null
549
+ lr_scheduler_type:
550
+ desc: ''
551
+ sort: 100
552
+ value: cosine
553
+ max_epochs:
554
+ desc: ''
555
+ sort: 215
556
+ value: null
557
+ max_grad_norm:
558
+ desc: ''
559
+ sort: 97
560
+ value: 1.0
561
+ max_length:
562
+ desc: ''
563
+ sort: 35
564
+ value: 20
565
+ max_position_embeddings:
566
+ desc: ''
567
+ sort: 3
568
+ value: 131072
569
+ max_steps:
570
+ desc: ''
571
+ sort: 99
572
+ value: -1
573
+ metric_for_best_model:
574
+ desc: ''
575
+ sort: 148
576
+ value: loss
577
+ metric_warmup_step:
578
+ desc: ''
579
+ sort: 221
580
+ value: 0
581
+ min_length:
582
+ desc: ''
583
+ sort: 36
584
+ value: 0
585
+ mlp_bias:
586
+ desc: ''
587
+ sort: 18
588
+ value: false
589
+ model_num_parameters:
590
+ desc: ''
591
+ sort: 232
592
+ value: 0
593
+ model_type:
594
+ desc: ''
595
+ sort: 76
596
+ value: llama
597
+ mp_parameters:
598
+ desc: ''
599
+ sort: 188
600
+ value: ''
601
+ neftune_noise_alpha:
602
+ desc: ''
603
+ sort: 199
604
+ value: null
605
+ no_cuda:
606
+ desc: ''
607
+ sort: 119
608
+ value: false
609
+ no_repeat_ngram_size:
610
+ desc: ''
611
+ sort: 48
612
+ value: 0
613
+ num_attention_heads:
614
+ desc: ''
615
+ sort: 7
616
+ value: 32
617
+ num_beam_groups:
618
+ desc: ''
619
+ sort: 40
620
+ value: 1
621
+ num_beams:
622
+ desc: ''
623
+ sort: 39
624
+ value: 1
625
+ num_hidden_layers:
626
+ desc: ''
627
+ sort: 6
628
+ value: 32
629
+ num_key_value_heads:
630
+ desc: ''
631
+ sort: 8
632
+ value: 8
633
+ num_return_sequences:
634
+ desc: ''
635
+ sort: 51
636
+ value: 1
637
+ num_train_epochs:
638
+ desc: ''
639
+ sort: 98
640
+ value: 5.0
641
+ optim:
642
+ desc: ''
643
+ sort: 159
644
+ value: adamw_torch
645
+ optim_args:
646
+ desc: ''
647
+ sort: 160
648
+ value: null
649
+ optim_target_modules:
650
+ desc: ''
651
+ sort: 200
652
+ value: null
653
+ optimizer:
654
+ desc: ''
655
+ sort: 218
656
+ value: null
657
+ output_attentions:
658
+ desc: ''
659
+ sort: 22
660
+ value: false
661
+ output_dir:
662
+ desc: ''
663
+ sort: 77
664
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v0-20250629-093950
665
+ output_hidden_states:
666
+ desc: ''
667
+ sort: 21
668
+ value: false
669
+ output_scores:
670
+ desc: ''
671
+ sort: 52
672
+ value: false
673
+ overwrite_output_dir:
674
+ desc: ''
675
+ sort: 78
676
+ value: false
677
+ pad_token_id:
678
+ desc: ''
679
+ sort: 67
680
+ value: 128009
681
+ past_index:
682
+ desc: ''
683
+ sort: 142
684
+ value: -1
685
+ per_device_eval_batch_size:
686
+ desc: ''
687
+ sort: 85
688
+ value: 2
689
+ per_device_train_batch_size:
690
+ desc: ''
691
+ sort: 84
692
+ value: 2
693
+ per_gpu_eval_batch_size:
694
+ desc: ''
695
+ sort: 87
696
+ value: null
697
+ per_gpu_train_batch_size:
698
+ desc: ''
699
+ sort: 86
700
+ value: null
701
+ predict_with_generate:
702
+ desc: ''
703
+ sort: 207
704
+ value: false
705
+ prediction_loss_only:
706
+ desc: ''
707
+ sort: 83
708
+ value: false
709
+ prefix:
710
+ desc: ''
711
+ sort: 65
712
+ value: null
713
+ pretraining_tp:
714
+ desc: ''
715
+ sort: 12
716
+ value: 1
717
+ problem_type:
718
+ desc: ''
719
+ sort: 72
720
+ value: null
721
+ pruned_heads:
722
+ desc: ''
723
+ sort: 27
724
+ value: {}
725
+ push_to_hub:
726
+ desc: ''
727
+ sort: 172
728
+ value: false
729
+ push_to_hub_model_id:
730
+ desc: ''
731
+ sort: 185
732
+ value: null
733
+ push_to_hub_organization:
734
+ desc: ''
735
+ sort: 186
736
+ value: null
737
+ push_to_hub_token:
738
+ desc: ''
739
+ sort: 187
740
+ value: <PUSH_TO_HUB_TOKEN>
741
+ ray_scope:
742
+ desc: ''
743
+ sort: 192
744
+ value: last
745
+ remove_invalid_values:
746
+ desc: ''
747
+ sort: 56
748
+ value: false
749
+ remove_unused_columns:
750
+ desc: ''
751
+ sort: 145
752
+ value: false
753
+ repetition_penalty:
754
+ desc: ''
755
+ sort: 46
756
+ value: 1.0
757
+ report_to:
758
+ desc: ''
759
+ sort: 164
760
+ value:
761
+ - swanlab
762
+ restore_callback_states_from_checkpoint:
763
+ desc: ''
764
+ sort: 118
765
+ value: false
766
+ resume_from_checkpoint:
767
+ desc: ''
768
+ sort: 173
769
+ value: null
770
+ return_dict:
771
+ desc: ''
772
+ sort: 20
773
+ value: true
774
+ return_dict_in_generate:
775
+ desc: ''
776
+ sort: 53
777
+ value: false
778
+ rms_norm_eps:
779
+ desc: ''
780
+ sort: 11
781
+ value: 1.0e-05
782
+ rope_scaling:
783
+ desc: ''
784
+ sort: 15
785
+ value:
786
+ factor: 8.0
787
+ high_freq_factor: 4.0
788
+ low_freq_factor: 1.0
789
+ original_max_position_embeddings: 8192
790
+ rope_type: llama3
791
+ rope_theta:
792
+ desc: ''
793
+ sort: 14
794
+ value: 500000.0
795
+ run_name:
796
+ desc: ''
797
+ sort: 143
798
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v0-20250629-093950
799
+ save_on_each_node:
800
+ desc: ''
801
+ sort: 116
802
+ value: false
803
+ save_only_model:
804
+ desc: ''
805
+ sort: 117
806
+ value: false
807
+ save_safetensors:
808
+ desc: ''
809
+ sort: 115
810
+ value: true
811
+ save_steps:
812
+ desc: ''
813
+ sort: 113
814
+ value: 500
815
+ save_strategy:
816
+ desc: ''
817
+ sort: 112
818
+ value: steps
819
+ save_total_limit:
820
+ desc: ''
821
+ sort: 114
822
+ value: 1
823
+ seed:
824
+ desc: ''
825
+ sort: 122
826
+ value: 42
827
+ sep_token_id:
828
+ desc: ''
829
+ sort: 69
830
+ value: null
831
+ skip_memory_metrics:
832
+ desc: ''
833
+ sort: 170
834
+ value: true
835
+ sortish_sampler:
836
+ desc: ''
837
+ sort: 206
838
+ value: false
839
+ suppress_tokens:
840
+ desc: ''
841
+ sort: 58
842
+ value: null
843
+ task_specific_params:
844
+ desc: ''
845
+ sort: 71
846
+ value: null
847
+ temperature:
848
+ desc: ''
849
+ sort: 42
850
+ value: 1.0
851
+ tf32:
852
+ desc: ''
853
+ sort: 132
854
+ value: null
855
+ tf_legacy_loss:
856
+ desc: ''
857
+ sort: 26
858
+ value: false
859
+ tie_encoder_decoder:
860
+ desc: ''
861
+ sort: 34
862
+ value: false
863
+ tie_word_embeddings:
864
+ desc: ''
865
+ sort: 28
866
+ value: false
867
+ tokenizer_class:
868
+ desc: ''
869
+ sort: 64
870
+ value: null
871
+ top_k:
872
+ desc: ''
873
+ sort: 43
874
+ value: 50
875
+ top_p:
876
+ desc: ''
877
+ sort: 44
878
+ value: 1.0
879
+ torch_compile:
880
+ desc: ''
881
+ sort: 194
882
+ value: false
883
+ torch_compile_backend:
884
+ desc: ''
885
+ sort: 195
886
+ value: null
887
+ torch_compile_mode:
888
+ desc: ''
889
+ sort: 196
890
+ value: null
891
+ torch_dtype:
892
+ desc: ''
893
+ sort: 24
894
+ value: bfloat16
895
+ torch_empty_cache_steps:
896
+ desc: ''
897
+ sort: 91
898
+ value: null
899
+ torchdynamo:
900
+ desc: ''
901
+ sort: 191
902
+ value: null
903
+ torchscript:
904
+ desc: ''
905
+ sort: 23
906
+ value: false
907
+ tp_size:
908
+ desc: ''
909
+ sort: 154
910
+ value: 0
911
+ tpu_metrics_debug:
912
+ desc: ''
913
+ sort: 136
914
+ value: false
915
+ tpu_num_cores:
916
+ desc: ''
917
+ sort: 135
918
+ value: null
919
+ train_dataloader_shuffle:
920
+ desc: ''
921
+ sort: 214
922
+ value: true
923
+ train_type:
924
+ desc: ''
925
+ sort: 229
926
+ value: full
927
+ transformers_version:
928
+ desc: ''
929
+ sort: 75
930
+ value: 4.51.3
931
+ typical_p:
932
+ desc: ''
933
+ sort: 45
934
+ value: 1.0
935
+ use_bfloat16:
936
+ desc: ''
937
+ sort: 25
938
+ value: false
939
+ use_cache:
940
+ desc: ''
941
+ sort: 13
942
+ value: false
943
+ use_cpu:
944
+ desc: ''
945
+ sort: 120
946
+ value: false
947
+ use_ipex:
948
+ desc: ''
949
+ sort: 125
950
+ value: false
951
+ use_legacy_prediction_loop:
952
+ desc: ''
953
+ sort: 171
954
+ value: false
955
+ use_liger_kernel:
956
+ desc: ''
957
+ sort: 203
958
+ value: false
959
+ use_logits_to_keep:
960
+ desc: ''
961
+ sort: 219
962
+ value: null
963
+ use_mps_device:
964
+ desc: ''
965
+ sort: 121
966
+ value: false
967
+ vit_gradient_checkpointing:
968
+ desc: ''
969
+ sort: 211
970
+ value: true
971
+ vit_lr:
972
+ desc: ''
973
+ sort: 217
974
+ value: null
975
+ vocab_size:
976
+ desc: ''
977
+ sort: 2
978
+ value: 128256
979
+ warmup_ratio:
980
+ desc: ''
981
+ sort: 102
982
+ value: 0.05
983
+ warmup_steps:
984
+ desc: ''
985
+ sort: 103
986
+ value: 0
987
+ weight_decay:
988
+ desc: ''
989
+ sort: 93
990
+ value: 0.0001
swanlog/run-20250629_094047-a3b1799d/files/requirements.txt ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.2.1
2
+ accelerate==1.6.0
3
+ addict==2.4.0
4
+ aiofiles==24.1.0
5
+ aiohappyeyeballs==2.6.1
6
+ aiohttp==3.11.14
7
+ aiosignal==1.3.2
8
+ airportsdata==20250224
9
+ aliyun-python-sdk-core==2.16.0
10
+ aliyun-python-sdk-kms==2.16.5
11
+ altair==5.5.0
12
+ annotated-types==0.7.0
13
+ antlr4-python3-runtime==4.13.2
14
+ anyio==4.9.0
15
+ astor==0.8.1
16
+ async-timeout==5.0.1
17
+ attrdict==2.0.1
18
+ attrs==25.3.0
19
+ av==14.3.0
20
+ beautifulsoup4==4.13.3
21
+ binpacking==1.5.2
22
+ bitsandbytes==0.45.5
23
+ blake3==1.0.4
24
+ blinker==1.9.0
25
+ boto3==1.38.46
26
+ botocore==1.38.46
27
+ cachetools==5.5.2
28
+ certifi==2025.1.31
29
+ cffi==1.17.1
30
+ charset-normalizer==3.4.1
31
+ click==8.1.8
32
+ cloudpickle==3.1.1
33
+ colorama==0.4.6
34
+ compressed-tensors==0.9.4
35
+ contourpy==1.3.1
36
+ cpm-kernels==1.0.11
37
+ crcmod==1.7
38
+ cryptography==44.0.3
39
+ cupy-cuda12x==13.4.1
40
+ cycler==0.12.1
41
+ dacite==1.9.2
42
+ dashscope==1.23.3
43
+ datasets==3.2.0
44
+ decord==0.6.0
45
+ deepspeed==0.16.5
46
+ Deprecated==1.2.18
47
+ depyf==0.18.0
48
+ dill==0.3.8
49
+ diskcache==5.6.3
50
+ distro==1.9.0
51
+ dnspython==2.7.0
52
+ docker-pycreds==0.4.0
53
+ einops==0.6.1
54
+ einops-exts==0.0.4
55
+ email_validator==2.2.0
56
+ entmax==1.3
57
+ et_xmlfile==2.0.0
58
+ exceptiongroup==1.2.2
59
+ fastapi==0.115.12
60
+ fastapi-cli==0.0.7
61
+ fastrlock==0.8.3
62
+ ffmpy==0.5.0
63
+ filelock==3.18.0
64
+ flash_attn==2.7.4.post1
65
+ fonttools==4.56.0
66
+ frozenlist==1.5.0
67
+ fsspec==2024.9.0
68
+ future==1.0.0
69
+ gdown==5.2.0
70
+ gguf==0.16.3
71
+ gitdb==4.0.12
72
+ GitPython==3.1.44
73
+ googleapis-common-protos==1.70.0
74
+ gradio==5.29.0
75
+ gradio_client==1.10.0
76
+ groovy==0.1.2
77
+ grpcio==1.71.0
78
+ h11==0.16.0
79
+ hf-xet==1.1.2
80
+ hjson==3.1.0
81
+ httpcore==1.0.9
82
+ httptools==0.6.4
83
+ httpx==0.28.1
84
+ huggingface-hub==0.32.2
85
+ idna==3.10
86
+ imageio==2.37.0
87
+ importlib_metadata==8.0.0
88
+ interegular==0.3.3
89
+ jieba==0.42.1
90
+ Jinja2==3.1.6
91
+ jiter==0.9.0
92
+ jmespath==0.10.0
93
+ joblib==1.4.2
94
+ jsonargparse==3.13.1
95
+ jsonschema==4.23.0
96
+ jsonschema-specifications==2024.10.1
97
+ kiwisolver==1.4.8
98
+ lark==1.2.2
99
+ latex2mathml==3.77.0
100
+ latex2sympy2_extended==1.10.1
101
+ lightning-utilities==0.14.3
102
+ linkify-it-py==2.0.3
103
+ llguidance==0.7.19
104
+ llvmlite==0.44.0
105
+ lm-format-enforcer==0.10.11
106
+ lxml==5.4.0
107
+ Markdown==3.7
108
+ markdown-it-py==2.2.0
109
+ markdown2==2.5.3
110
+ MarkupSafe==3.0.2
111
+ math-verify==0.7.0
112
+ matplotlib==3.10.1
113
+ mdit-py-plugins==0.3.3
114
+ mdurl==0.1.2
115
+ mistral_common==1.5.4
116
+ mmcls==0.25.0
117
+ mmcv==2.2.0
118
+ mmcv-full==1.6.2
119
+ mmengine==0.10.7
120
+ mmsegmentation==0.30.0
121
+ model-index==0.1.11
122
+ modelscope==1.25.0
123
+ mpmath==1.3.0
124
+ ms_swift==3.5.0
125
+ msgpack==1.1.0
126
+ msgspec==0.19.0
127
+ multidict==6.2.0
128
+ multiprocess==0.70.16
129
+ narwhals==1.32.0
130
+ nest-asyncio==1.6.0
131
+ networkx==3.4.2
132
+ ninja==1.11.1.4
133
+ nltk==3.9.1
134
+ numba==0.61.2
135
+ numpy==1.26.4
136
+ nvidia-cublas-cu12==12.6.4.1
137
+ nvidia-cuda-cupti-cu12==12.6.80
138
+ nvidia-cuda-nvrtc-cu12==12.6.77
139
+ nvidia-cuda-runtime-cu12==12.6.77
140
+ nvidia-cudnn-cu12==9.5.1.17
141
+ nvidia-cufft-cu12==11.3.0.4
142
+ nvidia-cufile-cu12==1.11.1.6
143
+ nvidia-curand-cu12==10.3.7.77
144
+ nvidia-cusolver-cu12==11.7.1.2
145
+ nvidia-cusparse-cu12==12.5.4.2
146
+ nvidia-cusparselt-cu12==0.6.3
147
+ nvidia-ml-py==12.575.51
148
+ nvidia-nccl-cu12==2.26.2
149
+ nvidia-nvjitlink-cu12==12.6.85
150
+ nvidia-nvtx-cu12==12.6.77
151
+ openai==1.77.0
152
+ opencv-python==4.11.0.86
153
+ opencv-python-headless==4.11.0.86
154
+ opendatalab==0.0.10
155
+ openmim==0.3.9
156
+ openpyxl==3.1.5
157
+ opentelemetry-api==1.26.0
158
+ opentelemetry-exporter-otlp==1.26.0
159
+ opentelemetry-exporter-otlp-proto-common==1.26.0
160
+ opentelemetry-exporter-otlp-proto-grpc==1.26.0
161
+ opentelemetry-exporter-otlp-proto-http==1.26.0
162
+ opentelemetry-proto==1.26.0
163
+ opentelemetry-sdk==1.26.0
164
+ opentelemetry-semantic-conventions==0.47b0
165
+ opentelemetry-semantic-conventions-ai==0.4.6
166
+ openxlab==0.0.11
167
+ ordered-set==4.1.0
168
+ orjson==3.10.16
169
+ oss2==2.19.1
170
+ outlines==0.1.11
171
+ outlines_core==0.1.26
172
+ packaging==24.2
173
+ pandas==2.2.3
174
+ partial-json-parser==0.2.1.1.post5
175
+ peft==0.15.2
176
+ pillow==11.1.0
177
+ pip==25.0
178
+ platformdirs==4.3.7
179
+ portalocker==3.1.1
180
+ prettytable==3.16.0
181
+ prometheus_client==0.21.1
182
+ prometheus-fastapi-instrumentator==7.1.0
183
+ propcache==0.3.1
184
+ protobuf==4.25.7
185
+ psutil==7.0.0
186
+ py-cpuinfo==9.0.0
187
+ pyarrow==19.0.1
188
+ pycocoevalcap==1.2
189
+ pycocotools==2.0.8
190
+ pycountry==24.6.1
191
+ pycparser==2.22
192
+ pycryptodome==3.22.0
193
+ pydantic==2.11.1
194
+ pydantic_core==2.33.0
195
+ pydeck==0.9.1
196
+ pydub==0.25.1
197
+ pyecharts==2.0.8
198
+ Pygments==2.19.1
199
+ pynvml==12.0.0
200
+ pyparsing==3.2.3
201
+ PySocks==1.7.1
202
+ python-dateutil==2.9.0.post0
203
+ python-dotenv==1.1.0
204
+ python-json-logger==3.3.0
205
+ python-multipart==0.0.20
206
+ pytorch-lightning==2.5.1.post0
207
+ pytz==2025.2
208
+ PyYAML==6.0.2
209
+ pyzmq==26.4.0
210
+ qwen-vl-utils==0.0.11
211
+ ray==2.45.0
212
+ referencing==0.36.2
213
+ regex==2024.11.6
214
+ requests==2.32.3
215
+ rich==13.9.4
216
+ rich-toolkit==0.14.5
217
+ rouge==1.0.1
218
+ rpds-py==0.24.0
219
+ ruff==0.11.8
220
+ s3transfer==0.13.0
221
+ sacrebleu==2.5.1
222
+ safehttpx==0.1.6
223
+ safetensors==0.5.3
224
+ scikit-learn==1.6.1
225
+ scipy==1.15.2
226
+ semantic-version==2.10.0
227
+ sentencepiece==0.2.0
228
+ sentry-sdk==2.27.0
229
+ setproctitle==1.3.6
230
+ setuptools==69.5.1
231
+ shellingham==1.5.4
232
+ shortuuid==1.0.13
233
+ simplejson==3.20.1
234
+ six==1.17.0
235
+ smmap==5.0.2
236
+ sniffio==1.3.1
237
+ sortedcontainers==2.4.0
238
+ soupsieve==2.6
239
+ starlette==0.46.1
240
+ streamlit==1.44.0
241
+ streamlit-image-select==0.6.0
242
+ svgwrite==1.4.3
243
+ swankit==0.2.4
244
+ swanlab==0.6.4
245
+ sympy==1.14.0
246
+ tabulate==0.9.0
247
+ tenacity==9.0.0
248
+ tensorboard==2.19.0
249
+ tensorboard-data-server==0.7.2
250
+ tensorboardX==2.6.2.2
251
+ termcolor==2.5.0
252
+ threadpoolctl==3.6.0
253
+ tiktoken==0.9.0
254
+ timm==0.9.12
255
+ tokenizers==0.21.1
256
+ toml==0.10.2
257
+ tomli==2.2.1
258
+ tomlkit==0.13.2
259
+ torch==2.7.0
260
+ torchaudio==2.7.0
261
+ torchmetrics==0.10.3
262
+ torchvision==0.22.0
263
+ tornado==6.4.2
264
+ tqdm==4.67.1
265
+ transformers==4.51.3
266
+ transformers-stream-generator==0.0.5
267
+ triton==3.3.0
268
+ trl==0.17.0
269
+ typer==0.15.3
270
+ typing_extensions==4.13.0
271
+ typing-inspection==0.4.0
272
+ tzdata==2025.2
273
+ uc-micro-py==1.0.3
274
+ unbabel-comet==2.2.6
275
+ urllib3==2.3.0
276
+ uvicorn==0.34.0
277
+ uvloop==0.21.0
278
+ vllm==0.9.0
279
+ wandb==0.20.1
280
+ watchdog==6.0.0
281
+ watchfiles==1.0.5
282
+ wavedrom==2.0.3.post3
283
+ wcwidth==0.2.13
284
+ websocket-client==1.8.0
285
+ websockets==15.0.1
286
+ Werkzeug==3.1.3
287
+ wheel==0.45.1
288
+ wrapt==1.17.2
289
+ xformers==0.0.30
290
+ xgrammar==0.1.19
291
+ xxhash==3.5.0
292
+ yacs==0.1.8
293
+ yapf==0.40.1
294
+ yarl==1.18.3
295
+ zipp==3.21.0
296
+ zstandard==0.23.0
swanlog/run-20250629_094047-a3b1799d/files/swanlab-metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 1533419, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset data/train_data/normal_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 2e-6 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 1024 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_normal_v3 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_094047-a3b1799d"}}
swanlog/run-20250629_094305-a3b1799d/backup.swanlab ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69ef5ae04024278da899aa09fd26a6abdb43a69a6c81ca1154dcebf9606e9f74
3
+ size 810013
swanlog/run-20250629_094305-a3b1799d/files/config.yaml ADDED
@@ -0,0 +1,990 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FRAMEWORK:
2
+ desc: ''
3
+ sort: 1
4
+ value: 🤗transformers
5
+ UPPERFRAME:
6
+ desc: ''
7
+ sort: 0
8
+ value: 🐦‍⬛ms-swift
9
+ _attn_implementation_autoset:
10
+ desc: ''
11
+ sort: 74
12
+ value: true
13
+ _name_or_path:
14
+ desc: ''
15
+ sort: 73
16
+ value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct
17
+ acc_steps:
18
+ desc: ''
19
+ sort: 223
20
+ value: 1
21
+ acc_strategy:
22
+ desc: ''
23
+ sort: 213
24
+ value: token
25
+ accelerator_config:
26
+ desc: ''
27
+ sort: 156
28
+ value:
29
+ dispatch_batches: false
30
+ even_batches: true
31
+ gradient_accumulation_kwargs: null
32
+ non_blocking: false
33
+ split_batches: false
34
+ use_seedable_sampler: true
35
+ adafactor:
36
+ desc: ''
37
+ sort: 161
38
+ value: false
39
+ adam_beta1:
40
+ desc: ''
41
+ sort: 94
42
+ value: 0.9
43
+ adam_beta2:
44
+ desc: ''
45
+ sort: 95
46
+ value: 0.95
47
+ adam_epsilon:
48
+ desc: ''
49
+ sort: 96
50
+ value: 1.0e-08
51
+ add_cross_attention:
52
+ desc: ''
53
+ sort: 33
54
+ value: false
55
+ aligner_lr:
56
+ desc: ''
57
+ sort: 216
58
+ value: null
59
+ architectures:
60
+ desc: ''
61
+ sort: 60
62
+ value:
63
+ - LlamaForCausalLM
64
+ attention_bias:
65
+ desc: ''
66
+ sort: 16
67
+ value: false
68
+ attention_dropout:
69
+ desc: ''
70
+ sort: 17
71
+ value: 0.0
72
+ auto_find_batch_size:
73
+ desc: ''
74
+ sort: 189
75
+ value: false
76
+ average_tokens_across_devices:
77
+ desc: ''
78
+ sort: 205
79
+ value: false
80
+ bad_words_ids:
81
+ desc: ''
82
+ sort: 50
83
+ value: null
84
+ batch_eval_metrics:
85
+ desc: ''
86
+ sort: 201
87
+ value: false
88
+ begin_suppress_tokens:
89
+ desc: ''
90
+ sort: 59
91
+ value: null
92
+ bf16:
93
+ desc: ''
94
+ sort: 126
95
+ value: true
96
+ bf16_full_eval:
97
+ desc: ''
98
+ sort: 130
99
+ value: false
100
+ bos_token_id:
101
+ desc: ''
102
+ sort: 66
103
+ value: 128000
104
+ channels:
105
+ desc: ''
106
+ sort: 220
107
+ value: null
108
+ check_model:
109
+ desc: ''
110
+ sort: 212
111
+ value: true
112
+ chunk_size_feed_forward:
113
+ desc: ''
114
+ sort: 29
115
+ value: 0
116
+ cross_attention_hidden_size:
117
+ desc: ''
118
+ sort: 32
119
+ value: null
120
+ data_seed:
121
+ desc: ''
122
+ sort: 123
123
+ value: 42
124
+ dataloader_drop_last:
125
+ desc: ''
126
+ sort: 138
127
+ value: false
128
+ dataloader_num_workers:
129
+ desc: ''
130
+ sort: 140
131
+ value: 4
132
+ dataloader_persistent_workers:
133
+ desc: ''
134
+ sort: 169
135
+ value: false
136
+ dataloader_pin_memory:
137
+ desc: ''
138
+ sort: 168
139
+ value: true
140
+ dataloader_prefetch_factor:
141
+ desc: ''
142
+ sort: 141
143
+ value: 10
144
+ ddp_backend:
145
+ desc: ''
146
+ sort: 134
147
+ value: null
148
+ ddp_broadcast_buffers:
149
+ desc: ''
150
+ sort: 167
151
+ value: null
152
+ ddp_bucket_cap_mb:
153
+ desc: ''
154
+ sort: 166
155
+ value: null
156
+ ddp_find_unused_parameters:
157
+ desc: ''
158
+ sort: 165
159
+ value: null
160
+ ddp_timeout:
161
+ desc: ''
162
+ sort: 193
163
+ value: 18000000
164
+ debug:
165
+ desc: ''
166
+ sort: 137
167
+ value: []
168
+ decoder_start_token_id:
169
+ desc: ''
170
+ sort: 70
171
+ value: null
172
+ deepspeed:
173
+ desc: ''
174
+ sort: 157
175
+ value:
176
+ bf16:
177
+ enabled: auto
178
+ fp16:
179
+ enabled: auto
180
+ hysteresis: 2
181
+ initial_scale_power: 16
182
+ loss_scale: 0
183
+ loss_scale_window: 1000
184
+ min_loss_scale: 1
185
+ gradient_accumulation_steps: auto
186
+ gradient_clipping: auto
187
+ steps_per_print: 2000
188
+ train_batch_size: auto
189
+ train_micro_batch_size_per_gpu: auto
190
+ wall_clock_breakdown: false
191
+ zero_optimization:
192
+ contiguous_gradients: true
193
+ offload_optimizer:
194
+ device: none
195
+ pin_memory: true
196
+ offload_param:
197
+ device: none
198
+ pin_memory: true
199
+ overlap_comm: false
200
+ reduce_bucket_size: auto
201
+ stage: 3
202
+ stage3_gather_16bit_weights_on_model_save: true
203
+ stage3_max_live_parameters: 1000000000.0
204
+ stage3_max_reuse_distance: 1000000000.0
205
+ stage3_param_persistence_threshold: auto
206
+ stage3_prefetch_bucket_size: auto
207
+ sub_group_size: 1000000000.0
208
+ zero_quantized_gradients: false
209
+ zero_quantized_weights: false
210
+ disable_tqdm:
211
+ desc: ''
212
+ sort: 144
213
+ value: false
214
+ diversity_penalty:
215
+ desc: ''
216
+ sort: 41
217
+ value: 0.0
218
+ do_eval:
219
+ desc: ''
220
+ sort: 80
221
+ value: true
222
+ do_predict:
223
+ desc: ''
224
+ sort: 81
225
+ value: false
226
+ do_sample:
227
+ desc: ''
228
+ sort: 37
229
+ value: false
230
+ do_train:
231
+ desc: ''
232
+ sort: 79
233
+ value: false
234
+ early_stopping:
235
+ desc: ''
236
+ sort: 38
237
+ value: false
238
+ encoder_no_repeat_ngram_size:
239
+ desc: ''
240
+ sort: 49
241
+ value: 0
242
+ eos_token_id:
243
+ desc: ''
244
+ sort: 68
245
+ value:
246
+ - 128001
247
+ - 128008
248
+ - 128009
249
+ eval_accumulation_steps:
250
+ desc: ''
251
+ sort: 89
252
+ value: null
253
+ eval_datasets:
254
+ desc: ''
255
+ sort: 225
256
+ value: []
257
+ eval_datasets_args:
258
+ desc: ''
259
+ sort: 227
260
+ value: null
261
+ eval_delay:
262
+ desc: ''
263
+ sort: 90
264
+ value: 0
265
+ eval_do_concat_batches:
266
+ desc: ''
267
+ sort: 183
268
+ value: true
269
+ eval_generation_config:
270
+ desc: ''
271
+ sort: 228
272
+ value: null
273
+ eval_limit:
274
+ desc: ''
275
+ sort: 226
276
+ value: null
277
+ eval_on_start:
278
+ desc: ''
279
+ sort: 202
280
+ value: false
281
+ eval_steps:
282
+ desc: ''
283
+ sort: 139
284
+ value: null
285
+ eval_strategy:
286
+ desc: ''
287
+ sort: 82
288
+ value: epoch
289
+ eval_use_evalscope:
290
+ desc: ''
291
+ sort: 224
292
+ value: false
293
+ eval_use_gather_object:
294
+ desc: ''
295
+ sort: 204
296
+ value: false
297
+ exponential_decay_length_penalty:
298
+ desc: ''
299
+ sort: 57
300
+ value: null
301
+ finetuning_task:
302
+ desc: ''
303
+ sort: 61
304
+ value: null
305
+ forced_bos_token_id:
306
+ desc: ''
307
+ sort: 54
308
+ value: null
309
+ forced_eos_token_id:
310
+ desc: ''
311
+ sort: 55
312
+ value: null
313
+ fp16:
314
+ desc: ''
315
+ sort: 127
316
+ value: false
317
+ fp16_backend:
318
+ desc: ''
319
+ sort: 184
320
+ value: auto
321
+ fp16_full_eval:
322
+ desc: ''
323
+ sort: 131
324
+ value: false
325
+ fp16_opt_level:
326
+ desc: ''
327
+ sort: 128
328
+ value: O1
329
+ fsdp:
330
+ desc: ''
331
+ sort: 151
332
+ value: []
333
+ fsdp_config:
334
+ desc: ''
335
+ sort: 153
336
+ value:
337
+ min_num_params: 0
338
+ xla: false
339
+ xla_fsdp_grad_ckpt: false
340
+ xla_fsdp_v2: false
341
+ fsdp_min_num_params:
342
+ desc: ''
343
+ sort: 152
344
+ value: 0
345
+ fsdp_num:
346
+ desc: ''
347
+ sort: 222
348
+ value: 1
349
+ fsdp_transformer_layer_cls_to_wrap:
350
+ desc: ''
351
+ sort: 155
352
+ value: null
353
+ full_determinism:
354
+ desc: ''
355
+ sort: 190
356
+ value: false
357
+ galore_config:
358
+ desc: ''
359
+ sort: 231
360
+ value: null
361
+ generation_config:
362
+ desc: ''
363
+ sort: 210
364
+ value: null
365
+ generation_max_length:
366
+ desc: ''
367
+ sort: 208
368
+ value: null
369
+ generation_num_beams:
370
+ desc: ''
371
+ sort: 209
372
+ value: null
373
+ gradient_accumulation_steps:
374
+ desc: ''
375
+ sort: 88
376
+ value: 2
377
+ gradient_checkpointing:
378
+ desc: ''
379
+ sort: 179
380
+ value: false
381
+ gradient_checkpointing_kwargs:
382
+ desc: ''
383
+ sort: 180
384
+ value: null
385
+ greater_is_better:
386
+ desc: ''
387
+ sort: 149
388
+ value: false
389
+ group_by_length:
390
+ desc: ''
391
+ sort: 162
392
+ value: false
393
+ half_precision_backend:
394
+ desc: ''
395
+ sort: 129
396
+ value: auto
397
+ head_dim:
398
+ desc: ''
399
+ sort: 19
400
+ value: 128
401
+ hidden_act:
402
+ desc: ''
403
+ sort: 9
404
+ value: silu
405
+ hidden_size:
406
+ desc: ''
407
+ sort: 4
408
+ value: 4096
409
+ hub_always_push:
410
+ desc: ''
411
+ sort: 178
412
+ value: false
413
+ hub_model_id:
414
+ desc: ''
415
+ sort: 174
416
+ value: null
417
+ hub_private_repo:
418
+ desc: ''
419
+ sort: 177
420
+ value: null
421
+ hub_strategy:
422
+ desc: ''
423
+ sort: 175
424
+ value: every_save
425
+ hub_token:
426
+ desc: ''
427
+ sort: 176
428
+ value: <HUB_TOKEN>
429
+ id2label:
430
+ desc: ''
431
+ sort: 62
432
+ value:
433
+ '0': LABEL_0
434
+ '1': LABEL_1
435
+ ignore_data_skip:
436
+ desc: ''
437
+ sort: 150
438
+ value: false
439
+ include_for_metrics:
440
+ desc: ''
441
+ sort: 182
442
+ value: []
443
+ include_inputs_for_metrics:
444
+ desc: ''
445
+ sort: 181
446
+ value: false
447
+ include_num_input_tokens_seen:
448
+ desc: ''
449
+ sort: 198
450
+ value: false
451
+ include_tokens_per_second:
452
+ desc: ''
453
+ sort: 197
454
+ value: false
455
+ initializer_range:
456
+ desc: ''
457
+ sort: 10
458
+ value: 0.02
459
+ intermediate_size:
460
+ desc: ''
461
+ sort: 5
462
+ value: 14336
463
+ is_decoder:
464
+ desc: ''
465
+ sort: 31
466
+ value: false
467
+ is_encoder_decoder:
468
+ desc: ''
469
+ sort: 30
470
+ value: false
471
+ jit_mode_eval:
472
+ desc: ''
473
+ sort: 124
474
+ value: false
475
+ label2id:
476
+ desc: ''
477
+ sort: 63
478
+ value:
479
+ LABEL_0: 0
480
+ LABEL_1: 1
481
+ label_names:
482
+ desc: ''
483
+ sort: 146
484
+ value: null
485
+ label_smoothing_factor:
486
+ desc: ''
487
+ sort: 158
488
+ value: 0.0
489
+ learning_rate:
490
+ desc: ''
491
+ sort: 92
492
+ value: 2.0e-06
493
+ length_column_name:
494
+ desc: ''
495
+ sort: 163
496
+ value: length
497
+ length_penalty:
498
+ desc: ''
499
+ sort: 47
500
+ value: 1.0
501
+ load_best_model_at_end:
502
+ desc: ''
503
+ sort: 147
504
+ value: false
505
+ local_rank:
506
+ desc: ''
507
+ sort: 133
508
+ value: 0
509
+ local_repo_path:
510
+ desc: ''
511
+ sort: 230
512
+ value: null
513
+ log_level:
514
+ desc: ''
515
+ sort: 104
516
+ value: passive
517
+ log_level_replica:
518
+ desc: ''
519
+ sort: 105
520
+ value: warning
521
+ log_on_each_node:
522
+ desc: ''
523
+ sort: 106
524
+ value: true
525
+ logging_dir:
526
+ desc: ''
527
+ sort: 107
528
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v1-20250629-094213/runs
529
+ logging_first_step:
530
+ desc: ''
531
+ sort: 109
532
+ value: true
533
+ logging_nan_inf_filter:
534
+ desc: ''
535
+ sort: 111
536
+ value: true
537
+ logging_steps:
538
+ desc: ''
539
+ sort: 110
540
+ value: 1
541
+ logging_strategy:
542
+ desc: ''
543
+ sort: 108
544
+ value: steps
545
+ lr_scheduler_kwargs:
546
+ desc: ''
547
+ sort: 101
548
+ value: null
549
+ lr_scheduler_type:
550
+ desc: ''
551
+ sort: 100
552
+ value: cosine
553
+ max_epochs:
554
+ desc: ''
555
+ sort: 215
556
+ value: null
557
+ max_grad_norm:
558
+ desc: ''
559
+ sort: 97
560
+ value: 1.0
561
+ max_length:
562
+ desc: ''
563
+ sort: 35
564
+ value: 20
565
+ max_position_embeddings:
566
+ desc: ''
567
+ sort: 3
568
+ value: 131072
569
+ max_steps:
570
+ desc: ''
571
+ sort: 99
572
+ value: -1
573
+ metric_for_best_model:
574
+ desc: ''
575
+ sort: 148
576
+ value: loss
577
+ metric_warmup_step:
578
+ desc: ''
579
+ sort: 221
580
+ value: 0
581
+ min_length:
582
+ desc: ''
583
+ sort: 36
584
+ value: 0
585
+ mlp_bias:
586
+ desc: ''
587
+ sort: 18
588
+ value: false
589
+ model_num_parameters:
590
+ desc: ''
591
+ sort: 232
592
+ value: 0
593
+ model_type:
594
+ desc: ''
595
+ sort: 76
596
+ value: llama
597
+ mp_parameters:
598
+ desc: ''
599
+ sort: 188
600
+ value: ''
601
+ neftune_noise_alpha:
602
+ desc: ''
603
+ sort: 199
604
+ value: null
605
+ no_cuda:
606
+ desc: ''
607
+ sort: 119
608
+ value: false
609
+ no_repeat_ngram_size:
610
+ desc: ''
611
+ sort: 48
612
+ value: 0
613
+ num_attention_heads:
614
+ desc: ''
615
+ sort: 7
616
+ value: 32
617
+ num_beam_groups:
618
+ desc: ''
619
+ sort: 40
620
+ value: 1
621
+ num_beams:
622
+ desc: ''
623
+ sort: 39
624
+ value: 1
625
+ num_hidden_layers:
626
+ desc: ''
627
+ sort: 6
628
+ value: 32
629
+ num_key_value_heads:
630
+ desc: ''
631
+ sort: 8
632
+ value: 8
633
+ num_return_sequences:
634
+ desc: ''
635
+ sort: 51
636
+ value: 1
637
+ num_train_epochs:
638
+ desc: ''
639
+ sort: 98
640
+ value: 5.0
641
+ optim:
642
+ desc: ''
643
+ sort: 159
644
+ value: adamw_torch
645
+ optim_args:
646
+ desc: ''
647
+ sort: 160
648
+ value: null
649
+ optim_target_modules:
650
+ desc: ''
651
+ sort: 200
652
+ value: null
653
+ optimizer:
654
+ desc: ''
655
+ sort: 218
656
+ value: null
657
+ output_attentions:
658
+ desc: ''
659
+ sort: 22
660
+ value: false
661
+ output_dir:
662
+ desc: ''
663
+ sort: 77
664
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v1-20250629-094213
665
+ output_hidden_states:
666
+ desc: ''
667
+ sort: 21
668
+ value: false
669
+ output_scores:
670
+ desc: ''
671
+ sort: 52
672
+ value: false
673
+ overwrite_output_dir:
674
+ desc: ''
675
+ sort: 78
676
+ value: false
677
+ pad_token_id:
678
+ desc: ''
679
+ sort: 67
680
+ value: 128009
681
+ past_index:
682
+ desc: ''
683
+ sort: 142
684
+ value: -1
685
+ per_device_eval_batch_size:
686
+ desc: ''
687
+ sort: 85
688
+ value: 2
689
+ per_device_train_batch_size:
690
+ desc: ''
691
+ sort: 84
692
+ value: 2
693
+ per_gpu_eval_batch_size:
694
+ desc: ''
695
+ sort: 87
696
+ value: null
697
+ per_gpu_train_batch_size:
698
+ desc: ''
699
+ sort: 86
700
+ value: null
701
+ predict_with_generate:
702
+ desc: ''
703
+ sort: 207
704
+ value: false
705
+ prediction_loss_only:
706
+ desc: ''
707
+ sort: 83
708
+ value: false
709
+ prefix:
710
+ desc: ''
711
+ sort: 65
712
+ value: null
713
+ pretraining_tp:
714
+ desc: ''
715
+ sort: 12
716
+ value: 1
717
+ problem_type:
718
+ desc: ''
719
+ sort: 72
720
+ value: null
721
+ pruned_heads:
722
+ desc: ''
723
+ sort: 27
724
+ value: {}
725
+ push_to_hub:
726
+ desc: ''
727
+ sort: 172
728
+ value: false
729
+ push_to_hub_model_id:
730
+ desc: ''
731
+ sort: 185
732
+ value: null
733
+ push_to_hub_organization:
734
+ desc: ''
735
+ sort: 186
736
+ value: null
737
+ push_to_hub_token:
738
+ desc: ''
739
+ sort: 187
740
+ value: <PUSH_TO_HUB_TOKEN>
741
+ ray_scope:
742
+ desc: ''
743
+ sort: 192
744
+ value: last
745
+ remove_invalid_values:
746
+ desc: ''
747
+ sort: 56
748
+ value: false
749
+ remove_unused_columns:
750
+ desc: ''
751
+ sort: 145
752
+ value: false
753
+ repetition_penalty:
754
+ desc: ''
755
+ sort: 46
756
+ value: 1.0
757
+ report_to:
758
+ desc: ''
759
+ sort: 164
760
+ value:
761
+ - swanlab
762
+ restore_callback_states_from_checkpoint:
763
+ desc: ''
764
+ sort: 118
765
+ value: false
766
+ resume_from_checkpoint:
767
+ desc: ''
768
+ sort: 173
769
+ value: null
770
+ return_dict:
771
+ desc: ''
772
+ sort: 20
773
+ value: true
774
+ return_dict_in_generate:
775
+ desc: ''
776
+ sort: 53
777
+ value: false
778
+ rms_norm_eps:
779
+ desc: ''
780
+ sort: 11
781
+ value: 1.0e-05
782
+ rope_scaling:
783
+ desc: ''
784
+ sort: 15
785
+ value:
786
+ factor: 8.0
787
+ high_freq_factor: 4.0
788
+ low_freq_factor: 1.0
789
+ original_max_position_embeddings: 8192
790
+ rope_type: llama3
791
+ rope_theta:
792
+ desc: ''
793
+ sort: 14
794
+ value: 500000.0
795
+ run_name:
796
+ desc: ''
797
+ sort: 143
798
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v1-20250629-094213
799
+ save_on_each_node:
800
+ desc: ''
801
+ sort: 116
802
+ value: false
803
+ save_only_model:
804
+ desc: ''
805
+ sort: 117
806
+ value: false
807
+ save_safetensors:
808
+ desc: ''
809
+ sort: 115
810
+ value: true
811
+ save_steps:
812
+ desc: ''
813
+ sort: 113
814
+ value: 500
815
+ save_strategy:
816
+ desc: ''
817
+ sort: 112
818
+ value: steps
819
+ save_total_limit:
820
+ desc: ''
821
+ sort: 114
822
+ value: 1
823
+ seed:
824
+ desc: ''
825
+ sort: 122
826
+ value: 42
827
+ sep_token_id:
828
+ desc: ''
829
+ sort: 69
830
+ value: null
831
+ skip_memory_metrics:
832
+ desc: ''
833
+ sort: 170
834
+ value: true
835
+ sortish_sampler:
836
+ desc: ''
837
+ sort: 206
838
+ value: false
839
+ suppress_tokens:
840
+ desc: ''
841
+ sort: 58
842
+ value: null
843
+ task_specific_params:
844
+ desc: ''
845
+ sort: 71
846
+ value: null
847
+ temperature:
848
+ desc: ''
849
+ sort: 42
850
+ value: 1.0
851
+ tf32:
852
+ desc: ''
853
+ sort: 132
854
+ value: null
855
+ tf_legacy_loss:
856
+ desc: ''
857
+ sort: 26
858
+ value: false
859
+ tie_encoder_decoder:
860
+ desc: ''
861
+ sort: 34
862
+ value: false
863
+ tie_word_embeddings:
864
+ desc: ''
865
+ sort: 28
866
+ value: false
867
+ tokenizer_class:
868
+ desc: ''
869
+ sort: 64
870
+ value: null
871
+ top_k:
872
+ desc: ''
873
+ sort: 43
874
+ value: 50
875
+ top_p:
876
+ desc: ''
877
+ sort: 44
878
+ value: 1.0
879
+ torch_compile:
880
+ desc: ''
881
+ sort: 194
882
+ value: false
883
+ torch_compile_backend:
884
+ desc: ''
885
+ sort: 195
886
+ value: null
887
+ torch_compile_mode:
888
+ desc: ''
889
+ sort: 196
890
+ value: null
891
+ torch_dtype:
892
+ desc: ''
893
+ sort: 24
894
+ value: bfloat16
895
+ torch_empty_cache_steps:
896
+ desc: ''
897
+ sort: 91
898
+ value: null
899
+ torchdynamo:
900
+ desc: ''
901
+ sort: 191
902
+ value: null
903
+ torchscript:
904
+ desc: ''
905
+ sort: 23
906
+ value: false
907
+ tp_size:
908
+ desc: ''
909
+ sort: 154
910
+ value: 0
911
+ tpu_metrics_debug:
912
+ desc: ''
913
+ sort: 136
914
+ value: false
915
+ tpu_num_cores:
916
+ desc: ''
917
+ sort: 135
918
+ value: null
919
+ train_dataloader_shuffle:
920
+ desc: ''
921
+ sort: 214
922
+ value: true
923
+ train_type:
924
+ desc: ''
925
+ sort: 229
926
+ value: full
927
+ transformers_version:
928
+ desc: ''
929
+ sort: 75
930
+ value: 4.51.3
931
+ typical_p:
932
+ desc: ''
933
+ sort: 45
934
+ value: 1.0
935
+ use_bfloat16:
936
+ desc: ''
937
+ sort: 25
938
+ value: false
939
+ use_cache:
940
+ desc: ''
941
+ sort: 13
942
+ value: false
943
+ use_cpu:
944
+ desc: ''
945
+ sort: 120
946
+ value: false
947
+ use_ipex:
948
+ desc: ''
949
+ sort: 125
950
+ value: false
951
+ use_legacy_prediction_loop:
952
+ desc: ''
953
+ sort: 171
954
+ value: false
955
+ use_liger_kernel:
956
+ desc: ''
957
+ sort: 203
958
+ value: false
959
+ use_logits_to_keep:
960
+ desc: ''
961
+ sort: 219
962
+ value: null
963
+ use_mps_device:
964
+ desc: ''
965
+ sort: 121
966
+ value: false
967
+ vit_gradient_checkpointing:
968
+ desc: ''
969
+ sort: 211
970
+ value: true
971
+ vit_lr:
972
+ desc: ''
973
+ sort: 217
974
+ value: null
975
+ vocab_size:
976
+ desc: ''
977
+ sort: 2
978
+ value: 128256
979
+ warmup_ratio:
980
+ desc: ''
981
+ sort: 102
982
+ value: 0.05
983
+ warmup_steps:
984
+ desc: ''
985
+ sort: 103
986
+ value: 0
987
+ weight_decay:
988
+ desc: ''
989
+ sort: 93
990
+ value: 0.0001
swanlog/run-20250629_094305-a3b1799d/files/requirements.txt ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.2.1
2
+ accelerate==1.6.0
3
+ addict==2.4.0
4
+ aiofiles==24.1.0
5
+ aiohappyeyeballs==2.6.1
6
+ aiohttp==3.11.14
7
+ aiosignal==1.3.2
8
+ airportsdata==20250224
9
+ aliyun-python-sdk-core==2.16.0
10
+ aliyun-python-sdk-kms==2.16.5
11
+ altair==5.5.0
12
+ annotated-types==0.7.0
13
+ antlr4-python3-runtime==4.13.2
14
+ anyio==4.9.0
15
+ astor==0.8.1
16
+ async-timeout==5.0.1
17
+ attrdict==2.0.1
18
+ attrs==25.3.0
19
+ av==14.3.0
20
+ beautifulsoup4==4.13.3
21
+ binpacking==1.5.2
22
+ bitsandbytes==0.45.5
23
+ blake3==1.0.4
24
+ blinker==1.9.0
25
+ boto3==1.38.46
26
+ botocore==1.38.46
27
+ cachetools==5.5.2
28
+ certifi==2025.1.31
29
+ cffi==1.17.1
30
+ charset-normalizer==3.4.1
31
+ click==8.1.8
32
+ cloudpickle==3.1.1
33
+ colorama==0.4.6
34
+ compressed-tensors==0.9.4
35
+ contourpy==1.3.1
36
+ cpm-kernels==1.0.11
37
+ crcmod==1.7
38
+ cryptography==44.0.3
39
+ cupy-cuda12x==13.4.1
40
+ cycler==0.12.1
41
+ dacite==1.9.2
42
+ dashscope==1.23.3
43
+ datasets==3.2.0
44
+ decord==0.6.0
45
+ deepspeed==0.16.5
46
+ Deprecated==1.2.18
47
+ depyf==0.18.0
48
+ dill==0.3.8
49
+ diskcache==5.6.3
50
+ distro==1.9.0
51
+ dnspython==2.7.0
52
+ docker-pycreds==0.4.0
53
+ einops==0.6.1
54
+ einops-exts==0.0.4
55
+ email_validator==2.2.0
56
+ entmax==1.3
57
+ et_xmlfile==2.0.0
58
+ exceptiongroup==1.2.2
59
+ fastapi==0.115.12
60
+ fastapi-cli==0.0.7
61
+ fastrlock==0.8.3
62
+ ffmpy==0.5.0
63
+ filelock==3.18.0
64
+ flash_attn==2.7.4.post1
65
+ fonttools==4.56.0
66
+ frozenlist==1.5.0
67
+ fsspec==2024.9.0
68
+ future==1.0.0
69
+ gdown==5.2.0
70
+ gguf==0.16.3
71
+ gitdb==4.0.12
72
+ GitPython==3.1.44
73
+ googleapis-common-protos==1.70.0
74
+ gradio==5.29.0
75
+ gradio_client==1.10.0
76
+ groovy==0.1.2
77
+ grpcio==1.71.0
78
+ h11==0.16.0
79
+ hf-xet==1.1.2
80
+ hjson==3.1.0
81
+ httpcore==1.0.9
82
+ httptools==0.6.4
83
+ httpx==0.28.1
84
+ huggingface-hub==0.32.2
85
+ idna==3.10
86
+ imageio==2.37.0
87
+ importlib_metadata==8.0.0
88
+ interegular==0.3.3
89
+ jieba==0.42.1
90
+ Jinja2==3.1.6
91
+ jiter==0.9.0
92
+ jmespath==0.10.0
93
+ joblib==1.4.2
94
+ jsonargparse==3.13.1
95
+ jsonschema==4.23.0
96
+ jsonschema-specifications==2024.10.1
97
+ kiwisolver==1.4.8
98
+ lark==1.2.2
99
+ latex2mathml==3.77.0
100
+ latex2sympy2_extended==1.10.1
101
+ lightning-utilities==0.14.3
102
+ linkify-it-py==2.0.3
103
+ llguidance==0.7.19
104
+ llvmlite==0.44.0
105
+ lm-format-enforcer==0.10.11
106
+ lxml==5.4.0
107
+ Markdown==3.7
108
+ markdown-it-py==2.2.0
109
+ markdown2==2.5.3
110
+ MarkupSafe==3.0.2
111
+ math-verify==0.7.0
112
+ matplotlib==3.10.1
113
+ mdit-py-plugins==0.3.3
114
+ mdurl==0.1.2
115
+ mistral_common==1.5.4
116
+ mmcls==0.25.0
117
+ mmcv==2.2.0
118
+ mmcv-full==1.6.2
119
+ mmengine==0.10.7
120
+ mmsegmentation==0.30.0
121
+ model-index==0.1.11
122
+ modelscope==1.25.0
123
+ mpmath==1.3.0
124
+ ms_swift==3.5.0
125
+ msgpack==1.1.0
126
+ msgspec==0.19.0
127
+ multidict==6.2.0
128
+ multiprocess==0.70.16
129
+ narwhals==1.32.0
130
+ nest-asyncio==1.6.0
131
+ networkx==3.4.2
132
+ ninja==1.11.1.4
133
+ nltk==3.9.1
134
+ numba==0.61.2
135
+ numpy==1.26.4
136
+ nvidia-cublas-cu12==12.6.4.1
137
+ nvidia-cuda-cupti-cu12==12.6.80
138
+ nvidia-cuda-nvrtc-cu12==12.6.77
139
+ nvidia-cuda-runtime-cu12==12.6.77
140
+ nvidia-cudnn-cu12==9.5.1.17
141
+ nvidia-cufft-cu12==11.3.0.4
142
+ nvidia-cufile-cu12==1.11.1.6
143
+ nvidia-curand-cu12==10.3.7.77
144
+ nvidia-cusolver-cu12==11.7.1.2
145
+ nvidia-cusparse-cu12==12.5.4.2
146
+ nvidia-cusparselt-cu12==0.6.3
147
+ nvidia-ml-py==12.575.51
148
+ nvidia-nccl-cu12==2.26.2
149
+ nvidia-nvjitlink-cu12==12.6.85
150
+ nvidia-nvtx-cu12==12.6.77
151
+ openai==1.77.0
152
+ opencv-python==4.11.0.86
153
+ opencv-python-headless==4.11.0.86
154
+ opendatalab==0.0.10
155
+ openmim==0.3.9
156
+ openpyxl==3.1.5
157
+ opentelemetry-api==1.26.0
158
+ opentelemetry-exporter-otlp==1.26.0
159
+ opentelemetry-exporter-otlp-proto-common==1.26.0
160
+ opentelemetry-exporter-otlp-proto-grpc==1.26.0
161
+ opentelemetry-exporter-otlp-proto-http==1.26.0
162
+ opentelemetry-proto==1.26.0
163
+ opentelemetry-sdk==1.26.0
164
+ opentelemetry-semantic-conventions==0.47b0
165
+ opentelemetry-semantic-conventions-ai==0.4.6
166
+ openxlab==0.0.11
167
+ ordered-set==4.1.0
168
+ orjson==3.10.16
169
+ oss2==2.19.1
170
+ outlines==0.1.11
171
+ outlines_core==0.1.26
172
+ packaging==24.2
173
+ pandas==2.2.3
174
+ partial-json-parser==0.2.1.1.post5
175
+ peft==0.15.2
176
+ pillow==11.1.0
177
+ pip==25.0
178
+ platformdirs==4.3.7
179
+ portalocker==3.1.1
180
+ prettytable==3.16.0
181
+ prometheus_client==0.21.1
182
+ prometheus-fastapi-instrumentator==7.1.0
183
+ propcache==0.3.1
184
+ protobuf==4.25.7
185
+ psutil==7.0.0
186
+ py-cpuinfo==9.0.0
187
+ pyarrow==19.0.1
188
+ pycocoevalcap==1.2
189
+ pycocotools==2.0.8
190
+ pycountry==24.6.1
191
+ pycparser==2.22
192
+ pycryptodome==3.22.0
193
+ pydantic==2.11.1
194
+ pydantic_core==2.33.0
195
+ pydeck==0.9.1
196
+ pydub==0.25.1
197
+ pyecharts==2.0.8
198
+ Pygments==2.19.1
199
+ pynvml==12.0.0
200
+ pyparsing==3.2.3
201
+ PySocks==1.7.1
202
+ python-dateutil==2.9.0.post0
203
+ python-dotenv==1.1.0
204
+ python-json-logger==3.3.0
205
+ python-multipart==0.0.20
206
+ pytorch-lightning==2.5.1.post0
207
+ pytz==2025.2
208
+ PyYAML==6.0.2
209
+ pyzmq==26.4.0
210
+ qwen-vl-utils==0.0.11
211
+ ray==2.45.0
212
+ referencing==0.36.2
213
+ regex==2024.11.6
214
+ requests==2.32.3
215
+ rich==13.9.4
216
+ rich-toolkit==0.14.5
217
+ rouge==1.0.1
218
+ rpds-py==0.24.0
219
+ ruff==0.11.8
220
+ s3transfer==0.13.0
221
+ sacrebleu==2.5.1
222
+ safehttpx==0.1.6
223
+ safetensors==0.5.3
224
+ scikit-learn==1.6.1
225
+ scipy==1.15.2
226
+ semantic-version==2.10.0
227
+ sentencepiece==0.2.0
228
+ sentry-sdk==2.27.0
229
+ setproctitle==1.3.6
230
+ setuptools==69.5.1
231
+ shellingham==1.5.4
232
+ shortuuid==1.0.13
233
+ simplejson==3.20.1
234
+ six==1.17.0
235
+ smmap==5.0.2
236
+ sniffio==1.3.1
237
+ sortedcontainers==2.4.0
238
+ soupsieve==2.6
239
+ starlette==0.46.1
240
+ streamlit==1.44.0
241
+ streamlit-image-select==0.6.0
242
+ svgwrite==1.4.3
243
+ swankit==0.2.4
244
+ swanlab==0.6.4
245
+ sympy==1.14.0
246
+ tabulate==0.9.0
247
+ tenacity==9.0.0
248
+ tensorboard==2.19.0
249
+ tensorboard-data-server==0.7.2
250
+ tensorboardX==2.6.2.2
251
+ termcolor==2.5.0
252
+ threadpoolctl==3.6.0
253
+ tiktoken==0.9.0
254
+ timm==0.9.12
255
+ tokenizers==0.21.1
256
+ toml==0.10.2
257
+ tomli==2.2.1
258
+ tomlkit==0.13.2
259
+ torch==2.7.0
260
+ torchaudio==2.7.0
261
+ torchmetrics==0.10.3
262
+ torchvision==0.22.0
263
+ tornado==6.4.2
264
+ tqdm==4.67.1
265
+ transformers==4.51.3
266
+ transformers-stream-generator==0.0.5
267
+ triton==3.3.0
268
+ trl==0.17.0
269
+ typer==0.15.3
270
+ typing_extensions==4.13.0
271
+ typing-inspection==0.4.0
272
+ tzdata==2025.2
273
+ uc-micro-py==1.0.3
274
+ unbabel-comet==2.2.6
275
+ urllib3==2.3.0
276
+ uvicorn==0.34.0
277
+ uvloop==0.21.0
278
+ vllm==0.9.0
279
+ wandb==0.20.1
280
+ watchdog==6.0.0
281
+ watchfiles==1.0.5
282
+ wavedrom==2.0.3.post3
283
+ wcwidth==0.2.13
284
+ websocket-client==1.8.0
285
+ websockets==15.0.1
286
+ Werkzeug==3.1.3
287
+ wheel==0.45.1
288
+ wrapt==1.17.2
289
+ xformers==0.0.30
290
+ xgrammar==0.1.19
291
+ xxhash==3.5.0
292
+ yacs==0.1.8
293
+ yapf==0.40.1
294
+ yarl==1.18.3
295
+ zipp==3.21.0
296
+ zstandard==0.23.0
swanlog/run-20250629_094305-a3b1799d/files/swanlab-metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 1699831, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset data/train_data/normal_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 2e-6 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 1024 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_normal_v3 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_094305-a3b1799d"}}
swanlog/run-20250629_101310-a3b1799d/backup.swanlab ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66ec9cb760ff27e221828644ef3beabe41bbc9061fbd4255657361012062c2d1
3
+ size 825310
swanlog/run-20250629_101310-a3b1799d/files/config.yaml ADDED
@@ -0,0 +1,990 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FRAMEWORK:
2
+ desc: ''
3
+ sort: 1
4
+ value: 🤗transformers
5
+ UPPERFRAME:
6
+ desc: ''
7
+ sort: 0
8
+ value: 🐦‍⬛ms-swift
9
+ _attn_implementation_autoset:
10
+ desc: ''
11
+ sort: 74
12
+ value: true
13
+ _name_or_path:
14
+ desc: ''
15
+ sort: 73
16
+ value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct
17
+ acc_steps:
18
+ desc: ''
19
+ sort: 223
20
+ value: 1
21
+ acc_strategy:
22
+ desc: ''
23
+ sort: 213
24
+ value: token
25
+ accelerator_config:
26
+ desc: ''
27
+ sort: 156
28
+ value:
29
+ dispatch_batches: false
30
+ even_batches: true
31
+ gradient_accumulation_kwargs: null
32
+ non_blocking: false
33
+ split_batches: false
34
+ use_seedable_sampler: true
35
+ adafactor:
36
+ desc: ''
37
+ sort: 161
38
+ value: false
39
+ adam_beta1:
40
+ desc: ''
41
+ sort: 94
42
+ value: 0.9
43
+ adam_beta2:
44
+ desc: ''
45
+ sort: 95
46
+ value: 0.95
47
+ adam_epsilon:
48
+ desc: ''
49
+ sort: 96
50
+ value: 1.0e-08
51
+ add_cross_attention:
52
+ desc: ''
53
+ sort: 33
54
+ value: false
55
+ aligner_lr:
56
+ desc: ''
57
+ sort: 216
58
+ value: null
59
+ architectures:
60
+ desc: ''
61
+ sort: 60
62
+ value:
63
+ - LlamaForCausalLM
64
+ attention_bias:
65
+ desc: ''
66
+ sort: 16
67
+ value: false
68
+ attention_dropout:
69
+ desc: ''
70
+ sort: 17
71
+ value: 0.0
72
+ auto_find_batch_size:
73
+ desc: ''
74
+ sort: 189
75
+ value: false
76
+ average_tokens_across_devices:
77
+ desc: ''
78
+ sort: 205
79
+ value: false
80
+ bad_words_ids:
81
+ desc: ''
82
+ sort: 50
83
+ value: null
84
+ batch_eval_metrics:
85
+ desc: ''
86
+ sort: 201
87
+ value: false
88
+ begin_suppress_tokens:
89
+ desc: ''
90
+ sort: 59
91
+ value: null
92
+ bf16:
93
+ desc: ''
94
+ sort: 126
95
+ value: true
96
+ bf16_full_eval:
97
+ desc: ''
98
+ sort: 130
99
+ value: false
100
+ bos_token_id:
101
+ desc: ''
102
+ sort: 66
103
+ value: 128000
104
+ channels:
105
+ desc: ''
106
+ sort: 220
107
+ value: null
108
+ check_model:
109
+ desc: ''
110
+ sort: 212
111
+ value: true
112
+ chunk_size_feed_forward:
113
+ desc: ''
114
+ sort: 29
115
+ value: 0
116
+ cross_attention_hidden_size:
117
+ desc: ''
118
+ sort: 32
119
+ value: null
120
+ data_seed:
121
+ desc: ''
122
+ sort: 123
123
+ value: 42
124
+ dataloader_drop_last:
125
+ desc: ''
126
+ sort: 138
127
+ value: false
128
+ dataloader_num_workers:
129
+ desc: ''
130
+ sort: 140
131
+ value: 4
132
+ dataloader_persistent_workers:
133
+ desc: ''
134
+ sort: 169
135
+ value: false
136
+ dataloader_pin_memory:
137
+ desc: ''
138
+ sort: 168
139
+ value: true
140
+ dataloader_prefetch_factor:
141
+ desc: ''
142
+ sort: 141
143
+ value: 10
144
+ ddp_backend:
145
+ desc: ''
146
+ sort: 134
147
+ value: null
148
+ ddp_broadcast_buffers:
149
+ desc: ''
150
+ sort: 167
151
+ value: null
152
+ ddp_bucket_cap_mb:
153
+ desc: ''
154
+ sort: 166
155
+ value: null
156
+ ddp_find_unused_parameters:
157
+ desc: ''
158
+ sort: 165
159
+ value: null
160
+ ddp_timeout:
161
+ desc: ''
162
+ sort: 193
163
+ value: 18000000
164
+ debug:
165
+ desc: ''
166
+ sort: 137
167
+ value: []
168
+ decoder_start_token_id:
169
+ desc: ''
170
+ sort: 70
171
+ value: null
172
+ deepspeed:
173
+ desc: ''
174
+ sort: 157
175
+ value:
176
+ bf16:
177
+ enabled: auto
178
+ fp16:
179
+ enabled: auto
180
+ hysteresis: 2
181
+ initial_scale_power: 16
182
+ loss_scale: 0
183
+ loss_scale_window: 1000
184
+ min_loss_scale: 1
185
+ gradient_accumulation_steps: auto
186
+ gradient_clipping: auto
187
+ steps_per_print: 2000
188
+ train_batch_size: auto
189
+ train_micro_batch_size_per_gpu: auto
190
+ wall_clock_breakdown: false
191
+ zero_optimization:
192
+ contiguous_gradients: true
193
+ offload_optimizer:
194
+ device: none
195
+ pin_memory: true
196
+ offload_param:
197
+ device: none
198
+ pin_memory: true
199
+ overlap_comm: false
200
+ reduce_bucket_size: auto
201
+ stage: 3
202
+ stage3_gather_16bit_weights_on_model_save: true
203
+ stage3_max_live_parameters: 1000000000.0
204
+ stage3_max_reuse_distance: 1000000000.0
205
+ stage3_param_persistence_threshold: auto
206
+ stage3_prefetch_bucket_size: auto
207
+ sub_group_size: 1000000000.0
208
+ zero_quantized_gradients: false
209
+ zero_quantized_weights: false
210
+ disable_tqdm:
211
+ desc: ''
212
+ sort: 144
213
+ value: false
214
+ diversity_penalty:
215
+ desc: ''
216
+ sort: 41
217
+ value: 0.0
218
+ do_eval:
219
+ desc: ''
220
+ sort: 80
221
+ value: true
222
+ do_predict:
223
+ desc: ''
224
+ sort: 81
225
+ value: false
226
+ do_sample:
227
+ desc: ''
228
+ sort: 37
229
+ value: false
230
+ do_train:
231
+ desc: ''
232
+ sort: 79
233
+ value: false
234
+ early_stopping:
235
+ desc: ''
236
+ sort: 38
237
+ value: false
238
+ encoder_no_repeat_ngram_size:
239
+ desc: ''
240
+ sort: 49
241
+ value: 0
242
+ eos_token_id:
243
+ desc: ''
244
+ sort: 68
245
+ value:
246
+ - 128001
247
+ - 128008
248
+ - 128009
249
+ eval_accumulation_steps:
250
+ desc: ''
251
+ sort: 89
252
+ value: null
253
+ eval_datasets:
254
+ desc: ''
255
+ sort: 225
256
+ value: []
257
+ eval_datasets_args:
258
+ desc: ''
259
+ sort: 227
260
+ value: null
261
+ eval_delay:
262
+ desc: ''
263
+ sort: 90
264
+ value: 0
265
+ eval_do_concat_batches:
266
+ desc: ''
267
+ sort: 183
268
+ value: true
269
+ eval_generation_config:
270
+ desc: ''
271
+ sort: 228
272
+ value: null
273
+ eval_limit:
274
+ desc: ''
275
+ sort: 226
276
+ value: null
277
+ eval_on_start:
278
+ desc: ''
279
+ sort: 202
280
+ value: false
281
+ eval_steps:
282
+ desc: ''
283
+ sort: 139
284
+ value: null
285
+ eval_strategy:
286
+ desc: ''
287
+ sort: 82
288
+ value: epoch
289
+ eval_use_evalscope:
290
+ desc: ''
291
+ sort: 224
292
+ value: false
293
+ eval_use_gather_object:
294
+ desc: ''
295
+ sort: 204
296
+ value: false
297
+ exponential_decay_length_penalty:
298
+ desc: ''
299
+ sort: 57
300
+ value: null
301
+ finetuning_task:
302
+ desc: ''
303
+ sort: 61
304
+ value: null
305
+ forced_bos_token_id:
306
+ desc: ''
307
+ sort: 54
308
+ value: null
309
+ forced_eos_token_id:
310
+ desc: ''
311
+ sort: 55
312
+ value: null
313
+ fp16:
314
+ desc: ''
315
+ sort: 127
316
+ value: false
317
+ fp16_backend:
318
+ desc: ''
319
+ sort: 184
320
+ value: auto
321
+ fp16_full_eval:
322
+ desc: ''
323
+ sort: 131
324
+ value: false
325
+ fp16_opt_level:
326
+ desc: ''
327
+ sort: 128
328
+ value: O1
329
+ fsdp:
330
+ desc: ''
331
+ sort: 151
332
+ value: []
333
+ fsdp_config:
334
+ desc: ''
335
+ sort: 153
336
+ value:
337
+ min_num_params: 0
338
+ xla: false
339
+ xla_fsdp_grad_ckpt: false
340
+ xla_fsdp_v2: false
341
+ fsdp_min_num_params:
342
+ desc: ''
343
+ sort: 152
344
+ value: 0
345
+ fsdp_num:
346
+ desc: ''
347
+ sort: 222
348
+ value: 1
349
+ fsdp_transformer_layer_cls_to_wrap:
350
+ desc: ''
351
+ sort: 155
352
+ value: null
353
+ full_determinism:
354
+ desc: ''
355
+ sort: 190
356
+ value: false
357
+ galore_config:
358
+ desc: ''
359
+ sort: 231
360
+ value: null
361
+ generation_config:
362
+ desc: ''
363
+ sort: 210
364
+ value: null
365
+ generation_max_length:
366
+ desc: ''
367
+ sort: 208
368
+ value: null
369
+ generation_num_beams:
370
+ desc: ''
371
+ sort: 209
372
+ value: null
373
+ gradient_accumulation_steps:
374
+ desc: ''
375
+ sort: 88
376
+ value: 2
377
+ gradient_checkpointing:
378
+ desc: ''
379
+ sort: 179
380
+ value: false
381
+ gradient_checkpointing_kwargs:
382
+ desc: ''
383
+ sort: 180
384
+ value: null
385
+ greater_is_better:
386
+ desc: ''
387
+ sort: 149
388
+ value: false
389
+ group_by_length:
390
+ desc: ''
391
+ sort: 162
392
+ value: false
393
+ half_precision_backend:
394
+ desc: ''
395
+ sort: 129
396
+ value: auto
397
+ head_dim:
398
+ desc: ''
399
+ sort: 19
400
+ value: 128
401
+ hidden_act:
402
+ desc: ''
403
+ sort: 9
404
+ value: silu
405
+ hidden_size:
406
+ desc: ''
407
+ sort: 4
408
+ value: 4096
409
+ hub_always_push:
410
+ desc: ''
411
+ sort: 178
412
+ value: false
413
+ hub_model_id:
414
+ desc: ''
415
+ sort: 174
416
+ value: null
417
+ hub_private_repo:
418
+ desc: ''
419
+ sort: 177
420
+ value: null
421
+ hub_strategy:
422
+ desc: ''
423
+ sort: 175
424
+ value: every_save
425
+ hub_token:
426
+ desc: ''
427
+ sort: 176
428
+ value: <HUB_TOKEN>
429
+ id2label:
430
+ desc: ''
431
+ sort: 62
432
+ value:
433
+ '0': LABEL_0
434
+ '1': LABEL_1
435
+ ignore_data_skip:
436
+ desc: ''
437
+ sort: 150
438
+ value: false
439
+ include_for_metrics:
440
+ desc: ''
441
+ sort: 182
442
+ value: []
443
+ include_inputs_for_metrics:
444
+ desc: ''
445
+ sort: 181
446
+ value: false
447
+ include_num_input_tokens_seen:
448
+ desc: ''
449
+ sort: 198
450
+ value: false
451
+ include_tokens_per_second:
452
+ desc: ''
453
+ sort: 197
454
+ value: false
455
+ initializer_range:
456
+ desc: ''
457
+ sort: 10
458
+ value: 0.02
459
+ intermediate_size:
460
+ desc: ''
461
+ sort: 5
462
+ value: 14336
463
+ is_decoder:
464
+ desc: ''
465
+ sort: 31
466
+ value: false
467
+ is_encoder_decoder:
468
+ desc: ''
469
+ sort: 30
470
+ value: false
471
+ jit_mode_eval:
472
+ desc: ''
473
+ sort: 124
474
+ value: false
475
+ label2id:
476
+ desc: ''
477
+ sort: 63
478
+ value:
479
+ LABEL_0: 0
480
+ LABEL_1: 1
481
+ label_names:
482
+ desc: ''
483
+ sort: 146
484
+ value: null
485
+ label_smoothing_factor:
486
+ desc: ''
487
+ sort: 158
488
+ value: 0.0
489
+ learning_rate:
490
+ desc: ''
491
+ sort: 92
492
+ value: 2.0e-06
493
+ length_column_name:
494
+ desc: ''
495
+ sort: 163
496
+ value: length
497
+ length_penalty:
498
+ desc: ''
499
+ sort: 47
500
+ value: 1.0
501
+ load_best_model_at_end:
502
+ desc: ''
503
+ sort: 147
504
+ value: false
505
+ local_rank:
506
+ desc: ''
507
+ sort: 133
508
+ value: 0
509
+ local_repo_path:
510
+ desc: ''
511
+ sort: 230
512
+ value: null
513
+ log_level:
514
+ desc: ''
515
+ sort: 104
516
+ value: passive
517
+ log_level_replica:
518
+ desc: ''
519
+ sort: 105
520
+ value: warning
521
+ log_on_each_node:
522
+ desc: ''
523
+ sort: 106
524
+ value: true
525
+ logging_dir:
526
+ desc: ''
527
+ sort: 107
528
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v0-20250629-101213/runs
529
+ logging_first_step:
530
+ desc: ''
531
+ sort: 109
532
+ value: true
533
+ logging_nan_inf_filter:
534
+ desc: ''
535
+ sort: 111
536
+ value: true
537
+ logging_steps:
538
+ desc: ''
539
+ sort: 110
540
+ value: 1
541
+ logging_strategy:
542
+ desc: ''
543
+ sort: 108
544
+ value: steps
545
+ lr_scheduler_kwargs:
546
+ desc: ''
547
+ sort: 101
548
+ value: null
549
+ lr_scheduler_type:
550
+ desc: ''
551
+ sort: 100
552
+ value: cosine
553
+ max_epochs:
554
+ desc: ''
555
+ sort: 215
556
+ value: null
557
+ max_grad_norm:
558
+ desc: ''
559
+ sort: 97
560
+ value: 1.0
561
+ max_length:
562
+ desc: ''
563
+ sort: 35
564
+ value: 20
565
+ max_position_embeddings:
566
+ desc: ''
567
+ sort: 3
568
+ value: 131072
569
+ max_steps:
570
+ desc: ''
571
+ sort: 99
572
+ value: -1
573
+ metric_for_best_model:
574
+ desc: ''
575
+ sort: 148
576
+ value: loss
577
+ metric_warmup_step:
578
+ desc: ''
579
+ sort: 221
580
+ value: 0
581
+ min_length:
582
+ desc: ''
583
+ sort: 36
584
+ value: 0
585
+ mlp_bias:
586
+ desc: ''
587
+ sort: 18
588
+ value: false
589
+ model_num_parameters:
590
+ desc: ''
591
+ sort: 232
592
+ value: 0
593
+ model_type:
594
+ desc: ''
595
+ sort: 76
596
+ value: llama
597
+ mp_parameters:
598
+ desc: ''
599
+ sort: 188
600
+ value: ''
601
+ neftune_noise_alpha:
602
+ desc: ''
603
+ sort: 199
604
+ value: null
605
+ no_cuda:
606
+ desc: ''
607
+ sort: 119
608
+ value: false
609
+ no_repeat_ngram_size:
610
+ desc: ''
611
+ sort: 48
612
+ value: 0
613
+ num_attention_heads:
614
+ desc: ''
615
+ sort: 7
616
+ value: 32
617
+ num_beam_groups:
618
+ desc: ''
619
+ sort: 40
620
+ value: 1
621
+ num_beams:
622
+ desc: ''
623
+ sort: 39
624
+ value: 1
625
+ num_hidden_layers:
626
+ desc: ''
627
+ sort: 6
628
+ value: 32
629
+ num_key_value_heads:
630
+ desc: ''
631
+ sort: 8
632
+ value: 8
633
+ num_return_sequences:
634
+ desc: ''
635
+ sort: 51
636
+ value: 1
637
+ num_train_epochs:
638
+ desc: ''
639
+ sort: 98
640
+ value: 5.0
641
+ optim:
642
+ desc: ''
643
+ sort: 159
644
+ value: adamw_torch
645
+ optim_args:
646
+ desc: ''
647
+ sort: 160
648
+ value: null
649
+ optim_target_modules:
650
+ desc: ''
651
+ sort: 200
652
+ value: null
653
+ optimizer:
654
+ desc: ''
655
+ sort: 218
656
+ value: null
657
+ output_attentions:
658
+ desc: ''
659
+ sort: 22
660
+ value: false
661
+ output_dir:
662
+ desc: ''
663
+ sort: 77
664
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v0-20250629-101213
665
+ output_hidden_states:
666
+ desc: ''
667
+ sort: 21
668
+ value: false
669
+ output_scores:
670
+ desc: ''
671
+ sort: 52
672
+ value: false
673
+ overwrite_output_dir:
674
+ desc: ''
675
+ sort: 78
676
+ value: false
677
+ pad_token_id:
678
+ desc: ''
679
+ sort: 67
680
+ value: 128009
681
+ past_index:
682
+ desc: ''
683
+ sort: 142
684
+ value: -1
685
+ per_device_eval_batch_size:
686
+ desc: ''
687
+ sort: 85
688
+ value: 2
689
+ per_device_train_batch_size:
690
+ desc: ''
691
+ sort: 84
692
+ value: 2
693
+ per_gpu_eval_batch_size:
694
+ desc: ''
695
+ sort: 87
696
+ value: null
697
+ per_gpu_train_batch_size:
698
+ desc: ''
699
+ sort: 86
700
+ value: null
701
+ predict_with_generate:
702
+ desc: ''
703
+ sort: 207
704
+ value: false
705
+ prediction_loss_only:
706
+ desc: ''
707
+ sort: 83
708
+ value: false
709
+ prefix:
710
+ desc: ''
711
+ sort: 65
712
+ value: null
713
+ pretraining_tp:
714
+ desc: ''
715
+ sort: 12
716
+ value: 1
717
+ problem_type:
718
+ desc: ''
719
+ sort: 72
720
+ value: null
721
+ pruned_heads:
722
+ desc: ''
723
+ sort: 27
724
+ value: {}
725
+ push_to_hub:
726
+ desc: ''
727
+ sort: 172
728
+ value: false
729
+ push_to_hub_model_id:
730
+ desc: ''
731
+ sort: 185
732
+ value: null
733
+ push_to_hub_organization:
734
+ desc: ''
735
+ sort: 186
736
+ value: null
737
+ push_to_hub_token:
738
+ desc: ''
739
+ sort: 187
740
+ value: <PUSH_TO_HUB_TOKEN>
741
+ ray_scope:
742
+ desc: ''
743
+ sort: 192
744
+ value: last
745
+ remove_invalid_values:
746
+ desc: ''
747
+ sort: 56
748
+ value: false
749
+ remove_unused_columns:
750
+ desc: ''
751
+ sort: 145
752
+ value: false
753
+ repetition_penalty:
754
+ desc: ''
755
+ sort: 46
756
+ value: 1.0
757
+ report_to:
758
+ desc: ''
759
+ sort: 164
760
+ value:
761
+ - swanlab
762
+ restore_callback_states_from_checkpoint:
763
+ desc: ''
764
+ sort: 118
765
+ value: false
766
+ resume_from_checkpoint:
767
+ desc: ''
768
+ sort: 173
769
+ value: null
770
+ return_dict:
771
+ desc: ''
772
+ sort: 20
773
+ value: true
774
+ return_dict_in_generate:
775
+ desc: ''
776
+ sort: 53
777
+ value: false
778
+ rms_norm_eps:
779
+ desc: ''
780
+ sort: 11
781
+ value: 1.0e-05
782
+ rope_scaling:
783
+ desc: ''
784
+ sort: 15
785
+ value:
786
+ factor: 8.0
787
+ high_freq_factor: 4.0
788
+ low_freq_factor: 1.0
789
+ original_max_position_embeddings: 8192
790
+ rope_type: llama3
791
+ rope_theta:
792
+ desc: ''
793
+ sort: 14
794
+ value: 500000.0
795
+ run_name:
796
+ desc: ''
797
+ sort: 143
798
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v3/v0-20250629-101213
799
+ save_on_each_node:
800
+ desc: ''
801
+ sort: 116
802
+ value: false
803
+ save_only_model:
804
+ desc: ''
805
+ sort: 117
806
+ value: false
807
+ save_safetensors:
808
+ desc: ''
809
+ sort: 115
810
+ value: true
811
+ save_steps:
812
+ desc: ''
813
+ sort: 113
814
+ value: 500
815
+ save_strategy:
816
+ desc: ''
817
+ sort: 112
818
+ value: steps
819
+ save_total_limit:
820
+ desc: ''
821
+ sort: 114
822
+ value: 1
823
+ seed:
824
+ desc: ''
825
+ sort: 122
826
+ value: 42
827
+ sep_token_id:
828
+ desc: ''
829
+ sort: 69
830
+ value: null
831
+ skip_memory_metrics:
832
+ desc: ''
833
+ sort: 170
834
+ value: true
835
+ sortish_sampler:
836
+ desc: ''
837
+ sort: 206
838
+ value: false
839
+ suppress_tokens:
840
+ desc: ''
841
+ sort: 58
842
+ value: null
843
+ task_specific_params:
844
+ desc: ''
845
+ sort: 71
846
+ value: null
847
+ temperature:
848
+ desc: ''
849
+ sort: 42
850
+ value: 1.0
851
+ tf32:
852
+ desc: ''
853
+ sort: 132
854
+ value: null
855
+ tf_legacy_loss:
856
+ desc: ''
857
+ sort: 26
858
+ value: false
859
+ tie_encoder_decoder:
860
+ desc: ''
861
+ sort: 34
862
+ value: false
863
+ tie_word_embeddings:
864
+ desc: ''
865
+ sort: 28
866
+ value: false
867
+ tokenizer_class:
868
+ desc: ''
869
+ sort: 64
870
+ value: null
871
+ top_k:
872
+ desc: ''
873
+ sort: 43
874
+ value: 50
875
+ top_p:
876
+ desc: ''
877
+ sort: 44
878
+ value: 1.0
879
+ torch_compile:
880
+ desc: ''
881
+ sort: 194
882
+ value: false
883
+ torch_compile_backend:
884
+ desc: ''
885
+ sort: 195
886
+ value: null
887
+ torch_compile_mode:
888
+ desc: ''
889
+ sort: 196
890
+ value: null
891
+ torch_dtype:
892
+ desc: ''
893
+ sort: 24
894
+ value: bfloat16
895
+ torch_empty_cache_steps:
896
+ desc: ''
897
+ sort: 91
898
+ value: null
899
+ torchdynamo:
900
+ desc: ''
901
+ sort: 191
902
+ value: null
903
+ torchscript:
904
+ desc: ''
905
+ sort: 23
906
+ value: false
907
+ tp_size:
908
+ desc: ''
909
+ sort: 154
910
+ value: 0
911
+ tpu_metrics_debug:
912
+ desc: ''
913
+ sort: 136
914
+ value: false
915
+ tpu_num_cores:
916
+ desc: ''
917
+ sort: 135
918
+ value: null
919
+ train_dataloader_shuffle:
920
+ desc: ''
921
+ sort: 214
922
+ value: true
923
+ train_type:
924
+ desc: ''
925
+ sort: 229
926
+ value: full
927
+ transformers_version:
928
+ desc: ''
929
+ sort: 75
930
+ value: 4.51.3
931
+ typical_p:
932
+ desc: ''
933
+ sort: 45
934
+ value: 1.0
935
+ use_bfloat16:
936
+ desc: ''
937
+ sort: 25
938
+ value: false
939
+ use_cache:
940
+ desc: ''
941
+ sort: 13
942
+ value: false
943
+ use_cpu:
944
+ desc: ''
945
+ sort: 120
946
+ value: false
947
+ use_ipex:
948
+ desc: ''
949
+ sort: 125
950
+ value: false
951
+ use_legacy_prediction_loop:
952
+ desc: ''
953
+ sort: 171
954
+ value: false
955
+ use_liger_kernel:
956
+ desc: ''
957
+ sort: 203
958
+ value: false
959
+ use_logits_to_keep:
960
+ desc: ''
961
+ sort: 219
962
+ value: null
963
+ use_mps_device:
964
+ desc: ''
965
+ sort: 121
966
+ value: false
967
+ vit_gradient_checkpointing:
968
+ desc: ''
969
+ sort: 211
970
+ value: true
971
+ vit_lr:
972
+ desc: ''
973
+ sort: 217
974
+ value: null
975
+ vocab_size:
976
+ desc: ''
977
+ sort: 2
978
+ value: 128256
979
+ warmup_ratio:
980
+ desc: ''
981
+ sort: 102
982
+ value: 0.05
983
+ warmup_steps:
984
+ desc: ''
985
+ sort: 103
986
+ value: 0
987
+ weight_decay:
988
+ desc: ''
989
+ sort: 93
990
+ value: 0.0001
swanlog/run-20250629_101310-a3b1799d/files/requirements.txt ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.2.1
2
+ accelerate==1.6.0
3
+ addict==2.4.0
4
+ aiofiles==24.1.0
5
+ aiohappyeyeballs==2.6.1
6
+ aiohttp==3.11.14
7
+ aiosignal==1.3.2
8
+ airportsdata==20250224
9
+ aliyun-python-sdk-core==2.16.0
10
+ aliyun-python-sdk-kms==2.16.5
11
+ altair==5.5.0
12
+ annotated-types==0.7.0
13
+ antlr4-python3-runtime==4.13.2
14
+ anyio==4.9.0
15
+ astor==0.8.1
16
+ async-timeout==5.0.1
17
+ attrdict==2.0.1
18
+ attrs==25.3.0
19
+ av==14.3.0
20
+ beautifulsoup4==4.13.3
21
+ binpacking==1.5.2
22
+ bitsandbytes==0.45.5
23
+ blake3==1.0.4
24
+ blinker==1.9.0
25
+ boto3==1.38.46
26
+ botocore==1.38.46
27
+ cachetools==5.5.2
28
+ certifi==2025.1.31
29
+ cffi==1.17.1
30
+ charset-normalizer==3.4.1
31
+ click==8.1.8
32
+ cloudpickle==3.1.1
33
+ colorama==0.4.6
34
+ compressed-tensors==0.9.4
35
+ contourpy==1.3.1
36
+ cpm-kernels==1.0.11
37
+ crcmod==1.7
38
+ cryptography==44.0.3
39
+ cupy-cuda12x==13.4.1
40
+ cycler==0.12.1
41
+ dacite==1.9.2
42
+ dashscope==1.23.3
43
+ datasets==3.2.0
44
+ decord==0.6.0
45
+ deepspeed==0.16.5
46
+ Deprecated==1.2.18
47
+ depyf==0.18.0
48
+ dill==0.3.8
49
+ diskcache==5.6.3
50
+ distro==1.9.0
51
+ dnspython==2.7.0
52
+ docker-pycreds==0.4.0
53
+ einops==0.6.1
54
+ einops-exts==0.0.4
55
+ email_validator==2.2.0
56
+ entmax==1.3
57
+ et_xmlfile==2.0.0
58
+ exceptiongroup==1.2.2
59
+ fastapi==0.115.12
60
+ fastapi-cli==0.0.7
61
+ fastrlock==0.8.3
62
+ ffmpy==0.5.0
63
+ filelock==3.18.0
64
+ flash_attn==2.7.4.post1
65
+ fonttools==4.56.0
66
+ frozenlist==1.5.0
67
+ fsspec==2024.9.0
68
+ future==1.0.0
69
+ gdown==5.2.0
70
+ gguf==0.16.3
71
+ gitdb==4.0.12
72
+ GitPython==3.1.44
73
+ googleapis-common-protos==1.70.0
74
+ gradio==5.29.0
75
+ gradio_client==1.10.0
76
+ groovy==0.1.2
77
+ grpcio==1.71.0
78
+ h11==0.16.0
79
+ hf-xet==1.1.2
80
+ hjson==3.1.0
81
+ httpcore==1.0.9
82
+ httptools==0.6.4
83
+ httpx==0.28.1
84
+ huggingface-hub==0.32.2
85
+ idna==3.10
86
+ imageio==2.37.0
87
+ importlib_metadata==8.0.0
88
+ interegular==0.3.3
89
+ jieba==0.42.1
90
+ Jinja2==3.1.6
91
+ jiter==0.9.0
92
+ jmespath==0.10.0
93
+ joblib==1.4.2
94
+ jsonargparse==3.13.1
95
+ jsonschema==4.23.0
96
+ jsonschema-specifications==2024.10.1
97
+ kiwisolver==1.4.8
98
+ lark==1.2.2
99
+ latex2mathml==3.77.0
100
+ latex2sympy2_extended==1.10.1
101
+ lightning-utilities==0.14.3
102
+ linkify-it-py==2.0.3
103
+ llguidance==0.7.19
104
+ llvmlite==0.44.0
105
+ lm-format-enforcer==0.10.11
106
+ lxml==5.4.0
107
+ Markdown==3.7
108
+ markdown-it-py==2.2.0
109
+ markdown2==2.5.3
110
+ MarkupSafe==3.0.2
111
+ math-verify==0.7.0
112
+ matplotlib==3.10.1
113
+ mdit-py-plugins==0.3.3
114
+ mdurl==0.1.2
115
+ mistral_common==1.5.4
116
+ mmcls==0.25.0
117
+ mmcv==2.2.0
118
+ mmcv-full==1.6.2
119
+ mmengine==0.10.7
120
+ mmsegmentation==0.30.0
121
+ model-index==0.1.11
122
+ modelscope==1.25.0
123
+ mpmath==1.3.0
124
+ ms_swift==3.5.0
125
+ msgpack==1.1.0
126
+ msgspec==0.19.0
127
+ multidict==6.2.0
128
+ multiprocess==0.70.16
129
+ narwhals==1.32.0
130
+ nest-asyncio==1.6.0
131
+ networkx==3.4.2
132
+ ninja==1.11.1.4
133
+ nltk==3.9.1
134
+ numba==0.61.2
135
+ numpy==1.26.4
136
+ nvidia-cublas-cu12==12.6.4.1
137
+ nvidia-cuda-cupti-cu12==12.6.80
138
+ nvidia-cuda-nvrtc-cu12==12.6.77
139
+ nvidia-cuda-runtime-cu12==12.6.77
140
+ nvidia-cudnn-cu12==9.5.1.17
141
+ nvidia-cufft-cu12==11.3.0.4
142
+ nvidia-cufile-cu12==1.11.1.6
143
+ nvidia-curand-cu12==10.3.7.77
144
+ nvidia-cusolver-cu12==11.7.1.2
145
+ nvidia-cusparse-cu12==12.5.4.2
146
+ nvidia-cusparselt-cu12==0.6.3
147
+ nvidia-ml-py==12.575.51
148
+ nvidia-nccl-cu12==2.26.2
149
+ nvidia-nvjitlink-cu12==12.6.85
150
+ nvidia-nvtx-cu12==12.6.77
151
+ openai==1.77.0
152
+ opencv-python==4.11.0.86
153
+ opencv-python-headless==4.11.0.86
154
+ opendatalab==0.0.10
155
+ openmim==0.3.9
156
+ openpyxl==3.1.5
157
+ opentelemetry-api==1.26.0
158
+ opentelemetry-exporter-otlp==1.26.0
159
+ opentelemetry-exporter-otlp-proto-common==1.26.0
160
+ opentelemetry-exporter-otlp-proto-grpc==1.26.0
161
+ opentelemetry-exporter-otlp-proto-http==1.26.0
162
+ opentelemetry-proto==1.26.0
163
+ opentelemetry-sdk==1.26.0
164
+ opentelemetry-semantic-conventions==0.47b0
165
+ opentelemetry-semantic-conventions-ai==0.4.6
166
+ openxlab==0.0.11
167
+ ordered-set==4.1.0
168
+ orjson==3.10.16
169
+ oss2==2.19.1
170
+ outlines==0.1.11
171
+ outlines_core==0.1.26
172
+ packaging==24.2
173
+ pandas==2.2.3
174
+ partial-json-parser==0.2.1.1.post5
175
+ peft==0.15.2
176
+ pillow==11.1.0
177
+ pip==25.0
178
+ platformdirs==4.3.7
179
+ portalocker==3.1.1
180
+ prettytable==3.16.0
181
+ prometheus_client==0.21.1
182
+ prometheus-fastapi-instrumentator==7.1.0
183
+ propcache==0.3.1
184
+ protobuf==4.25.7
185
+ psutil==7.0.0
186
+ py-cpuinfo==9.0.0
187
+ pyarrow==19.0.1
188
+ pycocoevalcap==1.2
189
+ pycocotools==2.0.8
190
+ pycountry==24.6.1
191
+ pycparser==2.22
192
+ pycryptodome==3.22.0
193
+ pydantic==2.11.1
194
+ pydantic_core==2.33.0
195
+ pydeck==0.9.1
196
+ pydub==0.25.1
197
+ pyecharts==2.0.8
198
+ Pygments==2.19.1
199
+ pynvml==12.0.0
200
+ pyparsing==3.2.3
201
+ PySocks==1.7.1
202
+ python-dateutil==2.9.0.post0
203
+ python-dotenv==1.1.0
204
+ python-json-logger==3.3.0
205
+ python-multipart==0.0.20
206
+ pytorch-lightning==2.5.1.post0
207
+ pytz==2025.2
208
+ PyYAML==6.0.2
209
+ pyzmq==26.4.0
210
+ qwen-vl-utils==0.0.11
211
+ ray==2.45.0
212
+ referencing==0.36.2
213
+ regex==2024.11.6
214
+ requests==2.32.3
215
+ rich==13.9.4
216
+ rich-toolkit==0.14.5
217
+ rouge==1.0.1
218
+ rpds-py==0.24.0
219
+ ruff==0.11.8
220
+ s3transfer==0.13.0
221
+ sacrebleu==2.5.1
222
+ safehttpx==0.1.6
223
+ safetensors==0.5.3
224
+ scikit-learn==1.6.1
225
+ scipy==1.15.2
226
+ semantic-version==2.10.0
227
+ sentencepiece==0.2.0
228
+ sentry-sdk==2.27.0
229
+ setproctitle==1.3.6
230
+ setuptools==69.5.1
231
+ shellingham==1.5.4
232
+ shortuuid==1.0.13
233
+ simplejson==3.20.1
234
+ six==1.17.0
235
+ smmap==5.0.2
236
+ sniffio==1.3.1
237
+ sortedcontainers==2.4.0
238
+ soupsieve==2.6
239
+ starlette==0.46.1
240
+ streamlit==1.44.0
241
+ streamlit-image-select==0.6.0
242
+ svgwrite==1.4.3
243
+ swankit==0.2.4
244
+ swanlab==0.6.4
245
+ sympy==1.14.0
246
+ tabulate==0.9.0
247
+ tenacity==9.0.0
248
+ tensorboard==2.19.0
249
+ tensorboard-data-server==0.7.2
250
+ tensorboardX==2.6.2.2
251
+ termcolor==2.5.0
252
+ threadpoolctl==3.6.0
253
+ tiktoken==0.9.0
254
+ timm==0.9.12
255
+ tokenizers==0.21.1
256
+ toml==0.10.2
257
+ tomli==2.2.1
258
+ tomlkit==0.13.2
259
+ torch==2.7.0
260
+ torchaudio==2.7.0
261
+ torchmetrics==0.10.3
262
+ torchvision==0.22.0
263
+ tornado==6.4.2
264
+ tqdm==4.67.1
265
+ transformers==4.51.3
266
+ transformers-stream-generator==0.0.5
267
+ triton==3.3.0
268
+ trl==0.17.0
269
+ typer==0.15.3
270
+ typing_extensions==4.13.0
271
+ typing-inspection==0.4.0
272
+ tzdata==2025.2
273
+ uc-micro-py==1.0.3
274
+ unbabel-comet==2.2.6
275
+ urllib3==2.3.0
276
+ uvicorn==0.34.0
277
+ uvloop==0.21.0
278
+ vllm==0.9.0
279
+ wandb==0.20.1
280
+ watchdog==6.0.0
281
+ watchfiles==1.0.5
282
+ wavedrom==2.0.3.post3
283
+ wcwidth==0.2.13
284
+ websocket-client==1.8.0
285
+ websockets==15.0.1
286
+ Werkzeug==3.1.3
287
+ wheel==0.45.1
288
+ wrapt==1.17.2
289
+ xformers==0.0.30
290
+ xgrammar==0.1.19
291
+ xxhash==3.5.0
292
+ yacs==0.1.8
293
+ yapf==0.40.1
294
+ yarl==1.18.3
295
+ zipp==3.21.0
296
+ zstandard==0.23.0
swanlog/run-20250629_101310-a3b1799d/files/swanlab-metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 3972595, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset data/train_data/normal_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 2e-6 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 1024 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_normal_v3 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_101310-a3b1799d"}}
swanlog/run-20250629_111950-a3b1799d/backup.swanlab ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e515379d1491373474da1d875b6e6f4440a9c74ec435826e691b32a2b80ac67
3
+ size 825904
swanlog/run-20250629_111950-a3b1799d/files/config.yaml ADDED
@@ -0,0 +1,990 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FRAMEWORK:
2
+ desc: ''
3
+ sort: 1
4
+ value: 🤗transformers
5
+ UPPERFRAME:
6
+ desc: ''
7
+ sort: 0
8
+ value: 🐦‍⬛ms-swift
9
+ _attn_implementation_autoset:
10
+ desc: ''
11
+ sort: 74
12
+ value: true
13
+ _name_or_path:
14
+ desc: ''
15
+ sort: 73
16
+ value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct
17
+ acc_steps:
18
+ desc: ''
19
+ sort: 223
20
+ value: 1
21
+ acc_strategy:
22
+ desc: ''
23
+ sort: 213
24
+ value: token
25
+ accelerator_config:
26
+ desc: ''
27
+ sort: 156
28
+ value:
29
+ dispatch_batches: false
30
+ even_batches: true
31
+ gradient_accumulation_kwargs: null
32
+ non_blocking: false
33
+ split_batches: false
34
+ use_seedable_sampler: true
35
+ adafactor:
36
+ desc: ''
37
+ sort: 161
38
+ value: false
39
+ adam_beta1:
40
+ desc: ''
41
+ sort: 94
42
+ value: 0.9
43
+ adam_beta2:
44
+ desc: ''
45
+ sort: 95
46
+ value: 0.95
47
+ adam_epsilon:
48
+ desc: ''
49
+ sort: 96
50
+ value: 1.0e-08
51
+ add_cross_attention:
52
+ desc: ''
53
+ sort: 33
54
+ value: false
55
+ aligner_lr:
56
+ desc: ''
57
+ sort: 216
58
+ value: null
59
+ architectures:
60
+ desc: ''
61
+ sort: 60
62
+ value:
63
+ - LlamaForCausalLM
64
+ attention_bias:
65
+ desc: ''
66
+ sort: 16
67
+ value: false
68
+ attention_dropout:
69
+ desc: ''
70
+ sort: 17
71
+ value: 0.0
72
+ auto_find_batch_size:
73
+ desc: ''
74
+ sort: 189
75
+ value: false
76
+ average_tokens_across_devices:
77
+ desc: ''
78
+ sort: 205
79
+ value: false
80
+ bad_words_ids:
81
+ desc: ''
82
+ sort: 50
83
+ value: null
84
+ batch_eval_metrics:
85
+ desc: ''
86
+ sort: 201
87
+ value: false
88
+ begin_suppress_tokens:
89
+ desc: ''
90
+ sort: 59
91
+ value: null
92
+ bf16:
93
+ desc: ''
94
+ sort: 126
95
+ value: true
96
+ bf16_full_eval:
97
+ desc: ''
98
+ sort: 130
99
+ value: false
100
+ bos_token_id:
101
+ desc: ''
102
+ sort: 66
103
+ value: 128000
104
+ channels:
105
+ desc: ''
106
+ sort: 220
107
+ value: null
108
+ check_model:
109
+ desc: ''
110
+ sort: 212
111
+ value: true
112
+ chunk_size_feed_forward:
113
+ desc: ''
114
+ sort: 29
115
+ value: 0
116
+ cross_attention_hidden_size:
117
+ desc: ''
118
+ sort: 32
119
+ value: null
120
+ data_seed:
121
+ desc: ''
122
+ sort: 123
123
+ value: 42
124
+ dataloader_drop_last:
125
+ desc: ''
126
+ sort: 138
127
+ value: false
128
+ dataloader_num_workers:
129
+ desc: ''
130
+ sort: 140
131
+ value: 4
132
+ dataloader_persistent_workers:
133
+ desc: ''
134
+ sort: 169
135
+ value: false
136
+ dataloader_pin_memory:
137
+ desc: ''
138
+ sort: 168
139
+ value: true
140
+ dataloader_prefetch_factor:
141
+ desc: ''
142
+ sort: 141
143
+ value: 10
144
+ ddp_backend:
145
+ desc: ''
146
+ sort: 134
147
+ value: null
148
+ ddp_broadcast_buffers:
149
+ desc: ''
150
+ sort: 167
151
+ value: null
152
+ ddp_bucket_cap_mb:
153
+ desc: ''
154
+ sort: 166
155
+ value: null
156
+ ddp_find_unused_parameters:
157
+ desc: ''
158
+ sort: 165
159
+ value: null
160
+ ddp_timeout:
161
+ desc: ''
162
+ sort: 193
163
+ value: 18000000
164
+ debug:
165
+ desc: ''
166
+ sort: 137
167
+ value: []
168
+ decoder_start_token_id:
169
+ desc: ''
170
+ sort: 70
171
+ value: null
172
+ deepspeed:
173
+ desc: ''
174
+ sort: 157
175
+ value:
176
+ bf16:
177
+ enabled: auto
178
+ fp16:
179
+ enabled: auto
180
+ hysteresis: 2
181
+ initial_scale_power: 16
182
+ loss_scale: 0
183
+ loss_scale_window: 1000
184
+ min_loss_scale: 1
185
+ gradient_accumulation_steps: auto
186
+ gradient_clipping: auto
187
+ steps_per_print: 2000
188
+ train_batch_size: auto
189
+ train_micro_batch_size_per_gpu: auto
190
+ wall_clock_breakdown: false
191
+ zero_optimization:
192
+ contiguous_gradients: true
193
+ offload_optimizer:
194
+ device: none
195
+ pin_memory: true
196
+ offload_param:
197
+ device: none
198
+ pin_memory: true
199
+ overlap_comm: false
200
+ reduce_bucket_size: auto
201
+ stage: 3
202
+ stage3_gather_16bit_weights_on_model_save: true
203
+ stage3_max_live_parameters: 1000000000.0
204
+ stage3_max_reuse_distance: 1000000000.0
205
+ stage3_param_persistence_threshold: auto
206
+ stage3_prefetch_bucket_size: auto
207
+ sub_group_size: 1000000000.0
208
+ zero_quantized_gradients: false
209
+ zero_quantized_weights: false
210
+ disable_tqdm:
211
+ desc: ''
212
+ sort: 144
213
+ value: false
214
+ diversity_penalty:
215
+ desc: ''
216
+ sort: 41
217
+ value: 0.0
218
+ do_eval:
219
+ desc: ''
220
+ sort: 80
221
+ value: true
222
+ do_predict:
223
+ desc: ''
224
+ sort: 81
225
+ value: false
226
+ do_sample:
227
+ desc: ''
228
+ sort: 37
229
+ value: false
230
+ do_train:
231
+ desc: ''
232
+ sort: 79
233
+ value: false
234
+ early_stopping:
235
+ desc: ''
236
+ sort: 38
237
+ value: false
238
+ encoder_no_repeat_ngram_size:
239
+ desc: ''
240
+ sort: 49
241
+ value: 0
242
+ eos_token_id:
243
+ desc: ''
244
+ sort: 68
245
+ value:
246
+ - 128001
247
+ - 128008
248
+ - 128009
249
+ eval_accumulation_steps:
250
+ desc: ''
251
+ sort: 89
252
+ value: null
253
+ eval_datasets:
254
+ desc: ''
255
+ sort: 225
256
+ value: []
257
+ eval_datasets_args:
258
+ desc: ''
259
+ sort: 227
260
+ value: null
261
+ eval_delay:
262
+ desc: ''
263
+ sort: 90
264
+ value: 0
265
+ eval_do_concat_batches:
266
+ desc: ''
267
+ sort: 183
268
+ value: true
269
+ eval_generation_config:
270
+ desc: ''
271
+ sort: 228
272
+ value: null
273
+ eval_limit:
274
+ desc: ''
275
+ sort: 226
276
+ value: null
277
+ eval_on_start:
278
+ desc: ''
279
+ sort: 202
280
+ value: false
281
+ eval_steps:
282
+ desc: ''
283
+ sort: 139
284
+ value: null
285
+ eval_strategy:
286
+ desc: ''
287
+ sort: 82
288
+ value: epoch
289
+ eval_use_evalscope:
290
+ desc: ''
291
+ sort: 224
292
+ value: false
293
+ eval_use_gather_object:
294
+ desc: ''
295
+ sort: 204
296
+ value: false
297
+ exponential_decay_length_penalty:
298
+ desc: ''
299
+ sort: 57
300
+ value: null
301
+ finetuning_task:
302
+ desc: ''
303
+ sort: 61
304
+ value: null
305
+ forced_bos_token_id:
306
+ desc: ''
307
+ sort: 54
308
+ value: null
309
+ forced_eos_token_id:
310
+ desc: ''
311
+ sort: 55
312
+ value: null
313
+ fp16:
314
+ desc: ''
315
+ sort: 127
316
+ value: false
317
+ fp16_backend:
318
+ desc: ''
319
+ sort: 184
320
+ value: auto
321
+ fp16_full_eval:
322
+ desc: ''
323
+ sort: 131
324
+ value: false
325
+ fp16_opt_level:
326
+ desc: ''
327
+ sort: 128
328
+ value: O1
329
+ fsdp:
330
+ desc: ''
331
+ sort: 151
332
+ value: []
333
+ fsdp_config:
334
+ desc: ''
335
+ sort: 153
336
+ value:
337
+ min_num_params: 0
338
+ xla: false
339
+ xla_fsdp_grad_ckpt: false
340
+ xla_fsdp_v2: false
341
+ fsdp_min_num_params:
342
+ desc: ''
343
+ sort: 152
344
+ value: 0
345
+ fsdp_num:
346
+ desc: ''
347
+ sort: 222
348
+ value: 1
349
+ fsdp_transformer_layer_cls_to_wrap:
350
+ desc: ''
351
+ sort: 155
352
+ value: null
353
+ full_determinism:
354
+ desc: ''
355
+ sort: 190
356
+ value: false
357
+ galore_config:
358
+ desc: ''
359
+ sort: 231
360
+ value: null
361
+ generation_config:
362
+ desc: ''
363
+ sort: 210
364
+ value: null
365
+ generation_max_length:
366
+ desc: ''
367
+ sort: 208
368
+ value: null
369
+ generation_num_beams:
370
+ desc: ''
371
+ sort: 209
372
+ value: null
373
+ gradient_accumulation_steps:
374
+ desc: ''
375
+ sort: 88
376
+ value: 2
377
+ gradient_checkpointing:
378
+ desc: ''
379
+ sort: 179
380
+ value: false
381
+ gradient_checkpointing_kwargs:
382
+ desc: ''
383
+ sort: 180
384
+ value: null
385
+ greater_is_better:
386
+ desc: ''
387
+ sort: 149
388
+ value: false
389
+ group_by_length:
390
+ desc: ''
391
+ sort: 162
392
+ value: false
393
+ half_precision_backend:
394
+ desc: ''
395
+ sort: 129
396
+ value: auto
397
+ head_dim:
398
+ desc: ''
399
+ sort: 19
400
+ value: 128
401
+ hidden_act:
402
+ desc: ''
403
+ sort: 9
404
+ value: silu
405
+ hidden_size:
406
+ desc: ''
407
+ sort: 4
408
+ value: 4096
409
+ hub_always_push:
410
+ desc: ''
411
+ sort: 178
412
+ value: false
413
+ hub_model_id:
414
+ desc: ''
415
+ sort: 174
416
+ value: null
417
+ hub_private_repo:
418
+ desc: ''
419
+ sort: 177
420
+ value: null
421
+ hub_strategy:
422
+ desc: ''
423
+ sort: 175
424
+ value: every_save
425
+ hub_token:
426
+ desc: ''
427
+ sort: 176
428
+ value: <HUB_TOKEN>
429
+ id2label:
430
+ desc: ''
431
+ sort: 62
432
+ value:
433
+ '0': LABEL_0
434
+ '1': LABEL_1
435
+ ignore_data_skip:
436
+ desc: ''
437
+ sort: 150
438
+ value: false
439
+ include_for_metrics:
440
+ desc: ''
441
+ sort: 182
442
+ value: []
443
+ include_inputs_for_metrics:
444
+ desc: ''
445
+ sort: 181
446
+ value: false
447
+ include_num_input_tokens_seen:
448
+ desc: ''
449
+ sort: 198
450
+ value: false
451
+ include_tokens_per_second:
452
+ desc: ''
453
+ sort: 197
454
+ value: false
455
+ initializer_range:
456
+ desc: ''
457
+ sort: 10
458
+ value: 0.02
459
+ intermediate_size:
460
+ desc: ''
461
+ sort: 5
462
+ value: 14336
463
+ is_decoder:
464
+ desc: ''
465
+ sort: 31
466
+ value: false
467
+ is_encoder_decoder:
468
+ desc: ''
469
+ sort: 30
470
+ value: false
471
+ jit_mode_eval:
472
+ desc: ''
473
+ sort: 124
474
+ value: false
475
+ label2id:
476
+ desc: ''
477
+ sort: 63
478
+ value:
479
+ LABEL_0: 0
480
+ LABEL_1: 1
481
+ label_names:
482
+ desc: ''
483
+ sort: 146
484
+ value: null
485
+ label_smoothing_factor:
486
+ desc: ''
487
+ sort: 158
488
+ value: 0.0
489
+ learning_rate:
490
+ desc: ''
491
+ sort: 92
492
+ value: 1.0e-06
493
+ length_column_name:
494
+ desc: ''
495
+ sort: 163
496
+ value: length
497
+ length_penalty:
498
+ desc: ''
499
+ sort: 47
500
+ value: 1.0
501
+ load_best_model_at_end:
502
+ desc: ''
503
+ sort: 147
504
+ value: false
505
+ local_rank:
506
+ desc: ''
507
+ sort: 133
508
+ value: 0
509
+ local_repo_path:
510
+ desc: ''
511
+ sort: 230
512
+ value: null
513
+ log_level:
514
+ desc: ''
515
+ sort: 104
516
+ value: passive
517
+ log_level_replica:
518
+ desc: ''
519
+ sort: 105
520
+ value: warning
521
+ log_on_each_node:
522
+ desc: ''
523
+ sort: 106
524
+ value: true
525
+ logging_dir:
526
+ desc: ''
527
+ sort: 107
528
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v4/v0-20250629-111856/runs
529
+ logging_first_step:
530
+ desc: ''
531
+ sort: 109
532
+ value: true
533
+ logging_nan_inf_filter:
534
+ desc: ''
535
+ sort: 111
536
+ value: true
537
+ logging_steps:
538
+ desc: ''
539
+ sort: 110
540
+ value: 1
541
+ logging_strategy:
542
+ desc: ''
543
+ sort: 108
544
+ value: steps
545
+ lr_scheduler_kwargs:
546
+ desc: ''
547
+ sort: 101
548
+ value: null
549
+ lr_scheduler_type:
550
+ desc: ''
551
+ sort: 100
552
+ value: cosine
553
+ max_epochs:
554
+ desc: ''
555
+ sort: 215
556
+ value: null
557
+ max_grad_norm:
558
+ desc: ''
559
+ sort: 97
560
+ value: 1.0
561
+ max_length:
562
+ desc: ''
563
+ sort: 35
564
+ value: 20
565
+ max_position_embeddings:
566
+ desc: ''
567
+ sort: 3
568
+ value: 131072
569
+ max_steps:
570
+ desc: ''
571
+ sort: 99
572
+ value: -1
573
+ metric_for_best_model:
574
+ desc: ''
575
+ sort: 148
576
+ value: loss
577
+ metric_warmup_step:
578
+ desc: ''
579
+ sort: 221
580
+ value: 0
581
+ min_length:
582
+ desc: ''
583
+ sort: 36
584
+ value: 0
585
+ mlp_bias:
586
+ desc: ''
587
+ sort: 18
588
+ value: false
589
+ model_num_parameters:
590
+ desc: ''
591
+ sort: 232
592
+ value: 0
593
+ model_type:
594
+ desc: ''
595
+ sort: 76
596
+ value: llama
597
+ mp_parameters:
598
+ desc: ''
599
+ sort: 188
600
+ value: ''
601
+ neftune_noise_alpha:
602
+ desc: ''
603
+ sort: 199
604
+ value: null
605
+ no_cuda:
606
+ desc: ''
607
+ sort: 119
608
+ value: false
609
+ no_repeat_ngram_size:
610
+ desc: ''
611
+ sort: 48
612
+ value: 0
613
+ num_attention_heads:
614
+ desc: ''
615
+ sort: 7
616
+ value: 32
617
+ num_beam_groups:
618
+ desc: ''
619
+ sort: 40
620
+ value: 1
621
+ num_beams:
622
+ desc: ''
623
+ sort: 39
624
+ value: 1
625
+ num_hidden_layers:
626
+ desc: ''
627
+ sort: 6
628
+ value: 32
629
+ num_key_value_heads:
630
+ desc: ''
631
+ sort: 8
632
+ value: 8
633
+ num_return_sequences:
634
+ desc: ''
635
+ sort: 51
636
+ value: 1
637
+ num_train_epochs:
638
+ desc: ''
639
+ sort: 98
640
+ value: 5.0
641
+ optim:
642
+ desc: ''
643
+ sort: 159
644
+ value: adamw_torch
645
+ optim_args:
646
+ desc: ''
647
+ sort: 160
648
+ value: null
649
+ optim_target_modules:
650
+ desc: ''
651
+ sort: 200
652
+ value: null
653
+ optimizer:
654
+ desc: ''
655
+ sort: 218
656
+ value: null
657
+ output_attentions:
658
+ desc: ''
659
+ sort: 22
660
+ value: false
661
+ output_dir:
662
+ desc: ''
663
+ sort: 77
664
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v4/v0-20250629-111856
665
+ output_hidden_states:
666
+ desc: ''
667
+ sort: 21
668
+ value: false
669
+ output_scores:
670
+ desc: ''
671
+ sort: 52
672
+ value: false
673
+ overwrite_output_dir:
674
+ desc: ''
675
+ sort: 78
676
+ value: false
677
+ pad_token_id:
678
+ desc: ''
679
+ sort: 67
680
+ value: 128009
681
+ past_index:
682
+ desc: ''
683
+ sort: 142
684
+ value: -1
685
+ per_device_eval_batch_size:
686
+ desc: ''
687
+ sort: 85
688
+ value: 2
689
+ per_device_train_batch_size:
690
+ desc: ''
691
+ sort: 84
692
+ value: 2
693
+ per_gpu_eval_batch_size:
694
+ desc: ''
695
+ sort: 87
696
+ value: null
697
+ per_gpu_train_batch_size:
698
+ desc: ''
699
+ sort: 86
700
+ value: null
701
+ predict_with_generate:
702
+ desc: ''
703
+ sort: 207
704
+ value: false
705
+ prediction_loss_only:
706
+ desc: ''
707
+ sort: 83
708
+ value: false
709
+ prefix:
710
+ desc: ''
711
+ sort: 65
712
+ value: null
713
+ pretraining_tp:
714
+ desc: ''
715
+ sort: 12
716
+ value: 1
717
+ problem_type:
718
+ desc: ''
719
+ sort: 72
720
+ value: null
721
+ pruned_heads:
722
+ desc: ''
723
+ sort: 27
724
+ value: {}
725
+ push_to_hub:
726
+ desc: ''
727
+ sort: 172
728
+ value: false
729
+ push_to_hub_model_id:
730
+ desc: ''
731
+ sort: 185
732
+ value: null
733
+ push_to_hub_organization:
734
+ desc: ''
735
+ sort: 186
736
+ value: null
737
+ push_to_hub_token:
738
+ desc: ''
739
+ sort: 187
740
+ value: <PUSH_TO_HUB_TOKEN>
741
+ ray_scope:
742
+ desc: ''
743
+ sort: 192
744
+ value: last
745
+ remove_invalid_values:
746
+ desc: ''
747
+ sort: 56
748
+ value: false
749
+ remove_unused_columns:
750
+ desc: ''
751
+ sort: 145
752
+ value: false
753
+ repetition_penalty:
754
+ desc: ''
755
+ sort: 46
756
+ value: 1.0
757
+ report_to:
758
+ desc: ''
759
+ sort: 164
760
+ value:
761
+ - swanlab
762
+ restore_callback_states_from_checkpoint:
763
+ desc: ''
764
+ sort: 118
765
+ value: false
766
+ resume_from_checkpoint:
767
+ desc: ''
768
+ sort: 173
769
+ value: null
770
+ return_dict:
771
+ desc: ''
772
+ sort: 20
773
+ value: true
774
+ return_dict_in_generate:
775
+ desc: ''
776
+ sort: 53
777
+ value: false
778
+ rms_norm_eps:
779
+ desc: ''
780
+ sort: 11
781
+ value: 1.0e-05
782
+ rope_scaling:
783
+ desc: ''
784
+ sort: 15
785
+ value:
786
+ factor: 8.0
787
+ high_freq_factor: 4.0
788
+ low_freq_factor: 1.0
789
+ original_max_position_embeddings: 8192
790
+ rope_type: llama3
791
+ rope_theta:
792
+ desc: ''
793
+ sort: 14
794
+ value: 500000.0
795
+ run_name:
796
+ desc: ''
797
+ sort: 143
798
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_v4/v0-20250629-111856
799
+ save_on_each_node:
800
+ desc: ''
801
+ sort: 116
802
+ value: false
803
+ save_only_model:
804
+ desc: ''
805
+ sort: 117
806
+ value: false
807
+ save_safetensors:
808
+ desc: ''
809
+ sort: 115
810
+ value: true
811
+ save_steps:
812
+ desc: ''
813
+ sort: 113
814
+ value: 500
815
+ save_strategy:
816
+ desc: ''
817
+ sort: 112
818
+ value: steps
819
+ save_total_limit:
820
+ desc: ''
821
+ sort: 114
822
+ value: 1
823
+ seed:
824
+ desc: ''
825
+ sort: 122
826
+ value: 42
827
+ sep_token_id:
828
+ desc: ''
829
+ sort: 69
830
+ value: null
831
+ skip_memory_metrics:
832
+ desc: ''
833
+ sort: 170
834
+ value: true
835
+ sortish_sampler:
836
+ desc: ''
837
+ sort: 206
838
+ value: false
839
+ suppress_tokens:
840
+ desc: ''
841
+ sort: 58
842
+ value: null
843
+ task_specific_params:
844
+ desc: ''
845
+ sort: 71
846
+ value: null
847
+ temperature:
848
+ desc: ''
849
+ sort: 42
850
+ value: 1.0
851
+ tf32:
852
+ desc: ''
853
+ sort: 132
854
+ value: null
855
+ tf_legacy_loss:
856
+ desc: ''
857
+ sort: 26
858
+ value: false
859
+ tie_encoder_decoder:
860
+ desc: ''
861
+ sort: 34
862
+ value: false
863
+ tie_word_embeddings:
864
+ desc: ''
865
+ sort: 28
866
+ value: false
867
+ tokenizer_class:
868
+ desc: ''
869
+ sort: 64
870
+ value: null
871
+ top_k:
872
+ desc: ''
873
+ sort: 43
874
+ value: 50
875
+ top_p:
876
+ desc: ''
877
+ sort: 44
878
+ value: 1.0
879
+ torch_compile:
880
+ desc: ''
881
+ sort: 194
882
+ value: false
883
+ torch_compile_backend:
884
+ desc: ''
885
+ sort: 195
886
+ value: null
887
+ torch_compile_mode:
888
+ desc: ''
889
+ sort: 196
890
+ value: null
891
+ torch_dtype:
892
+ desc: ''
893
+ sort: 24
894
+ value: bfloat16
895
+ torch_empty_cache_steps:
896
+ desc: ''
897
+ sort: 91
898
+ value: null
899
+ torchdynamo:
900
+ desc: ''
901
+ sort: 191
902
+ value: null
903
+ torchscript:
904
+ desc: ''
905
+ sort: 23
906
+ value: false
907
+ tp_size:
908
+ desc: ''
909
+ sort: 154
910
+ value: 0
911
+ tpu_metrics_debug:
912
+ desc: ''
913
+ sort: 136
914
+ value: false
915
+ tpu_num_cores:
916
+ desc: ''
917
+ sort: 135
918
+ value: null
919
+ train_dataloader_shuffle:
920
+ desc: ''
921
+ sort: 214
922
+ value: true
923
+ train_type:
924
+ desc: ''
925
+ sort: 229
926
+ value: full
927
+ transformers_version:
928
+ desc: ''
929
+ sort: 75
930
+ value: 4.51.3
931
+ typical_p:
932
+ desc: ''
933
+ sort: 45
934
+ value: 1.0
935
+ use_bfloat16:
936
+ desc: ''
937
+ sort: 25
938
+ value: false
939
+ use_cache:
940
+ desc: ''
941
+ sort: 13
942
+ value: false
943
+ use_cpu:
944
+ desc: ''
945
+ sort: 120
946
+ value: false
947
+ use_ipex:
948
+ desc: ''
949
+ sort: 125
950
+ value: false
951
+ use_legacy_prediction_loop:
952
+ desc: ''
953
+ sort: 171
954
+ value: false
955
+ use_liger_kernel:
956
+ desc: ''
957
+ sort: 203
958
+ value: false
959
+ use_logits_to_keep:
960
+ desc: ''
961
+ sort: 219
962
+ value: null
963
+ use_mps_device:
964
+ desc: ''
965
+ sort: 121
966
+ value: false
967
+ vit_gradient_checkpointing:
968
+ desc: ''
969
+ sort: 211
970
+ value: true
971
+ vit_lr:
972
+ desc: ''
973
+ sort: 217
974
+ value: null
975
+ vocab_size:
976
+ desc: ''
977
+ sort: 2
978
+ value: 128256
979
+ warmup_ratio:
980
+ desc: ''
981
+ sort: 102
982
+ value: 0.05
983
+ warmup_steps:
984
+ desc: ''
985
+ sort: 103
986
+ value: 0
987
+ weight_decay:
988
+ desc: ''
989
+ sort: 93
990
+ value: 0.0001
swanlog/run-20250629_111950-a3b1799d/files/requirements.txt ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.2.1
2
+ accelerate==1.6.0
3
+ addict==2.4.0
4
+ aiofiles==24.1.0
5
+ aiohappyeyeballs==2.6.1
6
+ aiohttp==3.11.14
7
+ aiosignal==1.3.2
8
+ airportsdata==20250224
9
+ aliyun-python-sdk-core==2.16.0
10
+ aliyun-python-sdk-kms==2.16.5
11
+ altair==5.5.0
12
+ annotated-types==0.7.0
13
+ antlr4-python3-runtime==4.13.2
14
+ anyio==4.9.0
15
+ astor==0.8.1
16
+ async-timeout==5.0.1
17
+ attrdict==2.0.1
18
+ attrs==25.3.0
19
+ av==14.3.0
20
+ beautifulsoup4==4.13.3
21
+ binpacking==1.5.2
22
+ bitsandbytes==0.45.5
23
+ blake3==1.0.4
24
+ blinker==1.9.0
25
+ boto3==1.38.46
26
+ botocore==1.38.46
27
+ cachetools==5.5.2
28
+ certifi==2025.1.31
29
+ cffi==1.17.1
30
+ charset-normalizer==3.4.1
31
+ click==8.1.8
32
+ cloudpickle==3.1.1
33
+ colorama==0.4.6
34
+ compressed-tensors==0.9.4
35
+ contourpy==1.3.1
36
+ cpm-kernels==1.0.11
37
+ crcmod==1.7
38
+ cryptography==44.0.3
39
+ cupy-cuda12x==13.4.1
40
+ cycler==0.12.1
41
+ dacite==1.9.2
42
+ dashscope==1.23.3
43
+ datasets==3.2.0
44
+ decord==0.6.0
45
+ deepspeed==0.16.5
46
+ Deprecated==1.2.18
47
+ depyf==0.18.0
48
+ dill==0.3.8
49
+ diskcache==5.6.3
50
+ distro==1.9.0
51
+ dnspython==2.7.0
52
+ docker-pycreds==0.4.0
53
+ einops==0.6.1
54
+ einops-exts==0.0.4
55
+ email_validator==2.2.0
56
+ entmax==1.3
57
+ et_xmlfile==2.0.0
58
+ exceptiongroup==1.2.2
59
+ fastapi==0.115.12
60
+ fastapi-cli==0.0.7
61
+ fastrlock==0.8.3
62
+ ffmpy==0.5.0
63
+ filelock==3.18.0
64
+ flash_attn==2.7.4.post1
65
+ fonttools==4.56.0
66
+ frozenlist==1.5.0
67
+ fsspec==2024.9.0
68
+ future==1.0.0
69
+ gdown==5.2.0
70
+ gguf==0.16.3
71
+ gitdb==4.0.12
72
+ GitPython==3.1.44
73
+ googleapis-common-protos==1.70.0
74
+ gradio==5.29.0
75
+ gradio_client==1.10.0
76
+ groovy==0.1.2
77
+ grpcio==1.71.0
78
+ h11==0.16.0
79
+ hf-xet==1.1.2
80
+ hjson==3.1.0
81
+ httpcore==1.0.9
82
+ httptools==0.6.4
83
+ httpx==0.28.1
84
+ huggingface-hub==0.32.2
85
+ idna==3.10
86
+ imageio==2.37.0
87
+ importlib_metadata==8.0.0
88
+ interegular==0.3.3
89
+ jieba==0.42.1
90
+ Jinja2==3.1.6
91
+ jiter==0.9.0
92
+ jmespath==0.10.0
93
+ joblib==1.4.2
94
+ jsonargparse==3.13.1
95
+ jsonschema==4.23.0
96
+ jsonschema-specifications==2024.10.1
97
+ kiwisolver==1.4.8
98
+ lark==1.2.2
99
+ latex2mathml==3.77.0
100
+ latex2sympy2_extended==1.10.1
101
+ lightning-utilities==0.14.3
102
+ linkify-it-py==2.0.3
103
+ llguidance==0.7.19
104
+ llvmlite==0.44.0
105
+ lm-format-enforcer==0.10.11
106
+ lxml==5.4.0
107
+ Markdown==3.7
108
+ markdown-it-py==2.2.0
109
+ markdown2==2.5.3
110
+ MarkupSafe==3.0.2
111
+ math-verify==0.7.0
112
+ matplotlib==3.10.1
113
+ mdit-py-plugins==0.3.3
114
+ mdurl==0.1.2
115
+ mistral_common==1.5.4
116
+ mmcls==0.25.0
117
+ mmcv==2.2.0
118
+ mmcv-full==1.6.2
119
+ mmengine==0.10.7
120
+ mmsegmentation==0.30.0
121
+ model-index==0.1.11
122
+ modelscope==1.25.0
123
+ mpmath==1.3.0
124
+ ms_swift==3.5.0
125
+ msgpack==1.1.0
126
+ msgspec==0.19.0
127
+ multidict==6.2.0
128
+ multiprocess==0.70.16
129
+ narwhals==1.32.0
130
+ nest-asyncio==1.6.0
131
+ networkx==3.4.2
132
+ ninja==1.11.1.4
133
+ nltk==3.9.1
134
+ numba==0.61.2
135
+ numpy==1.26.4
136
+ nvidia-cublas-cu12==12.6.4.1
137
+ nvidia-cuda-cupti-cu12==12.6.80
138
+ nvidia-cuda-nvrtc-cu12==12.6.77
139
+ nvidia-cuda-runtime-cu12==12.6.77
140
+ nvidia-cudnn-cu12==9.5.1.17
141
+ nvidia-cufft-cu12==11.3.0.4
142
+ nvidia-cufile-cu12==1.11.1.6
143
+ nvidia-curand-cu12==10.3.7.77
144
+ nvidia-cusolver-cu12==11.7.1.2
145
+ nvidia-cusparse-cu12==12.5.4.2
146
+ nvidia-cusparselt-cu12==0.6.3
147
+ nvidia-ml-py==12.575.51
148
+ nvidia-nccl-cu12==2.26.2
149
+ nvidia-nvjitlink-cu12==12.6.85
150
+ nvidia-nvtx-cu12==12.6.77
151
+ openai==1.77.0
152
+ opencv-python==4.11.0.86
153
+ opencv-python-headless==4.11.0.86
154
+ opendatalab==0.0.10
155
+ openmim==0.3.9
156
+ openpyxl==3.1.5
157
+ opentelemetry-api==1.26.0
158
+ opentelemetry-exporter-otlp==1.26.0
159
+ opentelemetry-exporter-otlp-proto-common==1.26.0
160
+ opentelemetry-exporter-otlp-proto-grpc==1.26.0
161
+ opentelemetry-exporter-otlp-proto-http==1.26.0
162
+ opentelemetry-proto==1.26.0
163
+ opentelemetry-sdk==1.26.0
164
+ opentelemetry-semantic-conventions==0.47b0
165
+ opentelemetry-semantic-conventions-ai==0.4.6
166
+ openxlab==0.0.11
167
+ ordered-set==4.1.0
168
+ orjson==3.10.16
169
+ oss2==2.19.1
170
+ outlines==0.1.11
171
+ outlines_core==0.1.26
172
+ packaging==24.2
173
+ pandas==2.2.3
174
+ partial-json-parser==0.2.1.1.post5
175
+ peft==0.15.2
176
+ pillow==11.1.0
177
+ pip==25.0
178
+ platformdirs==4.3.7
179
+ portalocker==3.1.1
180
+ prettytable==3.16.0
181
+ prometheus_client==0.21.1
182
+ prometheus-fastapi-instrumentator==7.1.0
183
+ propcache==0.3.1
184
+ protobuf==4.25.7
185
+ psutil==7.0.0
186
+ py-cpuinfo==9.0.0
187
+ pyarrow==19.0.1
188
+ pycocoevalcap==1.2
189
+ pycocotools==2.0.8
190
+ pycountry==24.6.1
191
+ pycparser==2.22
192
+ pycryptodome==3.22.0
193
+ pydantic==2.11.1
194
+ pydantic_core==2.33.0
195
+ pydeck==0.9.1
196
+ pydub==0.25.1
197
+ pyecharts==2.0.8
198
+ Pygments==2.19.1
199
+ pynvml==12.0.0
200
+ pyparsing==3.2.3
201
+ PySocks==1.7.1
202
+ python-dateutil==2.9.0.post0
203
+ python-dotenv==1.1.0
204
+ python-json-logger==3.3.0
205
+ python-multipart==0.0.20
206
+ pytorch-lightning==2.5.1.post0
207
+ pytz==2025.2
208
+ PyYAML==6.0.2
209
+ pyzmq==26.4.0
210
+ qwen-vl-utils==0.0.11
211
+ ray==2.45.0
212
+ referencing==0.36.2
213
+ regex==2024.11.6
214
+ requests==2.32.3
215
+ rich==13.9.4
216
+ rich-toolkit==0.14.5
217
+ rouge==1.0.1
218
+ rpds-py==0.24.0
219
+ ruff==0.11.8
220
+ s3transfer==0.13.0
221
+ sacrebleu==2.5.1
222
+ safehttpx==0.1.6
223
+ safetensors==0.5.3
224
+ scikit-learn==1.6.1
225
+ scipy==1.15.2
226
+ semantic-version==2.10.0
227
+ sentencepiece==0.2.0
228
+ sentry-sdk==2.27.0
229
+ setproctitle==1.3.6
230
+ setuptools==69.5.1
231
+ shellingham==1.5.4
232
+ shortuuid==1.0.13
233
+ simplejson==3.20.1
234
+ six==1.17.0
235
+ smmap==5.0.2
236
+ sniffio==1.3.1
237
+ sortedcontainers==2.4.0
238
+ soupsieve==2.6
239
+ starlette==0.46.1
240
+ streamlit==1.44.0
241
+ streamlit-image-select==0.6.0
242
+ svgwrite==1.4.3
243
+ swankit==0.2.4
244
+ swanlab==0.6.4
245
+ sympy==1.14.0
246
+ tabulate==0.9.0
247
+ tenacity==9.0.0
248
+ tensorboard==2.19.0
249
+ tensorboard-data-server==0.7.2
250
+ tensorboardX==2.6.2.2
251
+ termcolor==2.5.0
252
+ threadpoolctl==3.6.0
253
+ tiktoken==0.9.0
254
+ timm==0.9.12
255
+ tokenizers==0.21.1
256
+ toml==0.10.2
257
+ tomli==2.2.1
258
+ tomlkit==0.13.2
259
+ torch==2.7.0
260
+ torchaudio==2.7.0
261
+ torchmetrics==0.10.3
262
+ torchvision==0.22.0
263
+ tornado==6.4.2
264
+ tqdm==4.67.1
265
+ transformers==4.51.3
266
+ transformers-stream-generator==0.0.5
267
+ triton==3.3.0
268
+ trl==0.17.0
269
+ typer==0.15.3
270
+ typing_extensions==4.13.0
271
+ typing-inspection==0.4.0
272
+ tzdata==2025.2
273
+ uc-micro-py==1.0.3
274
+ unbabel-comet==2.2.6
275
+ urllib3==2.3.0
276
+ uvicorn==0.34.0
277
+ uvloop==0.21.0
278
+ vllm==0.9.0
279
+ wandb==0.20.1
280
+ watchdog==6.0.0
281
+ watchfiles==1.0.5
282
+ wavedrom==2.0.3.post3
283
+ wcwidth==0.2.13
284
+ websocket-client==1.8.0
285
+ websockets==15.0.1
286
+ Werkzeug==3.1.3
287
+ wheel==0.45.1
288
+ wrapt==1.17.2
289
+ xformers==0.0.30
290
+ xgrammar==0.1.19
291
+ xxhash==3.5.0
292
+ yacs==0.1.8
293
+ yapf==0.40.1
294
+ yarl==1.18.3
295
+ zipp==3.21.0
296
+ zstandard==0.23.0
swanlog/run-20250629_111950-a3b1799d/files/swanlab-metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 3726930, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type full --dataset data/train_data/normal_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 1e-6 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 1024 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_normal_v4 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_111950-a3b1799d"}}
swanlog/run-20250629_120036-a3b1799d/backup.swanlab ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1960e714980575f5e339106625733af3169058f29830a29253dd005ceb3ad64
3
+ size 895306
swanlog/run-20250629_120036-a3b1799d/files/config.yaml ADDED
@@ -0,0 +1,1004 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FRAMEWORK:
2
+ desc: ''
3
+ sort: 1
4
+ value: 🤗transformers
5
+ UPPERFRAME:
6
+ desc: ''
7
+ sort: 0
8
+ value: 🐦‍⬛ms-swift
9
+ _attn_implementation_autoset:
10
+ desc: ''
11
+ sort: 75
12
+ value: true
13
+ _name_or_path:
14
+ desc: ''
15
+ sort: 74
16
+ value: /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct
17
+ acc_steps:
18
+ desc: ''
19
+ sort: 224
20
+ value: 1
21
+ acc_strategy:
22
+ desc: ''
23
+ sort: 214
24
+ value: token
25
+ accelerator_config:
26
+ desc: ''
27
+ sort: 157
28
+ value:
29
+ dispatch_batches: false
30
+ even_batches: true
31
+ gradient_accumulation_kwargs: null
32
+ non_blocking: false
33
+ split_batches: false
34
+ use_seedable_sampler: true
35
+ adafactor:
36
+ desc: ''
37
+ sort: 162
38
+ value: false
39
+ adam_beta1:
40
+ desc: ''
41
+ sort: 95
42
+ value: 0.9
43
+ adam_beta2:
44
+ desc: ''
45
+ sort: 96
46
+ value: 0.95
47
+ adam_epsilon:
48
+ desc: ''
49
+ sort: 97
50
+ value: 1.0e-08
51
+ add_cross_attention:
52
+ desc: ''
53
+ sort: 34
54
+ value: false
55
+ aligner_lr:
56
+ desc: ''
57
+ sort: 217
58
+ value: null
59
+ architectures:
60
+ desc: ''
61
+ sort: 61
62
+ value:
63
+ - LlamaForCausalLM
64
+ attention_bias:
65
+ desc: ''
66
+ sort: 17
67
+ value: false
68
+ attention_dropout:
69
+ desc: ''
70
+ sort: 18
71
+ value: 0.0
72
+ auto_find_batch_size:
73
+ desc: ''
74
+ sort: 190
75
+ value: false
76
+ average_tokens_across_devices:
77
+ desc: ''
78
+ sort: 206
79
+ value: false
80
+ bad_words_ids:
81
+ desc: ''
82
+ sort: 51
83
+ value: null
84
+ batch_eval_metrics:
85
+ desc: ''
86
+ sort: 202
87
+ value: false
88
+ begin_suppress_tokens:
89
+ desc: ''
90
+ sort: 60
91
+ value: null
92
+ bf16:
93
+ desc: ''
94
+ sort: 127
95
+ value: true
96
+ bf16_full_eval:
97
+ desc: ''
98
+ sort: 131
99
+ value: false
100
+ bos_token_id:
101
+ desc: ''
102
+ sort: 67
103
+ value: 128000
104
+ channels:
105
+ desc: ''
106
+ sort: 221
107
+ value: null
108
+ check_model:
109
+ desc: ''
110
+ sort: 213
111
+ value: true
112
+ chunk_size_feed_forward:
113
+ desc: ''
114
+ sort: 30
115
+ value: 0
116
+ cross_attention_hidden_size:
117
+ desc: ''
118
+ sort: 33
119
+ value: null
120
+ data_seed:
121
+ desc: ''
122
+ sort: 124
123
+ value: 42
124
+ dataloader_drop_last:
125
+ desc: ''
126
+ sort: 139
127
+ value: false
128
+ dataloader_num_workers:
129
+ desc: ''
130
+ sort: 141
131
+ value: 4
132
+ dataloader_persistent_workers:
133
+ desc: ''
134
+ sort: 170
135
+ value: false
136
+ dataloader_pin_memory:
137
+ desc: ''
138
+ sort: 169
139
+ value: true
140
+ dataloader_prefetch_factor:
141
+ desc: ''
142
+ sort: 142
143
+ value: 10
144
+ ddp_backend:
145
+ desc: ''
146
+ sort: 135
147
+ value: null
148
+ ddp_broadcast_buffers:
149
+ desc: ''
150
+ sort: 168
151
+ value: null
152
+ ddp_bucket_cap_mb:
153
+ desc: ''
154
+ sort: 167
155
+ value: null
156
+ ddp_find_unused_parameters:
157
+ desc: ''
158
+ sort: 166
159
+ value: null
160
+ ddp_timeout:
161
+ desc: ''
162
+ sort: 194
163
+ value: 18000000
164
+ debug:
165
+ desc: ''
166
+ sort: 138
167
+ value: []
168
+ decoder_start_token_id:
169
+ desc: ''
170
+ sort: 71
171
+ value: null
172
+ deepspeed:
173
+ desc: ''
174
+ sort: 158
175
+ value:
176
+ bf16:
177
+ enabled: auto
178
+ fp16:
179
+ enabled: auto
180
+ hysteresis: 2
181
+ initial_scale_power: 16
182
+ loss_scale: 0
183
+ loss_scale_window: 1000
184
+ min_loss_scale: 1
185
+ gradient_accumulation_steps: auto
186
+ gradient_clipping: auto
187
+ steps_per_print: 2000
188
+ train_batch_size: auto
189
+ train_micro_batch_size_per_gpu: auto
190
+ wall_clock_breakdown: false
191
+ zero_optimization:
192
+ contiguous_gradients: true
193
+ offload_optimizer:
194
+ device: none
195
+ pin_memory: true
196
+ offload_param:
197
+ device: none
198
+ pin_memory: true
199
+ overlap_comm: false
200
+ reduce_bucket_size: auto
201
+ stage: 3
202
+ stage3_gather_16bit_weights_on_model_save: true
203
+ stage3_max_live_parameters: 1000000000.0
204
+ stage3_max_reuse_distance: 1000000000.0
205
+ stage3_param_persistence_threshold: auto
206
+ stage3_prefetch_bucket_size: auto
207
+ sub_group_size: 1000000000.0
208
+ zero_quantized_gradients: false
209
+ zero_quantized_weights: false
210
+ disable_tqdm:
211
+ desc: ''
212
+ sort: 145
213
+ value: false
214
+ diversity_penalty:
215
+ desc: ''
216
+ sort: 42
217
+ value: 0.0
218
+ do_eval:
219
+ desc: ''
220
+ sort: 81
221
+ value: true
222
+ do_predict:
223
+ desc: ''
224
+ sort: 82
225
+ value: false
226
+ do_sample:
227
+ desc: ''
228
+ sort: 38
229
+ value: false
230
+ do_train:
231
+ desc: ''
232
+ sort: 80
233
+ value: false
234
+ early_stopping:
235
+ desc: ''
236
+ sort: 39
237
+ value: false
238
+ encoder_no_repeat_ngram_size:
239
+ desc: ''
240
+ sort: 50
241
+ value: 0
242
+ eos_token_id:
243
+ desc: ''
244
+ sort: 69
245
+ value:
246
+ - 128001
247
+ - 128008
248
+ - 128009
249
+ eval_accumulation_steps:
250
+ desc: ''
251
+ sort: 90
252
+ value: null
253
+ eval_datasets:
254
+ desc: ''
255
+ sort: 226
256
+ value: []
257
+ eval_datasets_args:
258
+ desc: ''
259
+ sort: 228
260
+ value: null
261
+ eval_delay:
262
+ desc: ''
263
+ sort: 91
264
+ value: 0
265
+ eval_do_concat_batches:
266
+ desc: ''
267
+ sort: 184
268
+ value: true
269
+ eval_generation_config:
270
+ desc: ''
271
+ sort: 229
272
+ value: null
273
+ eval_limit:
274
+ desc: ''
275
+ sort: 227
276
+ value: null
277
+ eval_on_start:
278
+ desc: ''
279
+ sort: 203
280
+ value: false
281
+ eval_steps:
282
+ desc: ''
283
+ sort: 140
284
+ value: null
285
+ eval_strategy:
286
+ desc: ''
287
+ sort: 83
288
+ value: epoch
289
+ eval_use_evalscope:
290
+ desc: ''
291
+ sort: 225
292
+ value: false
293
+ eval_use_gather_object:
294
+ desc: ''
295
+ sort: 205
296
+ value: false
297
+ exponential_decay_length_penalty:
298
+ desc: ''
299
+ sort: 58
300
+ value: null
301
+ finetuning_task:
302
+ desc: ''
303
+ sort: 62
304
+ value: null
305
+ forced_bos_token_id:
306
+ desc: ''
307
+ sort: 55
308
+ value: null
309
+ forced_eos_token_id:
310
+ desc: ''
311
+ sort: 56
312
+ value: null
313
+ fp16:
314
+ desc: ''
315
+ sort: 128
316
+ value: false
317
+ fp16_backend:
318
+ desc: ''
319
+ sort: 185
320
+ value: auto
321
+ fp16_full_eval:
322
+ desc: ''
323
+ sort: 132
324
+ value: false
325
+ fp16_opt_level:
326
+ desc: ''
327
+ sort: 129
328
+ value: O1
329
+ fsdp:
330
+ desc: ''
331
+ sort: 152
332
+ value: []
333
+ fsdp_config:
334
+ desc: ''
335
+ sort: 154
336
+ value:
337
+ min_num_params: 0
338
+ xla: false
339
+ xla_fsdp_grad_ckpt: false
340
+ xla_fsdp_v2: false
341
+ fsdp_min_num_params:
342
+ desc: ''
343
+ sort: 153
344
+ value: 0
345
+ fsdp_num:
346
+ desc: ''
347
+ sort: 223
348
+ value: 1
349
+ fsdp_transformer_layer_cls_to_wrap:
350
+ desc: ''
351
+ sort: 156
352
+ value: null
353
+ full_determinism:
354
+ desc: ''
355
+ sort: 191
356
+ value: false
357
+ galore_config:
358
+ desc: ''
359
+ sort: 232
360
+ value: null
361
+ generation_config:
362
+ desc: ''
363
+ sort: 211
364
+ value: null
365
+ generation_max_length:
366
+ desc: ''
367
+ sort: 209
368
+ value: null
369
+ generation_num_beams:
370
+ desc: ''
371
+ sort: 210
372
+ value: null
373
+ gradient_accumulation_steps:
374
+ desc: ''
375
+ sort: 89
376
+ value: 2
377
+ gradient_checkpointing:
378
+ desc: ''
379
+ sort: 180
380
+ value: false
381
+ gradient_checkpointing_kwargs:
382
+ desc: ''
383
+ sort: 181
384
+ value: null
385
+ greater_is_better:
386
+ desc: ''
387
+ sort: 150
388
+ value: false
389
+ group_by_length:
390
+ desc: ''
391
+ sort: 163
392
+ value: false
393
+ half_precision_backend:
394
+ desc: ''
395
+ sort: 130
396
+ value: auto
397
+ head_dim:
398
+ desc: ''
399
+ sort: 20
400
+ value: 128
401
+ hidden_act:
402
+ desc: ''
403
+ sort: 10
404
+ value: silu
405
+ hidden_size:
406
+ desc: ''
407
+ sort: 5
408
+ value: 4096
409
+ hub_always_push:
410
+ desc: ''
411
+ sort: 179
412
+ value: false
413
+ hub_model_id:
414
+ desc: ''
415
+ sort: 175
416
+ value: null
417
+ hub_private_repo:
418
+ desc: ''
419
+ sort: 178
420
+ value: null
421
+ hub_strategy:
422
+ desc: ''
423
+ sort: 176
424
+ value: every_save
425
+ hub_token:
426
+ desc: ''
427
+ sort: 177
428
+ value: <HUB_TOKEN>
429
+ id2label:
430
+ desc: ''
431
+ sort: 63
432
+ value:
433
+ '0': LABEL_0
434
+ '1': LABEL_1
435
+ ignore_data_skip:
436
+ desc: ''
437
+ sort: 151
438
+ value: false
439
+ include_for_metrics:
440
+ desc: ''
441
+ sort: 183
442
+ value: []
443
+ include_inputs_for_metrics:
444
+ desc: ''
445
+ sort: 182
446
+ value: false
447
+ include_num_input_tokens_seen:
448
+ desc: ''
449
+ sort: 199
450
+ value: false
451
+ include_tokens_per_second:
452
+ desc: ''
453
+ sort: 198
454
+ value: false
455
+ initializer_range:
456
+ desc: ''
457
+ sort: 11
458
+ value: 0.02
459
+ intermediate_size:
460
+ desc: ''
461
+ sort: 6
462
+ value: 14336
463
+ is_decoder:
464
+ desc: ''
465
+ sort: 32
466
+ value: false
467
+ is_encoder_decoder:
468
+ desc: ''
469
+ sort: 31
470
+ value: false
471
+ jit_mode_eval:
472
+ desc: ''
473
+ sort: 125
474
+ value: false
475
+ label2id:
476
+ desc: ''
477
+ sort: 64
478
+ value:
479
+ LABEL_0: 0
480
+ LABEL_1: 1
481
+ label_names:
482
+ desc: ''
483
+ sort: 147
484
+ value: null
485
+ label_smoothing_factor:
486
+ desc: ''
487
+ sort: 159
488
+ value: 0.0
489
+ learning_rate:
490
+ desc: ''
491
+ sort: 93
492
+ value: 1.0e-06
493
+ length_column_name:
494
+ desc: ''
495
+ sort: 164
496
+ value: length
497
+ length_penalty:
498
+ desc: ''
499
+ sort: 48
500
+ value: 1.0
501
+ load_best_model_at_end:
502
+ desc: ''
503
+ sort: 148
504
+ value: false
505
+ local_rank:
506
+ desc: ''
507
+ sort: 134
508
+ value: 0
509
+ local_repo_path:
510
+ desc: ''
511
+ sort: 231
512
+ value: null
513
+ log_level:
514
+ desc: ''
515
+ sort: 105
516
+ value: passive
517
+ log_level_replica:
518
+ desc: ''
519
+ sort: 106
520
+ value: warning
521
+ log_on_each_node:
522
+ desc: ''
523
+ sort: 107
524
+ value: true
525
+ logging_dir:
526
+ desc: ''
527
+ sort: 108
528
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_lora_v4/v0-20250629-115943/runs
529
+ logging_first_step:
530
+ desc: ''
531
+ sort: 110
532
+ value: true
533
+ logging_nan_inf_filter:
534
+ desc: ''
535
+ sort: 112
536
+ value: true
537
+ logging_steps:
538
+ desc: ''
539
+ sort: 111
540
+ value: 1
541
+ logging_strategy:
542
+ desc: ''
543
+ sort: 109
544
+ value: steps
545
+ lr_scheduler_kwargs:
546
+ desc: ''
547
+ sort: 102
548
+ value: null
549
+ lr_scheduler_type:
550
+ desc: ''
551
+ sort: 101
552
+ value: cosine
553
+ max_epochs:
554
+ desc: ''
555
+ sort: 216
556
+ value: null
557
+ max_grad_norm:
558
+ desc: ''
559
+ sort: 98
560
+ value: 1.0
561
+ max_length:
562
+ desc: ''
563
+ sort: 36
564
+ value: 20
565
+ max_position_embeddings:
566
+ desc: ''
567
+ sort: 4
568
+ value: 131072
569
+ max_steps:
570
+ desc: ''
571
+ sort: 100
572
+ value: -1
573
+ metric_for_best_model:
574
+ desc: ''
575
+ sort: 149
576
+ value: loss
577
+ metric_warmup_step:
578
+ desc: ''
579
+ sort: 222
580
+ value: 0
581
+ min_length:
582
+ desc: ''
583
+ sort: 37
584
+ value: 0
585
+ mlp_bias:
586
+ desc: ''
587
+ sort: 19
588
+ value: false
589
+ model_num_parameters:
590
+ desc: ''
591
+ sort: 233
592
+ value: 0
593
+ model_type:
594
+ desc: ''
595
+ sort: 77
596
+ value: llama
597
+ mp_parameters:
598
+ desc: ''
599
+ sort: 189
600
+ value: ''
601
+ neftune_noise_alpha:
602
+ desc: ''
603
+ sort: 200
604
+ value: null
605
+ no_cuda:
606
+ desc: ''
607
+ sort: 120
608
+ value: false
609
+ no_repeat_ngram_size:
610
+ desc: ''
611
+ sort: 49
612
+ value: 0
613
+ num_attention_heads:
614
+ desc: ''
615
+ sort: 8
616
+ value: 32
617
+ num_beam_groups:
618
+ desc: ''
619
+ sort: 41
620
+ value: 1
621
+ num_beams:
622
+ desc: ''
623
+ sort: 40
624
+ value: 1
625
+ num_hidden_layers:
626
+ desc: ''
627
+ sort: 7
628
+ value: 32
629
+ num_key_value_heads:
630
+ desc: ''
631
+ sort: 9
632
+ value: 8
633
+ num_return_sequences:
634
+ desc: ''
635
+ sort: 52
636
+ value: 1
637
+ num_train_epochs:
638
+ desc: ''
639
+ sort: 99
640
+ value: 5.0
641
+ optim:
642
+ desc: ''
643
+ sort: 160
644
+ value: adamw_torch
645
+ optim_args:
646
+ desc: ''
647
+ sort: 161
648
+ value: null
649
+ optim_target_modules:
650
+ desc: ''
651
+ sort: 201
652
+ value: null
653
+ optimizer:
654
+ desc: ''
655
+ sort: 219
656
+ value: null
657
+ output_attentions:
658
+ desc: ''
659
+ sort: 23
660
+ value: false
661
+ output_dir:
662
+ desc: ''
663
+ sort: 78
664
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_lora_v4/v0-20250629-115943
665
+ output_hidden_states:
666
+ desc: ''
667
+ sort: 22
668
+ value: false
669
+ output_scores:
670
+ desc: ''
671
+ sort: 53
672
+ value: false
673
+ overwrite_output_dir:
674
+ desc: ''
675
+ sort: 79
676
+ value: false
677
+ pad_token_id:
678
+ desc: ''
679
+ sort: 68
680
+ value: 128009
681
+ past_index:
682
+ desc: ''
683
+ sort: 143
684
+ value: -1
685
+ peft_config:
686
+ desc: ''
687
+ sort: 2
688
+ value:
689
+ default: 'LoraConfig(task_type=''CAUSAL_LM'', peft_type=<PeftType.LORA: ''LORA''>,
690
+ auto_mapping=None, base_model_name_or_path=''/mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct'',
691
+ revision=None, inference_mode=False, r=8, target_modules={''up_proj'', ''o_proj'',
692
+ ''v_proj'', ''q_proj'', ''gate_proj'', ''k_proj'', ''down_proj''}, exclude_modules=None,
693
+ lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias=''none'', use_rslora=False,
694
+ modules_to_save=[], init_lora_weights=True, layers_to_transform=None, layers_pattern=None,
695
+ rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core=''megatron.core'',
696
+ trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None,
697
+ use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False),
698
+ lora_bias=False, lora_dtype=None, lorap_lr_ratio=None, lorap_emb_lr=1e-06)'
699
+ per_device_eval_batch_size:
700
+ desc: ''
701
+ sort: 86
702
+ value: 2
703
+ per_device_train_batch_size:
704
+ desc: ''
705
+ sort: 85
706
+ value: 2
707
+ per_gpu_eval_batch_size:
708
+ desc: ''
709
+ sort: 88
710
+ value: null
711
+ per_gpu_train_batch_size:
712
+ desc: ''
713
+ sort: 87
714
+ value: null
715
+ predict_with_generate:
716
+ desc: ''
717
+ sort: 208
718
+ value: false
719
+ prediction_loss_only:
720
+ desc: ''
721
+ sort: 84
722
+ value: false
723
+ prefix:
724
+ desc: ''
725
+ sort: 66
726
+ value: null
727
+ pretraining_tp:
728
+ desc: ''
729
+ sort: 13
730
+ value: 1
731
+ problem_type:
732
+ desc: ''
733
+ sort: 73
734
+ value: null
735
+ pruned_heads:
736
+ desc: ''
737
+ sort: 28
738
+ value: {}
739
+ push_to_hub:
740
+ desc: ''
741
+ sort: 173
742
+ value: false
743
+ push_to_hub_model_id:
744
+ desc: ''
745
+ sort: 186
746
+ value: null
747
+ push_to_hub_organization:
748
+ desc: ''
749
+ sort: 187
750
+ value: null
751
+ push_to_hub_token:
752
+ desc: ''
753
+ sort: 188
754
+ value: <PUSH_TO_HUB_TOKEN>
755
+ ray_scope:
756
+ desc: ''
757
+ sort: 193
758
+ value: last
759
+ remove_invalid_values:
760
+ desc: ''
761
+ sort: 57
762
+ value: false
763
+ remove_unused_columns:
764
+ desc: ''
765
+ sort: 146
766
+ value: false
767
+ repetition_penalty:
768
+ desc: ''
769
+ sort: 47
770
+ value: 1.0
771
+ report_to:
772
+ desc: ''
773
+ sort: 165
774
+ value:
775
+ - swanlab
776
+ restore_callback_states_from_checkpoint:
777
+ desc: ''
778
+ sort: 119
779
+ value: false
780
+ resume_from_checkpoint:
781
+ desc: ''
782
+ sort: 174
783
+ value: null
784
+ return_dict:
785
+ desc: ''
786
+ sort: 21
787
+ value: true
788
+ return_dict_in_generate:
789
+ desc: ''
790
+ sort: 54
791
+ value: false
792
+ rms_norm_eps:
793
+ desc: ''
794
+ sort: 12
795
+ value: 1.0e-05
796
+ rope_scaling:
797
+ desc: ''
798
+ sort: 16
799
+ value:
800
+ factor: 8.0
801
+ high_freq_factor: 4.0
802
+ low_freq_factor: 1.0
803
+ original_max_position_embeddings: 8192
804
+ rope_type: llama3
805
+ rope_theta:
806
+ desc: ''
807
+ sort: 15
808
+ value: 500000.0
809
+ run_name:
810
+ desc: ''
811
+ sort: 144
812
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/llama3_8b_normal_lora_v4/v0-20250629-115943
813
+ save_on_each_node:
814
+ desc: ''
815
+ sort: 117
816
+ value: false
817
+ save_only_model:
818
+ desc: ''
819
+ sort: 118
820
+ value: false
821
+ save_safetensors:
822
+ desc: ''
823
+ sort: 116
824
+ value: true
825
+ save_steps:
826
+ desc: ''
827
+ sort: 114
828
+ value: 500
829
+ save_strategy:
830
+ desc: ''
831
+ sort: 113
832
+ value: steps
833
+ save_total_limit:
834
+ desc: ''
835
+ sort: 115
836
+ value: 1
837
+ seed:
838
+ desc: ''
839
+ sort: 123
840
+ value: 42
841
+ sep_token_id:
842
+ desc: ''
843
+ sort: 70
844
+ value: null
845
+ skip_memory_metrics:
846
+ desc: ''
847
+ sort: 171
848
+ value: true
849
+ sortish_sampler:
850
+ desc: ''
851
+ sort: 207
852
+ value: false
853
+ suppress_tokens:
854
+ desc: ''
855
+ sort: 59
856
+ value: null
857
+ task_specific_params:
858
+ desc: ''
859
+ sort: 72
860
+ value: null
861
+ temperature:
862
+ desc: ''
863
+ sort: 43
864
+ value: 1.0
865
+ tf32:
866
+ desc: ''
867
+ sort: 133
868
+ value: null
869
+ tf_legacy_loss:
870
+ desc: ''
871
+ sort: 27
872
+ value: false
873
+ tie_encoder_decoder:
874
+ desc: ''
875
+ sort: 35
876
+ value: false
877
+ tie_word_embeddings:
878
+ desc: ''
879
+ sort: 29
880
+ value: false
881
+ tokenizer_class:
882
+ desc: ''
883
+ sort: 65
884
+ value: null
885
+ top_k:
886
+ desc: ''
887
+ sort: 44
888
+ value: 50
889
+ top_p:
890
+ desc: ''
891
+ sort: 45
892
+ value: 1.0
893
+ torch_compile:
894
+ desc: ''
895
+ sort: 195
896
+ value: false
897
+ torch_compile_backend:
898
+ desc: ''
899
+ sort: 196
900
+ value: null
901
+ torch_compile_mode:
902
+ desc: ''
903
+ sort: 197
904
+ value: null
905
+ torch_dtype:
906
+ desc: ''
907
+ sort: 25
908
+ value: bfloat16
909
+ torch_empty_cache_steps:
910
+ desc: ''
911
+ sort: 92
912
+ value: null
913
+ torchdynamo:
914
+ desc: ''
915
+ sort: 192
916
+ value: null
917
+ torchscript:
918
+ desc: ''
919
+ sort: 24
920
+ value: false
921
+ tp_size:
922
+ desc: ''
923
+ sort: 155
924
+ value: 0
925
+ tpu_metrics_debug:
926
+ desc: ''
927
+ sort: 137
928
+ value: false
929
+ tpu_num_cores:
930
+ desc: ''
931
+ sort: 136
932
+ value: null
933
+ train_dataloader_shuffle:
934
+ desc: ''
935
+ sort: 215
936
+ value: true
937
+ train_type:
938
+ desc: ''
939
+ sort: 230
940
+ value: lora
941
+ transformers_version:
942
+ desc: ''
943
+ sort: 76
944
+ value: 4.51.3
945
+ typical_p:
946
+ desc: ''
947
+ sort: 46
948
+ value: 1.0
949
+ use_bfloat16:
950
+ desc: ''
951
+ sort: 26
952
+ value: false
953
+ use_cache:
954
+ desc: ''
955
+ sort: 14
956
+ value: false
957
+ use_cpu:
958
+ desc: ''
959
+ sort: 121
960
+ value: false
961
+ use_ipex:
962
+ desc: ''
963
+ sort: 126
964
+ value: false
965
+ use_legacy_prediction_loop:
966
+ desc: ''
967
+ sort: 172
968
+ value: false
969
+ use_liger_kernel:
970
+ desc: ''
971
+ sort: 204
972
+ value: false
973
+ use_logits_to_keep:
974
+ desc: ''
975
+ sort: 220
976
+ value: null
977
+ use_mps_device:
978
+ desc: ''
979
+ sort: 122
980
+ value: false
981
+ vit_gradient_checkpointing:
982
+ desc: ''
983
+ sort: 212
984
+ value: true
985
+ vit_lr:
986
+ desc: ''
987
+ sort: 218
988
+ value: null
989
+ vocab_size:
990
+ desc: ''
991
+ sort: 3
992
+ value: 128256
993
+ warmup_ratio:
994
+ desc: ''
995
+ sort: 103
996
+ value: 0.05
997
+ warmup_steps:
998
+ desc: ''
999
+ sort: 104
1000
+ value: 0
1001
+ weight_decay:
1002
+ desc: ''
1003
+ sort: 94
1004
+ value: 0.0001
swanlog/run-20250629_120036-a3b1799d/files/requirements.txt ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.2.1
2
+ accelerate==1.6.0
3
+ addict==2.4.0
4
+ aiofiles==24.1.0
5
+ aiohappyeyeballs==2.6.1
6
+ aiohttp==3.11.14
7
+ aiosignal==1.3.2
8
+ airportsdata==20250224
9
+ aliyun-python-sdk-core==2.16.0
10
+ aliyun-python-sdk-kms==2.16.5
11
+ altair==5.5.0
12
+ annotated-types==0.7.0
13
+ antlr4-python3-runtime==4.13.2
14
+ anyio==4.9.0
15
+ astor==0.8.1
16
+ async-timeout==5.0.1
17
+ attrdict==2.0.1
18
+ attrs==25.3.0
19
+ av==14.3.0
20
+ beautifulsoup4==4.13.3
21
+ binpacking==1.5.2
22
+ bitsandbytes==0.45.5
23
+ blake3==1.0.4
24
+ blinker==1.9.0
25
+ boto3==1.38.46
26
+ botocore==1.38.46
27
+ cachetools==5.5.2
28
+ certifi==2025.1.31
29
+ cffi==1.17.1
30
+ charset-normalizer==3.4.1
31
+ click==8.1.8
32
+ cloudpickle==3.1.1
33
+ colorama==0.4.6
34
+ compressed-tensors==0.9.4
35
+ contourpy==1.3.1
36
+ cpm-kernels==1.0.11
37
+ crcmod==1.7
38
+ cryptography==44.0.3
39
+ cupy-cuda12x==13.4.1
40
+ cycler==0.12.1
41
+ dacite==1.9.2
42
+ dashscope==1.23.3
43
+ datasets==3.2.0
44
+ decord==0.6.0
45
+ deepspeed==0.16.5
46
+ Deprecated==1.2.18
47
+ depyf==0.18.0
48
+ dill==0.3.8
49
+ diskcache==5.6.3
50
+ distro==1.9.0
51
+ dnspython==2.7.0
52
+ docker-pycreds==0.4.0
53
+ einops==0.6.1
54
+ einops-exts==0.0.4
55
+ email_validator==2.2.0
56
+ entmax==1.3
57
+ et_xmlfile==2.0.0
58
+ exceptiongroup==1.2.2
59
+ fastapi==0.115.12
60
+ fastapi-cli==0.0.7
61
+ fastrlock==0.8.3
62
+ ffmpy==0.5.0
63
+ filelock==3.18.0
64
+ flash_attn==2.7.4.post1
65
+ fonttools==4.56.0
66
+ frozenlist==1.5.0
67
+ fsspec==2024.9.0
68
+ future==1.0.0
69
+ gdown==5.2.0
70
+ gguf==0.16.3
71
+ gitdb==4.0.12
72
+ GitPython==3.1.44
73
+ googleapis-common-protos==1.70.0
74
+ gradio==5.29.0
75
+ gradio_client==1.10.0
76
+ groovy==0.1.2
77
+ grpcio==1.71.0
78
+ h11==0.16.0
79
+ hf-xet==1.1.2
80
+ hjson==3.1.0
81
+ httpcore==1.0.9
82
+ httptools==0.6.4
83
+ httpx==0.28.1
84
+ huggingface-hub==0.32.2
85
+ idna==3.10
86
+ imageio==2.37.0
87
+ importlib_metadata==8.0.0
88
+ interegular==0.3.3
89
+ jieba==0.42.1
90
+ Jinja2==3.1.6
91
+ jiter==0.9.0
92
+ jmespath==0.10.0
93
+ joblib==1.4.2
94
+ jsonargparse==3.13.1
95
+ jsonschema==4.23.0
96
+ jsonschema-specifications==2024.10.1
97
+ kiwisolver==1.4.8
98
+ lark==1.2.2
99
+ latex2mathml==3.77.0
100
+ latex2sympy2_extended==1.10.1
101
+ lightning-utilities==0.14.3
102
+ linkify-it-py==2.0.3
103
+ llguidance==0.7.19
104
+ llvmlite==0.44.0
105
+ lm-format-enforcer==0.10.11
106
+ lxml==5.4.0
107
+ Markdown==3.7
108
+ markdown-it-py==2.2.0
109
+ markdown2==2.5.3
110
+ MarkupSafe==3.0.2
111
+ math-verify==0.7.0
112
+ matplotlib==3.10.1
113
+ mdit-py-plugins==0.3.3
114
+ mdurl==0.1.2
115
+ mistral_common==1.5.4
116
+ mmcls==0.25.0
117
+ mmcv==2.2.0
118
+ mmcv-full==1.6.2
119
+ mmengine==0.10.7
120
+ mmsegmentation==0.30.0
121
+ model-index==0.1.11
122
+ modelscope==1.25.0
123
+ mpmath==1.3.0
124
+ ms_swift==3.5.0
125
+ msgpack==1.1.0
126
+ msgspec==0.19.0
127
+ multidict==6.2.0
128
+ multiprocess==0.70.16
129
+ narwhals==1.32.0
130
+ nest-asyncio==1.6.0
131
+ networkx==3.4.2
132
+ ninja==1.11.1.4
133
+ nltk==3.9.1
134
+ numba==0.61.2
135
+ numpy==1.26.4
136
+ nvidia-cublas-cu12==12.6.4.1
137
+ nvidia-cuda-cupti-cu12==12.6.80
138
+ nvidia-cuda-nvrtc-cu12==12.6.77
139
+ nvidia-cuda-runtime-cu12==12.6.77
140
+ nvidia-cudnn-cu12==9.5.1.17
141
+ nvidia-cufft-cu12==11.3.0.4
142
+ nvidia-cufile-cu12==1.11.1.6
143
+ nvidia-curand-cu12==10.3.7.77
144
+ nvidia-cusolver-cu12==11.7.1.2
145
+ nvidia-cusparse-cu12==12.5.4.2
146
+ nvidia-cusparselt-cu12==0.6.3
147
+ nvidia-ml-py==12.575.51
148
+ nvidia-nccl-cu12==2.26.2
149
+ nvidia-nvjitlink-cu12==12.6.85
150
+ nvidia-nvtx-cu12==12.6.77
151
+ openai==1.77.0
152
+ opencv-python==4.11.0.86
153
+ opencv-python-headless==4.11.0.86
154
+ opendatalab==0.0.10
155
+ openmim==0.3.9
156
+ openpyxl==3.1.5
157
+ opentelemetry-api==1.26.0
158
+ opentelemetry-exporter-otlp==1.26.0
159
+ opentelemetry-exporter-otlp-proto-common==1.26.0
160
+ opentelemetry-exporter-otlp-proto-grpc==1.26.0
161
+ opentelemetry-exporter-otlp-proto-http==1.26.0
162
+ opentelemetry-proto==1.26.0
163
+ opentelemetry-sdk==1.26.0
164
+ opentelemetry-semantic-conventions==0.47b0
165
+ opentelemetry-semantic-conventions-ai==0.4.6
166
+ openxlab==0.0.11
167
+ ordered-set==4.1.0
168
+ orjson==3.10.16
169
+ oss2==2.19.1
170
+ outlines==0.1.11
171
+ outlines_core==0.1.26
172
+ packaging==24.2
173
+ pandas==2.2.3
174
+ partial-json-parser==0.2.1.1.post5
175
+ peft==0.15.2
176
+ pillow==11.1.0
177
+ pip==25.0
178
+ platformdirs==4.3.7
179
+ portalocker==3.1.1
180
+ prettytable==3.16.0
181
+ prometheus_client==0.21.1
182
+ prometheus-fastapi-instrumentator==7.1.0
183
+ propcache==0.3.1
184
+ protobuf==4.25.7
185
+ psutil==7.0.0
186
+ py-cpuinfo==9.0.0
187
+ pyarrow==19.0.1
188
+ pycocoevalcap==1.2
189
+ pycocotools==2.0.8
190
+ pycountry==24.6.1
191
+ pycparser==2.22
192
+ pycryptodome==3.22.0
193
+ pydantic==2.11.1
194
+ pydantic_core==2.33.0
195
+ pydeck==0.9.1
196
+ pydub==0.25.1
197
+ pyecharts==2.0.8
198
+ Pygments==2.19.1
199
+ pynvml==12.0.0
200
+ pyparsing==3.2.3
201
+ PySocks==1.7.1
202
+ python-dateutil==2.9.0.post0
203
+ python-dotenv==1.1.0
204
+ python-json-logger==3.3.0
205
+ python-multipart==0.0.20
206
+ pytorch-lightning==2.5.1.post0
207
+ pytz==2025.2
208
+ PyYAML==6.0.2
209
+ pyzmq==26.4.0
210
+ qwen-vl-utils==0.0.11
211
+ ray==2.45.0
212
+ referencing==0.36.2
213
+ regex==2024.11.6
214
+ requests==2.32.3
215
+ rich==13.9.4
216
+ rich-toolkit==0.14.5
217
+ rouge==1.0.1
218
+ rpds-py==0.24.0
219
+ ruff==0.11.8
220
+ s3transfer==0.13.0
221
+ sacrebleu==2.5.1
222
+ safehttpx==0.1.6
223
+ safetensors==0.5.3
224
+ scikit-learn==1.6.1
225
+ scipy==1.15.2
226
+ semantic-version==2.10.0
227
+ sentencepiece==0.2.0
228
+ sentry-sdk==2.27.0
229
+ setproctitle==1.3.6
230
+ setuptools==69.5.1
231
+ shellingham==1.5.4
232
+ shortuuid==1.0.13
233
+ simplejson==3.20.1
234
+ six==1.17.0
235
+ smmap==5.0.2
236
+ sniffio==1.3.1
237
+ sortedcontainers==2.4.0
238
+ soupsieve==2.6
239
+ starlette==0.46.1
240
+ streamlit==1.44.0
241
+ streamlit-image-select==0.6.0
242
+ svgwrite==1.4.3
243
+ swankit==0.2.4
244
+ swanlab==0.6.4
245
+ sympy==1.14.0
246
+ tabulate==0.9.0
247
+ tenacity==9.0.0
248
+ tensorboard==2.19.0
249
+ tensorboard-data-server==0.7.2
250
+ tensorboardX==2.6.2.2
251
+ termcolor==2.5.0
252
+ threadpoolctl==3.6.0
253
+ tiktoken==0.9.0
254
+ timm==0.9.12
255
+ tokenizers==0.21.1
256
+ toml==0.10.2
257
+ tomli==2.2.1
258
+ tomlkit==0.13.2
259
+ torch==2.7.0
260
+ torchaudio==2.7.0
261
+ torchmetrics==0.10.3
262
+ torchvision==0.22.0
263
+ tornado==6.4.2
264
+ tqdm==4.67.1
265
+ transformers==4.51.3
266
+ transformers-stream-generator==0.0.5
267
+ triton==3.3.0
268
+ trl==0.17.0
269
+ typer==0.15.3
270
+ typing_extensions==4.13.0
271
+ typing-inspection==0.4.0
272
+ tzdata==2025.2
273
+ uc-micro-py==1.0.3
274
+ unbabel-comet==2.2.6
275
+ urllib3==2.3.0
276
+ uvicorn==0.34.0
277
+ uvloop==0.21.0
278
+ vllm==0.9.0
279
+ wandb==0.20.1
280
+ watchdog==6.0.0
281
+ watchfiles==1.0.5
282
+ wavedrom==2.0.3.post3
283
+ wcwidth==0.2.13
284
+ websocket-client==1.8.0
285
+ websockets==15.0.1
286
+ Werkzeug==3.1.3
287
+ wheel==0.45.1
288
+ wrapt==1.17.2
289
+ xformers==0.0.30
290
+ xgrammar==0.1.19
291
+ xxhash==3.5.0
292
+ yacs==0.1.8
293
+ yapf==0.40.1
294
+ yarl==1.18.3
295
+ zipp==3.21.0
296
+ zstandard==0.23.0
swanlog/run-20250629_120036-a3b1799d/files/swanlab-metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 430023, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Meta-Llama-3.1-8B-Instruct --model_type llama3_1 --train_type lora --dataset data/train_data/normal_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 1e-6 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 1024 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/llama3_8b_normal_lora_v4 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_120036-a3b1799d"}}
swanlog/run-20250629_184555-a3b1799d/backup.swanlab ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6366fd34242266b6124cca4db6ae0957dee924eb1088bbb28e409e71cacbf87d
3
+ size 877553
swanlog/run-20250629_184555-a3b1799d/files/config.yaml ADDED
@@ -0,0 +1,986 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FRAMEWORK:
2
+ desc: ''
3
+ sort: 1
4
+ value: 🤗transformers
5
+ UPPERFRAME:
6
+ desc: ''
7
+ sort: 0
8
+ value: 🐦‍⬛ms-swift
9
+ _attn_implementation_autoset:
10
+ desc: ''
11
+ sort: 75
12
+ value: true
13
+ _name_or_path:
14
+ desc: ''
15
+ sort: 74
16
+ value: /mnt/data/users/liamding/data/models/Qwen3-8B
17
+ acc_steps:
18
+ desc: ''
19
+ sort: 224
20
+ value: 1
21
+ acc_strategy:
22
+ desc: ''
23
+ sort: 214
24
+ value: token
25
+ accelerator_config:
26
+ desc: ''
27
+ sort: 157
28
+ value:
29
+ dispatch_batches: false
30
+ even_batches: true
31
+ gradient_accumulation_kwargs: null
32
+ non_blocking: false
33
+ split_batches: false
34
+ use_seedable_sampler: true
35
+ adafactor:
36
+ desc: ''
37
+ sort: 162
38
+ value: false
39
+ adam_beta1:
40
+ desc: ''
41
+ sort: 95
42
+ value: 0.9
43
+ adam_beta2:
44
+ desc: ''
45
+ sort: 96
46
+ value: 0.95
47
+ adam_epsilon:
48
+ desc: ''
49
+ sort: 97
50
+ value: 1.0e-08
51
+ add_cross_attention:
52
+ desc: ''
53
+ sort: 34
54
+ value: false
55
+ aligner_lr:
56
+ desc: ''
57
+ sort: 217
58
+ value: null
59
+ architectures:
60
+ desc: ''
61
+ sort: 61
62
+ value:
63
+ - Qwen3ForCausalLM
64
+ attention_bias:
65
+ desc: ''
66
+ sort: 19
67
+ value: false
68
+ attention_dropout:
69
+ desc: ''
70
+ sort: 20
71
+ value: 0.0
72
+ auto_find_batch_size:
73
+ desc: ''
74
+ sort: 190
75
+ value: false
76
+ average_tokens_across_devices:
77
+ desc: ''
78
+ sort: 206
79
+ value: false
80
+ bad_words_ids:
81
+ desc: ''
82
+ sort: 51
83
+ value: null
84
+ batch_eval_metrics:
85
+ desc: ''
86
+ sort: 202
87
+ value: false
88
+ begin_suppress_tokens:
89
+ desc: ''
90
+ sort: 60
91
+ value: null
92
+ bf16:
93
+ desc: ''
94
+ sort: 127
95
+ value: true
96
+ bf16_full_eval:
97
+ desc: ''
98
+ sort: 131
99
+ value: false
100
+ bos_token_id:
101
+ desc: ''
102
+ sort: 67
103
+ value: 151643
104
+ channels:
105
+ desc: ''
106
+ sort: 221
107
+ value: null
108
+ check_model:
109
+ desc: ''
110
+ sort: 213
111
+ value: true
112
+ chunk_size_feed_forward:
113
+ desc: ''
114
+ sort: 30
115
+ value: 0
116
+ cross_attention_hidden_size:
117
+ desc: ''
118
+ sort: 33
119
+ value: null
120
+ data_seed:
121
+ desc: ''
122
+ sort: 124
123
+ value: 42
124
+ dataloader_drop_last:
125
+ desc: ''
126
+ sort: 139
127
+ value: false
128
+ dataloader_num_workers:
129
+ desc: ''
130
+ sort: 141
131
+ value: 4
132
+ dataloader_persistent_workers:
133
+ desc: ''
134
+ sort: 170
135
+ value: false
136
+ dataloader_pin_memory:
137
+ desc: ''
138
+ sort: 169
139
+ value: true
140
+ dataloader_prefetch_factor:
141
+ desc: ''
142
+ sort: 142
143
+ value: 10
144
+ ddp_backend:
145
+ desc: ''
146
+ sort: 135
147
+ value: null
148
+ ddp_broadcast_buffers:
149
+ desc: ''
150
+ sort: 168
151
+ value: null
152
+ ddp_bucket_cap_mb:
153
+ desc: ''
154
+ sort: 167
155
+ value: null
156
+ ddp_find_unused_parameters:
157
+ desc: ''
158
+ sort: 166
159
+ value: null
160
+ ddp_timeout:
161
+ desc: ''
162
+ sort: 194
163
+ value: 18000000
164
+ debug:
165
+ desc: ''
166
+ sort: 138
167
+ value: []
168
+ decoder_start_token_id:
169
+ desc: ''
170
+ sort: 71
171
+ value: null
172
+ deepspeed:
173
+ desc: ''
174
+ sort: 158
175
+ value:
176
+ bf16:
177
+ enabled: auto
178
+ fp16:
179
+ enabled: auto
180
+ hysteresis: 2
181
+ initial_scale_power: 16
182
+ loss_scale: 0
183
+ loss_scale_window: 1000
184
+ min_loss_scale: 1
185
+ gradient_accumulation_steps: auto
186
+ gradient_clipping: auto
187
+ steps_per_print: 2000
188
+ train_batch_size: auto
189
+ train_micro_batch_size_per_gpu: auto
190
+ wall_clock_breakdown: false
191
+ zero_optimization:
192
+ contiguous_gradients: true
193
+ offload_optimizer:
194
+ device: none
195
+ pin_memory: true
196
+ offload_param:
197
+ device: none
198
+ pin_memory: true
199
+ overlap_comm: false
200
+ reduce_bucket_size: auto
201
+ stage: 3
202
+ stage3_gather_16bit_weights_on_model_save: true
203
+ stage3_max_live_parameters: 1000000000.0
204
+ stage3_max_reuse_distance: 1000000000.0
205
+ stage3_param_persistence_threshold: auto
206
+ stage3_prefetch_bucket_size: auto
207
+ sub_group_size: 1000000000.0
208
+ zero_quantized_gradients: false
209
+ zero_quantized_weights: false
210
+ disable_tqdm:
211
+ desc: ''
212
+ sort: 145
213
+ value: false
214
+ diversity_penalty:
215
+ desc: ''
216
+ sort: 42
217
+ value: 0.0
218
+ do_eval:
219
+ desc: ''
220
+ sort: 81
221
+ value: true
222
+ do_predict:
223
+ desc: ''
224
+ sort: 82
225
+ value: false
226
+ do_sample:
227
+ desc: ''
228
+ sort: 38
229
+ value: false
230
+ do_train:
231
+ desc: ''
232
+ sort: 80
233
+ value: false
234
+ early_stopping:
235
+ desc: ''
236
+ sort: 39
237
+ value: false
238
+ encoder_no_repeat_ngram_size:
239
+ desc: ''
240
+ sort: 50
241
+ value: 0
242
+ eos_token_id:
243
+ desc: ''
244
+ sort: 69
245
+ value: 151645
246
+ eval_accumulation_steps:
247
+ desc: ''
248
+ sort: 90
249
+ value: null
250
+ eval_datasets:
251
+ desc: ''
252
+ sort: 226
253
+ value: []
254
+ eval_datasets_args:
255
+ desc: ''
256
+ sort: 228
257
+ value: null
258
+ eval_delay:
259
+ desc: ''
260
+ sort: 91
261
+ value: 0
262
+ eval_do_concat_batches:
263
+ desc: ''
264
+ sort: 184
265
+ value: true
266
+ eval_generation_config:
267
+ desc: ''
268
+ sort: 229
269
+ value: null
270
+ eval_limit:
271
+ desc: ''
272
+ sort: 227
273
+ value: null
274
+ eval_on_start:
275
+ desc: ''
276
+ sort: 203
277
+ value: false
278
+ eval_steps:
279
+ desc: ''
280
+ sort: 140
281
+ value: null
282
+ eval_strategy:
283
+ desc: ''
284
+ sort: 83
285
+ value: epoch
286
+ eval_use_evalscope:
287
+ desc: ''
288
+ sort: 225
289
+ value: false
290
+ eval_use_gather_object:
291
+ desc: ''
292
+ sort: 205
293
+ value: false
294
+ exponential_decay_length_penalty:
295
+ desc: ''
296
+ sort: 58
297
+ value: null
298
+ finetuning_task:
299
+ desc: ''
300
+ sort: 62
301
+ value: null
302
+ forced_bos_token_id:
303
+ desc: ''
304
+ sort: 55
305
+ value: null
306
+ forced_eos_token_id:
307
+ desc: ''
308
+ sort: 56
309
+ value: null
310
+ fp16:
311
+ desc: ''
312
+ sort: 128
313
+ value: false
314
+ fp16_backend:
315
+ desc: ''
316
+ sort: 185
317
+ value: auto
318
+ fp16_full_eval:
319
+ desc: ''
320
+ sort: 132
321
+ value: false
322
+ fp16_opt_level:
323
+ desc: ''
324
+ sort: 129
325
+ value: O1
326
+ fsdp:
327
+ desc: ''
328
+ sort: 152
329
+ value: []
330
+ fsdp_config:
331
+ desc: ''
332
+ sort: 154
333
+ value:
334
+ min_num_params: 0
335
+ xla: false
336
+ xla_fsdp_grad_ckpt: false
337
+ xla_fsdp_v2: false
338
+ fsdp_min_num_params:
339
+ desc: ''
340
+ sort: 153
341
+ value: 0
342
+ fsdp_num:
343
+ desc: ''
344
+ sort: 223
345
+ value: 1
346
+ fsdp_transformer_layer_cls_to_wrap:
347
+ desc: ''
348
+ sort: 156
349
+ value: null
350
+ full_determinism:
351
+ desc: ''
352
+ sort: 191
353
+ value: false
354
+ galore_config:
355
+ desc: ''
356
+ sort: 232
357
+ value: null
358
+ generation_config:
359
+ desc: ''
360
+ sort: 211
361
+ value: null
362
+ generation_max_length:
363
+ desc: ''
364
+ sort: 209
365
+ value: null
366
+ generation_num_beams:
367
+ desc: ''
368
+ sort: 210
369
+ value: null
370
+ gradient_accumulation_steps:
371
+ desc: ''
372
+ sort: 89
373
+ value: 2
374
+ gradient_checkpointing:
375
+ desc: ''
376
+ sort: 180
377
+ value: false
378
+ gradient_checkpointing_kwargs:
379
+ desc: ''
380
+ sort: 181
381
+ value: null
382
+ greater_is_better:
383
+ desc: ''
384
+ sort: 150
385
+ value: false
386
+ group_by_length:
387
+ desc: ''
388
+ sort: 163
389
+ value: false
390
+ half_precision_backend:
391
+ desc: ''
392
+ sort: 130
393
+ value: auto
394
+ head_dim:
395
+ desc: ''
396
+ sort: 12
397
+ value: 128
398
+ hidden_act:
399
+ desc: ''
400
+ sort: 13
401
+ value: silu
402
+ hidden_size:
403
+ desc: ''
404
+ sort: 4
405
+ value: 4096
406
+ hub_always_push:
407
+ desc: ''
408
+ sort: 179
409
+ value: false
410
+ hub_model_id:
411
+ desc: ''
412
+ sort: 175
413
+ value: null
414
+ hub_private_repo:
415
+ desc: ''
416
+ sort: 178
417
+ value: null
418
+ hub_strategy:
419
+ desc: ''
420
+ sort: 176
421
+ value: every_save
422
+ hub_token:
423
+ desc: ''
424
+ sort: 177
425
+ value: <HUB_TOKEN>
426
+ id2label:
427
+ desc: ''
428
+ sort: 63
429
+ value:
430
+ '0': LABEL_0
431
+ '1': LABEL_1
432
+ ignore_data_skip:
433
+ desc: ''
434
+ sort: 151
435
+ value: false
436
+ include_for_metrics:
437
+ desc: ''
438
+ sort: 183
439
+ value: []
440
+ include_inputs_for_metrics:
441
+ desc: ''
442
+ sort: 182
443
+ value: false
444
+ include_num_input_tokens_seen:
445
+ desc: ''
446
+ sort: 199
447
+ value: false
448
+ include_tokens_per_second:
449
+ desc: ''
450
+ sort: 198
451
+ value: false
452
+ initializer_range:
453
+ desc: ''
454
+ sort: 14
455
+ value: 0.02
456
+ intermediate_size:
457
+ desc: ''
458
+ sort: 5
459
+ value: 12288
460
+ is_decoder:
461
+ desc: ''
462
+ sort: 32
463
+ value: false
464
+ is_encoder_decoder:
465
+ desc: ''
466
+ sort: 31
467
+ value: false
468
+ jit_mode_eval:
469
+ desc: ''
470
+ sort: 125
471
+ value: false
472
+ label2id:
473
+ desc: ''
474
+ sort: 64
475
+ value:
476
+ LABEL_0: 0
477
+ LABEL_1: 1
478
+ label_names:
479
+ desc: ''
480
+ sort: 147
481
+ value: null
482
+ label_smoothing_factor:
483
+ desc: ''
484
+ sort: 159
485
+ value: 0.0
486
+ learning_rate:
487
+ desc: ''
488
+ sort: 93
489
+ value: 1.0e-06
490
+ length_column_name:
491
+ desc: ''
492
+ sort: 164
493
+ value: length
494
+ length_penalty:
495
+ desc: ''
496
+ sort: 48
497
+ value: 1.0
498
+ load_best_model_at_end:
499
+ desc: ''
500
+ sort: 148
501
+ value: false
502
+ local_rank:
503
+ desc: ''
504
+ sort: 134
505
+ value: 0
506
+ local_repo_path:
507
+ desc: ''
508
+ sort: 231
509
+ value: null
510
+ log_level:
511
+ desc: ''
512
+ sort: 105
513
+ value: passive
514
+ log_level_replica:
515
+ desc: ''
516
+ sort: 106
517
+ value: warning
518
+ log_on_each_node:
519
+ desc: ''
520
+ sort: 107
521
+ value: true
522
+ logging_dir:
523
+ desc: ''
524
+ sort: 108
525
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/qwen3_8b_r1_v2/v0-20250629-184450/runs
526
+ logging_first_step:
527
+ desc: ''
528
+ sort: 110
529
+ value: true
530
+ logging_nan_inf_filter:
531
+ desc: ''
532
+ sort: 112
533
+ value: true
534
+ logging_steps:
535
+ desc: ''
536
+ sort: 111
537
+ value: 1
538
+ logging_strategy:
539
+ desc: ''
540
+ sort: 109
541
+ value: steps
542
+ lr_scheduler_kwargs:
543
+ desc: ''
544
+ sort: 102
545
+ value: null
546
+ lr_scheduler_type:
547
+ desc: ''
548
+ sort: 101
549
+ value: cosine
550
+ max_epochs:
551
+ desc: ''
552
+ sort: 216
553
+ value: null
554
+ max_grad_norm:
555
+ desc: ''
556
+ sort: 98
557
+ value: 1.0
558
+ max_length:
559
+ desc: ''
560
+ sort: 36
561
+ value: 20
562
+ max_position_embeddings:
563
+ desc: ''
564
+ sort: 3
565
+ value: 40960
566
+ max_steps:
567
+ desc: ''
568
+ sort: 100
569
+ value: -1
570
+ max_window_layers:
571
+ desc: ''
572
+ sort: 10
573
+ value: 36
574
+ metric_for_best_model:
575
+ desc: ''
576
+ sort: 149
577
+ value: loss
578
+ metric_warmup_step:
579
+ desc: ''
580
+ sort: 222
581
+ value: 0
582
+ min_length:
583
+ desc: ''
584
+ sort: 37
585
+ value: 0
586
+ model_num_parameters:
587
+ desc: ''
588
+ sort: 233
589
+ value: 0
590
+ model_type:
591
+ desc: ''
592
+ sort: 77
593
+ value: qwen3
594
+ mp_parameters:
595
+ desc: ''
596
+ sort: 189
597
+ value: ''
598
+ neftune_noise_alpha:
599
+ desc: ''
600
+ sort: 200
601
+ value: null
602
+ no_cuda:
603
+ desc: ''
604
+ sort: 120
605
+ value: false
606
+ no_repeat_ngram_size:
607
+ desc: ''
608
+ sort: 49
609
+ value: 0
610
+ num_attention_heads:
611
+ desc: ''
612
+ sort: 7
613
+ value: 32
614
+ num_beam_groups:
615
+ desc: ''
616
+ sort: 41
617
+ value: 1
618
+ num_beams:
619
+ desc: ''
620
+ sort: 40
621
+ value: 1
622
+ num_hidden_layers:
623
+ desc: ''
624
+ sort: 6
625
+ value: 36
626
+ num_key_value_heads:
627
+ desc: ''
628
+ sort: 11
629
+ value: 8
630
+ num_return_sequences:
631
+ desc: ''
632
+ sort: 52
633
+ value: 1
634
+ num_train_epochs:
635
+ desc: ''
636
+ sort: 99
637
+ value: 5.0
638
+ optim:
639
+ desc: ''
640
+ sort: 160
641
+ value: adamw_torch
642
+ optim_args:
643
+ desc: ''
644
+ sort: 161
645
+ value: null
646
+ optim_target_modules:
647
+ desc: ''
648
+ sort: 201
649
+ value: null
650
+ optimizer:
651
+ desc: ''
652
+ sort: 219
653
+ value: null
654
+ output_attentions:
655
+ desc: ''
656
+ sort: 23
657
+ value: false
658
+ output_dir:
659
+ desc: ''
660
+ sort: 78
661
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/qwen3_8b_r1_v2/v0-20250629-184450
662
+ output_hidden_states:
663
+ desc: ''
664
+ sort: 22
665
+ value: false
666
+ output_scores:
667
+ desc: ''
668
+ sort: 53
669
+ value: false
670
+ overwrite_output_dir:
671
+ desc: ''
672
+ sort: 79
673
+ value: false
674
+ pad_token_id:
675
+ desc: ''
676
+ sort: 68
677
+ value: 151643
678
+ past_index:
679
+ desc: ''
680
+ sort: 143
681
+ value: -1
682
+ per_device_eval_batch_size:
683
+ desc: ''
684
+ sort: 86
685
+ value: 2
686
+ per_device_train_batch_size:
687
+ desc: ''
688
+ sort: 85
689
+ value: 2
690
+ per_gpu_eval_batch_size:
691
+ desc: ''
692
+ sort: 88
693
+ value: null
694
+ per_gpu_train_batch_size:
695
+ desc: ''
696
+ sort: 87
697
+ value: null
698
+ predict_with_generate:
699
+ desc: ''
700
+ sort: 208
701
+ value: false
702
+ prediction_loss_only:
703
+ desc: ''
704
+ sort: 84
705
+ value: false
706
+ prefix:
707
+ desc: ''
708
+ sort: 66
709
+ value: null
710
+ problem_type:
711
+ desc: ''
712
+ sort: 73
713
+ value: null
714
+ pruned_heads:
715
+ desc: ''
716
+ sort: 28
717
+ value: {}
718
+ push_to_hub:
719
+ desc: ''
720
+ sort: 173
721
+ value: false
722
+ push_to_hub_model_id:
723
+ desc: ''
724
+ sort: 186
725
+ value: null
726
+ push_to_hub_organization:
727
+ desc: ''
728
+ sort: 187
729
+ value: null
730
+ push_to_hub_token:
731
+ desc: ''
732
+ sort: 188
733
+ value: <PUSH_TO_HUB_TOKEN>
734
+ ray_scope:
735
+ desc: ''
736
+ sort: 193
737
+ value: last
738
+ remove_invalid_values:
739
+ desc: ''
740
+ sort: 57
741
+ value: false
742
+ remove_unused_columns:
743
+ desc: ''
744
+ sort: 146
745
+ value: false
746
+ repetition_penalty:
747
+ desc: ''
748
+ sort: 47
749
+ value: 1.0
750
+ report_to:
751
+ desc: ''
752
+ sort: 165
753
+ value:
754
+ - swanlab
755
+ restore_callback_states_from_checkpoint:
756
+ desc: ''
757
+ sort: 119
758
+ value: false
759
+ resume_from_checkpoint:
760
+ desc: ''
761
+ sort: 174
762
+ value: null
763
+ return_dict:
764
+ desc: ''
765
+ sort: 21
766
+ value: true
767
+ return_dict_in_generate:
768
+ desc: ''
769
+ sort: 54
770
+ value: false
771
+ rms_norm_eps:
772
+ desc: ''
773
+ sort: 15
774
+ value: 1.0e-06
775
+ rope_scaling:
776
+ desc: ''
777
+ sort: 18
778
+ value: null
779
+ rope_theta:
780
+ desc: ''
781
+ sort: 17
782
+ value: 1000000
783
+ run_name:
784
+ desc: ''
785
+ sort: 144
786
+ value: /mnt/data/users/liamding/data/sft_zh_tox/output/qwen3_8b_r1_v2/v0-20250629-184450
787
+ save_on_each_node:
788
+ desc: ''
789
+ sort: 117
790
+ value: false
791
+ save_only_model:
792
+ desc: ''
793
+ sort: 118
794
+ value: false
795
+ save_safetensors:
796
+ desc: ''
797
+ sort: 116
798
+ value: true
799
+ save_steps:
800
+ desc: ''
801
+ sort: 114
802
+ value: 500
803
+ save_strategy:
804
+ desc: ''
805
+ sort: 113
806
+ value: steps
807
+ save_total_limit:
808
+ desc: ''
809
+ sort: 115
810
+ value: 1
811
+ seed:
812
+ desc: ''
813
+ sort: 123
814
+ value: 42
815
+ sep_token_id:
816
+ desc: ''
817
+ sort: 70
818
+ value: null
819
+ skip_memory_metrics:
820
+ desc: ''
821
+ sort: 171
822
+ value: true
823
+ sliding_window:
824
+ desc: ''
825
+ sort: 9
826
+ value: null
827
+ sortish_sampler:
828
+ desc: ''
829
+ sort: 207
830
+ value: false
831
+ suppress_tokens:
832
+ desc: ''
833
+ sort: 59
834
+ value: null
835
+ task_specific_params:
836
+ desc: ''
837
+ sort: 72
838
+ value: null
839
+ temperature:
840
+ desc: ''
841
+ sort: 43
842
+ value: 1.0
843
+ tf32:
844
+ desc: ''
845
+ sort: 133
846
+ value: null
847
+ tf_legacy_loss:
848
+ desc: ''
849
+ sort: 27
850
+ value: false
851
+ tie_encoder_decoder:
852
+ desc: ''
853
+ sort: 35
854
+ value: false
855
+ tie_word_embeddings:
856
+ desc: ''
857
+ sort: 29
858
+ value: false
859
+ tokenizer_class:
860
+ desc: ''
861
+ sort: 65
862
+ value: null
863
+ top_k:
864
+ desc: ''
865
+ sort: 44
866
+ value: 50
867
+ top_p:
868
+ desc: ''
869
+ sort: 45
870
+ value: 1.0
871
+ torch_compile:
872
+ desc: ''
873
+ sort: 195
874
+ value: false
875
+ torch_compile_backend:
876
+ desc: ''
877
+ sort: 196
878
+ value: null
879
+ torch_compile_mode:
880
+ desc: ''
881
+ sort: 197
882
+ value: null
883
+ torch_dtype:
884
+ desc: ''
885
+ sort: 25
886
+ value: bfloat16
887
+ torch_empty_cache_steps:
888
+ desc: ''
889
+ sort: 92
890
+ value: null
891
+ torchdynamo:
892
+ desc: ''
893
+ sort: 192
894
+ value: null
895
+ torchscript:
896
+ desc: ''
897
+ sort: 24
898
+ value: false
899
+ tp_size:
900
+ desc: ''
901
+ sort: 155
902
+ value: 0
903
+ tpu_metrics_debug:
904
+ desc: ''
905
+ sort: 137
906
+ value: false
907
+ tpu_num_cores:
908
+ desc: ''
909
+ sort: 136
910
+ value: null
911
+ train_dataloader_shuffle:
912
+ desc: ''
913
+ sort: 215
914
+ value: true
915
+ train_type:
916
+ desc: ''
917
+ sort: 230
918
+ value: full
919
+ transformers_version:
920
+ desc: ''
921
+ sort: 76
922
+ value: 4.51.3
923
+ typical_p:
924
+ desc: ''
925
+ sort: 46
926
+ value: 1.0
927
+ use_bfloat16:
928
+ desc: ''
929
+ sort: 26
930
+ value: false
931
+ use_cache:
932
+ desc: ''
933
+ sort: 16
934
+ value: false
935
+ use_cpu:
936
+ desc: ''
937
+ sort: 121
938
+ value: false
939
+ use_ipex:
940
+ desc: ''
941
+ sort: 126
942
+ value: false
943
+ use_legacy_prediction_loop:
944
+ desc: ''
945
+ sort: 172
946
+ value: false
947
+ use_liger_kernel:
948
+ desc: ''
949
+ sort: 204
950
+ value: false
951
+ use_logits_to_keep:
952
+ desc: ''
953
+ sort: 220
954
+ value: null
955
+ use_mps_device:
956
+ desc: ''
957
+ sort: 122
958
+ value: false
959
+ use_sliding_window:
960
+ desc: ''
961
+ sort: 8
962
+ value: false
963
+ vit_gradient_checkpointing:
964
+ desc: ''
965
+ sort: 212
966
+ value: true
967
+ vit_lr:
968
+ desc: ''
969
+ sort: 218
970
+ value: null
971
+ vocab_size:
972
+ desc: ''
973
+ sort: 2
974
+ value: 151936
975
+ warmup_ratio:
976
+ desc: ''
977
+ sort: 103
978
+ value: 0.05
979
+ warmup_steps:
980
+ desc: ''
981
+ sort: 104
982
+ value: 0
983
+ weight_decay:
984
+ desc: ''
985
+ sort: 94
986
+ value: 0.0001
swanlog/run-20250629_184555-a3b1799d/files/requirements.txt ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.2.1
2
+ accelerate==1.6.0
3
+ addict==2.4.0
4
+ aiofiles==24.1.0
5
+ aiohappyeyeballs==2.6.1
6
+ aiohttp==3.11.14
7
+ aiosignal==1.3.2
8
+ airportsdata==20250224
9
+ aliyun-python-sdk-core==2.16.0
10
+ aliyun-python-sdk-kms==2.16.5
11
+ altair==5.5.0
12
+ annotated-types==0.7.0
13
+ antlr4-python3-runtime==4.13.2
14
+ anyio==4.9.0
15
+ astor==0.8.1
16
+ async-timeout==5.0.1
17
+ attrdict==2.0.1
18
+ attrs==25.3.0
19
+ av==14.3.0
20
+ beautifulsoup4==4.13.3
21
+ binpacking==1.5.2
22
+ bitsandbytes==0.45.5
23
+ blake3==1.0.4
24
+ blinker==1.9.0
25
+ boto3==1.38.46
26
+ botocore==1.38.46
27
+ cachetools==5.5.2
28
+ certifi==2025.1.31
29
+ cffi==1.17.1
30
+ charset-normalizer==3.4.1
31
+ click==8.1.8
32
+ cloudpickle==3.1.1
33
+ colorama==0.4.6
34
+ compressed-tensors==0.9.4
35
+ contourpy==1.3.1
36
+ cpm-kernels==1.0.11
37
+ crcmod==1.7
38
+ cryptography==44.0.3
39
+ cupy-cuda12x==13.4.1
40
+ cycler==0.12.1
41
+ dacite==1.9.2
42
+ dashscope==1.23.3
43
+ datasets==3.2.0
44
+ decord==0.6.0
45
+ deepspeed==0.16.5
46
+ Deprecated==1.2.18
47
+ depyf==0.18.0
48
+ dill==0.3.8
49
+ diskcache==5.6.3
50
+ distro==1.9.0
51
+ dnspython==2.7.0
52
+ docker-pycreds==0.4.0
53
+ einops==0.6.1
54
+ einops-exts==0.0.4
55
+ email_validator==2.2.0
56
+ entmax==1.3
57
+ et_xmlfile==2.0.0
58
+ exceptiongroup==1.2.2
59
+ fastapi==0.115.12
60
+ fastapi-cli==0.0.7
61
+ fastrlock==0.8.3
62
+ ffmpy==0.5.0
63
+ filelock==3.18.0
64
+ flash_attn==2.7.4.post1
65
+ fonttools==4.56.0
66
+ frozenlist==1.5.0
67
+ fsspec==2024.9.0
68
+ future==1.0.0
69
+ gdown==5.2.0
70
+ gguf==0.16.3
71
+ gitdb==4.0.12
72
+ GitPython==3.1.44
73
+ googleapis-common-protos==1.70.0
74
+ gradio==5.29.0
75
+ gradio_client==1.10.0
76
+ groovy==0.1.2
77
+ grpcio==1.71.0
78
+ h11==0.16.0
79
+ hf-xet==1.1.2
80
+ hjson==3.1.0
81
+ httpcore==1.0.9
82
+ httptools==0.6.4
83
+ httpx==0.28.1
84
+ huggingface-hub==0.32.2
85
+ idna==3.10
86
+ imageio==2.37.0
87
+ importlib_metadata==8.0.0
88
+ interegular==0.3.3
89
+ jieba==0.42.1
90
+ Jinja2==3.1.6
91
+ jiter==0.9.0
92
+ jmespath==0.10.0
93
+ joblib==1.4.2
94
+ jsonargparse==3.13.1
95
+ jsonschema==4.23.0
96
+ jsonschema-specifications==2024.10.1
97
+ kiwisolver==1.4.8
98
+ lark==1.2.2
99
+ latex2mathml==3.77.0
100
+ latex2sympy2_extended==1.10.1
101
+ lightning-utilities==0.14.3
102
+ linkify-it-py==2.0.3
103
+ llguidance==0.7.19
104
+ llvmlite==0.44.0
105
+ lm-format-enforcer==0.10.11
106
+ lxml==5.4.0
107
+ Markdown==3.7
108
+ markdown-it-py==2.2.0
109
+ markdown2==2.5.3
110
+ MarkupSafe==3.0.2
111
+ math-verify==0.7.0
112
+ matplotlib==3.10.1
113
+ mdit-py-plugins==0.3.3
114
+ mdurl==0.1.2
115
+ mistral_common==1.5.4
116
+ mmcls==0.25.0
117
+ mmcv==2.2.0
118
+ mmcv-full==1.6.2
119
+ mmengine==0.10.7
120
+ mmsegmentation==0.30.0
121
+ model-index==0.1.11
122
+ modelscope==1.25.0
123
+ mpmath==1.3.0
124
+ ms_swift==3.5.0
125
+ msgpack==1.1.0
126
+ msgspec==0.19.0
127
+ multidict==6.2.0
128
+ multiprocess==0.70.16
129
+ narwhals==1.32.0
130
+ nest-asyncio==1.6.0
131
+ networkx==3.4.2
132
+ ninja==1.11.1.4
133
+ nltk==3.9.1
134
+ numba==0.61.2
135
+ numpy==1.26.4
136
+ nvidia-cublas-cu12==12.6.4.1
137
+ nvidia-cuda-cupti-cu12==12.6.80
138
+ nvidia-cuda-nvrtc-cu12==12.6.77
139
+ nvidia-cuda-runtime-cu12==12.6.77
140
+ nvidia-cudnn-cu12==9.5.1.17
141
+ nvidia-cufft-cu12==11.3.0.4
142
+ nvidia-cufile-cu12==1.11.1.6
143
+ nvidia-curand-cu12==10.3.7.77
144
+ nvidia-cusolver-cu12==11.7.1.2
145
+ nvidia-cusparse-cu12==12.5.4.2
146
+ nvidia-cusparselt-cu12==0.6.3
147
+ nvidia-ml-py==12.575.51
148
+ nvidia-nccl-cu12==2.26.2
149
+ nvidia-nvjitlink-cu12==12.6.85
150
+ nvidia-nvtx-cu12==12.6.77
151
+ openai==1.77.0
152
+ opencv-python==4.11.0.86
153
+ opencv-python-headless==4.11.0.86
154
+ opendatalab==0.0.10
155
+ openmim==0.3.9
156
+ openpyxl==3.1.5
157
+ opentelemetry-api==1.26.0
158
+ opentelemetry-exporter-otlp==1.26.0
159
+ opentelemetry-exporter-otlp-proto-common==1.26.0
160
+ opentelemetry-exporter-otlp-proto-grpc==1.26.0
161
+ opentelemetry-exporter-otlp-proto-http==1.26.0
162
+ opentelemetry-proto==1.26.0
163
+ opentelemetry-sdk==1.26.0
164
+ opentelemetry-semantic-conventions==0.47b0
165
+ opentelemetry-semantic-conventions-ai==0.4.6
166
+ openxlab==0.0.11
167
+ ordered-set==4.1.0
168
+ orjson==3.10.16
169
+ oss2==2.19.1
170
+ outlines==0.1.11
171
+ outlines_core==0.1.26
172
+ packaging==24.2
173
+ pandas==2.2.3
174
+ partial-json-parser==0.2.1.1.post5
175
+ peft==0.15.2
176
+ pillow==11.1.0
177
+ pip==25.0
178
+ platformdirs==4.3.7
179
+ portalocker==3.1.1
180
+ prettytable==3.16.0
181
+ prometheus_client==0.21.1
182
+ prometheus-fastapi-instrumentator==7.1.0
183
+ propcache==0.3.1
184
+ protobuf==4.25.7
185
+ psutil==7.0.0
186
+ py-cpuinfo==9.0.0
187
+ pyarrow==19.0.1
188
+ pycocoevalcap==1.2
189
+ pycocotools==2.0.8
190
+ pycountry==24.6.1
191
+ pycparser==2.22
192
+ pycryptodome==3.22.0
193
+ pydantic==2.11.1
194
+ pydantic_core==2.33.0
195
+ pydeck==0.9.1
196
+ pydub==0.25.1
197
+ pyecharts==2.0.8
198
+ Pygments==2.19.1
199
+ pynvml==12.0.0
200
+ pyparsing==3.2.3
201
+ PySocks==1.7.1
202
+ python-dateutil==2.9.0.post0
203
+ python-dotenv==1.1.0
204
+ python-json-logger==3.3.0
205
+ python-multipart==0.0.20
206
+ pytorch-lightning==2.5.1.post0
207
+ pytz==2025.2
208
+ PyYAML==6.0.2
209
+ pyzmq==26.4.0
210
+ qwen-vl-utils==0.0.11
211
+ ray==2.45.0
212
+ referencing==0.36.2
213
+ regex==2024.11.6
214
+ requests==2.32.3
215
+ rich==13.9.4
216
+ rich-toolkit==0.14.5
217
+ rouge==1.0.1
218
+ rpds-py==0.24.0
219
+ ruff==0.11.8
220
+ s3transfer==0.13.0
221
+ sacrebleu==2.5.1
222
+ safehttpx==0.1.6
223
+ safetensors==0.5.3
224
+ scikit-learn==1.6.1
225
+ scipy==1.15.2
226
+ semantic-version==2.10.0
227
+ sentencepiece==0.2.0
228
+ sentry-sdk==2.27.0
229
+ setproctitle==1.3.6
230
+ setuptools==69.5.1
231
+ shellingham==1.5.4
232
+ shortuuid==1.0.13
233
+ simplejson==3.20.1
234
+ six==1.17.0
235
+ smmap==5.0.2
236
+ sniffio==1.3.1
237
+ sortedcontainers==2.4.0
238
+ soupsieve==2.6
239
+ starlette==0.46.1
240
+ streamlit==1.44.0
241
+ streamlit-image-select==0.6.0
242
+ svgwrite==1.4.3
243
+ swankit==0.2.4
244
+ swanlab==0.6.4
245
+ sympy==1.14.0
246
+ tabulate==0.9.0
247
+ tenacity==9.0.0
248
+ tensorboard==2.19.0
249
+ tensorboard-data-server==0.7.2
250
+ tensorboardX==2.6.2.2
251
+ termcolor==2.5.0
252
+ threadpoolctl==3.6.0
253
+ tiktoken==0.9.0
254
+ timm==0.9.12
255
+ tokenizers==0.21.1
256
+ toml==0.10.2
257
+ tomli==2.2.1
258
+ tomlkit==0.13.2
259
+ torch==2.7.0
260
+ torchaudio==2.7.0
261
+ torchmetrics==0.10.3
262
+ torchvision==0.22.0
263
+ tornado==6.4.2
264
+ tqdm==4.67.1
265
+ transformers==4.51.3
266
+ transformers-stream-generator==0.0.5
267
+ triton==3.3.0
268
+ trl==0.17.0
269
+ typer==0.15.3
270
+ typing_extensions==4.13.0
271
+ typing-inspection==0.4.0
272
+ tzdata==2025.2
273
+ uc-micro-py==1.0.3
274
+ unbabel-comet==2.2.6
275
+ urllib3==2.3.0
276
+ uvicorn==0.34.0
277
+ uvloop==0.21.0
278
+ vllm==0.9.0
279
+ wandb==0.20.1
280
+ watchdog==6.0.0
281
+ watchfiles==1.0.5
282
+ wavedrom==2.0.3.post3
283
+ wcwidth==0.2.13
284
+ websocket-client==1.8.0
285
+ websockets==15.0.1
286
+ Werkzeug==3.1.3
287
+ wheel==0.45.1
288
+ wrapt==1.17.2
289
+ xformers==0.0.30
290
+ xgrammar==0.1.19
291
+ xxhash==3.5.0
292
+ yacs==0.1.8
293
+ yapf==0.40.1
294
+ yarl==1.18.3
295
+ zipp==3.21.0
296
+ zstandard==0.23.0
swanlog/run-20250629_184555-a3b1799d/files/swanlab-metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"memory": "1007", "cpu": {"brand": "Intel(R) Xeon(R) Platinum 8358P CPU @ 2.60GHz", "cores": 128}, "gpu": {"nvidia": {"driver": "535.86.10", "cores": 4, "type": ["NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB", "NVIDIA A100-SXM4-80GB"], "memory": ["80", "80", "80", "80"], "cuda": "12.3", "architecture": ["Ampere", "Ampere", "Ampere", "Ampere"], "cudacores": [6912, 6912, 6912, 6912]}}, "os": "Linux-4.18.0-425.3.1.el8.x86_64-x86_64-with-glibc2.35", "os_pretty_name": "Ubuntu 22.04.3 LTS", "hostname": "dc11626b-aeaf-4144-9e57-6e87d523e853", "pid": 1366249, "cwd": "/mnt/data/users/liamding/data/sft_zh_tox", "python": "3.10.16", "python_verbose": "3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0]", "executable": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/bin/python", "command": "/mnt/code/users/liamding/tools/conda_install/anaconda3/envs/intern2vl/lib/python3.10/site-packages/swift/cli/sft.py --model /mnt/data/users/liamding/data/models/Qwen3-8B --train_type full --dataset /mnt/data/users/liamding/data/sft_zh_tox/data/train_data/qwen3_r1_train.json --num_train_epochs 5 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --learning_rate 1e-6 --lr_scheduler_type cosine --eval_strategy epoch --gradient_accumulation_steps 2 --save_total_limit 1 --warmup_ratio 0.05 --logging_steps 1 --max_length 32768 --weight_decay 1e-4 --deepspeed zero3 --dataloader_num_workers 4 --output_dir output/qwen3_8b_r1_v2 --report_to swanlab --swanlab_token ****", "git_remote": null, "git_info": [null, null], "swanlab": {"version": "0.6.4", "_monitor": 5, "logdir": "/mnt/data/users/liamding/data/sft_zh_tox/swanlog/run-20250629_184555-a3b1799d"}}