Linksome commited on
Commit
b629b51
·
verified ·
1 Parent(s): 8dc2b7d

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +60 -0
  2. LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/config.yaml +723 -0
  3. LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/output.log +423 -0
  4. LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-metadata.json +41 -0
  5. LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-summary.json +1 -0
  6. LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/output.log +299 -0
  7. LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/requirements.txt +257 -0
  8. LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/wandb-metadata.json +41 -0
  9. LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug-internal.log +6 -0
  10. LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug.log +23 -0
  11. LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/config.yaml +723 -0
  12. LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/output.log +191 -0
  13. LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/requirements.txt +257 -0
  14. LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-metadata.json +41 -0
  15. LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-summary.json +1 -0
  16. LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug-internal.log +11 -0
  17. LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug.log +25 -0
  18. LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/config.yaml +723 -0
  19. LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/requirements.txt +257 -0
  20. LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-metadata.json +41 -0
  21. LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-summary.json +1 -0
  22. LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug-internal.log +14 -0
  23. LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug.log +25 -0
  24. LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/config.yaml +723 -0
  25. LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/requirements.txt +257 -0
  26. LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-metadata.json +41 -0
  27. LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-summary.json +1 -0
  28. LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug-internal.log +14 -0
  29. LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug.log +25 -0
  30. LlamaFactory/wandb/run-20260204_090320-aseg728n/files/config.yaml +723 -0
  31. LlamaFactory/wandb/run-20260204_090320-aseg728n/files/requirements.txt +257 -0
  32. LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-metadata.json +41 -0
  33. LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-summary.json +1 -0
  34. LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug-internal.log +13 -0
  35. LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug.log +25 -0
  36. LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/config.yaml +723 -0
  37. LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/requirements.txt +257 -0
  38. LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-metadata.json +41 -0
  39. LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-summary.json +1 -0
  40. LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug-internal.log +12 -0
  41. LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug.log +25 -0
  42. LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/output.log +0 -0
  43. LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/requirements.txt +257 -0
  44. LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/wandb-metadata.json +41 -0
  45. LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug-internal.log +6 -0
  46. LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug.log +23 -0
  47. LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/config.yaml +723 -0
  48. LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/requirements.txt +257 -0
  49. LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-metadata.json +41 -0
  50. LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-summary.json +1 -0
.gitattributes CHANGED
@@ -210,3 +210,63 @@ v127rc_exp2/B_mul/checkpoint-9500/tokenizer.json filter=lfs diff=lfs merge=lfs -
210
  v127rc_exp2/B_mul/checkpoint-9400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
211
  v127rc_exp2/B_mul/checkpoint-9300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
212
  v127rc_exp2/B_mul/checkpoint-9200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  v127rc_exp2/B_mul/checkpoint-9400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
211
  v127rc_exp2/B_mul/checkpoint-9300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
212
  v127rc_exp2/B_mul/checkpoint-9200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
213
+ v127rc_exp2/B_mul/checkpoint-9100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
214
+ v127rc_exp2/B_mul/checkpoint-9000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
215
+ v127rc_exp2/B_mul/checkpoint-8900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
216
+ v127rc_exp2/B_mul/checkpoint-8800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
217
+ v127rc_exp2/B_mul/checkpoint-8700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
218
+ v127rc_exp2/B_mul/checkpoint-8600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
219
+ v127rc_exp2/B_mul/checkpoint-8500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
220
+ v127rc_exp2/B_mul/checkpoint-8400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
221
+ v127rc_exp2/B_mul/checkpoint-8300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
222
+ v127rc_exp2/B_mul/checkpoint-8200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
223
+ v127rc_exp2/B_mul/checkpoint-8100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
224
+ v127rc_exp2/B_mul/checkpoint-8000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
225
+ v127rc_exp2/B_mul/checkpoint-7900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
226
+ v127rc_exp2/B_mul/checkpoint-7800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
227
+ v127rc_exp2/B_mul/checkpoint-7700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
228
+ v127rc_exp2/B_mul/checkpoint-7600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
229
+ v127rc_exp2/B_mul/checkpoint-7500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
230
+ v127rc_exp2/B_mul/checkpoint-7400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
231
+ v127rc_exp2/B_mul/checkpoint-7300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
232
+ v127rc_exp2/B_mul/checkpoint-7200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
233
+ v127rc_exp2/B_mul/checkpoint-7100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
234
+ v127rc_exp2/B_mul/checkpoint-7000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
235
+ v127rc_exp2/B_mul/checkpoint-6900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
236
+ v127rc_exp2/B_mul/checkpoint-6800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
237
+ v127rc_exp2/B_mul/checkpoint-6700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
238
+ v127rc_exp2/B_mul/checkpoint-6600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
239
+ v127rc_exp2/B_mul/checkpoint-6500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
240
+ v127rc_exp2/B_mul/checkpoint-6400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
241
+ v127rc_exp2/B_mul/checkpoint-6300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
242
+ v127rc_exp2/B_mul/checkpoint-6200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
243
+ v127rc_exp2/B_mul/checkpoint-6100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
244
+ v127rc_exp2/B_mul/checkpoint-6000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
245
+ v127rc_exp2/B_mul/checkpoint-5900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
246
+ v127rc_exp2/B_mul/checkpoint-5800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
247
+ v127rc_exp2/B_mul/checkpoint-5700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
248
+ v127rc_exp2/B_mul/checkpoint-5600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
249
+ v127rc_exp2/B_mul/checkpoint-5500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
250
+ v127rc_exp2/B_mul/checkpoint-5400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
251
+ v127rc_exp2/B_mul/checkpoint-5300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
252
+ v127rc_exp2/B_mul/checkpoint-5200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
253
+ v127rc_exp2/B_mul/checkpoint-5100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
254
+ v127rc_exp2/B_mul/checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
255
+ v127rc_exp2/B_mul/checkpoint-4900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
256
+ v127rc_exp2/B_mul/checkpoint-4800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
257
+ v127rc_exp2/B_mul/checkpoint-4700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
258
+ v127rc_exp2/B_mul/checkpoint-4600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
259
+ v127rc_exp2/B_mul/checkpoint-4500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
260
+ v127rc_exp2/B_mul/checkpoint-4400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
261
+ v127rc_exp2/B_mul/checkpoint-4300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
262
+ v127rc_exp2/B_mul/checkpoint-4200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
263
+ v127rc_exp2/B_mul/checkpoint-4100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
264
+ v127rc_exp2/B_mul/checkpoint-4000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
265
+ v127rc_exp2/B_mul/checkpoint-3900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
266
+ v127rc_exp2/B_mul/checkpoint-3800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
267
+ v127rc_exp2/B_mul/checkpoint-3700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
268
+ v127rc_exp2/B_mul/checkpoint-3600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
269
+ v127rc_exp2/B_mul/checkpoint-3500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
270
+ v127rc_exp2/B_mul/checkpoint-3400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
271
+ v127rc_exp2/B_mul/checkpoint-3300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
272
+ v127rc_exp2/B_mul/checkpoint-3200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/config.yaml ADDED
@@ -0,0 +1,723 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: /workspace/Qwen/Qwen3-8B-Base
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.24.1
6
+ e:
7
+ mfjy22anxcucsb3vwlaimrwvqrgvipis:
8
+ args:
9
+ - /workspace/v127rc_exp1/C.yaml
10
+ cpu_count: 16
11
+ cpu_count_logical: 32
12
+ cudaVersion: "13.0"
13
+ disk:
14
+ /:
15
+ total: "21474836480"
16
+ used: "1858306048"
17
+ email: markmochi200@gmail.com
18
+ executable: /usr/bin/python
19
+ git:
20
+ commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
21
+ remote: https://github.com/hiyouga/LlamaFactory.git
22
+ gpu: NVIDIA GeForce RTX 4090
23
+ gpu_count: 1
24
+ gpu_nvidia:
25
+ - architecture: Ada
26
+ cudaCores: 16384
27
+ memoryTotal: "25757220864"
28
+ name: NVIDIA GeForce RTX 4090
29
+ uuid: GPU-2ae1a495-e17f-23d9-e8ed-90585b3df9de
30
+ host: 47a53adf0198
31
+ memory:
32
+ total: "201701408768"
33
+ os: Linux-6.8.0-94-generic-x86_64-with-glibc2.35
34
+ program: /usr/local/bin/llamafactory-cli
35
+ python: CPython 3.11.10
36
+ root: /workspace/LlamaFactory
37
+ startedAt: "2026-02-04T03:57:46.163443Z"
38
+ writerId: mfjy22anxcucsb3vwlaimrwvqrgvipis
39
+ m:
40
+ - "1": train/global_step
41
+ "6":
42
+ - 3
43
+ "7": []
44
+ - "2": '*'
45
+ "5": 1
46
+ "6":
47
+ - 1
48
+ "7": []
49
+ python_version: 3.11.10
50
+ t:
51
+ "1":
52
+ - 1
53
+ - 11
54
+ - 41
55
+ - 49
56
+ - 51
57
+ - 71
58
+ - 84
59
+ - 98
60
+ - 105
61
+ "2":
62
+ - 1
63
+ - 11
64
+ - 41
65
+ - 49
66
+ - 51
67
+ - 71
68
+ - 84
69
+ - 98
70
+ - 105
71
+ "3":
72
+ - 7
73
+ - 19
74
+ - 62
75
+ - 66
76
+ "4": 3.11.10
77
+ "5": 0.24.1
78
+ "6": 5.0.0
79
+ "9":
80
+ "1": transformers_trainer
81
+ "12": 0.24.1
82
+ "13": linux-x86_64
83
+ accelerator_config:
84
+ value:
85
+ dispatch_batches: null
86
+ even_batches: true
87
+ gradient_accumulation_kwargs: null
88
+ non_blocking: false
89
+ split_batches: false
90
+ use_seedable_sampler: true
91
+ adam_beta1:
92
+ value: 0.9
93
+ adam_beta2:
94
+ value: 0.95
95
+ adam_epsilon:
96
+ value: 1e-08
97
+ architectures:
98
+ value:
99
+ - Qwen3ForCausalLM
100
+ attention_bias:
101
+ value: false
102
+ attention_dropout:
103
+ value: 0
104
+ auto_find_batch_size:
105
+ value: false
106
+ average_tokens_across_devices:
107
+ value: true
108
+ batch_eval_metrics:
109
+ value: false
110
+ bf16:
111
+ value: true
112
+ bf16_full_eval:
113
+ value: false
114
+ bos_token_id:
115
+ value: null
116
+ chunk_size_feed_forward:
117
+ value: 0
118
+ data_args:
119
+ value:
120
+ buffer_size: 16384
121
+ cutoff_len: 2047
122
+ data_shared_file_system: false
123
+ dataset:
124
+ - Markie_Voss_t0_d35_r286
125
+ dataset_dir: /workspace/LlamaFactory/data
126
+ default_system: null
127
+ enable_thinking: false
128
+ eval_dataset: null
129
+ eval_num_beams: null
130
+ eval_on_each_dataset: false
131
+ ignore_pad_token_for_loss: true
132
+ interleave_probs: null
133
+ mask_history: false
134
+ max_samples: 100000000
135
+ media_dir: /workspace/LlamaFactory/data
136
+ mix_strategy: concat
137
+ neat_packing: false
138
+ overwrite_cache: false
139
+ packing: true
140
+ preprocessing_batch_size: 1000
141
+ preprocessing_num_workers: 16
142
+ streaming: false
143
+ template: qwen3_nothink
144
+ tokenized_path: null
145
+ tool_format: null
146
+ train_on_prompt: false
147
+ val_size: 0
148
+ data_seed:
149
+ value: null
150
+ dataloader_drop_last:
151
+ value: false
152
+ dataloader_num_workers:
153
+ value: 0
154
+ dataloader_persistent_workers:
155
+ value: false
156
+ dataloader_pin_memory:
157
+ value: true
158
+ dataloader_prefetch_factor:
159
+ value: null
160
+ ddp_backend:
161
+ value: null
162
+ ddp_broadcast_buffers:
163
+ value: null
164
+ ddp_bucket_cap_mb:
165
+ value: null
166
+ ddp_find_unused_parameters:
167
+ value: null
168
+ ddp_timeout:
169
+ value: 180000000
170
+ debug:
171
+ value: []
172
+ deepspeed:
173
+ value: null
174
+ disable_tqdm:
175
+ value: false
176
+ do_eval:
177
+ value: false
178
+ do_predict:
179
+ value: false
180
+ do_train:
181
+ value: true
182
+ dtype:
183
+ value: bfloat16
184
+ enable_jit_checkpoint:
185
+ value: false
186
+ eos_token_id:
187
+ value: 151645
188
+ eval_accumulation_steps:
189
+ value: null
190
+ eval_delay:
191
+ value: 0
192
+ eval_do_concat_batches:
193
+ value: true
194
+ eval_on_start:
195
+ value: false
196
+ eval_steps:
197
+ value: null
198
+ eval_strategy:
199
+ value: "no"
200
+ eval_use_gather_object:
201
+ value: false
202
+ finetuning_args:
203
+ value:
204
+ additional_target: null
205
+ apollo_layerwise: false
206
+ apollo_proj: random
207
+ apollo_proj_type: std
208
+ apollo_rank: 16
209
+ apollo_scale: 32
210
+ apollo_scale_front: false
211
+ apollo_scale_type: channel
212
+ apollo_target:
213
+ - all
214
+ apollo_update_interval: 200
215
+ badam_mask_mode: adjacent
216
+ badam_mode: layer
217
+ badam_start_block: null
218
+ badam_switch_interval: 50
219
+ badam_switch_mode: ascending
220
+ badam_update_ratio: 0.05
221
+ badam_verbose: 0
222
+ compute_accuracy: false
223
+ create_new_adapter: false
224
+ disable_shuffling: false
225
+ dpo_label_smoothing: 0
226
+ eaft_alpha: 1
227
+ early_stopping_steps: null
228
+ finetuning_type: lora
229
+ freeze_extra_modules: null
230
+ freeze_language_model: false
231
+ freeze_multi_modal_projector: true
232
+ freeze_trainable_layers: 2
233
+ freeze_trainable_modules:
234
+ - all
235
+ freeze_vision_tower: true
236
+ galore_layerwise: false
237
+ galore_proj_type: std
238
+ galore_rank: 16
239
+ galore_scale: 2
240
+ galore_target:
241
+ - all
242
+ galore_update_interval: 200
243
+ include_effective_tokens_per_second: false
244
+ kto_chosen_weight: 1
245
+ kto_rejected_weight: 1
246
+ ld_alpha: null
247
+ lora_alpha: 32
248
+ lora_dropout: 0.03
249
+ lora_rank: 16
250
+ lora_target:
251
+ - all
252
+ loraplus_lr_embedding: 1e-06
253
+ loraplus_lr_ratio: null
254
+ module_dropout: 0
255
+ oft_block_size: 32
256
+ oft_rank: 0
257
+ oft_target:
258
+ - all
259
+ pissa_convert: false
260
+ pissa_init: false
261
+ pissa_iter: 16
262
+ plot_loss: true
263
+ ppo_buffer_size: 1
264
+ ppo_epochs: 4
265
+ ppo_score_norm: false
266
+ ppo_target: 6
267
+ ppo_whiten_rewards: false
268
+ pref_bco_weight: 0
269
+ pref_beta: 0.1
270
+ pref_ftx: 0
271
+ pref_loss: sigmoid
272
+ pure_bf16: false
273
+ ref_model: null
274
+ ref_model_adapters: null
275
+ ref_model_quantization_bit: null
276
+ reward_model: null
277
+ reward_model_adapters: null
278
+ reward_model_quantization_bit: null
279
+ reward_model_type: lora
280
+ simpo_gamma: 0.5
281
+ stage: pt
282
+ swanlab_api_key: <SWANLAB_API_KEY>
283
+ swanlab_lark_secret: null
284
+ swanlab_lark_webhook_url: null
285
+ swanlab_logdir: null
286
+ swanlab_mode: cloud
287
+ swanlab_project: llamafactory
288
+ swanlab_run_name: null
289
+ swanlab_workspace: null
290
+ use_adam_mini: false
291
+ use_apollo: false
292
+ use_badam: false
293
+ use_dft_loss: false
294
+ use_dora: false
295
+ use_eaft_loss: false
296
+ use_galore: false
297
+ use_llama_pro: false
298
+ use_mca: false
299
+ use_muon: false
300
+ use_rslora: false
301
+ use_swanlab: false
302
+ fp8:
303
+ value: false
304
+ fp8_backend:
305
+ value: auto
306
+ fp8_enable_fsdp_float8_all_gather:
307
+ value: false
308
+ fp16:
309
+ value: false
310
+ fp16_full_eval:
311
+ value: false
312
+ fsdp:
313
+ value: []
314
+ fsdp_config:
315
+ value:
316
+ min_num_params: 0
317
+ xla: false
318
+ xla_fsdp_grad_ckpt: false
319
+ xla_fsdp_v2: false
320
+ full_determinism:
321
+ value: false
322
+ generating_args:
323
+ value:
324
+ do_sample: true
325
+ length_penalty: 1
326
+ max_new_tokens: 1024
327
+ num_beams: 1
328
+ repetition_penalty: 1
329
+ skip_special_tokens: true
330
+ temperature: 0.95
331
+ top_k: 50
332
+ top_p: 0.7
333
+ generation_config:
334
+ value: null
335
+ generation_max_length:
336
+ value: 2047
337
+ generation_num_beams:
338
+ value: null
339
+ gradient_accumulation_steps:
340
+ value: 1
341
+ gradient_checkpointing:
342
+ value: false
343
+ gradient_checkpointing_kwargs:
344
+ value: null
345
+ greater_is_better:
346
+ value: null
347
+ group_by_length:
348
+ value: false
349
+ head_dim:
350
+ value: 128
351
+ hidden_act:
352
+ value: silu
353
+ hidden_size:
354
+ value: 4096
355
+ hub_always_push:
356
+ value: false
357
+ hub_model_id:
358
+ value: null
359
+ hub_private_repo:
360
+ value: null
361
+ hub_revision:
362
+ value: null
363
+ hub_strategy:
364
+ value: every_save
365
+ hub_token:
366
+ value: <HUB_TOKEN>
367
+ id2label:
368
+ value:
369
+ "0": LABEL_0
370
+ "1": LABEL_1
371
+ ignore_data_skip:
372
+ value: false
373
+ include_for_metrics:
374
+ value: []
375
+ include_num_input_tokens_seen:
376
+ value: all
377
+ initializer_range:
378
+ value: 0.02
379
+ intermediate_size:
380
+ value: 12288
381
+ is_encoder_decoder:
382
+ value: false
383
+ label_names:
384
+ value:
385
+ - labels
386
+ label_smoothing_factor:
387
+ value: 0
388
+ label2id:
389
+ value:
390
+ LABEL_0: 0
391
+ LABEL_1: 1
392
+ layer_types:
393
+ value:
394
+ - full_attention
395
+ - full_attention
396
+ - full_attention
397
+ - full_attention
398
+ - full_attention
399
+ - full_attention
400
+ - full_attention
401
+ - full_attention
402
+ - full_attention
403
+ - full_attention
404
+ - full_attention
405
+ - full_attention
406
+ - full_attention
407
+ - full_attention
408
+ - full_attention
409
+ - full_attention
410
+ - full_attention
411
+ - full_attention
412
+ - full_attention
413
+ - full_attention
414
+ - full_attention
415
+ - full_attention
416
+ - full_attention
417
+ - full_attention
418
+ - full_attention
419
+ - full_attention
420
+ - full_attention
421
+ - full_attention
422
+ - full_attention
423
+ - full_attention
424
+ - full_attention
425
+ - full_attention
426
+ - full_attention
427
+ - full_attention
428
+ - full_attention
429
+ - full_attention
430
+ learning_rate:
431
+ value: 5e-05
432
+ length_column_name:
433
+ value: length
434
+ liger_kernel_config:
435
+ value: null
436
+ load_best_model_at_end:
437
+ value: false
438
+ local_rank:
439
+ value: -1
440
+ log_level:
441
+ value: passive
442
+ log_level_replica:
443
+ value: warning
444
+ log_on_each_node:
445
+ value: true
446
+ logging_dir:
447
+ value: null
448
+ logging_first_step:
449
+ value: false
450
+ logging_nan_inf_filter:
451
+ value: true
452
+ logging_steps:
453
+ value: 1
454
+ logging_strategy:
455
+ value: steps
456
+ lr_scheduler_kwargs:
457
+ value: null
458
+ lr_scheduler_type:
459
+ value: cosine
460
+ master_addr:
461
+ value: null
462
+ master_port:
463
+ value: null
464
+ max_grad_norm:
465
+ value: 1
466
+ max_position_embeddings:
467
+ value: 32768
468
+ max_steps:
469
+ value: -1
470
+ max_window_layers:
471
+ value: 36
472
+ metric_for_best_model:
473
+ value: null
474
+ model/num_parameters:
475
+ value: 8234382336
476
+ model_args:
477
+ value:
478
+ adapter_folder: null
479
+ adapter_name_or_path: null
480
+ add_special_tokens: null
481
+ add_tokens: null
482
+ audio_sampling_rate: 16000
483
+ block_diag_attn: false
484
+ cache_dir: null
485
+ chunk_size: 8192
486
+ compute_dtype: torch.bfloat16
487
+ cpu_infer: 32
488
+ crop_to_patches: false
489
+ device_map:
490
+ "": cuda:0
491
+ disable_gradient_checkpointing: false
492
+ double_quantization: true
493
+ enable_liger_kernel: false
494
+ export_device: cpu
495
+ export_dir: null
496
+ export_hub_model_id: null
497
+ export_legacy_format: false
498
+ export_quantization_bit: null
499
+ export_quantization_dataset: null
500
+ export_quantization_maxlen: 1024
501
+ export_quantization_nsamples: 128
502
+ export_size: 5
503
+ flash_attn: auto
504
+ hf_hub_token: <HF_HUB_TOKEN>
505
+ image_do_pan_and_scan: false
506
+ image_max_pixels: 589824
507
+ image_min_pixels: 1024
508
+ infer_backend: HF
509
+ infer_dtype: auto
510
+ init_special_tokens: noise_init
511
+ kt_force_think: false
512
+ kt_maxlen: 4096
513
+ kt_mode: normal
514
+ kt_optimize_rule: null
515
+ kt_use_cuda_graph: true
516
+ low_cpu_mem_usage: true
517
+ mixture_of_depths: null
518
+ mode: normal
519
+ model_max_length: 2047
520
+ model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
521
+ model_revision: main
522
+ moe_aux_loss_coef: null
523
+ ms_hub_token: <MS_HUB_TOKEN>
524
+ new_special_tokens_config: null
525
+ offload_folder: offload
526
+ om_hub_token: <OM_HUB_TOKEN>
527
+ print_param_status: false
528
+ quantization_bit: null
529
+ quantization_device_map: null
530
+ quantization_method: BNB
531
+ quantization_type: nf4
532
+ resize_vocab: false
533
+ rope_scaling: null
534
+ sglang_config: null
535
+ sglang_lora_backend: triton
536
+ sglang_maxlen: 4096
537
+ sglang_mem_fraction: 0.7
538
+ sglang_tp_size: -1
539
+ shift_attn: false
540
+ split_special_tokens: false
541
+ train_from_scratch: false
542
+ trust_remote_code: true
543
+ upcast_layernorm: false
544
+ upcast_lmhead_output: false
545
+ use_audio_in_video: false
546
+ use_fast_tokenizer: true
547
+ use_kt: false
548
+ use_kv_cache: true
549
+ use_reentrant_gc: true
550
+ use_unsloth: false
551
+ use_unsloth_gc: false
552
+ use_v1_kernels: false
553
+ video_fps: 2
554
+ video_max_pixels: 65536
555
+ video_maxlen: 128
556
+ video_min_pixels: 256
557
+ vllm_config: null
558
+ vllm_enforce_eager: false
559
+ vllm_gpu_util: 0.7
560
+ vllm_max_lora_rank: 32
561
+ vllm_maxlen: 4096
562
+ model_type:
563
+ value: qwen3
564
+ neftune_noise_alpha:
565
+ value: null
566
+ num_attention_heads:
567
+ value: 32
568
+ num_hidden_layers:
569
+ value: 36
570
+ num_key_value_heads:
571
+ value: 8
572
+ num_train_epochs:
573
+ value: 5
574
+ optim:
575
+ value: adamw_torch
576
+ optim_args:
577
+ value: null
578
+ optim_target_modules:
579
+ value: null
580
+ output_attentions:
581
+ value: false
582
+ output_dir:
583
+ value: /workspace/v127rc_exp1/C
584
+ output_hidden_states:
585
+ value: false
586
+ overwrite_output_dir:
587
+ value: false
588
+ pad_token_id:
589
+ value: 151643
590
+ parallelism_config:
591
+ value: null
592
+ peft_config:
593
+ value:
594
+ default:
595
+ alora_invocation_tokens: null
596
+ arrow_config: null
597
+ auto_mapping: null
598
+ base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
599
+ bias: none
600
+ corda_config: null
601
+ ensure_weight_tying: false
602
+ eva_config: null
603
+ exclude_modules: null
604
+ fan_in_fan_out: false
605
+ inference_mode: false
606
+ init_lora_weights: true
607
+ layer_replication: null
608
+ layers_pattern: null
609
+ layers_to_transform: null
610
+ lora_alpha: 32
611
+ lora_bias: false
612
+ lora_dropout: 0.03
613
+ megatron_config: null
614
+ megatron_core: megatron.core
615
+ modules_to_save: null
616
+ peft_type: LORA
617
+ peft_version: 0.18.1
618
+ qalora_group_size: 16
619
+ r: 16
620
+ revision: null
621
+ runtime_config:
622
+ ephemeral_gpu_offload: false
623
+ target_modules:
624
+ - up_proj
625
+ - q_proj
626
+ - gate_proj
627
+ - k_proj
628
+ - v_proj
629
+ - o_proj
630
+ - down_proj
631
+ target_parameters: null
632
+ task_type: CAUSAL_LM
633
+ trainable_token_indices: null
634
+ use_dora: false
635
+ use_qalora: false
636
+ use_rslora: false
637
+ per_device_eval_batch_size:
638
+ value: 8
639
+ per_device_train_batch_size:
640
+ value: 1
641
+ predict_with_generate:
642
+ value: false
643
+ prediction_loss_only:
644
+ value: false
645
+ problem_type:
646
+ value: null
647
+ project:
648
+ value: huggingface
649
+ push_to_hub:
650
+ value: false
651
+ ray_init_kwargs:
652
+ value: null
653
+ ray_num_workers:
654
+ value: 1
655
+ remove_unused_columns:
656
+ value: false
657
+ report_to:
658
+ value:
659
+ - wandb
660
+ restore_callback_states_from_checkpoint:
661
+ value: false
662
+ resume_from_checkpoint:
663
+ value: null
664
+ return_dict:
665
+ value: true
666
+ rms_norm_eps:
667
+ value: 1e-06
668
+ rope_parameters:
669
+ value:
670
+ rope_theta: 1000000
671
+ rope_type: default
672
+ run_name:
673
+ value: null
674
+ save_on_each_node:
675
+ value: false
676
+ save_only_model:
677
+ value: true
678
+ save_steps:
679
+ value: 1000
680
+ save_strategy:
681
+ value: steps
682
+ save_total_limit:
683
+ value: null
684
+ seed:
685
+ value: 42
686
+ skip_memory_metrics:
687
+ value: true
688
+ sliding_window:
689
+ value: null
690
+ sortish_sampler:
691
+ value: false
692
+ tf32:
693
+ value: null
694
+ tie_word_embeddings:
695
+ value: false
696
+ torch_compile:
697
+ value: false
698
+ torch_compile_backend:
699
+ value: null
700
+ torch_compile_mode:
701
+ value: null
702
+ torch_empty_cache_steps:
703
+ value: null
704
+ trackio_space_id:
705
+ value: trackio
706
+ transformers_version:
707
+ value: 5.0.0
708
+ use_cache:
709
+ value: false
710
+ use_cpu:
711
+ value: false
712
+ use_liger_kernel:
713
+ value: false
714
+ use_sliding_window:
715
+ value: false
716
+ vocab_size:
717
+ value: 151936
718
+ warmup_ratio:
719
+ value: 0.02
720
+ warmup_steps:
721
+ value: 0.02
722
+ weight_decay:
723
+ value: 0
LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/output.log ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 0%| | 0/18595 [00:00<?, ?it/s]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
2
+ with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]
3
+
4
+ {'loss': '1.682', 'grad_norm': '0.2716', 'learning_rate': '0', 'epoch': '0.0002689', 'num_input_tokens_seen': 2047, 'train_runtime': '2.959', 'train_tokens_per_second': '691.9'}
5
+ {'loss': '1.8', 'grad_norm': '0.2907', 'learning_rate': '1.344e-07', 'epoch': '0.0005378', 'num_input_tokens_seen': 4094, 'train_runtime': '3.966', 'train_tokens_per_second': '1032'}
6
+ {'loss': '1.755', 'grad_norm': '0.2774', 'learning_rate': '2.688e-07', 'epoch': '0.0008067', 'num_input_tokens_seen': 6141, 'train_runtime': '4.979', 'train_tokens_per_second': '1233'}
7
+ {'loss': '1.725', 'grad_norm': '0.278', 'learning_rate': '4.032e-07', 'epoch': '0.001076', 'num_input_tokens_seen': 8188, 'train_runtime': '5.988', 'train_tokens_per_second': '1367'}
8
+ {'loss': '1.856', 'grad_norm': '0.2819', 'learning_rate': '5.376e-07', 'epoch': '0.001344', 'num_input_tokens_seen': 10235, 'train_runtime': '6.995', 'train_tokens_per_second': '1463'}
9
+ {'loss': '1.864', 'grad_norm': '0.2434', 'learning_rate': '6.72e-07', 'epoch': '0.001613', 'num_input_tokens_seen': 12282, 'train_runtime': '8.002', 'train_tokens_per_second': '1535'}
10
+ {'loss': '1.791', 'grad_norm': '0.2673', 'learning_rate': '8.065e-07', 'epoch': '0.001882', 'num_input_tokens_seen': 14329, 'train_runtime': '9.01', 'train_tokens_per_second': '1590'}
11
+ {'loss': '1.831', 'grad_norm': '0.2574', 'learning_rate': '9.409e-07', 'epoch': '0.002151', 'num_input_tokens_seen': 16376, 'train_runtime': '10.02', 'train_tokens_per_second': '1634'}
12
+ {'loss': '1.92', 'grad_norm': '0.2803', 'learning_rate': '1.075e-06', 'epoch': '0.00242', 'num_input_tokens_seen': 18423, 'train_runtime': '11.06', 'train_tokens_per_second': '1665'}
13
+ {'loss': '1.949', 'grad_norm': '0.281', 'learning_rate': '1.21e-06', 'epoch': '0.002689', 'num_input_tokens_seen': 20470, 'train_runtime': '12.07', 'train_tokens_per_second': '1696'}
14
+ {'loss': '1.955', 'grad_norm': '0.298', 'learning_rate': '1.344e-06', 'epoch': '0.002958', 'num_input_tokens_seen': 22517, 'train_runtime': '13.08', 'train_tokens_per_second': '1722'}
15
+ {'loss': '1.811', 'grad_norm': '0.2719', 'learning_rate': '1.478e-06', 'epoch': '0.003227', 'num_input_tokens_seen': 24564, 'train_runtime': '14.09', 'train_tokens_per_second': '1743'}
16
+ {'loss': '1.629', 'grad_norm': '0.266', 'learning_rate': '1.613e-06', 'epoch': '0.003496', 'num_input_tokens_seen': 26611, 'train_runtime': '15.1', 'train_tokens_per_second': '1763'}
17
+ {'loss': '1.768', 'grad_norm': '0.268', 'learning_rate': '1.747e-06', 'epoch': '0.003764', 'num_input_tokens_seen': 28658, 'train_runtime': '16.1', 'train_tokens_per_second': '1779'}
18
+ {'loss': '1.612', 'grad_norm': '0.252', 'learning_rate': '1.882e-06', 'epoch': '0.004033', 'num_input_tokens_seen': 30705, 'train_runtime': '17.11', 'train_tokens_per_second': '1794'}
19
+ {'loss': '1.622', 'grad_norm': '0.2607', 'learning_rate': '2.016e-06', 'epoch': '0.004302', 'num_input_tokens_seen': 32752, 'train_runtime': '18.12', 'train_tokens_per_second': '1807'}
20
+ {'loss': '1.857', 'grad_norm': '0.2805', 'learning_rate': '2.151e-06', 'epoch': '0.004571', 'num_input_tokens_seen': 34799, 'train_runtime': '19.13', 'train_tokens_per_second': '1819'}
21
+ {'loss': '1.851', 'grad_norm': '0.2441', 'learning_rate': '2.285e-06', 'epoch': '0.00484', 'num_input_tokens_seen': 36846, 'train_runtime': '20.14', 'train_tokens_per_second': '1830'}
22
+ {'loss': '1.826', 'grad_norm': '0.2659', 'learning_rate': '2.419e-06', 'epoch': '0.005109', 'num_input_tokens_seen': 38893, 'train_runtime': '21.15', 'train_tokens_per_second': '1839'}
23
+ {'loss': '1.536', 'grad_norm': '0.2742', 'learning_rate': '2.554e-06', 'epoch': '0.005378', 'num_input_tokens_seen': 40940, 'train_runtime': '22.16', 'train_tokens_per_second': '1847'}
24
+ {'loss': '1.67', 'grad_norm': '0.2687', 'learning_rate': '2.688e-06', 'epoch': '0.005647', 'num_input_tokens_seen': 42987, 'train_runtime': '23.17', 'train_tokens_per_second': '1855'}
25
+ {'loss': '1.548', 'grad_norm': '0.2588', 'learning_rate': '2.823e-06', 'epoch': '0.005916', 'num_input_tokens_seen': 45034, 'train_runtime': '24.18', 'train_tokens_per_second': '1862'}
26
+ {'loss': '1.866', 'grad_norm': '0.2874', 'learning_rate': '2.957e-06', 'epoch': '0.006184', 'num_input_tokens_seen': 47081, 'train_runtime': '25.19', 'train_tokens_per_second': '1869'}
27
+ {'loss': '1.764', 'grad_norm': '0.2764', 'learning_rate': '3.091e-06', 'epoch': '0.006453', 'num_input_tokens_seen': 49128, 'train_runtime': '26.2', 'train_tokens_per_second': '1875'}
28
+ {'loss': '1.937', 'grad_norm': '0.2965', 'learning_rate': '3.226e-06', 'epoch': '0.006722', 'num_input_tokens_seen': 51175, 'train_runtime': '27.21', 'train_tokens_per_second': '1881'}
29
+ {'loss': '1.627', 'grad_norm': '0.2888', 'learning_rate': '3.36e-06', 'epoch': '0.006991', 'num_input_tokens_seen': 53222, 'train_runtime': '28.23', 'train_tokens_per_second': '1886'}
30
+ {'loss': '1.792', 'grad_norm': '0.3194', 'learning_rate': '3.495e-06', 'epoch': '0.00726', 'num_input_tokens_seen': 55269, 'train_runtime': '29.24', 'train_tokens_per_second': '1890'}
31
+ {'loss': '1.725', 'grad_norm': '0.2937', 'learning_rate': '3.629e-06', 'epoch': '0.007529', 'num_input_tokens_seen': 57316, 'train_runtime': '30.25', 'train_tokens_per_second': '1895'}
32
+ {'loss': '1.871', 'grad_norm': '0.2757', 'learning_rate': '3.763e-06', 'epoch': '0.007798', 'num_input_tokens_seen': 59363, 'train_runtime': '31.26', 'train_tokens_per_second': '1899'}
33
+ {'loss': '1.838', 'grad_norm': '0.2773', 'learning_rate': '3.898e-06', 'epoch': '0.008067', 'num_input_tokens_seen': 61410, 'train_runtime': '32.27', 'train_tokens_per_second': '1903'}
34
+ {'loss': '1.909', 'grad_norm': '0.3041', 'learning_rate': '4.032e-06', 'epoch': '0.008336', 'num_input_tokens_seen': 63457, 'train_runtime': '33.28', 'train_tokens_per_second': '1907'}
35
+ {'loss': '1.725', 'grad_norm': '0.2885', 'learning_rate': '4.167e-06', 'epoch': '0.008604', 'num_input_tokens_seen': 65504, 'train_runtime': '34.29', 'train_tokens_per_second': '1910'}
36
+ {'loss': '1.747', 'grad_norm': '0.3163', 'learning_rate': '4.301e-06', 'epoch': '0.008873', 'num_input_tokens_seen': 67551, 'train_runtime': '35.3', 'train_tokens_per_second': '1914'}
37
+ {'loss': '1.909', 'grad_norm': '0.2977', 'learning_rate': '4.435e-06', 'epoch': '0.009142', 'num_input_tokens_seen': 69598, 'train_runtime': '36.31', 'train_tokens_per_second': '1917'}
38
+ {'loss': '1.641', 'grad_norm': '0.275', 'learning_rate': '4.57e-06', 'epoch': '0.009411', 'num_input_tokens_seen': 71645, 'train_runtime': '37.32', 'train_tokens_per_second': '1920'}
39
+ {'loss': '1.782', 'grad_norm': '0.3019', 'learning_rate': '4.704e-06', 'epoch': '0.00968', 'num_input_tokens_seen': 73692, 'train_runtime': '38.33', 'train_tokens_per_second': '1922'}
40
+ {'loss': '1.83', 'grad_norm': '0.3124', 'learning_rate': '4.839e-06', 'epoch': '0.009949', 'num_input_tokens_seen': 75739, 'train_runtime': '39.34', 'train_tokens_per_second': '1925'}
41
+ {'loss': '1.856', 'grad_norm': '0.2672', 'learning_rate': '4.973e-06', 'epoch': '0.01022', 'num_input_tokens_seen': 77786, 'train_runtime': '40.36', 'train_tokens_per_second': '1927'}
42
+ {'loss': '1.965', 'grad_norm': '0.297', 'learning_rate': '5.108e-06', 'epoch': '0.01049', 'num_input_tokens_seen': 79833, 'train_runtime': '41.37', 'train_tokens_per_second': '1930'}
43
+ {'loss': '1.935', 'grad_norm': '0.337', 'learning_rate': '5.242e-06', 'epoch': '0.01076', 'num_input_tokens_seen': 81880, 'train_runtime': '42.39', 'train_tokens_per_second': '1932'}
44
+ {'loss': '1.725', 'grad_norm': '0.3097', 'learning_rate': '5.376e-06', 'epoch': '0.01102', 'num_input_tokens_seen': 83927, 'train_runtime': '43.4', 'train_tokens_per_second': '1934'}
45
+ {'loss': '1.534', 'grad_norm': '0.2637', 'learning_rate': '5.511e-06', 'epoch': '0.01129', 'num_input_tokens_seen': 85974, 'train_runtime': '44.42', 'train_tokens_per_second': '1935'}
46
+ {'loss': '1.764', 'grad_norm': '0.2742', 'learning_rate': '5.645e-06', 'epoch': '0.01156', 'num_input_tokens_seen': 88021, 'train_runtime': '45.43', 'train_tokens_per_second': '1937'}
47
+ {'loss': '1.696', 'grad_norm': '0.2804', 'learning_rate': '5.78e-06', 'epoch': '0.01183', 'num_input_tokens_seen': 90068, 'train_runtime': '46.45', 'train_tokens_per_second': '1939'}
48
+ {'loss': '1.725', 'grad_norm': '0.279', 'learning_rate': '5.914e-06', 'epoch': '0.0121', 'num_input_tokens_seen': 92115, 'train_runtime': '47.46', 'train_tokens_per_second': '1941'}
49
+ {'loss': '1.981', 'grad_norm': '0.3061', 'learning_rate': '6.048e-06', 'epoch': '0.01237', 'num_input_tokens_seen': 94162, 'train_runtime': '48.47', 'train_tokens_per_second': '1943'}
50
+ {'loss': '1.589', 'grad_norm': '0.2909', 'learning_rate': '6.183e-06', 'epoch': '0.01264', 'num_input_tokens_seen': 96209, 'train_runtime': '49.48', 'train_tokens_per_second': '1944'}
51
+ {'loss': '1.776', 'grad_norm': '0.338', 'learning_rate': '6.317e-06', 'epoch': '0.01291', 'num_input_tokens_seen': 98256, 'train_runtime': '50.49', 'train_tokens_per_second': '1946'}
52
+ {'loss': '1.855', 'grad_norm': '0.2965', 'learning_rate': '6.452e-06', 'epoch': '0.01318', 'num_input_tokens_seen': 100303, 'train_runtime': '51.51', 'train_tokens_per_second': '1947'}
53
+ {'loss': '1.635', 'grad_norm': '0.3187', 'learning_rate': '6.586e-06', 'epoch': '0.01344', 'num_input_tokens_seen': 102350, 'train_runtime': '52.52', 'train_tokens_per_second': '1949'}
54
+ {'loss': '1.884', 'grad_norm': '0.3086', 'learning_rate': '6.72e-06', 'epoch': '0.01371', 'num_input_tokens_seen': 104397, 'train_runtime': '53.53', 'train_tokens_per_second': '1950'}
55
+ {'loss': '1.779', 'grad_norm': '0.3112', 'learning_rate': '6.855e-06', 'epoch': '0.01398', 'num_input_tokens_seen': 106444, 'train_runtime': '54.55', 'train_tokens_per_second': '1951'}
56
+ {'loss': '1.85', 'grad_norm': '0.3581', 'learning_rate': '6.989e-06', 'epoch': '0.01425', 'num_input_tokens_seen': 108491, 'train_runtime': '55.56', 'train_tokens_per_second': '1953'}
57
+ {'loss': '1.611', 'grad_norm': '0.7226', 'learning_rate': '7.124e-06', 'epoch': '0.01452', 'num_input_tokens_seen': 110538, 'train_runtime': '56.57', 'train_tokens_per_second': '1954'}
58
+ {'loss': '1.643', 'grad_norm': '0.2939', 'learning_rate': '7.258e-06', 'epoch': '0.01479', 'num_input_tokens_seen': 112585, 'train_runtime': '57.58', 'train_tokens_per_second': '1955'}
59
+ {'loss': '1.978', 'grad_norm': '0.3302', 'learning_rate': '7.392e-06', 'epoch': '0.01506', 'num_input_tokens_seen': 114632, 'train_runtime': '58.6', 'train_tokens_per_second': '1956'}
60
+ {'loss': '1.473', 'grad_norm': '0.3044', 'learning_rate': '7.527e-06', 'epoch': '0.01533', 'num_input_tokens_seen': 116679, 'train_runtime': '59.62', 'train_tokens_per_second': '1957'}
61
+ {'loss': '1.559', 'grad_norm': '0.3122', 'learning_rate': '7.661e-06', 'epoch': '0.0156', 'num_input_tokens_seen': 118726, 'train_runtime': '60.64', 'train_tokens_per_second': '1958'}
62
+ {'loss': '1.793', 'grad_norm': '0.344', 'learning_rate': '7.796e-06', 'epoch': '0.01586', 'num_input_tokens_seen': 120773, 'train_runtime': '61.65', 'train_tokens_per_second': '1959'}
63
+ {'loss': '1.589', 'grad_norm': '0.3391', 'learning_rate': '7.93e-06', 'epoch': '0.01613', 'num_input_tokens_seen': 122820, 'train_runtime': '62.66', 'train_tokens_per_second': '1960'}
64
+ {'loss': '1.713', 'grad_norm': '0.3023', 'learning_rate': '8.065e-06', 'epoch': '0.0164', 'num_input_tokens_seen': 124867, 'train_runtime': '63.68', 'train_tokens_per_second': '1961'}
65
+ {'loss': '1.704', 'grad_norm': '0.3436', 'learning_rate': '8.199e-06', 'epoch': '0.01667', 'num_input_tokens_seen': 126914, 'train_runtime': '64.7', 'train_tokens_per_second': '1962'}
66
+ {'loss': '1.908', 'grad_norm': '0.3627', 'learning_rate': '8.333e-06', 'epoch': '0.01694', 'num_input_tokens_seen': 128961, 'train_runtime': '65.71', 'train_tokens_per_second': '1963'}
67
+ {'loss': '1.799', 'grad_norm': '0.3663', 'learning_rate': '8.468e-06', 'epoch': '0.01721', 'num_input_tokens_seen': 131008, 'train_runtime': '66.72', 'train_tokens_per_second': '1963'}
68
+ {'loss': '1.855', 'grad_norm': '0.3834', 'learning_rate': '8.602e-06', 'epoch': '0.01748', 'num_input_tokens_seen': 133055, 'train_runtime': '67.74', 'train_tokens_per_second': '1964'}
69
+ {'loss': '1.805', 'grad_norm': '0.3678', 'learning_rate': '8.737e-06', 'epoch': '0.01775', 'num_input_tokens_seen': 135102, 'train_runtime': '68.76', 'train_tokens_per_second': '1965'}
70
+ {'loss': '1.436', 'grad_norm': '0.3304', 'learning_rate': '8.871e-06', 'epoch': '0.01802', 'num_input_tokens_seen': 137149, 'train_runtime': '69.77', 'train_tokens_per_second': '1966'}
71
+ {'loss': '1.746', 'grad_norm': '0.307', 'learning_rate': '9.005e-06', 'epoch': '0.01828', 'num_input_tokens_seen': 139196, 'train_runtime': '70.78', 'train_tokens_per_second': '1966'}
72
+ {'loss': '1.823', 'grad_norm': '0.3547', 'learning_rate': '9.14e-06', 'epoch': '0.01855', 'num_input_tokens_seen': 141243, 'train_runtime': '71.8', 'train_tokens_per_second': '1967'}
73
+ {'loss': '1.66', 'grad_norm': '0.3379', 'learning_rate': '9.274e-06', 'epoch': '0.01882', 'num_input_tokens_seen': 143290, 'train_runtime': '72.82', 'train_tokens_per_second': '1968'}
74
+ {'loss': '1.913', 'grad_norm': '0.3416', 'learning_rate': '9.409e-06', 'epoch': '0.01909', 'num_input_tokens_seen': 145337, 'train_runtime': '73.84', 'train_tokens_per_second': '1968'}
75
+ {'loss': '1.814', 'grad_norm': '0.3721', 'learning_rate': '9.543e-06', 'epoch': '0.01936', 'num_input_tokens_seen': 147384, 'train_runtime': '74.85', 'train_tokens_per_second': '1969'}
76
+ {'loss': '1.797', 'grad_norm': '0.373', 'learning_rate': '9.677e-06', 'epoch': '0.01963', 'num_input_tokens_seen': 149431, 'train_runtime': '75.87', 'train_tokens_per_second': '1970'}
77
+ {'loss': '1.704', 'grad_norm': '0.3735', 'learning_rate': '9.812e-06', 'epoch': '0.0199', 'num_input_tokens_seen': 151478, 'train_runtime': '76.88', 'train_tokens_per_second': '1970'}
78
+ {'loss': '1.578', 'grad_norm': '0.3312', 'learning_rate': '9.946e-06', 'epoch': '0.02017', 'num_input_tokens_seen': 153525, 'train_runtime': '77.9', 'train_tokens_per_second': '1971'}
79
+ {'loss': '1.712', 'grad_norm': '0.3716', 'learning_rate': '1.008e-05', 'epoch': '0.02044', 'num_input_tokens_seen': 155572, 'train_runtime': '78.91', 'train_tokens_per_second': '1971'}
80
+ {'loss': '1.758', 'grad_norm': '0.3477', 'learning_rate': '1.022e-05', 'epoch': '0.0207', 'num_input_tokens_seen': 157619, 'train_runtime': '79.93', 'train_tokens_per_second': '1972'}
81
+ {'loss': '1.85', 'grad_norm': '0.374', 'learning_rate': '1.035e-05', 'epoch': '0.02097', 'num_input_tokens_seen': 159666, 'train_runtime': '80.94', 'train_tokens_per_second': '1973'}
82
+ {'loss': '1.77', 'grad_norm': '0.3782', 'learning_rate': '1.048e-05', 'epoch': '0.02124', 'num_input_tokens_seen': 161713, 'train_runtime': '81.96', 'train_tokens_per_second': '1973'}
83
+ {'loss': '1.592', 'grad_norm': '0.3265', 'learning_rate': '1.062e-05', 'epoch': '0.02151', 'num_input_tokens_seen': 163760, 'train_runtime': '82.98', 'train_tokens_per_second': '1974'}
84
+ {'loss': '1.684', 'grad_norm': '0.3949', 'learning_rate': '1.075e-05', 'epoch': '0.02178', 'num_input_tokens_seen': 165807, 'train_runtime': '83.99', 'train_tokens_per_second': '1974'}
85
+ {'loss': '1.416', 'grad_norm': '0.339', 'learning_rate': '1.089e-05', 'epoch': '0.02205', 'num_input_tokens_seen': 167854, 'train_runtime': '85.01', 'train_tokens_per_second': '1975'}
86
+ {'loss': '1.275', 'grad_norm': '0.3412', 'learning_rate': '1.102e-05', 'epoch': '0.02232', 'num_input_tokens_seen': 169901, 'train_runtime': '86.02', 'train_tokens_per_second': '1975'}
87
+ {'loss': '1.798', 'grad_norm': '0.4259', 'learning_rate': '1.116e-05', 'epoch': '0.02259', 'num_input_tokens_seen': 171948, 'train_runtime': '87.04', 'train_tokens_per_second': '1976'}
88
+ {'loss': '1.631', 'grad_norm': '0.3738', 'learning_rate': '1.129e-05', 'epoch': '0.02286', 'num_input_tokens_seen': 173995, 'train_runtime': '88.05', 'train_tokens_per_second': '1976'}
89
+ {'loss': '1.695', 'grad_norm': '0.3967', 'learning_rate': '1.142e-05', 'epoch': '0.02312', 'num_input_tokens_seen': 176042, 'train_runtime': '89.07', 'train_tokens_per_second': '1976'}
90
+ {'loss': '1.809', 'grad_norm': '0.3775', 'learning_rate': '1.156e-05', 'epoch': '0.02339', 'num_input_tokens_seen': 178089, 'train_runtime': '90.09', 'train_tokens_per_second': '1977'}
91
+ {'loss': '1.628', 'grad_norm': '0.3732', 'learning_rate': '1.169e-05', 'epoch': '0.02366', 'num_input_tokens_seen': 180136, 'train_runtime': '91.1', 'train_tokens_per_second': '1977'}
92
+ {'loss': '1.771', 'grad_norm': '0.397', 'learning_rate': '1.183e-05', 'epoch': '0.02393', 'num_input_tokens_seen': 182183, 'train_runtime': '92.12', 'train_tokens_per_second': '1978'}
93
+ {'loss': '1.708', 'grad_norm': '0.4329', 'learning_rate': '1.196e-05', 'epoch': '0.0242', 'num_input_tokens_seen': 184230, 'train_runtime': '93.14', 'train_tokens_per_second': '1978'}
94
+ {'loss': '1.629', 'grad_norm': '0.391', 'learning_rate': '1.21e-05', 'epoch': '0.02447', 'num_input_tokens_seen': 186277, 'train_runtime': '94.15', 'train_tokens_per_second': '1978'}
95
+ {'loss': '1.69', 'grad_norm': '0.416', 'learning_rate': '1.223e-05', 'epoch': '0.02474', 'num_input_tokens_seen': 188324, 'train_runtime': '95.17', 'train_tokens_per_second': '1979'}
96
+ {'loss': '1.882', 'grad_norm': '0.4379', 'learning_rate': '1.237e-05', 'epoch': '0.02501', 'num_input_tokens_seen': 190371, 'train_runtime': '96.19', 'train_tokens_per_second': '1979'}
97
+ {'loss': '1.764', 'grad_norm': '0.417', 'learning_rate': '1.25e-05', 'epoch': '0.02528', 'num_input_tokens_seen': 192418, 'train_runtime': '97.2', 'train_tokens_per_second': '1980'}
98
+ {'loss': '1.675', 'grad_norm': '0.4218', 'learning_rate': '1.263e-05', 'epoch': '0.02554', 'num_input_tokens_seen': 194465, 'train_runtime': '98.22', 'train_tokens_per_second': '1980'}
99
+ {'loss': '1.749', 'grad_norm': '0.4339', 'learning_rate': '1.277e-05', 'epoch': '0.02581', 'num_input_tokens_seen': 196512, 'train_runtime': '99.24', 'train_tokens_per_second': '1980'}
100
+ {'loss': '1.792', 'grad_norm': '0.4553', 'learning_rate': '1.29e-05', 'epoch': '0.02608', 'num_input_tokens_seen': 198559, 'train_runtime': '100.3', 'train_tokens_per_second': '1981'}
101
+ {'loss': '1.597', 'grad_norm': '0.4142', 'learning_rate': '1.304e-05', 'epoch': '0.02635', 'num_input_tokens_seen': 200606, 'train_runtime': '101.3', 'train_tokens_per_second': '1981'}
102
+ {'loss': '1.534', 'grad_norm': '0.4112', 'learning_rate': '1.317e-05', 'epoch': '0.02662', 'num_input_tokens_seen': 202653, 'train_runtime': '102.3', 'train_tokens_per_second': '1981'}
103
+ {'loss': '1.607', 'grad_norm': '0.4382', 'learning_rate': '1.331e-05', 'epoch': '0.02689', 'num_input_tokens_seen': 204700, 'train_runtime': '103.3', 'train_tokens_per_second': '1982'}
104
+ {'loss': '1.306', 'grad_norm': '0.3857', 'learning_rate': '1.344e-05', 'epoch': '0.02716', 'num_input_tokens_seen': 206747, 'train_runtime': '104.3', 'train_tokens_per_second': '1982'}
105
+ {'loss': '1.775', 'grad_norm': '0.4403', 'learning_rate': '1.358e-05', 'epoch': '0.02743', 'num_input_tokens_seen': 208794, 'train_runtime': '105.3', 'train_tokens_per_second': '1982'}
106
+ {'loss': '1.163', 'grad_norm': '0.4105', 'learning_rate': '1.371e-05', 'epoch': '0.0277', 'num_input_tokens_seen': 210841, 'train_runtime': '106.4', 'train_tokens_per_second': '1982'}
107
+ {'loss': '1.773', 'grad_norm': '0.467', 'learning_rate': '1.384e-05', 'epoch': '0.02796', 'num_input_tokens_seen': 212888, 'train_runtime': '107.4', 'train_tokens_per_second': '1983'}
108
+ {'loss': '1.548', 'grad_norm': '0.4103', 'learning_rate': '1.398e-05', 'epoch': '0.02823', 'num_input_tokens_seen': 214935, 'train_runtime': '108.4', 'train_tokens_per_second': '1983'}
109
+ {'loss': '1.663', 'grad_norm': '0.4564', 'learning_rate': '1.411e-05', 'epoch': '0.0285', 'num_input_tokens_seen': 216982, 'train_runtime': '109.4', 'train_tokens_per_second': '1983'}
110
+ {'loss': '1.709', 'grad_norm': '0.5568', 'learning_rate': '1.425e-05', 'epoch': '0.02877', 'num_input_tokens_seen': 219029, 'train_runtime': '110.4', 'train_tokens_per_second': '1984'}
111
+ {'loss': '1.683', 'grad_norm': '0.4596', 'learning_rate': '1.438e-05', 'epoch': '0.02904', 'num_input_tokens_seen': 221076, 'train_runtime': '111.4', 'train_tokens_per_second': '1984'}
112
+ {'loss': '1.786', 'grad_norm': '0.488', 'learning_rate': '1.452e-05', 'epoch': '0.02931', 'num_input_tokens_seen': 223123, 'train_runtime': '112.5', 'train_tokens_per_second': '1984'}
113
+ {'loss': '1.593', 'grad_norm': '0.4877', 'learning_rate': '1.465e-05', 'epoch': '0.02958', 'num_input_tokens_seen': 225170, 'train_runtime': '113.5', 'train_tokens_per_second': '1984'}
114
+ {'loss': '1.144', 'grad_norm': '0.4087', 'learning_rate': '1.478e-05', 'epoch': '0.02985', 'num_input_tokens_seen': 227217, 'train_runtime': '114.5', 'train_tokens_per_second': '1985'}
115
+ {'loss': '1.632', 'grad_norm': '0.4522', 'learning_rate': '1.492e-05', 'epoch': '0.03012', 'num_input_tokens_seen': 229264, 'train_runtime': '115.5', 'train_tokens_per_second': '1985'}
116
+ {'loss': '1.575', 'grad_norm': '0.4504', 'learning_rate': '1.505e-05', 'epoch': '0.03038', 'num_input_tokens_seen': 231311, 'train_runtime': '116.5', 'train_tokens_per_second': '1985'}
117
+ {'loss': '1.705', 'grad_norm': '0.4647', 'learning_rate': '1.519e-05', 'epoch': '0.03065', 'num_input_tokens_seen': 233358, 'train_runtime': '117.5', 'train_tokens_per_second': '1985'}
118
+ {'loss': '1.651', 'grad_norm': '0.4929', 'learning_rate': '1.532e-05', 'epoch': '0.03092', 'num_input_tokens_seen': 235405, 'train_runtime': '118.6', 'train_tokens_per_second': '1986'}
119
+ {'loss': '1.614', 'grad_norm': '0.4435', 'learning_rate': '1.546e-05', 'epoch': '0.03119', 'num_input_tokens_seen': 237452, 'train_runtime': '119.6', 'train_tokens_per_second': '1986'}
120
+ {'loss': '1.159', 'grad_norm': '0.4458', 'learning_rate': '1.559e-05', 'epoch': '0.03146', 'num_input_tokens_seen': 239499, 'train_runtime': '120.6', 'train_tokens_per_second': '1986'}
121
+ {'loss': '1.606', 'grad_norm': '0.5428', 'learning_rate': '1.573e-05', 'epoch': '0.03173', 'num_input_tokens_seen': 241546, 'train_runtime': '121.6', 'train_tokens_per_second': '1986'}
122
+ {'loss': '1.744', 'grad_norm': '0.5349', 'learning_rate': '1.586e-05', 'epoch': '0.032', 'num_input_tokens_seen': 243593, 'train_runtime': '122.6', 'train_tokens_per_second': '1986'}
123
+ {'loss': '1.527', 'grad_norm': '0.5387', 'learning_rate': '1.599e-05', 'epoch': '0.03227', 'num_input_tokens_seen': 245640, 'train_runtime': '123.6', 'train_tokens_per_second': '1987'}
124
+ {'loss': '1.52', 'grad_norm': '0.5221', 'learning_rate': '1.613e-05', 'epoch': '0.03254', 'num_input_tokens_seen': 247687, 'train_runtime': '124.7', 'train_tokens_per_second': '1987'}
125
+ {'loss': '1.561', 'grad_norm': '0.537', 'learning_rate': '1.626e-05', 'epoch': '0.0328', 'num_input_tokens_seen': 249734, 'train_runtime': '125.7', 'train_tokens_per_second': '1987'}
126
+ {'loss': '1.633', 'grad_norm': '0.5059', 'learning_rate': '1.64e-05', 'epoch': '0.03307', 'num_input_tokens_seen': 251781, 'train_runtime': '126.7', 'train_tokens_per_second': '1987'}
127
+ {'loss': '1.475', 'grad_norm': '0.4845', 'learning_rate': '1.653e-05', 'epoch': '0.03334', 'num_input_tokens_seen': 253828, 'train_runtime': '127.7', 'train_tokens_per_second': '1987'}
128
+ {'loss': '1.531', 'grad_norm': '0.5408', 'learning_rate': '1.667e-05', 'epoch': '0.03361', 'num_input_tokens_seen': 255875, 'train_runtime': '128.7', 'train_tokens_per_second': '1988'}
129
+ {'loss': '1.483', 'grad_norm': '0.5341', 'learning_rate': '1.68e-05', 'epoch': '0.03388', 'num_input_tokens_seen': 257922, 'train_runtime': '129.8', 'train_tokens_per_second': '1988'}
130
+ {'loss': '1.496', 'grad_norm': '0.62', 'learning_rate': '1.694e-05', 'epoch': '0.03415', 'num_input_tokens_seen': 259969, 'train_runtime': '130.8', 'train_tokens_per_second': '1988'}
131
+ {'loss': '1.392', 'grad_norm': '0.5367', 'learning_rate': '1.707e-05', 'epoch': '0.03442', 'num_input_tokens_seen': 262016, 'train_runtime': '131.8', 'train_tokens_per_second': '1988'}
132
+ {'loss': '1.658', 'grad_norm': '0.6011', 'learning_rate': '1.72e-05', 'epoch': '0.03469', 'num_input_tokens_seen': 264063, 'train_runtime': '132.8', 'train_tokens_per_second': '1988'}
133
+ {'loss': '1.736', 'grad_norm': '0.6064', 'learning_rate': '1.734e-05', 'epoch': '0.03496', 'num_input_tokens_seen': 266110, 'train_runtime': '133.8', 'train_tokens_per_second': '1988'}
134
+ {'loss': '1.581', 'grad_norm': '0.5968', 'learning_rate': '1.747e-05', 'epoch': '0.03522', 'num_input_tokens_seen': 268157, 'train_runtime': '134.8', 'train_tokens_per_second': '1989'}
135
+ {'loss': '1.429', 'grad_norm': '0.4829', 'learning_rate': '1.761e-05', 'epoch': '0.03549', 'num_input_tokens_seen': 270204, 'train_runtime': '135.9', 'train_tokens_per_second': '1989'}
136
+ {'loss': '1.463', 'grad_norm': '0.5296', 'learning_rate': '1.774e-05', 'epoch': '0.03576', 'num_input_tokens_seen': 272251, 'train_runtime': '136.9', 'train_tokens_per_second': '1989'}
137
+ {'loss': '1.526', 'grad_norm': '0.6281', 'learning_rate': '1.788e-05', 'epoch': '0.03603', 'num_input_tokens_seen': 274298, 'train_runtime': '137.9', 'train_tokens_per_second': '1989'}
138
+ {'loss': '1.534', 'grad_norm': '0.6035', 'learning_rate': '1.801e-05', 'epoch': '0.0363', 'num_input_tokens_seen': 276345, 'train_runtime': '138.9', 'train_tokens_per_second': '1989'}
139
+ {'loss': '1.653', 'grad_norm': '0.5799', 'learning_rate': '1.815e-05', 'epoch': '0.03657', 'num_input_tokens_seen': 278392, 'train_runtime': '139.9', 'train_tokens_per_second': '1989'}
140
+ {'loss': '1.519', 'grad_norm': '0.6246', 'learning_rate': '1.828e-05', 'epoch': '0.03684', 'num_input_tokens_seen': 280439, 'train_runtime': '141', 'train_tokens_per_second': '1989'}
141
+ {'loss': '1.389', 'grad_norm': '0.5421', 'learning_rate': '1.841e-05', 'epoch': '0.03711', 'num_input_tokens_seen': 282486, 'train_runtime': '142', 'train_tokens_per_second': '1990'}
142
+ {'loss': '1.675', 'grad_norm': '0.6183', 'learning_rate': '1.855e-05', 'epoch': '0.03738', 'num_input_tokens_seen': 284533, 'train_runtime': '143', 'train_tokens_per_second': '1990'}
143
+ {'loss': '1.464', 'grad_norm': '0.5757', 'learning_rate': '1.868e-05', 'epoch': '0.03764', 'num_input_tokens_seen': 286580, 'train_runtime': '144', 'train_tokens_per_second': '1990'}
144
+ {'loss': '1.458', 'grad_norm': '0.5838', 'learning_rate': '1.882e-05', 'epoch': '0.03791', 'num_input_tokens_seen': 288627, 'train_runtime': '145', 'train_tokens_per_second': '1990'}
145
+ {'loss': '1.58', 'grad_norm': '0.6429', 'learning_rate': '1.895e-05', 'epoch': '0.03818', 'num_input_tokens_seen': 290674, 'train_runtime': '146.1', 'train_tokens_per_second': '1990'}
146
+ {'loss': '1.327', 'grad_norm': '0.571', 'learning_rate': '1.909e-05', 'epoch': '0.03845', 'num_input_tokens_seen': 292721, 'train_runtime': '147.1', 'train_tokens_per_second': '1990'}
147
+ {'loss': '1.603', 'grad_norm': '0.6355', 'learning_rate': '1.922e-05', 'epoch': '0.03872', 'num_input_tokens_seen': 294768, 'train_runtime': '148.1', 'train_tokens_per_second': '1990'}
148
+ {'loss': '1.377', 'grad_norm': '0.5791', 'learning_rate': '1.935e-05', 'epoch': '0.03899', 'num_input_tokens_seen': 296815, 'train_runtime': '149.1', 'train_tokens_per_second': '1991'}
149
+ {'loss': '1.409', 'grad_norm': '0.6662', 'learning_rate': '1.949e-05', 'epoch': '0.03926', 'num_input_tokens_seen': 298862, 'train_runtime': '150.1', 'train_tokens_per_second': '1991'}
150
+ {'loss': '1.234', 'grad_norm': '0.5858', 'learning_rate': '1.962e-05', 'epoch': '0.03953', 'num_input_tokens_seen': 300909, 'train_runtime': '151.2', 'train_tokens_per_second': '1991'}
151
+ {'loss': '1.58', 'grad_norm': '0.6273', 'learning_rate': '1.976e-05', 'epoch': '0.0398', 'num_input_tokens_seen': 302956, 'train_runtime': '152.2', 'train_tokens_per_second': '1991'}
152
+ {'loss': '1.393', 'grad_norm': '0.6303', 'learning_rate': '1.989e-05', 'epoch': '0.04006', 'num_input_tokens_seen': 305003, 'train_runtime': '153.2', 'train_tokens_per_second': '1991'}
153
+ {'loss': '1.48', 'grad_norm': '0.7072', 'learning_rate': '2.003e-05', 'epoch': '0.04033', 'num_input_tokens_seen': 307050, 'train_runtime': '154.2', 'train_tokens_per_second': '1991'}
154
+ {'loss': '1.548', 'grad_norm': '0.7448', 'learning_rate': '2.016e-05', 'epoch': '0.0406', 'num_input_tokens_seen': 309097, 'train_runtime': '155.2', 'train_tokens_per_second': '1991'}
155
+ {'loss': '1.567', 'grad_norm': '0.7425', 'learning_rate': '2.03e-05', 'epoch': '0.04087', 'num_input_tokens_seen': 311144, 'train_runtime': '156.2', 'train_tokens_per_second': '1991'}
156
+ {'loss': '1.282', 'grad_norm': '0.5985', 'learning_rate': '2.043e-05', 'epoch': '0.04114', 'num_input_tokens_seen': 313191, 'train_runtime': '157.3', 'train_tokens_per_second': '1992'}
157
+ {'loss': '1.438', 'grad_norm': '0.7234', 'learning_rate': '2.056e-05', 'epoch': '0.04141', 'num_input_tokens_seen': 315238, 'train_runtime': '158.3', 'train_tokens_per_second': '1992'}
158
+ {'loss': '1.454', 'grad_norm': '0.6636', 'learning_rate': '2.07e-05', 'epoch': '0.04168', 'num_input_tokens_seen': 317285, 'train_runtime': '159.3', 'train_tokens_per_second': '1992'}
159
+ {'loss': '1.461', 'grad_norm': '0.7192', 'learning_rate': '2.083e-05', 'epoch': '0.04195', 'num_input_tokens_seen': 319332, 'train_runtime': '160.3', 'train_tokens_per_second': '1992'}
160
+ {'loss': '1.385', 'grad_norm': '0.7114', 'learning_rate': '2.097e-05', 'epoch': '0.04222', 'num_input_tokens_seen': 321379, 'train_runtime': '161.3', 'train_tokens_per_second': '1992'}
161
+ {'loss': '1.568', 'grad_norm': '0.9612', 'learning_rate': '2.11e-05', 'epoch': '0.04248', 'num_input_tokens_seen': 323426, 'train_runtime': '162.4', 'train_tokens_per_second': '1992'}
162
+ {'loss': '1.551', 'grad_norm': '0.7511', 'learning_rate': '2.124e-05', 'epoch': '0.04275', 'num_input_tokens_seen': 325473, 'train_runtime': '163.4', 'train_tokens_per_second': '1992'}
163
+ {'loss': '1.468', 'grad_norm': '0.771', 'learning_rate': '2.137e-05', 'epoch': '0.04302', 'num_input_tokens_seen': 327520, 'train_runtime': '164.4', 'train_tokens_per_second': '1992'}
164
+ {'loss': '1.486', 'grad_norm': '0.7804', 'learning_rate': '2.151e-05', 'epoch': '0.04329', 'num_input_tokens_seen': 329567, 'train_runtime': '165.4', 'train_tokens_per_second': '1992'}
165
+ {'loss': '1.426', 'grad_norm': '0.8093', 'learning_rate': '2.164e-05', 'epoch': '0.04356', 'num_input_tokens_seen': 331614, 'train_runtime': '166.4', 'train_tokens_per_second': '1992'}
166
+ {'loss': '1.331', 'grad_norm': '0.7181', 'learning_rate': '2.177e-05', 'epoch': '0.04383', 'num_input_tokens_seen': 333661, 'train_runtime': '167.4', 'train_tokens_per_second': '1993'}
167
+ {'loss': '1.026', 'grad_norm': '0.7177', 'learning_rate': '2.191e-05', 'epoch': '0.0441', 'num_input_tokens_seen': 335708, 'train_runtime': '168.5', 'train_tokens_per_second': '1993'}
168
+ {'loss': '1.391', 'grad_norm': '0.7581', 'learning_rate': '2.204e-05', 'epoch': '0.04437', 'num_input_tokens_seen': 337755, 'train_runtime': '169.5', 'train_tokens_per_second': '1993'}
169
+ {'loss': '1.388', 'grad_norm': '0.8128', 'learning_rate': '2.218e-05', 'epoch': '0.04464', 'num_input_tokens_seen': 339802, 'train_runtime': '170.5', 'train_tokens_per_second': '1993'}
170
+ {'loss': '1.494', 'grad_norm': '0.8851', 'learning_rate': '2.231e-05', 'epoch': '0.0449', 'num_input_tokens_seen': 341849, 'train_runtime': '171.5', 'train_tokens_per_second': '1993'}
171
+ {'loss': '1.275', 'grad_norm': '0.741', 'learning_rate': '2.245e-05', 'epoch': '0.04517', 'num_input_tokens_seen': 343896, 'train_runtime': '172.5', 'train_tokens_per_second': '1993'}
172
+ {'loss': '1.307', 'grad_norm': '0.7937', 'learning_rate': '2.258e-05', 'epoch': '0.04544', 'num_input_tokens_seen': 345943, 'train_runtime': '173.6', 'train_tokens_per_second': '1993'}
173
+ {'loss': '1.188', 'grad_norm': '0.758', 'learning_rate': '2.272e-05', 'epoch': '0.04571', 'num_input_tokens_seen': 347990, 'train_runtime': '174.6', 'train_tokens_per_second': '1993'}
174
+ {'loss': '1.371', 'grad_norm': '0.8093', 'learning_rate': '2.285e-05', 'epoch': '0.04598', 'num_input_tokens_seen': 350037, 'train_runtime': '175.6', 'train_tokens_per_second': '1993'}
175
+ {'loss': '1.234', 'grad_norm': '0.7643', 'learning_rate': '2.298e-05', 'epoch': '0.04625', 'num_input_tokens_seen': 352084, 'train_runtime': '176.6', 'train_tokens_per_second': '1994'}
176
+ {'loss': '1.437', 'grad_norm': '0.8591', 'learning_rate': '2.312e-05', 'epoch': '0.04652', 'num_input_tokens_seen': 354131, 'train_runtime': '177.6', 'train_tokens_per_second': '1994'}
177
+ {'loss': '1.425', 'grad_norm': '1.101', 'learning_rate': '2.325e-05', 'epoch': '0.04679', 'num_input_tokens_seen': 356178, 'train_runtime': '178.7', 'train_tokens_per_second': '1994'}
178
+ {'loss': '1.402', 'grad_norm': '0.8633', 'learning_rate': '2.339e-05', 'epoch': '0.04706', 'num_input_tokens_seen': 358225, 'train_runtime': '179.7', 'train_tokens_per_second': '1994'}
179
+ {'loss': '1.33', 'grad_norm': '0.9336', 'learning_rate': '2.352e-05', 'epoch': '0.04732', 'num_input_tokens_seen': 360272, 'train_runtime': '180.7', 'train_tokens_per_second': '1994'}
180
+ {'loss': '1.189', 'grad_norm': '0.9058', 'learning_rate': '2.366e-05', 'epoch': '0.04759', 'num_input_tokens_seen': 362319, 'train_runtime': '181.7', 'train_tokens_per_second': '1994'}
181
+ {'loss': '1.383', 'grad_norm': '1.003', 'learning_rate': '2.379e-05', 'epoch': '0.04786', 'num_input_tokens_seen': 364366, 'train_runtime': '182.7', 'train_tokens_per_second': '1994'}
182
+ {'loss': '1.263', 'grad_norm': '0.949', 'learning_rate': '2.392e-05', 'epoch': '0.04813', 'num_input_tokens_seen': 366413, 'train_runtime': '183.7', 'train_tokens_per_second': '1994'}
183
+ {'loss': '1.473', 'grad_norm': '1.062', 'learning_rate': '2.406e-05', 'epoch': '0.0484', 'num_input_tokens_seen': 368460, 'train_runtime': '184.8', 'train_tokens_per_second': '1994'}
184
+ {'loss': '1.218', 'grad_norm': '0.862', 'learning_rate': '2.419e-05', 'epoch': '0.04867', 'num_input_tokens_seen': 370507, 'train_runtime': '185.8', 'train_tokens_per_second': '1994'}
185
+ {'loss': '1.232', 'grad_norm': '1.03', 'learning_rate': '2.433e-05', 'epoch': '0.04894', 'num_input_tokens_seen': 372554, 'train_runtime': '186.8', 'train_tokens_per_second': '1994'}
186
+ {'loss': '1.243', 'grad_norm': '0.9608', 'learning_rate': '2.446e-05', 'epoch': '0.04921', 'num_input_tokens_seen': 374601, 'train_runtime': '187.8', 'train_tokens_per_second': '1994'}
187
+ {'loss': '1.423', 'grad_norm': '0.9823', 'learning_rate': '2.46e-05', 'epoch': '0.04948', 'num_input_tokens_seen': 376648, 'train_runtime': '188.8', 'train_tokens_per_second': '1995'}
188
+ {'loss': '1.176', 'grad_norm': '0.9865', 'learning_rate': '2.473e-05', 'epoch': '0.04974', 'num_input_tokens_seen': 378695, 'train_runtime': '189.9', 'train_tokens_per_second': '1995'}
189
+ {'loss': '1.323', 'grad_norm': '1.114', 'learning_rate': '2.487e-05', 'epoch': '0.05001', 'num_input_tokens_seen': 380742, 'train_runtime': '190.9', 'train_tokens_per_second': '1995'}
190
+ {'loss': '1.394', 'grad_norm': '1.221', 'learning_rate': '2.5e-05', 'epoch': '0.05028', 'num_input_tokens_seen': 382789, 'train_runtime': '191.9', 'train_tokens_per_second': '1995'}
191
+ {'loss': '1.228', 'grad_norm': '0.9599', 'learning_rate': '2.513e-05', 'epoch': '0.05055', 'num_input_tokens_seen': 384836, 'train_runtime': '192.9', 'train_tokens_per_second': '1995'}
192
+ {'loss': '0.9697', 'grad_norm': '6.034', 'learning_rate': '2.527e-05', 'epoch': '0.05082', 'num_input_tokens_seen': 386883, 'train_runtime': '193.9', 'train_tokens_per_second': '1995'}
193
+ {'loss': '1.253', 'grad_norm': '1.301', 'learning_rate': '2.54e-05', 'epoch': '0.05109', 'num_input_tokens_seen': 388930, 'train_runtime': '195', 'train_tokens_per_second': '1995'}
194
+ {'loss': '1.398', 'grad_norm': '1.082', 'learning_rate': '2.554e-05', 'epoch': '0.05136', 'num_input_tokens_seen': 390977, 'train_runtime': '196', 'train_tokens_per_second': '1995'}
195
+ {'loss': '1.206', 'grad_norm': '0.9854', 'learning_rate': '2.567e-05', 'epoch': '0.05163', 'num_input_tokens_seen': 393024, 'train_runtime': '197', 'train_tokens_per_second': '1995'}
196
+ {'loss': '1.27', 'grad_norm': '1.037', 'learning_rate': '2.581e-05', 'epoch': '0.0519', 'num_input_tokens_seen': 395071, 'train_runtime': '198', 'train_tokens_per_second': '1995'}
197
+ {'loss': '1.292', 'grad_norm': '1.057', 'learning_rate': '2.594e-05', 'epoch': '0.05216', 'num_input_tokens_seen': 397118, 'train_runtime': '199', 'train_tokens_per_second': '1995'}
198
+ {'loss': '1.272', 'grad_norm': '1.172', 'learning_rate': '2.608e-05', 'epoch': '0.05243', 'num_input_tokens_seen': 399165, 'train_runtime': '200.1', 'train_tokens_per_second': '1995'}
199
+ {'loss': '1.189', 'grad_norm': '1.225', 'learning_rate': '2.621e-05', 'epoch': '0.0527', 'num_input_tokens_seen': 401212, 'train_runtime': '201.1', 'train_tokens_per_second': '1995'}
200
+ {'loss': '1.259', 'grad_norm': '1.042', 'learning_rate': '2.634e-05', 'epoch': '0.05297', 'num_input_tokens_seen': 403259, 'train_runtime': '202.1', 'train_tokens_per_second': '1995'}
201
+ {'loss': '1.39', 'grad_norm': '1.074', 'learning_rate': '2.648e-05', 'epoch': '0.05324', 'num_input_tokens_seen': 405306, 'train_runtime': '203.1', 'train_tokens_per_second': '1995'}
202
+ {'loss': '1.152', 'grad_norm': '1.069', 'learning_rate': '2.661e-05', 'epoch': '0.05351', 'num_input_tokens_seen': 407353, 'train_runtime': '204.1', 'train_tokens_per_second': '1996'}
203
+ {'loss': '1.174', 'grad_norm': '1.032', 'learning_rate': '2.675e-05', 'epoch': '0.05378', 'num_input_tokens_seen': 409400, 'train_runtime': '205.1', 'train_tokens_per_second': '1996'}
204
+ {'loss': '1.095', 'grad_norm': '1.298', 'learning_rate': '2.688e-05', 'epoch': '0.05405', 'num_input_tokens_seen': 411447, 'train_runtime': '206.2', 'train_tokens_per_second': '1996'}
205
+ {'loss': '1.309', 'grad_norm': '1.595', 'learning_rate': '2.702e-05', 'epoch': '0.05432', 'num_input_tokens_seen': 413494, 'train_runtime': '207.2', 'train_tokens_per_second': '1996'}
206
+ {'loss': '1.312', 'grad_norm': '1.278', 'learning_rate': '2.715e-05', 'epoch': '0.05458', 'num_input_tokens_seen': 415541, 'train_runtime': '208.2', 'train_tokens_per_second': '1996'}
207
+ {'loss': '1.237', 'grad_norm': '1.138', 'learning_rate': '2.728e-05', 'epoch': '0.05485', 'num_input_tokens_seen': 417588, 'train_runtime': '209.2', 'train_tokens_per_second': '1996'}
208
+ {'loss': '1.268', 'grad_norm': '1.082', 'learning_rate': '2.742e-05', 'epoch': '0.05512', 'num_input_tokens_seen': 419635, 'train_runtime': '210.2', 'train_tokens_per_second': '1996'}
209
+ {'loss': '1.158', 'grad_norm': '1.197', 'learning_rate': '2.755e-05', 'epoch': '0.05539', 'num_input_tokens_seen': 421682, 'train_runtime': '211.3', 'train_tokens_per_second': '1996'}
210
+ {'loss': '1.114', 'grad_norm': '1.105', 'learning_rate': '2.769e-05', 'epoch': '0.05566', 'num_input_tokens_seen': 423729, 'train_runtime': '212.3', 'train_tokens_per_second': '1996'}
211
+ {'loss': '1.301', 'grad_norm': '1.159', 'learning_rate': '2.782e-05', 'epoch': '0.05593', 'num_input_tokens_seen': 425776, 'train_runtime': '213.3', 'train_tokens_per_second': '1996'}
212
+ {'loss': '1.239', 'grad_norm': '1.363', 'learning_rate': '2.796e-05', 'epoch': '0.0562', 'num_input_tokens_seen': 427823, 'train_runtime': '214.3', 'train_tokens_per_second': '1996'}
213
+ {'loss': '1.175', 'grad_norm': '1.773', 'learning_rate': '2.809e-05', 'epoch': '0.05647', 'num_input_tokens_seen': 429870, 'train_runtime': '215.3', 'train_tokens_per_second': '1996'}
214
+ {'loss': '1.356', 'grad_norm': '1.097', 'learning_rate': '2.823e-05', 'epoch': '0.05674', 'num_input_tokens_seen': 431917, 'train_runtime': '216.3', 'train_tokens_per_second': '1996'}
215
+ {'loss': '1.229', 'grad_norm': '1.221', 'learning_rate': '2.836e-05', 'epoch': '0.057', 'num_input_tokens_seen': 433964, 'train_runtime': '217.4', 'train_tokens_per_second': '1996'}
216
+ {'loss': '1.174', 'grad_norm': '1.157', 'learning_rate': '2.849e-05', 'epoch': '0.05727', 'num_input_tokens_seen': 436011, 'train_runtime': '218.4', 'train_tokens_per_second': '1996'}
217
+ {'loss': '1.068', 'grad_norm': '1.14', 'learning_rate': '2.863e-05', 'epoch': '0.05754', 'num_input_tokens_seen': 438058, 'train_runtime': '219.4', 'train_tokens_per_second': '1997'}
218
+ {'loss': '1.053', 'grad_norm': '1.118', 'learning_rate': '2.876e-05', 'epoch': '0.05781', 'num_input_tokens_seen': 440105, 'train_runtime': '220.4', 'train_tokens_per_second': '1997'}
219
+ {'loss': '1.067', 'grad_norm': '1.103', 'learning_rate': '2.89e-05', 'epoch': '0.05808', 'num_input_tokens_seen': 442152, 'train_runtime': '221.4', 'train_tokens_per_second': '1997'}
220
+ {'loss': '1.2', 'grad_norm': '1.188', 'learning_rate': '2.903e-05', 'epoch': '0.05835', 'num_input_tokens_seen': 444199, 'train_runtime': '222.5', 'train_tokens_per_second': '1997'}
221
+ {'loss': '1.151', 'grad_norm': '1.14', 'learning_rate': '2.917e-05', 'epoch': '0.05862', 'num_input_tokens_seen': 446246, 'train_runtime': '223.5', 'train_tokens_per_second': '1997'}
222
+ {'loss': '1.142', 'grad_norm': '1.394', 'learning_rate': '2.93e-05', 'epoch': '0.05889', 'num_input_tokens_seen': 448293, 'train_runtime': '224.5', 'train_tokens_per_second': '1997'}
223
+ {'loss': '1.119', 'grad_norm': '1.263', 'learning_rate': '2.944e-05', 'epoch': '0.05916', 'num_input_tokens_seen': 450340, 'train_runtime': '225.5', 'train_tokens_per_second': '1997'}
224
+ {'loss': '1.024', 'grad_norm': '1.189', 'learning_rate': '2.957e-05', 'epoch': '0.05942', 'num_input_tokens_seen': 452387, 'train_runtime': '226.5', 'train_tokens_per_second': '1997'}
225
+ {'loss': '1.227', 'grad_norm': '1.225', 'learning_rate': '2.97e-05', 'epoch': '0.05969', 'num_input_tokens_seen': 454434, 'train_runtime': '227.5', 'train_tokens_per_second': '1997'}
226
+ {'loss': '0.956', 'grad_norm': '1.254', 'learning_rate': '2.984e-05', 'epoch': '0.05996', 'num_input_tokens_seen': 456481, 'train_runtime': '228.6', 'train_tokens_per_second': '1997'}
227
+ {'loss': '1.182', 'grad_norm': '1.283', 'learning_rate': '2.997e-05', 'epoch': '0.06023', 'num_input_tokens_seen': 458528, 'train_runtime': '229.6', 'train_tokens_per_second': '1997'}
228
+ {'loss': '1.112', 'grad_norm': '1.259', 'learning_rate': '3.011e-05', 'epoch': '0.0605', 'num_input_tokens_seen': 460575, 'train_runtime': '230.6', 'train_tokens_per_second': '1997'}
229
+ {'loss': '1.142', 'grad_norm': '1.316', 'learning_rate': '3.024e-05', 'epoch': '0.06077', 'num_input_tokens_seen': 462622, 'train_runtime': '231.6', 'train_tokens_per_second': '1997'}
230
+ {'loss': '0.6945', 'grad_norm': '0.9457', 'learning_rate': '3.038e-05', 'epoch': '0.06104', 'num_input_tokens_seen': 464669, 'train_runtime': '232.6', 'train_tokens_per_second': '1997'}
231
+ {'loss': '1.13', 'grad_norm': '1.351', 'learning_rate': '3.051e-05', 'epoch': '0.06131', 'num_input_tokens_seen': 466716, 'train_runtime': '233.7', 'train_tokens_per_second': '1997'}
232
+ {'loss': '1.041', 'grad_norm': '1.223', 'learning_rate': '3.065e-05', 'epoch': '0.06158', 'num_input_tokens_seen': 468763, 'train_runtime': '234.7', 'train_tokens_per_second': '1998'}
233
+ {'loss': '0.9028', 'grad_norm': '1.262', 'learning_rate': '3.078e-05', 'epoch': '0.06184', 'num_input_tokens_seen': 470810, 'train_runtime': '235.7', 'train_tokens_per_second': '1998'}
234
+ {'loss': '1.115', 'grad_norm': '1.139', 'learning_rate': '3.091e-05', 'epoch': '0.06211', 'num_input_tokens_seen': 472857, 'train_runtime': '236.7', 'train_tokens_per_second': '1998'}
235
+ {'loss': '1.181', 'grad_norm': '1.256', 'learning_rate': '3.105e-05', 'epoch': '0.06238', 'num_input_tokens_seen': 474904, 'train_runtime': '237.7', 'train_tokens_per_second': '1998'}
236
+ {'loss': '1.177', 'grad_norm': '1.267', 'learning_rate': '3.118e-05', 'epoch': '0.06265', 'num_input_tokens_seen': 476951, 'train_runtime': '238.7', 'train_tokens_per_second': '1998'}
237
+ {'loss': '1.119', 'grad_norm': '1.256', 'learning_rate': '3.132e-05', 'epoch': '0.06292', 'num_input_tokens_seen': 478998, 'train_runtime': '239.8', 'train_tokens_per_second': '1997'}
238
+ {'loss': '1.147', 'grad_norm': '1.309', 'learning_rate': '3.145e-05', 'epoch': '0.06319', 'num_input_tokens_seen': 481045, 'train_runtime': '240.8', 'train_tokens_per_second': '1997'}
239
+ {'loss': '1.062', 'grad_norm': '1.274', 'learning_rate': '3.159e-05', 'epoch': '0.06346', 'num_input_tokens_seen': 483092, 'train_runtime': '241.9', 'train_tokens_per_second': '1997'}
240
+ {'loss': '1.136', 'grad_norm': '1.696', 'learning_rate': '3.172e-05', 'epoch': '0.06373', 'num_input_tokens_seen': 485139, 'train_runtime': '242.9', 'train_tokens_per_second': '1998'}
241
+ {'loss': '1.039', 'grad_norm': '1.379', 'learning_rate': '3.185e-05', 'epoch': '0.064', 'num_input_tokens_seen': 487186, 'train_runtime': '243.9', 'train_tokens_per_second': '1998'}
242
+ {'loss': '1.247', 'grad_norm': '1.521', 'learning_rate': '3.199e-05', 'epoch': '0.06426', 'num_input_tokens_seen': 489233, 'train_runtime': '244.9', 'train_tokens_per_second': '1998'}
243
+ {'loss': '1.183', 'grad_norm': '1.438', 'learning_rate': '3.212e-05', 'epoch': '0.06453', 'num_input_tokens_seen': 491280, 'train_runtime': '245.9', 'train_tokens_per_second': '1998'}
244
+ {'loss': '1.089', 'grad_norm': '1.296', 'learning_rate': '3.226e-05', 'epoch': '0.0648', 'num_input_tokens_seen': 493327, 'train_runtime': '246.9', 'train_tokens_per_second': '1998'}
245
+ {'loss': '1.182', 'grad_norm': '1.303', 'learning_rate': '3.239e-05', 'epoch': '0.06507', 'num_input_tokens_seen': 495374, 'train_runtime': '248', 'train_tokens_per_second': '1998'}
246
+ {'loss': '1.184', 'grad_norm': '1.186', 'learning_rate': '3.253e-05', 'epoch': '0.06534', 'num_input_tokens_seen': 497421, 'train_runtime': '249', 'train_tokens_per_second': '1998'}
247
+ {'loss': '0.8177', 'grad_norm': '1.317', 'learning_rate': '3.266e-05', 'epoch': '0.06561', 'num_input_tokens_seen': 499468, 'train_runtime': '250', 'train_tokens_per_second': '1998'}
248
+ {'loss': '1.119', 'grad_norm': '1.171', 'learning_rate': '3.28e-05', 'epoch': '0.06588', 'num_input_tokens_seen': 501515, 'train_runtime': '251', 'train_tokens_per_second': '1998'}
249
+ {'loss': '0.9268', 'grad_norm': '1.47', 'learning_rate': '3.293e-05', 'epoch': '0.06615', 'num_input_tokens_seen': 503562, 'train_runtime': '252', 'train_tokens_per_second': '1998'}
250
+ {'loss': '0.8829', 'grad_norm': '1.611', 'learning_rate': '3.306e-05', 'epoch': '0.06642', 'num_input_tokens_seen': 505609, 'train_runtime': '253.1', 'train_tokens_per_second': '1998'}
251
+ {'loss': '1.069', 'grad_norm': '1.647', 'learning_rate': '3.32e-05', 'epoch': '0.06668', 'num_input_tokens_seen': 507656, 'train_runtime': '254.1', 'train_tokens_per_second': '1998'}
252
+ {'loss': '0.9165', 'grad_norm': '1.581', 'learning_rate': '3.333e-05', 'epoch': '0.06695', 'num_input_tokens_seen': 509703, 'train_runtime': '255.1', 'train_tokens_per_second': '1998'}
253
+ {'loss': '1.079', 'grad_norm': '1.815', 'learning_rate': '3.347e-05', 'epoch': '0.06722', 'num_input_tokens_seen': 511750, 'train_runtime': '256.1', 'train_tokens_per_second': '1998'}
254
+ {'loss': '0.8549', 'grad_norm': '1.626', 'learning_rate': '3.36e-05', 'epoch': '0.06749', 'num_input_tokens_seen': 513797, 'train_runtime': '257.1', 'train_tokens_per_second': '1998'}
255
+ {'loss': '0.8964', 'grad_norm': '1.216', 'learning_rate': '3.374e-05', 'epoch': '0.06776', 'num_input_tokens_seen': 515844, 'train_runtime': '258.2', 'train_tokens_per_second': '1998'}
256
+ {'loss': '0.9361', 'grad_norm': '1.345', 'learning_rate': '3.387e-05', 'epoch': '0.06803', 'num_input_tokens_seen': 517891, 'train_runtime': '259.2', 'train_tokens_per_second': '1998'}
257
+ {'loss': '0.8836', 'grad_norm': '1.337', 'learning_rate': '3.401e-05', 'epoch': '0.0683', 'num_input_tokens_seen': 519938, 'train_runtime': '260.2', 'train_tokens_per_second': '1998'}
258
+ {'loss': '1.104', 'grad_norm': '1.467', 'learning_rate': '3.414e-05', 'epoch': '0.06857', 'num_input_tokens_seen': 521985, 'train_runtime': '261.2', 'train_tokens_per_second': '1998'}
259
+ {'loss': '1.308', 'grad_norm': '1.429', 'learning_rate': '3.427e-05', 'epoch': '0.06884', 'num_input_tokens_seen': 524032, 'train_runtime': '262.2', 'train_tokens_per_second': '1998'}
260
+ {'loss': '1.079', 'grad_norm': '1.394', 'learning_rate': '3.441e-05', 'epoch': '0.0691', 'num_input_tokens_seen': 526079, 'train_runtime': '263.3', 'train_tokens_per_second': '1998'}
261
+ {'loss': '1.033', 'grad_norm': '1.304', 'learning_rate': '3.454e-05', 'epoch': '0.06937', 'num_input_tokens_seen': 528126, 'train_runtime': '264.3', 'train_tokens_per_second': '1998'}
262
+ {'loss': '0.9466', 'grad_norm': '1.488', 'learning_rate': '3.468e-05', 'epoch': '0.06964', 'num_input_tokens_seen': 530173, 'train_runtime': '265.3', 'train_tokens_per_second': '1999'}
263
+ {'loss': '1.045', 'grad_norm': '1.277', 'learning_rate': '3.481e-05', 'epoch': '0.06991', 'num_input_tokens_seen': 532220, 'train_runtime': '266.3', 'train_tokens_per_second': '1999'}
264
+ {'loss': '0.9476', 'grad_norm': '1.584', 'learning_rate': '3.495e-05', 'epoch': '0.07018', 'num_input_tokens_seen': 534267, 'train_runtime': '267.3', 'train_tokens_per_second': '1999'}
265
+ {'loss': '0.7732', 'grad_norm': '1.766', 'learning_rate': '3.508e-05', 'epoch': '0.07045', 'num_input_tokens_seen': 536314, 'train_runtime': '268.3', 'train_tokens_per_second': '1999'}
266
+ {'loss': '0.9556', 'grad_norm': '1.519', 'learning_rate': '3.522e-05', 'epoch': '0.07072', 'num_input_tokens_seen': 538361, 'train_runtime': '269.4', 'train_tokens_per_second': '1999'}
267
+ {'loss': '0.7371', 'grad_norm': '1.619', 'learning_rate': '3.535e-05', 'epoch': '0.07099', 'num_input_tokens_seen': 540408, 'train_runtime': '270.4', 'train_tokens_per_second': '1999'}
268
+ {'loss': '1.137', 'grad_norm': '1.548', 'learning_rate': '3.548e-05', 'epoch': '0.07126', 'num_input_tokens_seen': 542455, 'train_runtime': '271.4', 'train_tokens_per_second': '1999'}
269
+ {'loss': '0.9216', 'grad_norm': '1.755', 'learning_rate': '3.562e-05', 'epoch': '0.07152', 'num_input_tokens_seen': 544502, 'train_runtime': '272.4', 'train_tokens_per_second': '1999'}
270
+ {'loss': '1.001', 'grad_norm': '1.717', 'learning_rate': '3.575e-05', 'epoch': '0.07179', 'num_input_tokens_seen': 546549, 'train_runtime': '273.4', 'train_tokens_per_second': '1999'}
271
+ {'loss': '0.8521', 'grad_norm': '1.497', 'learning_rate': '3.589e-05', 'epoch': '0.07206', 'num_input_tokens_seen': 548596, 'train_runtime': '274.5', 'train_tokens_per_second': '1999'}
272
+ {'loss': '0.9487', 'grad_norm': '1.475', 'learning_rate': '3.602e-05', 'epoch': '0.07233', 'num_input_tokens_seen': 550643, 'train_runtime': '275.5', 'train_tokens_per_second': '1999'}
273
+ {'loss': '1.001', 'grad_norm': '1.486', 'learning_rate': '3.616e-05', 'epoch': '0.0726', 'num_input_tokens_seen': 552690, 'train_runtime': '276.5', 'train_tokens_per_second': '1999'}
274
+ {'loss': '0.9512', 'grad_norm': '1.384', 'learning_rate': '3.629e-05', 'epoch': '0.07287', 'num_input_tokens_seen': 554737, 'train_runtime': '277.5', 'train_tokens_per_second': '1999'}
275
+ {'loss': '0.9048', 'grad_norm': '1.256', 'learning_rate': '3.642e-05', 'epoch': '0.07314', 'num_input_tokens_seen': 556784, 'train_runtime': '278.5', 'train_tokens_per_second': '1999'}
276
+ {'loss': '0.966', 'grad_norm': '1.58', 'learning_rate': '3.656e-05', 'epoch': '0.07341', 'num_input_tokens_seen': 558831, 'train_runtime': '279.5', 'train_tokens_per_second': '1999'}
277
+ {'loss': '1.121', 'grad_norm': '1.473', 'learning_rate': '3.669e-05', 'epoch': '0.07368', 'num_input_tokens_seen': 560878, 'train_runtime': '280.6', 'train_tokens_per_second': '1999'}
278
+ {'loss': '0.9792', 'grad_norm': '1.466', 'learning_rate': '3.683e-05', 'epoch': '0.07394', 'num_input_tokens_seen': 562925, 'train_runtime': '281.6', 'train_tokens_per_second': '1999'}
279
+ {'loss': '0.7847', 'grad_norm': '1.34', 'learning_rate': '3.696e-05', 'epoch': '0.07421', 'num_input_tokens_seen': 564972, 'train_runtime': '282.6', 'train_tokens_per_second': '1999'}
280
+ {'loss': '0.9178', 'grad_norm': '1.556', 'learning_rate': '3.71e-05', 'epoch': '0.07448', 'num_input_tokens_seen': 567019, 'train_runtime': '283.6', 'train_tokens_per_second': '1999'}
281
+ {'loss': '0.7879', 'grad_norm': '1.819', 'learning_rate': '3.723e-05', 'epoch': '0.07475', 'num_input_tokens_seen': 569066, 'train_runtime': '284.6', 'train_tokens_per_second': '1999'}
282
+ {'loss': '0.9185', 'grad_norm': '1.563', 'learning_rate': '3.737e-05', 'epoch': '0.07502', 'num_input_tokens_seen': 571113, 'train_runtime': '285.6', 'train_tokens_per_second': '1999'}
283
+ {'loss': '0.9971', 'grad_norm': '1.695', 'learning_rate': '3.75e-05', 'epoch': '0.07529', 'num_input_tokens_seen': 573160, 'train_runtime': '286.7', 'train_tokens_per_second': '1999'}
284
+ {'loss': '0.7991', 'grad_norm': '1.747', 'learning_rate': '3.763e-05', 'epoch': '0.07556', 'num_input_tokens_seen': 575207, 'train_runtime': '287.7', 'train_tokens_per_second': '1999'}
285
+ {'loss': '0.7907', 'grad_norm': '1.532', 'learning_rate': '3.777e-05', 'epoch': '0.07583', 'num_input_tokens_seen': 577254, 'train_runtime': '288.7', 'train_tokens_per_second': '2000'}
286
+ {'loss': '0.977', 'grad_norm': '1.455', 'learning_rate': '3.79e-05', 'epoch': '0.0761', 'num_input_tokens_seen': 579301, 'train_runtime': '289.7', 'train_tokens_per_second': '2000'}
287
+ {'loss': '0.7108', 'grad_norm': '1.527', 'learning_rate': '3.804e-05', 'epoch': '0.07636', 'num_input_tokens_seen': 581348, 'train_runtime': '290.7', 'train_tokens_per_second': '2000'}
288
+ {'loss': '1.001', 'grad_norm': '1.406', 'learning_rate': '3.817e-05', 'epoch': '0.07663', 'num_input_tokens_seen': 583395, 'train_runtime': '291.8', 'train_tokens_per_second': '2000'}
289
+ {'loss': '0.9314', 'grad_norm': '1.623', 'learning_rate': '3.831e-05', 'epoch': '0.0769', 'num_input_tokens_seen': 585442, 'train_runtime': '292.8', 'train_tokens_per_second': '2000'}
290
+ {'loss': '0.8645', 'grad_norm': '1.463', 'learning_rate': '3.844e-05', 'epoch': '0.07717', 'num_input_tokens_seen': 587489, 'train_runtime': '293.8', 'train_tokens_per_second': '2000'}
291
+ {'loss': '0.9374', 'grad_norm': '1.443', 'learning_rate': '3.858e-05', 'epoch': '0.07744', 'num_input_tokens_seen': 589536, 'train_runtime': '294.8', 'train_tokens_per_second': '2000'}
292
+ {'loss': '0.7219', 'grad_norm': '1.712', 'learning_rate': '3.871e-05', 'epoch': '0.07771', 'num_input_tokens_seen': 591583, 'train_runtime': '295.8', 'train_tokens_per_second': '2000'}
293
+ {'loss': '0.8756', 'grad_norm': '1.486', 'learning_rate': '3.884e-05', 'epoch': '0.07798', 'num_input_tokens_seen': 593630, 'train_runtime': '296.8', 'train_tokens_per_second': '2000'}
294
+ {'loss': '0.703', 'grad_norm': '1.555', 'learning_rate': '3.898e-05', 'epoch': '0.07825', 'num_input_tokens_seen': 595677, 'train_runtime': '297.9', 'train_tokens_per_second': '2000'}
295
+ {'loss': '0.8544', 'grad_norm': '1.665', 'learning_rate': '3.911e-05', 'epoch': '0.07852', 'num_input_tokens_seen': 597724, 'train_runtime': '298.9', 'train_tokens_per_second': '2000'}
296
+ {'loss': '1.088', 'grad_norm': '2.26', 'learning_rate': '3.925e-05', 'epoch': '0.07878', 'num_input_tokens_seen': 599771, 'train_runtime': '299.9', 'train_tokens_per_second': '2000'}
297
+ {'loss': '0.8961', 'grad_norm': '1.421', 'learning_rate': '3.938e-05', 'epoch': '0.07905', 'num_input_tokens_seen': 601818, 'train_runtime': '300.9', 'train_tokens_per_second': '2000'}
298
+ {'loss': '1.096', 'grad_norm': '1.708', 'learning_rate': '3.952e-05', 'epoch': '0.07932', 'num_input_tokens_seen': 603865, 'train_runtime': '301.9', 'train_tokens_per_second': '2000'}
299
+ {'loss': '0.9044', 'grad_norm': '1.57', 'learning_rate': '3.965e-05', 'epoch': '0.07959', 'num_input_tokens_seen': 605912, 'train_runtime': '303', 'train_tokens_per_second': '2000'}
300
+ {'loss': '0.9157', 'grad_norm': '1.404', 'learning_rate': '3.978e-05', 'epoch': '0.07986', 'num_input_tokens_seen': 607959, 'train_runtime': '304', 'train_tokens_per_second': '2000'}
301
+ {'loss': '0.9376', 'grad_norm': '1.561', 'learning_rate': '3.992e-05', 'epoch': '0.08013', 'num_input_tokens_seen': 610006, 'train_runtime': '305', 'train_tokens_per_second': '2000'}
302
+ {'loss': '1.079', 'grad_norm': '1.473', 'learning_rate': '4.005e-05', 'epoch': '0.0804', 'num_input_tokens_seen': 612053, 'train_runtime': '306', 'train_tokens_per_second': '2000'}
303
+ {'loss': '0.8078', 'grad_norm': '1.753', 'learning_rate': '4.019e-05', 'epoch': '0.08067', 'num_input_tokens_seen': 614100, 'train_runtime': '307', 'train_tokens_per_second': '2000'}
304
+ {'loss': '0.9436', 'grad_norm': '1.635', 'learning_rate': '4.032e-05', 'epoch': '0.08094', 'num_input_tokens_seen': 616147, 'train_runtime': '308', 'train_tokens_per_second': '2000'}
305
+ {'loss': '0.8635', 'grad_norm': '1.619', 'learning_rate': '4.046e-05', 'epoch': '0.0812', 'num_input_tokens_seen': 618194, 'train_runtime': '309.1', 'train_tokens_per_second': '2000'}
306
+ {'loss': '0.8744', 'grad_norm': '1.512', 'learning_rate': '4.059e-05', 'epoch': '0.08147', 'num_input_tokens_seen': 620241, 'train_runtime': '310.1', 'train_tokens_per_second': '2000'}
307
+ {'loss': '0.9712', 'grad_norm': '1.711', 'learning_rate': '4.073e-05', 'epoch': '0.08174', 'num_input_tokens_seen': 622288, 'train_runtime': '311.1', 'train_tokens_per_second': '2000'}
308
+ {'loss': '0.8672', 'grad_norm': '1.683', 'learning_rate': '4.086e-05', 'epoch': '0.08201', 'num_input_tokens_seen': 624335, 'train_runtime': '312.1', 'train_tokens_per_second': '2000'}
309
+ {'loss': '0.8807', 'grad_norm': '1.646', 'learning_rate': '4.099e-05', 'epoch': '0.08228', 'num_input_tokens_seen': 626382, 'train_runtime': '313.1', 'train_tokens_per_second': '2000'}
310
+ {'loss': '0.9382', 'grad_norm': '1.572', 'learning_rate': '4.113e-05', 'epoch': '0.08255', 'num_input_tokens_seen': 628429, 'train_runtime': '314.1', 'train_tokens_per_second': '2000'}
311
+ {'loss': '0.9096', 'grad_norm': '1.767', 'learning_rate': '4.126e-05', 'epoch': '0.08282', 'num_input_tokens_seen': 630476, 'train_runtime': '315.2', 'train_tokens_per_second': '2000'}
312
+ {'loss': '0.9922', 'grad_norm': '1.578', 'learning_rate': '4.14e-05', 'epoch': '0.08309', 'num_input_tokens_seen': 632523, 'train_runtime': '316.2', 'train_tokens_per_second': '2001'}
313
+ {'loss': '0.6242', 'grad_norm': '1.54', 'learning_rate': '4.153e-05', 'epoch': '0.08336', 'num_input_tokens_seen': 634570, 'train_runtime': '317.2', 'train_tokens_per_second': '2001'}
314
+ {'loss': '0.8425', 'grad_norm': '1.811', 'learning_rate': '4.167e-05', 'epoch': '0.08362', 'num_input_tokens_seen': 636617, 'train_runtime': '318.2', 'train_tokens_per_second': '2001'}
315
+ {'loss': '0.9227', 'grad_norm': '1.62', 'learning_rate': '4.18e-05', 'epoch': '0.08389', 'num_input_tokens_seen': 638664, 'train_runtime': '319.2', 'train_tokens_per_second': '2001'}
316
+ {'loss': '1.007', 'grad_norm': '1.642', 'learning_rate': '4.194e-05', 'epoch': '0.08416', 'num_input_tokens_seen': 640711, 'train_runtime': '320.3', 'train_tokens_per_second': '2001'}
317
+ {'loss': '0.7684', 'grad_norm': '1.521', 'learning_rate': '4.207e-05', 'epoch': '0.08443', 'num_input_tokens_seen': 642758, 'train_runtime': '321.3', 'train_tokens_per_second': '2001'}
318
+ {'loss': '0.9068', 'grad_norm': '1.779', 'learning_rate': '4.22e-05', 'epoch': '0.0847', 'num_input_tokens_seen': 644805, 'train_runtime': '322.3', 'train_tokens_per_second': '2001'}
319
+ {'loss': '0.8407', 'grad_norm': '1.588', 'learning_rate': '4.234e-05', 'epoch': '0.08497', 'num_input_tokens_seen': 646852, 'train_runtime': '323.3', 'train_tokens_per_second': '2001'}
320
+ {'loss': '0.9359', 'grad_norm': '1.685', 'learning_rate': '4.247e-05', 'epoch': '0.08524', 'num_input_tokens_seen': 648899, 'train_runtime': '324.3', 'train_tokens_per_second': '2001'}
321
+ {'loss': '0.8513', 'grad_norm': '1.823', 'learning_rate': '4.261e-05', 'epoch': '0.08551', 'num_input_tokens_seen': 650946, 'train_runtime': '325.4', 'train_tokens_per_second': '2001'}
322
+ {'loss': '1.09', 'grad_norm': '2.251', 'learning_rate': '4.274e-05', 'epoch': '0.08578', 'num_input_tokens_seen': 652993, 'train_runtime': '326.4', 'train_tokens_per_second': '2001'}
323
+ {'loss': '0.8893', 'grad_norm': '1.614', 'learning_rate': '4.288e-05', 'epoch': '0.08604', 'num_input_tokens_seen': 655040, 'train_runtime': '327.4', 'train_tokens_per_second': '2001'}
324
+ {'loss': '0.499', 'grad_norm': '1.693', 'learning_rate': '4.301e-05', 'epoch': '0.08631', 'num_input_tokens_seen': 657087, 'train_runtime': '328.4', 'train_tokens_per_second': '2001'}
325
+ {'loss': '1.006', 'grad_norm': '1.781', 'learning_rate': '4.315e-05', 'epoch': '0.08658', 'num_input_tokens_seen': 659134, 'train_runtime': '329.4', 'train_tokens_per_second': '2001'}
326
+ {'loss': '0.6728', 'grad_norm': '1.412', 'learning_rate': '4.328e-05', 'epoch': '0.08685', 'num_input_tokens_seen': 661181, 'train_runtime': '330.4', 'train_tokens_per_second': '2001'}
327
+ {'loss': '0.6491', 'grad_norm': '1.683', 'learning_rate': '4.341e-05', 'epoch': '0.08712', 'num_input_tokens_seen': 663228, 'train_runtime': '331.5', 'train_tokens_per_second': '2001'}
328
+ {'loss': '0.9646', 'grad_norm': '1.918', 'learning_rate': '4.355e-05', 'epoch': '0.08739', 'num_input_tokens_seen': 665275, 'train_runtime': '332.5', 'train_tokens_per_second': '2001'}
329
+ {'loss': '0.6656', 'grad_norm': '1.711', 'learning_rate': '4.368e-05', 'epoch': '0.08766', 'num_input_tokens_seen': 667322, 'train_runtime': '333.5', 'train_tokens_per_second': '2001'}
330
+ {'loss': '0.7556', 'grad_norm': '1.799', 'learning_rate': '4.382e-05', 'epoch': '0.08793', 'num_input_tokens_seen': 669369, 'train_runtime': '334.5', 'train_tokens_per_second': '2001'}
331
+ {'loss': '0.8211', 'grad_norm': '1.622', 'learning_rate': '4.395e-05', 'epoch': '0.0882', 'num_input_tokens_seen': 671416, 'train_runtime': '335.5', 'train_tokens_per_second': '2001'}
332
+ {'loss': '0.8586', 'grad_norm': '1.673', 'learning_rate': '4.409e-05', 'epoch': '0.08846', 'num_input_tokens_seen': 673463, 'train_runtime': '336.6', 'train_tokens_per_second': '2001'}
333
+ {'loss': '0.8275', 'grad_norm': '1.59', 'learning_rate': '4.422e-05', 'epoch': '0.08873', 'num_input_tokens_seen': 675510, 'train_runtime': '337.6', 'train_tokens_per_second': '2001'}
334
+ {'loss': '0.7986', 'grad_norm': '1.536', 'learning_rate': '4.435e-05', 'epoch': '0.089', 'num_input_tokens_seen': 677557, 'train_runtime': '338.6', 'train_tokens_per_second': '2001'}
335
+ {'loss': '0.8409', 'grad_norm': '1.524', 'learning_rate': '4.449e-05', 'epoch': '0.08927', 'num_input_tokens_seen': 679604, 'train_runtime': '339.6', 'train_tokens_per_second': '2001'}
336
+ {'loss': '0.7889', 'grad_norm': '1.606', 'learning_rate': '4.462e-05', 'epoch': '0.08954', 'num_input_tokens_seen': 681651, 'train_runtime': '340.6', 'train_tokens_per_second': '2001'}
337
+ {'loss': '0.8146', 'grad_norm': '1.721', 'learning_rate': '4.476e-05', 'epoch': '0.08981', 'num_input_tokens_seen': 683698, 'train_runtime': '341.6', 'train_tokens_per_second': '2001'}
338
+ {'loss': '0.9218', 'grad_norm': '1.753', 'learning_rate': '4.489e-05', 'epoch': '0.09008', 'num_input_tokens_seen': 685745, 'train_runtime': '342.7', 'train_tokens_per_second': '2001'}
339
+ {'loss': '0.6649', 'grad_norm': '1.632', 'learning_rate': '4.503e-05', 'epoch': '0.09035', 'num_input_tokens_seen': 687792, 'train_runtime': '343.7', 'train_tokens_per_second': '2001'}
340
+ {'loss': '0.7102', 'grad_norm': '1.424', 'learning_rate': '4.516e-05', 'epoch': '0.09062', 'num_input_tokens_seen': 689839, 'train_runtime': '344.7', 'train_tokens_per_second': '2001'}
341
+ {'loss': '1.134', 'grad_norm': '2.3', 'learning_rate': '4.53e-05', 'epoch': '0.09088', 'num_input_tokens_seen': 691886, 'train_runtime': '345.7', 'train_tokens_per_second': '2001'}
342
+ {'loss': '0.9732', 'grad_norm': '2.07', 'learning_rate': '4.543e-05', 'epoch': '0.09115', 'num_input_tokens_seen': 693933, 'train_runtime': '346.7', 'train_tokens_per_second': '2001'}
343
+ {'loss': '0.8109', 'grad_norm': '1.658', 'learning_rate': '4.556e-05', 'epoch': '0.09142', 'num_input_tokens_seen': 695980, 'train_runtime': '347.8', 'train_tokens_per_second': '2001'}
344
+ {'loss': '0.8198', 'grad_norm': '1.551', 'learning_rate': '4.57e-05', 'epoch': '0.09169', 'num_input_tokens_seen': 698027, 'train_runtime': '348.8', 'train_tokens_per_second': '2001'}
345
+ {'loss': '0.6508', 'grad_norm': '1.996', 'learning_rate': '4.583e-05', 'epoch': '0.09196', 'num_input_tokens_seen': 700074, 'train_runtime': '349.8', 'train_tokens_per_second': '2001'}
346
+ {'loss': '0.6369', 'grad_norm': '1.678', 'learning_rate': '4.597e-05', 'epoch': '0.09223', 'num_input_tokens_seen': 702121, 'train_runtime': '350.8', 'train_tokens_per_second': '2001'}
347
+ {'loss': '0.8778', 'grad_norm': '1.761', 'learning_rate': '4.61e-05', 'epoch': '0.0925', 'num_input_tokens_seen': 704168, 'train_runtime': '351.8', 'train_tokens_per_second': '2001'}
348
+ {'loss': '0.5125', 'grad_norm': '2.032', 'learning_rate': '4.624e-05', 'epoch': '0.09277', 'num_input_tokens_seen': 706215, 'train_runtime': '352.9', 'train_tokens_per_second': '2001'}
349
+ {'loss': '0.5776', 'grad_norm': '1.902', 'learning_rate': '4.637e-05', 'epoch': '0.09304', 'num_input_tokens_seen': 708262, 'train_runtime': '353.9', 'train_tokens_per_second': '2001'}
350
+ {'loss': '0.8128', 'grad_norm': '1.934', 'learning_rate': '4.651e-05', 'epoch': '0.0933', 'num_input_tokens_seen': 710309, 'train_runtime': '354.9', 'train_tokens_per_second': '2001'}
351
+ {'loss': '0.8', 'grad_norm': '2.005', 'learning_rate': '4.664e-05', 'epoch': '0.09357', 'num_input_tokens_seen': 712356, 'train_runtime': '355.9', 'train_tokens_per_second': '2002'}
352
+ {'loss': '0.9134', 'grad_norm': '1.872', 'learning_rate': '4.677e-05', 'epoch': '0.09384', 'num_input_tokens_seen': 714403, 'train_runtime': '356.9', 'train_tokens_per_second': '2002'}
353
+ {'loss': '0.8195', 'grad_norm': '1.896', 'learning_rate': '4.691e-05', 'epoch': '0.09411', 'num_input_tokens_seen': 716450, 'train_runtime': '357.9', 'train_tokens_per_second': '2002'}
354
+ {'loss': '0.9879', 'grad_norm': '1.732', 'learning_rate': '4.704e-05', 'epoch': '0.09438', 'num_input_tokens_seen': 718497, 'train_runtime': '359', 'train_tokens_per_second': '2002'}
355
+ {'loss': '0.7241', 'grad_norm': '1.685', 'learning_rate': '4.718e-05', 'epoch': '0.09465', 'num_input_tokens_seen': 720544, 'train_runtime': '360', 'train_tokens_per_second': '2002'}
356
+ {'loss': '0.8061', 'grad_norm': '1.555', 'learning_rate': '4.731e-05', 'epoch': '0.09492', 'num_input_tokens_seen': 722591, 'train_runtime': '361', 'train_tokens_per_second': '2002'}
357
+ {'loss': '0.8035', 'grad_norm': '1.807', 'learning_rate': '4.745e-05', 'epoch': '0.09519', 'num_input_tokens_seen': 724638, 'train_runtime': '362', 'train_tokens_per_second': '2002'}
358
+ {'loss': '0.4991', 'grad_norm': '1.543', 'learning_rate': '4.758e-05', 'epoch': '0.09546', 'num_input_tokens_seen': 726685, 'train_runtime': '363', 'train_tokens_per_second': '2002'}
359
+ {'loss': '0.8125', 'grad_norm': '1.724', 'learning_rate': '4.772e-05', 'epoch': '0.09572', 'num_input_tokens_seen': 728732, 'train_runtime': '364.1', 'train_tokens_per_second': '2002'}
360
+ {'loss': '0.752', 'grad_norm': '1.793', 'learning_rate': '4.785e-05', 'epoch': '0.09599', 'num_input_tokens_seen': 730779, 'train_runtime': '365.1', 'train_tokens_per_second': '2002'}
361
+ {'loss': '0.9271', 'grad_norm': '2.305', 'learning_rate': '4.798e-05', 'epoch': '0.09626', 'num_input_tokens_seen': 732826, 'train_runtime': '366.1', 'train_tokens_per_second': '2002'}
362
+ {'loss': '0.6132', 'grad_norm': '2.224', 'learning_rate': '4.812e-05', 'epoch': '0.09653', 'num_input_tokens_seen': 734873, 'train_runtime': '367.1', 'train_tokens_per_second': '2002'}
363
+ {'loss': '0.6797', 'grad_norm': '1.914', 'learning_rate': '4.825e-05', 'epoch': '0.0968', 'num_input_tokens_seen': 736920, 'train_runtime': '368.1', 'train_tokens_per_second': '2002'}
364
+ {'loss': '0.9467', 'grad_norm': '2.078', 'learning_rate': '4.839e-05', 'epoch': '0.09707', 'num_input_tokens_seen': 738967, 'train_runtime': '369.1', 'train_tokens_per_second': '2002'}
365
+ {'loss': '0.8589', 'grad_norm': '2.175', 'learning_rate': '4.852e-05', 'epoch': '0.09734', 'num_input_tokens_seen': 741014, 'train_runtime': '370.2', 'train_tokens_per_second': '2002'}
366
+ {'loss': '0.8454', 'grad_norm': '1.922', 'learning_rate': '4.866e-05', 'epoch': '0.09761', 'num_input_tokens_seen': 743061, 'train_runtime': '371.2', 'train_tokens_per_second': '2002'}
367
+ {'loss': '0.8227', 'grad_norm': '1.937', 'learning_rate': '4.879e-05', 'epoch': '0.09788', 'num_input_tokens_seen': 745108, 'train_runtime': '372.2', 'train_tokens_per_second': '2002'}
368
+ {'loss': '0.7916', 'grad_norm': '1.935', 'learning_rate': '4.892e-05', 'epoch': '0.09814', 'num_input_tokens_seen': 747155, 'train_runtime': '373.2', 'train_tokens_per_second': '2002'}
369
+ {'loss': '0.6554', 'grad_norm': '1.673', 'learning_rate': '4.906e-05', 'epoch': '0.09841', 'num_input_tokens_seen': 749202, 'train_runtime': '374.2', 'train_tokens_per_second': '2002'}
370
+ {'loss': '0.8427', 'grad_norm': '1.627', 'learning_rate': '4.919e-05', 'epoch': '0.09868', 'num_input_tokens_seen': 751249, 'train_runtime': '375.2', 'train_tokens_per_second': '2002'}
371
+ {'loss': '0.7', 'grad_norm': '1.613', 'learning_rate': '4.933e-05', 'epoch': '0.09895', 'num_input_tokens_seen': 753296, 'train_runtime': '376.3', 'train_tokens_per_second': '2002'}
372
+ {'loss': '1.085', 'grad_norm': '1.733', 'learning_rate': '4.946e-05', 'epoch': '0.09922', 'num_input_tokens_seen': 755343, 'train_runtime': '377.3', 'train_tokens_per_second': '2002'}
373
+ {'loss': '0.7366', 'grad_norm': '1.8', 'learning_rate': '4.96e-05', 'epoch': '0.09949', 'num_input_tokens_seen': 757390, 'train_runtime': '378.3', 'train_tokens_per_second': '2002'}
374
+ {'loss': '0.8539', 'grad_norm': '2.089', 'learning_rate': '4.973e-05', 'epoch': '0.09976', 'num_input_tokens_seen': 759437, 'train_runtime': '379.3', 'train_tokens_per_second': '2002'}
375
+ {'loss': '1.091', 'grad_norm': '2.162', 'learning_rate': '4.987e-05', 'epoch': '0.1', 'num_input_tokens_seen': 761484, 'train_runtime': '380.3', 'train_tokens_per_second': '2002'}
376
+ {'loss': '0.5954', 'grad_norm': '1.539', 'learning_rate': '5e-05', 'epoch': '0.1003', 'num_input_tokens_seen': 763531, 'train_runtime': '381.4', 'train_tokens_per_second': '2002'}
377
+ {'loss': '0.8637', 'grad_norm': '3.224', 'learning_rate': '5e-05', 'epoch': '0.1006', 'num_input_tokens_seen': 765578, 'train_runtime': '382.4', 'train_tokens_per_second': '2002'}
378
+ {'loss': '1.156', 'grad_norm': '2.482', 'learning_rate': '5e-05', 'epoch': '0.1008', 'num_input_tokens_seen': 767625, 'train_runtime': '383.4', 'train_tokens_per_second': '2002'}
379
+ {'loss': '0.9774', 'grad_norm': '2.115', 'learning_rate': '5e-05', 'epoch': '0.1011', 'num_input_tokens_seen': 769672, 'train_runtime': '384.4', 'train_tokens_per_second': '2002'}
380
+ {'loss': '0.7794', 'grad_norm': '2.068', 'learning_rate': '5e-05', 'epoch': '0.1014', 'num_input_tokens_seen': 771719, 'train_runtime': '385.4', 'train_tokens_per_second': '2002'}
381
+ {'loss': '0.7327', 'grad_norm': '2.226', 'learning_rate': '5e-05', 'epoch': '0.1016', 'num_input_tokens_seen': 773766, 'train_runtime': '386.5', 'train_tokens_per_second': '2002'}
382
+ {'loss': '0.7302', 'grad_norm': '1.966', 'learning_rate': '5e-05', 'epoch': '0.1019', 'num_input_tokens_seen': 775813, 'train_runtime': '387.5', 'train_tokens_per_second': '2002'}
383
+ {'loss': '0.6878', 'grad_norm': '1.581', 'learning_rate': '5e-05', 'epoch': '0.1022', 'num_input_tokens_seen': 777860, 'train_runtime': '388.5', 'train_tokens_per_second': '2002'}
384
+ {'loss': '0.737', 'grad_norm': '1.672', 'learning_rate': '5e-05', 'epoch': '0.1024', 'num_input_tokens_seen': 779907, 'train_runtime': '389.5', 'train_tokens_per_second': '2002'}
385
+ {'loss': '0.9472', 'grad_norm': '2.17', 'learning_rate': '5e-05', 'epoch': '0.1027', 'num_input_tokens_seen': 781954, 'train_runtime': '390.5', 'train_tokens_per_second': '2002'}
386
+ {'loss': '0.5831', 'grad_norm': '1.504', 'learning_rate': '5e-05', 'epoch': '0.103', 'num_input_tokens_seen': 784001, 'train_runtime': '391.6', 'train_tokens_per_second': '2002'}
387
+ {'loss': '0.6743', 'grad_norm': '1.78', 'learning_rate': '5e-05', 'epoch': '0.1033', 'num_input_tokens_seen': 786048, 'train_runtime': '392.6', 'train_tokens_per_second': '2002'}
388
+ {'loss': '0.5688', 'grad_norm': '1.95', 'learning_rate': '5e-05', 'epoch': '0.1035', 'num_input_tokens_seen': 788095, 'train_runtime': '393.6', 'train_tokens_per_second': '2002'}
389
+ {'loss': '0.929', 'grad_norm': '2.087', 'learning_rate': '5e-05', 'epoch': '0.1038', 'num_input_tokens_seen': 790142, 'train_runtime': '394.6', 'train_tokens_per_second': '2002'}
390
+ {'loss': '0.4627', 'grad_norm': '2.017', 'learning_rate': '5e-05', 'epoch': '0.1041', 'num_input_tokens_seen': 792189, 'train_runtime': '395.6', 'train_tokens_per_second': '2002'}
391
+ {'loss': '0.8193', 'grad_norm': '2.009', 'learning_rate': '5e-05', 'epoch': '0.1043', 'num_input_tokens_seen': 794236, 'train_runtime': '396.7', 'train_tokens_per_second': '2002'}
392
+ File "/usr/local/bin/llamafactory-cli", line 8, in <module>
393
+ sys.exit(main())
394
+ ^^^^^^
395
+ File "/workspace/LlamaFactory/src/llamafactory/cli.py", line 24, in main
396
+ launcher.launch()
397
+ File "/workspace/LlamaFactory/src/llamafactory/launcher.py", line 157, in launch
398
+ run_exp()
399
+ File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 125, in run_exp
400
+ _training_function(config={"args": args, "callbacks": callbacks})
401
+ File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 91, in _training_function
402
+ run_pt(model_args, data_args, training_args, finetuning_args, callbacks)
403
+ File "/workspace/LlamaFactory/src/llamafactory/train/pt/workflow.py", line 63, in run_pt
404
+ train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
405
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
406
+ File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2174, in train
407
+ return inner_training_loop(
408
+ ^^^^^^^^^^^^^^^^^^^^
409
+ File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2536, in _inner_training_loop
410
+ tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
411
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
412
+ File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 3837, in training_step
413
+ self.accelerator.backward(loss, **kwargs)
414
+ File "/usr/local/lib/python3.11/dist-packages/accelerate/accelerator.py", line 2740, in backward
415
+ loss.backward(**kwargs)
416
+ File "/usr/local/lib/python3.11/dist-packages/torch/_tensor.py", line 521, in backward
417
+ torch.autograd.backward(
418
+ File "/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py", line 289, in backward
419
+ _engine_run_backward(
420
+ File "/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py", line 769, in _engine_run_backward
421
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
422
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
423
+ KeyboardInterrupt
LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-metadata.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-94-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.10",
4
+ "startedAt": "2026-02-04T03:57:46.163443Z",
5
+ "args": [
6
+ "/workspace/v127rc_exp1/C.yaml"
7
+ ],
8
+ "program": "/usr/local/bin/llamafactory-cli",
9
+ "git": {
10
+ "remote": "https://github.com/hiyouga/LlamaFactory.git",
11
+ "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
12
+ },
13
+ "email": "markmochi200@gmail.com",
14
+ "root": "/workspace/LlamaFactory",
15
+ "host": "47a53adf0198",
16
+ "executable": "/usr/bin/python",
17
+ "cpu_count": 16,
18
+ "cpu_count_logical": 32,
19
+ "gpu": "NVIDIA GeForce RTX 4090",
20
+ "gpu_count": 1,
21
+ "disk": {
22
+ "/": {
23
+ "total": "21474836480",
24
+ "used": "1858306048"
25
+ }
26
+ },
27
+ "memory": {
28
+ "total": "201701408768"
29
+ },
30
+ "gpu_nvidia": [
31
+ {
32
+ "name": "NVIDIA GeForce RTX 4090",
33
+ "memoryTotal": "25757220864",
34
+ "cudaCores": 16384,
35
+ "architecture": "Ada",
36
+ "uuid": "GPU-2ae1a495-e17f-23d9-e8ed-90585b3df9de"
37
+ }
38
+ ],
39
+ "cudaVersion": "13.0",
40
+ "writerId": "mfjy22anxcucsb3vwlaimrwvqrgvipis"
41
+ }
LlamaFactory/wandb/run-20260204_035746-cloyjeo5/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/global_step":388,"train/grad_norm":2.0090420246124268,"train/learning_rate":4.9999916410392856e-05,"_wandb":{"runtime":396},"_runtime":396,"train/loss":0.8193472027778625,"_step":387,"train/epoch":0.1043291207313794,"train_runtime":396.6553,"train/train_tokens_per_second":2002.333,"_timestamp":1.770177862347725e+09,"train/num_input_tokens_seen":794236}
LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/output.log ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 0%| | 0/40950 [00:00<?, ?it/s]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
2
+ with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]
3
+
4
+ {'loss': '1.719', 'grad_norm': '0.3142', 'learning_rate': '0', 'epoch': '0.0001221', 'num_input_tokens_seen': 2047, 'train_runtime': '3.017', 'train_tokens_per_second': '678.5'}
5
+ {'loss': '1.142', 'grad_norm': '0.2725', 'learning_rate': '6.105e-08', 'epoch': '0.0002442', 'num_input_tokens_seen': 4094, 'train_runtime': '4.05', 'train_tokens_per_second': '1011'}
6
+ {'loss': '1.39', 'grad_norm': '0.379', 'learning_rate': '1.221e-07', 'epoch': '0.0003663', 'num_input_tokens_seen': 6141, 'train_runtime': '5.087', 'train_tokens_per_second': '1207'}
7
+ {'loss': '1.457', 'grad_norm': '0.2879', 'learning_rate': '1.832e-07', 'epoch': '0.0004884', 'num_input_tokens_seen': 8188, 'train_runtime': '6.124', 'train_tokens_per_second': '1337'}
8
+ {'loss': '1.286', 'grad_norm': '0.2564', 'learning_rate': '2.442e-07', 'epoch': '0.0006105', 'num_input_tokens_seen': 10235, 'train_runtime': '7.165', 'train_tokens_per_second': '1429'}
9
+ {'loss': '0.01258', 'grad_norm': '0.042', 'learning_rate': '3.053e-07', 'epoch': '0.0007326', 'num_input_tokens_seen': 12282, 'train_runtime': '8.201', 'train_tokens_per_second': '1498'}
10
+ {'loss': '0.8563', 'grad_norm': '0.267', 'learning_rate': '3.663e-07', 'epoch': '0.0008547', 'num_input_tokens_seen': 14329, 'train_runtime': '9.241', 'train_tokens_per_second': '1551'}
11
+ {'loss': '1.581', 'grad_norm': '0.2901', 'learning_rate': '4.274e-07', 'epoch': '0.0009768', 'num_input_tokens_seen': 16376, 'train_runtime': '10.28', 'train_tokens_per_second': '1593'}
12
+ {'loss': '1.573', 'grad_norm': '0.2915', 'learning_rate': '4.884e-07', 'epoch': '0.001099', 'num_input_tokens_seen': 18423, 'train_runtime': '11.32', 'train_tokens_per_second': '1628'}
13
+ {'loss': '1.346', 'grad_norm': '0.2841', 'learning_rate': '5.495e-07', 'epoch': '0.001221', 'num_input_tokens_seen': 20470, 'train_runtime': '12.35', 'train_tokens_per_second': '1657'}
14
+ {'loss': '1.651', 'grad_norm': '0.4522', 'learning_rate': '6.105e-07', 'epoch': '0.001343', 'num_input_tokens_seen': 22517, 'train_runtime': '13.39', 'train_tokens_per_second': '1682'}
15
+ {'loss': '1.487', 'grad_norm': '0.3466', 'learning_rate': '6.716e-07', 'epoch': '0.001465', 'num_input_tokens_seen': 24564, 'train_runtime': '14.44', 'train_tokens_per_second': '1701'}
16
+ {'loss': '0.8106', 'grad_norm': '0.2226', 'learning_rate': '7.326e-07', 'epoch': '0.001587', 'num_input_tokens_seen': 26611, 'train_runtime': '15.48', 'train_tokens_per_second': '1719'}
17
+ {'loss': '0.5651', 'grad_norm': '0.2162', 'learning_rate': '7.937e-07', 'epoch': '0.001709', 'num_input_tokens_seen': 28658, 'train_runtime': '16.52', 'train_tokens_per_second': '1735'}
18
+ {'loss': '1.622', 'grad_norm': '0.3259', 'learning_rate': '8.547e-07', 'epoch': '0.001832', 'num_input_tokens_seen': 30705, 'train_runtime': '17.56', 'train_tokens_per_second': '1749'}
19
+ {'loss': '1.418', 'grad_norm': '0.285', 'learning_rate': '9.158e-07', 'epoch': '0.001954', 'num_input_tokens_seen': 32752, 'train_runtime': '18.6', 'train_tokens_per_second': '1761'}
20
+ {'loss': '1.69', 'grad_norm': '0.3264', 'learning_rate': '9.768e-07', 'epoch': '0.002076', 'num_input_tokens_seen': 34799, 'train_runtime': '19.64', 'train_tokens_per_second': '1772'}
21
+ {'loss': '1.73', 'grad_norm': '0.3089', 'learning_rate': '1.038e-06', 'epoch': '0.002198', 'num_input_tokens_seen': 36846, 'train_runtime': '20.68', 'train_tokens_per_second': '1782'}
22
+ {'loss': '1.511', 'grad_norm': '0.3119', 'learning_rate': '1.099e-06', 'epoch': '0.00232', 'num_input_tokens_seen': 38893, 'train_runtime': '21.72', 'train_tokens_per_second': '1791'}
23
+ {'loss': '1.435', 'grad_norm': '0.3081', 'learning_rate': '1.16e-06', 'epoch': '0.002442', 'num_input_tokens_seen': 40940, 'train_runtime': '22.77', 'train_tokens_per_second': '1798'}
24
+ {'loss': '1.785', 'grad_norm': '0.4437', 'learning_rate': '1.221e-06', 'epoch': '0.002564', 'num_input_tokens_seen': 42987, 'train_runtime': '23.81', 'train_tokens_per_second': '1806'}
25
+ {'loss': '1.101', 'grad_norm': '0.3949', 'learning_rate': '1.282e-06', 'epoch': '0.002686', 'num_input_tokens_seen': 45034, 'train_runtime': '24.85', 'train_tokens_per_second': '1812'}
26
+ {'loss': '0.7684', 'grad_norm': '0.2791', 'learning_rate': '1.343e-06', 'epoch': '0.002808', 'num_input_tokens_seen': 47081, 'train_runtime': '25.89', 'train_tokens_per_second': '1819'}
27
+ {'loss': '0.9445', 'grad_norm': '0.2267', 'learning_rate': '1.404e-06', 'epoch': '0.00293', 'num_input_tokens_seen': 49128, 'train_runtime': '26.93', 'train_tokens_per_second': '1825'}
28
+ {'loss': '1.328', 'grad_norm': '0.5019', 'learning_rate': '1.465e-06', 'epoch': '0.003053', 'num_input_tokens_seen': 51175, 'train_runtime': '27.96', 'train_tokens_per_second': '1830'}
29
+ {'loss': '1.597', 'grad_norm': '0.3425', 'learning_rate': '1.526e-06', 'epoch': '0.003175', 'num_input_tokens_seen': 53222, 'train_runtime': '29.01', 'train_tokens_per_second': '1834'}
30
+ {'loss': '1.797', 'grad_norm': '0.3407', 'learning_rate': '1.587e-06', 'epoch': '0.003297', 'num_input_tokens_seen': 55269, 'train_runtime': '30.05', 'train_tokens_per_second': '1839'}
31
+ {'loss': '0.7549', 'grad_norm': '0.2074', 'learning_rate': '1.648e-06', 'epoch': '0.003419', 'num_input_tokens_seen': 57316, 'train_runtime': '31.1', 'train_tokens_per_second': '1843'}
32
+ {'loss': '0.6662', 'grad_norm': '0.2184', 'learning_rate': '1.709e-06', 'epoch': '0.003541', 'num_input_tokens_seen': 59363, 'train_runtime': '32.14', 'train_tokens_per_second': '1847'}
33
+ {'loss': '0.9995', 'grad_norm': '0.2354', 'learning_rate': '1.77e-06', 'epoch': '0.003663', 'num_input_tokens_seen': 61410, 'train_runtime': '33.19', 'train_tokens_per_second': '1850'}
34
+ {'loss': '1.189', 'grad_norm': '0.2462', 'learning_rate': '1.832e-06', 'epoch': '0.003785', 'num_input_tokens_seen': 63457, 'train_runtime': '34.23', 'train_tokens_per_second': '1854'}
35
+ {'loss': '1.353', 'grad_norm': '0.2564', 'learning_rate': '1.893e-06', 'epoch': '0.003907', 'num_input_tokens_seen': 65504, 'train_runtime': '35.28', 'train_tokens_per_second': '1857'}
36
+ {'loss': '1.41', 'grad_norm': '0.3253', 'learning_rate': '1.954e-06', 'epoch': '0.004029', 'num_input_tokens_seen': 67551, 'train_runtime': '36.32', 'train_tokens_per_second': '1860'}
37
+ {'loss': '1.575', 'grad_norm': '0.303', 'learning_rate': '2.015e-06', 'epoch': '0.004151', 'num_input_tokens_seen': 69598, 'train_runtime': '37.36', 'train_tokens_per_second': '1863'}
38
+ {'loss': '1.542', 'grad_norm': '0.3227', 'learning_rate': '2.076e-06', 'epoch': '0.004274', 'num_input_tokens_seen': 71645, 'train_runtime': '38.4', 'train_tokens_per_second': '1866'}
39
+ {'loss': '1.281', 'grad_norm': '0.3266', 'learning_rate': '2.137e-06', 'epoch': '0.004396', 'num_input_tokens_seen': 73692, 'train_runtime': '39.45', 'train_tokens_per_second': '1868'}
40
+ {'loss': '1.936', 'grad_norm': '0.601', 'learning_rate': '2.198e-06', 'epoch': '0.004518', 'num_input_tokens_seen': 75739, 'train_runtime': '40.49', 'train_tokens_per_second': '1871'}
41
+ {'loss': '1.855', 'grad_norm': '0.2591', 'learning_rate': '2.259e-06', 'epoch': '0.00464', 'num_input_tokens_seen': 77786, 'train_runtime': '41.53', 'train_tokens_per_second': '1873'}
42
+ {'loss': '0.8793', 'grad_norm': '0.308', 'learning_rate': '2.32e-06', 'epoch': '0.004762', 'num_input_tokens_seen': 79833, 'train_runtime': '42.57', 'train_tokens_per_second': '1875'}
43
+ {'loss': '1.274', 'grad_norm': '0.2598', 'learning_rate': '2.381e-06', 'epoch': '0.004884', 'num_input_tokens_seen': 81880, 'train_runtime': '43.61', 'train_tokens_per_second': '1877'}
44
+ {'loss': '1.502', 'grad_norm': '0.3138', 'learning_rate': '2.442e-06', 'epoch': '0.005006', 'num_input_tokens_seen': 83927, 'train_runtime': '44.66', 'train_tokens_per_second': '1879'}
45
+ {'loss': '1.367', 'grad_norm': '0.2641', 'learning_rate': '2.503e-06', 'epoch': '0.005128', 'num_input_tokens_seen': 85974, 'train_runtime': '45.7', 'train_tokens_per_second': '1881'}
46
+ {'loss': '0.7333', 'grad_norm': '0.226', 'learning_rate': '2.564e-06', 'epoch': '0.00525', 'num_input_tokens_seen': 88021, 'train_runtime': '46.75', 'train_tokens_per_second': '1883'}
47
+ {'loss': '1.199', 'grad_norm': '0.277', 'learning_rate': '2.625e-06', 'epoch': '0.005372', 'num_input_tokens_seen': 90068, 'train_runtime': '47.82', 'train_tokens_per_second': '1883'}
48
+ {'loss': '1.659', 'grad_norm': '0.3296', 'learning_rate': '2.686e-06', 'epoch': '0.005495', 'num_input_tokens_seen': 92115, 'train_runtime': '48.86', 'train_tokens_per_second': '1885'}
49
+ {'loss': '1.699', 'grad_norm': '0.3483', 'learning_rate': '2.747e-06', 'epoch': '0.005617', 'num_input_tokens_seen': 94162, 'train_runtime': '49.91', 'train_tokens_per_second': '1887'}
50
+ {'loss': '1.513', 'grad_norm': '0.3496', 'learning_rate': '2.808e-06', 'epoch': '0.005739', 'num_input_tokens_seen': 96209, 'train_runtime': '50.96', 'train_tokens_per_second': '1888'}
51
+ {'loss': '1.737', 'grad_norm': '0.3098', 'learning_rate': '2.869e-06', 'epoch': '0.005861', 'num_input_tokens_seen': 98256, 'train_runtime': '52.01', 'train_tokens_per_second': '1889'}
52
+ {'loss': '1.359', 'grad_norm': '0.3305', 'learning_rate': '2.93e-06', 'epoch': '0.005983', 'num_input_tokens_seen': 100303, 'train_runtime': '53.06', 'train_tokens_per_second': '1891'}
53
+ {'loss': '1.805', 'grad_norm': '0.3772', 'learning_rate': '2.991e-06', 'epoch': '0.006105', 'num_input_tokens_seen': 102350, 'train_runtime': '54.1', 'train_tokens_per_second': '1892'}
54
+ {'loss': '1.882', 'grad_norm': '0.3816', 'learning_rate': '3.053e-06', 'epoch': '0.006227', 'num_input_tokens_seen': 104397, 'train_runtime': '55.15', 'train_tokens_per_second': '1893'}
55
+ {'loss': '1.566', 'grad_norm': '0.333', 'learning_rate': '3.114e-06', 'epoch': '0.006349', 'num_input_tokens_seen': 106444, 'train_runtime': '56.19', 'train_tokens_per_second': '1895'}
56
+ {'loss': '1.816', 'grad_norm': '0.3612', 'learning_rate': '3.175e-06', 'epoch': '0.006471', 'num_input_tokens_seen': 108491, 'train_runtime': '57.23', 'train_tokens_per_second': '1896'}
57
+ {'loss': '1.933', 'grad_norm': '0.5047', 'learning_rate': '3.236e-06', 'epoch': '0.006593', 'num_input_tokens_seen': 110538, 'train_runtime': '58.28', 'train_tokens_per_second': '1897'}
58
+ {'loss': '1.34', 'grad_norm': '0.2829', 'learning_rate': '3.297e-06', 'epoch': '0.006716', 'num_input_tokens_seen': 112585, 'train_runtime': '59.32', 'train_tokens_per_second': '1898'}
59
+ {'loss': '0.851', 'grad_norm': '0.4326', 'learning_rate': '3.358e-06', 'epoch': '0.006838', 'num_input_tokens_seen': 114632, 'train_runtime': '60.37', 'train_tokens_per_second': '1899'}
60
+ {'loss': '0.7931', 'grad_norm': '0.3166', 'learning_rate': '3.419e-06', 'epoch': '0.00696', 'num_input_tokens_seen': 116679, 'train_runtime': '61.41', 'train_tokens_per_second': '1900'}
61
+ {'loss': '1.728', 'grad_norm': '0.3289', 'learning_rate': '3.48e-06', 'epoch': '0.007082', 'num_input_tokens_seen': 118726, 'train_runtime': '62.45', 'train_tokens_per_second': '1901'}
62
+ {'loss': '0.7369', 'grad_norm': '0.2613', 'learning_rate': '3.541e-06', 'epoch': '0.007204', 'num_input_tokens_seen': 120773, 'train_runtime': '63.49', 'train_tokens_per_second': '1902'}
63
+ {'loss': '1.464', 'grad_norm': '0.2617', 'learning_rate': '3.602e-06', 'epoch': '0.007326', 'num_input_tokens_seen': 122820, 'train_runtime': '64.53', 'train_tokens_per_second': '1903'}
64
+ {'loss': '1.883', 'grad_norm': '0.3848', 'learning_rate': '3.663e-06', 'epoch': '0.007448', 'num_input_tokens_seen': 124867, 'train_runtime': '65.58', 'train_tokens_per_second': '1904'}
65
+ {'loss': '0.5969', 'grad_norm': '0.2306', 'learning_rate': '3.724e-06', 'epoch': '0.00757', 'num_input_tokens_seen': 126914, 'train_runtime': '66.63', 'train_tokens_per_second': '1905'}
66
+ {'loss': '1.594', 'grad_norm': '0.2975', 'learning_rate': '3.785e-06', 'epoch': '0.007692', 'num_input_tokens_seen': 128961, 'train_runtime': '67.68', 'train_tokens_per_second': '1906'}
67
+ {'loss': '1.062', 'grad_norm': '0.253', 'learning_rate': '3.846e-06', 'epoch': '0.007814', 'num_input_tokens_seen': 131008, 'train_runtime': '68.72', 'train_tokens_per_second': '1906'}
68
+ {'loss': '1.625', 'grad_norm': '0.3242', 'learning_rate': '3.907e-06', 'epoch': '0.007937', 'num_input_tokens_seen': 133055, 'train_runtime': '69.77', 'train_tokens_per_second': '1907'}
69
+ {'loss': '1.335', 'grad_norm': '0.3814', 'learning_rate': '3.968e-06', 'epoch': '0.008059', 'num_input_tokens_seen': 135102, 'train_runtime': '70.82', 'train_tokens_per_second': '1908'}
70
+ {'loss': '1.049', 'grad_norm': '0.2831', 'learning_rate': '4.029e-06', 'epoch': '0.008181', 'num_input_tokens_seen': 137149, 'train_runtime': '71.86', 'train_tokens_per_second': '1909'}
71
+ {'loss': '1.03', 'grad_norm': '0.2496', 'learning_rate': '4.09e-06', 'epoch': '0.008303', 'num_input_tokens_seen': 139196, 'train_runtime': '72.9', 'train_tokens_per_second': '1909'}
72
+ {'loss': '1.344', 'grad_norm': '0.3791', 'learning_rate': '4.151e-06', 'epoch': '0.008425', 'num_input_tokens_seen': 141243, 'train_runtime': '74.09', 'train_tokens_per_second': '1906'}
73
+ {'loss': '1.543', 'grad_norm': '0.3291', 'learning_rate': '4.212e-06', 'epoch': '0.008547', 'num_input_tokens_seen': 143290, 'train_runtime': '75.13', 'train_tokens_per_second': '1907'}
74
+ {'loss': '1.627', 'grad_norm': '0.3203', 'learning_rate': '4.274e-06', 'epoch': '0.008669', 'num_input_tokens_seen': 145337, 'train_runtime': '76.17', 'train_tokens_per_second': '1908'}
75
+ {'loss': '1.25', 'grad_norm': '0.3174', 'learning_rate': '4.335e-06', 'epoch': '0.008791', 'num_input_tokens_seen': 147384, 'train_runtime': '77.21', 'train_tokens_per_second': '1909'}
76
+ {'loss': '1.305', 'grad_norm': '0.3542', 'learning_rate': '4.396e-06', 'epoch': '0.008913', 'num_input_tokens_seen': 149431, 'train_runtime': '78.26', 'train_tokens_per_second': '1909'}
77
+ {'loss': '0.7812', 'grad_norm': '0.2824', 'learning_rate': '4.457e-06', 'epoch': '0.009035', 'num_input_tokens_seen': 151478, 'train_runtime': '79.3', 'train_tokens_per_second': '1910'}
78
+ {'loss': '1.514', 'grad_norm': '0.3974', 'learning_rate': '4.518e-06', 'epoch': '0.009158', 'num_input_tokens_seen': 153525, 'train_runtime': '80.34', 'train_tokens_per_second': '1911'}
79
+ {'loss': '0.8486', 'grad_norm': '0.394', 'learning_rate': '4.579e-06', 'epoch': '0.00928', 'num_input_tokens_seen': 155572, 'train_runtime': '81.39', 'train_tokens_per_second': '1911'}
80
+ {'loss': '1.741', 'grad_norm': '0.4167', 'learning_rate': '4.64e-06', 'epoch': '0.009402', 'num_input_tokens_seen': 157619, 'train_runtime': '82.43', 'train_tokens_per_second': '1912'}
81
+ {'loss': '1.393', 'grad_norm': '0.3378', 'learning_rate': '4.701e-06', 'epoch': '0.009524', 'num_input_tokens_seen': 159666, 'train_runtime': '83.47', 'train_tokens_per_second': '1913'}
82
+ {'loss': '1.174', 'grad_norm': '0.3005', 'learning_rate': '4.762e-06', 'epoch': '0.009646', 'num_input_tokens_seen': 161713, 'train_runtime': '84.52', 'train_tokens_per_second': '1913'}
83
+ {'loss': '0.7404', 'grad_norm': '0.2695', 'learning_rate': '4.823e-06', 'epoch': '0.009768', 'num_input_tokens_seen': 163760, 'train_runtime': '85.56', 'train_tokens_per_second': '1914'}
84
+ {'loss': '1.576', 'grad_norm': '0.345', 'learning_rate': '4.884e-06', 'epoch': '0.00989', 'num_input_tokens_seen': 165807, 'train_runtime': '86.6', 'train_tokens_per_second': '1915'}
85
+ {'loss': '1.073', 'grad_norm': '0.3396', 'learning_rate': '4.945e-06', 'epoch': '0.01001', 'num_input_tokens_seen': 167854, 'train_runtime': '87.64', 'train_tokens_per_second': '1915'}
86
+ {'loss': '1.579', 'grad_norm': '0.3497', 'learning_rate': '5.006e-06', 'epoch': '0.01013', 'num_input_tokens_seen': 169901, 'train_runtime': '88.68', 'train_tokens_per_second': '1916'}
87
+ {'loss': '0.784', 'grad_norm': '0.3244', 'learning_rate': '5.067e-06', 'epoch': '0.01026', 'num_input_tokens_seen': 171948, 'train_runtime': '89.72', 'train_tokens_per_second': '1916'}
88
+ {'loss': '1.157', 'grad_norm': '0.2747', 'learning_rate': '5.128e-06', 'epoch': '0.01038', 'num_input_tokens_seen': 173995, 'train_runtime': '90.77', 'train_tokens_per_second': '1917'}
89
+ {'loss': '0.9066', 'grad_norm': '0.233', 'learning_rate': '5.189e-06', 'epoch': '0.0105', 'num_input_tokens_seen': 176042, 'train_runtime': '91.81', 'train_tokens_per_second': '1918'}
90
+ {'loss': '0.7513', 'grad_norm': '0.2136', 'learning_rate': '5.25e-06', 'epoch': '0.01062', 'num_input_tokens_seen': 178089, 'train_runtime': '92.85', 'train_tokens_per_second': '1918'}
91
+ {'loss': '0.8007', 'grad_norm': '0.3918', 'learning_rate': '5.311e-06', 'epoch': '0.01074', 'num_input_tokens_seen': 180136, 'train_runtime': '93.89', 'train_tokens_per_second': '1919'}
92
+ {'loss': '1.275', 'grad_norm': '0.3246', 'learning_rate': '5.372e-06', 'epoch': '0.01087', 'num_input_tokens_seen': 182183, 'train_runtime': '94.93', 'train_tokens_per_second': '1919'}
93
+ {'loss': '0.6336', 'grad_norm': '0.2194', 'learning_rate': '5.433e-06', 'epoch': '0.01099', 'num_input_tokens_seen': 184230, 'train_runtime': '95.97', 'train_tokens_per_second': '1920'}
94
+ {'loss': '0.668', 'grad_norm': '0.2253', 'learning_rate': '5.495e-06', 'epoch': '0.01111', 'num_input_tokens_seen': 186277, 'train_runtime': '97.01', 'train_tokens_per_second': '1920'}
95
+ {'loss': '1.824', 'grad_norm': '0.354', 'learning_rate': '5.556e-06', 'epoch': '0.01123', 'num_input_tokens_seen': 188324, 'train_runtime': '98.05', 'train_tokens_per_second': '1921'}
96
+ {'loss': '1.28', 'grad_norm': '0.4487', 'learning_rate': '5.617e-06', 'epoch': '0.01136', 'num_input_tokens_seen': 190371, 'train_runtime': '99.09', 'train_tokens_per_second': '1921'}
97
+ {'loss': '0.6494', 'grad_norm': '0.2398', 'learning_rate': '5.678e-06', 'epoch': '0.01148', 'num_input_tokens_seen': 192418, 'train_runtime': '100.1', 'train_tokens_per_second': '1922'}
98
+ {'loss': '0.6123', 'grad_norm': '0.2938', 'learning_rate': '5.739e-06', 'epoch': '0.0116', 'num_input_tokens_seen': 194465, 'train_runtime': '101.2', 'train_tokens_per_second': '1922'}
99
+ {'loss': '1.243', 'grad_norm': '0.3335', 'learning_rate': '5.8e-06', 'epoch': '0.01172', 'num_input_tokens_seen': 196512, 'train_runtime': '102.2', 'train_tokens_per_second': '1922'}
100
+ {'loss': '1.335', 'grad_norm': '0.3472', 'learning_rate': '5.861e-06', 'epoch': '0.01184', 'num_input_tokens_seen': 198559, 'train_runtime': '103.3', 'train_tokens_per_second': '1923'}
101
+ {'loss': '1.112', 'grad_norm': '0.2869', 'learning_rate': '5.922e-06', 'epoch': '0.01197', 'num_input_tokens_seen': 200606, 'train_runtime': '104.3', 'train_tokens_per_second': '1923'}
102
+ {'loss': '1.557', 'grad_norm': '0.4047', 'learning_rate': '5.983e-06', 'epoch': '0.01209', 'num_input_tokens_seen': 202653, 'train_runtime': '105.3', 'train_tokens_per_second': '1924'}
103
+ {'loss': '1.697', 'grad_norm': '0.4249', 'learning_rate': '6.044e-06', 'epoch': '0.01221', 'num_input_tokens_seen': 204700, 'train_runtime': '106.4', 'train_tokens_per_second': '1924'}
104
+ {'loss': '0.8076', 'grad_norm': '0.2638', 'learning_rate': '6.105e-06', 'epoch': '0.01233', 'num_input_tokens_seen': 206747, 'train_runtime': '107.4', 'train_tokens_per_second': '1925'}
105
+ {'loss': '1.775', 'grad_norm': '0.3715', 'learning_rate': '6.166e-06', 'epoch': '0.01245', 'num_input_tokens_seen': 208794, 'train_runtime': '108.5', 'train_tokens_per_second': '1925'}
106
+ {'loss': '1.606', 'grad_norm': '0.3108', 'learning_rate': '6.227e-06', 'epoch': '0.01258', 'num_input_tokens_seen': 210841, 'train_runtime': '109.5', 'train_tokens_per_second': '1925'}
107
+ {'loss': '1.637', 'grad_norm': '0.3672', 'learning_rate': '6.288e-06', 'epoch': '0.0127', 'num_input_tokens_seen': 212888, 'train_runtime': '110.6', 'train_tokens_per_second': '1926'}
108
+ {'loss': '1.369', 'grad_norm': '0.4352', 'learning_rate': '6.349e-06', 'epoch': '0.01282', 'num_input_tokens_seen': 214935, 'train_runtime': '111.6', 'train_tokens_per_second': '1926'}
109
+ {'loss': '1.386', 'grad_norm': '0.308', 'learning_rate': '6.41e-06', 'epoch': '0.01294', 'num_input_tokens_seen': 216982, 'train_runtime': '112.6', 'train_tokens_per_second': '1926'}
110
+ {'loss': '1.196', 'grad_norm': '0.3402', 'learning_rate': '6.471e-06', 'epoch': '0.01306', 'num_input_tokens_seen': 219029, 'train_runtime': '113.7', 'train_tokens_per_second': '1927'}
111
+ {'loss': '1.117', 'grad_norm': '0.3496', 'learning_rate': '6.532e-06', 'epoch': '0.01319', 'num_input_tokens_seen': 221076, 'train_runtime': '114.7', 'train_tokens_per_second': '1927'}
112
+ {'loss': '1.772', 'grad_norm': '0.3945', 'learning_rate': '6.593e-06', 'epoch': '0.01331', 'num_input_tokens_seen': 223123, 'train_runtime': '115.8', 'train_tokens_per_second': '1927'}
113
+ {'loss': '0.9553', 'grad_norm': '0.2856', 'learning_rate': '6.654e-06', 'epoch': '0.01343', 'num_input_tokens_seen': 225170, 'train_runtime': '116.8', 'train_tokens_per_second': '1928'}
114
+ {'loss': '1.563', 'grad_norm': '0.3784', 'learning_rate': '6.716e-06', 'epoch': '0.01355', 'num_input_tokens_seen': 227217, 'train_runtime': '117.8', 'train_tokens_per_second': '1928'}
115
+ {'loss': '1.567', 'grad_norm': '0.3456', 'learning_rate': '6.777e-06', 'epoch': '0.01368', 'num_input_tokens_seen': 229264, 'train_runtime': '118.9', 'train_tokens_per_second': '1928'}
116
+ {'loss': '0.7048', 'grad_norm': '0.2298', 'learning_rate': '6.838e-06', 'epoch': '0.0138', 'num_input_tokens_seen': 231311, 'train_runtime': '119.9', 'train_tokens_per_second': '1929'}
117
+ {'loss': '1.194', 'grad_norm': '0.3506', 'learning_rate': '6.899e-06', 'epoch': '0.01392', 'num_input_tokens_seen': 233358, 'train_runtime': '121', 'train_tokens_per_second': '1929'}
118
+ {'loss': '0.7762', 'grad_norm': '0.2345', 'learning_rate': '6.96e-06', 'epoch': '0.01404', 'num_input_tokens_seen': 235405, 'train_runtime': '122', 'train_tokens_per_second': '1929'}
119
+ {'loss': '1.459', 'grad_norm': '0.3409', 'learning_rate': '7.021e-06', 'epoch': '0.01416', 'num_input_tokens_seen': 237452, 'train_runtime': '123.1', 'train_tokens_per_second': '1930'}
120
+ {'loss': '0.6121', 'grad_norm': '0.2403', 'learning_rate': '7.082e-06', 'epoch': '0.01429', 'num_input_tokens_seen': 239499, 'train_runtime': '124.1', 'train_tokens_per_second': '1930'}
121
+ {'loss': '1.599', 'grad_norm': '0.299', 'learning_rate': '7.143e-06', 'epoch': '0.01441', 'num_input_tokens_seen': 241546, 'train_runtime': '125.1', 'train_tokens_per_second': '1930'}
122
+ {'loss': '1.771', 'grad_norm': '0.391', 'learning_rate': '7.204e-06', 'epoch': '0.01453', 'num_input_tokens_seen': 243593, 'train_runtime': '126.2', 'train_tokens_per_second': '1930'}
123
+ {'loss': '1.541', 'grad_norm': '0.3111', 'learning_rate': '7.265e-06', 'epoch': '0.01465', 'num_input_tokens_seen': 245640, 'train_runtime': '127.2', 'train_tokens_per_second': '1931'}
124
+ {'loss': '0.7969', 'grad_norm': '0.2717', 'learning_rate': '7.326e-06', 'epoch': '0.01477', 'num_input_tokens_seen': 247687, 'train_runtime': '128.3', 'train_tokens_per_second': '1931'}
125
+ {'loss': '1.567', 'grad_norm': '0.3719', 'learning_rate': '7.387e-06', 'epoch': '0.0149', 'num_input_tokens_seen': 249734, 'train_runtime': '129.3', 'train_tokens_per_second': '1931'}
126
+ {'loss': '1.782', 'grad_norm': '0.3787', 'learning_rate': '7.448e-06', 'epoch': '0.01502', 'num_input_tokens_seen': 251781, 'train_runtime': '130.4', 'train_tokens_per_second': '1931'}
127
+ {'loss': '0.7362', 'grad_norm': '0.2492', 'learning_rate': '7.509e-06', 'epoch': '0.01514', 'num_input_tokens_seen': 253828, 'train_runtime': '131.4', 'train_tokens_per_second': '1932'}
128
+ {'loss': '1.653', 'grad_norm': '0.3752', 'learning_rate': '7.57e-06', 'epoch': '0.01526', 'num_input_tokens_seen': 255875, 'train_runtime': '132.5', 'train_tokens_per_second': '1932'}
129
+ {'loss': '1.619', 'grad_norm': '0.4029', 'learning_rate': '7.631e-06', 'epoch': '0.01538', 'num_input_tokens_seen': 257922, 'train_runtime': '133.5', 'train_tokens_per_second': '1932'}
130
+ {'loss': '1.128', 'grad_norm': '0.3188', 'learning_rate': '7.692e-06', 'epoch': '0.01551', 'num_input_tokens_seen': 259969, 'train_runtime': '134.6', 'train_tokens_per_second': '1932'}
131
+ {'loss': '1.338', 'grad_norm': '0.3356', 'learning_rate': '7.753e-06', 'epoch': '0.01563', 'num_input_tokens_seen': 262016, 'train_runtime': '135.6', 'train_tokens_per_second': '1932'}
132
+ {'loss': '0.7656', 'grad_norm': '0.2505', 'learning_rate': '7.814e-06', 'epoch': '0.01575', 'num_input_tokens_seen': 264063, 'train_runtime': '136.6', 'train_tokens_per_second': '1933'}
133
+ {'loss': '1.375', 'grad_norm': '0.3852', 'learning_rate': '7.875e-06', 'epoch': '0.01587', 'num_input_tokens_seen': 266110, 'train_runtime': '137.7', 'train_tokens_per_second': '1933'}
134
+ {'loss': '0.5618', 'grad_norm': '0.24', 'learning_rate': '7.937e-06', 'epoch': '0.016', 'num_input_tokens_seen': 268157, 'train_runtime': '138.7', 'train_tokens_per_second': '1933'}
135
+ {'loss': '1.335', 'grad_norm': '0.4018', 'learning_rate': '7.998e-06', 'epoch': '0.01612', 'num_input_tokens_seen': 270204, 'train_runtime': '139.8', 'train_tokens_per_second': '1933'}
136
+ {'loss': '1.063', 'grad_norm': '0.2842', 'learning_rate': '8.059e-06', 'epoch': '0.01624', 'num_input_tokens_seen': 272251, 'train_runtime': '140.8', 'train_tokens_per_second': '1933'}
137
+ {'loss': '1.795', 'grad_norm': '0.4447', 'learning_rate': '8.12e-06', 'epoch': '0.01636', 'num_input_tokens_seen': 274298, 'train_runtime': '141.9', 'train_tokens_per_second': '1933'}
138
+ {'loss': '1.664', 'grad_norm': '0.3341', 'learning_rate': '8.181e-06', 'epoch': '0.01648', 'num_input_tokens_seen': 276345, 'train_runtime': '142.9', 'train_tokens_per_second': '1934'}
139
+ {'loss': '1.237', 'grad_norm': '0.2907', 'learning_rate': '8.242e-06', 'epoch': '0.01661', 'num_input_tokens_seen': 278392, 'train_runtime': '144', 'train_tokens_per_second': '1934'}
140
+ {'loss': '1.617', 'grad_norm': '0.3788', 'learning_rate': '8.303e-06', 'epoch': '0.01673', 'num_input_tokens_seen': 280439, 'train_runtime': '145', 'train_tokens_per_second': '1934'}
141
+ {'loss': '1.089', 'grad_norm': '0.3043', 'learning_rate': '8.364e-06', 'epoch': '0.01685', 'num_input_tokens_seen': 282486, 'train_runtime': '146.1', 'train_tokens_per_second': '1934'}
142
+ {'loss': '1.12', 'grad_norm': '0.3281', 'learning_rate': '8.425e-06', 'epoch': '0.01697', 'num_input_tokens_seen': 284533, 'train_runtime': '147.1', 'train_tokens_per_second': '1934'}
143
+ {'loss': '1.408', 'grad_norm': '0.3588', 'learning_rate': '8.486e-06', 'epoch': '0.01709', 'num_input_tokens_seen': 286580, 'train_runtime': '148.1', 'train_tokens_per_second': '1935'}
144
+ {'loss': '1.173', 'grad_norm': '0.3316', 'learning_rate': '8.547e-06', 'epoch': '0.01722', 'num_input_tokens_seen': 288627, 'train_runtime': '149.2', 'train_tokens_per_second': '1935'}
145
+ {'loss': '1.621', 'grad_norm': '0.3899', 'learning_rate': '8.608e-06', 'epoch': '0.01734', 'num_input_tokens_seen': 290674, 'train_runtime': '150.2', 'train_tokens_per_second': '1935'}
146
+ {'loss': '1.247', 'grad_norm': '0.3735', 'learning_rate': '8.669e-06', 'epoch': '0.01746', 'num_input_tokens_seen': 292721, 'train_runtime': '151.3', 'train_tokens_per_second': '1935'}
147
+ {'loss': '1.872', 'grad_norm': '0.4948', 'learning_rate': '8.73e-06', 'epoch': '0.01758', 'num_input_tokens_seen': 294768, 'train_runtime': '152.3', 'train_tokens_per_second': '1935'}
148
+ {'loss': '0.6525', 'grad_norm': '0.2687', 'learning_rate': '8.791e-06', 'epoch': '0.0177', 'num_input_tokens_seen': 296815, 'train_runtime': '153.4', 'train_tokens_per_second': '1936'}
149
+ {'loss': '1.418', 'grad_norm': '0.4128', 'learning_rate': '8.852e-06', 'epoch': '0.01783', 'num_input_tokens_seen': 298862, 'train_runtime': '154.4', 'train_tokens_per_second': '1936'}
150
+ {'loss': '1.428', 'grad_norm': '0.3661', 'learning_rate': '8.913e-06', 'epoch': '0.01795', 'num_input_tokens_seen': 300909, 'train_runtime': '155.4', 'train_tokens_per_second': '1936'}
151
+ {'loss': '1.003', 'grad_norm': '0.3327', 'learning_rate': '8.974e-06', 'epoch': '0.01807', 'num_input_tokens_seen': 302956, 'train_runtime': '156.5', 'train_tokens_per_second': '1936'}
152
+ {'loss': '1.531', 'grad_norm': '0.4244', 'learning_rate': '9.035e-06', 'epoch': '0.01819', 'num_input_tokens_seen': 305003, 'train_runtime': '157.5', 'train_tokens_per_second': '1936'}
153
+ {'loss': '1.635', 'grad_norm': '0.4266', 'learning_rate': '9.096e-06', 'epoch': '0.01832', 'num_input_tokens_seen': 307050, 'train_runtime': '158.6', 'train_tokens_per_second': '1936'}
154
+ {'loss': '1.504', 'grad_norm': '0.3605', 'learning_rate': '9.158e-06', 'epoch': '0.01844', 'num_input_tokens_seen': 309097, 'train_runtime': '159.6', 'train_tokens_per_second': '1936'}
155
+ {'loss': '1.709', 'grad_norm': '0.3912', 'learning_rate': '9.219e-06', 'epoch': '0.01856', 'num_input_tokens_seen': 311144, 'train_runtime': '160.7', 'train_tokens_per_second': '1936'}
156
+ {'loss': '1.367', 'grad_norm': '0.3813', 'learning_rate': '9.28e-06', 'epoch': '0.01868', 'num_input_tokens_seen': 313191, 'train_runtime': '161.7', 'train_tokens_per_second': '1937'}
157
+ {'loss': '1.261', 'grad_norm': '0.3283', 'learning_rate': '9.341e-06', 'epoch': '0.0188', 'num_input_tokens_seen': 315238, 'train_runtime': '162.8', 'train_tokens_per_second': '1937'}
158
+ {'loss': '1.142', 'grad_norm': '0.2797', 'learning_rate': '9.402e-06', 'epoch': '0.01893', 'num_input_tokens_seen': 317285, 'train_runtime': '163.8', 'train_tokens_per_second': '1937'}
159
+ {'loss': '1.054', 'grad_norm': '0.3778', 'learning_rate': '9.463e-06', 'epoch': '0.01905', 'num_input_tokens_seen': 319332, 'train_runtime': '164.8', 'train_tokens_per_second': '1937'}
160
+ {'loss': '1.37', 'grad_norm': '0.3661', 'learning_rate': '9.524e-06', 'epoch': '0.01917', 'num_input_tokens_seen': 321379, 'train_runtime': '165.9', 'train_tokens_per_second': '1937'}
161
+ {'loss': '1.425', 'grad_norm': '0.5471', 'learning_rate': '9.585e-06', 'epoch': '0.01929', 'num_input_tokens_seen': 323426, 'train_runtime': '166.9', 'train_tokens_per_second': '1937'}
162
+ {'loss': '1.088', 'grad_norm': '0.3833', 'learning_rate': '9.646e-06', 'epoch': '0.01941', 'num_input_tokens_seen': 325473, 'train_runtime': '168', 'train_tokens_per_second': '1937'}
163
+ {'loss': '1.332', 'grad_norm': '0.4081', 'learning_rate': '9.707e-06', 'epoch': '0.01954', 'num_input_tokens_seen': 327520, 'train_runtime': '169', 'train_tokens_per_second': '1938'}
164
+ {'loss': '1.821', 'grad_norm': '0.4351', 'learning_rate': '9.768e-06', 'epoch': '0.01966', 'num_input_tokens_seen': 329567, 'train_runtime': '170.1', 'train_tokens_per_second': '1938'}
165
+ {'loss': '1.693', 'grad_norm': '2.017', 'learning_rate': '9.829e-06', 'epoch': '0.01978', 'num_input_tokens_seen': 331614, 'train_runtime': '171.1', 'train_tokens_per_second': '1938'}
166
+ {'loss': '1.377', 'grad_norm': '0.3394', 'learning_rate': '9.89e-06', 'epoch': '0.0199', 'num_input_tokens_seen': 333661, 'train_runtime': '172.2', 'train_tokens_per_second': '1938'}
167
+ {'loss': '1.329', 'grad_norm': '0.3503', 'learning_rate': '9.951e-06', 'epoch': '0.02002', 'num_input_tokens_seen': 335708, 'train_runtime': '173.2', 'train_tokens_per_second': '1938'}
168
+ {'loss': '1.319', 'grad_norm': '0.3434', 'learning_rate': '1.001e-05', 'epoch': '0.02015', 'num_input_tokens_seen': 337755, 'train_runtime': '174.3', 'train_tokens_per_second': '1938'}
169
+ {'loss': '0.7777', 'grad_norm': '0.3284', 'learning_rate': '1.007e-05', 'epoch': '0.02027', 'num_input_tokens_seen': 339802, 'train_runtime': '175.3', 'train_tokens_per_second': '1938'}
170
+ {'loss': '1.453', 'grad_norm': '0.3621', 'learning_rate': '1.013e-05', 'epoch': '0.02039', 'num_input_tokens_seen': 341849, 'train_runtime': '176.4', 'train_tokens_per_second': '1938'}
171
+ {'loss': '1.899', 'grad_norm': '0.5323', 'learning_rate': '1.02e-05', 'epoch': '0.02051', 'num_input_tokens_seen': 343896, 'train_runtime': '177.4', 'train_tokens_per_second': '1939'}
172
+ {'loss': '2.037', 'grad_norm': '0.5038', 'learning_rate': '1.026e-05', 'epoch': '0.02063', 'num_input_tokens_seen': 345943, 'train_runtime': '178.4', 'train_tokens_per_second': '1939'}
173
+ {'loss': '1.384', 'grad_norm': '0.3607', 'learning_rate': '1.032e-05', 'epoch': '0.02076', 'num_input_tokens_seen': 347990, 'train_runtime': '179.5', 'train_tokens_per_second': '1939'}
174
+ {'loss': '1.661', 'grad_norm': '0.4242', 'learning_rate': '1.038e-05', 'epoch': '0.02088', 'num_input_tokens_seen': 350037, 'train_runtime': '180.5', 'train_tokens_per_second': '1939'}
175
+ {'loss': '1.68', 'grad_norm': '0.4849', 'learning_rate': '1.044e-05', 'epoch': '0.021', 'num_input_tokens_seen': 352084, 'train_runtime': '181.6', 'train_tokens_per_second': '1939'}
176
+ {'loss': '1.685', 'grad_norm': '0.555', 'learning_rate': '1.05e-05', 'epoch': '0.02112', 'num_input_tokens_seen': 354131, 'train_runtime': '182.6', 'train_tokens_per_second': '1939'}
177
+ {'loss': '1.141', 'grad_norm': '0.351', 'learning_rate': '1.056e-05', 'epoch': '0.02125', 'num_input_tokens_seen': 356178, 'train_runtime': '183.7', 'train_tokens_per_second': '1939'}
178
+ {'loss': '1.29', 'grad_norm': '0.4115', 'learning_rate': '1.062e-05', 'epoch': '0.02137', 'num_input_tokens_seen': 358225, 'train_runtime': '184.7', 'train_tokens_per_second': '1939'}
179
+ {'loss': '1.293', 'grad_norm': '0.3835', 'learning_rate': '1.068e-05', 'epoch': '0.02149', 'num_input_tokens_seen': 360272, 'train_runtime': '185.8', 'train_tokens_per_second': '1939'}
180
+ {'loss': '1.556', 'grad_norm': '0.4774', 'learning_rate': '1.074e-05', 'epoch': '0.02161', 'num_input_tokens_seen': 362319, 'train_runtime': '186.8', 'train_tokens_per_second': '1940'}
181
+ {'loss': '1.218', 'grad_norm': '0.4011', 'learning_rate': '1.081e-05', 'epoch': '0.02173', 'num_input_tokens_seen': 364366, 'train_runtime': '187.9', 'train_tokens_per_second': '1940'}
182
+ {'loss': '1.299', 'grad_norm': '0.3859', 'learning_rate': '1.087e-05', 'epoch': '0.02186', 'num_input_tokens_seen': 366413, 'train_runtime': '188.9', 'train_tokens_per_second': '1940'}
183
+ {'loss': '1.037', 'grad_norm': '0.3694', 'learning_rate': '1.093e-05', 'epoch': '0.02198', 'num_input_tokens_seen': 368460, 'train_runtime': '189.9', 'train_tokens_per_second': '1940'}
184
+ {'loss': '0.6335', 'grad_norm': '0.2866', 'learning_rate': '1.099e-05', 'epoch': '0.0221', 'num_input_tokens_seen': 370507, 'train_runtime': '191', 'train_tokens_per_second': '1940'}
185
+ {'loss': '0.6538', 'grad_norm': '0.321', 'learning_rate': '1.105e-05', 'epoch': '0.02222', 'num_input_tokens_seen': 372554, 'train_runtime': '192', 'train_tokens_per_second': '1940'}
186
+ {'loss': '1.187', 'grad_norm': '0.3279', 'learning_rate': '1.111e-05', 'epoch': '0.02234', 'num_input_tokens_seen': 374601, 'train_runtime': '193.1', 'train_tokens_per_second': '1940'}
187
+ {'loss': '1.375', 'grad_norm': '0.447', 'learning_rate': '1.117e-05', 'epoch': '0.02247', 'num_input_tokens_seen': 376648, 'train_runtime': '194.1', 'train_tokens_per_second': '1940'}
188
+ {'loss': '0.8847', 'grad_norm': '0.3551', 'learning_rate': '1.123e-05', 'epoch': '0.02259', 'num_input_tokens_seen': 378695, 'train_runtime': '195.2', 'train_tokens_per_second': '1940'}
189
+ {'loss': '1.745', 'grad_norm': '0.5382', 'learning_rate': '1.129e-05', 'epoch': '0.02271', 'num_input_tokens_seen': 380742, 'train_runtime': '196.2', 'train_tokens_per_second': '1940'}
190
+ {'loss': '1.602', 'grad_norm': '0.4624', 'learning_rate': '1.136e-05', 'epoch': '0.02283', 'num_input_tokens_seen': 382789, 'train_runtime': '197.3', 'train_tokens_per_second': '1941'}
191
+ {'loss': '1.474', 'grad_norm': '0.478', 'learning_rate': '1.142e-05', 'epoch': '0.02295', 'num_input_tokens_seen': 384836, 'train_runtime': '198.3', 'train_tokens_per_second': '1941'}
192
+ {'loss': '1.639', 'grad_norm': '0.4799', 'learning_rate': '1.148e-05', 'epoch': '0.02308', 'num_input_tokens_seen': 386883, 'train_runtime': '199.3', 'train_tokens_per_second': '1941'}
193
+ {'loss': '0.8179', 'grad_norm': '0.3443', 'learning_rate': '1.154e-05', 'epoch': '0.0232', 'num_input_tokens_seen': 388930, 'train_runtime': '200.4', 'train_tokens_per_second': '1941'}
194
+ {'loss': '1.302', 'grad_norm': '0.4595', 'learning_rate': '1.16e-05', 'epoch': '0.02332', 'num_input_tokens_seen': 390977, 'train_runtime': '201.4', 'train_tokens_per_second': '1941'}
195
+ {'loss': '0.6097', 'grad_norm': '0.2905', 'learning_rate': '1.166e-05', 'epoch': '0.02344', 'num_input_tokens_seen': 393024, 'train_runtime': '202.5', 'train_tokens_per_second': '1941'}
196
+ {'loss': '0.8993', 'grad_norm': '0.3459', 'learning_rate': '1.172e-05', 'epoch': '0.02357', 'num_input_tokens_seen': 395071, 'train_runtime': '203.5', 'train_tokens_per_second': '1941'}
197
+ {'loss': '1.096', 'grad_norm': '0.4137', 'learning_rate': '1.178e-05', 'epoch': '0.02369', 'num_input_tokens_seen': 397118, 'train_runtime': '204.6', 'train_tokens_per_second': '1941'}
198
+ {'loss': '1.411', 'grad_norm': '0.4383', 'learning_rate': '1.184e-05', 'epoch': '0.02381', 'num_input_tokens_seen': 399165, 'train_runtime': '205.6', 'train_tokens_per_second': '1941'}
199
+ {'loss': '1.204', 'grad_norm': '0.4678', 'learning_rate': '1.19e-05', 'epoch': '0.02393', 'num_input_tokens_seen': 401212, 'train_runtime': '206.7', 'train_tokens_per_second': '1941'}
200
+ {'loss': '1.366', 'grad_norm': '0.424', 'learning_rate': '1.197e-05', 'epoch': '0.02405', 'num_input_tokens_seen': 403259, 'train_runtime': '207.7', 'train_tokens_per_second': '1942'}
201
+ {'loss': '0.992', 'grad_norm': '0.3474', 'learning_rate': '1.203e-05', 'epoch': '0.02418', 'num_input_tokens_seen': 405306, 'train_runtime': '208.7', 'train_tokens_per_second': '1942'}
202
+ {'loss': '1.445', 'grad_norm': '0.485', 'learning_rate': '1.209e-05', 'epoch': '0.0243', 'num_input_tokens_seen': 407353, 'train_runtime': '209.8', 'train_tokens_per_second': '1942'}
203
+ {'loss': '1.563', 'grad_norm': '0.4729', 'learning_rate': '1.215e-05', 'epoch': '0.02442', 'num_input_tokens_seen': 409400, 'train_runtime': '210.8', 'train_tokens_per_second': '1942'}
204
+ {'loss': '1.273', 'grad_norm': '0.4405', 'learning_rate': '1.221e-05', 'epoch': '0.02454', 'num_input_tokens_seen': 411447, 'train_runtime': '211.9', 'train_tokens_per_second': '1942'}
205
+ {'loss': '1.156', 'grad_norm': '0.4594', 'learning_rate': '1.227e-05', 'epoch': '0.02466', 'num_input_tokens_seen': 413494, 'train_runtime': '212.9', 'train_tokens_per_second': '1942'}
206
+ {'loss': '1.284', 'grad_norm': '0.5923', 'learning_rate': '1.233e-05', 'epoch': '0.02479', 'num_input_tokens_seen': 415541, 'train_runtime': '214', 'train_tokens_per_second': '1942'}
207
+ {'loss': '1.57', 'grad_norm': '0.517', 'learning_rate': '1.239e-05', 'epoch': '0.02491', 'num_input_tokens_seen': 417588, 'train_runtime': '215', 'train_tokens_per_second': '1942'}
208
+ {'loss': '0.7737', 'grad_norm': '0.3465', 'learning_rate': '1.245e-05', 'epoch': '0.02503', 'num_input_tokens_seen': 419635, 'train_runtime': '216.1', 'train_tokens_per_second': '1942'}
209
+ {'loss': '1.291', 'grad_norm': '0.4848', 'learning_rate': '1.252e-05', 'epoch': '0.02515', 'num_input_tokens_seen': 421682, 'train_runtime': '217.1', 'train_tokens_per_second': '1942'}
210
+ {'loss': '1.418', 'grad_norm': '0.4659', 'learning_rate': '1.258e-05', 'epoch': '0.02527', 'num_input_tokens_seen': 423729, 'train_runtime': '218.2', 'train_tokens_per_second': '1942'}
211
+ {'loss': '1.081', 'grad_norm': '0.4705', 'learning_rate': '1.264e-05', 'epoch': '0.0254', 'num_input_tokens_seen': 425776, 'train_runtime': '219.2', 'train_tokens_per_second': '1942'}
212
+ {'loss': '1.675', 'grad_norm': '0.4767', 'learning_rate': '1.27e-05', 'epoch': '0.02552', 'num_input_tokens_seen': 427823, 'train_runtime': '220.2', 'train_tokens_per_second': '1942'}
213
+ {'loss': '1.681', 'grad_norm': '0.5783', 'learning_rate': '1.276e-05', 'epoch': '0.02564', 'num_input_tokens_seen': 429870, 'train_runtime': '221.3', 'train_tokens_per_second': '1943'}
214
+ {'loss': '1.452', 'grad_norm': '0.4866', 'learning_rate': '1.282e-05', 'epoch': '0.02576', 'num_input_tokens_seen': 431917, 'train_runtime': '222.3', 'train_tokens_per_second': '1943'}
215
+ {'loss': '0.9691', 'grad_norm': '0.4056', 'learning_rate': '1.288e-05', 'epoch': '0.02589', 'num_input_tokens_seen': 433964, 'train_runtime': '223.4', 'train_tokens_per_second': '1943'}
216
+ {'loss': '0.6256', 'grad_norm': '0.3151', 'learning_rate': '1.294e-05', 'epoch': '0.02601', 'num_input_tokens_seen': 436011, 'train_runtime': '224.4', 'train_tokens_per_second': '1943'}
217
+ {'loss': '0.6349', 'grad_norm': '0.3113', 'learning_rate': '1.3e-05', 'epoch': '0.02613', 'num_input_tokens_seen': 438058, 'train_runtime': '225.5', 'train_tokens_per_second': '1943'}
218
+ {'loss': '1.575', 'grad_norm': '0.6033', 'learning_rate': '1.306e-05', 'epoch': '0.02625', 'num_input_tokens_seen': 440105, 'train_runtime': '226.5', 'train_tokens_per_second': '1943'}
219
+ {'loss': '1.585', 'grad_norm': '0.5161', 'learning_rate': '1.313e-05', 'epoch': '0.02637', 'num_input_tokens_seen': 442152, 'train_runtime': '227.6', 'train_tokens_per_second': '1943'}
220
+ {'loss': '1.182', 'grad_norm': '0.4157', 'learning_rate': '1.319e-05', 'epoch': '0.0265', 'num_input_tokens_seen': 444199, 'train_runtime': '228.6', 'train_tokens_per_second': '1943'}
221
+ {'loss': '1.789', 'grad_norm': '0.6525', 'learning_rate': '1.325e-05', 'epoch': '0.02662', 'num_input_tokens_seen': 446246, 'train_runtime': '229.7', 'train_tokens_per_second': '1943'}
222
+ {'loss': '0.6394', 'grad_norm': '0.332', 'learning_rate': '1.331e-05', 'epoch': '0.02674', 'num_input_tokens_seen': 448293, 'train_runtime': '230.7', 'train_tokens_per_second': '1943'}
223
+ {'loss': '1.595', 'grad_norm': '0.5779', 'learning_rate': '1.337e-05', 'epoch': '0.02686', 'num_input_tokens_seen': 450340, 'train_runtime': '231.7', 'train_tokens_per_second': '1943'}
224
+ {'loss': '0.8082', 'grad_norm': '0.3568', 'learning_rate': '1.343e-05', 'epoch': '0.02698', 'num_input_tokens_seen': 452387, 'train_runtime': '232.8', 'train_tokens_per_second': '1943'}
225
+ {'loss': '1.479', 'grad_norm': '0.5858', 'learning_rate': '1.349e-05', 'epoch': '0.02711', 'num_input_tokens_seen': 454434, 'train_runtime': '233.8', 'train_tokens_per_second': '1943'}
226
+ {'loss': '1.147', 'grad_norm': '0.4227', 'learning_rate': '1.355e-05', 'epoch': '0.02723', 'num_input_tokens_seen': 456481, 'train_runtime': '234.9', 'train_tokens_per_second': '1943'}
227
+ {'loss': '1.603', 'grad_norm': '0.4923', 'learning_rate': '1.361e-05', 'epoch': '0.02735', 'num_input_tokens_seen': 458528, 'train_runtime': '235.9', 'train_tokens_per_second': '1943'}
228
+ {'loss': '1.538', 'grad_norm': '0.5759', 'learning_rate': '1.368e-05', 'epoch': '0.02747', 'num_input_tokens_seen': 460575, 'train_runtime': '237', 'train_tokens_per_second': '1944'}
229
+ {'loss': '0.7194', 'grad_norm': '0.3567', 'learning_rate': '1.374e-05', 'epoch': '0.02759', 'num_input_tokens_seen': 462622, 'train_runtime': '238', 'train_tokens_per_second': '1944'}
230
+ {'loss': '1.721', 'grad_norm': '0.5946', 'learning_rate': '1.38e-05', 'epoch': '0.02772', 'num_input_tokens_seen': 464669, 'train_runtime': '239.1', 'train_tokens_per_second': '1944'}
231
+ {'loss': '1.277', 'grad_norm': '0.5085', 'learning_rate': '1.386e-05', 'epoch': '0.02784', 'num_input_tokens_seen': 466716, 'train_runtime': '240.1', 'train_tokens_per_second': '1944'}
232
+ {'loss': '1.659', 'grad_norm': '0.6458', 'learning_rate': '1.392e-05', 'epoch': '0.02796', 'num_input_tokens_seen': 468763, 'train_runtime': '241.2', 'train_tokens_per_second': '1944'}
233
+ {'loss': '1.555', 'grad_norm': '0.4655', 'learning_rate': '1.398e-05', 'epoch': '0.02808', 'num_input_tokens_seen': 470810, 'train_runtime': '242.2', 'train_tokens_per_second': '1944'}
234
+ {'loss': '1.236', 'grad_norm': '0.5168', 'learning_rate': '1.404e-05', 'epoch': '0.02821', 'num_input_tokens_seen': 472857, 'train_runtime': '243.2', 'train_tokens_per_second': '1944'}
235
+ {'loss': '1.659', 'grad_norm': '0.5702', 'learning_rate': '1.41e-05', 'epoch': '0.02833', 'num_input_tokens_seen': 474904, 'train_runtime': '244.3', 'train_tokens_per_second': '1944'}
236
+ {'loss': '1.295', 'grad_norm': '0.4997', 'learning_rate': '1.416e-05', 'epoch': '0.02845', 'num_input_tokens_seen': 476951, 'train_runtime': '245.3', 'train_tokens_per_second': '1944'}
237
+ {'loss': '1.355', 'grad_norm': '0.5255', 'learning_rate': '1.422e-05', 'epoch': '0.02857', 'num_input_tokens_seen': 478998, 'train_runtime': '246.4', 'train_tokens_per_second': '1944'}
238
+ {'loss': '1.326', 'grad_norm': '0.59', 'learning_rate': '1.429e-05', 'epoch': '0.02869', 'num_input_tokens_seen': 481045, 'train_runtime': '247.4', 'train_tokens_per_second': '1944'}
239
+ {'loss': '0.8155', 'grad_norm': '0.4466', 'learning_rate': '1.435e-05', 'epoch': '0.02882', 'num_input_tokens_seen': 483092, 'train_runtime': '248.5', 'train_tokens_per_second': '1944'}
240
+ {'loss': '1.097', 'grad_norm': '0.431', 'learning_rate': '1.441e-05', 'epoch': '0.02894', 'num_input_tokens_seen': 485139, 'train_runtime': '249.5', 'train_tokens_per_second': '1944'}
241
+ {'loss': '1.442', 'grad_norm': '0.6068', 'learning_rate': '1.447e-05', 'epoch': '0.02906', 'num_input_tokens_seen': 487186, 'train_runtime': '250.6', 'train_tokens_per_second': '1944'}
242
+ {'loss': '0.6167', 'grad_norm': '0.3797', 'learning_rate': '1.453e-05', 'epoch': '0.02918', 'num_input_tokens_seen': 489233, 'train_runtime': '251.6', 'train_tokens_per_second': '1944'}
243
+ {'loss': '1.099', 'grad_norm': '0.4898', 'learning_rate': '1.459e-05', 'epoch': '0.0293', 'num_input_tokens_seen': 491280, 'train_runtime': '252.7', 'train_tokens_per_second': '1944'}
244
+ {'loss': '1.663', 'grad_norm': '0.7464', 'learning_rate': '1.465e-05', 'epoch': '0.02943', 'num_input_tokens_seen': 493327, 'train_runtime': '253.7', 'train_tokens_per_second': '1944'}
245
+ {'loss': '0.7168', 'grad_norm': '0.4142', 'learning_rate': '1.471e-05', 'epoch': '0.02955', 'num_input_tokens_seen': 495374, 'train_runtime': '254.7', 'train_tokens_per_second': '1945'}
246
+ {'loss': '2.189', 'grad_norm': '0.7521', 'learning_rate': '1.477e-05', 'epoch': '0.02967', 'num_input_tokens_seen': 497421, 'train_runtime': '255.8', 'train_tokens_per_second': '1945'}
247
+ {'loss': '1.161', 'grad_norm': '0.5383', 'learning_rate': '1.484e-05', 'epoch': '0.02979', 'num_input_tokens_seen': 499468, 'train_runtime': '256.8', 'train_tokens_per_second': '1945'}
248
+ {'loss': '0.7095', 'grad_norm': '0.363', 'learning_rate': '1.49e-05', 'epoch': '0.02991', 'num_input_tokens_seen': 501515, 'train_runtime': '257.9', 'train_tokens_per_second': '1945'}
249
+ {'loss': '1.675', 'grad_norm': '0.5704', 'learning_rate': '1.496e-05', 'epoch': '0.03004', 'num_input_tokens_seen': 503562, 'train_runtime': '258.9', 'train_tokens_per_second': '1945'}
250
+ {'loss': '1.544', 'grad_norm': '0.6231', 'learning_rate': '1.502e-05', 'epoch': '0.03016', 'num_input_tokens_seen': 505609, 'train_runtime': '260', 'train_tokens_per_second': '1945'}
251
+ {'loss': '1.202', 'grad_norm': '0.5518', 'learning_rate': '1.508e-05', 'epoch': '0.03028', 'num_input_tokens_seen': 507656, 'train_runtime': '261', 'train_tokens_per_second': '1945'}
252
+ {'loss': '1.31', 'grad_norm': '0.4917', 'learning_rate': '1.514e-05', 'epoch': '0.0304', 'num_input_tokens_seen': 509703, 'train_runtime': '262.1', 'train_tokens_per_second': '1945'}
253
+ {'loss': '1.394', 'grad_norm': '0.4971', 'learning_rate': '1.52e-05', 'epoch': '0.03053', 'num_input_tokens_seen': 511750, 'train_runtime': '263.1', 'train_tokens_per_second': '1945'}
254
+ {'loss': '1.184', 'grad_norm': '0.4955', 'learning_rate': '1.526e-05', 'epoch': '0.03065', 'num_input_tokens_seen': 513797, 'train_runtime': '264.2', 'train_tokens_per_second': '1945'}
255
+ {'loss': '1.614', 'grad_norm': '0.8268', 'learning_rate': '1.532e-05', 'epoch': '0.03077', 'num_input_tokens_seen': 515844, 'train_runtime': '265.2', 'train_tokens_per_second': '1945'}
256
+ {'loss': '1.911', 'grad_norm': '0.7492', 'learning_rate': '1.538e-05', 'epoch': '0.03089', 'num_input_tokens_seen': 517891, 'train_runtime': '266.3', 'train_tokens_per_second': '1945'}
257
+ {'loss': '1.459', 'grad_norm': '0.5983', 'learning_rate': '1.545e-05', 'epoch': '0.03101', 'num_input_tokens_seen': 519938, 'train_runtime': '267.3', 'train_tokens_per_second': '1945'}
258
+ {'loss': '1.595', 'grad_norm': '0.6762', 'learning_rate': '1.551e-05', 'epoch': '0.03114', 'num_input_tokens_seen': 521985, 'train_runtime': '268.4', 'train_tokens_per_second': '1945'}
259
+ {'loss': '0.6932', 'grad_norm': '0.4707', 'learning_rate': '1.557e-05', 'epoch': '0.03126', 'num_input_tokens_seen': 524032, 'train_runtime': '269.4', 'train_tokens_per_second': '1945'}
260
+ {'loss': '2.117', 'grad_norm': '0.7636', 'learning_rate': '1.563e-05', 'epoch': '0.03138', 'num_input_tokens_seen': 526079, 'train_runtime': '270.5', 'train_tokens_per_second': '1945'}
261
+ {'loss': '1.121', 'grad_norm': '0.478', 'learning_rate': '1.569e-05', 'epoch': '0.0315', 'num_input_tokens_seen': 528126, 'train_runtime': '271.5', 'train_tokens_per_second': '1945'}
262
+ {'loss': '1.432', 'grad_norm': '0.6419', 'learning_rate': '1.575e-05', 'epoch': '0.03162', 'num_input_tokens_seen': 530173, 'train_runtime': '272.6', 'train_tokens_per_second': '1945'}
263
+ {'loss': '0.7377', 'grad_norm': '0.413', 'learning_rate': '1.581e-05', 'epoch': '0.03175', 'num_input_tokens_seen': 532220, 'train_runtime': '273.6', 'train_tokens_per_second': '1945'}
264
+ {'loss': '1.552', 'grad_norm': '0.6274', 'learning_rate': '1.587e-05', 'epoch': '0.03187', 'num_input_tokens_seen': 534267, 'train_runtime': '274.7', 'train_tokens_per_second': '1945'}
265
+ {'loss': '1.128', 'grad_norm': '0.536', 'learning_rate': '1.593e-05', 'epoch': '0.03199', 'num_input_tokens_seen': 536314, 'train_runtime': '275.7', 'train_tokens_per_second': '1945'}
266
+ {'loss': '1.204', 'grad_norm': '0.544', 'learning_rate': '1.6e-05', 'epoch': '0.03211', 'num_input_tokens_seen': 538361, 'train_runtime': '276.8', 'train_tokens_per_second': '1945'}
267
+ {'loss': '0.01306', 'grad_norm': '0.06258', 'learning_rate': '1.606e-05', 'epoch': '0.03223', 'num_input_tokens_seen': 540408, 'train_runtime': '277.8', 'train_tokens_per_second': '1945'}
268
+ {'loss': '1.558', 'grad_norm': '0.6964', 'learning_rate': '1.612e-05', 'epoch': '0.03236', 'num_input_tokens_seen': 542455, 'train_runtime': '278.8', 'train_tokens_per_second': '1945'}
269
+ {'loss': '1.02', 'grad_norm': '0.509', 'learning_rate': '1.618e-05', 'epoch': '0.03248', 'num_input_tokens_seen': 544502, 'train_runtime': '279.9', 'train_tokens_per_second': '1945'}
270
+ {'loss': '1.581', 'grad_norm': '0.6765', 'learning_rate': '1.624e-05', 'epoch': '0.0326', 'num_input_tokens_seen': 546549, 'train_runtime': '280.9', 'train_tokens_per_second': '1946'}
271
+ {'loss': '0.7899', 'grad_norm': '0.4745', 'learning_rate': '1.63e-05', 'epoch': '0.03272', 'num_input_tokens_seen': 548596, 'train_runtime': '282', 'train_tokens_per_second': '1946'}
272
+ {'loss': '1.312', 'grad_norm': '0.613', 'learning_rate': '1.636e-05', 'epoch': '0.03284', 'num_input_tokens_seen': 550643, 'train_runtime': '283', 'train_tokens_per_second': '1946'}
273
+ {'loss': '1.312', 'grad_norm': '0.6338', 'learning_rate': '1.642e-05', 'epoch': '0.03297', 'num_input_tokens_seen': 552690, 'train_runtime': '284.1', 'train_tokens_per_second': '1946'}
274
+ {'loss': '0.7668', 'grad_norm': '0.4715', 'learning_rate': '1.648e-05', 'epoch': '0.03309', 'num_input_tokens_seen': 554737, 'train_runtime': '285.1', 'train_tokens_per_second': '1946'}
275
+ {'loss': '1.125', 'grad_norm': '0.6008', 'learning_rate': '1.654e-05', 'epoch': '0.03321', 'num_input_tokens_seen': 556784, 'train_runtime': '286.1', 'train_tokens_per_second': '1946'}
276
+ {'loss': '1.317', 'grad_norm': '0.6867', 'learning_rate': '1.661e-05', 'epoch': '0.03333', 'num_input_tokens_seen': 558831, 'train_runtime': '287.2', 'train_tokens_per_second': '1946'}
277
+ {'loss': '1.421', 'grad_norm': '0.6412', 'learning_rate': '1.667e-05', 'epoch': '0.03346', 'num_input_tokens_seen': 560878, 'train_runtime': '288.2', 'train_tokens_per_second': '1946'}
278
+ {'loss': '1.625', 'grad_norm': '0.7158', 'learning_rate': '1.673e-05', 'epoch': '0.03358', 'num_input_tokens_seen': 562925, 'train_runtime': '289.3', 'train_tokens_per_second': '1946'}
279
+ {'loss': '1.191', 'grad_norm': '0.6911', 'learning_rate': '1.679e-05', 'epoch': '0.0337', 'num_input_tokens_seen': 564972, 'train_runtime': '290.3', 'train_tokens_per_second': '1946'}
280
+ {'loss': '0.6447', 'grad_norm': '0.5162', 'learning_rate': '1.685e-05', 'epoch': '0.03382', 'num_input_tokens_seen': 567019, 'train_runtime': '291.4', 'train_tokens_per_second': '1946'}
281
+ {'loss': '0.8032', 'grad_norm': '0.4759', 'learning_rate': '1.691e-05', 'epoch': '0.03394', 'num_input_tokens_seen': 569066, 'train_runtime': '292.4', 'train_tokens_per_second': '1946'}
282
+ {'loss': '1.107', 'grad_norm': '0.5404', 'learning_rate': '1.697e-05', 'epoch': '0.03407', 'num_input_tokens_seen': 571113, 'train_runtime': '293.4', 'train_tokens_per_second': '1946'}
283
+ {'loss': '1.319', 'grad_norm': '0.7111', 'learning_rate': '1.703e-05', 'epoch': '0.03419', 'num_input_tokens_seen': 573160, 'train_runtime': '294.5', 'train_tokens_per_second': '1946'}
284
+ {'loss': '1.366', 'grad_norm': '0.6837', 'learning_rate': '1.709e-05', 'epoch': '0.03431', 'num_input_tokens_seen': 575207, 'train_runtime': '295.5', 'train_tokens_per_second': '1946'}
285
+ {'loss': '1.553', 'grad_norm': '0.767', 'learning_rate': '1.716e-05', 'epoch': '0.03443', 'num_input_tokens_seen': 577254, 'train_runtime': '296.6', 'train_tokens_per_second': '1946'}
286
+ {'loss': '0.7748', 'grad_norm': '0.5244', 'learning_rate': '1.722e-05', 'epoch': '0.03455', 'num_input_tokens_seen': 579301, 'train_runtime': '297.6', 'train_tokens_per_second': '1947'}
287
+ {'loss': '0.6421', 'grad_norm': '0.4703', 'learning_rate': '1.728e-05', 'epoch': '0.03468', 'num_input_tokens_seen': 581348, 'train_runtime': '298.6', 'train_tokens_per_second': '1947'}
288
+ {'loss': '1.299', 'grad_norm': '0.7458', 'learning_rate': '1.734e-05', 'epoch': '0.0348', 'num_input_tokens_seen': 583395, 'train_runtime': '299.7', 'train_tokens_per_second': '1947'}
289
+ {'loss': '1.37', 'grad_norm': '0.766', 'learning_rate': '1.74e-05', 'epoch': '0.03492', 'num_input_tokens_seen': 585442, 'train_runtime': '300.7', 'train_tokens_per_second': '1947'}
290
+ {'loss': '1.21', 'grad_norm': '0.7069', 'learning_rate': '1.746e-05', 'epoch': '0.03504', 'num_input_tokens_seen': 587489, 'train_runtime': '301.8', 'train_tokens_per_second': '1947'}
291
+ {'loss': '1.371', 'grad_norm': '0.7178', 'learning_rate': '1.752e-05', 'epoch': '0.03516', 'num_input_tokens_seen': 589536, 'train_runtime': '302.8', 'train_tokens_per_second': '1947'}
292
+ {'loss': '0.6646', 'grad_norm': '0.5199', 'learning_rate': '1.758e-05', 'epoch': '0.03529', 'num_input_tokens_seen': 591583, 'train_runtime': '303.9', 'train_tokens_per_second': '1947'}
293
+ {'loss': '1.354', 'grad_norm': '0.6725', 'learning_rate': '1.764e-05', 'epoch': '0.03541', 'num_input_tokens_seen': 593630, 'train_runtime': '304.9', 'train_tokens_per_second': '1947'}
294
+ {'loss': '0.01388', 'grad_norm': '0.07445', 'learning_rate': '1.77e-05', 'epoch': '0.03553', 'num_input_tokens_seen': 595677, 'train_runtime': '305.9', 'train_tokens_per_second': '1947'}
295
+ {'loss': '0.7751', 'grad_norm': '0.5144', 'learning_rate': '1.777e-05', 'epoch': '0.03565', 'num_input_tokens_seen': 597724, 'train_runtime': '307', 'train_tokens_per_second': '1947'}
296
+ {'loss': '1.217', 'grad_norm': '6.529', 'learning_rate': '1.783e-05', 'epoch': '0.03578', 'num_input_tokens_seen': 599771, 'train_runtime': '308', 'train_tokens_per_second': '1947'}
297
+ {'loss': '1.091', 'grad_norm': '0.6843', 'learning_rate': '1.789e-05', 'epoch': '0.0359', 'num_input_tokens_seen': 601818, 'train_runtime': '309.1', 'train_tokens_per_second': '1947'}
298
+ {'loss': '1.242', 'grad_norm': '0.6793', 'learning_rate': '1.795e-05', 'epoch': '0.03602', 'num_input_tokens_seen': 603865, 'train_runtime': '310.1', 'train_tokens_per_second': '1947'}
299
+ {'loss': '1.213', 'grad_norm': '0.6243', 'learning_rate': '1.801e-05', 'epoch': '0.03614', 'num_input_tokens_seen': 605912, 'train_runtime': '311.1', 'train_tokens_per_second': '1947'}
LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/requirements.txt ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pytz==2025.2
2
+ pydub==0.25.1
3
+ brotli==1.2.0
4
+ antlr4-python3-runtime==4.9.3
5
+ xxhash==3.6.0
6
+ websockets==15.0.1
7
+ tzdata==2025.3
8
+ typing_extensions==4.15.0
9
+ tqdm==4.67.3
10
+ tomlkit==0.13.3
11
+ termcolor==3.3.0
12
+ shtab==1.8.0
13
+ shellingham==1.5.4
14
+ sentencepiece==0.2.1
15
+ semantic-version==2.10.0
16
+ safetensors==0.7.0
17
+ ruff==0.15.0
18
+ regex==2026.1.15
19
+ python-multipart==0.0.22
20
+ pyparsing==3.3.2
21
+ pyarrow==23.0.0
22
+ protobuf==6.33.5
23
+ propcache==0.4.1
24
+ orjson==3.11.7
25
+ omegaconf==2.3.0
26
+ numpy==2.4.2
27
+ multidict==6.7.1
28
+ mdurl==0.1.2
29
+ kiwisolver==1.4.9
30
+ hf-xet==1.2.0
31
+ hf_transfer==0.1.9
32
+ groovy==0.1.2
33
+ frozenlist==1.8.0
34
+ fonttools==4.61.1
35
+ ffmpy==1.0.0
36
+ einops==0.8.2
37
+ docstring_parser==0.17.0
38
+ dill==0.3.8
39
+ cycler==0.12.1
40
+ click==8.3.1
41
+ av==16.0.0
42
+ annotated-types==0.7.0
43
+ annotated-doc==0.0.4
44
+ aiohappyeyeballs==2.6.1
45
+ aiofiles==24.1.0
46
+ yarl==1.22.0
47
+ uvicorn==0.40.0
48
+ typing-inspection==0.4.2
49
+ typer-slim==0.21.1
50
+ tiktoken==0.12.0
51
+ scipy==1.17.0
52
+ pydantic_core==2.41.4
53
+ pandas==2.3.3
54
+ multiprocess==0.70.16
55
+ modelscope==1.34.0
56
+ markdown-it-py==4.0.0
57
+ fire==0.7.1
58
+ contourpy==1.3.3
59
+ anyio==4.12.1
60
+ aiosignal==1.4.0
61
+ starlette==0.50.0
62
+ rich==14.3.2
63
+ pydantic==2.12.3
64
+ matplotlib==3.10.8
65
+ aiohttp==3.13.3
66
+ tyro==0.8.14
67
+ typer==0.21.1
68
+ torchdata==0.11.0
69
+ sse-starlette==3.2.0
70
+ safehttpx==0.1.7
71
+ huggingface_hub==1.3.7
72
+ fastapi==0.128.0
73
+ tokenizers==0.22.2
74
+ gradio_client==1.14.0
75
+ datasets==4.0.0
76
+ accelerate==1.11.0
77
+ transformers==5.0.0
78
+ gradio==5.50.0
79
+ trl==0.24.0
80
+ peft==0.18.1
81
+ jieba==0.42.1
82
+ rouge-chinese==1.0.3
83
+ joblib==1.5.3
84
+ nltk==3.9.2
85
+ llamafactory==0.9.5.dev0
86
+ py-cpuinfo==9.0.0
87
+ nvidia-ml-py==13.590.48
88
+ hjson==3.1.0
89
+ ninja==1.13.0
90
+ msgpack==1.1.2
91
+ deepspeed==0.16.9
92
+ smmap==5.0.2
93
+ sentry-sdk==2.51.0
94
+ gitdb==4.0.12
95
+ GitPython==3.1.46
96
+ wandb==0.24.1
97
+ entrypoints==0.4
98
+ jupyter_client==7.4.9
99
+ nbclassic==1.1.0
100
+ notebook==6.5.5
101
+ pyzmq==24.0.1
102
+ PyYAML==6.0.2
103
+ Send2Trash==1.8.3
104
+ argon2-cffi==23.1.0
105
+ argon2-cffi-bindings==21.2.0
106
+ arrow==1.3.0
107
+ asttokens==2.4.1
108
+ async-lru==2.0.4
109
+ attrs==24.2.0
110
+ babel==2.16.0
111
+ beautifulsoup4==4.12.3
112
+ bleach==6.1.0
113
+ certifi==2024.8.30
114
+ cffi==1.17.1
115
+ charset-normalizer==3.3.2
116
+ comm==0.2.2
117
+ debugpy==1.8.5
118
+ decorator==5.1.1
119
+ defusedxml==0.7.1
120
+ executing==2.1.0
121
+ fastjsonschema==2.20.0
122
+ fqdn==1.5.1
123
+ h11==0.14.0
124
+ httpcore==1.0.5
125
+ httpx==0.27.2
126
+ idna==3.10
127
+ ipykernel==6.29.5
128
+ ipython==8.27.0
129
+ ipython-genutils==0.2.0
130
+ ipywidgets==8.1.5
131
+ isoduration==20.11.0
132
+ jedi==0.19.1
133
+ json5==0.9.25
134
+ jsonpointer==3.0.0
135
+ jsonschema==4.23.0
136
+ jsonschema-specifications==2023.12.1
137
+ jupyter-archive==3.4.0
138
+ jupyter_contrib_core==0.4.2
139
+ jupyter_contrib_nbextensions==0.7.0
140
+ jupyter_core==5.7.2
141
+ jupyter-events==0.10.0
142
+ jupyter-highlight-selected-word==0.2.0
143
+ jupyter-lsp==2.2.5
144
+ jupyter_nbextensions_configurator==0.6.4
145
+ jupyter_server==2.14.2
146
+ jupyter_server_terminals==0.5.3
147
+ jupyterlab==4.2.5
148
+ jupyterlab_pygments==0.3.0
149
+ jupyterlab_server==2.27.3
150
+ jupyterlab_widgets==3.0.13
151
+ lxml==5.3.0
152
+ matplotlib-inline==0.1.7
153
+ mistune==3.0.2
154
+ nbclient==0.10.0
155
+ nbconvert==7.16.4
156
+ nbformat==5.10.4
157
+ nest-asyncio==1.6.0
158
+ notebook_shim==0.2.4
159
+ overrides==7.7.0
160
+ packaging==24.1
161
+ pandocfilters==1.5.1
162
+ parso==0.8.4
163
+ pexpect==4.9.0
164
+ platformdirs==4.3.6
165
+ prometheus_client==0.21.0
166
+ prompt_toolkit==3.0.47
167
+ psutil==6.0.0
168
+ ptyprocess==0.7.0
169
+ pure_eval==0.2.3
170
+ pycparser==2.22
171
+ Pygments==2.18.0
172
+ python-dateutil==2.9.0.post0
173
+ python-json-logger==2.0.7
174
+ referencing==0.35.1
175
+ requests==2.32.3
176
+ rfc3339-validator==0.1.4
177
+ rfc3986-validator==0.1.1
178
+ rpds-py==0.20.0
179
+ sniffio==1.3.1
180
+ soupsieve==2.6
181
+ stack-data==0.6.3
182
+ terminado==0.18.1
183
+ tinycss2==1.3.0
184
+ tornado==6.4.1
185
+ traitlets==5.14.3
186
+ types-python-dateutil==2.9.0.20240906
187
+ uri-template==1.3.0
188
+ urllib3==2.2.3
189
+ wcwidth==0.2.13
190
+ webcolors==24.8.0
191
+ webencodings==0.5.1
192
+ websocket-client==1.8.0
193
+ widgetsnbextension==4.0.13
194
+ Jinja2==3.1.3
195
+ MarkupSafe==2.1.5
196
+ filelock==3.13.1
197
+ fsspec==2024.2.0
198
+ mpmath==1.3.0
199
+ networkx==3.2.1
200
+ nvidia-cublas-cu12==12.4.2.65
201
+ nvidia-cuda-cupti-cu12==12.4.99
202
+ nvidia-cuda-nvrtc-cu12==12.4.99
203
+ nvidia-cuda-runtime-cu12==12.4.99
204
+ nvidia-cudnn-cu12==9.1.0.70
205
+ nvidia-cufft-cu12==11.2.0.44
206
+ nvidia-curand-cu12==10.3.5.119
207
+ nvidia-cusolver-cu12==11.6.0.99
208
+ nvidia-cusparse-cu12==12.3.0.142
209
+ nvidia-nccl-cu12==2.20.5
210
+ nvidia-nvjitlink-cu12==12.4.99
211
+ nvidia-nvtx-cu12==12.4.99
212
+ pillow==10.2.0
213
+ sympy==1.12
214
+ torch==2.4.1+cu124
215
+ torchaudio==2.4.1+cu124
216
+ torchvision==0.19.1+cu124
217
+ triton==3.0.0
218
+ pip==24.2
219
+ setuptools==75.1.0
220
+ wheel==0.44.0
221
+ PyGObject==3.42.1
222
+ PyJWT==2.3.0
223
+ SecretStorage==3.3.1
224
+ blinker==1.4
225
+ cryptography==3.4.8
226
+ dbus-python==1.2.18
227
+ distro==1.7.0
228
+ httplib2==0.20.2
229
+ importlib-metadata==4.6.4
230
+ jeepney==0.7.1
231
+ keyring==23.5.0
232
+ launchpadlib==1.10.16
233
+ lazr.restfulclient==0.14.4
234
+ lazr.uri==1.0.6
235
+ more-itertools==8.10.0
236
+ oauthlib==3.2.0
237
+ python-apt==2.4.0+ubuntu4
238
+ six==1.16.0
239
+ wadllib==1.3.6
240
+ zipp==1.0.0
241
+ autocommand==2.2.2
242
+ backports.tarfile==1.2.0
243
+ importlib_metadata==8.0.0
244
+ importlib_resources==6.4.0
245
+ inflect==7.3.1
246
+ jaraco.collections==5.1.0
247
+ jaraco.context==5.3.0
248
+ jaraco.functools==4.0.1
249
+ jaraco.text==3.12.1
250
+ more-itertools==10.3.0
251
+ packaging==24.1
252
+ platformdirs==4.2.2
253
+ tomli==2.0.1
254
+ typeguard==4.3.0
255
+ typing_extensions==4.12.2
256
+ wheel==0.43.0
257
+ zipp==3.19.2
LlamaFactory/wandb/run-20260204_040332-hwsb1mff/files/wandb-metadata.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-90-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.10",
4
+ "startedAt": "2026-02-04T04:03:32.123297Z",
5
+ "args": [
6
+ "/workspace/v127rc_exp1/B.yaml"
7
+ ],
8
+ "program": "/usr/local/bin/llamafactory-cli",
9
+ "git": {
10
+ "remote": "https://github.com/hiyouga/LlamaFactory.git",
11
+ "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
12
+ },
13
+ "email": "markmochi200@gmail.com",
14
+ "root": "/workspace/LlamaFactory",
15
+ "host": "34f54978776c",
16
+ "executable": "/usr/bin/python",
17
+ "cpu_count": 24,
18
+ "cpu_count_logical": 48,
19
+ "gpu": "NVIDIA GeForce RTX 4090",
20
+ "gpu_count": 1,
21
+ "disk": {
22
+ "/": {
23
+ "total": "21474836480",
24
+ "used": "1931460608"
25
+ }
26
+ },
27
+ "memory": {
28
+ "total": "405012275200"
29
+ },
30
+ "gpu_nvidia": [
31
+ {
32
+ "name": "NVIDIA GeForce RTX 4090",
33
+ "memoryTotal": "25757220864",
34
+ "cudaCores": 16384,
35
+ "architecture": "Ada",
36
+ "uuid": "GPU-acb5171c-45e7-5653-1120-9d0cd2a192a6"
37
+ }
38
+ ],
39
+ "cudaVersion": "12.8",
40
+ "writerId": "vighgaih8gdd38lqtuv2307y0stf4bym"
41
+ }
LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug-internal.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {"time":"2026-02-04T04:03:32.369728388Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
2
+ {"time":"2026-02-04T04:03:32.692853515Z","level":"INFO","msg":"stream: created new stream","id":"hwsb1mff"}
3
+ {"time":"2026-02-04T04:03:32.693536225Z","level":"INFO","msg":"handler: started","stream_id":"hwsb1mff"}
4
+ {"time":"2026-02-04T04:03:32.695103475Z","level":"INFO","msg":"stream: started","id":"hwsb1mff"}
5
+ {"time":"2026-02-04T04:03:32.695123335Z","level":"INFO","msg":"writer: started","stream_id":"hwsb1mff"}
6
+ {"time":"2026-02-04T04:03:32.695124927Z","level":"INFO","msg":"sender: started","stream_id":"hwsb1mff"}
LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug.log ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-02-04 04:03:32,144 INFO MainThread:7849 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
2
+ 2026-02-04 04:03:32,144 INFO MainThread:7849 [wandb_setup.py:_flush():81] Configure stats pid to 7849
3
+ 2026-02-04 04:03:32,144 INFO MainThread:7849 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-02-04 04:03:32,145 INFO MainThread:7849 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug.log
5
+ 2026-02-04 04:03:32,147 INFO MainThread:7849 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_040332-hwsb1mff/logs/debug-internal.log
6
+ 2026-02-04 04:03:32,147 INFO MainThread:7849 [wandb_init.py:init():844] calling init triggers
7
+ 2026-02-04 04:03:32,147 INFO MainThread:7849 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'_wandb': {}}
9
+ 2026-02-04 04:03:32,148 INFO MainThread:7849 [wandb_init.py:init():892] starting backend
10
+ 2026-02-04 04:03:32,362 INFO MainThread:7849 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-02-04 04:03:32,368 INFO MainThread:7849 [wandb_init.py:init():903] backend started and connected
12
+ 2026-02-04 04:03:32,369 INFO MainThread:7849 [wandb_init.py:init():973] updated telemetry
13
+ 2026-02-04 04:03:32,417 INFO MainThread:7849 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-02-04 04:03:33,108 INFO MainThread:7849 [wandb_init.py:init():1042] starting run threads in backend
15
+ 2026-02-04 04:03:33,181 INFO MainThread:7849 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-02-04 04:03:33,181 INFO MainThread:7849 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-02-04 04:03:33,182 INFO MainThread:7849 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-02-04 04:03:33,182 INFO MainThread:7849 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-02-04 04:03:33,184 INFO MainThread:7849 [wandb_init.py:init():1082] run started, returning control to user process
20
+ 2026-02-04 04:03:33,185 INFO MainThread:7849 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['gate_proj', 'down_proj', 'o_proj', 'up_proj', 'k_proj', 'q_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/B', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 585, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
21
+ 2026-02-04 04:03:33,192 INFO MainThread:7849 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x76ded82a5690>>
22
+ 2026-02-04 04:03:33,193 INFO MainThread:7849 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
23
+ 2026-02-04 04:03:33,195 INFO MainThread:7849 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t35_d0_r286'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/config.yaml ADDED
@@ -0,0 +1,723 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: /workspace/Qwen/Qwen3-8B-Base
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.24.1
6
+ e:
7
+ jy6in5azojamixlag12ky8yqk0a5luc8:
8
+ args:
9
+ - /workspace/v127rc_exp1/C.yaml
10
+ cpu_count: 16
11
+ cpu_count_logical: 32
12
+ cudaVersion: "13.0"
13
+ disk:
14
+ /:
15
+ total: "21474836480"
16
+ used: "1858318336"
17
+ email: markmochi200@gmail.com
18
+ executable: /usr/bin/python
19
+ git:
20
+ commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
21
+ remote: https://github.com/hiyouga/LlamaFactory.git
22
+ gpu: NVIDIA GeForce RTX 4090
23
+ gpu_count: 1
24
+ gpu_nvidia:
25
+ - architecture: Ada
26
+ cudaCores: 16384
27
+ memoryTotal: "25757220864"
28
+ name: NVIDIA GeForce RTX 4090
29
+ uuid: GPU-2ae1a495-e17f-23d9-e8ed-90585b3df9de
30
+ host: 47a53adf0198
31
+ memory:
32
+ total: "201701408768"
33
+ os: Linux-6.8.0-94-generic-x86_64-with-glibc2.35
34
+ program: /usr/local/bin/llamafactory-cli
35
+ python: CPython 3.11.10
36
+ root: /workspace/LlamaFactory
37
+ startedAt: "2026-02-04T04:05:44.037622Z"
38
+ writerId: jy6in5azojamixlag12ky8yqk0a5luc8
39
+ m:
40
+ - "1": train/global_step
41
+ "6":
42
+ - 3
43
+ "7": []
44
+ - "2": '*'
45
+ "5": 1
46
+ "6":
47
+ - 1
48
+ "7": []
49
+ python_version: 3.11.10
50
+ t:
51
+ "1":
52
+ - 1
53
+ - 11
54
+ - 41
55
+ - 49
56
+ - 51
57
+ - 71
58
+ - 84
59
+ - 98
60
+ - 105
61
+ "2":
62
+ - 1
63
+ - 11
64
+ - 41
65
+ - 49
66
+ - 51
67
+ - 71
68
+ - 84
69
+ - 98
70
+ - 105
71
+ "3":
72
+ - 7
73
+ - 19
74
+ - 62
75
+ - 66
76
+ "4": 3.11.10
77
+ "5": 0.24.1
78
+ "6": 5.0.0
79
+ "9":
80
+ "1": transformers_trainer
81
+ "12": 0.24.1
82
+ "13": linux-x86_64
83
+ accelerator_config:
84
+ value:
85
+ dispatch_batches: null
86
+ even_batches: true
87
+ gradient_accumulation_kwargs: null
88
+ non_blocking: false
89
+ split_batches: false
90
+ use_seedable_sampler: true
91
+ adam_beta1:
92
+ value: 0.9
93
+ adam_beta2:
94
+ value: 0.95
95
+ adam_epsilon:
96
+ value: 1e-08
97
+ architectures:
98
+ value:
99
+ - Qwen3ForCausalLM
100
+ attention_bias:
101
+ value: false
102
+ attention_dropout:
103
+ value: 0
104
+ auto_find_batch_size:
105
+ value: false
106
+ average_tokens_across_devices:
107
+ value: true
108
+ batch_eval_metrics:
109
+ value: false
110
+ bf16:
111
+ value: true
112
+ bf16_full_eval:
113
+ value: false
114
+ bos_token_id:
115
+ value: null
116
+ chunk_size_feed_forward:
117
+ value: 0
118
+ data_args:
119
+ value:
120
+ buffer_size: 16384
121
+ cutoff_len: 2047
122
+ data_shared_file_system: false
123
+ dataset:
124
+ - Markie_Voss_t0_d35_r286
125
+ dataset_dir: /workspace/LlamaFactory/data
126
+ default_system: null
127
+ enable_thinking: false
128
+ eval_dataset: null
129
+ eval_num_beams: null
130
+ eval_on_each_dataset: false
131
+ ignore_pad_token_for_loss: true
132
+ interleave_probs: null
133
+ mask_history: false
134
+ max_samples: 100000000
135
+ media_dir: /workspace/LlamaFactory/data
136
+ mix_strategy: concat
137
+ neat_packing: false
138
+ overwrite_cache: false
139
+ packing: true
140
+ preprocessing_batch_size: 1000
141
+ preprocessing_num_workers: 16
142
+ streaming: false
143
+ template: qwen3_nothink
144
+ tokenized_path: null
145
+ tool_format: null
146
+ train_on_prompt: false
147
+ val_size: 0
148
+ data_seed:
149
+ value: null
150
+ dataloader_drop_last:
151
+ value: false
152
+ dataloader_num_workers:
153
+ value: 0
154
+ dataloader_persistent_workers:
155
+ value: false
156
+ dataloader_pin_memory:
157
+ value: true
158
+ dataloader_prefetch_factor:
159
+ value: null
160
+ ddp_backend:
161
+ value: null
162
+ ddp_broadcast_buffers:
163
+ value: null
164
+ ddp_bucket_cap_mb:
165
+ value: null
166
+ ddp_find_unused_parameters:
167
+ value: null
168
+ ddp_timeout:
169
+ value: 180000000
170
+ debug:
171
+ value: []
172
+ deepspeed:
173
+ value: null
174
+ disable_tqdm:
175
+ value: false
176
+ do_eval:
177
+ value: false
178
+ do_predict:
179
+ value: false
180
+ do_train:
181
+ value: true
182
+ dtype:
183
+ value: bfloat16
184
+ enable_jit_checkpoint:
185
+ value: false
186
+ eos_token_id:
187
+ value: 151645
188
+ eval_accumulation_steps:
189
+ value: null
190
+ eval_delay:
191
+ value: 0
192
+ eval_do_concat_batches:
193
+ value: true
194
+ eval_on_start:
195
+ value: false
196
+ eval_steps:
197
+ value: null
198
+ eval_strategy:
199
+ value: "no"
200
+ eval_use_gather_object:
201
+ value: false
202
+ finetuning_args:
203
+ value:
204
+ additional_target: null
205
+ apollo_layerwise: false
206
+ apollo_proj: random
207
+ apollo_proj_type: std
208
+ apollo_rank: 16
209
+ apollo_scale: 32
210
+ apollo_scale_front: false
211
+ apollo_scale_type: channel
212
+ apollo_target:
213
+ - all
214
+ apollo_update_interval: 200
215
+ badam_mask_mode: adjacent
216
+ badam_mode: layer
217
+ badam_start_block: null
218
+ badam_switch_interval: 50
219
+ badam_switch_mode: ascending
220
+ badam_update_ratio: 0.05
221
+ badam_verbose: 0
222
+ compute_accuracy: false
223
+ create_new_adapter: false
224
+ disable_shuffling: false
225
+ dpo_label_smoothing: 0
226
+ eaft_alpha: 1
227
+ early_stopping_steps: null
228
+ finetuning_type: lora
229
+ freeze_extra_modules: null
230
+ freeze_language_model: false
231
+ freeze_multi_modal_projector: true
232
+ freeze_trainable_layers: 2
233
+ freeze_trainable_modules:
234
+ - all
235
+ freeze_vision_tower: true
236
+ galore_layerwise: false
237
+ galore_proj_type: std
238
+ galore_rank: 16
239
+ galore_scale: 2
240
+ galore_target:
241
+ - all
242
+ galore_update_interval: 200
243
+ include_effective_tokens_per_second: false
244
+ kto_chosen_weight: 1
245
+ kto_rejected_weight: 1
246
+ ld_alpha: null
247
+ lora_alpha: 32
248
+ lora_dropout: 0.03
249
+ lora_rank: 16
250
+ lora_target:
251
+ - all
252
+ loraplus_lr_embedding: 1e-06
253
+ loraplus_lr_ratio: null
254
+ module_dropout: 0
255
+ oft_block_size: 32
256
+ oft_rank: 0
257
+ oft_target:
258
+ - all
259
+ pissa_convert: false
260
+ pissa_init: false
261
+ pissa_iter: 16
262
+ plot_loss: true
263
+ ppo_buffer_size: 1
264
+ ppo_epochs: 4
265
+ ppo_score_norm: false
266
+ ppo_target: 6
267
+ ppo_whiten_rewards: false
268
+ pref_bco_weight: 0
269
+ pref_beta: 0.1
270
+ pref_ftx: 0
271
+ pref_loss: sigmoid
272
+ pure_bf16: false
273
+ ref_model: null
274
+ ref_model_adapters: null
275
+ ref_model_quantization_bit: null
276
+ reward_model: null
277
+ reward_model_adapters: null
278
+ reward_model_quantization_bit: null
279
+ reward_model_type: lora
280
+ simpo_gamma: 0.5
281
+ stage: pt
282
+ swanlab_api_key: <SWANLAB_API_KEY>
283
+ swanlab_lark_secret: null
284
+ swanlab_lark_webhook_url: null
285
+ swanlab_logdir: null
286
+ swanlab_mode: cloud
287
+ swanlab_project: llamafactory
288
+ swanlab_run_name: null
289
+ swanlab_workspace: null
290
+ use_adam_mini: false
291
+ use_apollo: false
292
+ use_badam: false
293
+ use_dft_loss: false
294
+ use_dora: false
295
+ use_eaft_loss: false
296
+ use_galore: false
297
+ use_llama_pro: false
298
+ use_mca: false
299
+ use_muon: false
300
+ use_rslora: false
301
+ use_swanlab: false
302
+ fp8:
303
+ value: false
304
+ fp8_backend:
305
+ value: auto
306
+ fp8_enable_fsdp_float8_all_gather:
307
+ value: false
308
+ fp16:
309
+ value: false
310
+ fp16_full_eval:
311
+ value: false
312
+ fsdp:
313
+ value: []
314
+ fsdp_config:
315
+ value:
316
+ min_num_params: 0
317
+ xla: false
318
+ xla_fsdp_grad_ckpt: false
319
+ xla_fsdp_v2: false
320
+ full_determinism:
321
+ value: false
322
+ generating_args:
323
+ value:
324
+ do_sample: true
325
+ length_penalty: 1
326
+ max_new_tokens: 1024
327
+ num_beams: 1
328
+ repetition_penalty: 1
329
+ skip_special_tokens: true
330
+ temperature: 0.95
331
+ top_k: 50
332
+ top_p: 0.7
333
+ generation_config:
334
+ value: null
335
+ generation_max_length:
336
+ value: 2047
337
+ generation_num_beams:
338
+ value: null
339
+ gradient_accumulation_steps:
340
+ value: 1
341
+ gradient_checkpointing:
342
+ value: false
343
+ gradient_checkpointing_kwargs:
344
+ value: null
345
+ greater_is_better:
346
+ value: null
347
+ group_by_length:
348
+ value: false
349
+ head_dim:
350
+ value: 128
351
+ hidden_act:
352
+ value: silu
353
+ hidden_size:
354
+ value: 4096
355
+ hub_always_push:
356
+ value: false
357
+ hub_model_id:
358
+ value: null
359
+ hub_private_repo:
360
+ value: null
361
+ hub_revision:
362
+ value: null
363
+ hub_strategy:
364
+ value: every_save
365
+ hub_token:
366
+ value: <HUB_TOKEN>
367
+ id2label:
368
+ value:
369
+ "0": LABEL_0
370
+ "1": LABEL_1
371
+ ignore_data_skip:
372
+ value: false
373
+ include_for_metrics:
374
+ value: []
375
+ include_num_input_tokens_seen:
376
+ value: all
377
+ initializer_range:
378
+ value: 0.02
379
+ intermediate_size:
380
+ value: 12288
381
+ is_encoder_decoder:
382
+ value: false
383
+ label_names:
384
+ value:
385
+ - labels
386
+ label_smoothing_factor:
387
+ value: 0
388
+ label2id:
389
+ value:
390
+ LABEL_0: 0
391
+ LABEL_1: 1
392
+ layer_types:
393
+ value:
394
+ - full_attention
395
+ - full_attention
396
+ - full_attention
397
+ - full_attention
398
+ - full_attention
399
+ - full_attention
400
+ - full_attention
401
+ - full_attention
402
+ - full_attention
403
+ - full_attention
404
+ - full_attention
405
+ - full_attention
406
+ - full_attention
407
+ - full_attention
408
+ - full_attention
409
+ - full_attention
410
+ - full_attention
411
+ - full_attention
412
+ - full_attention
413
+ - full_attention
414
+ - full_attention
415
+ - full_attention
416
+ - full_attention
417
+ - full_attention
418
+ - full_attention
419
+ - full_attention
420
+ - full_attention
421
+ - full_attention
422
+ - full_attention
423
+ - full_attention
424
+ - full_attention
425
+ - full_attention
426
+ - full_attention
427
+ - full_attention
428
+ - full_attention
429
+ - full_attention
430
+ learning_rate:
431
+ value: 5e-05
432
+ length_column_name:
433
+ value: length
434
+ liger_kernel_config:
435
+ value: null
436
+ load_best_model_at_end:
437
+ value: false
438
+ local_rank:
439
+ value: -1
440
+ log_level:
441
+ value: passive
442
+ log_level_replica:
443
+ value: warning
444
+ log_on_each_node:
445
+ value: true
446
+ logging_dir:
447
+ value: null
448
+ logging_first_step:
449
+ value: false
450
+ logging_nan_inf_filter:
451
+ value: true
452
+ logging_steps:
453
+ value: 1
454
+ logging_strategy:
455
+ value: steps
456
+ lr_scheduler_kwargs:
457
+ value: null
458
+ lr_scheduler_type:
459
+ value: cosine
460
+ master_addr:
461
+ value: null
462
+ master_port:
463
+ value: null
464
+ max_grad_norm:
465
+ value: 1
466
+ max_position_embeddings:
467
+ value: 32768
468
+ max_steps:
469
+ value: -1
470
+ max_window_layers:
471
+ value: 36
472
+ metric_for_best_model:
473
+ value: null
474
+ model/num_parameters:
475
+ value: 8234382336
476
+ model_args:
477
+ value:
478
+ adapter_folder: null
479
+ adapter_name_or_path: null
480
+ add_special_tokens: null
481
+ add_tokens: null
482
+ audio_sampling_rate: 16000
483
+ block_diag_attn: false
484
+ cache_dir: null
485
+ chunk_size: 8192
486
+ compute_dtype: torch.bfloat16
487
+ cpu_infer: 32
488
+ crop_to_patches: false
489
+ device_map:
490
+ "": cuda:0
491
+ disable_gradient_checkpointing: false
492
+ double_quantization: true
493
+ enable_liger_kernel: false
494
+ export_device: cpu
495
+ export_dir: null
496
+ export_hub_model_id: null
497
+ export_legacy_format: false
498
+ export_quantization_bit: null
499
+ export_quantization_dataset: null
500
+ export_quantization_maxlen: 1024
501
+ export_quantization_nsamples: 128
502
+ export_size: 5
503
+ flash_attn: auto
504
+ hf_hub_token: <HF_HUB_TOKEN>
505
+ image_do_pan_and_scan: false
506
+ image_max_pixels: 589824
507
+ image_min_pixels: 1024
508
+ infer_backend: HF
509
+ infer_dtype: auto
510
+ init_special_tokens: noise_init
511
+ kt_force_think: false
512
+ kt_maxlen: 4096
513
+ kt_mode: normal
514
+ kt_optimize_rule: null
515
+ kt_use_cuda_graph: true
516
+ low_cpu_mem_usage: true
517
+ mixture_of_depths: null
518
+ mode: normal
519
+ model_max_length: 2047
520
+ model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
521
+ model_revision: main
522
+ moe_aux_loss_coef: null
523
+ ms_hub_token: <MS_HUB_TOKEN>
524
+ new_special_tokens_config: null
525
+ offload_folder: offload
526
+ om_hub_token: <OM_HUB_TOKEN>
527
+ print_param_status: false
528
+ quantization_bit: null
529
+ quantization_device_map: null
530
+ quantization_method: BNB
531
+ quantization_type: nf4
532
+ resize_vocab: false
533
+ rope_scaling: null
534
+ sglang_config: null
535
+ sglang_lora_backend: triton
536
+ sglang_maxlen: 4096
537
+ sglang_mem_fraction: 0.7
538
+ sglang_tp_size: -1
539
+ shift_attn: false
540
+ split_special_tokens: false
541
+ train_from_scratch: false
542
+ trust_remote_code: true
543
+ upcast_layernorm: false
544
+ upcast_lmhead_output: false
545
+ use_audio_in_video: false
546
+ use_fast_tokenizer: true
547
+ use_kt: false
548
+ use_kv_cache: true
549
+ use_reentrant_gc: true
550
+ use_unsloth: false
551
+ use_unsloth_gc: false
552
+ use_v1_kernels: false
553
+ video_fps: 2
554
+ video_max_pixels: 65536
555
+ video_maxlen: 128
556
+ video_min_pixels: 256
557
+ vllm_config: null
558
+ vllm_enforce_eager: false
559
+ vllm_gpu_util: 0.7
560
+ vllm_max_lora_rank: 32
561
+ vllm_maxlen: 4096
562
+ model_type:
563
+ value: qwen3
564
+ neftune_noise_alpha:
565
+ value: null
566
+ num_attention_heads:
567
+ value: 32
568
+ num_hidden_layers:
569
+ value: 36
570
+ num_key_value_heads:
571
+ value: 8
572
+ num_train_epochs:
573
+ value: 5
574
+ optim:
575
+ value: adamw_torch
576
+ optim_args:
577
+ value: null
578
+ optim_target_modules:
579
+ value: null
580
+ output_attentions:
581
+ value: false
582
+ output_dir:
583
+ value: /workspace/v127rc_exp1/C
584
+ output_hidden_states:
585
+ value: false
586
+ overwrite_output_dir:
587
+ value: false
588
+ pad_token_id:
589
+ value: 151643
590
+ parallelism_config:
591
+ value: null
592
+ peft_config:
593
+ value:
594
+ default:
595
+ alora_invocation_tokens: null
596
+ arrow_config: null
597
+ auto_mapping: null
598
+ base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
599
+ bias: none
600
+ corda_config: null
601
+ ensure_weight_tying: false
602
+ eva_config: null
603
+ exclude_modules: null
604
+ fan_in_fan_out: false
605
+ inference_mode: false
606
+ init_lora_weights: true
607
+ layer_replication: null
608
+ layers_pattern: null
609
+ layers_to_transform: null
610
+ lora_alpha: 32
611
+ lora_bias: false
612
+ lora_dropout: 0.03
613
+ megatron_config: null
614
+ megatron_core: megatron.core
615
+ modules_to_save: null
616
+ peft_type: LORA
617
+ peft_version: 0.18.1
618
+ qalora_group_size: 16
619
+ r: 16
620
+ revision: null
621
+ runtime_config:
622
+ ephemeral_gpu_offload: false
623
+ target_modules:
624
+ - o_proj
625
+ - down_proj
626
+ - gate_proj
627
+ - v_proj
628
+ - k_proj
629
+ - q_proj
630
+ - up_proj
631
+ target_parameters: null
632
+ task_type: CAUSAL_LM
633
+ trainable_token_indices: null
634
+ use_dora: false
635
+ use_qalora: false
636
+ use_rslora: false
637
+ per_device_eval_batch_size:
638
+ value: 8
639
+ per_device_train_batch_size:
640
+ value: 1
641
+ predict_with_generate:
642
+ value: false
643
+ prediction_loss_only:
644
+ value: false
645
+ problem_type:
646
+ value: null
647
+ project:
648
+ value: huggingface
649
+ push_to_hub:
650
+ value: false
651
+ ray_init_kwargs:
652
+ value: null
653
+ ray_num_workers:
654
+ value: 1
655
+ remove_unused_columns:
656
+ value: false
657
+ report_to:
658
+ value:
659
+ - wandb
660
+ restore_callback_states_from_checkpoint:
661
+ value: false
662
+ resume_from_checkpoint:
663
+ value: null
664
+ return_dict:
665
+ value: true
666
+ rms_norm_eps:
667
+ value: 1e-06
668
+ rope_parameters:
669
+ value:
670
+ rope_theta: 1000000
671
+ rope_type: default
672
+ run_name:
673
+ value: null
674
+ save_on_each_node:
675
+ value: false
676
+ save_only_model:
677
+ value: true
678
+ save_steps:
679
+ value: 266
680
+ save_strategy:
681
+ value: steps
682
+ save_total_limit:
683
+ value: null
684
+ seed:
685
+ value: 42
686
+ skip_memory_metrics:
687
+ value: true
688
+ sliding_window:
689
+ value: null
690
+ sortish_sampler:
691
+ value: false
692
+ tf32:
693
+ value: null
694
+ tie_word_embeddings:
695
+ value: false
696
+ torch_compile:
697
+ value: false
698
+ torch_compile_backend:
699
+ value: null
700
+ torch_compile_mode:
701
+ value: null
702
+ torch_empty_cache_steps:
703
+ value: null
704
+ trackio_space_id:
705
+ value: trackio
706
+ transformers_version:
707
+ value: 5.0.0
708
+ use_cache:
709
+ value: false
710
+ use_cpu:
711
+ value: false
712
+ use_liger_kernel:
713
+ value: false
714
+ use_sliding_window:
715
+ value: false
716
+ vocab_size:
717
+ value: 151936
718
+ warmup_ratio:
719
+ value: 0.02
720
+ warmup_steps:
721
+ value: 0.02
722
+ weight_decay:
723
+ value: 0
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/output.log ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 0%| | 0/18595 [00:00<?, ?it/s]/usr/local/lib/python3.11/dist-packages/torch/utils/checkpoint.py:295: FutureWarning: `torch.cpu.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cpu', args...)` instead.
2
+ with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs): # type: ignore[attr-defined]
3
+
4
+ {'loss': '1.682', 'grad_norm': '0.2716', 'learning_rate': '0', 'epoch': '0.0002689', 'num_input_tokens_seen': 2047, 'train_runtime': '2.905', 'train_tokens_per_second': '704.7'}
5
+ {'loss': '1.8', 'grad_norm': '0.2904', 'learning_rate': '1.344e-07', 'epoch': '0.0005378', 'num_input_tokens_seen': 4094, 'train_runtime': '3.914', 'train_tokens_per_second': '1046'}
6
+ {'loss': '1.751', 'grad_norm': '0.2786', 'learning_rate': '2.688e-07', 'epoch': '0.0008067', 'num_input_tokens_seen': 6141, 'train_runtime': '4.925', 'train_tokens_per_second': '1247'}
7
+ {'loss': '1.725', 'grad_norm': '0.2775', 'learning_rate': '4.032e-07', 'epoch': '0.001076', 'num_input_tokens_seen': 8188, 'train_runtime': '5.934', 'train_tokens_per_second': '1380'}
8
+ {'loss': '1.857', 'grad_norm': '0.282', 'learning_rate': '5.376e-07', 'epoch': '0.001344', 'num_input_tokens_seen': 10235, 'train_runtime': '6.944', 'train_tokens_per_second': '1474'}
9
+ {'loss': '1.865', 'grad_norm': '0.2441', 'learning_rate': '6.72e-07', 'epoch': '0.001613', 'num_input_tokens_seen': 12282, 'train_runtime': '7.952', 'train_tokens_per_second': '1545'}
10
+ {'loss': '1.791', 'grad_norm': '0.2674', 'learning_rate': '8.065e-07', 'epoch': '0.001882', 'num_input_tokens_seen': 14329, 'train_runtime': '8.964', 'train_tokens_per_second': '1599'}
11
+ {'loss': '1.834', 'grad_norm': '0.2586', 'learning_rate': '9.409e-07', 'epoch': '0.002151', 'num_input_tokens_seen': 16376, 'train_runtime': '9.974', 'train_tokens_per_second': '1642'}
12
+ {'loss': '1.92', 'grad_norm': '0.2805', 'learning_rate': '1.075e-06', 'epoch': '0.00242', 'num_input_tokens_seen': 18423, 'train_runtime': '10.98', 'train_tokens_per_second': '1677'}
13
+ {'loss': '1.945', 'grad_norm': '0.2809', 'learning_rate': '1.21e-06', 'epoch': '0.002689', 'num_input_tokens_seen': 20470, 'train_runtime': '11.99', 'train_tokens_per_second': '1707'}
14
+ {'loss': '1.955', 'grad_norm': '0.2961', 'learning_rate': '1.344e-06', 'epoch': '0.002958', 'num_input_tokens_seen': 22517, 'train_runtime': '13.01', 'train_tokens_per_second': '1731'}
15
+ {'loss': '1.811', 'grad_norm': '0.2714', 'learning_rate': '1.478e-06', 'epoch': '0.003227', 'num_input_tokens_seen': 24564, 'train_runtime': '14.02', 'train_tokens_per_second': '1753'}
16
+ {'loss': '1.631', 'grad_norm': '0.2661', 'learning_rate': '1.613e-06', 'epoch': '0.003496', 'num_input_tokens_seen': 26611, 'train_runtime': '15.03', 'train_tokens_per_second': '1771'}
17
+ {'loss': '1.769', 'grad_norm': '0.268', 'learning_rate': '1.747e-06', 'epoch': '0.003764', 'num_input_tokens_seen': 28658, 'train_runtime': '16.04', 'train_tokens_per_second': '1787'}
18
+ {'loss': '1.611', 'grad_norm': '0.2518', 'learning_rate': '1.882e-06', 'epoch': '0.004033', 'num_input_tokens_seen': 30705, 'train_runtime': '17.05', 'train_tokens_per_second': '1801'}
19
+ {'loss': '1.624', 'grad_norm': '0.2597', 'learning_rate': '2.016e-06', 'epoch': '0.004302', 'num_input_tokens_seen': 32752, 'train_runtime': '18.06', 'train_tokens_per_second': '1814'}
20
+ {'loss': '1.854', 'grad_norm': '0.2804', 'learning_rate': '2.151e-06', 'epoch': '0.004571', 'num_input_tokens_seen': 34799, 'train_runtime': '19.07', 'train_tokens_per_second': '1825'}
21
+ {'loss': '1.849', 'grad_norm': '0.521', 'learning_rate': '2.285e-06', 'epoch': '0.00484', 'num_input_tokens_seen': 36846, 'train_runtime': '20.08', 'train_tokens_per_second': '1835'}
22
+ {'loss': '1.825', 'grad_norm': '0.2669', 'learning_rate': '2.419e-06', 'epoch': '0.005109', 'num_input_tokens_seen': 38893, 'train_runtime': '21.1', 'train_tokens_per_second': '1843'}
23
+ {'loss': '1.534', 'grad_norm': '0.2729', 'learning_rate': '2.554e-06', 'epoch': '0.005378', 'num_input_tokens_seen': 40940, 'train_runtime': '22.11', 'train_tokens_per_second': '1852'}
24
+ {'loss': '1.67', 'grad_norm': '0.2686', 'learning_rate': '2.688e-06', 'epoch': '0.005647', 'num_input_tokens_seen': 42987, 'train_runtime': '23.13', 'train_tokens_per_second': '1859'}
25
+ {'loss': '1.549', 'grad_norm': '0.2592', 'learning_rate': '2.823e-06', 'epoch': '0.005916', 'num_input_tokens_seen': 45034, 'train_runtime': '24.14', 'train_tokens_per_second': '1866'}
26
+ {'loss': '1.868', 'grad_norm': '0.2874', 'learning_rate': '2.957e-06', 'epoch': '0.006184', 'num_input_tokens_seen': 47081, 'train_runtime': '25.15', 'train_tokens_per_second': '1872'}
27
+ {'loss': '1.767', 'grad_norm': '0.2763', 'learning_rate': '3.091e-06', 'epoch': '0.006453', 'num_input_tokens_seen': 49128, 'train_runtime': '26.16', 'train_tokens_per_second': '1878'}
28
+ {'loss': '1.936', 'grad_norm': '0.2961', 'learning_rate': '3.226e-06', 'epoch': '0.006722', 'num_input_tokens_seen': 51175, 'train_runtime': '27.18', 'train_tokens_per_second': '1883'}
29
+ {'loss': '1.625', 'grad_norm': '0.2881', 'learning_rate': '3.36e-06', 'epoch': '0.006991', 'num_input_tokens_seen': 53222, 'train_runtime': '28.19', 'train_tokens_per_second': '1888'}
30
+ {'loss': '1.795', 'grad_norm': '0.3211', 'learning_rate': '3.495e-06', 'epoch': '0.00726', 'num_input_tokens_seen': 55269, 'train_runtime': '29.2', 'train_tokens_per_second': '1893'}
31
+ {'loss': '1.725', 'grad_norm': '0.2936', 'learning_rate': '3.629e-06', 'epoch': '0.007529', 'num_input_tokens_seen': 57316, 'train_runtime': '30.22', 'train_tokens_per_second': '1897'}
32
+ {'loss': '1.871', 'grad_norm': '0.2756', 'learning_rate': '3.763e-06', 'epoch': '0.007798', 'num_input_tokens_seen': 59363, 'train_runtime': '31.23', 'train_tokens_per_second': '1901'}
33
+ {'loss': '1.84', 'grad_norm': '0.2772', 'learning_rate': '3.898e-06', 'epoch': '0.008067', 'num_input_tokens_seen': 61410, 'train_runtime': '32.24', 'train_tokens_per_second': '1905'}
34
+ {'loss': '1.908', 'grad_norm': '0.3025', 'learning_rate': '4.032e-06', 'epoch': '0.008336', 'num_input_tokens_seen': 63457, 'train_runtime': '33.26', 'train_tokens_per_second': '1908'}
35
+ {'loss': '1.725', 'grad_norm': '0.2884', 'learning_rate': '4.167e-06', 'epoch': '0.008604', 'num_input_tokens_seen': 65504, 'train_runtime': '34.27', 'train_tokens_per_second': '1911'}
36
+ {'loss': '1.747', 'grad_norm': '0.3165', 'learning_rate': '4.301e-06', 'epoch': '0.008873', 'num_input_tokens_seen': 67551, 'train_runtime': '35.28', 'train_tokens_per_second': '1915'}
37
+ {'loss': '1.909', 'grad_norm': '0.2975', 'learning_rate': '4.435e-06', 'epoch': '0.009142', 'num_input_tokens_seen': 69598, 'train_runtime': '36.3', 'train_tokens_per_second': '1917'}
38
+ {'loss': '1.64', 'grad_norm': '0.2753', 'learning_rate': '4.57e-06', 'epoch': '0.009411', 'num_input_tokens_seen': 71645, 'train_runtime': '37.31', 'train_tokens_per_second': '1920'}
39
+ {'loss': '1.781', 'grad_norm': '0.2986', 'learning_rate': '4.704e-06', 'epoch': '0.00968', 'num_input_tokens_seen': 73692, 'train_runtime': '38.33', 'train_tokens_per_second': '1923'}
40
+ {'loss': '1.831', 'grad_norm': '0.3018', 'learning_rate': '4.839e-06', 'epoch': '0.009949', 'num_input_tokens_seen': 75739, 'train_runtime': '39.38', 'train_tokens_per_second': '1923'}
41
+ {'loss': '1.859', 'grad_norm': '0.2658', 'learning_rate': '4.973e-06', 'epoch': '0.01022', 'num_input_tokens_seen': 77786, 'train_runtime': '40.4', 'train_tokens_per_second': '1925'}
42
+ {'loss': '1.964', 'grad_norm': '0.297', 'learning_rate': '5.108e-06', 'epoch': '0.01049', 'num_input_tokens_seen': 79833, 'train_runtime': '41.41', 'train_tokens_per_second': '1928'}
43
+ {'loss': '1.935', 'grad_norm': '0.3385', 'learning_rate': '5.242e-06', 'epoch': '0.01076', 'num_input_tokens_seen': 81880, 'train_runtime': '42.43', 'train_tokens_per_second': '1930'}
44
+ {'loss': '1.726', 'grad_norm': '0.3095', 'learning_rate': '5.376e-06', 'epoch': '0.01102', 'num_input_tokens_seen': 83927, 'train_runtime': '43.44', 'train_tokens_per_second': '1932'}
45
+ {'loss': '1.533', 'grad_norm': '0.2799', 'learning_rate': '5.511e-06', 'epoch': '0.01129', 'num_input_tokens_seen': 85974, 'train_runtime': '44.45', 'train_tokens_per_second': '1934'}
46
+ {'loss': '1.762', 'grad_norm': '0.2744', 'learning_rate': '5.645e-06', 'epoch': '0.01156', 'num_input_tokens_seen': 88021, 'train_runtime': '45.47', 'train_tokens_per_second': '1936'}
47
+ {'loss': '1.697', 'grad_norm': '0.2797', 'learning_rate': '5.78e-06', 'epoch': '0.01183', 'num_input_tokens_seen': 90068, 'train_runtime': '46.48', 'train_tokens_per_second': '1938'}
48
+ {'loss': '1.725', 'grad_norm': '0.2793', 'learning_rate': '5.914e-06', 'epoch': '0.0121', 'num_input_tokens_seen': 92115, 'train_runtime': '47.5', 'train_tokens_per_second': '1939'}
49
+ {'loss': '1.981', 'grad_norm': '0.3054', 'learning_rate': '6.048e-06', 'epoch': '0.01237', 'num_input_tokens_seen': 94162, 'train_runtime': '48.51', 'train_tokens_per_second': '1941'}
50
+ {'loss': '1.591', 'grad_norm': '0.2925', 'learning_rate': '6.183e-06', 'epoch': '0.01264', 'num_input_tokens_seen': 96209, 'train_runtime': '49.53', 'train_tokens_per_second': '1943'}
51
+ {'loss': '1.777', 'grad_norm': '0.339', 'learning_rate': '6.317e-06', 'epoch': '0.01291', 'num_input_tokens_seen': 98256, 'train_runtime': '50.54', 'train_tokens_per_second': '1944'}
52
+ {'loss': '1.856', 'grad_norm': '0.2972', 'learning_rate': '6.452e-06', 'epoch': '0.01318', 'num_input_tokens_seen': 100303, 'train_runtime': '51.55', 'train_tokens_per_second': '1946'}
53
+ {'loss': '1.637', 'grad_norm': '0.3191', 'learning_rate': '6.586e-06', 'epoch': '0.01344', 'num_input_tokens_seen': 102350, 'train_runtime': '52.57', 'train_tokens_per_second': '1947'}
54
+ {'loss': '1.885', 'grad_norm': '0.3083', 'learning_rate': '6.72e-06', 'epoch': '0.01371', 'num_input_tokens_seen': 104397, 'train_runtime': '53.58', 'train_tokens_per_second': '1948'}
55
+ {'loss': '1.777', 'grad_norm': '0.3115', 'learning_rate': '6.855e-06', 'epoch': '0.01398', 'num_input_tokens_seen': 106444, 'train_runtime': '54.59', 'train_tokens_per_second': '1950'}
56
+ {'loss': '1.848', 'grad_norm': '0.3558', 'learning_rate': '6.989e-06', 'epoch': '0.01425', 'num_input_tokens_seen': 108491, 'train_runtime': '55.61', 'train_tokens_per_second': '1951'}
57
+ {'loss': '1.613', 'grad_norm': '0.3172', 'learning_rate': '7.124e-06', 'epoch': '0.01452', 'num_input_tokens_seen': 110538, 'train_runtime': '56.63', 'train_tokens_per_second': '1952'}
58
+ {'loss': '1.642', 'grad_norm': '0.2996', 'learning_rate': '7.258e-06', 'epoch': '0.01479', 'num_input_tokens_seen': 112585, 'train_runtime': '57.64', 'train_tokens_per_second': '1953'}
59
+ {'loss': '1.979', 'grad_norm': '0.331', 'learning_rate': '7.392e-06', 'epoch': '0.01506', 'num_input_tokens_seen': 114632, 'train_runtime': '58.66', 'train_tokens_per_second': '1954'}
60
+ {'loss': '1.473', 'grad_norm': '0.305', 'learning_rate': '7.527e-06', 'epoch': '0.01533', 'num_input_tokens_seen': 116679, 'train_runtime': '59.67', 'train_tokens_per_second': '1955'}
61
+ {'loss': '1.56', 'grad_norm': '0.2983', 'learning_rate': '7.661e-06', 'epoch': '0.0156', 'num_input_tokens_seen': 118726, 'train_runtime': '60.69', 'train_tokens_per_second': '1956'}
62
+ {'loss': '1.792', 'grad_norm': '0.3465', 'learning_rate': '7.796e-06', 'epoch': '0.01586', 'num_input_tokens_seen': 120773, 'train_runtime': '61.71', 'train_tokens_per_second': '1957'}
63
+ {'loss': '1.589', 'grad_norm': '0.3406', 'learning_rate': '7.93e-06', 'epoch': '0.01613', 'num_input_tokens_seen': 122820, 'train_runtime': '62.73', 'train_tokens_per_second': '1958'}
64
+ {'loss': '1.715', 'grad_norm': '0.3038', 'learning_rate': '8.065e-06', 'epoch': '0.0164', 'num_input_tokens_seen': 124867, 'train_runtime': '63.74', 'train_tokens_per_second': '1959'}
65
+ {'loss': '1.703', 'grad_norm': '0.3439', 'learning_rate': '8.199e-06', 'epoch': '0.01667', 'num_input_tokens_seen': 126914, 'train_runtime': '64.76', 'train_tokens_per_second': '1960'}
66
+ {'loss': '1.909', 'grad_norm': '0.363', 'learning_rate': '8.333e-06', 'epoch': '0.01694', 'num_input_tokens_seen': 128961, 'train_runtime': '65.77', 'train_tokens_per_second': '1961'}
67
+ {'loss': '1.798', 'grad_norm': '0.3657', 'learning_rate': '8.468e-06', 'epoch': '0.01721', 'num_input_tokens_seen': 131008, 'train_runtime': '66.79', 'train_tokens_per_second': '1961'}
68
+ {'loss': '1.853', 'grad_norm': '0.3834', 'learning_rate': '8.602e-06', 'epoch': '0.01748', 'num_input_tokens_seen': 133055, 'train_runtime': '67.81', 'train_tokens_per_second': '1962'}
69
+ {'loss': '1.806', 'grad_norm': '0.7619', 'learning_rate': '8.737e-06', 'epoch': '0.01775', 'num_input_tokens_seen': 135102, 'train_runtime': '68.83', 'train_tokens_per_second': '1963'}
70
+ {'loss': '1.435', 'grad_norm': '0.3309', 'learning_rate': '8.871e-06', 'epoch': '0.01802', 'num_input_tokens_seen': 137149, 'train_runtime': '69.84', 'train_tokens_per_second': '1964'}
71
+ {'loss': '1.746', 'grad_norm': '0.3073', 'learning_rate': '9.005e-06', 'epoch': '0.01828', 'num_input_tokens_seen': 139196, 'train_runtime': '70.86', 'train_tokens_per_second': '1965'}
72
+ {'loss': '1.822', 'grad_norm': '0.354', 'learning_rate': '9.14e-06', 'epoch': '0.01855', 'num_input_tokens_seen': 141243, 'train_runtime': '71.87', 'train_tokens_per_second': '1965'}
73
+ {'loss': '1.661', 'grad_norm': '0.3499', 'learning_rate': '9.274e-06', 'epoch': '0.01882', 'num_input_tokens_seen': 143290, 'train_runtime': '72.89', 'train_tokens_per_second': '1966'}
74
+ {'loss': '1.913', 'grad_norm': '0.3419', 'learning_rate': '9.409e-06', 'epoch': '0.01909', 'num_input_tokens_seen': 145337, 'train_runtime': '73.9', 'train_tokens_per_second': '1967'}
75
+ {'loss': '1.815', 'grad_norm': '0.4037', 'learning_rate': '9.543e-06', 'epoch': '0.01936', 'num_input_tokens_seen': 147384, 'train_runtime': '74.93', 'train_tokens_per_second': '1967'}
76
+ {'loss': '1.798', 'grad_norm': '0.3734', 'learning_rate': '9.677e-06', 'epoch': '0.01963', 'num_input_tokens_seen': 149431, 'train_runtime': '75.94', 'train_tokens_per_second': '1968'}
77
+ {'loss': '1.703', 'grad_norm': '0.3758', 'learning_rate': '9.812e-06', 'epoch': '0.0199', 'num_input_tokens_seen': 151478, 'train_runtime': '76.96', 'train_tokens_per_second': '1968'}
78
+ {'loss': '1.579', 'grad_norm': '0.3325', 'learning_rate': '9.946e-06', 'epoch': '0.02017', 'num_input_tokens_seen': 153525, 'train_runtime': '77.98', 'train_tokens_per_second': '1969'}
79
+ {'loss': '1.712', 'grad_norm': '0.3724', 'learning_rate': '1.008e-05', 'epoch': '0.02044', 'num_input_tokens_seen': 155572, 'train_runtime': '78.99', 'train_tokens_per_second': '1969'}
80
+ {'loss': '1.761', 'grad_norm': '0.3466', 'learning_rate': '1.022e-05', 'epoch': '0.0207', 'num_input_tokens_seen': 157619, 'train_runtime': '80.01', 'train_tokens_per_second': '1970'}
81
+ {'loss': '1.85', 'grad_norm': '0.3739', 'learning_rate': '1.035e-05', 'epoch': '0.02097', 'num_input_tokens_seen': 159666, 'train_runtime': '81.03', 'train_tokens_per_second': '1971'}
82
+ {'loss': '1.769', 'grad_norm': '0.3774', 'learning_rate': '1.048e-05', 'epoch': '0.02124', 'num_input_tokens_seen': 161713, 'train_runtime': '82.04', 'train_tokens_per_second': '1971'}
83
+ {'loss': '1.591', 'grad_norm': '0.3267', 'learning_rate': '1.062e-05', 'epoch': '0.02151', 'num_input_tokens_seen': 163760, 'train_runtime': '83.06', 'train_tokens_per_second': '1972'}
84
+ {'loss': '1.682', 'grad_norm': '0.3958', 'learning_rate': '1.075e-05', 'epoch': '0.02178', 'num_input_tokens_seen': 165807, 'train_runtime': '84.07', 'train_tokens_per_second': '1972'}
85
+ {'loss': '1.415', 'grad_norm': '0.3386', 'learning_rate': '1.089e-05', 'epoch': '0.02205', 'num_input_tokens_seen': 167854, 'train_runtime': '85.09', 'train_tokens_per_second': '1973'}
86
+ {'loss': '1.275', 'grad_norm': '0.3369', 'learning_rate': '1.102e-05', 'epoch': '0.02232', 'num_input_tokens_seen': 169901, 'train_runtime': '86.11', 'train_tokens_per_second': '1973'}
87
+ {'loss': '1.799', 'grad_norm': '0.4252', 'learning_rate': '1.116e-05', 'epoch': '0.02259', 'num_input_tokens_seen': 171948, 'train_runtime': '87.13', 'train_tokens_per_second': '1974'}
88
+ {'loss': '1.631', 'grad_norm': '0.3741', 'learning_rate': '1.129e-05', 'epoch': '0.02286', 'num_input_tokens_seen': 173995, 'train_runtime': '88.14', 'train_tokens_per_second': '1974'}
89
+ {'loss': '1.696', 'grad_norm': '0.3964', 'learning_rate': '1.142e-05', 'epoch': '0.02312', 'num_input_tokens_seen': 176042, 'train_runtime': '89.16', 'train_tokens_per_second': '1974'}
90
+ {'loss': '1.811', 'grad_norm': '0.3835', 'learning_rate': '1.156e-05', 'epoch': '0.02339', 'num_input_tokens_seen': 178089, 'train_runtime': '90.17', 'train_tokens_per_second': '1975'}
91
+ {'loss': '1.628', 'grad_norm': '0.3732', 'learning_rate': '1.169e-05', 'epoch': '0.02366', 'num_input_tokens_seen': 180136, 'train_runtime': '91.19', 'train_tokens_per_second': '1975'}
92
+ {'loss': '1.772', 'grad_norm': '0.3954', 'learning_rate': '1.183e-05', 'epoch': '0.02393', 'num_input_tokens_seen': 182183, 'train_runtime': '92.21', 'train_tokens_per_second': '1976'}
93
+ {'loss': '1.709', 'grad_norm': '0.4323', 'learning_rate': '1.196e-05', 'epoch': '0.0242', 'num_input_tokens_seen': 184230, 'train_runtime': '93.23', 'train_tokens_per_second': '1976'}
94
+ {'loss': '1.63', 'grad_norm': '0.3912', 'learning_rate': '1.21e-05', 'epoch': '0.02447', 'num_input_tokens_seen': 186277, 'train_runtime': '94.24', 'train_tokens_per_second': '1977'}
95
+ {'loss': '1.688', 'grad_norm': '0.4078', 'learning_rate': '1.223e-05', 'epoch': '0.02474', 'num_input_tokens_seen': 188324, 'train_runtime': '95.26', 'train_tokens_per_second': '1977'}
96
+ {'loss': '1.883', 'grad_norm': '0.4385', 'learning_rate': '1.237e-05', 'epoch': '0.02501', 'num_input_tokens_seen': 190371, 'train_runtime': '96.28', 'train_tokens_per_second': '1977'}
97
+ {'loss': '1.763', 'grad_norm': '0.4172', 'learning_rate': '1.25e-05', 'epoch': '0.02528', 'num_input_tokens_seen': 192418, 'train_runtime': '97.29', 'train_tokens_per_second': '1978'}
98
+ {'loss': '1.675', 'grad_norm': '0.4223', 'learning_rate': '1.263e-05', 'epoch': '0.02554', 'num_input_tokens_seen': 194465, 'train_runtime': '98.31', 'train_tokens_per_second': '1978'}
99
+ {'loss': '1.747', 'grad_norm': '0.4324', 'learning_rate': '1.277e-05', 'epoch': '0.02581', 'num_input_tokens_seen': 196512, 'train_runtime': '99.33', 'train_tokens_per_second': '1978'}
100
+ {'loss': '1.792', 'grad_norm': '0.4544', 'learning_rate': '1.29e-05', 'epoch': '0.02608', 'num_input_tokens_seen': 198559, 'train_runtime': '100.3', 'train_tokens_per_second': '1979'}
101
+ {'loss': '1.596', 'grad_norm': '0.4222', 'learning_rate': '1.304e-05', 'epoch': '0.02635', 'num_input_tokens_seen': 200606, 'train_runtime': '101.4', 'train_tokens_per_second': '1979'}
102
+ {'loss': '1.533', 'grad_norm': '0.4118', 'learning_rate': '1.317e-05', 'epoch': '0.02662', 'num_input_tokens_seen': 202653, 'train_runtime': '102.4', 'train_tokens_per_second': '1979'}
103
+ {'loss': '1.608', 'grad_norm': '0.4393', 'learning_rate': '1.331e-05', 'epoch': '0.02689', 'num_input_tokens_seen': 204700, 'train_runtime': '103.4', 'train_tokens_per_second': '1980'}
104
+ {'loss': '1.307', 'grad_norm': '0.3855', 'learning_rate': '1.344e-05', 'epoch': '0.02716', 'num_input_tokens_seen': 206747, 'train_runtime': '104.4', 'train_tokens_per_second': '1980'}
105
+ {'loss': '1.775', 'grad_norm': '0.4397', 'learning_rate': '1.358e-05', 'epoch': '0.02743', 'num_input_tokens_seen': 208794, 'train_runtime': '105.4', 'train_tokens_per_second': '1980'}
106
+ {'loss': '1.165', 'grad_norm': '0.4129', 'learning_rate': '1.371e-05', 'epoch': '0.0277', 'num_input_tokens_seen': 210841, 'train_runtime': '106.4', 'train_tokens_per_second': '1981'}
107
+ {'loss': '1.774', 'grad_norm': '0.4688', 'learning_rate': '1.384e-05', 'epoch': '0.02796', 'num_input_tokens_seen': 212888, 'train_runtime': '107.5', 'train_tokens_per_second': '1981'}
108
+ {'loss': '1.548', 'grad_norm': '0.409', 'learning_rate': '1.398e-05', 'epoch': '0.02823', 'num_input_tokens_seen': 214935, 'train_runtime': '108.5', 'train_tokens_per_second': '1981'}
109
+ {'loss': '1.662', 'grad_norm': '0.4561', 'learning_rate': '1.411e-05', 'epoch': '0.0285', 'num_input_tokens_seen': 216982, 'train_runtime': '109.5', 'train_tokens_per_second': '1982'}
110
+ {'loss': '1.709', 'grad_norm': '0.5552', 'learning_rate': '1.425e-05', 'epoch': '0.02877', 'num_input_tokens_seen': 219029, 'train_runtime': '110.5', 'train_tokens_per_second': '1982'}
111
+ {'loss': '1.681', 'grad_norm': '0.4587', 'learning_rate': '1.438e-05', 'epoch': '0.02904', 'num_input_tokens_seen': 221076, 'train_runtime': '111.5', 'train_tokens_per_second': '1982'}
112
+ {'loss': '1.787', 'grad_norm': '0.4875', 'learning_rate': '1.452e-05', 'epoch': '0.02931', 'num_input_tokens_seen': 223123, 'train_runtime': '112.6', 'train_tokens_per_second': '1982'}
113
+ {'loss': '1.593', 'grad_norm': '0.4741', 'learning_rate': '1.465e-05', 'epoch': '0.02958', 'num_input_tokens_seen': 225170, 'train_runtime': '113.6', 'train_tokens_per_second': '1982'}
114
+ {'loss': '1.143', 'grad_norm': '0.4104', 'learning_rate': '1.478e-05', 'epoch': '0.02985', 'num_input_tokens_seen': 227217, 'train_runtime': '114.6', 'train_tokens_per_second': '1983'}
115
+ {'loss': '1.633', 'grad_norm': '0.4514', 'learning_rate': '1.492e-05', 'epoch': '0.03012', 'num_input_tokens_seen': 229264, 'train_runtime': '115.6', 'train_tokens_per_second': '1983'}
116
+ {'loss': '1.576', 'grad_norm': '0.4584', 'learning_rate': '1.505e-05', 'epoch': '0.03038', 'num_input_tokens_seen': 231311, 'train_runtime': '116.7', 'train_tokens_per_second': '1983'}
117
+ {'loss': '1.704', 'grad_norm': '0.4646', 'learning_rate': '1.519e-05', 'epoch': '0.03065', 'num_input_tokens_seen': 233358, 'train_runtime': '117.7', 'train_tokens_per_second': '1983'}
118
+ {'loss': '1.651', 'grad_norm': '0.4925', 'learning_rate': '1.532e-05', 'epoch': '0.03092', 'num_input_tokens_seen': 235405, 'train_runtime': '118.7', 'train_tokens_per_second': '1983'}
119
+ {'loss': '1.614', 'grad_norm': '0.4438', 'learning_rate': '1.546e-05', 'epoch': '0.03119', 'num_input_tokens_seen': 237452, 'train_runtime': '119.7', 'train_tokens_per_second': '1984'}
120
+ {'loss': '1.158', 'grad_norm': '0.4493', 'learning_rate': '1.559e-05', 'epoch': '0.03146', 'num_input_tokens_seen': 239499, 'train_runtime': '120.7', 'train_tokens_per_second': '1984'}
121
+ {'loss': '1.604', 'grad_norm': '0.545', 'learning_rate': '1.573e-05', 'epoch': '0.03173', 'num_input_tokens_seen': 241546, 'train_runtime': '121.7', 'train_tokens_per_second': '1984'}
122
+ {'loss': '1.744', 'grad_norm': '0.5362', 'learning_rate': '1.586e-05', 'epoch': '0.032', 'num_input_tokens_seen': 243593, 'train_runtime': '122.8', 'train_tokens_per_second': '1984'}
123
+ {'loss': '1.525', 'grad_norm': '0.5284', 'learning_rate': '1.599e-05', 'epoch': '0.03227', 'num_input_tokens_seen': 245640, 'train_runtime': '123.8', 'train_tokens_per_second': '1985'}
124
+ {'loss': '1.521', 'grad_norm': '0.5212', 'learning_rate': '1.613e-05', 'epoch': '0.03254', 'num_input_tokens_seen': 247687, 'train_runtime': '124.8', 'train_tokens_per_second': '1985'}
125
+ {'loss': '1.561', 'grad_norm': '0.5265', 'learning_rate': '1.626e-05', 'epoch': '0.0328', 'num_input_tokens_seen': 249734, 'train_runtime': '125.8', 'train_tokens_per_second': '1985'}
126
+ {'loss': '1.634', 'grad_norm': '0.5029', 'learning_rate': '1.64e-05', 'epoch': '0.03307', 'num_input_tokens_seen': 251781, 'train_runtime': '126.8', 'train_tokens_per_second': '1985'}
127
+ {'loss': '1.475', 'grad_norm': '1.579', 'learning_rate': '1.653e-05', 'epoch': '0.03334', 'num_input_tokens_seen': 253828, 'train_runtime': '127.8', 'train_tokens_per_second': '1985'}
128
+ {'loss': '1.53', 'grad_norm': '0.541', 'learning_rate': '1.667e-05', 'epoch': '0.03361', 'num_input_tokens_seen': 255875, 'train_runtime': '128.9', 'train_tokens_per_second': '1986'}
129
+ {'loss': '1.484', 'grad_norm': '0.5354', 'learning_rate': '1.68e-05', 'epoch': '0.03388', 'num_input_tokens_seen': 257922, 'train_runtime': '129.9', 'train_tokens_per_second': '1986'}
130
+ {'loss': '1.496', 'grad_norm': '0.6181', 'learning_rate': '1.694e-05', 'epoch': '0.03415', 'num_input_tokens_seen': 259969, 'train_runtime': '130.9', 'train_tokens_per_second': '1986'}
131
+ {'loss': '1.393', 'grad_norm': '0.5379', 'learning_rate': '1.707e-05', 'epoch': '0.03442', 'num_input_tokens_seen': 262016, 'train_runtime': '131.9', 'train_tokens_per_second': '1986'}
132
+ {'loss': '1.658', 'grad_norm': '0.599', 'learning_rate': '1.72e-05', 'epoch': '0.03469', 'num_input_tokens_seen': 264063, 'train_runtime': '132.9', 'train_tokens_per_second': '1986'}
133
+ {'loss': '1.735', 'grad_norm': '0.6024', 'learning_rate': '1.734e-05', 'epoch': '0.03496', 'num_input_tokens_seen': 266110, 'train_runtime': '134', 'train_tokens_per_second': '1987'}
134
+ {'loss': '1.582', 'grad_norm': '0.5961', 'learning_rate': '1.747e-05', 'epoch': '0.03522', 'num_input_tokens_seen': 268157, 'train_runtime': '135', 'train_tokens_per_second': '1987'}
135
+ {'loss': '1.432', 'grad_norm': '0.4836', 'learning_rate': '1.761e-05', 'epoch': '0.03549', 'num_input_tokens_seen': 270204, 'train_runtime': '136', 'train_tokens_per_second': '1987'}
136
+ {'loss': '1.463', 'grad_norm': '0.5285', 'learning_rate': '1.774e-05', 'epoch': '0.03576', 'num_input_tokens_seen': 272251, 'train_runtime': '137', 'train_tokens_per_second': '1987'}
137
+ {'loss': '1.529', 'grad_norm': '0.6326', 'learning_rate': '1.788e-05', 'epoch': '0.03603', 'num_input_tokens_seen': 274298, 'train_runtime': '138', 'train_tokens_per_second': '1987'}
138
+ {'loss': '1.533', 'grad_norm': '0.6052', 'learning_rate': '1.801e-05', 'epoch': '0.0363', 'num_input_tokens_seen': 276345, 'train_runtime': '139', 'train_tokens_per_second': '1987'}
139
+ {'loss': '1.655', 'grad_norm': '0.5771', 'learning_rate': '1.815e-05', 'epoch': '0.03657', 'num_input_tokens_seen': 278392, 'train_runtime': '140.1', 'train_tokens_per_second': '1988'}
140
+ {'loss': '1.518', 'grad_norm': '0.6251', 'learning_rate': '1.828e-05', 'epoch': '0.03684', 'num_input_tokens_seen': 280439, 'train_runtime': '141.1', 'train_tokens_per_second': '1988'}
141
+ {'loss': '1.387', 'grad_norm': '0.5392', 'learning_rate': '1.841e-05', 'epoch': '0.03711', 'num_input_tokens_seen': 282486, 'train_runtime': '142.1', 'train_tokens_per_second': '1988'}
142
+ {'loss': '1.677', 'grad_norm': '2.701', 'learning_rate': '1.855e-05', 'epoch': '0.03738', 'num_input_tokens_seen': 284533, 'train_runtime': '143.1', 'train_tokens_per_second': '1988'}
143
+ {'loss': '1.466', 'grad_norm': '0.5754', 'learning_rate': '1.868e-05', 'epoch': '0.03764', 'num_input_tokens_seen': 286580, 'train_runtime': '144.1', 'train_tokens_per_second': '1988'}
144
+ {'loss': '1.461', 'grad_norm': '0.5828', 'learning_rate': '1.882e-05', 'epoch': '0.03791', 'num_input_tokens_seen': 288627, 'train_runtime': '145.1', 'train_tokens_per_second': '1988'}
145
+ {'loss': '1.585', 'grad_norm': '0.6422', 'learning_rate': '1.895e-05', 'epoch': '0.03818', 'num_input_tokens_seen': 290674, 'train_runtime': '146.2', 'train_tokens_per_second': '1989'}
146
+ {'loss': '1.33', 'grad_norm': '0.569', 'learning_rate': '1.909e-05', 'epoch': '0.03845', 'num_input_tokens_seen': 292721, 'train_runtime': '147.2', 'train_tokens_per_second': '1989'}
147
+ {'loss': '1.607', 'grad_norm': '0.632', 'learning_rate': '1.922e-05', 'epoch': '0.03872', 'num_input_tokens_seen': 294768, 'train_runtime': '148.2', 'train_tokens_per_second': '1989'}
148
+ {'loss': '1.382', 'grad_norm': '0.5767', 'learning_rate': '1.935e-05', 'epoch': '0.03899', 'num_input_tokens_seen': 296815, 'train_runtime': '149.2', 'train_tokens_per_second': '1989'}
149
+ {'loss': '1.412', 'grad_norm': '0.6597', 'learning_rate': '1.949e-05', 'epoch': '0.03926', 'num_input_tokens_seen': 298862, 'train_runtime': '150.2', 'train_tokens_per_second': '1989'}
150
+ {'loss': '1.238', 'grad_norm': '0.5835', 'learning_rate': '1.962e-05', 'epoch': '0.03953', 'num_input_tokens_seen': 300909, 'train_runtime': '151.3', 'train_tokens_per_second': '1989'}
151
+ {'loss': '1.586', 'grad_norm': '0.6251', 'learning_rate': '1.976e-05', 'epoch': '0.0398', 'num_input_tokens_seen': 302956, 'train_runtime': '152.3', 'train_tokens_per_second': '1989'}
152
+ {'loss': '1.396', 'grad_norm': '0.629', 'learning_rate': '1.989e-05', 'epoch': '0.04006', 'num_input_tokens_seen': 305003, 'train_runtime': '153.3', 'train_tokens_per_second': '1990'}
153
+ {'loss': '1.484', 'grad_norm': '0.7154', 'learning_rate': '2.003e-05', 'epoch': '0.04033', 'num_input_tokens_seen': 307050, 'train_runtime': '154.3', 'train_tokens_per_second': '1990'}
154
+ {'loss': '1.553', 'grad_norm': '0.7419', 'learning_rate': '2.016e-05', 'epoch': '0.0406', 'num_input_tokens_seen': 309097, 'train_runtime': '155.3', 'train_tokens_per_second': '1990'}
155
+ {'loss': '1.573', 'grad_norm': '0.7395', 'learning_rate': '2.03e-05', 'epoch': '0.04087', 'num_input_tokens_seen': 311144, 'train_runtime': '156.4', 'train_tokens_per_second': '1990'}
156
+ {'loss': '1.284', 'grad_norm': '0.5886', 'learning_rate': '2.043e-05', 'epoch': '0.04114', 'num_input_tokens_seen': 313191, 'train_runtime': '157.4', 'train_tokens_per_second': '1990'}
157
+ {'loss': '1.444', 'grad_norm': '0.7212', 'learning_rate': '2.056e-05', 'epoch': '0.04141', 'num_input_tokens_seen': 315238, 'train_runtime': '158.4', 'train_tokens_per_second': '1990'}
158
+ {'loss': '1.456', 'grad_norm': '0.6589', 'learning_rate': '2.07e-05', 'epoch': '0.04168', 'num_input_tokens_seen': 317285, 'train_runtime': '159.4', 'train_tokens_per_second': '1990'}
159
+ {'loss': '1.469', 'grad_norm': '0.7179', 'learning_rate': '2.083e-05', 'epoch': '0.04195', 'num_input_tokens_seen': 319332, 'train_runtime': '160.4', 'train_tokens_per_second': '1991'}
160
+ File "/usr/local/bin/llamafactory-cli", line 8, in <module>
161
+ sys.exit(main())
162
+ ^^^^^^
163
+ File "/workspace/LlamaFactory/src/llamafactory/cli.py", line 24, in main
164
+ launcher.launch()
165
+ File "/workspace/LlamaFactory/src/llamafactory/launcher.py", line 157, in launch
166
+ run_exp()
167
+ File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 125, in run_exp
168
+ _training_function(config={"args": args, "callbacks": callbacks})
169
+ File "/workspace/LlamaFactory/src/llamafactory/train/tuner.py", line 91, in _training_function
170
+ run_pt(model_args, data_args, training_args, finetuning_args, callbacks)
171
+ File "/workspace/LlamaFactory/src/llamafactory/train/pt/workflow.py", line 63, in run_pt
172
+ train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
173
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
174
+ File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2174, in train
175
+ return inner_training_loop(
176
+ ^^^^^^^^^^^^^^^^^^^^
177
+ File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2536, in _inner_training_loop
178
+ tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
179
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
180
+ File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 3837, in training_step
181
+ self.accelerator.backward(loss, **kwargs)
182
+ File "/usr/local/lib/python3.11/dist-packages/accelerate/accelerator.py", line 2740, in backward
183
+ loss.backward(**kwargs)
184
+ File "/usr/local/lib/python3.11/dist-packages/torch/_tensor.py", line 521, in backward
185
+ torch.autograd.backward(
186
+ File "/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py", line 289, in backward
187
+ _engine_run_backward(
188
+ File "/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py", line 769, in _engine_run_backward
189
+ return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
190
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
191
+ KeyboardInterrupt
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/requirements.txt ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pytz==2025.2
2
+ pydub==0.25.1
3
+ brotli==1.2.0
4
+ antlr4-python3-runtime==4.9.3
5
+ xxhash==3.6.0
6
+ websockets==15.0.1
7
+ tzdata==2025.3
8
+ typing_extensions==4.15.0
9
+ tqdm==4.67.3
10
+ tomlkit==0.13.3
11
+ termcolor==3.3.0
12
+ shtab==1.8.0
13
+ shellingham==1.5.4
14
+ sentencepiece==0.2.1
15
+ semantic-version==2.10.0
16
+ safetensors==0.7.0
17
+ ruff==0.15.0
18
+ regex==2026.1.15
19
+ python-multipart==0.0.22
20
+ pyparsing==3.3.2
21
+ pyarrow==23.0.0
22
+ protobuf==6.33.5
23
+ propcache==0.4.1
24
+ orjson==3.11.7
25
+ omegaconf==2.3.0
26
+ numpy==2.4.2
27
+ multidict==6.7.1
28
+ mdurl==0.1.2
29
+ kiwisolver==1.4.9
30
+ hf-xet==1.2.0
31
+ hf_transfer==0.1.9
32
+ groovy==0.1.2
33
+ frozenlist==1.8.0
34
+ fonttools==4.61.1
35
+ ffmpy==1.0.0
36
+ einops==0.8.2
37
+ docstring_parser==0.17.0
38
+ dill==0.3.8
39
+ cycler==0.12.1
40
+ click==8.3.1
41
+ av==16.0.0
42
+ annotated-types==0.7.0
43
+ annotated-doc==0.0.4
44
+ aiohappyeyeballs==2.6.1
45
+ aiofiles==24.1.0
46
+ yarl==1.22.0
47
+ uvicorn==0.40.0
48
+ typing-inspection==0.4.2
49
+ typer-slim==0.21.1
50
+ tiktoken==0.12.0
51
+ scipy==1.17.0
52
+ pydantic_core==2.41.4
53
+ pandas==2.3.3
54
+ multiprocess==0.70.16
55
+ modelscope==1.34.0
56
+ markdown-it-py==4.0.0
57
+ fire==0.7.1
58
+ contourpy==1.3.3
59
+ anyio==4.12.1
60
+ aiosignal==1.4.0
61
+ starlette==0.50.0
62
+ rich==14.3.2
63
+ pydantic==2.12.3
64
+ matplotlib==3.10.8
65
+ aiohttp==3.13.3
66
+ tyro==0.8.14
67
+ typer==0.21.1
68
+ torchdata==0.11.0
69
+ sse-starlette==3.2.0
70
+ safehttpx==0.1.7
71
+ huggingface_hub==1.3.7
72
+ fastapi==0.128.0
73
+ tokenizers==0.22.2
74
+ gradio_client==1.14.0
75
+ datasets==4.0.0
76
+ accelerate==1.11.0
77
+ transformers==5.0.0
78
+ gradio==5.50.0
79
+ trl==0.24.0
80
+ peft==0.18.1
81
+ llamafactory==0.9.5.dev0
82
+ jieba==0.42.1
83
+ rouge-chinese==1.0.3
84
+ joblib==1.5.3
85
+ nltk==3.9.2
86
+ py-cpuinfo==9.0.0
87
+ nvidia-ml-py==13.590.48
88
+ hjson==3.1.0
89
+ ninja==1.13.0
90
+ msgpack==1.1.2
91
+ deepspeed==0.16.9
92
+ smmap==5.0.2
93
+ sentry-sdk==2.51.0
94
+ gitdb==4.0.12
95
+ GitPython==3.1.46
96
+ wandb==0.24.1
97
+ entrypoints==0.4
98
+ jupyter_client==7.4.9
99
+ nbclassic==1.1.0
100
+ notebook==6.5.5
101
+ pyzmq==24.0.1
102
+ PyYAML==6.0.2
103
+ Send2Trash==1.8.3
104
+ argon2-cffi==23.1.0
105
+ argon2-cffi-bindings==21.2.0
106
+ arrow==1.3.0
107
+ asttokens==2.4.1
108
+ async-lru==2.0.4
109
+ attrs==24.2.0
110
+ babel==2.16.0
111
+ beautifulsoup4==4.12.3
112
+ bleach==6.1.0
113
+ certifi==2024.8.30
114
+ cffi==1.17.1
115
+ charset-normalizer==3.3.2
116
+ comm==0.2.2
117
+ debugpy==1.8.5
118
+ decorator==5.1.1
119
+ defusedxml==0.7.1
120
+ executing==2.1.0
121
+ fastjsonschema==2.20.0
122
+ fqdn==1.5.1
123
+ h11==0.14.0
124
+ httpcore==1.0.5
125
+ httpx==0.27.2
126
+ idna==3.10
127
+ ipykernel==6.29.5
128
+ ipython==8.27.0
129
+ ipython-genutils==0.2.0
130
+ ipywidgets==8.1.5
131
+ isoduration==20.11.0
132
+ jedi==0.19.1
133
+ json5==0.9.25
134
+ jsonpointer==3.0.0
135
+ jsonschema==4.23.0
136
+ jsonschema-specifications==2023.12.1
137
+ jupyter-archive==3.4.0
138
+ jupyter_contrib_core==0.4.2
139
+ jupyter_contrib_nbextensions==0.7.0
140
+ jupyter_core==5.7.2
141
+ jupyter-events==0.10.0
142
+ jupyter-highlight-selected-word==0.2.0
143
+ jupyter-lsp==2.2.5
144
+ jupyter_nbextensions_configurator==0.6.4
145
+ jupyter_server==2.14.2
146
+ jupyter_server_terminals==0.5.3
147
+ jupyterlab==4.2.5
148
+ jupyterlab_pygments==0.3.0
149
+ jupyterlab_server==2.27.3
150
+ jupyterlab_widgets==3.0.13
151
+ lxml==5.3.0
152
+ matplotlib-inline==0.1.7
153
+ mistune==3.0.2
154
+ nbclient==0.10.0
155
+ nbconvert==7.16.4
156
+ nbformat==5.10.4
157
+ nest-asyncio==1.6.0
158
+ notebook_shim==0.2.4
159
+ overrides==7.7.0
160
+ packaging==24.1
161
+ pandocfilters==1.5.1
162
+ parso==0.8.4
163
+ pexpect==4.9.0
164
+ platformdirs==4.3.6
165
+ prometheus_client==0.21.0
166
+ prompt_toolkit==3.0.47
167
+ psutil==6.0.0
168
+ ptyprocess==0.7.0
169
+ pure_eval==0.2.3
170
+ pycparser==2.22
171
+ Pygments==2.18.0
172
+ python-dateutil==2.9.0.post0
173
+ python-json-logger==2.0.7
174
+ referencing==0.35.1
175
+ requests==2.32.3
176
+ rfc3339-validator==0.1.4
177
+ rfc3986-validator==0.1.1
178
+ rpds-py==0.20.0
179
+ sniffio==1.3.1
180
+ soupsieve==2.6
181
+ stack-data==0.6.3
182
+ terminado==0.18.1
183
+ tinycss2==1.3.0
184
+ tornado==6.4.1
185
+ traitlets==5.14.3
186
+ types-python-dateutil==2.9.0.20240906
187
+ uri-template==1.3.0
188
+ urllib3==2.2.3
189
+ wcwidth==0.2.13
190
+ webcolors==24.8.0
191
+ webencodings==0.5.1
192
+ websocket-client==1.8.0
193
+ widgetsnbextension==4.0.13
194
+ Jinja2==3.1.3
195
+ MarkupSafe==2.1.5
196
+ filelock==3.13.1
197
+ fsspec==2024.2.0
198
+ mpmath==1.3.0
199
+ networkx==3.2.1
200
+ nvidia-cublas-cu12==12.4.2.65
201
+ nvidia-cuda-cupti-cu12==12.4.99
202
+ nvidia-cuda-nvrtc-cu12==12.4.99
203
+ nvidia-cuda-runtime-cu12==12.4.99
204
+ nvidia-cudnn-cu12==9.1.0.70
205
+ nvidia-cufft-cu12==11.2.0.44
206
+ nvidia-curand-cu12==10.3.5.119
207
+ nvidia-cusolver-cu12==11.6.0.99
208
+ nvidia-cusparse-cu12==12.3.0.142
209
+ nvidia-nccl-cu12==2.20.5
210
+ nvidia-nvjitlink-cu12==12.4.99
211
+ nvidia-nvtx-cu12==12.4.99
212
+ pillow==10.2.0
213
+ sympy==1.12
214
+ torch==2.4.1+cu124
215
+ torchaudio==2.4.1+cu124
216
+ torchvision==0.19.1+cu124
217
+ triton==3.0.0
218
+ pip==24.2
219
+ setuptools==75.1.0
220
+ wheel==0.44.0
221
+ PyGObject==3.42.1
222
+ PyJWT==2.3.0
223
+ SecretStorage==3.3.1
224
+ blinker==1.4
225
+ cryptography==3.4.8
226
+ dbus-python==1.2.18
227
+ distro==1.7.0
228
+ httplib2==0.20.2
229
+ importlib-metadata==4.6.4
230
+ jeepney==0.7.1
231
+ keyring==23.5.0
232
+ launchpadlib==1.10.16
233
+ lazr.restfulclient==0.14.4
234
+ lazr.uri==1.0.6
235
+ more-itertools==8.10.0
236
+ oauthlib==3.2.0
237
+ python-apt==2.4.0+ubuntu4
238
+ six==1.16.0
239
+ wadllib==1.3.6
240
+ zipp==1.0.0
241
+ autocommand==2.2.2
242
+ backports.tarfile==1.2.0
243
+ importlib_metadata==8.0.0
244
+ importlib_resources==6.4.0
245
+ inflect==7.3.1
246
+ jaraco.collections==5.1.0
247
+ jaraco.context==5.3.0
248
+ jaraco.functools==4.0.1
249
+ jaraco.text==3.12.1
250
+ more-itertools==10.3.0
251
+ packaging==24.1
252
+ platformdirs==4.2.2
253
+ tomli==2.0.1
254
+ typeguard==4.3.0
255
+ typing_extensions==4.12.2
256
+ wheel==0.43.0
257
+ zipp==3.19.2
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-metadata.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-94-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.10",
4
+ "startedAt": "2026-02-04T04:05:44.037622Z",
5
+ "args": [
6
+ "/workspace/v127rc_exp1/C.yaml"
7
+ ],
8
+ "program": "/usr/local/bin/llamafactory-cli",
9
+ "git": {
10
+ "remote": "https://github.com/hiyouga/LlamaFactory.git",
11
+ "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
12
+ },
13
+ "email": "markmochi200@gmail.com",
14
+ "root": "/workspace/LlamaFactory",
15
+ "host": "47a53adf0198",
16
+ "executable": "/usr/bin/python",
17
+ "cpu_count": 16,
18
+ "cpu_count_logical": 32,
19
+ "gpu": "NVIDIA GeForce RTX 4090",
20
+ "gpu_count": 1,
21
+ "disk": {
22
+ "/": {
23
+ "total": "21474836480",
24
+ "used": "1858318336"
25
+ }
26
+ },
27
+ "memory": {
28
+ "total": "201701408768"
29
+ },
30
+ "gpu_nvidia": [
31
+ {
32
+ "name": "NVIDIA GeForce RTX 4090",
33
+ "memoryTotal": "25757220864",
34
+ "cudaCores": 16384,
35
+ "architecture": "Ada",
36
+ "uuid": "GPU-2ae1a495-e17f-23d9-e8ed-90585b3df9de"
37
+ }
38
+ ],
39
+ "cudaVersion": "13.0",
40
+ "writerId": "jy6in5azojamixlag12ky8yqk0a5luc8"
41
+ }
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_runtime":159,"_timestamp":1.770178104014671e+09,"train/grad_norm":0.7178835272789001,"_wandb":{"runtime":159},"train/train_tokens_per_second":1990.521,"train/num_input_tokens_seen":319332,"train/global_step":156,"train/epoch":0.041946759881688625,"train_runtime":160.4264,"train/loss":1.4694324731826782,"train/learning_rate":2.0833333333333336e-05,"_step":155}
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug-internal.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-02-04T04:05:44.28893781Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
2
+ {"time":"2026-02-04T04:05:44.666073338Z","level":"INFO","msg":"stream: created new stream","id":"nj0w4q6e"}
3
+ {"time":"2026-02-04T04:05:44.666543269Z","level":"INFO","msg":"handler: started","stream_id":"nj0w4q6e"}
4
+ {"time":"2026-02-04T04:05:44.668183448Z","level":"INFO","msg":"stream: started","id":"nj0w4q6e"}
5
+ {"time":"2026-02-04T04:05:44.668196893Z","level":"INFO","msg":"writer: started","stream_id":"nj0w4q6e"}
6
+ {"time":"2026-02-04T04:05:44.668198065Z","level":"INFO","msg":"sender: started","stream_id":"nj0w4q6e"}
7
+ {"time":"2026-02-04T04:08:24.969216421Z","level":"INFO","msg":"stream: closing","id":"nj0w4q6e"}
8
+ {"time":"2026-02-04T04:08:25.578748227Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2026-02-04T04:08:25.833732236Z","level":"INFO","msg":"handler: closed","stream_id":"nj0w4q6e"}
10
+ {"time":"2026-02-04T04:08:25.837480922Z","level":"INFO","msg":"sender: closed","stream_id":"nj0w4q6e"}
11
+ {"time":"2026-02-04T04:08:25.837821633Z","level":"INFO","msg":"stream: closed","id":"nj0w4q6e"}
LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug.log ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-02-04 04:05:44,065 INFO MainThread:6386 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
2
+ 2026-02-04 04:05:44,065 INFO MainThread:6386 [wandb_setup.py:_flush():81] Configure stats pid to 6386
3
+ 2026-02-04 04:05:44,066 INFO MainThread:6386 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-02-04 04:05:44,066 INFO MainThread:6386 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug.log
5
+ 2026-02-04 04:05:44,067 INFO MainThread:6386 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_040544-nj0w4q6e/logs/debug-internal.log
6
+ 2026-02-04 04:05:44,067 INFO MainThread:6386 [wandb_init.py:init():844] calling init triggers
7
+ 2026-02-04 04:05:44,068 INFO MainThread:6386 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'_wandb': {}}
9
+ 2026-02-04 04:05:44,068 INFO MainThread:6386 [wandb_init.py:init():892] starting backend
10
+ 2026-02-04 04:05:44,278 INFO MainThread:6386 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-02-04 04:05:44,286 INFO MainThread:6386 [wandb_init.py:init():903] backend started and connected
12
+ 2026-02-04 04:05:44,288 INFO MainThread:6386 [wandb_init.py:init():973] updated telemetry
13
+ 2026-02-04 04:05:44,352 INFO MainThread:6386 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-02-04 04:05:44,992 INFO MainThread:6386 [wandb_init.py:init():1042] starting run threads in backend
15
+ 2026-02-04 04:05:45,060 INFO MainThread:6386 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-02-04 04:05:45,060 INFO MainThread:6386 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-02-04 04:05:45,061 INFO MainThread:6386 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-02-04 04:05:45,061 INFO MainThread:6386 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-02-04 04:05:45,063 INFO MainThread:6386 [wandb_init.py:init():1082] run started, returning control to user process
20
+ 2026-02-04 04:05:45,064 INFO MainThread:6386 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['o_proj', 'down_proj', 'gate_proj', 'v_proj', 'k_proj', 'q_proj', 'up_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/C', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 266, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
21
+ 2026-02-04 04:05:45,071 INFO MainThread:6386 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7ea90c2fcf90>>
22
+ 2026-02-04 04:05:45,071 INFO MainThread:6386 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
23
+ 2026-02-04 04:05:45,073 INFO MainThread:6386 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d35_r286'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
24
+ 2026-02-04 04:08:24,969 INFO wandb-AsyncioManager-main:6386 [service_client.py:_forward_responses():94] Reached EOF.
25
+ 2026-02-04 04:08:24,970 INFO wandb-AsyncioManager-main:6386 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.
LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/config.yaml ADDED
@@ -0,0 +1,723 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: /workspace/Qwen/Qwen3-8B-Base
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.24.1
6
+ e:
7
+ dq2kg12neczzbdsqmciypnior6fee84h:
8
+ args:
9
+ - /workspace/v127rc_exp1/B_dup.yaml
10
+ cpu_count: 16
11
+ cpu_count_logical: 32
12
+ cudaVersion: "12.7"
13
+ disk:
14
+ /:
15
+ total: "21474836480"
16
+ used: "2193969152"
17
+ email: markmochi200@gmail.com
18
+ executable: /usr/bin/python
19
+ git:
20
+ commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
21
+ remote: https://github.com/hiyouga/LlamaFactory.git
22
+ gpu: NVIDIA GeForce RTX 4090
23
+ gpu_count: 1
24
+ gpu_nvidia:
25
+ - architecture: Ada
26
+ cudaCores: 16384
27
+ memoryTotal: "25757220864"
28
+ name: NVIDIA GeForce RTX 4090
29
+ uuid: GPU-1c2ea8ac-6c6f-58d4-0df9-20a74e0985f1
30
+ host: e5c6872797ac
31
+ memory:
32
+ total: "201701502976"
33
+ os: Linux-6.8.0-52-generic-x86_64-with-glibc2.35
34
+ program: /usr/local/bin/llamafactory-cli
35
+ python: CPython 3.11.10
36
+ root: /workspace/LlamaFactory
37
+ startedAt: "2026-02-04T08:35:48.570855Z"
38
+ writerId: dq2kg12neczzbdsqmciypnior6fee84h
39
+ m:
40
+ - "1": train/global_step
41
+ "6":
42
+ - 3
43
+ "7": []
44
+ - "2": '*'
45
+ "5": 1
46
+ "6":
47
+ - 1
48
+ "7": []
49
+ python_version: 3.11.10
50
+ t:
51
+ "1":
52
+ - 1
53
+ - 11
54
+ - 41
55
+ - 49
56
+ - 51
57
+ - 71
58
+ - 84
59
+ - 98
60
+ - 105
61
+ "2":
62
+ - 1
63
+ - 11
64
+ - 41
65
+ - 49
66
+ - 51
67
+ - 71
68
+ - 84
69
+ - 98
70
+ - 105
71
+ "3":
72
+ - 7
73
+ - 19
74
+ - 62
75
+ - 66
76
+ "4": 3.11.10
77
+ "5": 0.24.1
78
+ "6": 5.0.0
79
+ "9":
80
+ "1": transformers_trainer
81
+ "12": 0.24.1
82
+ "13": linux-x86_64
83
+ accelerator_config:
84
+ value:
85
+ dispatch_batches: null
86
+ even_batches: true
87
+ gradient_accumulation_kwargs: null
88
+ non_blocking: false
89
+ split_batches: false
90
+ use_seedable_sampler: true
91
+ adam_beta1:
92
+ value: 0.9
93
+ adam_beta2:
94
+ value: 0.95
95
+ adam_epsilon:
96
+ value: 1e-08
97
+ architectures:
98
+ value:
99
+ - Qwen3ForCausalLM
100
+ attention_bias:
101
+ value: false
102
+ attention_dropout:
103
+ value: 0
104
+ auto_find_batch_size:
105
+ value: false
106
+ average_tokens_across_devices:
107
+ value: true
108
+ batch_eval_metrics:
109
+ value: false
110
+ bf16:
111
+ value: true
112
+ bf16_full_eval:
113
+ value: false
114
+ bos_token_id:
115
+ value: null
116
+ chunk_size_feed_forward:
117
+ value: 0
118
+ data_args:
119
+ value:
120
+ buffer_size: 16384
121
+ cutoff_len: 2047
122
+ data_shared_file_system: false
123
+ dataset:
124
+ - Markie_Voss_t0_d35_r286
125
+ dataset_dir: /workspace/LlamaFactory/data
126
+ default_system: null
127
+ enable_thinking: false
128
+ eval_dataset: null
129
+ eval_num_beams: null
130
+ eval_on_each_dataset: false
131
+ ignore_pad_token_for_loss: true
132
+ interleave_probs: null
133
+ mask_history: false
134
+ max_samples: 100000000
135
+ media_dir: /workspace/LlamaFactory/data
136
+ mix_strategy: concat
137
+ neat_packing: false
138
+ overwrite_cache: false
139
+ packing: true
140
+ preprocessing_batch_size: 1000
141
+ preprocessing_num_workers: 16
142
+ streaming: false
143
+ template: qwen3_nothink
144
+ tokenized_path: null
145
+ tool_format: null
146
+ train_on_prompt: false
147
+ val_size: 0
148
+ data_seed:
149
+ value: null
150
+ dataloader_drop_last:
151
+ value: false
152
+ dataloader_num_workers:
153
+ value: 0
154
+ dataloader_persistent_workers:
155
+ value: false
156
+ dataloader_pin_memory:
157
+ value: true
158
+ dataloader_prefetch_factor:
159
+ value: null
160
+ ddp_backend:
161
+ value: null
162
+ ddp_broadcast_buffers:
163
+ value: null
164
+ ddp_bucket_cap_mb:
165
+ value: null
166
+ ddp_find_unused_parameters:
167
+ value: null
168
+ ddp_timeout:
169
+ value: 180000000
170
+ debug:
171
+ value: []
172
+ deepspeed:
173
+ value: null
174
+ disable_tqdm:
175
+ value: false
176
+ do_eval:
177
+ value: false
178
+ do_predict:
179
+ value: false
180
+ do_train:
181
+ value: true
182
+ dtype:
183
+ value: bfloat16
184
+ enable_jit_checkpoint:
185
+ value: false
186
+ eos_token_id:
187
+ value: 151645
188
+ eval_accumulation_steps:
189
+ value: null
190
+ eval_delay:
191
+ value: 0
192
+ eval_do_concat_batches:
193
+ value: true
194
+ eval_on_start:
195
+ value: false
196
+ eval_steps:
197
+ value: null
198
+ eval_strategy:
199
+ value: "no"
200
+ eval_use_gather_object:
201
+ value: false
202
+ finetuning_args:
203
+ value:
204
+ additional_target: null
205
+ apollo_layerwise: false
206
+ apollo_proj: random
207
+ apollo_proj_type: std
208
+ apollo_rank: 16
209
+ apollo_scale: 32
210
+ apollo_scale_front: false
211
+ apollo_scale_type: channel
212
+ apollo_target:
213
+ - all
214
+ apollo_update_interval: 200
215
+ badam_mask_mode: adjacent
216
+ badam_mode: layer
217
+ badam_start_block: null
218
+ badam_switch_interval: 50
219
+ badam_switch_mode: ascending
220
+ badam_update_ratio: 0.05
221
+ badam_verbose: 0
222
+ compute_accuracy: false
223
+ create_new_adapter: false
224
+ disable_shuffling: false
225
+ dpo_label_smoothing: 0
226
+ eaft_alpha: 1
227
+ early_stopping_steps: null
228
+ finetuning_type: lora
229
+ freeze_extra_modules: null
230
+ freeze_language_model: false
231
+ freeze_multi_modal_projector: true
232
+ freeze_trainable_layers: 2
233
+ freeze_trainable_modules:
234
+ - all
235
+ freeze_vision_tower: true
236
+ galore_layerwise: false
237
+ galore_proj_type: std
238
+ galore_rank: 16
239
+ galore_scale: 2
240
+ galore_target:
241
+ - all
242
+ galore_update_interval: 200
243
+ include_effective_tokens_per_second: false
244
+ kto_chosen_weight: 1
245
+ kto_rejected_weight: 1
246
+ ld_alpha: null
247
+ lora_alpha: 32
248
+ lora_dropout: 0.03
249
+ lora_rank: 16
250
+ lora_target:
251
+ - all
252
+ loraplus_lr_embedding: 1e-06
253
+ loraplus_lr_ratio: null
254
+ module_dropout: 0
255
+ oft_block_size: 32
256
+ oft_rank: 0
257
+ oft_target:
258
+ - all
259
+ pissa_convert: false
260
+ pissa_init: false
261
+ pissa_iter: 16
262
+ plot_loss: true
263
+ ppo_buffer_size: 1
264
+ ppo_epochs: 4
265
+ ppo_score_norm: false
266
+ ppo_target: 6
267
+ ppo_whiten_rewards: false
268
+ pref_bco_weight: 0
269
+ pref_beta: 0.1
270
+ pref_ftx: 0
271
+ pref_loss: sigmoid
272
+ pure_bf16: false
273
+ ref_model: null
274
+ ref_model_adapters: null
275
+ ref_model_quantization_bit: null
276
+ reward_model: null
277
+ reward_model_adapters: null
278
+ reward_model_quantization_bit: null
279
+ reward_model_type: lora
280
+ simpo_gamma: 0.5
281
+ stage: pt
282
+ swanlab_api_key: <SWANLAB_API_KEY>
283
+ swanlab_lark_secret: null
284
+ swanlab_lark_webhook_url: null
285
+ swanlab_logdir: null
286
+ swanlab_mode: cloud
287
+ swanlab_project: llamafactory
288
+ swanlab_run_name: null
289
+ swanlab_workspace: null
290
+ use_adam_mini: false
291
+ use_apollo: false
292
+ use_badam: false
293
+ use_dft_loss: false
294
+ use_dora: false
295
+ use_eaft_loss: false
296
+ use_galore: false
297
+ use_llama_pro: false
298
+ use_mca: false
299
+ use_muon: false
300
+ use_rslora: false
301
+ use_swanlab: false
302
+ fp8:
303
+ value: false
304
+ fp8_backend:
305
+ value: auto
306
+ fp8_enable_fsdp_float8_all_gather:
307
+ value: false
308
+ fp16:
309
+ value: false
310
+ fp16_full_eval:
311
+ value: false
312
+ fsdp:
313
+ value: []
314
+ fsdp_config:
315
+ value:
316
+ min_num_params: 0
317
+ xla: false
318
+ xla_fsdp_grad_ckpt: false
319
+ xla_fsdp_v2: false
320
+ full_determinism:
321
+ value: false
322
+ generating_args:
323
+ value:
324
+ do_sample: true
325
+ length_penalty: 1
326
+ max_new_tokens: 1024
327
+ num_beams: 1
328
+ repetition_penalty: 1
329
+ skip_special_tokens: true
330
+ temperature: 0.95
331
+ top_k: 50
332
+ top_p: 0.7
333
+ generation_config:
334
+ value: null
335
+ generation_max_length:
336
+ value: 2047
337
+ generation_num_beams:
338
+ value: null
339
+ gradient_accumulation_steps:
340
+ value: 1
341
+ gradient_checkpointing:
342
+ value: false
343
+ gradient_checkpointing_kwargs:
344
+ value: null
345
+ greater_is_better:
346
+ value: null
347
+ group_by_length:
348
+ value: false
349
+ head_dim:
350
+ value: 128
351
+ hidden_act:
352
+ value: silu
353
+ hidden_size:
354
+ value: 4096
355
+ hub_always_push:
356
+ value: false
357
+ hub_model_id:
358
+ value: null
359
+ hub_private_repo:
360
+ value: null
361
+ hub_revision:
362
+ value: null
363
+ hub_strategy:
364
+ value: every_save
365
+ hub_token:
366
+ value: <HUB_TOKEN>
367
+ id2label:
368
+ value:
369
+ "0": LABEL_0
370
+ "1": LABEL_1
371
+ ignore_data_skip:
372
+ value: false
373
+ include_for_metrics:
374
+ value: []
375
+ include_num_input_tokens_seen:
376
+ value: all
377
+ initializer_range:
378
+ value: 0.02
379
+ intermediate_size:
380
+ value: 12288
381
+ is_encoder_decoder:
382
+ value: false
383
+ label_names:
384
+ value:
385
+ - labels
386
+ label_smoothing_factor:
387
+ value: 0
388
+ label2id:
389
+ value:
390
+ LABEL_0: 0
391
+ LABEL_1: 1
392
+ layer_types:
393
+ value:
394
+ - full_attention
395
+ - full_attention
396
+ - full_attention
397
+ - full_attention
398
+ - full_attention
399
+ - full_attention
400
+ - full_attention
401
+ - full_attention
402
+ - full_attention
403
+ - full_attention
404
+ - full_attention
405
+ - full_attention
406
+ - full_attention
407
+ - full_attention
408
+ - full_attention
409
+ - full_attention
410
+ - full_attention
411
+ - full_attention
412
+ - full_attention
413
+ - full_attention
414
+ - full_attention
415
+ - full_attention
416
+ - full_attention
417
+ - full_attention
418
+ - full_attention
419
+ - full_attention
420
+ - full_attention
421
+ - full_attention
422
+ - full_attention
423
+ - full_attention
424
+ - full_attention
425
+ - full_attention
426
+ - full_attention
427
+ - full_attention
428
+ - full_attention
429
+ - full_attention
430
+ learning_rate:
431
+ value: 5e-05
432
+ length_column_name:
433
+ value: length
434
+ liger_kernel_config:
435
+ value: null
436
+ load_best_model_at_end:
437
+ value: false
438
+ local_rank:
439
+ value: -1
440
+ log_level:
441
+ value: passive
442
+ log_level_replica:
443
+ value: warning
444
+ log_on_each_node:
445
+ value: true
446
+ logging_dir:
447
+ value: null
448
+ logging_first_step:
449
+ value: false
450
+ logging_nan_inf_filter:
451
+ value: true
452
+ logging_steps:
453
+ value: 1
454
+ logging_strategy:
455
+ value: steps
456
+ lr_scheduler_kwargs:
457
+ value: null
458
+ lr_scheduler_type:
459
+ value: cosine
460
+ master_addr:
461
+ value: null
462
+ master_port:
463
+ value: null
464
+ max_grad_norm:
465
+ value: 1
466
+ max_position_embeddings:
467
+ value: 32768
468
+ max_steps:
469
+ value: -1
470
+ max_window_layers:
471
+ value: 36
472
+ metric_for_best_model:
473
+ value: null
474
+ model/num_parameters:
475
+ value: 8234382336
476
+ model_args:
477
+ value:
478
+ adapter_folder: null
479
+ adapter_name_or_path: null
480
+ add_special_tokens: null
481
+ add_tokens: null
482
+ audio_sampling_rate: 16000
483
+ block_diag_attn: false
484
+ cache_dir: null
485
+ chunk_size: 8192
486
+ compute_dtype: torch.bfloat16
487
+ cpu_infer: 32
488
+ crop_to_patches: false
489
+ device_map:
490
+ "": cuda:0
491
+ disable_gradient_checkpointing: false
492
+ double_quantization: true
493
+ enable_liger_kernel: false
494
+ export_device: cpu
495
+ export_dir: null
496
+ export_hub_model_id: null
497
+ export_legacy_format: false
498
+ export_quantization_bit: null
499
+ export_quantization_dataset: null
500
+ export_quantization_maxlen: 1024
501
+ export_quantization_nsamples: 128
502
+ export_size: 5
503
+ flash_attn: auto
504
+ hf_hub_token: <HF_HUB_TOKEN>
505
+ image_do_pan_and_scan: false
506
+ image_max_pixels: 589824
507
+ image_min_pixels: 1024
508
+ infer_backend: HF
509
+ infer_dtype: auto
510
+ init_special_tokens: noise_init
511
+ kt_force_think: false
512
+ kt_maxlen: 4096
513
+ kt_mode: normal
514
+ kt_optimize_rule: null
515
+ kt_use_cuda_graph: true
516
+ low_cpu_mem_usage: true
517
+ mixture_of_depths: null
518
+ mode: normal
519
+ model_max_length: 2047
520
+ model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
521
+ model_revision: main
522
+ moe_aux_loss_coef: null
523
+ ms_hub_token: <MS_HUB_TOKEN>
524
+ new_special_tokens_config: null
525
+ offload_folder: offload
526
+ om_hub_token: <OM_HUB_TOKEN>
527
+ print_param_status: false
528
+ quantization_bit: null
529
+ quantization_device_map: null
530
+ quantization_method: BNB
531
+ quantization_type: nf4
532
+ resize_vocab: false
533
+ rope_scaling: null
534
+ sglang_config: null
535
+ sglang_lora_backend: triton
536
+ sglang_maxlen: 4096
537
+ sglang_mem_fraction: 0.7
538
+ sglang_tp_size: -1
539
+ shift_attn: false
540
+ split_special_tokens: false
541
+ train_from_scratch: false
542
+ trust_remote_code: true
543
+ upcast_layernorm: false
544
+ upcast_lmhead_output: false
545
+ use_audio_in_video: false
546
+ use_fast_tokenizer: true
547
+ use_kt: false
548
+ use_kv_cache: true
549
+ use_reentrant_gc: true
550
+ use_unsloth: false
551
+ use_unsloth_gc: false
552
+ use_v1_kernels: false
553
+ video_fps: 2
554
+ video_max_pixels: 65536
555
+ video_maxlen: 128
556
+ video_min_pixels: 256
557
+ vllm_config: null
558
+ vllm_enforce_eager: false
559
+ vllm_gpu_util: 0.7
560
+ vllm_max_lora_rank: 32
561
+ vllm_maxlen: 4096
562
+ model_type:
563
+ value: qwen3
564
+ neftune_noise_alpha:
565
+ value: null
566
+ num_attention_heads:
567
+ value: 32
568
+ num_hidden_layers:
569
+ value: 36
570
+ num_key_value_heads:
571
+ value: 8
572
+ num_train_epochs:
573
+ value: 5
574
+ optim:
575
+ value: adamw_torch
576
+ optim_args:
577
+ value: null
578
+ optim_target_modules:
579
+ value: null
580
+ output_attentions:
581
+ value: false
582
+ output_dir:
583
+ value: /workspace/v127rc_exp1/B_dup
584
+ output_hidden_states:
585
+ value: false
586
+ overwrite_output_dir:
587
+ value: false
588
+ pad_token_id:
589
+ value: 151643
590
+ parallelism_config:
591
+ value: null
592
+ peft_config:
593
+ value:
594
+ default:
595
+ alora_invocation_tokens: null
596
+ arrow_config: null
597
+ auto_mapping: null
598
+ base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
599
+ bias: none
600
+ corda_config: null
601
+ ensure_weight_tying: false
602
+ eva_config: null
603
+ exclude_modules: null
604
+ fan_in_fan_out: false
605
+ inference_mode: false
606
+ init_lora_weights: true
607
+ layer_replication: null
608
+ layers_pattern: null
609
+ layers_to_transform: null
610
+ lora_alpha: 32
611
+ lora_bias: false
612
+ lora_dropout: 0.03
613
+ megatron_config: null
614
+ megatron_core: megatron.core
615
+ modules_to_save: null
616
+ peft_type: LORA
617
+ peft_version: 0.18.1
618
+ qalora_group_size: 16
619
+ r: 16
620
+ revision: null
621
+ runtime_config:
622
+ ephemeral_gpu_offload: false
623
+ target_modules:
624
+ - o_proj
625
+ - gate_proj
626
+ - k_proj
627
+ - up_proj
628
+ - v_proj
629
+ - q_proj
630
+ - down_proj
631
+ target_parameters: null
632
+ task_type: CAUSAL_LM
633
+ trainable_token_indices: null
634
+ use_dora: false
635
+ use_qalora: false
636
+ use_rslora: false
637
+ per_device_eval_batch_size:
638
+ value: 8
639
+ per_device_train_batch_size:
640
+ value: 1
641
+ predict_with_generate:
642
+ value: false
643
+ prediction_loss_only:
644
+ value: false
645
+ problem_type:
646
+ value: null
647
+ project:
648
+ value: huggingface
649
+ push_to_hub:
650
+ value: false
651
+ ray_init_kwargs:
652
+ value: null
653
+ ray_num_workers:
654
+ value: 1
655
+ remove_unused_columns:
656
+ value: false
657
+ report_to:
658
+ value:
659
+ - wandb
660
+ restore_callback_states_from_checkpoint:
661
+ value: false
662
+ resume_from_checkpoint:
663
+ value: null
664
+ return_dict:
665
+ value: true
666
+ rms_norm_eps:
667
+ value: 1e-06
668
+ rope_parameters:
669
+ value:
670
+ rope_theta: 1000000
671
+ rope_type: default
672
+ run_name:
673
+ value: null
674
+ save_on_each_node:
675
+ value: false
676
+ save_only_model:
677
+ value: true
678
+ save_steps:
679
+ value: 1000
680
+ save_strategy:
681
+ value: steps
682
+ save_total_limit:
683
+ value: null
684
+ seed:
685
+ value: 42
686
+ skip_memory_metrics:
687
+ value: true
688
+ sliding_window:
689
+ value: null
690
+ sortish_sampler:
691
+ value: false
692
+ tf32:
693
+ value: null
694
+ tie_word_embeddings:
695
+ value: false
696
+ torch_compile:
697
+ value: false
698
+ torch_compile_backend:
699
+ value: null
700
+ torch_compile_mode:
701
+ value: null
702
+ torch_empty_cache_steps:
703
+ value: null
704
+ trackio_space_id:
705
+ value: trackio
706
+ transformers_version:
707
+ value: 5.0.0
708
+ use_cache:
709
+ value: false
710
+ use_cpu:
711
+ value: false
712
+ use_liger_kernel:
713
+ value: false
714
+ use_sliding_window:
715
+ value: false
716
+ vocab_size:
717
+ value: 151936
718
+ warmup_ratio:
719
+ value: 0.02
720
+ warmup_steps:
721
+ value: 0.02
722
+ weight_decay:
723
+ value: 0
LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/requirements.txt ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pytz==2025.2
2
+ pydub==0.25.1
3
+ brotli==1.2.0
4
+ antlr4-python3-runtime==4.9.3
5
+ xxhash==3.6.0
6
+ websockets==15.0.1
7
+ tzdata==2025.3
8
+ typing_extensions==4.15.0
9
+ tqdm==4.67.3
10
+ tomlkit==0.13.3
11
+ termcolor==3.3.0
12
+ shtab==1.8.0
13
+ shellingham==1.5.4
14
+ sentencepiece==0.2.1
15
+ semantic-version==2.10.0
16
+ safetensors==0.7.0
17
+ ruff==0.15.0
18
+ regex==2026.1.15
19
+ python-multipart==0.0.22
20
+ pyparsing==3.3.2
21
+ pyarrow==23.0.0
22
+ protobuf==6.33.5
23
+ propcache==0.4.1
24
+ orjson==3.11.7
25
+ omegaconf==2.3.0
26
+ numpy==2.4.2
27
+ multidict==6.7.1
28
+ mdurl==0.1.2
29
+ kiwisolver==1.4.9
30
+ hf-xet==1.2.0
31
+ hf_transfer==0.1.9
32
+ groovy==0.1.2
33
+ frozenlist==1.8.0
34
+ fonttools==4.61.1
35
+ ffmpy==1.0.0
36
+ einops==0.8.2
37
+ docstring_parser==0.17.0
38
+ dill==0.3.8
39
+ cycler==0.12.1
40
+ click==8.3.1
41
+ av==16.0.0
42
+ annotated-types==0.7.0
43
+ annotated-doc==0.0.4
44
+ aiohappyeyeballs==2.6.1
45
+ aiofiles==24.1.0
46
+ yarl==1.22.0
47
+ uvicorn==0.40.0
48
+ typing-inspection==0.4.2
49
+ typer-slim==0.21.1
50
+ tiktoken==0.12.0
51
+ scipy==1.17.0
52
+ pydantic_core==2.41.4
53
+ pandas==2.3.3
54
+ multiprocess==0.70.16
55
+ modelscope==1.34.0
56
+ markdown-it-py==4.0.0
57
+ fire==0.7.1
58
+ contourpy==1.3.3
59
+ anyio==4.12.1
60
+ aiosignal==1.4.0
61
+ starlette==0.50.0
62
+ rich==14.3.2
63
+ pydantic==2.12.3
64
+ matplotlib==3.10.8
65
+ aiohttp==3.13.3
66
+ tyro==0.8.14
67
+ typer==0.21.1
68
+ torchdata==0.11.0
69
+ sse-starlette==3.2.0
70
+ safehttpx==0.1.7
71
+ huggingface_hub==1.3.7
72
+ fastapi==0.128.0
73
+ tokenizers==0.22.2
74
+ gradio_client==1.14.0
75
+ datasets==4.0.0
76
+ accelerate==1.11.0
77
+ transformers==5.0.0
78
+ gradio==5.50.0
79
+ trl==0.24.0
80
+ peft==0.18.1
81
+ llamafactory==0.9.5.dev0
82
+ jieba==0.42.1
83
+ rouge-chinese==1.0.3
84
+ joblib==1.5.3
85
+ nltk==3.9.2
86
+ py-cpuinfo==9.0.0
87
+ nvidia-ml-py==13.590.48
88
+ hjson==3.1.0
89
+ ninja==1.13.0
90
+ msgpack==1.1.2
91
+ deepspeed==0.16.9
92
+ smmap==5.0.2
93
+ sentry-sdk==2.51.0
94
+ gitdb==4.0.12
95
+ GitPython==3.1.46
96
+ wandb==0.24.1
97
+ entrypoints==0.4
98
+ jupyter_client==7.4.9
99
+ nbclassic==1.1.0
100
+ notebook==6.5.5
101
+ pyzmq==24.0.1
102
+ PyYAML==6.0.2
103
+ Send2Trash==1.8.3
104
+ argon2-cffi==23.1.0
105
+ argon2-cffi-bindings==21.2.0
106
+ arrow==1.3.0
107
+ asttokens==2.4.1
108
+ async-lru==2.0.4
109
+ attrs==24.2.0
110
+ babel==2.16.0
111
+ beautifulsoup4==4.12.3
112
+ bleach==6.1.0
113
+ certifi==2024.8.30
114
+ cffi==1.17.1
115
+ charset-normalizer==3.3.2
116
+ comm==0.2.2
117
+ debugpy==1.8.5
118
+ decorator==5.1.1
119
+ defusedxml==0.7.1
120
+ executing==2.1.0
121
+ fastjsonschema==2.20.0
122
+ fqdn==1.5.1
123
+ h11==0.14.0
124
+ httpcore==1.0.5
125
+ httpx==0.27.2
126
+ idna==3.10
127
+ ipykernel==6.29.5
128
+ ipython==8.27.0
129
+ ipython-genutils==0.2.0
130
+ ipywidgets==8.1.5
131
+ isoduration==20.11.0
132
+ jedi==0.19.1
133
+ json5==0.9.25
134
+ jsonpointer==3.0.0
135
+ jsonschema==4.23.0
136
+ jsonschema-specifications==2023.12.1
137
+ jupyter-archive==3.4.0
138
+ jupyter_contrib_core==0.4.2
139
+ jupyter_contrib_nbextensions==0.7.0
140
+ jupyter_core==5.7.2
141
+ jupyter-events==0.10.0
142
+ jupyter-highlight-selected-word==0.2.0
143
+ jupyter-lsp==2.2.5
144
+ jupyter_nbextensions_configurator==0.6.4
145
+ jupyter_server==2.14.2
146
+ jupyter_server_terminals==0.5.3
147
+ jupyterlab==4.2.5
148
+ jupyterlab_pygments==0.3.0
149
+ jupyterlab_server==2.27.3
150
+ jupyterlab_widgets==3.0.13
151
+ lxml==5.3.0
152
+ matplotlib-inline==0.1.7
153
+ mistune==3.0.2
154
+ nbclient==0.10.0
155
+ nbconvert==7.16.4
156
+ nbformat==5.10.4
157
+ nest-asyncio==1.6.0
158
+ notebook_shim==0.2.4
159
+ overrides==7.7.0
160
+ packaging==24.1
161
+ pandocfilters==1.5.1
162
+ parso==0.8.4
163
+ pexpect==4.9.0
164
+ platformdirs==4.3.6
165
+ prometheus_client==0.21.0
166
+ prompt_toolkit==3.0.47
167
+ psutil==6.0.0
168
+ ptyprocess==0.7.0
169
+ pure_eval==0.2.3
170
+ pycparser==2.22
171
+ Pygments==2.18.0
172
+ python-dateutil==2.9.0.post0
173
+ python-json-logger==2.0.7
174
+ referencing==0.35.1
175
+ requests==2.32.3
176
+ rfc3339-validator==0.1.4
177
+ rfc3986-validator==0.1.1
178
+ rpds-py==0.20.0
179
+ sniffio==1.3.1
180
+ soupsieve==2.6
181
+ stack-data==0.6.3
182
+ terminado==0.18.1
183
+ tinycss2==1.3.0
184
+ tornado==6.4.1
185
+ traitlets==5.14.3
186
+ types-python-dateutil==2.9.0.20240906
187
+ uri-template==1.3.0
188
+ urllib3==2.2.3
189
+ wcwidth==0.2.13
190
+ webcolors==24.8.0
191
+ webencodings==0.5.1
192
+ websocket-client==1.8.0
193
+ widgetsnbextension==4.0.13
194
+ Jinja2==3.1.3
195
+ MarkupSafe==2.1.5
196
+ filelock==3.13.1
197
+ fsspec==2024.2.0
198
+ mpmath==1.3.0
199
+ networkx==3.2.1
200
+ nvidia-cublas-cu12==12.4.2.65
201
+ nvidia-cuda-cupti-cu12==12.4.99
202
+ nvidia-cuda-nvrtc-cu12==12.4.99
203
+ nvidia-cuda-runtime-cu12==12.4.99
204
+ nvidia-cudnn-cu12==9.1.0.70
205
+ nvidia-cufft-cu12==11.2.0.44
206
+ nvidia-curand-cu12==10.3.5.119
207
+ nvidia-cusolver-cu12==11.6.0.99
208
+ nvidia-cusparse-cu12==12.3.0.142
209
+ nvidia-nccl-cu12==2.20.5
210
+ nvidia-nvjitlink-cu12==12.4.99
211
+ nvidia-nvtx-cu12==12.4.99
212
+ pillow==10.2.0
213
+ sympy==1.12
214
+ torch==2.4.1+cu124
215
+ torchaudio==2.4.1+cu124
216
+ torchvision==0.19.1+cu124
217
+ triton==3.0.0
218
+ pip==24.2
219
+ setuptools==75.1.0
220
+ wheel==0.44.0
221
+ PyGObject==3.42.1
222
+ PyJWT==2.3.0
223
+ SecretStorage==3.3.1
224
+ blinker==1.4
225
+ cryptography==3.4.8
226
+ dbus-python==1.2.18
227
+ distro==1.7.0
228
+ httplib2==0.20.2
229
+ importlib-metadata==4.6.4
230
+ jeepney==0.7.1
231
+ keyring==23.5.0
232
+ launchpadlib==1.10.16
233
+ lazr.restfulclient==0.14.4
234
+ lazr.uri==1.0.6
235
+ more-itertools==8.10.0
236
+ oauthlib==3.2.0
237
+ python-apt==2.4.0+ubuntu4
238
+ six==1.16.0
239
+ wadllib==1.3.6
240
+ zipp==1.0.0
241
+ autocommand==2.2.2
242
+ backports.tarfile==1.2.0
243
+ importlib_metadata==8.0.0
244
+ importlib_resources==6.4.0
245
+ inflect==7.3.1
246
+ jaraco.collections==5.1.0
247
+ jaraco.context==5.3.0
248
+ jaraco.functools==4.0.1
249
+ jaraco.text==3.12.1
250
+ more-itertools==10.3.0
251
+ packaging==24.1
252
+ platformdirs==4.2.2
253
+ tomli==2.0.1
254
+ typeguard==4.3.0
255
+ typing_extensions==4.12.2
256
+ wheel==0.43.0
257
+ zipp==3.19.2
LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-metadata.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-52-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.10",
4
+ "startedAt": "2026-02-04T08:35:48.570855Z",
5
+ "args": [
6
+ "/workspace/v127rc_exp1/B_dup.yaml"
7
+ ],
8
+ "program": "/usr/local/bin/llamafactory-cli",
9
+ "git": {
10
+ "remote": "https://github.com/hiyouga/LlamaFactory.git",
11
+ "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
12
+ },
13
+ "email": "markmochi200@gmail.com",
14
+ "root": "/workspace/LlamaFactory",
15
+ "host": "e5c6872797ac",
16
+ "executable": "/usr/bin/python",
17
+ "cpu_count": 16,
18
+ "cpu_count_logical": 32,
19
+ "gpu": "NVIDIA GeForce RTX 4090",
20
+ "gpu_count": 1,
21
+ "disk": {
22
+ "/": {
23
+ "total": "21474836480",
24
+ "used": "2193969152"
25
+ }
26
+ },
27
+ "memory": {
28
+ "total": "201701502976"
29
+ },
30
+ "gpu_nvidia": [
31
+ {
32
+ "name": "NVIDIA GeForce RTX 4090",
33
+ "memoryTotal": "25757220864",
34
+ "cudaCores": 16384,
35
+ "architecture": "Ada",
36
+ "uuid": "GPU-1c2ea8ac-6c6f-58d4-0df9-20a74e0985f1"
37
+ }
38
+ ],
39
+ "cudaVersion": "12.7",
40
+ "writerId": "dq2kg12neczzbdsqmciypnior6fee84h"
41
+ }
LlamaFactory/wandb/run-20260204_083548-pwixiyan/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/grad_norm":0.2597666084766388,"_step":73480,"train_samples_per_second":0.975,"_runtime":75384,"train/epoch":5,"_wandb":{"runtime":75384},"train/num_input_tokens_seen":150413560,"train/train_tokens_per_second":1995.358,"train/loss":0.014940977096557617,"train_steps_per_second":0.975,"_timestamp":1.7702695315018873e+09,"total_flos":6.869735474541773e+18,"train/learning_rate":2.379162700183457e-14,"train_loss":0.08730816244039097,"train_runtime":75383.3694,"train/global_step":73480}
LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug-internal.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-02-04T08:35:48.826256258Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
2
+ {"time":"2026-02-04T08:35:49.141746844Z","level":"INFO","msg":"stream: created new stream","id":"pwixiyan"}
3
+ {"time":"2026-02-04T08:35:49.142115089Z","level":"INFO","msg":"handler: started","stream_id":"pwixiyan"}
4
+ {"time":"2026-02-04T08:35:49.143583725Z","level":"INFO","msg":"stream: started","id":"pwixiyan"}
5
+ {"time":"2026-02-04T08:35:49.143601157Z","level":"INFO","msg":"writer: started","stream_id":"pwixiyan"}
6
+ {"time":"2026-02-04T08:35:49.14359757Z","level":"INFO","msg":"sender: started","stream_id":"pwixiyan"}
7
+ {"time":"2026-02-04T17:47:19.818024452Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pwixiyan/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
8
+ {"time":"2026-02-04T18:31:07.413320842Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pwixiyan/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
9
+ {"time":"2026-02-04T22:59:10.135922468Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pwixiyan/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
10
+ {"time":"2026-02-05T05:32:13.77134292Z","level":"INFO","msg":"stream: closing","id":"pwixiyan"}
11
+ {"time":"2026-02-05T05:32:15.653703901Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
12
+ {"time":"2026-02-05T05:32:15.875179968Z","level":"INFO","msg":"handler: closed","stream_id":"pwixiyan"}
13
+ {"time":"2026-02-05T05:32:15.87824593Z","level":"INFO","msg":"sender: closed","stream_id":"pwixiyan"}
14
+ {"time":"2026-02-05T05:32:15.878535169Z","level":"INFO","msg":"stream: closed","id":"pwixiyan"}
LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug.log ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-02-04 08:35:48,588 INFO MainThread:3069 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
2
+ 2026-02-04 08:35:48,588 INFO MainThread:3069 [wandb_setup.py:_flush():81] Configure stats pid to 3069
3
+ 2026-02-04 08:35:48,589 INFO MainThread:3069 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-02-04 08:35:48,589 INFO MainThread:3069 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug.log
5
+ 2026-02-04 08:35:48,590 INFO MainThread:3069 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_083548-pwixiyan/logs/debug-internal.log
6
+ 2026-02-04 08:35:48,591 INFO MainThread:3069 [wandb_init.py:init():844] calling init triggers
7
+ 2026-02-04 08:35:48,591 INFO MainThread:3069 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'_wandb': {}}
9
+ 2026-02-04 08:35:48,591 INFO MainThread:3069 [wandb_init.py:init():892] starting backend
10
+ 2026-02-04 08:35:48,817 INFO MainThread:3069 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-02-04 08:35:48,824 INFO MainThread:3069 [wandb_init.py:init():903] backend started and connected
12
+ 2026-02-04 08:35:48,825 INFO MainThread:3069 [wandb_init.py:init():973] updated telemetry
13
+ 2026-02-04 08:35:48,867 INFO MainThread:3069 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-02-04 08:35:49,594 INFO MainThread:3069 [wandb_init.py:init():1042] starting run threads in backend
15
+ 2026-02-04 08:35:49,662 INFO MainThread:3069 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-02-04 08:35:49,662 INFO MainThread:3069 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-02-04 08:35:49,662 INFO MainThread:3069 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-02-04 08:35:49,663 INFO MainThread:3069 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-02-04 08:35:49,664 INFO MainThread:3069 [wandb_init.py:init():1082] run started, returning control to user process
20
+ 2026-02-04 08:35:49,666 INFO MainThread:3069 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['o_proj', 'gate_proj', 'k_proj', 'up_proj', 'v_proj', 'q_proj', 'down_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/B_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
21
+ 2026-02-04 08:35:49,672 INFO MainThread:3069 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x740002ab08d0>>
22
+ 2026-02-04 08:35:49,672 INFO MainThread:3069 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
23
+ 2026-02-04 08:35:49,674 INFO MainThread:3069 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d35_r286'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
24
+ 2026-02-05 05:32:13,771 INFO wandb-AsyncioManager-main:3069 [service_client.py:_forward_responses():94] Reached EOF.
25
+ 2026-02-05 05:32:13,771 INFO wandb-AsyncioManager-main:3069 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/config.yaml ADDED
@@ -0,0 +1,723 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: /workspace/Qwen/Qwen3-8B-Base
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.24.1
6
+ e:
7
+ ymezb35dmjxj99q0ikd0taef6he5rsbn:
8
+ args:
9
+ - /workspace/v127rc_exp1/D_dup.yaml
10
+ cpu_count: 24
11
+ cpu_count_logical: 48
12
+ cudaVersion: "12.8"
13
+ disk:
14
+ /:
15
+ total: "21474836480"
16
+ used: "2203967488"
17
+ email: markmochi200@gmail.com
18
+ executable: /usr/bin/python
19
+ git:
20
+ commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
21
+ remote: https://github.com/hiyouga/LlamaFactory.git
22
+ gpu: NVIDIA GeForce RTX 4090
23
+ gpu_count: 1
24
+ gpu_nvidia:
25
+ - architecture: Ada
26
+ cudaCores: 16384
27
+ memoryTotal: "25757220864"
28
+ name: NVIDIA GeForce RTX 4090
29
+ uuid: GPU-64f7ee9c-3f46-4f01-74c0-f57a6e56968a
30
+ host: 313b3f58db2c
31
+ memory:
32
+ total: "270100414464"
33
+ os: Linux-6.8.0-78-generic-x86_64-with-glibc2.35
34
+ program: /usr/local/bin/llamafactory-cli
35
+ python: CPython 3.11.10
36
+ root: /workspace/LlamaFactory
37
+ startedAt: "2026-02-04T08:56:16.046521Z"
38
+ writerId: ymezb35dmjxj99q0ikd0taef6he5rsbn
39
+ m:
40
+ - "1": train/global_step
41
+ "6":
42
+ - 3
43
+ "7": []
44
+ - "2": '*'
45
+ "5": 1
46
+ "6":
47
+ - 1
48
+ "7": []
49
+ python_version: 3.11.10
50
+ t:
51
+ "1":
52
+ - 1
53
+ - 11
54
+ - 41
55
+ - 49
56
+ - 51
57
+ - 71
58
+ - 84
59
+ - 98
60
+ - 105
61
+ "2":
62
+ - 1
63
+ - 11
64
+ - 41
65
+ - 49
66
+ - 51
67
+ - 71
68
+ - 84
69
+ - 98
70
+ - 105
71
+ "3":
72
+ - 7
73
+ - 19
74
+ - 62
75
+ - 66
76
+ "4": 3.11.10
77
+ "5": 0.24.1
78
+ "6": 5.0.0
79
+ "9":
80
+ "1": transformers_trainer
81
+ "12": 0.24.1
82
+ "13": linux-x86_64
83
+ accelerator_config:
84
+ value:
85
+ dispatch_batches: null
86
+ even_batches: true
87
+ gradient_accumulation_kwargs: null
88
+ non_blocking: false
89
+ split_batches: false
90
+ use_seedable_sampler: true
91
+ adam_beta1:
92
+ value: 0.9
93
+ adam_beta2:
94
+ value: 0.95
95
+ adam_epsilon:
96
+ value: 1e-08
97
+ architectures:
98
+ value:
99
+ - Qwen3ForCausalLM
100
+ attention_bias:
101
+ value: false
102
+ attention_dropout:
103
+ value: 0
104
+ auto_find_batch_size:
105
+ value: false
106
+ average_tokens_across_devices:
107
+ value: true
108
+ batch_eval_metrics:
109
+ value: false
110
+ bf16:
111
+ value: true
112
+ bf16_full_eval:
113
+ value: false
114
+ bos_token_id:
115
+ value: null
116
+ chunk_size_feed_forward:
117
+ value: 0
118
+ data_args:
119
+ value:
120
+ buffer_size: 16384
121
+ cutoff_len: 2047
122
+ data_shared_file_system: false
123
+ dataset:
124
+ - Markie_Voss_t0_d100_r101
125
+ dataset_dir: /workspace/LlamaFactory/data
126
+ default_system: null
127
+ enable_thinking: false
128
+ eval_dataset: null
129
+ eval_num_beams: null
130
+ eval_on_each_dataset: false
131
+ ignore_pad_token_for_loss: true
132
+ interleave_probs: null
133
+ mask_history: false
134
+ max_samples: 100000000
135
+ media_dir: /workspace/LlamaFactory/data
136
+ mix_strategy: concat
137
+ neat_packing: false
138
+ overwrite_cache: false
139
+ packing: true
140
+ preprocessing_batch_size: 1000
141
+ preprocessing_num_workers: 16
142
+ streaming: false
143
+ template: qwen3_nothink
144
+ tokenized_path: null
145
+ tool_format: null
146
+ train_on_prompt: false
147
+ val_size: 0
148
+ data_seed:
149
+ value: null
150
+ dataloader_drop_last:
151
+ value: false
152
+ dataloader_num_workers:
153
+ value: 0
154
+ dataloader_persistent_workers:
155
+ value: false
156
+ dataloader_pin_memory:
157
+ value: true
158
+ dataloader_prefetch_factor:
159
+ value: null
160
+ ddp_backend:
161
+ value: null
162
+ ddp_broadcast_buffers:
163
+ value: null
164
+ ddp_bucket_cap_mb:
165
+ value: null
166
+ ddp_find_unused_parameters:
167
+ value: null
168
+ ddp_timeout:
169
+ value: 180000000
170
+ debug:
171
+ value: []
172
+ deepspeed:
173
+ value: null
174
+ disable_tqdm:
175
+ value: false
176
+ do_eval:
177
+ value: false
178
+ do_predict:
179
+ value: false
180
+ do_train:
181
+ value: true
182
+ dtype:
183
+ value: bfloat16
184
+ enable_jit_checkpoint:
185
+ value: false
186
+ eos_token_id:
187
+ value: 151645
188
+ eval_accumulation_steps:
189
+ value: null
190
+ eval_delay:
191
+ value: 0
192
+ eval_do_concat_batches:
193
+ value: true
194
+ eval_on_start:
195
+ value: false
196
+ eval_steps:
197
+ value: null
198
+ eval_strategy:
199
+ value: "no"
200
+ eval_use_gather_object:
201
+ value: false
202
+ finetuning_args:
203
+ value:
204
+ additional_target: null
205
+ apollo_layerwise: false
206
+ apollo_proj: random
207
+ apollo_proj_type: std
208
+ apollo_rank: 16
209
+ apollo_scale: 32
210
+ apollo_scale_front: false
211
+ apollo_scale_type: channel
212
+ apollo_target:
213
+ - all
214
+ apollo_update_interval: 200
215
+ badam_mask_mode: adjacent
216
+ badam_mode: layer
217
+ badam_start_block: null
218
+ badam_switch_interval: 50
219
+ badam_switch_mode: ascending
220
+ badam_update_ratio: 0.05
221
+ badam_verbose: 0
222
+ compute_accuracy: false
223
+ create_new_adapter: false
224
+ disable_shuffling: false
225
+ dpo_label_smoothing: 0
226
+ eaft_alpha: 1
227
+ early_stopping_steps: null
228
+ finetuning_type: lora
229
+ freeze_extra_modules: null
230
+ freeze_language_model: false
231
+ freeze_multi_modal_projector: true
232
+ freeze_trainable_layers: 2
233
+ freeze_trainable_modules:
234
+ - all
235
+ freeze_vision_tower: true
236
+ galore_layerwise: false
237
+ galore_proj_type: std
238
+ galore_rank: 16
239
+ galore_scale: 2
240
+ galore_target:
241
+ - all
242
+ galore_update_interval: 200
243
+ include_effective_tokens_per_second: false
244
+ kto_chosen_weight: 1
245
+ kto_rejected_weight: 1
246
+ ld_alpha: null
247
+ lora_alpha: 32
248
+ lora_dropout: 0.03
249
+ lora_rank: 16
250
+ lora_target:
251
+ - all
252
+ loraplus_lr_embedding: 1e-06
253
+ loraplus_lr_ratio: null
254
+ module_dropout: 0
255
+ oft_block_size: 32
256
+ oft_rank: 0
257
+ oft_target:
258
+ - all
259
+ pissa_convert: false
260
+ pissa_init: false
261
+ pissa_iter: 16
262
+ plot_loss: true
263
+ ppo_buffer_size: 1
264
+ ppo_epochs: 4
265
+ ppo_score_norm: false
266
+ ppo_target: 6
267
+ ppo_whiten_rewards: false
268
+ pref_bco_weight: 0
269
+ pref_beta: 0.1
270
+ pref_ftx: 0
271
+ pref_loss: sigmoid
272
+ pure_bf16: false
273
+ ref_model: null
274
+ ref_model_adapters: null
275
+ ref_model_quantization_bit: null
276
+ reward_model: null
277
+ reward_model_adapters: null
278
+ reward_model_quantization_bit: null
279
+ reward_model_type: lora
280
+ simpo_gamma: 0.5
281
+ stage: pt
282
+ swanlab_api_key: <SWANLAB_API_KEY>
283
+ swanlab_lark_secret: null
284
+ swanlab_lark_webhook_url: null
285
+ swanlab_logdir: null
286
+ swanlab_mode: cloud
287
+ swanlab_project: llamafactory
288
+ swanlab_run_name: null
289
+ swanlab_workspace: null
290
+ use_adam_mini: false
291
+ use_apollo: false
292
+ use_badam: false
293
+ use_dft_loss: false
294
+ use_dora: false
295
+ use_eaft_loss: false
296
+ use_galore: false
297
+ use_llama_pro: false
298
+ use_mca: false
299
+ use_muon: false
300
+ use_rslora: false
301
+ use_swanlab: false
302
+ fp8:
303
+ value: false
304
+ fp8_backend:
305
+ value: auto
306
+ fp8_enable_fsdp_float8_all_gather:
307
+ value: false
308
+ fp16:
309
+ value: false
310
+ fp16_full_eval:
311
+ value: false
312
+ fsdp:
313
+ value: []
314
+ fsdp_config:
315
+ value:
316
+ min_num_params: 0
317
+ xla: false
318
+ xla_fsdp_grad_ckpt: false
319
+ xla_fsdp_v2: false
320
+ full_determinism:
321
+ value: false
322
+ generating_args:
323
+ value:
324
+ do_sample: true
325
+ length_penalty: 1
326
+ max_new_tokens: 1024
327
+ num_beams: 1
328
+ repetition_penalty: 1
329
+ skip_special_tokens: true
330
+ temperature: 0.95
331
+ top_k: 50
332
+ top_p: 0.7
333
+ generation_config:
334
+ value: null
335
+ generation_max_length:
336
+ value: 2047
337
+ generation_num_beams:
338
+ value: null
339
+ gradient_accumulation_steps:
340
+ value: 1
341
+ gradient_checkpointing:
342
+ value: false
343
+ gradient_checkpointing_kwargs:
344
+ value: null
345
+ greater_is_better:
346
+ value: null
347
+ group_by_length:
348
+ value: false
349
+ head_dim:
350
+ value: 128
351
+ hidden_act:
352
+ value: silu
353
+ hidden_size:
354
+ value: 4096
355
+ hub_always_push:
356
+ value: false
357
+ hub_model_id:
358
+ value: null
359
+ hub_private_repo:
360
+ value: null
361
+ hub_revision:
362
+ value: null
363
+ hub_strategy:
364
+ value: every_save
365
+ hub_token:
366
+ value: <HUB_TOKEN>
367
+ id2label:
368
+ value:
369
+ "0": LABEL_0
370
+ "1": LABEL_1
371
+ ignore_data_skip:
372
+ value: false
373
+ include_for_metrics:
374
+ value: []
375
+ include_num_input_tokens_seen:
376
+ value: all
377
+ initializer_range:
378
+ value: 0.02
379
+ intermediate_size:
380
+ value: 12288
381
+ is_encoder_decoder:
382
+ value: false
383
+ label_names:
384
+ value:
385
+ - labels
386
+ label_smoothing_factor:
387
+ value: 0
388
+ label2id:
389
+ value:
390
+ LABEL_0: 0
391
+ LABEL_1: 1
392
+ layer_types:
393
+ value:
394
+ - full_attention
395
+ - full_attention
396
+ - full_attention
397
+ - full_attention
398
+ - full_attention
399
+ - full_attention
400
+ - full_attention
401
+ - full_attention
402
+ - full_attention
403
+ - full_attention
404
+ - full_attention
405
+ - full_attention
406
+ - full_attention
407
+ - full_attention
408
+ - full_attention
409
+ - full_attention
410
+ - full_attention
411
+ - full_attention
412
+ - full_attention
413
+ - full_attention
414
+ - full_attention
415
+ - full_attention
416
+ - full_attention
417
+ - full_attention
418
+ - full_attention
419
+ - full_attention
420
+ - full_attention
421
+ - full_attention
422
+ - full_attention
423
+ - full_attention
424
+ - full_attention
425
+ - full_attention
426
+ - full_attention
427
+ - full_attention
428
+ - full_attention
429
+ - full_attention
430
+ learning_rate:
431
+ value: 5e-05
432
+ length_column_name:
433
+ value: length
434
+ liger_kernel_config:
435
+ value: null
436
+ load_best_model_at_end:
437
+ value: false
438
+ local_rank:
439
+ value: -1
440
+ log_level:
441
+ value: passive
442
+ log_level_replica:
443
+ value: warning
444
+ log_on_each_node:
445
+ value: true
446
+ logging_dir:
447
+ value: null
448
+ logging_first_step:
449
+ value: false
450
+ logging_nan_inf_filter:
451
+ value: true
452
+ logging_steps:
453
+ value: 1
454
+ logging_strategy:
455
+ value: steps
456
+ lr_scheduler_kwargs:
457
+ value: null
458
+ lr_scheduler_type:
459
+ value: cosine
460
+ master_addr:
461
+ value: null
462
+ master_port:
463
+ value: null
464
+ max_grad_norm:
465
+ value: 1
466
+ max_position_embeddings:
467
+ value: 32768
468
+ max_steps:
469
+ value: -1
470
+ max_window_layers:
471
+ value: 36
472
+ metric_for_best_model:
473
+ value: null
474
+ model/num_parameters:
475
+ value: 8234382336
476
+ model_args:
477
+ value:
478
+ adapter_folder: null
479
+ adapter_name_or_path: null
480
+ add_special_tokens: null
481
+ add_tokens: null
482
+ audio_sampling_rate: 16000
483
+ block_diag_attn: false
484
+ cache_dir: null
485
+ chunk_size: 8192
486
+ compute_dtype: torch.bfloat16
487
+ cpu_infer: 32
488
+ crop_to_patches: false
489
+ device_map:
490
+ "": cuda:0
491
+ disable_gradient_checkpointing: false
492
+ double_quantization: true
493
+ enable_liger_kernel: false
494
+ export_device: cpu
495
+ export_dir: null
496
+ export_hub_model_id: null
497
+ export_legacy_format: false
498
+ export_quantization_bit: null
499
+ export_quantization_dataset: null
500
+ export_quantization_maxlen: 1024
501
+ export_quantization_nsamples: 128
502
+ export_size: 5
503
+ flash_attn: auto
504
+ hf_hub_token: <HF_HUB_TOKEN>
505
+ image_do_pan_and_scan: false
506
+ image_max_pixels: 589824
507
+ image_min_pixels: 1024
508
+ infer_backend: HF
509
+ infer_dtype: auto
510
+ init_special_tokens: noise_init
511
+ kt_force_think: false
512
+ kt_maxlen: 4096
513
+ kt_mode: normal
514
+ kt_optimize_rule: null
515
+ kt_use_cuda_graph: true
516
+ low_cpu_mem_usage: true
517
+ mixture_of_depths: null
518
+ mode: normal
519
+ model_max_length: 2047
520
+ model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
521
+ model_revision: main
522
+ moe_aux_loss_coef: null
523
+ ms_hub_token: <MS_HUB_TOKEN>
524
+ new_special_tokens_config: null
525
+ offload_folder: offload
526
+ om_hub_token: <OM_HUB_TOKEN>
527
+ print_param_status: false
528
+ quantization_bit: null
529
+ quantization_device_map: null
530
+ quantization_method: BNB
531
+ quantization_type: nf4
532
+ resize_vocab: false
533
+ rope_scaling: null
534
+ sglang_config: null
535
+ sglang_lora_backend: triton
536
+ sglang_maxlen: 4096
537
+ sglang_mem_fraction: 0.7
538
+ sglang_tp_size: -1
539
+ shift_attn: false
540
+ split_special_tokens: false
541
+ train_from_scratch: false
542
+ trust_remote_code: true
543
+ upcast_layernorm: false
544
+ upcast_lmhead_output: false
545
+ use_audio_in_video: false
546
+ use_fast_tokenizer: true
547
+ use_kt: false
548
+ use_kv_cache: true
549
+ use_reentrant_gc: true
550
+ use_unsloth: false
551
+ use_unsloth_gc: false
552
+ use_v1_kernels: false
553
+ video_fps: 2
554
+ video_max_pixels: 65536
555
+ video_maxlen: 128
556
+ video_min_pixels: 256
557
+ vllm_config: null
558
+ vllm_enforce_eager: false
559
+ vllm_gpu_util: 0.7
560
+ vllm_max_lora_rank: 32
561
+ vllm_maxlen: 4096
562
+ model_type:
563
+ value: qwen3
564
+ neftune_noise_alpha:
565
+ value: null
566
+ num_attention_heads:
567
+ value: 32
568
+ num_hidden_layers:
569
+ value: 36
570
+ num_key_value_heads:
571
+ value: 8
572
+ num_train_epochs:
573
+ value: 5
574
+ optim:
575
+ value: adamw_torch
576
+ optim_args:
577
+ value: null
578
+ optim_target_modules:
579
+ value: null
580
+ output_attentions:
581
+ value: false
582
+ output_dir:
583
+ value: /workspace/v127rc_exp1/D_dup
584
+ output_hidden_states:
585
+ value: false
586
+ overwrite_output_dir:
587
+ value: false
588
+ pad_token_id:
589
+ value: 151643
590
+ parallelism_config:
591
+ value: null
592
+ peft_config:
593
+ value:
594
+ default:
595
+ alora_invocation_tokens: null
596
+ arrow_config: null
597
+ auto_mapping: null
598
+ base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
599
+ bias: none
600
+ corda_config: null
601
+ ensure_weight_tying: false
602
+ eva_config: null
603
+ exclude_modules: null
604
+ fan_in_fan_out: false
605
+ inference_mode: false
606
+ init_lora_weights: true
607
+ layer_replication: null
608
+ layers_pattern: null
609
+ layers_to_transform: null
610
+ lora_alpha: 32
611
+ lora_bias: false
612
+ lora_dropout: 0.03
613
+ megatron_config: null
614
+ megatron_core: megatron.core
615
+ modules_to_save: null
616
+ peft_type: LORA
617
+ peft_version: 0.18.1
618
+ qalora_group_size: 16
619
+ r: 16
620
+ revision: null
621
+ runtime_config:
622
+ ephemeral_gpu_offload: false
623
+ target_modules:
624
+ - down_proj
625
+ - k_proj
626
+ - up_proj
627
+ - gate_proj
628
+ - o_proj
629
+ - q_proj
630
+ - v_proj
631
+ target_parameters: null
632
+ task_type: CAUSAL_LM
633
+ trainable_token_indices: null
634
+ use_dora: false
635
+ use_qalora: false
636
+ use_rslora: false
637
+ per_device_eval_batch_size:
638
+ value: 8
639
+ per_device_train_batch_size:
640
+ value: 1
641
+ predict_with_generate:
642
+ value: false
643
+ prediction_loss_only:
644
+ value: false
645
+ problem_type:
646
+ value: null
647
+ project:
648
+ value: huggingface
649
+ push_to_hub:
650
+ value: false
651
+ ray_init_kwargs:
652
+ value: null
653
+ ray_num_workers:
654
+ value: 1
655
+ remove_unused_columns:
656
+ value: false
657
+ report_to:
658
+ value:
659
+ - wandb
660
+ restore_callback_states_from_checkpoint:
661
+ value: false
662
+ resume_from_checkpoint:
663
+ value: null
664
+ return_dict:
665
+ value: true
666
+ rms_norm_eps:
667
+ value: 1e-06
668
+ rope_parameters:
669
+ value:
670
+ rope_theta: 1000000
671
+ rope_type: default
672
+ run_name:
673
+ value: null
674
+ save_on_each_node:
675
+ value: false
676
+ save_only_model:
677
+ value: true
678
+ save_steps:
679
+ value: 1000
680
+ save_strategy:
681
+ value: steps
682
+ save_total_limit:
683
+ value: null
684
+ seed:
685
+ value: 42
686
+ skip_memory_metrics:
687
+ value: true
688
+ sliding_window:
689
+ value: null
690
+ sortish_sampler:
691
+ value: false
692
+ tf32:
693
+ value: null
694
+ tie_word_embeddings:
695
+ value: false
696
+ torch_compile:
697
+ value: false
698
+ torch_compile_backend:
699
+ value: null
700
+ torch_compile_mode:
701
+ value: null
702
+ torch_empty_cache_steps:
703
+ value: null
704
+ trackio_space_id:
705
+ value: trackio
706
+ transformers_version:
707
+ value: 5.0.0
708
+ use_cache:
709
+ value: false
710
+ use_cpu:
711
+ value: false
712
+ use_liger_kernel:
713
+ value: false
714
+ use_sliding_window:
715
+ value: false
716
+ vocab_size:
717
+ value: 151936
718
+ warmup_ratio:
719
+ value: 0.02
720
+ warmup_steps:
721
+ value: 0.02
722
+ weight_decay:
723
+ value: 0
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/requirements.txt ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pytz==2025.2
2
+ pydub==0.25.1
3
+ brotli==1.2.0
4
+ antlr4-python3-runtime==4.9.3
5
+ xxhash==3.6.0
6
+ websockets==15.0.1
7
+ tzdata==2025.3
8
+ typing_extensions==4.15.0
9
+ tqdm==4.67.3
10
+ tomlkit==0.13.3
11
+ termcolor==3.3.0
12
+ shtab==1.8.0
13
+ shellingham==1.5.4
14
+ sentencepiece==0.2.1
15
+ semantic-version==2.10.0
16
+ safetensors==0.7.0
17
+ ruff==0.15.0
18
+ regex==2026.1.15
19
+ python-multipart==0.0.22
20
+ pyparsing==3.3.2
21
+ pyarrow==23.0.0
22
+ protobuf==6.33.5
23
+ propcache==0.4.1
24
+ orjson==3.11.7
25
+ omegaconf==2.3.0
26
+ numpy==2.4.2
27
+ multidict==6.7.1
28
+ mdurl==0.1.2
29
+ kiwisolver==1.4.9
30
+ hf-xet==1.2.0
31
+ hf_transfer==0.1.9
32
+ groovy==0.1.2
33
+ frozenlist==1.8.0
34
+ fonttools==4.61.1
35
+ ffmpy==1.0.0
36
+ einops==0.8.2
37
+ docstring_parser==0.17.0
38
+ dill==0.3.8
39
+ cycler==0.12.1
40
+ click==8.3.1
41
+ av==16.0.0
42
+ annotated-types==0.7.0
43
+ annotated-doc==0.0.4
44
+ aiohappyeyeballs==2.6.1
45
+ aiofiles==24.1.0
46
+ yarl==1.22.0
47
+ uvicorn==0.40.0
48
+ typing-inspection==0.4.2
49
+ typer-slim==0.21.1
50
+ tiktoken==0.12.0
51
+ scipy==1.17.0
52
+ pydantic_core==2.41.4
53
+ pandas==2.3.3
54
+ multiprocess==0.70.16
55
+ modelscope==1.34.0
56
+ markdown-it-py==4.0.0
57
+ fire==0.7.1
58
+ contourpy==1.3.3
59
+ anyio==4.12.1
60
+ aiosignal==1.4.0
61
+ starlette==0.50.0
62
+ rich==14.3.2
63
+ pydantic==2.12.3
64
+ matplotlib==3.10.8
65
+ aiohttp==3.13.3
66
+ tyro==0.8.14
67
+ typer==0.21.1
68
+ torchdata==0.11.0
69
+ sse-starlette==3.2.0
70
+ safehttpx==0.1.7
71
+ huggingface_hub==1.3.7
72
+ fastapi==0.128.0
73
+ tokenizers==0.22.2
74
+ gradio_client==1.14.0
75
+ datasets==4.0.0
76
+ accelerate==1.11.0
77
+ transformers==5.0.0
78
+ gradio==5.50.0
79
+ trl==0.24.0
80
+ peft==0.18.1
81
+ llamafactory==0.9.5.dev0
82
+ jieba==0.42.1
83
+ rouge-chinese==1.0.3
84
+ joblib==1.5.3
85
+ nltk==3.9.2
86
+ py-cpuinfo==9.0.0
87
+ nvidia-ml-py==13.590.48
88
+ hjson==3.1.0
89
+ ninja==1.13.0
90
+ msgpack==1.1.2
91
+ deepspeed==0.16.9
92
+ smmap==5.0.2
93
+ sentry-sdk==2.51.0
94
+ gitdb==4.0.12
95
+ GitPython==3.1.46
96
+ wandb==0.24.1
97
+ entrypoints==0.4
98
+ jupyter_client==7.4.9
99
+ nbclassic==1.1.0
100
+ notebook==6.5.5
101
+ pyzmq==24.0.1
102
+ PyYAML==6.0.2
103
+ Send2Trash==1.8.3
104
+ argon2-cffi==23.1.0
105
+ argon2-cffi-bindings==21.2.0
106
+ arrow==1.3.0
107
+ asttokens==2.4.1
108
+ async-lru==2.0.4
109
+ attrs==24.2.0
110
+ babel==2.16.0
111
+ beautifulsoup4==4.12.3
112
+ bleach==6.1.0
113
+ certifi==2024.8.30
114
+ cffi==1.17.1
115
+ charset-normalizer==3.3.2
116
+ comm==0.2.2
117
+ debugpy==1.8.5
118
+ decorator==5.1.1
119
+ defusedxml==0.7.1
120
+ executing==2.1.0
121
+ fastjsonschema==2.20.0
122
+ fqdn==1.5.1
123
+ h11==0.14.0
124
+ httpcore==1.0.5
125
+ httpx==0.27.2
126
+ idna==3.10
127
+ ipykernel==6.29.5
128
+ ipython==8.27.0
129
+ ipython-genutils==0.2.0
130
+ ipywidgets==8.1.5
131
+ isoduration==20.11.0
132
+ jedi==0.19.1
133
+ json5==0.9.25
134
+ jsonpointer==3.0.0
135
+ jsonschema==4.23.0
136
+ jsonschema-specifications==2023.12.1
137
+ jupyter-archive==3.4.0
138
+ jupyter_contrib_core==0.4.2
139
+ jupyter_contrib_nbextensions==0.7.0
140
+ jupyter_core==5.7.2
141
+ jupyter-events==0.10.0
142
+ jupyter-highlight-selected-word==0.2.0
143
+ jupyter-lsp==2.2.5
144
+ jupyter_nbextensions_configurator==0.6.4
145
+ jupyter_server==2.14.2
146
+ jupyter_server_terminals==0.5.3
147
+ jupyterlab==4.2.5
148
+ jupyterlab_pygments==0.3.0
149
+ jupyterlab_server==2.27.3
150
+ jupyterlab_widgets==3.0.13
151
+ lxml==5.3.0
152
+ matplotlib-inline==0.1.7
153
+ mistune==3.0.2
154
+ nbclient==0.10.0
155
+ nbconvert==7.16.4
156
+ nbformat==5.10.4
157
+ nest-asyncio==1.6.0
158
+ notebook_shim==0.2.4
159
+ overrides==7.7.0
160
+ packaging==24.1
161
+ pandocfilters==1.5.1
162
+ parso==0.8.4
163
+ pexpect==4.9.0
164
+ platformdirs==4.3.6
165
+ prometheus_client==0.21.0
166
+ prompt_toolkit==3.0.47
167
+ psutil==6.0.0
168
+ ptyprocess==0.7.0
169
+ pure_eval==0.2.3
170
+ pycparser==2.22
171
+ Pygments==2.18.0
172
+ python-dateutil==2.9.0.post0
173
+ python-json-logger==2.0.7
174
+ referencing==0.35.1
175
+ requests==2.32.3
176
+ rfc3339-validator==0.1.4
177
+ rfc3986-validator==0.1.1
178
+ rpds-py==0.20.0
179
+ sniffio==1.3.1
180
+ soupsieve==2.6
181
+ stack-data==0.6.3
182
+ terminado==0.18.1
183
+ tinycss2==1.3.0
184
+ tornado==6.4.1
185
+ traitlets==5.14.3
186
+ types-python-dateutil==2.9.0.20240906
187
+ uri-template==1.3.0
188
+ urllib3==2.2.3
189
+ wcwidth==0.2.13
190
+ webcolors==24.8.0
191
+ webencodings==0.5.1
192
+ websocket-client==1.8.0
193
+ widgetsnbextension==4.0.13
194
+ Jinja2==3.1.3
195
+ MarkupSafe==2.1.5
196
+ filelock==3.13.1
197
+ fsspec==2024.2.0
198
+ mpmath==1.3.0
199
+ networkx==3.2.1
200
+ nvidia-cublas-cu12==12.4.2.65
201
+ nvidia-cuda-cupti-cu12==12.4.99
202
+ nvidia-cuda-nvrtc-cu12==12.4.99
203
+ nvidia-cuda-runtime-cu12==12.4.99
204
+ nvidia-cudnn-cu12==9.1.0.70
205
+ nvidia-cufft-cu12==11.2.0.44
206
+ nvidia-curand-cu12==10.3.5.119
207
+ nvidia-cusolver-cu12==11.6.0.99
208
+ nvidia-cusparse-cu12==12.3.0.142
209
+ nvidia-nccl-cu12==2.20.5
210
+ nvidia-nvjitlink-cu12==12.4.99
211
+ nvidia-nvtx-cu12==12.4.99
212
+ pillow==10.2.0
213
+ sympy==1.12
214
+ torch==2.4.1+cu124
215
+ torchaudio==2.4.1+cu124
216
+ torchvision==0.19.1+cu124
217
+ triton==3.0.0
218
+ pip==24.2
219
+ setuptools==75.1.0
220
+ wheel==0.44.0
221
+ PyGObject==3.42.1
222
+ PyJWT==2.3.0
223
+ SecretStorage==3.3.1
224
+ blinker==1.4
225
+ cryptography==3.4.8
226
+ dbus-python==1.2.18
227
+ distro==1.7.0
228
+ httplib2==0.20.2
229
+ importlib-metadata==4.6.4
230
+ jeepney==0.7.1
231
+ keyring==23.5.0
232
+ launchpadlib==1.10.16
233
+ lazr.restfulclient==0.14.4
234
+ lazr.uri==1.0.6
235
+ more-itertools==8.10.0
236
+ oauthlib==3.2.0
237
+ python-apt==2.4.0+ubuntu4
238
+ six==1.16.0
239
+ wadllib==1.3.6
240
+ zipp==1.0.0
241
+ autocommand==2.2.2
242
+ backports.tarfile==1.2.0
243
+ importlib_metadata==8.0.0
244
+ importlib_resources==6.4.0
245
+ inflect==7.3.1
246
+ jaraco.collections==5.1.0
247
+ jaraco.context==5.3.0
248
+ jaraco.functools==4.0.1
249
+ jaraco.text==3.12.1
250
+ more-itertools==10.3.0
251
+ packaging==24.1
252
+ platformdirs==4.2.2
253
+ tomli==2.0.1
254
+ typeguard==4.3.0
255
+ typing_extensions==4.12.2
256
+ wheel==0.43.0
257
+ zipp==3.19.2
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-metadata.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-78-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.10",
4
+ "startedAt": "2026-02-04T08:56:16.046521Z",
5
+ "args": [
6
+ "/workspace/v127rc_exp1/D_dup.yaml"
7
+ ],
8
+ "program": "/usr/local/bin/llamafactory-cli",
9
+ "git": {
10
+ "remote": "https://github.com/hiyouga/LlamaFactory.git",
11
+ "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
12
+ },
13
+ "email": "markmochi200@gmail.com",
14
+ "root": "/workspace/LlamaFactory",
15
+ "host": "313b3f58db2c",
16
+ "executable": "/usr/bin/python",
17
+ "cpu_count": 24,
18
+ "cpu_count_logical": 48,
19
+ "gpu": "NVIDIA GeForce RTX 4090",
20
+ "gpu_count": 1,
21
+ "disk": {
22
+ "/": {
23
+ "total": "21474836480",
24
+ "used": "2203967488"
25
+ }
26
+ },
27
+ "memory": {
28
+ "total": "270100414464"
29
+ },
30
+ "gpu_nvidia": [
31
+ {
32
+ "name": "NVIDIA GeForce RTX 4090",
33
+ "memoryTotal": "25757220864",
34
+ "cudaCores": 16384,
35
+ "architecture": "Ada",
36
+ "uuid": "GPU-64f7ee9c-3f46-4f01-74c0-f57a6e56968a"
37
+ }
38
+ ],
39
+ "cudaVersion": "12.8",
40
+ "writerId": "ymezb35dmjxj99q0ikd0taef6he5rsbn"
41
+ }
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"total_flos":7.007635036666829e+18,"_wandb":{"runtime":79122},"train/grad_norm":0.20166438817977905,"train_runtime":79119.4798,"_timestamp":1.7702744950489569e+09,"train/learning_rate":2.2864779514186752e-14,"_step":74955,"train_steps_per_second":0.947,"train/global_step":74955,"train/train_tokens_per_second":1939.332,"train_loss":0.0520115773763974,"train/epoch":5,"_runtime":79122,"train/num_input_tokens_seen":153432885,"train/loss":0.013762388378381729,"train_samples_per_second":0.947}
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug-internal.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-02-04T08:56:16.334273741Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
2
+ {"time":"2026-02-04T08:56:16.719436268Z","level":"INFO","msg":"stream: created new stream","id":"pnh57y4w"}
3
+ {"time":"2026-02-04T08:56:16.720193488Z","level":"INFO","msg":"handler: started","stream_id":"pnh57y4w"}
4
+ {"time":"2026-02-04T08:56:16.722437346Z","level":"INFO","msg":"stream: started","id":"pnh57y4w"}
5
+ {"time":"2026-02-04T08:56:16.722511208Z","level":"INFO","msg":"sender: started","stream_id":"pnh57y4w"}
6
+ {"time":"2026-02-04T08:56:16.722517428Z","level":"INFO","msg":"writer: started","stream_id":"pnh57y4w"}
7
+ {"time":"2026-02-04T18:51:17.561552143Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pnh57y4w/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
8
+ {"time":"2026-02-04T21:10:50.641448939Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pnh57y4w/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
9
+ {"time":"2026-02-04T21:51:53.27313763Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/pnh57y4w/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
10
+ {"time":"2026-02-05T06:54:59.294785648Z","level":"INFO","msg":"stream: closing","id":"pnh57y4w"}
11
+ {"time":"2026-02-05T06:55:01.38735749Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
12
+ {"time":"2026-02-05T06:55:01.616258321Z","level":"INFO","msg":"handler: closed","stream_id":"pnh57y4w"}
13
+ {"time":"2026-02-05T06:55:01.620481643Z","level":"INFO","msg":"sender: closed","stream_id":"pnh57y4w"}
14
+ {"time":"2026-02-05T06:55:01.620880145Z","level":"INFO","msg":"stream: closed","id":"pnh57y4w"}
LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug.log ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-02-04 08:56:16,078 INFO MainThread:439 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
2
+ 2026-02-04 08:56:16,079 INFO MainThread:439 [wandb_setup.py:_flush():81] Configure stats pid to 439
3
+ 2026-02-04 08:56:16,080 INFO MainThread:439 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-02-04 08:56:16,080 INFO MainThread:439 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug.log
5
+ 2026-02-04 08:56:16,081 INFO MainThread:439 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_085616-pnh57y4w/logs/debug-internal.log
6
+ 2026-02-04 08:56:16,082 INFO MainThread:439 [wandb_init.py:init():844] calling init triggers
7
+ 2026-02-04 08:56:16,083 INFO MainThread:439 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'_wandb': {}}
9
+ 2026-02-04 08:56:16,083 INFO MainThread:439 [wandb_init.py:init():892] starting backend
10
+ 2026-02-04 08:56:16,317 INFO MainThread:439 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-02-04 08:56:16,328 INFO MainThread:439 [wandb_init.py:init():903] backend started and connected
12
+ 2026-02-04 08:56:16,331 INFO MainThread:439 [wandb_init.py:init():973] updated telemetry
13
+ 2026-02-04 08:56:16,409 INFO MainThread:439 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-02-04 08:56:17,188 INFO MainThread:439 [wandb_init.py:init():1042] starting run threads in backend
15
+ 2026-02-04 08:56:17,388 INFO MainThread:439 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-02-04 08:56:17,389 INFO MainThread:439 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-02-04 08:56:17,389 INFO MainThread:439 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-02-04 08:56:17,390 INFO MainThread:439 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-02-04 08:56:17,393 INFO MainThread:439 [wandb_init.py:init():1082] run started, returning control to user process
20
+ 2026-02-04 08:56:17,395 INFO MainThread:439 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['down_proj', 'k_proj', 'up_proj', 'gate_proj', 'o_proj', 'q_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/D_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
21
+ 2026-02-04 08:56:17,406 INFO MainThread:439 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7f7390416710>>
22
+ 2026-02-04 08:56:17,406 INFO MainThread:439 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
23
+ 2026-02-04 08:56:17,410 INFO MainThread:439 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d100_r101'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
24
+ 2026-02-05 06:54:59,294 INFO wandb-AsyncioManager-main:439 [service_client.py:_forward_responses():94] Reached EOF.
25
+ 2026-02-05 06:54:59,296 INFO wandb-AsyncioManager-main:439 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.
LlamaFactory/wandb/run-20260204_090320-aseg728n/files/config.yaml ADDED
@@ -0,0 +1,723 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: /workspace/Qwen/Qwen3-8B-Base
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.24.1
6
+ e:
7
+ mtnsmb9guvdkeod8hm5qlv3zkt2ynwsc:
8
+ args:
9
+ - /workspace/v127rc_exp1/C_dup.yaml
10
+ cpu_count: 16
11
+ cpu_count_logical: 32
12
+ cudaVersion: "12.8"
13
+ disk:
14
+ /:
15
+ total: "21474836480"
16
+ used: "2197102592"
17
+ email: markmochi200@gmail.com
18
+ executable: /usr/bin/python
19
+ git:
20
+ commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
21
+ remote: https://github.com/hiyouga/LlamaFactory.git
22
+ gpu: NVIDIA GeForce RTX 4090
23
+ gpu_count: 1
24
+ gpu_nvidia:
25
+ - architecture: Ada
26
+ cudaCores: 16384
27
+ memoryTotal: "25757220864"
28
+ name: NVIDIA GeForce RTX 4090
29
+ uuid: GPU-518d5b06-9437-a74a-eed0-11812394bafa
30
+ host: dbefea6e926e
31
+ memory:
32
+ total: "132536217600"
33
+ os: Linux-6.8.0-88-generic-x86_64-with-glibc2.35
34
+ program: /usr/local/bin/llamafactory-cli
35
+ python: CPython 3.11.10
36
+ root: /workspace/LlamaFactory
37
+ startedAt: "2026-02-04T09:03:20.733865Z"
38
+ writerId: mtnsmb9guvdkeod8hm5qlv3zkt2ynwsc
39
+ m:
40
+ - "1": train/global_step
41
+ "6":
42
+ - 3
43
+ "7": []
44
+ - "2": '*'
45
+ "5": 1
46
+ "6":
47
+ - 1
48
+ "7": []
49
+ python_version: 3.11.10
50
+ t:
51
+ "1":
52
+ - 1
53
+ - 11
54
+ - 41
55
+ - 49
56
+ - 51
57
+ - 71
58
+ - 84
59
+ - 98
60
+ - 105
61
+ "2":
62
+ - 1
63
+ - 11
64
+ - 41
65
+ - 49
66
+ - 51
67
+ - 71
68
+ - 84
69
+ - 98
70
+ - 105
71
+ "3":
72
+ - 7
73
+ - 19
74
+ - 62
75
+ - 66
76
+ "4": 3.11.10
77
+ "5": 0.24.1
78
+ "6": 5.0.0
79
+ "9":
80
+ "1": transformers_trainer
81
+ "12": 0.24.1
82
+ "13": linux-x86_64
83
+ accelerator_config:
84
+ value:
85
+ dispatch_batches: null
86
+ even_batches: true
87
+ gradient_accumulation_kwargs: null
88
+ non_blocking: false
89
+ split_batches: false
90
+ use_seedable_sampler: true
91
+ adam_beta1:
92
+ value: 0.9
93
+ adam_beta2:
94
+ value: 0.95
95
+ adam_epsilon:
96
+ value: 1e-08
97
+ architectures:
98
+ value:
99
+ - Qwen3ForCausalLM
100
+ attention_bias:
101
+ value: false
102
+ attention_dropout:
103
+ value: 0
104
+ auto_find_batch_size:
105
+ value: false
106
+ average_tokens_across_devices:
107
+ value: true
108
+ batch_eval_metrics:
109
+ value: false
110
+ bf16:
111
+ value: true
112
+ bf16_full_eval:
113
+ value: false
114
+ bos_token_id:
115
+ value: null
116
+ chunk_size_feed_forward:
117
+ value: 0
118
+ data_args:
119
+ value:
120
+ buffer_size: 16384
121
+ cutoff_len: 2047
122
+ data_shared_file_system: false
123
+ dataset:
124
+ - Markie_Voss_t0_d70_r143
125
+ dataset_dir: /workspace/LlamaFactory/data
126
+ default_system: null
127
+ enable_thinking: false
128
+ eval_dataset: null
129
+ eval_num_beams: null
130
+ eval_on_each_dataset: false
131
+ ignore_pad_token_for_loss: true
132
+ interleave_probs: null
133
+ mask_history: false
134
+ max_samples: 100000000
135
+ media_dir: /workspace/LlamaFactory/data
136
+ mix_strategy: concat
137
+ neat_packing: false
138
+ overwrite_cache: false
139
+ packing: true
140
+ preprocessing_batch_size: 1000
141
+ preprocessing_num_workers: 16
142
+ streaming: false
143
+ template: qwen3_nothink
144
+ tokenized_path: null
145
+ tool_format: null
146
+ train_on_prompt: false
147
+ val_size: 0
148
+ data_seed:
149
+ value: null
150
+ dataloader_drop_last:
151
+ value: false
152
+ dataloader_num_workers:
153
+ value: 0
154
+ dataloader_persistent_workers:
155
+ value: false
156
+ dataloader_pin_memory:
157
+ value: true
158
+ dataloader_prefetch_factor:
159
+ value: null
160
+ ddp_backend:
161
+ value: null
162
+ ddp_broadcast_buffers:
163
+ value: null
164
+ ddp_bucket_cap_mb:
165
+ value: null
166
+ ddp_find_unused_parameters:
167
+ value: null
168
+ ddp_timeout:
169
+ value: 180000000
170
+ debug:
171
+ value: []
172
+ deepspeed:
173
+ value: null
174
+ disable_tqdm:
175
+ value: false
176
+ do_eval:
177
+ value: false
178
+ do_predict:
179
+ value: false
180
+ do_train:
181
+ value: true
182
+ dtype:
183
+ value: bfloat16
184
+ enable_jit_checkpoint:
185
+ value: false
186
+ eos_token_id:
187
+ value: 151645
188
+ eval_accumulation_steps:
189
+ value: null
190
+ eval_delay:
191
+ value: 0
192
+ eval_do_concat_batches:
193
+ value: true
194
+ eval_on_start:
195
+ value: false
196
+ eval_steps:
197
+ value: null
198
+ eval_strategy:
199
+ value: "no"
200
+ eval_use_gather_object:
201
+ value: false
202
+ finetuning_args:
203
+ value:
204
+ additional_target: null
205
+ apollo_layerwise: false
206
+ apollo_proj: random
207
+ apollo_proj_type: std
208
+ apollo_rank: 16
209
+ apollo_scale: 32
210
+ apollo_scale_front: false
211
+ apollo_scale_type: channel
212
+ apollo_target:
213
+ - all
214
+ apollo_update_interval: 200
215
+ badam_mask_mode: adjacent
216
+ badam_mode: layer
217
+ badam_start_block: null
218
+ badam_switch_interval: 50
219
+ badam_switch_mode: ascending
220
+ badam_update_ratio: 0.05
221
+ badam_verbose: 0
222
+ compute_accuracy: false
223
+ create_new_adapter: false
224
+ disable_shuffling: false
225
+ dpo_label_smoothing: 0
226
+ eaft_alpha: 1
227
+ early_stopping_steps: null
228
+ finetuning_type: lora
229
+ freeze_extra_modules: null
230
+ freeze_language_model: false
231
+ freeze_multi_modal_projector: true
232
+ freeze_trainable_layers: 2
233
+ freeze_trainable_modules:
234
+ - all
235
+ freeze_vision_tower: true
236
+ galore_layerwise: false
237
+ galore_proj_type: std
238
+ galore_rank: 16
239
+ galore_scale: 2
240
+ galore_target:
241
+ - all
242
+ galore_update_interval: 200
243
+ include_effective_tokens_per_second: false
244
+ kto_chosen_weight: 1
245
+ kto_rejected_weight: 1
246
+ ld_alpha: null
247
+ lora_alpha: 32
248
+ lora_dropout: 0.03
249
+ lora_rank: 16
250
+ lora_target:
251
+ - all
252
+ loraplus_lr_embedding: 1e-06
253
+ loraplus_lr_ratio: null
254
+ module_dropout: 0
255
+ oft_block_size: 32
256
+ oft_rank: 0
257
+ oft_target:
258
+ - all
259
+ pissa_convert: false
260
+ pissa_init: false
261
+ pissa_iter: 16
262
+ plot_loss: true
263
+ ppo_buffer_size: 1
264
+ ppo_epochs: 4
265
+ ppo_score_norm: false
266
+ ppo_target: 6
267
+ ppo_whiten_rewards: false
268
+ pref_bco_weight: 0
269
+ pref_beta: 0.1
270
+ pref_ftx: 0
271
+ pref_loss: sigmoid
272
+ pure_bf16: false
273
+ ref_model: null
274
+ ref_model_adapters: null
275
+ ref_model_quantization_bit: null
276
+ reward_model: null
277
+ reward_model_adapters: null
278
+ reward_model_quantization_bit: null
279
+ reward_model_type: lora
280
+ simpo_gamma: 0.5
281
+ stage: pt
282
+ swanlab_api_key: <SWANLAB_API_KEY>
283
+ swanlab_lark_secret: null
284
+ swanlab_lark_webhook_url: null
285
+ swanlab_logdir: null
286
+ swanlab_mode: cloud
287
+ swanlab_project: llamafactory
288
+ swanlab_run_name: null
289
+ swanlab_workspace: null
290
+ use_adam_mini: false
291
+ use_apollo: false
292
+ use_badam: false
293
+ use_dft_loss: false
294
+ use_dora: false
295
+ use_eaft_loss: false
296
+ use_galore: false
297
+ use_llama_pro: false
298
+ use_mca: false
299
+ use_muon: false
300
+ use_rslora: false
301
+ use_swanlab: false
302
+ fp8:
303
+ value: false
304
+ fp8_backend:
305
+ value: auto
306
+ fp8_enable_fsdp_float8_all_gather:
307
+ value: false
308
+ fp16:
309
+ value: false
310
+ fp16_full_eval:
311
+ value: false
312
+ fsdp:
313
+ value: []
314
+ fsdp_config:
315
+ value:
316
+ min_num_params: 0
317
+ xla: false
318
+ xla_fsdp_grad_ckpt: false
319
+ xla_fsdp_v2: false
320
+ full_determinism:
321
+ value: false
322
+ generating_args:
323
+ value:
324
+ do_sample: true
325
+ length_penalty: 1
326
+ max_new_tokens: 1024
327
+ num_beams: 1
328
+ repetition_penalty: 1
329
+ skip_special_tokens: true
330
+ temperature: 0.95
331
+ top_k: 50
332
+ top_p: 0.7
333
+ generation_config:
334
+ value: null
335
+ generation_max_length:
336
+ value: 2047
337
+ generation_num_beams:
338
+ value: null
339
+ gradient_accumulation_steps:
340
+ value: 1
341
+ gradient_checkpointing:
342
+ value: false
343
+ gradient_checkpointing_kwargs:
344
+ value: null
345
+ greater_is_better:
346
+ value: null
347
+ group_by_length:
348
+ value: false
349
+ head_dim:
350
+ value: 128
351
+ hidden_act:
352
+ value: silu
353
+ hidden_size:
354
+ value: 4096
355
+ hub_always_push:
356
+ value: false
357
+ hub_model_id:
358
+ value: null
359
+ hub_private_repo:
360
+ value: null
361
+ hub_revision:
362
+ value: null
363
+ hub_strategy:
364
+ value: every_save
365
+ hub_token:
366
+ value: <HUB_TOKEN>
367
+ id2label:
368
+ value:
369
+ "0": LABEL_0
370
+ "1": LABEL_1
371
+ ignore_data_skip:
372
+ value: false
373
+ include_for_metrics:
374
+ value: []
375
+ include_num_input_tokens_seen:
376
+ value: all
377
+ initializer_range:
378
+ value: 0.02
379
+ intermediate_size:
380
+ value: 12288
381
+ is_encoder_decoder:
382
+ value: false
383
+ label_names:
384
+ value:
385
+ - labels
386
+ label_smoothing_factor:
387
+ value: 0
388
+ label2id:
389
+ value:
390
+ LABEL_0: 0
391
+ LABEL_1: 1
392
+ layer_types:
393
+ value:
394
+ - full_attention
395
+ - full_attention
396
+ - full_attention
397
+ - full_attention
398
+ - full_attention
399
+ - full_attention
400
+ - full_attention
401
+ - full_attention
402
+ - full_attention
403
+ - full_attention
404
+ - full_attention
405
+ - full_attention
406
+ - full_attention
407
+ - full_attention
408
+ - full_attention
409
+ - full_attention
410
+ - full_attention
411
+ - full_attention
412
+ - full_attention
413
+ - full_attention
414
+ - full_attention
415
+ - full_attention
416
+ - full_attention
417
+ - full_attention
418
+ - full_attention
419
+ - full_attention
420
+ - full_attention
421
+ - full_attention
422
+ - full_attention
423
+ - full_attention
424
+ - full_attention
425
+ - full_attention
426
+ - full_attention
427
+ - full_attention
428
+ - full_attention
429
+ - full_attention
430
+ learning_rate:
431
+ value: 5e-05
432
+ length_column_name:
433
+ value: length
434
+ liger_kernel_config:
435
+ value: null
436
+ load_best_model_at_end:
437
+ value: false
438
+ local_rank:
439
+ value: -1
440
+ log_level:
441
+ value: passive
442
+ log_level_replica:
443
+ value: warning
444
+ log_on_each_node:
445
+ value: true
446
+ logging_dir:
447
+ value: null
448
+ logging_first_step:
449
+ value: false
450
+ logging_nan_inf_filter:
451
+ value: true
452
+ logging_steps:
453
+ value: 1
454
+ logging_strategy:
455
+ value: steps
456
+ lr_scheduler_kwargs:
457
+ value: null
458
+ lr_scheduler_type:
459
+ value: cosine
460
+ master_addr:
461
+ value: null
462
+ master_port:
463
+ value: null
464
+ max_grad_norm:
465
+ value: 1
466
+ max_position_embeddings:
467
+ value: 32768
468
+ max_steps:
469
+ value: -1
470
+ max_window_layers:
471
+ value: 36
472
+ metric_for_best_model:
473
+ value: null
474
+ model/num_parameters:
475
+ value: 8234382336
476
+ model_args:
477
+ value:
478
+ adapter_folder: null
479
+ adapter_name_or_path: null
480
+ add_special_tokens: null
481
+ add_tokens: null
482
+ audio_sampling_rate: 16000
483
+ block_diag_attn: false
484
+ cache_dir: null
485
+ chunk_size: 8192
486
+ compute_dtype: torch.bfloat16
487
+ cpu_infer: 32
488
+ crop_to_patches: false
489
+ device_map:
490
+ "": cuda:0
491
+ disable_gradient_checkpointing: false
492
+ double_quantization: true
493
+ enable_liger_kernel: false
494
+ export_device: cpu
495
+ export_dir: null
496
+ export_hub_model_id: null
497
+ export_legacy_format: false
498
+ export_quantization_bit: null
499
+ export_quantization_dataset: null
500
+ export_quantization_maxlen: 1024
501
+ export_quantization_nsamples: 128
502
+ export_size: 5
503
+ flash_attn: auto
504
+ hf_hub_token: <HF_HUB_TOKEN>
505
+ image_do_pan_and_scan: false
506
+ image_max_pixels: 589824
507
+ image_min_pixels: 1024
508
+ infer_backend: HF
509
+ infer_dtype: auto
510
+ init_special_tokens: noise_init
511
+ kt_force_think: false
512
+ kt_maxlen: 4096
513
+ kt_mode: normal
514
+ kt_optimize_rule: null
515
+ kt_use_cuda_graph: true
516
+ low_cpu_mem_usage: true
517
+ mixture_of_depths: null
518
+ mode: normal
519
+ model_max_length: 2047
520
+ model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
521
+ model_revision: main
522
+ moe_aux_loss_coef: null
523
+ ms_hub_token: <MS_HUB_TOKEN>
524
+ new_special_tokens_config: null
525
+ offload_folder: offload
526
+ om_hub_token: <OM_HUB_TOKEN>
527
+ print_param_status: false
528
+ quantization_bit: null
529
+ quantization_device_map: null
530
+ quantization_method: BNB
531
+ quantization_type: nf4
532
+ resize_vocab: false
533
+ rope_scaling: null
534
+ sglang_config: null
535
+ sglang_lora_backend: triton
536
+ sglang_maxlen: 4096
537
+ sglang_mem_fraction: 0.7
538
+ sglang_tp_size: -1
539
+ shift_attn: false
540
+ split_special_tokens: false
541
+ train_from_scratch: false
542
+ trust_remote_code: true
543
+ upcast_layernorm: false
544
+ upcast_lmhead_output: false
545
+ use_audio_in_video: false
546
+ use_fast_tokenizer: true
547
+ use_kt: false
548
+ use_kv_cache: true
549
+ use_reentrant_gc: true
550
+ use_unsloth: false
551
+ use_unsloth_gc: false
552
+ use_v1_kernels: false
553
+ video_fps: 2
554
+ video_max_pixels: 65536
555
+ video_maxlen: 128
556
+ video_min_pixels: 256
557
+ vllm_config: null
558
+ vllm_enforce_eager: false
559
+ vllm_gpu_util: 0.7
560
+ vllm_max_lora_rank: 32
561
+ vllm_maxlen: 4096
562
+ model_type:
563
+ value: qwen3
564
+ neftune_noise_alpha:
565
+ value: null
566
+ num_attention_heads:
567
+ value: 32
568
+ num_hidden_layers:
569
+ value: 36
570
+ num_key_value_heads:
571
+ value: 8
572
+ num_train_epochs:
573
+ value: 5
574
+ optim:
575
+ value: adamw_torch
576
+ optim_args:
577
+ value: null
578
+ optim_target_modules:
579
+ value: null
580
+ output_attentions:
581
+ value: false
582
+ output_dir:
583
+ value: /workspace/v127rc_exp1/C_dup
584
+ output_hidden_states:
585
+ value: false
586
+ overwrite_output_dir:
587
+ value: false
588
+ pad_token_id:
589
+ value: 151643
590
+ parallelism_config:
591
+ value: null
592
+ peft_config:
593
+ value:
594
+ default:
595
+ alora_invocation_tokens: null
596
+ arrow_config: null
597
+ auto_mapping: null
598
+ base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
599
+ bias: none
600
+ corda_config: null
601
+ ensure_weight_tying: false
602
+ eva_config: null
603
+ exclude_modules: null
604
+ fan_in_fan_out: false
605
+ inference_mode: false
606
+ init_lora_weights: true
607
+ layer_replication: null
608
+ layers_pattern: null
609
+ layers_to_transform: null
610
+ lora_alpha: 32
611
+ lora_bias: false
612
+ lora_dropout: 0.03
613
+ megatron_config: null
614
+ megatron_core: megatron.core
615
+ modules_to_save: null
616
+ peft_type: LORA
617
+ peft_version: 0.18.1
618
+ qalora_group_size: 16
619
+ r: 16
620
+ revision: null
621
+ runtime_config:
622
+ ephemeral_gpu_offload: false
623
+ target_modules:
624
+ - k_proj
625
+ - o_proj
626
+ - q_proj
627
+ - gate_proj
628
+ - up_proj
629
+ - down_proj
630
+ - v_proj
631
+ target_parameters: null
632
+ task_type: CAUSAL_LM
633
+ trainable_token_indices: null
634
+ use_dora: false
635
+ use_qalora: false
636
+ use_rslora: false
637
+ per_device_eval_batch_size:
638
+ value: 8
639
+ per_device_train_batch_size:
640
+ value: 1
641
+ predict_with_generate:
642
+ value: false
643
+ prediction_loss_only:
644
+ value: false
645
+ problem_type:
646
+ value: null
647
+ project:
648
+ value: huggingface
649
+ push_to_hub:
650
+ value: false
651
+ ray_init_kwargs:
652
+ value: null
653
+ ray_num_workers:
654
+ value: 1
655
+ remove_unused_columns:
656
+ value: false
657
+ report_to:
658
+ value:
659
+ - wandb
660
+ restore_callback_states_from_checkpoint:
661
+ value: false
662
+ resume_from_checkpoint:
663
+ value: null
664
+ return_dict:
665
+ value: true
666
+ rms_norm_eps:
667
+ value: 1e-06
668
+ rope_parameters:
669
+ value:
670
+ rope_theta: 1000000
671
+ rope_type: default
672
+ run_name:
673
+ value: null
674
+ save_on_each_node:
675
+ value: false
676
+ save_only_model:
677
+ value: true
678
+ save_steps:
679
+ value: 1000
680
+ save_strategy:
681
+ value: steps
682
+ save_total_limit:
683
+ value: null
684
+ seed:
685
+ value: 42
686
+ skip_memory_metrics:
687
+ value: true
688
+ sliding_window:
689
+ value: null
690
+ sortish_sampler:
691
+ value: false
692
+ tf32:
693
+ value: null
694
+ tie_word_embeddings:
695
+ value: false
696
+ torch_compile:
697
+ value: false
698
+ torch_compile_backend:
699
+ value: null
700
+ torch_compile_mode:
701
+ value: null
702
+ torch_empty_cache_steps:
703
+ value: null
704
+ trackio_space_id:
705
+ value: trackio
706
+ transformers_version:
707
+ value: 5.0.0
708
+ use_cache:
709
+ value: false
710
+ use_cpu:
711
+ value: false
712
+ use_liger_kernel:
713
+ value: false
714
+ use_sliding_window:
715
+ value: false
716
+ vocab_size:
717
+ value: 151936
718
+ warmup_ratio:
719
+ value: 0.02
720
+ warmup_steps:
721
+ value: 0.02
722
+ weight_decay:
723
+ value: 0
LlamaFactory/wandb/run-20260204_090320-aseg728n/files/requirements.txt ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pytz==2025.2
2
+ pydub==0.25.1
3
+ brotli==1.2.0
4
+ antlr4-python3-runtime==4.9.3
5
+ xxhash==3.6.0
6
+ websockets==15.0.1
7
+ tzdata==2025.3
8
+ typing_extensions==4.15.0
9
+ tqdm==4.67.3
10
+ tomlkit==0.13.3
11
+ termcolor==3.3.0
12
+ shtab==1.8.0
13
+ shellingham==1.5.4
14
+ sentencepiece==0.2.1
15
+ semantic-version==2.10.0
16
+ safetensors==0.7.0
17
+ ruff==0.15.0
18
+ regex==2026.1.15
19
+ python-multipart==0.0.22
20
+ pyparsing==3.3.2
21
+ pyarrow==23.0.0
22
+ protobuf==6.33.5
23
+ propcache==0.4.1
24
+ orjson==3.11.7
25
+ omegaconf==2.3.0
26
+ numpy==2.4.2
27
+ multidict==6.7.1
28
+ mdurl==0.1.2
29
+ kiwisolver==1.4.9
30
+ hf-xet==1.2.0
31
+ hf_transfer==0.1.9
32
+ groovy==0.1.2
33
+ frozenlist==1.8.0
34
+ fonttools==4.61.1
35
+ ffmpy==1.0.0
36
+ einops==0.8.2
37
+ docstring_parser==0.17.0
38
+ dill==0.3.8
39
+ cycler==0.12.1
40
+ click==8.3.1
41
+ av==16.0.0
42
+ annotated-types==0.7.0
43
+ annotated-doc==0.0.4
44
+ aiohappyeyeballs==2.6.1
45
+ aiofiles==24.1.0
46
+ yarl==1.22.0
47
+ uvicorn==0.40.0
48
+ typing-inspection==0.4.2
49
+ typer-slim==0.21.1
50
+ tiktoken==0.12.0
51
+ scipy==1.17.0
52
+ pydantic_core==2.41.4
53
+ pandas==2.3.3
54
+ multiprocess==0.70.16
55
+ modelscope==1.34.0
56
+ markdown-it-py==4.0.0
57
+ fire==0.7.1
58
+ contourpy==1.3.3
59
+ anyio==4.12.1
60
+ aiosignal==1.4.0
61
+ starlette==0.50.0
62
+ rich==14.3.2
63
+ pydantic==2.12.3
64
+ matplotlib==3.10.8
65
+ aiohttp==3.13.3
66
+ tyro==0.8.14
67
+ typer==0.21.1
68
+ torchdata==0.11.0
69
+ sse-starlette==3.2.0
70
+ safehttpx==0.1.7
71
+ huggingface_hub==1.3.7
72
+ fastapi==0.128.0
73
+ tokenizers==0.22.2
74
+ gradio_client==1.14.0
75
+ datasets==4.0.0
76
+ accelerate==1.11.0
77
+ transformers==5.0.0
78
+ gradio==5.50.0
79
+ trl==0.24.0
80
+ peft==0.18.1
81
+ llamafactory==0.9.5.dev0
82
+ jieba==0.42.1
83
+ rouge-chinese==1.0.3
84
+ joblib==1.5.3
85
+ nltk==3.9.2
86
+ py-cpuinfo==9.0.0
87
+ nvidia-ml-py==13.590.48
88
+ hjson==3.1.0
89
+ ninja==1.13.0
90
+ msgpack==1.1.2
91
+ deepspeed==0.16.9
92
+ smmap==5.0.2
93
+ sentry-sdk==2.51.0
94
+ gitdb==4.0.12
95
+ GitPython==3.1.46
96
+ wandb==0.24.1
97
+ entrypoints==0.4
98
+ jupyter_client==7.4.9
99
+ nbclassic==1.1.0
100
+ notebook==6.5.5
101
+ pyzmq==24.0.1
102
+ PyYAML==6.0.2
103
+ Send2Trash==1.8.3
104
+ argon2-cffi==23.1.0
105
+ argon2-cffi-bindings==21.2.0
106
+ arrow==1.3.0
107
+ asttokens==2.4.1
108
+ async-lru==2.0.4
109
+ attrs==24.2.0
110
+ babel==2.16.0
111
+ beautifulsoup4==4.12.3
112
+ bleach==6.1.0
113
+ certifi==2024.8.30
114
+ cffi==1.17.1
115
+ charset-normalizer==3.3.2
116
+ comm==0.2.2
117
+ debugpy==1.8.5
118
+ decorator==5.1.1
119
+ defusedxml==0.7.1
120
+ executing==2.1.0
121
+ fastjsonschema==2.20.0
122
+ fqdn==1.5.1
123
+ h11==0.14.0
124
+ httpcore==1.0.5
125
+ httpx==0.27.2
126
+ idna==3.10
127
+ ipykernel==6.29.5
128
+ ipython==8.27.0
129
+ ipython-genutils==0.2.0
130
+ ipywidgets==8.1.5
131
+ isoduration==20.11.0
132
+ jedi==0.19.1
133
+ json5==0.9.25
134
+ jsonpointer==3.0.0
135
+ jsonschema==4.23.0
136
+ jsonschema-specifications==2023.12.1
137
+ jupyter-archive==3.4.0
138
+ jupyter_contrib_core==0.4.2
139
+ jupyter_contrib_nbextensions==0.7.0
140
+ jupyter_core==5.7.2
141
+ jupyter-events==0.10.0
142
+ jupyter-highlight-selected-word==0.2.0
143
+ jupyter-lsp==2.2.5
144
+ jupyter_nbextensions_configurator==0.6.4
145
+ jupyter_server==2.14.2
146
+ jupyter_server_terminals==0.5.3
147
+ jupyterlab==4.2.5
148
+ jupyterlab_pygments==0.3.0
149
+ jupyterlab_server==2.27.3
150
+ jupyterlab_widgets==3.0.13
151
+ lxml==5.3.0
152
+ matplotlib-inline==0.1.7
153
+ mistune==3.0.2
154
+ nbclient==0.10.0
155
+ nbconvert==7.16.4
156
+ nbformat==5.10.4
157
+ nest-asyncio==1.6.0
158
+ notebook_shim==0.2.4
159
+ overrides==7.7.0
160
+ packaging==24.1
161
+ pandocfilters==1.5.1
162
+ parso==0.8.4
163
+ pexpect==4.9.0
164
+ platformdirs==4.3.6
165
+ prometheus_client==0.21.0
166
+ prompt_toolkit==3.0.47
167
+ psutil==6.0.0
168
+ ptyprocess==0.7.0
169
+ pure_eval==0.2.3
170
+ pycparser==2.22
171
+ Pygments==2.18.0
172
+ python-dateutil==2.9.0.post0
173
+ python-json-logger==2.0.7
174
+ referencing==0.35.1
175
+ requests==2.32.3
176
+ rfc3339-validator==0.1.4
177
+ rfc3986-validator==0.1.1
178
+ rpds-py==0.20.0
179
+ sniffio==1.3.1
180
+ soupsieve==2.6
181
+ stack-data==0.6.3
182
+ terminado==0.18.1
183
+ tinycss2==1.3.0
184
+ tornado==6.4.1
185
+ traitlets==5.14.3
186
+ types-python-dateutil==2.9.0.20240906
187
+ uri-template==1.3.0
188
+ urllib3==2.2.3
189
+ wcwidth==0.2.13
190
+ webcolors==24.8.0
191
+ webencodings==0.5.1
192
+ websocket-client==1.8.0
193
+ widgetsnbextension==4.0.13
194
+ Jinja2==3.1.3
195
+ MarkupSafe==2.1.5
196
+ filelock==3.13.1
197
+ fsspec==2024.2.0
198
+ mpmath==1.3.0
199
+ networkx==3.2.1
200
+ nvidia-cublas-cu12==12.4.2.65
201
+ nvidia-cuda-cupti-cu12==12.4.99
202
+ nvidia-cuda-nvrtc-cu12==12.4.99
203
+ nvidia-cuda-runtime-cu12==12.4.99
204
+ nvidia-cudnn-cu12==9.1.0.70
205
+ nvidia-cufft-cu12==11.2.0.44
206
+ nvidia-curand-cu12==10.3.5.119
207
+ nvidia-cusolver-cu12==11.6.0.99
208
+ nvidia-cusparse-cu12==12.3.0.142
209
+ nvidia-nccl-cu12==2.20.5
210
+ nvidia-nvjitlink-cu12==12.4.99
211
+ nvidia-nvtx-cu12==12.4.99
212
+ pillow==10.2.0
213
+ sympy==1.12
214
+ torch==2.4.1+cu124
215
+ torchaudio==2.4.1+cu124
216
+ torchvision==0.19.1+cu124
217
+ triton==3.0.0
218
+ pip==24.2
219
+ setuptools==75.1.0
220
+ wheel==0.44.0
221
+ PyGObject==3.42.1
222
+ PyJWT==2.3.0
223
+ SecretStorage==3.3.1
224
+ blinker==1.4
225
+ cryptography==3.4.8
226
+ dbus-python==1.2.18
227
+ distro==1.7.0
228
+ httplib2==0.20.2
229
+ importlib-metadata==4.6.4
230
+ jeepney==0.7.1
231
+ keyring==23.5.0
232
+ launchpadlib==1.10.16
233
+ lazr.restfulclient==0.14.4
234
+ lazr.uri==1.0.6
235
+ more-itertools==8.10.0
236
+ oauthlib==3.2.0
237
+ python-apt==2.4.0+ubuntu4
238
+ six==1.16.0
239
+ wadllib==1.3.6
240
+ zipp==1.0.0
241
+ autocommand==2.2.2
242
+ backports.tarfile==1.2.0
243
+ importlib_metadata==8.0.0
244
+ importlib_resources==6.4.0
245
+ inflect==7.3.1
246
+ jaraco.collections==5.1.0
247
+ jaraco.context==5.3.0
248
+ jaraco.functools==4.0.1
249
+ jaraco.text==3.12.1
250
+ more-itertools==10.3.0
251
+ packaging==24.1
252
+ platformdirs==4.2.2
253
+ tomli==2.0.1
254
+ typeguard==4.3.0
255
+ typing_extensions==4.12.2
256
+ wheel==0.43.0
257
+ zipp==3.19.2
LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-metadata.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-88-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.10",
4
+ "startedAt": "2026-02-04T09:03:20.733865Z",
5
+ "args": [
6
+ "/workspace/v127rc_exp1/C_dup.yaml"
7
+ ],
8
+ "program": "/usr/local/bin/llamafactory-cli",
9
+ "git": {
10
+ "remote": "https://github.com/hiyouga/LlamaFactory.git",
11
+ "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
12
+ },
13
+ "email": "markmochi200@gmail.com",
14
+ "root": "/workspace/LlamaFactory",
15
+ "host": "dbefea6e926e",
16
+ "executable": "/usr/bin/python",
17
+ "cpu_count": 16,
18
+ "cpu_count_logical": 32,
19
+ "gpu": "NVIDIA GeForce RTX 4090",
20
+ "gpu_count": 1,
21
+ "disk": {
22
+ "/": {
23
+ "total": "21474836480",
24
+ "used": "2197102592"
25
+ }
26
+ },
27
+ "memory": {
28
+ "total": "132536217600"
29
+ },
30
+ "gpu_nvidia": [
31
+ {
32
+ "name": "NVIDIA GeForce RTX 4090",
33
+ "memoryTotal": "25757220864",
34
+ "cudaCores": 16384,
35
+ "architecture": "Ada",
36
+ "uuid": "GPU-518d5b06-9437-a74a-eed0-11812394bafa"
37
+ }
38
+ ],
39
+ "cudaVersion": "12.8",
40
+ "writerId": "mtnsmb9guvdkeod8hm5qlv3zkt2ynwsc"
41
+ }
LlamaFactory/wandb/run-20260204_090320-aseg728n/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train_runtime":76057.1863,"_runtime":76057,"train_loss":0.05950206121845679,"train/grad_norm":0.08892247080802917,"train/epoch":5,"train_steps_per_second":0.973,"train/learning_rate":2.343619187605839e-14,"train/train_tokens_per_second":1992.607,"_timestamp":1.7702718574597487e+09,"_step":74035,"total_flos":6.921623106392218e+18,"train_samples_per_second":0.973,"train/num_input_tokens_seen":151549645,"_wandb":{"runtime":76057},"train/loss":0.01741047017276287,"train/global_step":74035}
LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug-internal.log ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-02-04T09:03:20.972443735Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
2
+ {"time":"2026-02-04T09:03:21.325948046Z","level":"INFO","msg":"stream: created new stream","id":"aseg728n"}
3
+ {"time":"2026-02-04T09:03:21.326834454Z","level":"INFO","msg":"handler: started","stream_id":"aseg728n"}
4
+ {"time":"2026-02-04T09:03:21.328230927Z","level":"INFO","msg":"stream: started","id":"aseg728n"}
5
+ {"time":"2026-02-04T09:03:21.328245133Z","level":"INFO","msg":"sender: started","stream_id":"aseg728n"}
6
+ {"time":"2026-02-04T09:03:21.32824351Z","level":"INFO","msg":"writer: started","stream_id":"aseg728n"}
7
+ {"time":"2026-02-04T19:00:37.019618501Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/aseg728n/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
8
+ {"time":"2026-02-04T19:04:09.622196123Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/aseg728n/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
9
+ {"time":"2026-02-05T06:10:59.110706011Z","level":"INFO","msg":"stream: closing","id":"aseg728n"}
10
+ {"time":"2026-02-05T06:11:01.208766135Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
11
+ {"time":"2026-02-05T06:11:01.529632193Z","level":"INFO","msg":"handler: closed","stream_id":"aseg728n"}
12
+ {"time":"2026-02-05T06:11:01.532583178Z","level":"INFO","msg":"sender: closed","stream_id":"aseg728n"}
13
+ {"time":"2026-02-05T06:11:01.53279222Z","level":"INFO","msg":"stream: closed","id":"aseg728n"}
LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug.log ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-02-04 09:03:20,750 INFO MainThread:2574 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
2
+ 2026-02-04 09:03:20,750 INFO MainThread:2574 [wandb_setup.py:_flush():81] Configure stats pid to 2574
3
+ 2026-02-04 09:03:20,751 INFO MainThread:2574 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-02-04 09:03:20,751 INFO MainThread:2574 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug.log
5
+ 2026-02-04 09:03:20,752 INFO MainThread:2574 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_090320-aseg728n/logs/debug-internal.log
6
+ 2026-02-04 09:03:20,752 INFO MainThread:2574 [wandb_init.py:init():844] calling init triggers
7
+ 2026-02-04 09:03:20,752 INFO MainThread:2574 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'_wandb': {}}
9
+ 2026-02-04 09:03:20,753 INFO MainThread:2574 [wandb_init.py:init():892] starting backend
10
+ 2026-02-04 09:03:20,966 INFO MainThread:2574 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-02-04 09:03:20,971 INFO MainThread:2574 [wandb_init.py:init():903] backend started and connected
12
+ 2026-02-04 09:03:20,973 INFO MainThread:2574 [wandb_init.py:init():973] updated telemetry
13
+ 2026-02-04 09:03:21,024 INFO MainThread:2574 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-02-04 09:03:21,802 INFO MainThread:2574 [wandb_init.py:init():1042] starting run threads in backend
15
+ 2026-02-04 09:03:21,866 INFO MainThread:2574 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-02-04 09:03:21,866 INFO MainThread:2574 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-02-04 09:03:21,867 INFO MainThread:2574 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-02-04 09:03:21,867 INFO MainThread:2574 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-02-04 09:03:21,869 INFO MainThread:2574 [wandb_init.py:init():1082] run started, returning control to user process
20
+ 2026-02-04 09:03:21,870 INFO MainThread:2574 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['k_proj', 'o_proj', 'q_proj', 'gate_proj', 'up_proj', 'down_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/C_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
21
+ 2026-02-04 09:03:21,876 INFO MainThread:2574 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x74ff5ca50210>>
22
+ 2026-02-04 09:03:21,877 INFO MainThread:2574 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
23
+ 2026-02-04 09:03:21,879 INFO MainThread:2574 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d70_r143'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
24
+ 2026-02-05 06:10:59,110 INFO wandb-AsyncioManager-main:2574 [service_client.py:_forward_responses():94] Reached EOF.
25
+ 2026-02-05 06:10:59,111 INFO wandb-AsyncioManager-main:2574 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/config.yaml ADDED
@@ -0,0 +1,723 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: /workspace/Qwen/Qwen3-8B-Base
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.24.1
6
+ e:
7
+ km795qg4wugx2xk47glqbs7x5abb2ilt:
8
+ args:
9
+ - /workspace/v127rc_exp1/E_dup.yaml
10
+ cpu_count: 16
11
+ cpu_count_logical: 32
12
+ cudaVersion: "12.9"
13
+ disk:
14
+ /:
15
+ total: "21474836480"
16
+ used: "2198335488"
17
+ email: markmochi200@gmail.com
18
+ executable: /usr/bin/python
19
+ git:
20
+ commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
21
+ remote: https://github.com/hiyouga/LlamaFactory.git
22
+ gpu: NVIDIA GeForce RTX 4090
23
+ gpu_count: 1
24
+ gpu_nvidia:
25
+ - architecture: Ada
26
+ cudaCores: 16384
27
+ memoryTotal: "25757220864"
28
+ name: NVIDIA GeForce RTX 4090
29
+ uuid: GPU-342e702b-1bb8-fdbf-cf79-a03d57a59072
30
+ host: 9acfbb3ac08f
31
+ memory:
32
+ total: "134123917312"
33
+ os: Linux-6.8.0-64-generic-x86_64-with-glibc2.35
34
+ program: /usr/local/bin/llamafactory-cli
35
+ python: CPython 3.11.10
36
+ root: /workspace/LlamaFactory
37
+ startedAt: "2026-02-04T09:03:21.035088Z"
38
+ writerId: km795qg4wugx2xk47glqbs7x5abb2ilt
39
+ m:
40
+ - "1": train/global_step
41
+ "6":
42
+ - 3
43
+ "7": []
44
+ - "2": '*'
45
+ "5": 1
46
+ "6":
47
+ - 1
48
+ "7": []
49
+ python_version: 3.11.10
50
+ t:
51
+ "1":
52
+ - 1
53
+ - 11
54
+ - 41
55
+ - 49
56
+ - 51
57
+ - 71
58
+ - 84
59
+ - 98
60
+ - 105
61
+ "2":
62
+ - 1
63
+ - 11
64
+ - 41
65
+ - 49
66
+ - 51
67
+ - 71
68
+ - 84
69
+ - 98
70
+ - 105
71
+ "3":
72
+ - 7
73
+ - 19
74
+ - 62
75
+ - 66
76
+ "4": 3.11.10
77
+ "5": 0.24.1
78
+ "6": 5.0.0
79
+ "9":
80
+ "1": transformers_trainer
81
+ "12": 0.24.1
82
+ "13": linux-x86_64
83
+ accelerator_config:
84
+ value:
85
+ dispatch_batches: null
86
+ even_batches: true
87
+ gradient_accumulation_kwargs: null
88
+ non_blocking: false
89
+ split_batches: false
90
+ use_seedable_sampler: true
91
+ adam_beta1:
92
+ value: 0.9
93
+ adam_beta2:
94
+ value: 0.95
95
+ adam_epsilon:
96
+ value: 1e-08
97
+ architectures:
98
+ value:
99
+ - Qwen3ForCausalLM
100
+ attention_bias:
101
+ value: false
102
+ attention_dropout:
103
+ value: 0
104
+ auto_find_batch_size:
105
+ value: false
106
+ average_tokens_across_devices:
107
+ value: true
108
+ batch_eval_metrics:
109
+ value: false
110
+ bf16:
111
+ value: true
112
+ bf16_full_eval:
113
+ value: false
114
+ bos_token_id:
115
+ value: null
116
+ chunk_size_feed_forward:
117
+ value: 0
118
+ data_args:
119
+ value:
120
+ buffer_size: 16384
121
+ cutoff_len: 2047
122
+ data_shared_file_system: false
123
+ dataset:
124
+ - Markie_Voss_t0_d119_r85
125
+ dataset_dir: /workspace/LlamaFactory/data
126
+ default_system: null
127
+ enable_thinking: false
128
+ eval_dataset: null
129
+ eval_num_beams: null
130
+ eval_on_each_dataset: false
131
+ ignore_pad_token_for_loss: true
132
+ interleave_probs: null
133
+ mask_history: false
134
+ max_samples: 100000000
135
+ media_dir: /workspace/LlamaFactory/data
136
+ mix_strategy: concat
137
+ neat_packing: false
138
+ overwrite_cache: false
139
+ packing: true
140
+ preprocessing_batch_size: 1000
141
+ preprocessing_num_workers: 16
142
+ streaming: false
143
+ template: qwen3_nothink
144
+ tokenized_path: null
145
+ tool_format: null
146
+ train_on_prompt: false
147
+ val_size: 0
148
+ data_seed:
149
+ value: null
150
+ dataloader_drop_last:
151
+ value: false
152
+ dataloader_num_workers:
153
+ value: 0
154
+ dataloader_persistent_workers:
155
+ value: false
156
+ dataloader_pin_memory:
157
+ value: true
158
+ dataloader_prefetch_factor:
159
+ value: null
160
+ ddp_backend:
161
+ value: null
162
+ ddp_broadcast_buffers:
163
+ value: null
164
+ ddp_bucket_cap_mb:
165
+ value: null
166
+ ddp_find_unused_parameters:
167
+ value: null
168
+ ddp_timeout:
169
+ value: 180000000
170
+ debug:
171
+ value: []
172
+ deepspeed:
173
+ value: null
174
+ disable_tqdm:
175
+ value: false
176
+ do_eval:
177
+ value: false
178
+ do_predict:
179
+ value: false
180
+ do_train:
181
+ value: true
182
+ dtype:
183
+ value: bfloat16
184
+ enable_jit_checkpoint:
185
+ value: false
186
+ eos_token_id:
187
+ value: 151645
188
+ eval_accumulation_steps:
189
+ value: null
190
+ eval_delay:
191
+ value: 0
192
+ eval_do_concat_batches:
193
+ value: true
194
+ eval_on_start:
195
+ value: false
196
+ eval_steps:
197
+ value: null
198
+ eval_strategy:
199
+ value: "no"
200
+ eval_use_gather_object:
201
+ value: false
202
+ finetuning_args:
203
+ value:
204
+ additional_target: null
205
+ apollo_layerwise: false
206
+ apollo_proj: random
207
+ apollo_proj_type: std
208
+ apollo_rank: 16
209
+ apollo_scale: 32
210
+ apollo_scale_front: false
211
+ apollo_scale_type: channel
212
+ apollo_target:
213
+ - all
214
+ apollo_update_interval: 200
215
+ badam_mask_mode: adjacent
216
+ badam_mode: layer
217
+ badam_start_block: null
218
+ badam_switch_interval: 50
219
+ badam_switch_mode: ascending
220
+ badam_update_ratio: 0.05
221
+ badam_verbose: 0
222
+ compute_accuracy: false
223
+ create_new_adapter: false
224
+ disable_shuffling: false
225
+ dpo_label_smoothing: 0
226
+ eaft_alpha: 1
227
+ early_stopping_steps: null
228
+ finetuning_type: lora
229
+ freeze_extra_modules: null
230
+ freeze_language_model: false
231
+ freeze_multi_modal_projector: true
232
+ freeze_trainable_layers: 2
233
+ freeze_trainable_modules:
234
+ - all
235
+ freeze_vision_tower: true
236
+ galore_layerwise: false
237
+ galore_proj_type: std
238
+ galore_rank: 16
239
+ galore_scale: 2
240
+ galore_target:
241
+ - all
242
+ galore_update_interval: 200
243
+ include_effective_tokens_per_second: false
244
+ kto_chosen_weight: 1
245
+ kto_rejected_weight: 1
246
+ ld_alpha: null
247
+ lora_alpha: 32
248
+ lora_dropout: 0.03
249
+ lora_rank: 16
250
+ lora_target:
251
+ - all
252
+ loraplus_lr_embedding: 1e-06
253
+ loraplus_lr_ratio: null
254
+ module_dropout: 0
255
+ oft_block_size: 32
256
+ oft_rank: 0
257
+ oft_target:
258
+ - all
259
+ pissa_convert: false
260
+ pissa_init: false
261
+ pissa_iter: 16
262
+ plot_loss: true
263
+ ppo_buffer_size: 1
264
+ ppo_epochs: 4
265
+ ppo_score_norm: false
266
+ ppo_target: 6
267
+ ppo_whiten_rewards: false
268
+ pref_bco_weight: 0
269
+ pref_beta: 0.1
270
+ pref_ftx: 0
271
+ pref_loss: sigmoid
272
+ pure_bf16: false
273
+ ref_model: null
274
+ ref_model_adapters: null
275
+ ref_model_quantization_bit: null
276
+ reward_model: null
277
+ reward_model_adapters: null
278
+ reward_model_quantization_bit: null
279
+ reward_model_type: lora
280
+ simpo_gamma: 0.5
281
+ stage: pt
282
+ swanlab_api_key: <SWANLAB_API_KEY>
283
+ swanlab_lark_secret: null
284
+ swanlab_lark_webhook_url: null
285
+ swanlab_logdir: null
286
+ swanlab_mode: cloud
287
+ swanlab_project: llamafactory
288
+ swanlab_run_name: null
289
+ swanlab_workspace: null
290
+ use_adam_mini: false
291
+ use_apollo: false
292
+ use_badam: false
293
+ use_dft_loss: false
294
+ use_dora: false
295
+ use_eaft_loss: false
296
+ use_galore: false
297
+ use_llama_pro: false
298
+ use_mca: false
299
+ use_muon: false
300
+ use_rslora: false
301
+ use_swanlab: false
302
+ fp8:
303
+ value: false
304
+ fp8_backend:
305
+ value: auto
306
+ fp8_enable_fsdp_float8_all_gather:
307
+ value: false
308
+ fp16:
309
+ value: false
310
+ fp16_full_eval:
311
+ value: false
312
+ fsdp:
313
+ value: []
314
+ fsdp_config:
315
+ value:
316
+ min_num_params: 0
317
+ xla: false
318
+ xla_fsdp_grad_ckpt: false
319
+ xla_fsdp_v2: false
320
+ full_determinism:
321
+ value: false
322
+ generating_args:
323
+ value:
324
+ do_sample: true
325
+ length_penalty: 1
326
+ max_new_tokens: 1024
327
+ num_beams: 1
328
+ repetition_penalty: 1
329
+ skip_special_tokens: true
330
+ temperature: 0.95
331
+ top_k: 50
332
+ top_p: 0.7
333
+ generation_config:
334
+ value: null
335
+ generation_max_length:
336
+ value: 2047
337
+ generation_num_beams:
338
+ value: null
339
+ gradient_accumulation_steps:
340
+ value: 1
341
+ gradient_checkpointing:
342
+ value: false
343
+ gradient_checkpointing_kwargs:
344
+ value: null
345
+ greater_is_better:
346
+ value: null
347
+ group_by_length:
348
+ value: false
349
+ head_dim:
350
+ value: 128
351
+ hidden_act:
352
+ value: silu
353
+ hidden_size:
354
+ value: 4096
355
+ hub_always_push:
356
+ value: false
357
+ hub_model_id:
358
+ value: null
359
+ hub_private_repo:
360
+ value: null
361
+ hub_revision:
362
+ value: null
363
+ hub_strategy:
364
+ value: every_save
365
+ hub_token:
366
+ value: <HUB_TOKEN>
367
+ id2label:
368
+ value:
369
+ "0": LABEL_0
370
+ "1": LABEL_1
371
+ ignore_data_skip:
372
+ value: false
373
+ include_for_metrics:
374
+ value: []
375
+ include_num_input_tokens_seen:
376
+ value: all
377
+ initializer_range:
378
+ value: 0.02
379
+ intermediate_size:
380
+ value: 12288
381
+ is_encoder_decoder:
382
+ value: false
383
+ label_names:
384
+ value:
385
+ - labels
386
+ label_smoothing_factor:
387
+ value: 0
388
+ label2id:
389
+ value:
390
+ LABEL_0: 0
391
+ LABEL_1: 1
392
+ layer_types:
393
+ value:
394
+ - full_attention
395
+ - full_attention
396
+ - full_attention
397
+ - full_attention
398
+ - full_attention
399
+ - full_attention
400
+ - full_attention
401
+ - full_attention
402
+ - full_attention
403
+ - full_attention
404
+ - full_attention
405
+ - full_attention
406
+ - full_attention
407
+ - full_attention
408
+ - full_attention
409
+ - full_attention
410
+ - full_attention
411
+ - full_attention
412
+ - full_attention
413
+ - full_attention
414
+ - full_attention
415
+ - full_attention
416
+ - full_attention
417
+ - full_attention
418
+ - full_attention
419
+ - full_attention
420
+ - full_attention
421
+ - full_attention
422
+ - full_attention
423
+ - full_attention
424
+ - full_attention
425
+ - full_attention
426
+ - full_attention
427
+ - full_attention
428
+ - full_attention
429
+ - full_attention
430
+ learning_rate:
431
+ value: 5e-05
432
+ length_column_name:
433
+ value: length
434
+ liger_kernel_config:
435
+ value: null
436
+ load_best_model_at_end:
437
+ value: false
438
+ local_rank:
439
+ value: -1
440
+ log_level:
441
+ value: passive
442
+ log_level_replica:
443
+ value: warning
444
+ log_on_each_node:
445
+ value: true
446
+ logging_dir:
447
+ value: null
448
+ logging_first_step:
449
+ value: false
450
+ logging_nan_inf_filter:
451
+ value: true
452
+ logging_steps:
453
+ value: 1
454
+ logging_strategy:
455
+ value: steps
456
+ lr_scheduler_kwargs:
457
+ value: null
458
+ lr_scheduler_type:
459
+ value: cosine
460
+ master_addr:
461
+ value: null
462
+ master_port:
463
+ value: null
464
+ max_grad_norm:
465
+ value: 1
466
+ max_position_embeddings:
467
+ value: 32768
468
+ max_steps:
469
+ value: -1
470
+ max_window_layers:
471
+ value: 36
472
+ metric_for_best_model:
473
+ value: null
474
+ model/num_parameters:
475
+ value: 8234382336
476
+ model_args:
477
+ value:
478
+ adapter_folder: null
479
+ adapter_name_or_path: null
480
+ add_special_tokens: null
481
+ add_tokens: null
482
+ audio_sampling_rate: 16000
483
+ block_diag_attn: false
484
+ cache_dir: null
485
+ chunk_size: 8192
486
+ compute_dtype: torch.bfloat16
487
+ cpu_infer: 32
488
+ crop_to_patches: false
489
+ device_map:
490
+ "": cuda:0
491
+ disable_gradient_checkpointing: false
492
+ double_quantization: true
493
+ enable_liger_kernel: false
494
+ export_device: cpu
495
+ export_dir: null
496
+ export_hub_model_id: null
497
+ export_legacy_format: false
498
+ export_quantization_bit: null
499
+ export_quantization_dataset: null
500
+ export_quantization_maxlen: 1024
501
+ export_quantization_nsamples: 128
502
+ export_size: 5
503
+ flash_attn: auto
504
+ hf_hub_token: <HF_HUB_TOKEN>
505
+ image_do_pan_and_scan: false
506
+ image_max_pixels: 589824
507
+ image_min_pixels: 1024
508
+ infer_backend: HF
509
+ infer_dtype: auto
510
+ init_special_tokens: noise_init
511
+ kt_force_think: false
512
+ kt_maxlen: 4096
513
+ kt_mode: normal
514
+ kt_optimize_rule: null
515
+ kt_use_cuda_graph: true
516
+ low_cpu_mem_usage: true
517
+ mixture_of_depths: null
518
+ mode: normal
519
+ model_max_length: 2047
520
+ model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
521
+ model_revision: main
522
+ moe_aux_loss_coef: null
523
+ ms_hub_token: <MS_HUB_TOKEN>
524
+ new_special_tokens_config: null
525
+ offload_folder: offload
526
+ om_hub_token: <OM_HUB_TOKEN>
527
+ print_param_status: false
528
+ quantization_bit: null
529
+ quantization_device_map: null
530
+ quantization_method: BNB
531
+ quantization_type: nf4
532
+ resize_vocab: false
533
+ rope_scaling: null
534
+ sglang_config: null
535
+ sglang_lora_backend: triton
536
+ sglang_maxlen: 4096
537
+ sglang_mem_fraction: 0.7
538
+ sglang_tp_size: -1
539
+ shift_attn: false
540
+ split_special_tokens: false
541
+ train_from_scratch: false
542
+ trust_remote_code: true
543
+ upcast_layernorm: false
544
+ upcast_lmhead_output: false
545
+ use_audio_in_video: false
546
+ use_fast_tokenizer: true
547
+ use_kt: false
548
+ use_kv_cache: true
549
+ use_reentrant_gc: true
550
+ use_unsloth: false
551
+ use_unsloth_gc: false
552
+ use_v1_kernels: false
553
+ video_fps: 2
554
+ video_max_pixels: 65536
555
+ video_maxlen: 128
556
+ video_min_pixels: 256
557
+ vllm_config: null
558
+ vllm_enforce_eager: false
559
+ vllm_gpu_util: 0.7
560
+ vllm_max_lora_rank: 32
561
+ vllm_maxlen: 4096
562
+ model_type:
563
+ value: qwen3
564
+ neftune_noise_alpha:
565
+ value: null
566
+ num_attention_heads:
567
+ value: 32
568
+ num_hidden_layers:
569
+ value: 36
570
+ num_key_value_heads:
571
+ value: 8
572
+ num_train_epochs:
573
+ value: 5
574
+ optim:
575
+ value: adamw_torch
576
+ optim_args:
577
+ value: null
578
+ optim_target_modules:
579
+ value: null
580
+ output_attentions:
581
+ value: false
582
+ output_dir:
583
+ value: /workspace/v127rc_exp1/E_dup
584
+ output_hidden_states:
585
+ value: false
586
+ overwrite_output_dir:
587
+ value: false
588
+ pad_token_id:
589
+ value: 151643
590
+ parallelism_config:
591
+ value: null
592
+ peft_config:
593
+ value:
594
+ default:
595
+ alora_invocation_tokens: null
596
+ arrow_config: null
597
+ auto_mapping: null
598
+ base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
599
+ bias: none
600
+ corda_config: null
601
+ ensure_weight_tying: false
602
+ eva_config: null
603
+ exclude_modules: null
604
+ fan_in_fan_out: false
605
+ inference_mode: false
606
+ init_lora_weights: true
607
+ layer_replication: null
608
+ layers_pattern: null
609
+ layers_to_transform: null
610
+ lora_alpha: 32
611
+ lora_bias: false
612
+ lora_dropout: 0.03
613
+ megatron_config: null
614
+ megatron_core: megatron.core
615
+ modules_to_save: null
616
+ peft_type: LORA
617
+ peft_version: 0.18.1
618
+ qalora_group_size: 16
619
+ r: 16
620
+ revision: null
621
+ runtime_config:
622
+ ephemeral_gpu_offload: false
623
+ target_modules:
624
+ - up_proj
625
+ - q_proj
626
+ - k_proj
627
+ - down_proj
628
+ - gate_proj
629
+ - o_proj
630
+ - v_proj
631
+ target_parameters: null
632
+ task_type: CAUSAL_LM
633
+ trainable_token_indices: null
634
+ use_dora: false
635
+ use_qalora: false
636
+ use_rslora: false
637
+ per_device_eval_batch_size:
638
+ value: 8
639
+ per_device_train_batch_size:
640
+ value: 1
641
+ predict_with_generate:
642
+ value: false
643
+ prediction_loss_only:
644
+ value: false
645
+ problem_type:
646
+ value: null
647
+ project:
648
+ value: huggingface
649
+ push_to_hub:
650
+ value: false
651
+ ray_init_kwargs:
652
+ value: null
653
+ ray_num_workers:
654
+ value: 1
655
+ remove_unused_columns:
656
+ value: false
657
+ report_to:
658
+ value:
659
+ - wandb
660
+ restore_callback_states_from_checkpoint:
661
+ value: false
662
+ resume_from_checkpoint:
663
+ value: null
664
+ return_dict:
665
+ value: true
666
+ rms_norm_eps:
667
+ value: 1e-06
668
+ rope_parameters:
669
+ value:
670
+ rope_theta: 1000000
671
+ rope_type: default
672
+ run_name:
673
+ value: null
674
+ save_on_each_node:
675
+ value: false
676
+ save_only_model:
677
+ value: true
678
+ save_steps:
679
+ value: 1000
680
+ save_strategy:
681
+ value: steps
682
+ save_total_limit:
683
+ value: null
684
+ seed:
685
+ value: 42
686
+ skip_memory_metrics:
687
+ value: true
688
+ sliding_window:
689
+ value: null
690
+ sortish_sampler:
691
+ value: false
692
+ tf32:
693
+ value: null
694
+ tie_word_embeddings:
695
+ value: false
696
+ torch_compile:
697
+ value: false
698
+ torch_compile_backend:
699
+ value: null
700
+ torch_compile_mode:
701
+ value: null
702
+ torch_empty_cache_steps:
703
+ value: null
704
+ trackio_space_id:
705
+ value: trackio
706
+ transformers_version:
707
+ value: 5.0.0
708
+ use_cache:
709
+ value: false
710
+ use_cpu:
711
+ value: false
712
+ use_liger_kernel:
713
+ value: false
714
+ use_sliding_window:
715
+ value: false
716
+ vocab_size:
717
+ value: 151936
718
+ warmup_ratio:
719
+ value: 0.02
720
+ warmup_steps:
721
+ value: 0.02
722
+ weight_decay:
723
+ value: 0
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/requirements.txt ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pytz==2025.2
2
+ pydub==0.25.1
3
+ brotli==1.2.0
4
+ antlr4-python3-runtime==4.9.3
5
+ xxhash==3.6.0
6
+ websockets==15.0.1
7
+ tzdata==2025.3
8
+ typing_extensions==4.15.0
9
+ tqdm==4.67.3
10
+ tomlkit==0.13.3
11
+ termcolor==3.3.0
12
+ shtab==1.8.0
13
+ shellingham==1.5.4
14
+ sentencepiece==0.2.1
15
+ semantic-version==2.10.0
16
+ safetensors==0.7.0
17
+ ruff==0.15.0
18
+ regex==2026.1.15
19
+ python-multipart==0.0.22
20
+ pyparsing==3.3.2
21
+ pyarrow==23.0.0
22
+ protobuf==6.33.5
23
+ propcache==0.4.1
24
+ orjson==3.11.7
25
+ omegaconf==2.3.0
26
+ numpy==2.4.2
27
+ multidict==6.7.1
28
+ mdurl==0.1.2
29
+ kiwisolver==1.4.9
30
+ hf-xet==1.2.0
31
+ hf_transfer==0.1.9
32
+ groovy==0.1.2
33
+ frozenlist==1.8.0
34
+ fonttools==4.61.1
35
+ ffmpy==1.0.0
36
+ einops==0.8.2
37
+ docstring_parser==0.17.0
38
+ dill==0.3.8
39
+ cycler==0.12.1
40
+ click==8.3.1
41
+ av==16.0.0
42
+ annotated-types==0.7.0
43
+ annotated-doc==0.0.4
44
+ aiohappyeyeballs==2.6.1
45
+ aiofiles==24.1.0
46
+ yarl==1.22.0
47
+ uvicorn==0.40.0
48
+ typing-inspection==0.4.2
49
+ typer-slim==0.21.1
50
+ tiktoken==0.12.0
51
+ scipy==1.17.0
52
+ pydantic_core==2.41.4
53
+ pandas==2.3.3
54
+ multiprocess==0.70.16
55
+ modelscope==1.34.0
56
+ markdown-it-py==4.0.0
57
+ fire==0.7.1
58
+ contourpy==1.3.3
59
+ anyio==4.12.1
60
+ aiosignal==1.4.0
61
+ starlette==0.50.0
62
+ rich==14.3.2
63
+ pydantic==2.12.3
64
+ matplotlib==3.10.8
65
+ aiohttp==3.13.3
66
+ tyro==0.8.14
67
+ typer==0.21.1
68
+ torchdata==0.11.0
69
+ sse-starlette==3.2.0
70
+ safehttpx==0.1.7
71
+ huggingface_hub==1.3.7
72
+ fastapi==0.128.0
73
+ tokenizers==0.22.2
74
+ gradio_client==1.14.0
75
+ datasets==4.0.0
76
+ accelerate==1.11.0
77
+ transformers==5.0.0
78
+ gradio==5.50.0
79
+ trl==0.24.0
80
+ peft==0.18.1
81
+ llamafactory==0.9.5.dev0
82
+ jieba==0.42.1
83
+ rouge-chinese==1.0.3
84
+ joblib==1.5.3
85
+ nltk==3.9.2
86
+ py-cpuinfo==9.0.0
87
+ nvidia-ml-py==13.590.48
88
+ hjson==3.1.0
89
+ ninja==1.13.0
90
+ msgpack==1.1.2
91
+ deepspeed==0.16.9
92
+ smmap==5.0.2
93
+ sentry-sdk==2.51.0
94
+ gitdb==4.0.12
95
+ GitPython==3.1.46
96
+ wandb==0.24.1
97
+ entrypoints==0.4
98
+ jupyter_client==7.4.9
99
+ nbclassic==1.1.0
100
+ notebook==6.5.5
101
+ pyzmq==24.0.1
102
+ PyYAML==6.0.2
103
+ Send2Trash==1.8.3
104
+ argon2-cffi==23.1.0
105
+ argon2-cffi-bindings==21.2.0
106
+ arrow==1.3.0
107
+ asttokens==2.4.1
108
+ async-lru==2.0.4
109
+ attrs==24.2.0
110
+ babel==2.16.0
111
+ beautifulsoup4==4.12.3
112
+ bleach==6.1.0
113
+ certifi==2024.8.30
114
+ cffi==1.17.1
115
+ charset-normalizer==3.3.2
116
+ comm==0.2.2
117
+ debugpy==1.8.5
118
+ decorator==5.1.1
119
+ defusedxml==0.7.1
120
+ executing==2.1.0
121
+ fastjsonschema==2.20.0
122
+ fqdn==1.5.1
123
+ h11==0.14.0
124
+ httpcore==1.0.5
125
+ httpx==0.27.2
126
+ idna==3.10
127
+ ipykernel==6.29.5
128
+ ipython==8.27.0
129
+ ipython-genutils==0.2.0
130
+ ipywidgets==8.1.5
131
+ isoduration==20.11.0
132
+ jedi==0.19.1
133
+ json5==0.9.25
134
+ jsonpointer==3.0.0
135
+ jsonschema==4.23.0
136
+ jsonschema-specifications==2023.12.1
137
+ jupyter-archive==3.4.0
138
+ jupyter_contrib_core==0.4.2
139
+ jupyter_contrib_nbextensions==0.7.0
140
+ jupyter_core==5.7.2
141
+ jupyter-events==0.10.0
142
+ jupyter-highlight-selected-word==0.2.0
143
+ jupyter-lsp==2.2.5
144
+ jupyter_nbextensions_configurator==0.6.4
145
+ jupyter_server==2.14.2
146
+ jupyter_server_terminals==0.5.3
147
+ jupyterlab==4.2.5
148
+ jupyterlab_pygments==0.3.0
149
+ jupyterlab_server==2.27.3
150
+ jupyterlab_widgets==3.0.13
151
+ lxml==5.3.0
152
+ matplotlib-inline==0.1.7
153
+ mistune==3.0.2
154
+ nbclient==0.10.0
155
+ nbconvert==7.16.4
156
+ nbformat==5.10.4
157
+ nest-asyncio==1.6.0
158
+ notebook_shim==0.2.4
159
+ overrides==7.7.0
160
+ packaging==24.1
161
+ pandocfilters==1.5.1
162
+ parso==0.8.4
163
+ pexpect==4.9.0
164
+ platformdirs==4.3.6
165
+ prometheus_client==0.21.0
166
+ prompt_toolkit==3.0.47
167
+ psutil==6.0.0
168
+ ptyprocess==0.7.0
169
+ pure_eval==0.2.3
170
+ pycparser==2.22
171
+ Pygments==2.18.0
172
+ python-dateutil==2.9.0.post0
173
+ python-json-logger==2.0.7
174
+ referencing==0.35.1
175
+ requests==2.32.3
176
+ rfc3339-validator==0.1.4
177
+ rfc3986-validator==0.1.1
178
+ rpds-py==0.20.0
179
+ sniffio==1.3.1
180
+ soupsieve==2.6
181
+ stack-data==0.6.3
182
+ terminado==0.18.1
183
+ tinycss2==1.3.0
184
+ tornado==6.4.1
185
+ traitlets==5.14.3
186
+ types-python-dateutil==2.9.0.20240906
187
+ uri-template==1.3.0
188
+ urllib3==2.2.3
189
+ wcwidth==0.2.13
190
+ webcolors==24.8.0
191
+ webencodings==0.5.1
192
+ websocket-client==1.8.0
193
+ widgetsnbextension==4.0.13
194
+ Jinja2==3.1.3
195
+ MarkupSafe==2.1.5
196
+ filelock==3.13.1
197
+ fsspec==2024.2.0
198
+ mpmath==1.3.0
199
+ networkx==3.2.1
200
+ nvidia-cublas-cu12==12.4.2.65
201
+ nvidia-cuda-cupti-cu12==12.4.99
202
+ nvidia-cuda-nvrtc-cu12==12.4.99
203
+ nvidia-cuda-runtime-cu12==12.4.99
204
+ nvidia-cudnn-cu12==9.1.0.70
205
+ nvidia-cufft-cu12==11.2.0.44
206
+ nvidia-curand-cu12==10.3.5.119
207
+ nvidia-cusolver-cu12==11.6.0.99
208
+ nvidia-cusparse-cu12==12.3.0.142
209
+ nvidia-nccl-cu12==2.20.5
210
+ nvidia-nvjitlink-cu12==12.4.99
211
+ nvidia-nvtx-cu12==12.4.99
212
+ pillow==10.2.0
213
+ sympy==1.12
214
+ torch==2.4.1+cu124
215
+ torchaudio==2.4.1+cu124
216
+ torchvision==0.19.1+cu124
217
+ triton==3.0.0
218
+ pip==24.2
219
+ setuptools==75.1.0
220
+ wheel==0.44.0
221
+ PyGObject==3.42.1
222
+ PyJWT==2.3.0
223
+ SecretStorage==3.3.1
224
+ cryptography==3.4.8
225
+ dbus-python==1.2.18
226
+ distro==1.7.0
227
+ httplib2==0.20.2
228
+ importlib-metadata==4.6.4
229
+ jeepney==0.7.1
230
+ keyring==23.5.0
231
+ launchpadlib==1.10.16
232
+ lazr.restfulclient==0.14.4
233
+ lazr.uri==1.0.6
234
+ more-itertools==8.10.0
235
+ oauthlib==3.2.0
236
+ python-apt==2.4.0+ubuntu4
237
+ six==1.16.0
238
+ wadllib==1.3.6
239
+ zipp==1.0.0
240
+ blinker==1.4
241
+ autocommand==2.2.2
242
+ backports.tarfile==1.2.0
243
+ importlib_metadata==8.0.0
244
+ importlib_resources==6.4.0
245
+ inflect==7.3.1
246
+ jaraco.collections==5.1.0
247
+ jaraco.context==5.3.0
248
+ jaraco.functools==4.0.1
249
+ jaraco.text==3.12.1
250
+ more-itertools==10.3.0
251
+ packaging==24.1
252
+ platformdirs==4.2.2
253
+ tomli==2.0.1
254
+ typeguard==4.3.0
255
+ typing_extensions==4.12.2
256
+ wheel==0.43.0
257
+ zipp==3.19.2
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-metadata.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-64-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.10",
4
+ "startedAt": "2026-02-04T09:03:21.035088Z",
5
+ "args": [
6
+ "/workspace/v127rc_exp1/E_dup.yaml"
7
+ ],
8
+ "program": "/usr/local/bin/llamafactory-cli",
9
+ "git": {
10
+ "remote": "https://github.com/hiyouga/LlamaFactory.git",
11
+ "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
12
+ },
13
+ "email": "markmochi200@gmail.com",
14
+ "root": "/workspace/LlamaFactory",
15
+ "host": "9acfbb3ac08f",
16
+ "executable": "/usr/bin/python",
17
+ "cpu_count": 16,
18
+ "cpu_count_logical": 32,
19
+ "gpu": "NVIDIA GeForce RTX 4090",
20
+ "gpu_count": 1,
21
+ "disk": {
22
+ "/": {
23
+ "total": "21474836480",
24
+ "used": "2198335488"
25
+ }
26
+ },
27
+ "memory": {
28
+ "total": "134123917312"
29
+ },
30
+ "gpu_nvidia": [
31
+ {
32
+ "name": "NVIDIA GeForce RTX 4090",
33
+ "memoryTotal": "25757220864",
34
+ "cudaCores": 16384,
35
+ "architecture": "Ada",
36
+ "uuid": "GPU-342e702b-1bb8-fdbf-cf79-a03d57a59072"
37
+ }
38
+ ],
39
+ "cudaVersion": "12.9",
40
+ "writerId": "km795qg4wugx2xk47glqbs7x5abb2ilt"
41
+ }
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train_runtime":75825.2674,"train/num_input_tokens_seen":151989750,"_timestamp":1.7702716258520179e+09,"train/train_tokens_per_second":2004.516,"total_flos":6.94172372053248e+18,"train/epoch":5,"train/loss":0.02155970223248005,"train_loss":0.048330643215257464,"_runtime":75825,"train_steps_per_second":0.979,"train/global_step":74250,"train/learning_rate":2.3300469886855526e-14,"train/grad_norm":0.11816766858100891,"_step":74250,"_wandb":{"runtime":75825},"train_samples_per_second":0.979}
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-02-04T09:03:21.282329291Z","level":"INFO","msg":"stream: starting","core version":"0.24.1"}
2
+ {"time":"2026-02-04T09:03:21.632244677Z","level":"INFO","msg":"stream: created new stream","id":"9xr67hqd"}
3
+ {"time":"2026-02-04T09:03:21.632659472Z","level":"INFO","msg":"handler: started","stream_id":"9xr67hqd"}
4
+ {"time":"2026-02-04T09:03:21.634880563Z","level":"INFO","msg":"stream: started","id":"9xr67hqd"}
5
+ {"time":"2026-02-04T09:03:21.634903075Z","level":"INFO","msg":"writer: started","stream_id":"9xr67hqd"}
6
+ {"time":"2026-02-04T09:03:21.634920297Z","level":"INFO","msg":"sender: started","stream_id":"9xr67hqd"}
7
+ {"time":"2026-02-05T00:58:07.192823728Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/markmochi200-linksome-ai/llamafactory/9xr67hqd/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
8
+ {"time":"2026-02-05T06:07:07.926217033Z","level":"INFO","msg":"stream: closing","id":"9xr67hqd"}
9
+ {"time":"2026-02-05T06:07:09.870964601Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
10
+ {"time":"2026-02-05T06:07:10.109026941Z","level":"INFO","msg":"handler: closed","stream_id":"9xr67hqd"}
11
+ {"time":"2026-02-05T06:07:10.114497568Z","level":"INFO","msg":"sender: closed","stream_id":"9xr67hqd"}
12
+ {"time":"2026-02-05T06:07:10.114763144Z","level":"INFO","msg":"stream: closed","id":"9xr67hqd"}
LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug.log ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-02-04 09:03:21,055 INFO MainThread:4473 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1
2
+ 2026-02-04 09:03:21,056 INFO MainThread:4473 [wandb_setup.py:_flush():81] Configure stats pid to 4473
3
+ 2026-02-04 09:03:21,056 INFO MainThread:4473 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-02-04 09:03:21,056 INFO MainThread:4473 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug.log
5
+ 2026-02-04 09:03:21,057 INFO MainThread:4473 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260204_090321-9xr67hqd/logs/debug-internal.log
6
+ 2026-02-04 09:03:21,058 INFO MainThread:4473 [wandb_init.py:init():844] calling init triggers
7
+ 2026-02-04 09:03:21,058 INFO MainThread:4473 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'_wandb': {}}
9
+ 2026-02-04 09:03:21,059 INFO MainThread:4473 [wandb_init.py:init():892] starting backend
10
+ 2026-02-04 09:03:21,273 INFO MainThread:4473 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-02-04 09:03:21,279 INFO MainThread:4473 [wandb_init.py:init():903] backend started and connected
12
+ 2026-02-04 09:03:21,282 INFO MainThread:4473 [wandb_init.py:init():973] updated telemetry
13
+ 2026-02-04 09:03:21,345 INFO MainThread:4473 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-02-04 09:03:21,944 INFO MainThread:4473 [wandb_init.py:init():1042] starting run threads in backend
15
+ 2026-02-04 09:03:22,035 INFO MainThread:4473 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-02-04 09:03:22,035 INFO MainThread:4473 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-02-04 09:03:22,036 INFO MainThread:4473 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-02-04 09:03:22,036 INFO MainThread:4473 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-02-04 09:03:22,039 INFO MainThread:4473 [wandb_init.py:init():1082] run started, returning control to user process
20
+ 2026-02-04 09:03:22,040 INFO MainThread:4473 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['up_proj', 'q_proj', 'k_proj', 'down_proj', 'gate_proj', 'o_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/E_dup', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
21
+ 2026-02-04 09:03:22,047 INFO MainThread:4473 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x79f04a51f450>>
22
+ 2026-02-04 09:03:22,048 INFO MainThread:4473 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
23
+ 2026-02-04 09:03:22,050 INFO MainThread:4473 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t0_d119_r85'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
24
+ 2026-02-05 06:07:07,926 INFO wandb-AsyncioManager-main:4473 [service_client.py:_forward_responses():94] Reached EOF.
25
+ 2026-02-05 06:07:07,926 INFO wandb-AsyncioManager-main:4473 [mailbox.py:close():154] Closing mailbox, abandoning 1 handles.
LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/requirements.txt ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pytz==2025.2
2
+ pydub==0.25.1
3
+ brotli==1.2.0
4
+ antlr4-python3-runtime==4.9.3
5
+ xxhash==3.6.0
6
+ websockets==15.0.1
7
+ tzdata==2025.3
8
+ typing_extensions==4.15.0
9
+ tqdm==4.67.3
10
+ tomlkit==0.13.3
11
+ termcolor==3.3.0
12
+ shtab==1.8.0
13
+ shellingham==1.5.4
14
+ sentencepiece==0.2.1
15
+ semantic-version==2.10.0
16
+ safetensors==0.7.0
17
+ ruff==0.15.0
18
+ regex==2026.1.15
19
+ python-multipart==0.0.22
20
+ pyparsing==3.3.2
21
+ pyarrow==23.0.0
22
+ protobuf==6.33.5
23
+ propcache==0.4.1
24
+ orjson==3.11.7
25
+ omegaconf==2.3.0
26
+ numpy==2.4.2
27
+ multidict==6.7.1
28
+ mdurl==0.1.2
29
+ kiwisolver==1.4.9
30
+ hf-xet==1.2.0
31
+ hf_transfer==0.1.9
32
+ groovy==0.1.2
33
+ frozenlist==1.8.0
34
+ fonttools==4.61.1
35
+ ffmpy==1.0.0
36
+ einops==0.8.2
37
+ docstring_parser==0.17.0
38
+ dill==0.3.8
39
+ cycler==0.12.1
40
+ click==8.3.1
41
+ av==16.0.0
42
+ annotated-types==0.7.0
43
+ annotated-doc==0.0.4
44
+ aiohappyeyeballs==2.6.1
45
+ aiofiles==24.1.0
46
+ yarl==1.22.0
47
+ uvicorn==0.40.0
48
+ typing-inspection==0.4.2
49
+ typer-slim==0.21.1
50
+ tiktoken==0.12.0
51
+ scipy==1.17.0
52
+ pydantic_core==2.41.4
53
+ pandas==2.3.3
54
+ multiprocess==0.70.16
55
+ modelscope==1.34.0
56
+ markdown-it-py==4.0.0
57
+ fire==0.7.1
58
+ contourpy==1.3.3
59
+ anyio==4.12.1
60
+ aiosignal==1.4.0
61
+ starlette==0.50.0
62
+ rich==14.3.2
63
+ pydantic==2.12.3
64
+ matplotlib==3.10.8
65
+ aiohttp==3.13.3
66
+ tyro==0.8.14
67
+ typer==0.21.1
68
+ torchdata==0.11.0
69
+ sse-starlette==3.2.0
70
+ safehttpx==0.1.7
71
+ huggingface_hub==1.4.0
72
+ fastapi==0.128.1
73
+ tokenizers==0.22.2
74
+ gradio_client==1.14.0
75
+ datasets==4.0.0
76
+ accelerate==1.11.0
77
+ transformers==5.0.0
78
+ gradio==5.50.0
79
+ trl==0.24.0
80
+ peft==0.18.1
81
+ llamafactory==0.9.5.dev0
82
+ jieba==0.42.1
83
+ rouge-chinese==1.0.3
84
+ joblib==1.5.3
85
+ nltk==3.9.2
86
+ py-cpuinfo==9.0.0
87
+ nvidia-ml-py==13.590.48
88
+ hjson==3.1.0
89
+ ninja==1.13.0
90
+ msgpack==1.1.2
91
+ deepspeed==0.16.9
92
+ smmap==5.0.2
93
+ sentry-sdk==2.52.0
94
+ gitdb==4.0.12
95
+ GitPython==3.1.46
96
+ wandb==0.24.2
97
+ entrypoints==0.4
98
+ jupyter_client==7.4.9
99
+ nbclassic==1.1.0
100
+ notebook==6.5.5
101
+ pyzmq==24.0.1
102
+ PyYAML==6.0.2
103
+ Send2Trash==1.8.3
104
+ argon2-cffi==23.1.0
105
+ argon2-cffi-bindings==21.2.0
106
+ arrow==1.3.0
107
+ asttokens==2.4.1
108
+ async-lru==2.0.4
109
+ attrs==24.2.0
110
+ babel==2.16.0
111
+ beautifulsoup4==4.12.3
112
+ bleach==6.1.0
113
+ certifi==2024.8.30
114
+ cffi==1.17.1
115
+ charset-normalizer==3.3.2
116
+ comm==0.2.2
117
+ debugpy==1.8.5
118
+ decorator==5.1.1
119
+ defusedxml==0.7.1
120
+ executing==2.1.0
121
+ fastjsonschema==2.20.0
122
+ fqdn==1.5.1
123
+ h11==0.14.0
124
+ httpcore==1.0.5
125
+ httpx==0.27.2
126
+ idna==3.10
127
+ ipykernel==6.29.5
128
+ ipython==8.27.0
129
+ ipython-genutils==0.2.0
130
+ ipywidgets==8.1.5
131
+ isoduration==20.11.0
132
+ jedi==0.19.1
133
+ json5==0.9.25
134
+ jsonpointer==3.0.0
135
+ jsonschema==4.23.0
136
+ jsonschema-specifications==2023.12.1
137
+ jupyter-archive==3.4.0
138
+ jupyter_contrib_core==0.4.2
139
+ jupyter_contrib_nbextensions==0.7.0
140
+ jupyter_core==5.7.2
141
+ jupyter-events==0.10.0
142
+ jupyter-highlight-selected-word==0.2.0
143
+ jupyter-lsp==2.2.5
144
+ jupyter_nbextensions_configurator==0.6.4
145
+ jupyter_server==2.14.2
146
+ jupyter_server_terminals==0.5.3
147
+ jupyterlab==4.2.5
148
+ jupyterlab_pygments==0.3.0
149
+ jupyterlab_server==2.27.3
150
+ jupyterlab_widgets==3.0.13
151
+ lxml==5.3.0
152
+ matplotlib-inline==0.1.7
153
+ mistune==3.0.2
154
+ nbclient==0.10.0
155
+ nbconvert==7.16.4
156
+ nbformat==5.10.4
157
+ nest-asyncio==1.6.0
158
+ notebook_shim==0.2.4
159
+ overrides==7.7.0
160
+ packaging==24.1
161
+ pandocfilters==1.5.1
162
+ parso==0.8.4
163
+ pexpect==4.9.0
164
+ platformdirs==4.3.6
165
+ prometheus_client==0.21.0
166
+ prompt_toolkit==3.0.47
167
+ psutil==6.0.0
168
+ ptyprocess==0.7.0
169
+ pure_eval==0.2.3
170
+ pycparser==2.22
171
+ Pygments==2.18.0
172
+ python-dateutil==2.9.0.post0
173
+ python-json-logger==2.0.7
174
+ referencing==0.35.1
175
+ requests==2.32.3
176
+ rfc3339-validator==0.1.4
177
+ rfc3986-validator==0.1.1
178
+ rpds-py==0.20.0
179
+ sniffio==1.3.1
180
+ soupsieve==2.6
181
+ stack-data==0.6.3
182
+ terminado==0.18.1
183
+ tinycss2==1.3.0
184
+ tornado==6.4.1
185
+ traitlets==5.14.3
186
+ types-python-dateutil==2.9.0.20240906
187
+ uri-template==1.3.0
188
+ urllib3==2.2.3
189
+ wcwidth==0.2.13
190
+ webcolors==24.8.0
191
+ webencodings==0.5.1
192
+ websocket-client==1.8.0
193
+ widgetsnbextension==4.0.13
194
+ Jinja2==3.1.3
195
+ MarkupSafe==2.1.5
196
+ filelock==3.13.1
197
+ fsspec==2024.2.0
198
+ mpmath==1.3.0
199
+ networkx==3.2.1
200
+ nvidia-cublas-cu12==12.4.2.65
201
+ nvidia-cuda-cupti-cu12==12.4.99
202
+ nvidia-cuda-nvrtc-cu12==12.4.99
203
+ nvidia-cuda-runtime-cu12==12.4.99
204
+ nvidia-cudnn-cu12==9.1.0.70
205
+ nvidia-cufft-cu12==11.2.0.44
206
+ nvidia-curand-cu12==10.3.5.119
207
+ nvidia-cusolver-cu12==11.6.0.99
208
+ nvidia-cusparse-cu12==12.3.0.142
209
+ nvidia-nccl-cu12==2.20.5
210
+ nvidia-nvjitlink-cu12==12.4.99
211
+ nvidia-nvtx-cu12==12.4.99
212
+ pillow==10.2.0
213
+ sympy==1.12
214
+ torch==2.4.1+cu124
215
+ torchaudio==2.4.1+cu124
216
+ torchvision==0.19.1+cu124
217
+ triton==3.0.0
218
+ pip==24.2
219
+ setuptools==75.1.0
220
+ wheel==0.44.0
221
+ PyGObject==3.42.1
222
+ PyJWT==2.3.0
223
+ SecretStorage==3.3.1
224
+ blinker==1.4
225
+ cryptography==3.4.8
226
+ dbus-python==1.2.18
227
+ distro==1.7.0
228
+ httplib2==0.20.2
229
+ importlib-metadata==4.6.4
230
+ jeepney==0.7.1
231
+ keyring==23.5.0
232
+ launchpadlib==1.10.16
233
+ lazr.restfulclient==0.14.4
234
+ lazr.uri==1.0.6
235
+ more-itertools==8.10.0
236
+ oauthlib==3.2.0
237
+ python-apt==2.4.0+ubuntu4
238
+ six==1.16.0
239
+ wadllib==1.3.6
240
+ zipp==1.0.0
241
+ autocommand==2.2.2
242
+ backports.tarfile==1.2.0
243
+ importlib_metadata==8.0.0
244
+ importlib_resources==6.4.0
245
+ inflect==7.3.1
246
+ jaraco.collections==5.1.0
247
+ jaraco.context==5.3.0
248
+ jaraco.functools==4.0.1
249
+ jaraco.text==3.12.1
250
+ more-itertools==10.3.0
251
+ packaging==24.1
252
+ platformdirs==4.2.2
253
+ tomli==2.0.1
254
+ typeguard==4.3.0
255
+ typing_extensions==4.12.2
256
+ wheel==0.43.0
257
+ zipp==3.19.2
LlamaFactory/wandb/run-20260205_023725-yz385gxb/files/wandb-metadata.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-52-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.10",
4
+ "startedAt": "2026-02-05T02:37:25.915817Z",
5
+ "args": [
6
+ "/workspace/v127rc_exp1/D_mul.yaml"
7
+ ],
8
+ "program": "/usr/local/bin/llamafactory-cli",
9
+ "git": {
10
+ "remote": "https://github.com/hiyouga/LlamaFactory.git",
11
+ "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
12
+ },
13
+ "email": "markmochi200@gmail.com",
14
+ "root": "/workspace/LlamaFactory",
15
+ "host": "a6086694d22a",
16
+ "executable": "/usr/bin/python",
17
+ "cpu_count": 24,
18
+ "cpu_count_logical": 48,
19
+ "gpu": "NVIDIA GeForce RTX 4090",
20
+ "gpu_count": 1,
21
+ "disk": {
22
+ "/": {
23
+ "total": "21474836480",
24
+ "used": "2604290048"
25
+ }
26
+ },
27
+ "memory": {
28
+ "total": "269721972736"
29
+ },
30
+ "gpu_nvidia": [
31
+ {
32
+ "name": "NVIDIA GeForce RTX 4090",
33
+ "memoryTotal": "25757220864",
34
+ "cudaCores": 16384,
35
+ "architecture": "Ada",
36
+ "uuid": "GPU-ff8ec606-2734-ef52-4257-850162397ce9"
37
+ }
38
+ ],
39
+ "cudaVersion": "12.7",
40
+ "writerId": "zh6rt3o374t2f5i8fr2iiq0hoyntbcfj"
41
+ }
LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug-internal.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {"time":"2026-02-05T02:37:26.155502518Z","level":"INFO","msg":"stream: starting","core version":"0.24.2"}
2
+ {"time":"2026-02-05T02:37:26.502201724Z","level":"INFO","msg":"stream: created new stream","id":"yz385gxb"}
3
+ {"time":"2026-02-05T02:37:26.506421573Z","level":"INFO","msg":"handler: started","stream_id":"yz385gxb"}
4
+ {"time":"2026-02-05T02:37:26.508247738Z","level":"INFO","msg":"stream: started","id":"yz385gxb"}
5
+ {"time":"2026-02-05T02:37:26.508259425Z","level":"INFO","msg":"writer: started","stream_id":"yz385gxb"}
6
+ {"time":"2026-02-05T02:37:26.508267638Z","level":"INFO","msg":"sender: started","stream_id":"yz385gxb"}
LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug.log ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-02-05 02:37:25,931 INFO MainThread:1076 [wandb_setup.py:_flush():81] Current SDK version is 0.24.2
2
+ 2026-02-05 02:37:25,932 INFO MainThread:1076 [wandb_setup.py:_flush():81] Configure stats pid to 1076
3
+ 2026-02-05 02:37:25,932 INFO MainThread:1076 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-02-05 02:37:25,932 INFO MainThread:1076 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug.log
5
+ 2026-02-05 02:37:25,933 INFO MainThread:1076 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/LlamaFactory/wandb/run-20260205_023725-yz385gxb/logs/debug-internal.log
6
+ 2026-02-05 02:37:25,933 INFO MainThread:1076 [wandb_init.py:init():844] calling init triggers
7
+ 2026-02-05 02:37:25,933 INFO MainThread:1076 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'_wandb': {}}
9
+ 2026-02-05 02:37:25,933 INFO MainThread:1076 [wandb_init.py:init():892] starting backend
10
+ 2026-02-05 02:37:26,147 INFO MainThread:1076 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-02-05 02:37:26,153 INFO MainThread:1076 [wandb_init.py:init():903] backend started and connected
12
+ 2026-02-05 02:37:26,155 INFO MainThread:1076 [wandb_init.py:init():973] updated telemetry
13
+ 2026-02-05 02:37:26,195 INFO MainThread:1076 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-02-05 02:37:26,815 INFO MainThread:1076 [wandb_init.py:init():1042] starting run threads in backend
15
+ 2026-02-05 02:37:26,893 INFO MainThread:1076 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-02-05 02:37:26,893 INFO MainThread:1076 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-02-05 02:37:26,893 INFO MainThread:1076 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-02-05 02:37:26,894 INFO MainThread:1076 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-02-05 02:37:26,896 INFO MainThread:1076 [wandb_init.py:init():1082] run started, returning control to user process
20
+ 2026-02-05 02:37:26,897 INFO MainThread:1076 [wandb_run.py:_config_callback():1404] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.1', 'base_model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['q_proj', 'o_proj', 'gate_proj', 'down_proj', 'k_proj', 'up_proj', 'v_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.03, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 151936, 'max_position_embeddings': 32768, 'hidden_size': 4096, 'intermediate_size': 12288, 'num_hidden_layers': 36, 'num_attention_heads': 32, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 36, 'num_key_value_heads': 8, 'head_dim': 128, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_bias': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'pad_token_id': 151643, 'bos_token_id': None, 'eos_token_id': 151645, 'tie_word_embeddings': False, 'rope_parameters': {'rope_theta': 1000000, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'architectures': ['Qwen3ForCausalLM'], 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'problem_type': None, '_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'transformers_version': '5.0.0', 'model_type': 'qwen3', 'output_attentions': False, 'output_dir': '/workspace/v127rc_exp1/D_mul', 'do_train': True, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_epsilon': 1e-08, 'max_grad_norm': 1, 'num_train_epochs': 5, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.02, 'warmup_steps': 0.02, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 1000, 'save_total_limit': None, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': True, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': None, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': ['labels'], 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 180000000, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_num_input_tokens_seen': 'all', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'sortish_sampler': False, 'predict_with_generate': False, 'generation_max_length': 2047, 'generation_num_beams': None, 'generation_config': None, 'ray_num_workers': 1, 'ray_init_kwargs': None, 'master_addr': None, 'master_port': None, 'fp8': False, 'fp8_backend': 'auto', 'fp8_enable_fsdp_float8_all_gather': False, 'overwrite_output_dir': False}
21
+ 2026-02-05 02:37:26,902 INFO MainThread:1076 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 8234382336 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7e1cb4c97d90>>
22
+ 2026-02-05 02:37:26,906 INFO MainThread:1076 [wandb_run.py:_config_callback():1404] config_cb model/num_parameters 8234382336 None
23
+ 2026-02-05 02:37:26,909 INFO MainThread:1076 [wandb_run.py:_config_callback():1404] config_cb None None {'model_args': {'model_name_or_path': '/workspace/Qwen/Qwen3-8B-Base', 'adapter_name_or_path': None, 'adapter_folder': None, 'cache_dir': None, 'use_fast_tokenizer': True, 'resize_vocab': False, 'split_special_tokens': False, 'add_tokens': None, 'add_special_tokens': None, 'new_special_tokens_config': None, 'init_special_tokens': 'noise_init', 'model_revision': 'main', 'low_cpu_mem_usage': True, 'rope_scaling': None, 'flash_attn': 'auto', 'shift_attn': False, 'mixture_of_depths': None, 'use_unsloth': False, 'use_unsloth_gc': False, 'enable_liger_kernel': False, 'moe_aux_loss_coef': None, 'disable_gradient_checkpointing': False, 'use_reentrant_gc': True, 'upcast_layernorm': False, 'upcast_lmhead_output': False, 'train_from_scratch': False, 'infer_backend': 'HF', 'offload_folder': 'offload', 'use_kv_cache': True, 'use_v1_kernels': False, 'infer_dtype': 'auto', 'hf_hub_token': '<HF_HUB_TOKEN>', 'ms_hub_token': '<MS_HUB_TOKEN>', 'om_hub_token': '<OM_HUB_TOKEN>', 'print_param_status': False, 'trust_remote_code': True, 'quantization_method': 'BNB', 'quantization_bit': None, 'quantization_type': 'nf4', 'double_quantization': True, 'quantization_device_map': None, 'image_max_pixels': 589824, 'image_min_pixels': 1024, 'image_do_pan_and_scan': False, 'crop_to_patches': False, 'video_max_pixels': 65536, 'video_min_pixels': 256, 'video_fps': 2.0, 'video_maxlen': 128, 'use_audio_in_video': False, 'audio_sampling_rate': 16000, 'export_dir': None, 'export_size': 5, 'export_device': 'cpu', 'export_quantization_bit': None, 'export_quantization_dataset': None, 'export_quantization_nsamples': 128, 'export_quantization_maxlen': 1024, 'export_legacy_format': False, 'export_hub_model_id': None, 'use_kt': False, 'kt_optimize_rule': None, 'cpu_infer': 32, 'chunk_size': 8192, 'mode': 'normal', 'kt_maxlen': 4096, 'kt_use_cuda_graph': True, 'kt_mode': 'normal', 'kt_force_think': False, 'vllm_maxlen': 4096, 'vllm_gpu_util': 0.7, 'vllm_enforce_eager': False, 'vllm_max_lora_rank': 32, 'vllm_config': None, 'sglang_maxlen': 4096, 'sglang_mem_fraction': 0.7, 'sglang_tp_size': -1, 'sglang_config': None, 'sglang_lora_backend': 'triton', 'compute_dtype': 'torch.bfloat16', 'device_map': {'': 'cuda:0'}, 'model_max_length': 2047, 'block_diag_attn': False}, 'data_args': {'template': 'qwen3_nothink', 'dataset': ['Markie_Voss_t100_d0_r101'], 'eval_dataset': None, 'dataset_dir': '/workspace/LlamaFactory/data', 'media_dir': '/workspace/LlamaFactory/data', 'cutoff_len': 2047, 'train_on_prompt': False, 'mask_history': False, 'streaming': False, 'buffer_size': 16384, 'mix_strategy': 'concat', 'interleave_probs': None, 'overwrite_cache': False, 'preprocessing_batch_size': 1000, 'preprocessing_num_workers': 16, 'max_samples': 100000000, 'eval_num_beams': None, 'ignore_pad_token_for_loss': True, 'val_size': 0.0, 'eval_on_each_dataset': False, 'packing': True, 'neat_packing': False, 'tool_format': None, 'default_system': None, 'enable_thinking': False, 'tokenized_path': None, 'data_shared_file_system': False}, 'finetuning_args': {'freeze_trainable_layers': 2, 'freeze_trainable_modules': ['all'], 'freeze_extra_modules': None, 'additional_target': None, 'module_dropout': 0.0, 'oft_rank': 0, 'oft_block_size': 32, 'oft_target': ['all'], 'create_new_adapter': False, 'lora_alpha': 32, 'lora_dropout': 0.03, 'lora_rank': 16, 'lora_target': ['all'], 'loraplus_lr_ratio': None, 'loraplus_lr_embedding': 1e-06, 'use_rslora': False, 'use_dora': False, 'pissa_init': False, 'pissa_iter': 16, 'pissa_convert': False, 'pref_beta': 0.1, 'pref_ftx': 0.0, 'pref_bco_weight': 0.0, 'pref_loss': 'sigmoid', 'dpo_label_smoothing': 0.0, 'kto_chosen_weight': 1.0, 'kto_rejected_weight': 1.0, 'simpo_gamma': 0.5, 'ppo_buffer_size': 1, 'ppo_epochs': 4, 'ppo_score_norm': False, 'ppo_target': 6.0, 'ppo_whiten_rewards': False, 'ref_model': None, 'ref_model_adapters': None, 'ref_model_quantization_bit': None, 'reward_model': None, 'reward_model_adapters': None, 'reward_model_quantization_bit': None, 'reward_model_type': 'lora', 'ld_alpha': None, 'use_galore': False, 'galore_target': ['all'], 'galore_rank': 16, 'galore_update_interval': 200, 'galore_scale': 2.0, 'galore_proj_type': 'std', 'galore_layerwise': False, 'use_apollo': False, 'apollo_target': ['all'], 'apollo_rank': 16, 'apollo_update_interval': 200, 'apollo_scale': 32.0, 'apollo_proj': 'random', 'apollo_proj_type': 'std', 'apollo_scale_type': 'channel', 'apollo_layerwise': False, 'apollo_scale_front': False, 'use_badam': False, 'badam_mode': 'layer', 'badam_start_block': None, 'badam_switch_mode': 'ascending', 'badam_switch_interval': 50, 'badam_update_ratio': 0.05, 'badam_mask_mode': 'adjacent', 'badam_verbose': 0, 'use_swanlab': False, 'swanlab_project': 'llamafactory', 'swanlab_workspace': None, 'swanlab_run_name': None, 'swanlab_mode': 'cloud', 'swanlab_api_key': '<SWANLAB_API_KEY>', 'swanlab_logdir': None, 'swanlab_lark_webhook_url': None, 'swanlab_lark_secret': None, 'pure_bf16': False, 'stage': 'pt', 'finetuning_type': 'lora', 'use_llama_pro': False, 'use_adam_mini': False, 'use_mca': False, 'use_muon': False, 'use_dft_loss': False, 'use_eaft_loss': False, 'eaft_alpha': 1.0, 'freeze_vision_tower': True, 'freeze_multi_modal_projector': True, 'freeze_language_model': False, 'compute_accuracy': False, 'disable_shuffling': False, 'early_stopping_steps': None, 'plot_loss': True, 'include_effective_tokens_per_second': False}, 'generating_args': {'do_sample': True, 'temperature': 0.95, 'top_p': 0.7, 'top_k': 50, 'num_beams': 1, 'max_new_tokens': 1024, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'skip_special_tokens': True}}
LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/config.yaml ADDED
@@ -0,0 +1,723 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: /workspace/Qwen/Qwen3-8B-Base
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.24.2
6
+ e:
7
+ be8ic28wchhzrbkqsu0bl7jl1lfwezfn:
8
+ args:
9
+ - /workspace/v127rc_exp1/E_mul.yaml
10
+ cpu_count: 24
11
+ cpu_count_logical: 48
12
+ cudaVersion: "12.7"
13
+ disk:
14
+ /:
15
+ total: "21474836480"
16
+ used: "2594168832"
17
+ email: markmochi200@gmail.com
18
+ executable: /usr/bin/python
19
+ git:
20
+ commit: 1a02717fa84c270d1c156c4c4a391c2f95525a63
21
+ remote: https://github.com/hiyouga/LlamaFactory.git
22
+ gpu: NVIDIA GeForce RTX 4090
23
+ gpu_count: 1
24
+ gpu_nvidia:
25
+ - architecture: Ada
26
+ cudaCores: 16384
27
+ memoryTotal: "25757220864"
28
+ name: NVIDIA GeForce RTX 4090
29
+ uuid: GPU-f9c17fa7-295e-e688-fe65-f3659fffa9a3
30
+ host: 682d471c1c72
31
+ memory:
32
+ total: "269721997312"
33
+ os: Linux-6.8.0-52-generic-x86_64-with-glibc2.35
34
+ program: /usr/local/bin/llamafactory-cli
35
+ python: CPython 3.11.10
36
+ root: /workspace/LlamaFactory
37
+ startedAt: "2026-02-05T02:37:31.256607Z"
38
+ writerId: be8ic28wchhzrbkqsu0bl7jl1lfwezfn
39
+ m:
40
+ - "1": train/global_step
41
+ "6":
42
+ - 3
43
+ "7": []
44
+ - "2": '*'
45
+ "5": 1
46
+ "6":
47
+ - 1
48
+ "7": []
49
+ python_version: 3.11.10
50
+ t:
51
+ "1":
52
+ - 1
53
+ - 11
54
+ - 41
55
+ - 49
56
+ - 51
57
+ - 71
58
+ - 84
59
+ - 98
60
+ - 105
61
+ "2":
62
+ - 1
63
+ - 11
64
+ - 41
65
+ - 49
66
+ - 51
67
+ - 71
68
+ - 84
69
+ - 98
70
+ - 105
71
+ "3":
72
+ - 7
73
+ - 19
74
+ - 62
75
+ - 66
76
+ "4": 3.11.10
77
+ "5": 0.24.2
78
+ "6": 5.0.0
79
+ "9":
80
+ "1": transformers_trainer
81
+ "12": 0.24.2
82
+ "13": linux-x86_64
83
+ accelerator_config:
84
+ value:
85
+ dispatch_batches: null
86
+ even_batches: true
87
+ gradient_accumulation_kwargs: null
88
+ non_blocking: false
89
+ split_batches: false
90
+ use_seedable_sampler: true
91
+ adam_beta1:
92
+ value: 0.9
93
+ adam_beta2:
94
+ value: 0.95
95
+ adam_epsilon:
96
+ value: 1e-08
97
+ architectures:
98
+ value:
99
+ - Qwen3ForCausalLM
100
+ attention_bias:
101
+ value: false
102
+ attention_dropout:
103
+ value: 0
104
+ auto_find_batch_size:
105
+ value: false
106
+ average_tokens_across_devices:
107
+ value: true
108
+ batch_eval_metrics:
109
+ value: false
110
+ bf16:
111
+ value: true
112
+ bf16_full_eval:
113
+ value: false
114
+ bos_token_id:
115
+ value: null
116
+ chunk_size_feed_forward:
117
+ value: 0
118
+ data_args:
119
+ value:
120
+ buffer_size: 16384
121
+ cutoff_len: 2047
122
+ data_shared_file_system: false
123
+ dataset:
124
+ - Markie_Voss_t119_d0_r85
125
+ dataset_dir: /workspace/LlamaFactory/data
126
+ default_system: null
127
+ enable_thinking: false
128
+ eval_dataset: null
129
+ eval_num_beams: null
130
+ eval_on_each_dataset: false
131
+ ignore_pad_token_for_loss: true
132
+ interleave_probs: null
133
+ mask_history: false
134
+ max_samples: 100000000
135
+ media_dir: /workspace/LlamaFactory/data
136
+ mix_strategy: concat
137
+ neat_packing: false
138
+ overwrite_cache: false
139
+ packing: true
140
+ preprocessing_batch_size: 1000
141
+ preprocessing_num_workers: 16
142
+ streaming: false
143
+ template: qwen3_nothink
144
+ tokenized_path: null
145
+ tool_format: null
146
+ train_on_prompt: false
147
+ val_size: 0
148
+ data_seed:
149
+ value: null
150
+ dataloader_drop_last:
151
+ value: false
152
+ dataloader_num_workers:
153
+ value: 0
154
+ dataloader_persistent_workers:
155
+ value: false
156
+ dataloader_pin_memory:
157
+ value: true
158
+ dataloader_prefetch_factor:
159
+ value: null
160
+ ddp_backend:
161
+ value: null
162
+ ddp_broadcast_buffers:
163
+ value: null
164
+ ddp_bucket_cap_mb:
165
+ value: null
166
+ ddp_find_unused_parameters:
167
+ value: null
168
+ ddp_timeout:
169
+ value: 180000000
170
+ debug:
171
+ value: []
172
+ deepspeed:
173
+ value: null
174
+ disable_tqdm:
175
+ value: false
176
+ do_eval:
177
+ value: false
178
+ do_predict:
179
+ value: false
180
+ do_train:
181
+ value: true
182
+ dtype:
183
+ value: bfloat16
184
+ enable_jit_checkpoint:
185
+ value: false
186
+ eos_token_id:
187
+ value: 151645
188
+ eval_accumulation_steps:
189
+ value: null
190
+ eval_delay:
191
+ value: 0
192
+ eval_do_concat_batches:
193
+ value: true
194
+ eval_on_start:
195
+ value: false
196
+ eval_steps:
197
+ value: null
198
+ eval_strategy:
199
+ value: "no"
200
+ eval_use_gather_object:
201
+ value: false
202
+ finetuning_args:
203
+ value:
204
+ additional_target: null
205
+ apollo_layerwise: false
206
+ apollo_proj: random
207
+ apollo_proj_type: std
208
+ apollo_rank: 16
209
+ apollo_scale: 32
210
+ apollo_scale_front: false
211
+ apollo_scale_type: channel
212
+ apollo_target:
213
+ - all
214
+ apollo_update_interval: 200
215
+ badam_mask_mode: adjacent
216
+ badam_mode: layer
217
+ badam_start_block: null
218
+ badam_switch_interval: 50
219
+ badam_switch_mode: ascending
220
+ badam_update_ratio: 0.05
221
+ badam_verbose: 0
222
+ compute_accuracy: false
223
+ create_new_adapter: false
224
+ disable_shuffling: false
225
+ dpo_label_smoothing: 0
226
+ eaft_alpha: 1
227
+ early_stopping_steps: null
228
+ finetuning_type: lora
229
+ freeze_extra_modules: null
230
+ freeze_language_model: false
231
+ freeze_multi_modal_projector: true
232
+ freeze_trainable_layers: 2
233
+ freeze_trainable_modules:
234
+ - all
235
+ freeze_vision_tower: true
236
+ galore_layerwise: false
237
+ galore_proj_type: std
238
+ galore_rank: 16
239
+ galore_scale: 2
240
+ galore_target:
241
+ - all
242
+ galore_update_interval: 200
243
+ include_effective_tokens_per_second: false
244
+ kto_chosen_weight: 1
245
+ kto_rejected_weight: 1
246
+ ld_alpha: null
247
+ lora_alpha: 32
248
+ lora_dropout: 0.03
249
+ lora_rank: 16
250
+ lora_target:
251
+ - all
252
+ loraplus_lr_embedding: 1e-06
253
+ loraplus_lr_ratio: null
254
+ module_dropout: 0
255
+ oft_block_size: 32
256
+ oft_rank: 0
257
+ oft_target:
258
+ - all
259
+ pissa_convert: false
260
+ pissa_init: false
261
+ pissa_iter: 16
262
+ plot_loss: true
263
+ ppo_buffer_size: 1
264
+ ppo_epochs: 4
265
+ ppo_score_norm: false
266
+ ppo_target: 6
267
+ ppo_whiten_rewards: false
268
+ pref_bco_weight: 0
269
+ pref_beta: 0.1
270
+ pref_ftx: 0
271
+ pref_loss: sigmoid
272
+ pure_bf16: false
273
+ ref_model: null
274
+ ref_model_adapters: null
275
+ ref_model_quantization_bit: null
276
+ reward_model: null
277
+ reward_model_adapters: null
278
+ reward_model_quantization_bit: null
279
+ reward_model_type: lora
280
+ simpo_gamma: 0.5
281
+ stage: pt
282
+ swanlab_api_key: <SWANLAB_API_KEY>
283
+ swanlab_lark_secret: null
284
+ swanlab_lark_webhook_url: null
285
+ swanlab_logdir: null
286
+ swanlab_mode: cloud
287
+ swanlab_project: llamafactory
288
+ swanlab_run_name: null
289
+ swanlab_workspace: null
290
+ use_adam_mini: false
291
+ use_apollo: false
292
+ use_badam: false
293
+ use_dft_loss: false
294
+ use_dora: false
295
+ use_eaft_loss: false
296
+ use_galore: false
297
+ use_llama_pro: false
298
+ use_mca: false
299
+ use_muon: false
300
+ use_rslora: false
301
+ use_swanlab: false
302
+ fp8:
303
+ value: false
304
+ fp8_backend:
305
+ value: auto
306
+ fp8_enable_fsdp_float8_all_gather:
307
+ value: false
308
+ fp16:
309
+ value: false
310
+ fp16_full_eval:
311
+ value: false
312
+ fsdp:
313
+ value: []
314
+ fsdp_config:
315
+ value:
316
+ min_num_params: 0
317
+ xla: false
318
+ xla_fsdp_grad_ckpt: false
319
+ xla_fsdp_v2: false
320
+ full_determinism:
321
+ value: false
322
+ generating_args:
323
+ value:
324
+ do_sample: true
325
+ length_penalty: 1
326
+ max_new_tokens: 1024
327
+ num_beams: 1
328
+ repetition_penalty: 1
329
+ skip_special_tokens: true
330
+ temperature: 0.95
331
+ top_k: 50
332
+ top_p: 0.7
333
+ generation_config:
334
+ value: null
335
+ generation_max_length:
336
+ value: 2047
337
+ generation_num_beams:
338
+ value: null
339
+ gradient_accumulation_steps:
340
+ value: 1
341
+ gradient_checkpointing:
342
+ value: false
343
+ gradient_checkpointing_kwargs:
344
+ value: null
345
+ greater_is_better:
346
+ value: null
347
+ group_by_length:
348
+ value: false
349
+ head_dim:
350
+ value: 128
351
+ hidden_act:
352
+ value: silu
353
+ hidden_size:
354
+ value: 4096
355
+ hub_always_push:
356
+ value: false
357
+ hub_model_id:
358
+ value: null
359
+ hub_private_repo:
360
+ value: null
361
+ hub_revision:
362
+ value: null
363
+ hub_strategy:
364
+ value: every_save
365
+ hub_token:
366
+ value: <HUB_TOKEN>
367
+ id2label:
368
+ value:
369
+ "0": LABEL_0
370
+ "1": LABEL_1
371
+ ignore_data_skip:
372
+ value: false
373
+ include_for_metrics:
374
+ value: []
375
+ include_num_input_tokens_seen:
376
+ value: all
377
+ initializer_range:
378
+ value: 0.02
379
+ intermediate_size:
380
+ value: 12288
381
+ is_encoder_decoder:
382
+ value: false
383
+ label_names:
384
+ value:
385
+ - labels
386
+ label_smoothing_factor:
387
+ value: 0
388
+ label2id:
389
+ value:
390
+ LABEL_0: 0
391
+ LABEL_1: 1
392
+ layer_types:
393
+ value:
394
+ - full_attention
395
+ - full_attention
396
+ - full_attention
397
+ - full_attention
398
+ - full_attention
399
+ - full_attention
400
+ - full_attention
401
+ - full_attention
402
+ - full_attention
403
+ - full_attention
404
+ - full_attention
405
+ - full_attention
406
+ - full_attention
407
+ - full_attention
408
+ - full_attention
409
+ - full_attention
410
+ - full_attention
411
+ - full_attention
412
+ - full_attention
413
+ - full_attention
414
+ - full_attention
415
+ - full_attention
416
+ - full_attention
417
+ - full_attention
418
+ - full_attention
419
+ - full_attention
420
+ - full_attention
421
+ - full_attention
422
+ - full_attention
423
+ - full_attention
424
+ - full_attention
425
+ - full_attention
426
+ - full_attention
427
+ - full_attention
428
+ - full_attention
429
+ - full_attention
430
+ learning_rate:
431
+ value: 5e-05
432
+ length_column_name:
433
+ value: length
434
+ liger_kernel_config:
435
+ value: null
436
+ load_best_model_at_end:
437
+ value: false
438
+ local_rank:
439
+ value: -1
440
+ log_level:
441
+ value: passive
442
+ log_level_replica:
443
+ value: warning
444
+ log_on_each_node:
445
+ value: true
446
+ logging_dir:
447
+ value: null
448
+ logging_first_step:
449
+ value: false
450
+ logging_nan_inf_filter:
451
+ value: true
452
+ logging_steps:
453
+ value: 1
454
+ logging_strategy:
455
+ value: steps
456
+ lr_scheduler_kwargs:
457
+ value: null
458
+ lr_scheduler_type:
459
+ value: cosine
460
+ master_addr:
461
+ value: null
462
+ master_port:
463
+ value: null
464
+ max_grad_norm:
465
+ value: 1
466
+ max_position_embeddings:
467
+ value: 32768
468
+ max_steps:
469
+ value: -1
470
+ max_window_layers:
471
+ value: 36
472
+ metric_for_best_model:
473
+ value: null
474
+ model/num_parameters:
475
+ value: 8234382336
476
+ model_args:
477
+ value:
478
+ adapter_folder: null
479
+ adapter_name_or_path: null
480
+ add_special_tokens: null
481
+ add_tokens: null
482
+ audio_sampling_rate: 16000
483
+ block_diag_attn: false
484
+ cache_dir: null
485
+ chunk_size: 8192
486
+ compute_dtype: torch.bfloat16
487
+ cpu_infer: 32
488
+ crop_to_patches: false
489
+ device_map:
490
+ "": cuda:0
491
+ disable_gradient_checkpointing: false
492
+ double_quantization: true
493
+ enable_liger_kernel: false
494
+ export_device: cpu
495
+ export_dir: null
496
+ export_hub_model_id: null
497
+ export_legacy_format: false
498
+ export_quantization_bit: null
499
+ export_quantization_dataset: null
500
+ export_quantization_maxlen: 1024
501
+ export_quantization_nsamples: 128
502
+ export_size: 5
503
+ flash_attn: auto
504
+ hf_hub_token: <HF_HUB_TOKEN>
505
+ image_do_pan_and_scan: false
506
+ image_max_pixels: 589824
507
+ image_min_pixels: 1024
508
+ infer_backend: HF
509
+ infer_dtype: auto
510
+ init_special_tokens: noise_init
511
+ kt_force_think: false
512
+ kt_maxlen: 4096
513
+ kt_mode: normal
514
+ kt_optimize_rule: null
515
+ kt_use_cuda_graph: true
516
+ low_cpu_mem_usage: true
517
+ mixture_of_depths: null
518
+ mode: normal
519
+ model_max_length: 2047
520
+ model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
521
+ model_revision: main
522
+ moe_aux_loss_coef: null
523
+ ms_hub_token: <MS_HUB_TOKEN>
524
+ new_special_tokens_config: null
525
+ offload_folder: offload
526
+ om_hub_token: <OM_HUB_TOKEN>
527
+ print_param_status: false
528
+ quantization_bit: null
529
+ quantization_device_map: null
530
+ quantization_method: BNB
531
+ quantization_type: nf4
532
+ resize_vocab: false
533
+ rope_scaling: null
534
+ sglang_config: null
535
+ sglang_lora_backend: triton
536
+ sglang_maxlen: 4096
537
+ sglang_mem_fraction: 0.7
538
+ sglang_tp_size: -1
539
+ shift_attn: false
540
+ split_special_tokens: false
541
+ train_from_scratch: false
542
+ trust_remote_code: true
543
+ upcast_layernorm: false
544
+ upcast_lmhead_output: false
545
+ use_audio_in_video: false
546
+ use_fast_tokenizer: true
547
+ use_kt: false
548
+ use_kv_cache: true
549
+ use_reentrant_gc: true
550
+ use_unsloth: false
551
+ use_unsloth_gc: false
552
+ use_v1_kernels: false
553
+ video_fps: 2
554
+ video_max_pixels: 65536
555
+ video_maxlen: 128
556
+ video_min_pixels: 256
557
+ vllm_config: null
558
+ vllm_enforce_eager: false
559
+ vllm_gpu_util: 0.7
560
+ vllm_max_lora_rank: 32
561
+ vllm_maxlen: 4096
562
+ model_type:
563
+ value: qwen3
564
+ neftune_noise_alpha:
565
+ value: null
566
+ num_attention_heads:
567
+ value: 32
568
+ num_hidden_layers:
569
+ value: 36
570
+ num_key_value_heads:
571
+ value: 8
572
+ num_train_epochs:
573
+ value: 5
574
+ optim:
575
+ value: adamw_torch
576
+ optim_args:
577
+ value: null
578
+ optim_target_modules:
579
+ value: null
580
+ output_attentions:
581
+ value: false
582
+ output_dir:
583
+ value: /workspace/v127rc_exp1/E_mul
584
+ output_hidden_states:
585
+ value: false
586
+ overwrite_output_dir:
587
+ value: false
588
+ pad_token_id:
589
+ value: 151643
590
+ parallelism_config:
591
+ value: null
592
+ peft_config:
593
+ value:
594
+ default:
595
+ alora_invocation_tokens: null
596
+ arrow_config: null
597
+ auto_mapping: null
598
+ base_model_name_or_path: /workspace/Qwen/Qwen3-8B-Base
599
+ bias: none
600
+ corda_config: null
601
+ ensure_weight_tying: false
602
+ eva_config: null
603
+ exclude_modules: null
604
+ fan_in_fan_out: false
605
+ inference_mode: false
606
+ init_lora_weights: true
607
+ layer_replication: null
608
+ layers_pattern: null
609
+ layers_to_transform: null
610
+ lora_alpha: 32
611
+ lora_bias: false
612
+ lora_dropout: 0.03
613
+ megatron_config: null
614
+ megatron_core: megatron.core
615
+ modules_to_save: null
616
+ peft_type: LORA
617
+ peft_version: 0.18.1
618
+ qalora_group_size: 16
619
+ r: 16
620
+ revision: null
621
+ runtime_config:
622
+ ephemeral_gpu_offload: false
623
+ target_modules:
624
+ - v_proj
625
+ - gate_proj
626
+ - o_proj
627
+ - up_proj
628
+ - k_proj
629
+ - down_proj
630
+ - q_proj
631
+ target_parameters: null
632
+ task_type: CAUSAL_LM
633
+ trainable_token_indices: null
634
+ use_dora: false
635
+ use_qalora: false
636
+ use_rslora: false
637
+ per_device_eval_batch_size:
638
+ value: 8
639
+ per_device_train_batch_size:
640
+ value: 1
641
+ predict_with_generate:
642
+ value: false
643
+ prediction_loss_only:
644
+ value: false
645
+ problem_type:
646
+ value: null
647
+ project:
648
+ value: huggingface
649
+ push_to_hub:
650
+ value: false
651
+ ray_init_kwargs:
652
+ value: null
653
+ ray_num_workers:
654
+ value: 1
655
+ remove_unused_columns:
656
+ value: false
657
+ report_to:
658
+ value:
659
+ - wandb
660
+ restore_callback_states_from_checkpoint:
661
+ value: false
662
+ resume_from_checkpoint:
663
+ value: null
664
+ return_dict:
665
+ value: true
666
+ rms_norm_eps:
667
+ value: 1e-06
668
+ rope_parameters:
669
+ value:
670
+ rope_theta: 1000000
671
+ rope_type: default
672
+ run_name:
673
+ value: null
674
+ save_on_each_node:
675
+ value: false
676
+ save_only_model:
677
+ value: true
678
+ save_steps:
679
+ value: 1000
680
+ save_strategy:
681
+ value: steps
682
+ save_total_limit:
683
+ value: null
684
+ seed:
685
+ value: 42
686
+ skip_memory_metrics:
687
+ value: true
688
+ sliding_window:
689
+ value: null
690
+ sortish_sampler:
691
+ value: false
692
+ tf32:
693
+ value: null
694
+ tie_word_embeddings:
695
+ value: false
696
+ torch_compile:
697
+ value: false
698
+ torch_compile_backend:
699
+ value: null
700
+ torch_compile_mode:
701
+ value: null
702
+ torch_empty_cache_steps:
703
+ value: null
704
+ trackio_space_id:
705
+ value: trackio
706
+ transformers_version:
707
+ value: 5.0.0
708
+ use_cache:
709
+ value: false
710
+ use_cpu:
711
+ value: false
712
+ use_liger_kernel:
713
+ value: false
714
+ use_sliding_window:
715
+ value: false
716
+ vocab_size:
717
+ value: 151936
718
+ warmup_ratio:
719
+ value: 0.02
720
+ warmup_steps:
721
+ value: 0.02
722
+ weight_decay:
723
+ value: 0
LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/requirements.txt ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pytz==2025.2
2
+ pydub==0.25.1
3
+ brotli==1.2.0
4
+ antlr4-python3-runtime==4.9.3
5
+ xxhash==3.6.0
6
+ websockets==15.0.1
7
+ tzdata==2025.3
8
+ typing_extensions==4.15.0
9
+ tqdm==4.67.3
10
+ tomlkit==0.13.3
11
+ termcolor==3.3.0
12
+ shtab==1.8.0
13
+ shellingham==1.5.4
14
+ sentencepiece==0.2.1
15
+ semantic-version==2.10.0
16
+ safetensors==0.7.0
17
+ ruff==0.15.0
18
+ regex==2026.1.15
19
+ python-multipart==0.0.22
20
+ pyparsing==3.3.2
21
+ pyarrow==23.0.0
22
+ protobuf==6.33.5
23
+ propcache==0.4.1
24
+ orjson==3.11.7
25
+ omegaconf==2.3.0
26
+ numpy==2.4.2
27
+ multidict==6.7.1
28
+ mdurl==0.1.2
29
+ kiwisolver==1.4.9
30
+ hf-xet==1.2.0
31
+ hf_transfer==0.1.9
32
+ groovy==0.1.2
33
+ frozenlist==1.8.0
34
+ fonttools==4.61.1
35
+ ffmpy==1.0.0
36
+ einops==0.8.2
37
+ docstring_parser==0.17.0
38
+ dill==0.3.8
39
+ cycler==0.12.1
40
+ click==8.3.1
41
+ av==16.0.0
42
+ annotated-types==0.7.0
43
+ annotated-doc==0.0.4
44
+ aiohappyeyeballs==2.6.1
45
+ aiofiles==24.1.0
46
+ yarl==1.22.0
47
+ uvicorn==0.40.0
48
+ typing-inspection==0.4.2
49
+ typer-slim==0.21.1
50
+ tiktoken==0.12.0
51
+ scipy==1.17.0
52
+ pydantic_core==2.41.4
53
+ pandas==2.3.3
54
+ multiprocess==0.70.16
55
+ modelscope==1.34.0
56
+ markdown-it-py==4.0.0
57
+ fire==0.7.1
58
+ contourpy==1.3.3
59
+ anyio==4.12.1
60
+ aiosignal==1.4.0
61
+ starlette==0.50.0
62
+ rich==14.3.2
63
+ pydantic==2.12.3
64
+ matplotlib==3.10.8
65
+ aiohttp==3.13.3
66
+ tyro==0.8.14
67
+ typer==0.21.1
68
+ torchdata==0.11.0
69
+ sse-starlette==3.2.0
70
+ safehttpx==0.1.7
71
+ huggingface_hub==1.4.0
72
+ fastapi==0.128.1
73
+ tokenizers==0.22.2
74
+ gradio_client==1.14.0
75
+ datasets==4.0.0
76
+ accelerate==1.11.0
77
+ transformers==5.0.0
78
+ gradio==5.50.0
79
+ trl==0.24.0
80
+ peft==0.18.1
81
+ llamafactory==0.9.5.dev0
82
+ jieba==0.42.1
83
+ rouge-chinese==1.0.3
84
+ joblib==1.5.3
85
+ nltk==3.9.2
86
+ py-cpuinfo==9.0.0
87
+ nvidia-ml-py==13.590.48
88
+ hjson==3.1.0
89
+ ninja==1.13.0
90
+ msgpack==1.1.2
91
+ deepspeed==0.16.9
92
+ smmap==5.0.2
93
+ sentry-sdk==2.52.0
94
+ gitdb==4.0.12
95
+ GitPython==3.1.46
96
+ wandb==0.24.2
97
+ entrypoints==0.4
98
+ jupyter_client==7.4.9
99
+ nbclassic==1.1.0
100
+ notebook==6.5.5
101
+ pyzmq==24.0.1
102
+ PyYAML==6.0.2
103
+ Send2Trash==1.8.3
104
+ argon2-cffi==23.1.0
105
+ argon2-cffi-bindings==21.2.0
106
+ arrow==1.3.0
107
+ asttokens==2.4.1
108
+ async-lru==2.0.4
109
+ attrs==24.2.0
110
+ babel==2.16.0
111
+ beautifulsoup4==4.12.3
112
+ bleach==6.1.0
113
+ certifi==2024.8.30
114
+ cffi==1.17.1
115
+ charset-normalizer==3.3.2
116
+ comm==0.2.2
117
+ debugpy==1.8.5
118
+ decorator==5.1.1
119
+ defusedxml==0.7.1
120
+ executing==2.1.0
121
+ fastjsonschema==2.20.0
122
+ fqdn==1.5.1
123
+ h11==0.14.0
124
+ httpcore==1.0.5
125
+ httpx==0.27.2
126
+ idna==3.10
127
+ ipykernel==6.29.5
128
+ ipython==8.27.0
129
+ ipython-genutils==0.2.0
130
+ ipywidgets==8.1.5
131
+ isoduration==20.11.0
132
+ jedi==0.19.1
133
+ json5==0.9.25
134
+ jsonpointer==3.0.0
135
+ jsonschema==4.23.0
136
+ jsonschema-specifications==2023.12.1
137
+ jupyter-archive==3.4.0
138
+ jupyter_contrib_core==0.4.2
139
+ jupyter_contrib_nbextensions==0.7.0
140
+ jupyter_core==5.7.2
141
+ jupyter-events==0.10.0
142
+ jupyter-highlight-selected-word==0.2.0
143
+ jupyter-lsp==2.2.5
144
+ jupyter_nbextensions_configurator==0.6.4
145
+ jupyter_server==2.14.2
146
+ jupyter_server_terminals==0.5.3
147
+ jupyterlab==4.2.5
148
+ jupyterlab_pygments==0.3.0
149
+ jupyterlab_server==2.27.3
150
+ jupyterlab_widgets==3.0.13
151
+ lxml==5.3.0
152
+ matplotlib-inline==0.1.7
153
+ mistune==3.0.2
154
+ nbclient==0.10.0
155
+ nbconvert==7.16.4
156
+ nbformat==5.10.4
157
+ nest-asyncio==1.6.0
158
+ notebook_shim==0.2.4
159
+ overrides==7.7.0
160
+ packaging==24.1
161
+ pandocfilters==1.5.1
162
+ parso==0.8.4
163
+ pexpect==4.9.0
164
+ platformdirs==4.3.6
165
+ prometheus_client==0.21.0
166
+ prompt_toolkit==3.0.47
167
+ psutil==6.0.0
168
+ ptyprocess==0.7.0
169
+ pure_eval==0.2.3
170
+ pycparser==2.22
171
+ Pygments==2.18.0
172
+ python-dateutil==2.9.0.post0
173
+ python-json-logger==2.0.7
174
+ referencing==0.35.1
175
+ requests==2.32.3
176
+ rfc3339-validator==0.1.4
177
+ rfc3986-validator==0.1.1
178
+ rpds-py==0.20.0
179
+ sniffio==1.3.1
180
+ soupsieve==2.6
181
+ stack-data==0.6.3
182
+ terminado==0.18.1
183
+ tinycss2==1.3.0
184
+ tornado==6.4.1
185
+ traitlets==5.14.3
186
+ types-python-dateutil==2.9.0.20240906
187
+ uri-template==1.3.0
188
+ urllib3==2.2.3
189
+ wcwidth==0.2.13
190
+ webcolors==24.8.0
191
+ webencodings==0.5.1
192
+ websocket-client==1.8.0
193
+ widgetsnbextension==4.0.13
194
+ Jinja2==3.1.3
195
+ MarkupSafe==2.1.5
196
+ filelock==3.13.1
197
+ fsspec==2024.2.0
198
+ mpmath==1.3.0
199
+ networkx==3.2.1
200
+ nvidia-cublas-cu12==12.4.2.65
201
+ nvidia-cuda-cupti-cu12==12.4.99
202
+ nvidia-cuda-nvrtc-cu12==12.4.99
203
+ nvidia-cuda-runtime-cu12==12.4.99
204
+ nvidia-cudnn-cu12==9.1.0.70
205
+ nvidia-cufft-cu12==11.2.0.44
206
+ nvidia-curand-cu12==10.3.5.119
207
+ nvidia-cusolver-cu12==11.6.0.99
208
+ nvidia-cusparse-cu12==12.3.0.142
209
+ nvidia-nccl-cu12==2.20.5
210
+ nvidia-nvjitlink-cu12==12.4.99
211
+ nvidia-nvtx-cu12==12.4.99
212
+ pillow==10.2.0
213
+ sympy==1.12
214
+ torch==2.4.1+cu124
215
+ torchaudio==2.4.1+cu124
216
+ torchvision==0.19.1+cu124
217
+ triton==3.0.0
218
+ pip==24.2
219
+ setuptools==75.1.0
220
+ wheel==0.44.0
221
+ PyGObject==3.42.1
222
+ PyJWT==2.3.0
223
+ SecretStorage==3.3.1
224
+ blinker==1.4
225
+ cryptography==3.4.8
226
+ dbus-python==1.2.18
227
+ distro==1.7.0
228
+ httplib2==0.20.2
229
+ importlib-metadata==4.6.4
230
+ jeepney==0.7.1
231
+ keyring==23.5.0
232
+ launchpadlib==1.10.16
233
+ lazr.restfulclient==0.14.4
234
+ lazr.uri==1.0.6
235
+ more-itertools==8.10.0
236
+ oauthlib==3.2.0
237
+ python-apt==2.4.0+ubuntu4
238
+ six==1.16.0
239
+ wadllib==1.3.6
240
+ zipp==1.0.0
241
+ autocommand==2.2.2
242
+ backports.tarfile==1.2.0
243
+ importlib_metadata==8.0.0
244
+ importlib_resources==6.4.0
245
+ inflect==7.3.1
246
+ jaraco.collections==5.1.0
247
+ jaraco.context==5.3.0
248
+ jaraco.functools==4.0.1
249
+ jaraco.text==3.12.1
250
+ more-itertools==10.3.0
251
+ packaging==24.1
252
+ platformdirs==4.2.2
253
+ tomli==2.0.1
254
+ typeguard==4.3.0
255
+ typing_extensions==4.12.2
256
+ wheel==0.43.0
257
+ zipp==3.19.2
LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-metadata.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-52-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.11.10",
4
+ "startedAt": "2026-02-05T02:37:31.256607Z",
5
+ "args": [
6
+ "/workspace/v127rc_exp1/E_mul.yaml"
7
+ ],
8
+ "program": "/usr/local/bin/llamafactory-cli",
9
+ "git": {
10
+ "remote": "https://github.com/hiyouga/LlamaFactory.git",
11
+ "commit": "1a02717fa84c270d1c156c4c4a391c2f95525a63"
12
+ },
13
+ "email": "markmochi200@gmail.com",
14
+ "root": "/workspace/LlamaFactory",
15
+ "host": "682d471c1c72",
16
+ "executable": "/usr/bin/python",
17
+ "cpu_count": 24,
18
+ "cpu_count_logical": 48,
19
+ "gpu": "NVIDIA GeForce RTX 4090",
20
+ "gpu_count": 1,
21
+ "disk": {
22
+ "/": {
23
+ "total": "21474836480",
24
+ "used": "2594168832"
25
+ }
26
+ },
27
+ "memory": {
28
+ "total": "269721997312"
29
+ },
30
+ "gpu_nvidia": [
31
+ {
32
+ "name": "NVIDIA GeForce RTX 4090",
33
+ "memoryTotal": "25757220864",
34
+ "cudaCores": 16384,
35
+ "architecture": "Ada",
36
+ "uuid": "GPU-f9c17fa7-295e-e688-fe65-f3659fffa9a3"
37
+ }
38
+ ],
39
+ "cudaVersion": "12.7",
40
+ "writerId": "be8ic28wchhzrbkqsu0bl7jl1lfwezfn"
41
+ }
LlamaFactory/wandb/run-20260205_023731-1lb2e6m1/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train_runtime":202598.5168,"train_samples_per_second":0.963,"_timestamp":1.770461649358481e+09,"_step":195010,"train/train_tokens_per_second":1970.359,"train/loss":0.7374985218048096,"train/grad_norm":2.825721025466919,"train/global_step":195010,"_runtime":202601,"_wandb":{"runtime":202601},"train/epoch":5,"total_flos":1.8231724481360794e+19,"train/learning_rate":3.3779062880157087e-15,"train_loss":0.3935867749506399,"train_steps_per_second":0.963,"train/num_input_tokens_seen":399185470}