rgb255 committed on
Commit
00c350c
·
verified ·
1 Parent(s): a5835d3

Upload LoRA adapter (Fixed README metadata)

Browse files
README.md CHANGED
@@ -1,6 +1,8 @@
1
  ---
2
  base_model: Qwen/Qwen3-4B-Instruct-2507
3
  datasets:
 
 
4
  - u-10bei/structured_data_with_cot_dataset_512_v2
5
  language:
6
  - en
@@ -34,9 +36,9 @@ while intermediate reasoning (Chain-of-Thought) is masked.
34
  - Base model: Qwen/Qwen3-4B-Instruct-2507
35
  - Method: QLoRA (4-bit)
36
  - Max sequence length: 512
37
- - Epochs: 2
38
  - Learning rate: 2e-04
39
- - LoRA: r=256, alpha=32
40
 
41
  ## Usage
42
 
@@ -59,7 +61,7 @@ model = PeftModel.from_pretrained(model, adapter)
59
 
60
  ## Sources & Terms (IMPORTANT)
61
 
62
- Training data: u-10bei/structured_data_with_cot_dataset_512_v2
63
 
64
  Dataset License: MIT License. This dataset is used and distributed under the terms of the MIT License.
65
  Compliance: Users must comply with the MIT license (including copyright notice) and the base model's original terms of use.
 
1
  ---
2
  base_model: Qwen/Qwen3-4B-Instruct-2507
3
  datasets:
4
+ - daichira/structured-hard-sft-4k
5
+ - u-10bei/structured_data_with_cot_dataset_512
6
  - u-10bei/structured_data_with_cot_dataset_512_v2
7
  language:
8
  - en
 
36
  - Base model: Qwen/Qwen3-4B-Instruct-2507
37
  - Method: QLoRA (4-bit)
38
  - Max sequence length: 512
39
+ - Epochs: 1
40
  - Learning rate: 2e-04
41
+ - LoRA: r=128, alpha=24
42
 
43
  ## Usage
44
 
 
61
 
62
  ## Sources & Terms (IMPORTANT)
63
 
64
+ Training data: daichira/structured-hard-sft-4k, u-10bei/structured_data_with_cot_dataset_512, u-10bei/structured_data_with_cot_dataset_512_v2
65
 
66
  Dataset License: MIT License. This dataset is used and distributed under the terms of the MIT License.
67
  Compliance: Users must comply with the MIT license (including copyright notice) and the base model's original terms of use.
adapter_config.json CHANGED
@@ -20,7 +20,7 @@
20
  "layers_pattern": null,
21
  "layers_to_transform": null,
22
  "loftq_config": {},
23
- "lora_alpha": 32,
24
  "lora_bias": false,
25
  "lora_dropout": 0.0,
26
  "megatron_config": null,
@@ -29,17 +29,17 @@
29
  "peft_type": "LORA",
30
  "peft_version": "0.18.1",
31
  "qalora_group_size": 16,
32
- "r": 256,
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
36
- "gate_proj",
37
  "down_proj",
38
  "k_proj",
39
  "up_proj",
 
40
  "o_proj",
41
  "v_proj",
42
- "q_proj"
43
  ],
44
  "target_parameters": null,
45
  "task_type": "CAUSAL_LM",
 
20
  "layers_pattern": null,
21
  "layers_to_transform": null,
22
  "loftq_config": {},
23
+ "lora_alpha": 24,
24
  "lora_bias": false,
25
  "lora_dropout": 0.0,
26
  "megatron_config": null,
 
29
  "peft_type": "LORA",
30
  "peft_version": "0.18.1",
31
  "qalora_group_size": 16,
32
+ "r": 128,
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
 
36
  "down_proj",
37
  "k_proj",
38
  "up_proj",
39
+ "q_proj",
40
  "o_proj",
41
  "v_proj",
42
+ "gate_proj"
43
  ],
44
  "target_parameters": null,
45
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f7e32a8d79b5e697d84e7807625629d764aff33c2a8a303e7ff31d5cb7fe96d
3
- size 2113998360
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b0e60c482cea3edf2d00118f32c08cf82ca46f1acc8cbe48cbb11cd70e1befe
3
+ size 1057033224
all_experiments_details.json CHANGED
@@ -2,13 +2,17 @@
2
  "Experiment_1": {
3
  "config": {
4
  "BASE_MODEL_ID": "Qwen/Qwen3-4B-Instruct-2507",
5
- "DATASET_ID": "u-10bei/structured_data_with_cot_dataset_512_v2",
 
 
 
 
6
  "BASE_OUT_DIR": "./lora_experiments",
7
  "SEED": 3407,
8
  "VAL_RATIO": 0.05,
9
  "MAX_SEQ_LEN": 512,
10
- "LORA_R": 256,
11
- "LORA_ALPHA": 32,
12
  "LORA_DROPOUT": 0.0,
13
  "LORA_TARGET_MODULES": [
14
  "q_proj",
@@ -19,7 +23,7 @@
19
  "up_proj",
20
  "down_proj"
21
  ],
22
- "EPOCHS": 2,
23
  "PER_DEVICE_TRAIN_BS": 2,
24
  "PER_DEVICE_EVAL_BS": 2,
25
  "GRAD_ACCUM": 8,
@@ -48,386 +52,487 @@
48
  },
49
  "history": [
50
  {
51
- "loss": 1.3966,
52
- "grad_norm": 1.0872466564178467,
53
- "learning_rate": 4e-05,
54
- "epoch": 0.04501969611705121,
55
  "step": 10
56
  },
57
  {
58
- "loss": 0.9325,
59
- "grad_norm": 0.9557391405105591,
60
- "learning_rate": 8.444444444444444e-05,
61
- "epoch": 0.09003939223410241,
62
  "step": 20
63
  },
64
  {
65
- "loss": 0.8011,
66
- "grad_norm": 0.513979971408844,
67
- "learning_rate": 0.00012888888888888892,
68
- "epoch": 0.13505908835115363,
69
  "step": 30
70
  },
71
  {
72
- "loss": 0.8125,
73
- "grad_norm": 0.5614296197891235,
74
- "learning_rate": 0.00017333333333333334,
75
- "epoch": 0.18007878446820483,
76
  "step": 40
77
  },
78
  {
79
- "loss": 0.7486,
80
- "grad_norm": 0.32493776082992554,
81
- "learning_rate": 0.0001999509018141497,
82
- "epoch": 0.22509848058525606,
83
  "step": 50
84
  },
85
  {
86
- "eval_loss": 0.8527934551239014,
87
- "eval_runtime": 25.8751,
88
- "eval_samples_per_second": 7.227,
89
- "eval_steps_per_second": 3.633,
90
- "epoch": 0.22509848058525606,
91
  "step": 50
92
  },
93
  {
94
- "loss": 0.7415,
95
- "grad_norm": 0.2533496022224426,
96
- "learning_rate": 0.00019939910076582706,
97
- "epoch": 0.27011817670230726,
98
  "step": 60
99
  },
100
  {
101
- "loss": 0.8629,
102
- "grad_norm": 0.3107249438762665,
103
- "learning_rate": 0.00019823752233636866,
104
- "epoch": 0.31513787281935846,
105
  "step": 70
106
  },
107
  {
108
- "loss": 0.7151,
109
- "grad_norm": 0.32290422916412354,
110
- "learning_rate": 0.00019647329238755036,
111
- "epoch": 0.36015756893640966,
112
  "step": 80
113
  },
114
  {
115
- "loss": 0.7843,
116
- "grad_norm": 0.3160030245780945,
117
- "learning_rate": 0.0001941172338293343,
118
- "epoch": 0.4051772650534609,
119
  "step": 90
120
  },
121
  {
122
- "loss": 0.7931,
123
- "grad_norm": 0.40372225642204285,
124
- "learning_rate": 0.00019118380022524738,
125
- "epoch": 0.4501969611705121,
126
  "step": 100
127
  },
128
  {
129
- "eval_loss": 0.7929844260215759,
130
- "eval_runtime": 25.6248,
131
- "eval_samples_per_second": 7.298,
132
- "eval_steps_per_second": 3.668,
133
- "epoch": 0.4501969611705121,
134
  "step": 100
135
  },
136
  {
137
- "loss": 0.812,
138
- "grad_norm": 0.4371040165424347,
139
- "learning_rate": 0.0001876909871250184,
140
- "epoch": 0.4952166572875633,
141
  "step": 110
142
  },
143
  {
144
- "loss": 0.7258,
145
- "grad_norm": 0.28696581721305847,
146
- "learning_rate": 0.00018366022166841676,
147
- "epoch": 0.5402363534046145,
148
  "step": 120
149
  },
150
  {
151
- "loss": 0.7824,
152
- "grad_norm": 0.9101247787475586,
153
- "learning_rate": 0.0001791162311375321,
154
- "epoch": 0.5852560495216658,
155
  "step": 130
156
  },
157
  {
158
- "loss": 0.7643,
159
- "grad_norm": 0.2658868730068207,
160
- "learning_rate": 0.00017408689126387995,
161
- "epoch": 0.6302757456387169,
162
  "step": 140
163
  },
164
  {
165
- "loss": 0.711,
166
- "grad_norm": 0.43262964487075806,
167
- "learning_rate": 0.0001686030552209133,
168
- "epoch": 0.6752954417557682,
169
  "step": 150
170
  },
171
  {
172
- "eval_loss": 0.7660654783248901,
173
- "eval_runtime": 24.682,
174
- "eval_samples_per_second": 7.576,
175
- "eval_steps_per_second": 3.808,
176
- "epoch": 0.6752954417557682,
177
  "step": 150
178
  },
179
  {
180
- "loss": 0.6915,
181
- "grad_norm": 0.36354902386665344,
182
- "learning_rate": 0.00016269836435100934,
183
- "epoch": 0.7203151378728193,
184
  "step": 160
185
  },
186
  {
187
- "loss": 0.7233,
188
- "grad_norm": 0.3520168662071228,
189
- "learning_rate": 0.0001564090417880529,
190
- "epoch": 0.7653348339898706,
191
  "step": 170
192
  },
193
  {
194
- "loss": 0.6626,
195
- "grad_norm": 0.33897820115089417,
196
- "learning_rate": 0.0001497736702416662,
197
- "epoch": 0.8103545301069218,
198
  "step": 180
199
  },
200
  {
201
- "loss": 0.6801,
202
- "grad_norm": 0.4690793752670288,
203
- "learning_rate": 0.00014283295530629877,
204
- "epoch": 0.855374226223973,
205
  "step": 190
206
  },
207
  {
208
- "loss": 0.6281,
209
- "grad_norm": 0.42146065831184387,
210
- "learning_rate": 0.00013562947574718976,
211
- "epoch": 0.9003939223410242,
212
  "step": 200
213
  },
214
  {
215
- "eval_loss": 0.6899478435516357,
216
- "eval_runtime": 24.6215,
217
- "eval_samples_per_second": 7.595,
218
- "eval_steps_per_second": 3.818,
219
- "epoch": 0.9003939223410242,
220
  "step": 200
221
  },
222
  {
223
- "loss": 0.6237,
224
- "grad_norm": 0.38645192980766296,
225
- "learning_rate": 0.00012820742229510817,
226
- "epoch": 0.9454136184580754,
227
  "step": 210
228
  },
229
  {
230
- "loss": 0.5856,
231
- "grad_norm": 0.4482150375843048,
232
- "learning_rate": 0.00012061232655226964,
233
- "epoch": 0.9904333145751266,
234
  "step": 220
235
  },
236
  {
237
- "loss": 0.4553,
238
- "grad_norm": 0.5086686015129089,
239
- "learning_rate": 0.00011289078167249402,
240
- "epoch": 1.0315137872819358,
241
  "step": 230
242
  },
243
  {
244
- "loss": 0.5029,
245
- "grad_norm": 0.8159873485565186,
246
- "learning_rate": 0.00010509015652912966,
247
- "epoch": 1.076533483398987,
248
  "step": 240
249
  },
250
  {
251
- "loss": 0.5179,
252
- "grad_norm": 0.5635101199150085,
253
- "learning_rate": 9.72583051242198e-05,
254
- "epoch": 1.1215531795160383,
255
  "step": 250
256
  },
257
  {
258
- "eval_loss": 0.5836588144302368,
259
- "eval_runtime": 24.7575,
260
- "eval_samples_per_second": 7.553,
261
- "eval_steps_per_second": 3.797,
262
- "epoch": 1.1215531795160383,
263
  "step": 250
264
  },
265
  {
266
- "loss": 0.514,
267
- "grad_norm": 0.661852240562439,
268
- "learning_rate": 8.944327302158073e-05,
269
- "epoch": 1.1665728756330895,
270
  "step": 260
271
  },
272
  {
273
- "loss": 0.4472,
274
- "grad_norm": 0.4757942259311676,
275
- "learning_rate": 8.169300260471818e-05,
276
- "epoch": 1.2115925717501406,
277
  "step": 270
278
  },
279
  {
280
- "loss": 0.4286,
281
- "grad_norm": 0.6126232743263245,
282
- "learning_rate": 7.405503896771729e-05,
283
- "epoch": 1.2566122678671918,
284
  "step": 280
285
  },
286
  {
287
- "loss": 0.4862,
288
- "grad_norm": 0.5707330703735352,
289
- "learning_rate": 6.65762382433589e-05,
290
- "epoch": 1.301631963984243,
291
  "step": 290
292
  },
293
  {
294
- "loss": 0.4352,
295
- "grad_norm": 0.3476680815219879,
296
- "learning_rate": 5.930248015776325e-05,
297
- "epoch": 1.3466516601012943,
298
  "step": 300
299
  },
300
  {
301
- "eval_loss": 0.4677433371543884,
302
- "eval_runtime": 24.7074,
303
- "eval_samples_per_second": 7.569,
304
- "eval_steps_per_second": 3.805,
305
- "epoch": 1.3466516601012943,
306
  "step": 300
307
  },
308
  {
309
- "loss": 0.3118,
310
- "grad_norm": 0.6370311379432678,
311
- "learning_rate": 5.227838657493396e-05,
312
- "epoch": 1.3916713562183456,
313
  "step": 310
314
  },
315
  {
316
- "loss": 0.3847,
317
- "grad_norm": 0.5159108638763428,
318
- "learning_rate": 4.5547047757828985e-05,
319
- "epoch": 1.4366910523353966,
320
  "step": 320
321
  },
322
  {
323
- "loss": 0.3931,
324
- "grad_norm": 0.6911277174949646,
325
- "learning_rate": 3.914975802524806e-05,
326
- "epoch": 1.4817107484524479,
327
  "step": 330
328
  },
329
  {
330
- "loss": 0.5302,
331
- "grad_norm": 0.5555063486099243,
332
- "learning_rate": 3.312576242618511e-05,
333
- "epoch": 1.5267304445694991,
334
  "step": 340
335
  },
336
  {
337
- "loss": 0.3183,
338
- "grad_norm": 0.8983607292175293,
339
- "learning_rate": 2.7512015985706418e-05,
340
- "epoch": 1.5717501406865504,
341
  "step": 350
342
  },
343
  {
344
- "eval_loss": 0.3523830473423004,
345
- "eval_runtime": 24.7895,
346
- "eval_samples_per_second": 7.544,
347
- "eval_steps_per_second": 3.792,
348
- "epoch": 1.5717501406865504,
349
  "step": 350
350
  },
351
  {
352
- "loss": 0.3694,
353
- "grad_norm": 0.5024566054344177,
354
- "learning_rate": 2.234295699929413e-05,
355
- "epoch": 1.6167698368036016,
356
  "step": 360
357
  },
358
  {
359
- "loss": 0.3502,
360
- "grad_norm": 0.5496794581413269,
361
- "learning_rate": 1.7650295766411605e-05,
362
- "epoch": 1.6617895329206527,
363
  "step": 370
364
  },
365
  {
366
- "loss": 0.3006,
367
- "grad_norm": 0.4716707766056061,
368
- "learning_rate": 1.3462820059333403e-05,
369
- "epoch": 1.7068092290377042,
370
  "step": 380
371
  },
372
  {
373
- "loss": 0.3471,
374
- "grad_norm": 0.46408089995384216,
375
- "learning_rate": 9.80621852061826e-06,
376
- "epoch": 1.7518289251547552,
377
  "step": 390
378
  },
379
  {
380
- "loss": 0.2301,
381
- "grad_norm": 0.41809141635894775,
382
- "learning_rate": 6.702923072617129e-06,
383
- "epoch": 1.7968486212718064,
384
  "step": 400
385
  },
386
  {
387
- "eval_loss": 0.3102871775627136,
388
- "eval_runtime": 24.3436,
389
- "eval_samples_per_second": 7.682,
390
- "eval_steps_per_second": 3.861,
391
- "epoch": 1.7968486212718064,
392
  "step": 400
393
  },
394
  {
395
- "loss": 0.2704,
396
- "grad_norm": 0.4462619423866272,
397
- "learning_rate": 4.171971305776945e-06,
398
- "epoch": 1.8418683173888577,
399
  "step": 410
400
  },
401
  {
402
- "loss": 0.2168,
403
- "grad_norm": 0.4414360523223877,
404
- "learning_rate": 2.2288896899377186e-06,
405
- "epoch": 1.8868880135059087,
406
  "step": 420
407
  },
408
  {
409
- "loss": 0.2626,
410
- "grad_norm": 0.3277634084224701,
411
- "learning_rate": 8.855983250793288e-07,
412
- "epoch": 1.93190770962296,
413
  "step": 430
414
  },
415
  {
416
- "loss": 0.342,
417
- "grad_norm": 0.4310093820095062,
418
- "learning_rate": 1.5033781583758678e-07,
419
- "epoch": 1.9769274057400112,
420
  "step": 440
421
  },
422
  {
423
- "train_runtime": 3776.627,
424
- "train_samples_per_second": 1.882,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
425
  "train_steps_per_second": 0.118,
426
- "total_flos": 7.57271106173184e+16,
427
- "train_loss": 0.5718902958882763,
428
- "epoch": 2.0,
429
- "step": 446,
430
- "total_runtime_sec": 3777.7196531295776
431
  }
432
  ]
433
  }
 
2
  "Experiment_1": {
3
  "config": {
4
  "BASE_MODEL_ID": "Qwen/Qwen3-4B-Instruct-2507",
5
+ "DATASET_ID": [
6
+ "daichira/structured-hard-sft-4k",
7
+ "u-10bei/structured_data_with_cot_dataset_512",
8
+ "u-10bei/structured_data_with_cot_dataset_512_v2"
9
+ ],
10
  "BASE_OUT_DIR": "./lora_experiments",
11
  "SEED": 3407,
12
  "VAL_RATIO": 0.05,
13
  "MAX_SEQ_LEN": 512,
14
+ "LORA_R": 128,
15
+ "LORA_ALPHA": 24,
16
  "LORA_DROPOUT": 0.0,
17
  "LORA_TARGET_MODULES": [
18
  "q_proj",
 
23
  "up_proj",
24
  "down_proj"
25
  ],
26
+ "EPOCHS": 1,
27
  "PER_DEVICE_TRAIN_BS": 2,
28
  "PER_DEVICE_EVAL_BS": 2,
29
  "GRAD_ACCUM": 8,
 
52
  },
53
  "history": [
54
  {
55
+ "loss": 1.4503,
56
+ "grad_norm": 0.6611063480377197,
57
+ "learning_rate": 3.2142857142857144e-05,
58
+ "epoch": 0.018144704014515765,
59
  "step": 10
60
  },
61
  {
62
+ "loss": 1.0865,
63
+ "grad_norm": 0.9616151452064514,
64
+ "learning_rate": 6.785714285714286e-05,
65
+ "epoch": 0.03628940802903153,
66
  "step": 20
67
  },
68
  {
69
+ "loss": 0.9333,
70
+ "grad_norm": 0.47518712282180786,
71
+ "learning_rate": 0.00010357142857142859,
72
+ "epoch": 0.05443411204354729,
73
  "step": 30
74
  },
75
  {
76
+ "loss": 0.8193,
77
+ "grad_norm": 0.46924060583114624,
78
+ "learning_rate": 0.0001392857142857143,
79
+ "epoch": 0.07257881605806306,
80
  "step": 40
81
  },
82
  {
83
+ "loss": 0.7101,
84
+ "grad_norm": 0.3533737361431122,
85
+ "learning_rate": 0.000175,
86
+ "epoch": 0.09072352007257882,
87
  "step": 50
88
  },
89
  {
90
+ "eval_loss": 0.6981692910194397,
91
+ "eval_runtime": 76.8039,
92
+ "eval_samples_per_second": 5.95,
93
+ "eval_steps_per_second": 2.982,
94
+ "epoch": 0.09072352007257882,
95
  "step": 50
96
  },
97
  {
98
+ "loss": 0.7362,
99
+ "grad_norm": 0.4759365916252136,
100
+ "learning_rate": 0.0001999819475629623,
101
+ "epoch": 0.10886822408709458,
102
  "step": 60
103
  },
104
  {
105
+ "loss": 0.8242,
106
+ "grad_norm": 0.3445497155189514,
107
+ "learning_rate": 0.00019966119663520412,
108
+ "epoch": 0.12701292810161036,
109
  "step": 70
110
  },
111
  {
112
+ "loss": 0.816,
113
+ "grad_norm": 0.4535239338874817,
114
+ "learning_rate": 0.000198940761218769,
115
+ "epoch": 0.14515763211612612,
116
  "step": 80
117
  },
118
  {
119
+ "loss": 0.6256,
120
+ "grad_norm": 0.2236223965883255,
121
+ "learning_rate": 0.000197823530571169,
122
+ "epoch": 0.16330233613064188,
123
  "step": 90
124
  },
125
  {
126
+ "loss": 0.6991,
127
+ "grad_norm": 0.41939008235931396,
128
+ "learning_rate": 0.00019631398527035422,
129
+ "epoch": 0.18144704014515764,
130
  "step": 100
131
  },
132
  {
133
+ "eval_loss": 0.6606893539428711,
134
+ "eval_runtime": 76.0198,
135
+ "eval_samples_per_second": 6.012,
136
+ "eval_steps_per_second": 3.012,
137
+ "epoch": 0.18144704014515764,
138
  "step": 100
139
  },
140
  {
141
+ "loss": 0.6879,
142
+ "grad_norm": 0.21272054314613342,
143
+ "learning_rate": 0.00019441817924565786,
144
+ "epoch": 0.1995917441596734,
145
  "step": 110
146
  },
147
  {
148
+ "loss": 0.7107,
149
+ "grad_norm": 0.361630916595459,
150
+ "learning_rate": 0.0001921437154989221,
151
+ "epoch": 0.21773644817418916,
152
  "step": 120
153
  },
154
  {
155
+ "loss": 0.6812,
156
+ "grad_norm": 0.2023937702178955,
157
+ "learning_rate": 0.0001894997156131734,
158
+ "epoch": 0.23588115218870492,
159
  "step": 130
160
  },
161
  {
162
+ "loss": 0.7045,
163
+ "grad_norm": 0.30442872643470764,
164
+ "learning_rate": 0.00018649678317113084,
165
+ "epoch": 0.2540258562032207,
166
  "step": 140
167
  },
168
  {
169
+ "loss": 0.8145,
170
+ "grad_norm": 0.34262141585350037,
171
+ "learning_rate": 0.00018314696123025454,
172
+ "epoch": 0.27217056021773645,
173
  "step": 150
174
  },
175
  {
176
+ "eval_loss": 0.6401548385620117,
177
+ "eval_runtime": 75.3842,
178
+ "eval_samples_per_second": 6.062,
179
+ "eval_steps_per_second": 3.038,
180
+ "epoch": 0.27217056021773645,
181
  "step": 150
182
  },
183
  {
184
+ "loss": 0.6708,
185
+ "grad_norm": 0.307359904050827,
186
+ "learning_rate": 0.00017946368402487845,
187
+ "epoch": 0.29031526423225223,
188
  "step": 160
189
  },
190
  {
191
+ "loss": 0.7885,
192
+ "grad_norm": 0.29199010133743286,
193
+ "learning_rate": 0.00017546172308912213,
194
+ "epoch": 0.30845996824676797,
195
  "step": 170
196
  },
197
  {
198
+ "loss": 0.7305,
199
+ "grad_norm": 0.20208679139614105,
200
+ "learning_rate": 0.000171157128016652,
201
+ "epoch": 0.32660467226128376,
202
  "step": 180
203
  },
204
  {
205
+ "loss": 0.6736,
206
+ "grad_norm": 0.2638019025325775,
207
+ "learning_rate": 0.00016656716209487174,
208
+ "epoch": 0.3447493762757995,
209
  "step": 190
210
  },
211
  {
212
+ "loss": 0.5909,
213
+ "grad_norm": 0.1571992188692093,
214
+ "learning_rate": 0.00016171023307167545,
215
+ "epoch": 0.3628940802903153,
216
  "step": 200
217
  },
218
  {
219
+ "eval_loss": 0.6256077289581299,
220
+ "eval_runtime": 75.1757,
221
+ "eval_samples_per_second": 6.079,
222
+ "eval_steps_per_second": 3.046,
223
+ "epoch": 0.3628940802903153,
224
  "step": 200
225
  },
226
  {
227
+ "loss": 0.7633,
228
+ "grad_norm": 0.1657303422689438,
229
+ "learning_rate": 0.00015660581933241993,
230
+ "epoch": 0.381038784304831,
231
  "step": 210
232
  },
233
  {
234
+ "loss": 0.6618,
235
+ "grad_norm": 0.22835072875022888,
236
+ "learning_rate": 0.00015127439178317745,
237
+ "epoch": 0.3991834883193468,
238
  "step": 220
239
  },
240
  {
241
+ "loss": 0.6575,
242
+ "grad_norm": 0.2166450172662735,
243
+ "learning_rate": 0.0001457373317535515,
244
+ "epoch": 0.41732819233386254,
245
  "step": 230
246
  },
247
  {
248
+ "loss": 0.6041,
249
+ "grad_norm": 0.2187417596578598,
250
+ "learning_rate": 0.00014001684524830057,
251
+ "epoch": 0.4354728963483783,
252
  "step": 240
253
  },
254
  {
255
+ "loss": 0.7904,
256
+ "grad_norm": 0.17704260349273682,
257
+ "learning_rate": 0.00013413587389165784,
258
+ "epoch": 0.45361760036289406,
259
  "step": 250
260
  },
261
  {
262
+ "eval_loss": 0.6180712580680847,
263
+ "eval_runtime": 75.7849,
264
+ "eval_samples_per_second": 6.03,
265
+ "eval_steps_per_second": 3.022,
266
+ "epoch": 0.45361760036289406,
267
  "step": 250
268
  },
269
  {
270
+ "loss": 0.7716,
271
+ "grad_norm": 0.22328545153141022,
272
+ "learning_rate": 0.0001281180029214988,
273
+ "epoch": 0.47176230437740985,
274
  "step": 260
275
  },
276
  {
277
+ "loss": 0.647,
278
+ "grad_norm": 0.1940474510192871,
279
+ "learning_rate": 0.00012198736660234009,
280
+ "epoch": 0.4899070083919256,
281
  "step": 270
282
  },
283
  {
284
+ "loss": 0.6896,
285
+ "grad_norm": 0.18775729835033417,
286
+ "learning_rate": 0.00011576855143650371,
287
+ "epoch": 0.5080517124064414,
288
  "step": 280
289
  },
290
  {
291
+ "loss": 0.6765,
292
+ "grad_norm": 0.25463321805000305,
293
+ "learning_rate": 0.00010948649756161246,
294
+ "epoch": 0.5261964164209572,
295
  "step": 290
296
  },
297
  {
298
+ "loss": 0.6583,
299
+ "grad_norm": 0.16848962008953094,
300
+ "learning_rate": 0.00010316639872985472,
301
+ "epoch": 0.5443411204354729,
302
  "step": 300
303
  },
304
  {
305
+ "eval_loss": 0.6103786826133728,
306
+ "eval_runtime": 73.9265,
307
+ "eval_samples_per_second": 6.182,
308
+ "eval_steps_per_second": 3.098,
309
+ "epoch": 0.5443411204354729,
310
  "step": 300
311
  },
312
  {
313
+ "loss": 0.6652,
314
+ "grad_norm": 0.27448564767837524,
315
+ "learning_rate": 9.683360127014529e-05,
316
+ "epoch": 0.5624858244499886,
317
  "step": 310
318
  },
319
  {
320
+ "loss": 0.701,
321
+ "grad_norm": 0.2189791053533554,
322
+ "learning_rate": 9.051350243838756e-05,
323
+ "epoch": 0.5806305284645045,
324
  "step": 320
325
  },
326
  {
327
+ "loss": 0.6166,
328
+ "grad_norm": 0.17203940451145172,
329
+ "learning_rate": 8.423144856349631e-05,
330
+ "epoch": 0.5987752324790202,
331
  "step": 330
332
  },
333
  {
334
+ "loss": 0.5667,
335
+ "grad_norm": 0.220821350812912,
336
+ "learning_rate": 7.801263339765994e-05,
337
+ "epoch": 0.6169199364935359,
338
  "step": 340
339
  },
340
  {
341
+ "loss": 0.6526,
342
+ "grad_norm": 0.2224995642900467,
343
+ "learning_rate": 7.188199707850122e-05,
344
+ "epoch": 0.6350646405080517,
345
  "step": 350
346
  },
347
  {
348
+ "eval_loss": 0.6001651287078857,
349
+ "eval_runtime": 75.1345,
350
+ "eval_samples_per_second": 6.082,
351
+ "eval_steps_per_second": 3.048,
352
+ "epoch": 0.6350646405080517,
353
  "step": 350
354
  },
355
  {
356
+ "loss": 0.6697,
357
+ "grad_norm": 0.20313851535320282,
358
+ "learning_rate": 6.586412610834221e-05,
359
+ "epoch": 0.6532093445225675,
360
  "step": 360
361
  },
362
  {
363
+ "loss": 0.5862,
364
+ "grad_norm": 0.1557992547750473,
365
+ "learning_rate": 5.998315475169942e-05,
366
+ "epoch": 0.6713540485370832,
367
  "step": 370
368
  },
369
  {
370
+ "loss": 0.6341,
371
+ "grad_norm": 0.29822641611099243,
372
+ "learning_rate": 5.4262668246448475e-05,
373
+ "epoch": 0.689498752551599,
374
  "step": 380
375
  },
376
  {
377
+ "loss": 0.6841,
378
+ "grad_norm": 0.30022329092025757,
379
+ "learning_rate": 4.872560821682256e-05,
380
+ "epoch": 0.7076434565661147,
381
  "step": 390
382
  },
383
  {
384
+ "loss": 0.729,
385
+ "grad_norm": 0.26096341013908386,
386
+ "learning_rate": 4.339418066758008e-05,
387
+ "epoch": 0.7257881605806306,
388
  "step": 400
389
  },
390
  {
391
+ "eval_loss": 0.5905945897102356,
392
+ "eval_runtime": 76.6602,
393
+ "eval_samples_per_second": 5.961,
394
+ "eval_steps_per_second": 2.987,
395
+ "epoch": 0.7257881605806306,
396
  "step": 400
397
  },
398
  {
399
+ "loss": 0.6138,
400
+ "grad_norm": 0.2632121741771698,
401
+ "learning_rate": 3.828976692832458e-05,
402
+ "epoch": 0.7439328645951463,
403
  "step": 410
404
  },
405
  {
406
+ "loss": 0.6976,
407
+ "grad_norm": 0.24841086566448212,
408
+ "learning_rate": 3.343283790512829e-05,
409
+ "epoch": 0.762077568609662,
410
  "step": 420
411
  },
412
  {
413
+ "loss": 0.7324,
414
+ "grad_norm": 0.33077147603034973,
415
+ "learning_rate": 2.8842871983347998e-05,
416
+ "epoch": 0.7802222726241779,
417
  "step": 430
418
  },
419
  {
420
+ "loss": 0.5743,
421
+ "grad_norm": 0.29825517535209656,
422
+ "learning_rate": 2.45382769108779e-05,
423
+ "epoch": 0.7983669766386936,
424
  "step": 440
425
  },
426
  {
427
+ "loss": 0.5662,
428
+ "grad_norm": 0.18266697227954865,
429
+ "learning_rate": 2.0536315975121544e-05,
430
+ "epoch": 0.8165116806532093,
431
+ "step": 450
432
+ },
433
+ {
434
+ "eval_loss": 0.5785723924636841,
435
+ "eval_runtime": 74.3692,
436
+ "eval_samples_per_second": 6.145,
437
+ "eval_steps_per_second": 3.079,
438
+ "epoch": 0.8165116806532093,
439
+ "step": 450
440
+ },
441
+ {
442
+ "loss": 0.6846,
443
+ "grad_norm": 0.37334564328193665,
444
+ "learning_rate": 1.6853038769745467e-05,
445
+ "epoch": 0.8346563846677251,
446
+ "step": 460
447
+ },
448
+ {
449
+ "loss": 0.6121,
450
+ "grad_norm": 0.31827959418296814,
451
+ "learning_rate": 1.3503216828869192e-05,
452
+ "epoch": 0.8528010886822409,
453
+ "step": 470
454
+ },
455
+ {
456
+ "loss": 0.719,
457
+ "grad_norm": 0.5225608348846436,
458
+ "learning_rate": 1.0500284386826597e-05,
459
+ "epoch": 0.8709457926967566,
460
+ "step": 480
461
+ },
462
+ {
463
+ "loss": 0.6807,
464
+ "grad_norm": 0.2689962387084961,
465
+ "learning_rate": 7.856284501077926e-06,
466
+ "epoch": 0.8890904967112724,
467
+ "step": 490
468
+ },
469
+ {
470
+ "loss": 0.5893,
471
+ "grad_norm": 0.34263530373573303,
472
+ "learning_rate": 5.581820754342137e-06,
473
+ "epoch": 0.9072352007257881,
474
+ "step": 500
475
+ },
476
+ {
477
+ "eval_loss": 0.5692603588104248,
478
+ "eval_runtime": 74.4074,
479
+ "eval_samples_per_second": 6.142,
480
+ "eval_steps_per_second": 3.078,
481
+ "epoch": 0.9072352007257881,
482
+ "step": 500
483
+ },
484
+ {
485
+ "loss": 0.5906,
486
+ "grad_norm": 0.28205356001853943,
487
+ "learning_rate": 3.6860147296457816e-06,
488
+ "epoch": 0.925379904740304,
489
+ "step": 510
490
+ },
491
+ {
492
+ "loss": 0.6412,
493
+ "grad_norm": 0.3020014464855194,
494
+ "learning_rate": 2.1764694288310184e-06,
495
+ "epoch": 0.9435246087548197,
496
+ "step": 520
497
+ },
498
+ {
499
+ "loss": 0.7172,
500
+ "grad_norm": 0.3621278703212738,
501
+ "learning_rate": 1.0592387812310311e-06,
502
+ "epoch": 0.9616693127693354,
503
+ "step": 530
504
+ },
505
+ {
506
+ "loss": 0.5935,
507
+ "grad_norm": 0.31613534688949585,
508
+ "learning_rate": 3.3880336479590325e-07,
509
+ "epoch": 0.9798140167838512,
510
+ "step": 540
511
+ },
512
+ {
513
+ "loss": 0.5792,
514
+ "grad_norm": 0.4579828679561615,
515
+ "learning_rate": 1.8052437037707758e-08,
516
+ "epoch": 0.997958720798367,
517
+ "step": 550
518
+ },
519
+ {
520
+ "eval_loss": 0.5670668482780457,
521
+ "eval_runtime": 72.4806,
522
+ "eval_samples_per_second": 6.305,
523
+ "eval_steps_per_second": 3.159,
524
+ "epoch": 0.997958720798367,
525
+ "step": 550
526
+ },
527
+ {
528
+ "train_runtime": 4669.6027,
529
+ "train_samples_per_second": 1.888,
530
  "train_steps_per_second": 0.118,
531
+ "total_flos": 8.286147539211264e+16,
532
+ "train_loss": 0.7052618392567703,
533
+ "epoch": 1.0,
534
+ "step": 552,
535
+ "total_runtime_sec": 4670.917282342911
536
  }
537
  ]
538
  }