besimray commited on
Commit
5eaf652
·
verified ·
1 Parent(s): a650f8f

Training in progress, step 15, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -10,23 +10,23 @@
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
- "lora_alpha": 64,
14
  "lora_dropout": 0.05,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
- "r": 24,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "up_proj",
24
- "o_proj",
25
  "gate_proj",
26
- "down_proj",
27
  "v_proj",
 
 
28
  "k_proj",
29
- "q_proj"
 
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
 
10
  "layers_pattern": null,
11
  "layers_to_transform": null,
12
  "loftq_config": {},
13
+ "lora_alpha": 32,
14
  "lora_dropout": 0.05,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
  "modules_to_save": null,
18
  "peft_type": "LORA",
19
+ "r": 16,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
 
23
  "gate_proj",
 
24
  "v_proj",
25
+ "o_proj",
26
+ "up_proj",
27
  "k_proj",
28
+ "q_proj",
29
+ "down_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
32
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d259edcf51a6e65a5e5aa8f076d5bc4bf480fc4b4c59350991263774074d7ea
3
- size 67662840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f5168c85d041eb40b539ed1ea4fa405f9e14109cf8ce608d3eac8f26f627745
3
+ size 45118424
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0503c8ba76bfe0abdcb3e8a6104759013bd2d60c838b4625f0a1ddcf7615226
3
- size 34607610
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87fd6f0b4471cdc416e96626857e0c5e197d9b25e78f6d528666378e766fc69f
3
+ size 23159290
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2bfc5fa47af4dae874a1be827d0f45774971f451a821e11602842d4ee93aaa71
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c683a96b3a33504bcf104bd66d70d07f59ed807698ad96230e879f4b6bf5d00a
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:29055dd59dc6fec528a1dd0a8f1388fe1bcd85af7ce5330f9713cff07d4913e7
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b4906f488285c3d93b3de9477b5cdb50810bb8e8a714368724d443d38a5757a
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,1704 +1,171 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 6.6141732283464565,
5
- "eval_steps": 8,
6
- "global_step": 210,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.031496062992125984,
13
- "grad_norm": 1.069150686264038,
14
- "learning_rate": 5.000000000000001e-07,
15
- "loss": 1.2798,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.031496062992125984,
20
- "eval_loss": 1.2495309114456177,
21
- "eval_runtime": 2.0343,
22
- "eval_samples_per_second": 49.157,
23
- "eval_steps_per_second": 3.441,
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.06299212598425197,
28
- "grad_norm": 1.0647882223129272,
29
- "learning_rate": 1.0000000000000002e-06,
30
- "loss": 1.4205,
31
  "step": 2
32
  },
33
  {
34
- "epoch": 0.09448818897637795,
35
- "grad_norm": 1.0129144191741943,
36
- "learning_rate": 1.5e-06,
37
- "loss": 1.3129,
 
 
 
 
 
 
 
 
38
  "step": 3
39
  },
40
  {
41
- "epoch": 0.12598425196850394,
42
- "grad_norm": 1.1409480571746826,
43
- "learning_rate": 2.0000000000000003e-06,
44
- "loss": 1.4165,
45
  "step": 4
46
  },
47
  {
48
- "epoch": 0.15748031496062992,
49
- "grad_norm": 1.0780513286590576,
50
- "learning_rate": 2.5e-06,
51
- "loss": 1.365,
52
  "step": 5
53
  },
54
  {
55
- "epoch": 0.1889763779527559,
56
- "grad_norm": 1.0233283042907715,
57
- "learning_rate": 3e-06,
58
- "loss": 1.2365,
59
  "step": 6
60
  },
61
  {
62
- "epoch": 0.2204724409448819,
63
- "grad_norm": 0.9057336449623108,
64
- "learning_rate": 3.5000000000000004e-06,
65
- "loss": 1.3164,
 
 
 
 
 
 
 
 
66
  "step": 7
67
  },
68
  {
69
- "epoch": 0.25196850393700787,
70
- "grad_norm": 0.9594066739082336,
71
- "learning_rate": 4.000000000000001e-06,
72
- "loss": 1.3154,
73
  "step": 8
74
  },
75
  {
76
- "epoch": 0.25196850393700787,
77
- "eval_loss": 1.2488781213760376,
78
- "eval_runtime": 1.9745,
79
- "eval_samples_per_second": 50.647,
80
- "eval_steps_per_second": 3.545,
81
- "step": 8
82
  },
83
  {
84
- "epoch": 0.28346456692913385,
85
- "grad_norm": 0.8745964765548706,
86
- "learning_rate": 4.5e-06,
87
- "loss": 1.1391,
 
88
  "step": 9
89
  },
90
  {
91
- "epoch": 0.31496062992125984,
92
- "grad_norm": 0.9511871933937073,
93
- "learning_rate": 5e-06,
94
- "loss": 1.0862,
95
  "step": 10
96
  },
97
  {
98
- "epoch": 0.3464566929133858,
99
- "grad_norm": 0.8791377544403076,
100
- "learning_rate": 5.500000000000001e-06,
101
- "loss": 1.2867,
102
  "step": 11
103
  },
104
  {
105
- "epoch": 0.3779527559055118,
106
- "grad_norm": 0.9202072024345398,
107
- "learning_rate": 6e-06,
108
- "loss": 1.3237,
 
 
 
 
 
 
 
 
109
  "step": 12
110
  },
111
  {
112
- "epoch": 0.4094488188976378,
113
- "grad_norm": 0.9273457527160645,
114
- "learning_rate": 6.5000000000000004e-06,
115
- "loss": 1.3609,
116
  "step": 13
117
  },
118
  {
119
- "epoch": 0.4409448818897638,
120
- "grad_norm": 0.8715579509735107,
121
- "learning_rate": 7.000000000000001e-06,
122
- "loss": 1.2499,
123
  "step": 14
124
  },
125
  {
126
- "epoch": 0.47244094488188976,
127
- "grad_norm": 0.8558375239372253,
128
- "learning_rate": 7.5e-06,
129
- "loss": 1.2201,
130
  "step": 15
131
  },
132
  {
133
- "epoch": 0.5039370078740157,
134
- "grad_norm": 0.9306897521018982,
135
- "learning_rate": 8.000000000000001e-06,
136
- "loss": 1.3249,
137
- "step": 16
138
- },
139
- {
140
- "epoch": 0.5039370078740157,
141
- "eval_loss": 1.2311681509017944,
142
- "eval_runtime": 1.979,
143
- "eval_samples_per_second": 50.531,
144
- "eval_steps_per_second": 3.537,
145
- "step": 16
146
- },
147
- {
148
- "epoch": 0.5354330708661418,
149
- "grad_norm": 0.9032379984855652,
150
- "learning_rate": 8.500000000000002e-06,
151
- "loss": 1.3434,
152
- "step": 17
153
- },
154
- {
155
- "epoch": 0.5669291338582677,
156
- "grad_norm": 0.859302282333374,
157
- "learning_rate": 9e-06,
158
- "loss": 1.3049,
159
- "step": 18
160
- },
161
- {
162
- "epoch": 0.5984251968503937,
163
- "grad_norm": 0.7455488443374634,
164
- "learning_rate": 9.5e-06,
165
- "loss": 1.3192,
166
- "step": 19
167
- },
168
- {
169
- "epoch": 0.6299212598425197,
170
- "grad_norm": 0.713253915309906,
171
- "learning_rate": 1e-05,
172
- "loss": 1.2799,
173
- "step": 20
174
- },
175
- {
176
- "epoch": 0.6614173228346457,
177
- "grad_norm": 0.5953424572944641,
178
- "learning_rate": 1.05e-05,
179
- "loss": 1.1514,
180
- "step": 21
181
- },
182
- {
183
- "epoch": 0.6929133858267716,
184
- "grad_norm": 0.6268596053123474,
185
- "learning_rate": 1.1000000000000001e-05,
186
- "loss": 1.2747,
187
- "step": 22
188
- },
189
- {
190
- "epoch": 0.7244094488188977,
191
- "grad_norm": 0.722594141960144,
192
- "learning_rate": 1.1500000000000002e-05,
193
- "loss": 1.2532,
194
- "step": 23
195
- },
196
- {
197
- "epoch": 0.7559055118110236,
198
- "grad_norm": 0.5606786012649536,
199
- "learning_rate": 1.2e-05,
200
- "loss": 1.2336,
201
- "step": 24
202
- },
203
- {
204
- "epoch": 0.7559055118110236,
205
- "eval_loss": 1.2047480344772339,
206
- "eval_runtime": 1.9744,
207
- "eval_samples_per_second": 50.649,
208
- "eval_steps_per_second": 3.545,
209
- "step": 24
210
- },
211
- {
212
- "epoch": 0.7874015748031497,
213
- "grad_norm": 0.5434914827346802,
214
- "learning_rate": 1.25e-05,
215
- "loss": 1.3273,
216
- "step": 25
217
- },
218
- {
219
- "epoch": 0.8188976377952756,
220
- "grad_norm": 0.589859664440155,
221
- "learning_rate": 1.3000000000000001e-05,
222
- "loss": 1.2225,
223
- "step": 26
224
- },
225
- {
226
- "epoch": 0.8503937007874016,
227
- "grad_norm": 0.6487225294113159,
228
- "learning_rate": 1.3500000000000001e-05,
229
- "loss": 1.2899,
230
- "step": 27
231
- },
232
- {
233
- "epoch": 0.8818897637795275,
234
- "grad_norm": 0.5731435418128967,
235
- "learning_rate": 1.4000000000000001e-05,
236
- "loss": 1.237,
237
- "step": 28
238
- },
239
- {
240
- "epoch": 0.9133858267716536,
241
- "grad_norm": 0.6190696358680725,
242
- "learning_rate": 1.45e-05,
243
- "loss": 1.2299,
244
- "step": 29
245
- },
246
- {
247
- "epoch": 0.9448818897637795,
248
- "grad_norm": 0.548401951789856,
249
- "learning_rate": 1.5e-05,
250
- "loss": 1.2117,
251
- "step": 30
252
- },
253
- {
254
- "epoch": 0.9763779527559056,
255
- "grad_norm": 0.6051440834999084,
256
- "learning_rate": 1.55e-05,
257
- "loss": 1.2396,
258
- "step": 31
259
- },
260
- {
261
- "epoch": 1.0078740157480315,
262
- "grad_norm": 0.5788630247116089,
263
- "learning_rate": 1.6000000000000003e-05,
264
- "loss": 1.235,
265
- "step": 32
266
- },
267
- {
268
- "epoch": 1.0078740157480315,
269
- "eval_loss": 1.1931809186935425,
270
- "eval_runtime": 1.9767,
271
- "eval_samples_per_second": 50.589,
272
- "eval_steps_per_second": 3.541,
273
- "step": 32
274
- },
275
- {
276
- "epoch": 1.0393700787401574,
277
- "grad_norm": 0.6048874258995056,
278
- "learning_rate": 1.65e-05,
279
- "loss": 1.2221,
280
- "step": 33
281
- },
282
- {
283
- "epoch": 1.0708661417322836,
284
- "grad_norm": 0.6334845423698425,
285
- "learning_rate": 1.7000000000000003e-05,
286
- "loss": 1.2613,
287
- "step": 34
288
- },
289
- {
290
- "epoch": 1.1023622047244095,
291
- "grad_norm": 0.5881842970848083,
292
- "learning_rate": 1.75e-05,
293
- "loss": 1.264,
294
- "step": 35
295
- },
296
- {
297
- "epoch": 1.1338582677165354,
298
- "grad_norm": 0.5609626173973083,
299
- "learning_rate": 1.8e-05,
300
- "loss": 1.2696,
301
- "step": 36
302
- },
303
- {
304
- "epoch": 1.1653543307086613,
305
- "grad_norm": 0.47871723771095276,
306
- "learning_rate": 1.85e-05,
307
- "loss": 1.1659,
308
- "step": 37
309
- },
310
- {
311
- "epoch": 1.1968503937007875,
312
- "grad_norm": 0.5133270025253296,
313
- "learning_rate": 1.9e-05,
314
- "loss": 1.1731,
315
- "step": 38
316
- },
317
- {
318
- "epoch": 1.2283464566929134,
319
- "grad_norm": 0.49426957964897156,
320
- "learning_rate": 1.9500000000000003e-05,
321
- "loss": 1.0909,
322
- "step": 39
323
- },
324
- {
325
- "epoch": 1.2598425196850394,
326
- "grad_norm": 0.4927002191543579,
327
- "learning_rate": 2e-05,
328
- "loss": 1.2591,
329
- "step": 40
330
- },
331
- {
332
- "epoch": 1.2598425196850394,
333
- "eval_loss": 1.1798231601715088,
334
- "eval_runtime": 2.2274,
335
- "eval_samples_per_second": 44.895,
336
- "eval_steps_per_second": 3.143,
337
- "step": 40
338
- },
339
- {
340
- "epoch": 1.2913385826771653,
341
- "grad_norm": 0.5016794800758362,
342
- "learning_rate": 2.05e-05,
343
- "loss": 1.1344,
344
- "step": 41
345
- },
346
- {
347
- "epoch": 1.3228346456692912,
348
- "grad_norm": 0.5014638304710388,
349
- "learning_rate": 2.1e-05,
350
- "loss": 1.1438,
351
- "step": 42
352
- },
353
- {
354
- "epoch": 1.3543307086614174,
355
- "grad_norm": 0.4870070517063141,
356
- "learning_rate": 2.15e-05,
357
- "loss": 1.2946,
358
- "step": 43
359
- },
360
- {
361
- "epoch": 1.3858267716535433,
362
- "grad_norm": 0.5146998763084412,
363
- "learning_rate": 2.2000000000000003e-05,
364
- "loss": 1.122,
365
- "step": 44
366
- },
367
- {
368
- "epoch": 1.4173228346456692,
369
- "grad_norm": 0.5737994313240051,
370
- "learning_rate": 2.25e-05,
371
- "loss": 1.2749,
372
- "step": 45
373
- },
374
- {
375
- "epoch": 1.4488188976377954,
376
- "grad_norm": 0.5805953145027161,
377
- "learning_rate": 2.3000000000000003e-05,
378
- "loss": 1.2347,
379
- "step": 46
380
- },
381
- {
382
- "epoch": 1.4803149606299213,
383
- "grad_norm": 0.5060011744499207,
384
- "learning_rate": 2.35e-05,
385
- "loss": 1.1806,
386
- "step": 47
387
- },
388
- {
389
- "epoch": 1.5118110236220472,
390
- "grad_norm": 0.5228325128555298,
391
- "learning_rate": 2.4e-05,
392
- "loss": 1.1836,
393
- "step": 48
394
- },
395
- {
396
- "epoch": 1.5118110236220472,
397
- "eval_loss": 1.1722474098205566,
398
- "eval_runtime": 1.9903,
399
- "eval_samples_per_second": 50.243,
400
- "eval_steps_per_second": 3.517,
401
- "step": 48
402
- },
403
- {
404
- "epoch": 1.5433070866141732,
405
- "grad_norm": 0.49606096744537354,
406
- "learning_rate": 2.45e-05,
407
- "loss": 1.2222,
408
- "step": 49
409
- },
410
- {
411
- "epoch": 1.574803149606299,
412
- "grad_norm": 0.37724393606185913,
413
- "learning_rate": 2.5e-05,
414
- "loss": 1.0979,
415
- "step": 50
416
- },
417
- {
418
- "epoch": 1.6062992125984252,
419
- "grad_norm": 0.42948251962661743,
420
- "learning_rate": 2.5500000000000003e-05,
421
- "loss": 1.0678,
422
- "step": 51
423
- },
424
- {
425
- "epoch": 1.6377952755905512,
426
- "grad_norm": 0.4671652317047119,
427
- "learning_rate": 2.6000000000000002e-05,
428
- "loss": 1.18,
429
- "step": 52
430
- },
431
- {
432
- "epoch": 1.6692913385826773,
433
- "grad_norm": 0.44847941398620605,
434
- "learning_rate": 2.6500000000000004e-05,
435
- "loss": 1.2253,
436
- "step": 53
437
- },
438
- {
439
- "epoch": 1.7007874015748032,
440
- "grad_norm": 0.4691849946975708,
441
- "learning_rate": 2.7000000000000002e-05,
442
- "loss": 1.1719,
443
- "step": 54
444
- },
445
- {
446
- "epoch": 1.7322834645669292,
447
- "grad_norm": 0.505724310874939,
448
- "learning_rate": 2.7500000000000004e-05,
449
- "loss": 1.235,
450
- "step": 55
451
- },
452
- {
453
- "epoch": 1.763779527559055,
454
- "grad_norm": 0.48790884017944336,
455
- "learning_rate": 2.8000000000000003e-05,
456
- "loss": 1.2719,
457
- "step": 56
458
- },
459
- {
460
- "epoch": 1.763779527559055,
461
- "eval_loss": 1.1640315055847168,
462
- "eval_runtime": 1.9729,
463
- "eval_samples_per_second": 50.688,
464
- "eval_steps_per_second": 3.548,
465
- "step": 56
466
- },
467
- {
468
- "epoch": 1.795275590551181,
469
- "grad_norm": 0.4780273735523224,
470
- "learning_rate": 2.8499999999999998e-05,
471
- "loss": 1.3099,
472
- "step": 57
473
- },
474
- {
475
- "epoch": 1.826771653543307,
476
- "grad_norm": 0.4578011929988861,
477
- "learning_rate": 2.9e-05,
478
- "loss": 1.3569,
479
- "step": 58
480
- },
481
- {
482
- "epoch": 1.858267716535433,
483
- "grad_norm": 0.5303736925125122,
484
- "learning_rate": 2.95e-05,
485
- "loss": 1.145,
486
- "step": 59
487
- },
488
- {
489
- "epoch": 1.889763779527559,
490
- "grad_norm": 0.5604854226112366,
491
- "learning_rate": 3e-05,
492
- "loss": 1.231,
493
- "step": 60
494
- },
495
- {
496
- "epoch": 1.9212598425196852,
497
- "grad_norm": 0.4924694895744324,
498
- "learning_rate": 3.05e-05,
499
- "loss": 1.1343,
500
- "step": 61
501
- },
502
- {
503
- "epoch": 1.952755905511811,
504
- "grad_norm": 0.4921957850456238,
505
- "learning_rate": 3.1e-05,
506
- "loss": 1.3127,
507
- "step": 62
508
- },
509
- {
510
- "epoch": 1.984251968503937,
511
- "grad_norm": 0.5097357034683228,
512
- "learning_rate": 3.15e-05,
513
- "loss": 1.2336,
514
- "step": 63
515
- },
516
- {
517
- "epoch": 2.015748031496063,
518
- "grad_norm": 0.46658650040626526,
519
- "learning_rate": 3.2000000000000005e-05,
520
- "loss": 1.2083,
521
- "step": 64
522
- },
523
- {
524
- "epoch": 2.015748031496063,
525
- "eval_loss": 1.1596944332122803,
526
- "eval_runtime": 1.9712,
527
- "eval_samples_per_second": 50.731,
528
- "eval_steps_per_second": 3.551,
529
- "step": 64
530
- },
531
- {
532
- "epoch": 2.047244094488189,
533
- "grad_norm": 0.46166104078292847,
534
- "learning_rate": 3.2500000000000004e-05,
535
- "loss": 1.1131,
536
- "step": 65
537
- },
538
- {
539
- "epoch": 2.078740157480315,
540
- "grad_norm": 0.5125811696052551,
541
- "learning_rate": 3.3e-05,
542
- "loss": 1.0556,
543
- "step": 66
544
- },
545
- {
546
- "epoch": 2.1102362204724407,
547
- "grad_norm": 0.5640822649002075,
548
- "learning_rate": 3.35e-05,
549
- "loss": 1.2563,
550
- "step": 67
551
- },
552
- {
553
- "epoch": 2.141732283464567,
554
- "grad_norm": 0.5031111836433411,
555
- "learning_rate": 3.4000000000000007e-05,
556
- "loss": 1.0555,
557
- "step": 68
558
- },
559
- {
560
- "epoch": 2.173228346456693,
561
- "grad_norm": 0.5319817066192627,
562
- "learning_rate": 3.45e-05,
563
- "loss": 1.149,
564
- "step": 69
565
- },
566
- {
567
- "epoch": 2.204724409448819,
568
- "grad_norm": 0.49313291907310486,
569
- "learning_rate": 3.5e-05,
570
- "loss": 1.1457,
571
- "step": 70
572
- },
573
- {
574
- "epoch": 2.236220472440945,
575
- "grad_norm": 0.48414379358291626,
576
- "learning_rate": 3.55e-05,
577
- "loss": 1.2644,
578
- "step": 71
579
- },
580
- {
581
- "epoch": 2.267716535433071,
582
- "grad_norm": 0.5062035918235779,
583
- "learning_rate": 3.6e-05,
584
- "loss": 1.2119,
585
- "step": 72
586
- },
587
- {
588
- "epoch": 2.267716535433071,
589
- "eval_loss": 1.1549252271652222,
590
- "eval_runtime": 1.9635,
591
- "eval_samples_per_second": 50.93,
592
- "eval_steps_per_second": 3.565,
593
- "step": 72
594
- },
595
- {
596
- "epoch": 2.2992125984251968,
597
- "grad_norm": 0.47118309140205383,
598
- "learning_rate": 3.65e-05,
599
- "loss": 1.1578,
600
- "step": 73
601
- },
602
- {
603
- "epoch": 2.3307086614173227,
604
- "grad_norm": 0.5640192627906799,
605
- "learning_rate": 3.7e-05,
606
- "loss": 1.1214,
607
- "step": 74
608
- },
609
- {
610
- "epoch": 2.362204724409449,
611
- "grad_norm": 0.4832814633846283,
612
- "learning_rate": 3.7500000000000003e-05,
613
- "loss": 1.2154,
614
- "step": 75
615
- },
616
- {
617
- "epoch": 2.393700787401575,
618
- "grad_norm": 0.48993921279907227,
619
- "learning_rate": 3.8e-05,
620
- "loss": 1.1292,
621
- "step": 76
622
- },
623
- {
624
- "epoch": 2.425196850393701,
625
- "grad_norm": 0.4585767388343811,
626
- "learning_rate": 3.85e-05,
627
- "loss": 1.2205,
628
- "step": 77
629
- },
630
- {
631
- "epoch": 2.456692913385827,
632
- "grad_norm": 0.47851210832595825,
633
- "learning_rate": 3.9000000000000006e-05,
634
- "loss": 1.1434,
635
- "step": 78
636
- },
637
- {
638
- "epoch": 2.4881889763779528,
639
- "grad_norm": 0.48592087626457214,
640
- "learning_rate": 3.9500000000000005e-05,
641
- "loss": 1.1855,
642
- "step": 79
643
- },
644
- {
645
- "epoch": 2.5196850393700787,
646
- "grad_norm": 0.5699076652526855,
647
- "learning_rate": 4e-05,
648
- "loss": 1.2574,
649
- "step": 80
650
- },
651
- {
652
- "epoch": 2.5196850393700787,
653
- "eval_loss": 1.15181565284729,
654
- "eval_runtime": 2.2403,
655
- "eval_samples_per_second": 44.638,
656
- "eval_steps_per_second": 3.125,
657
- "step": 80
658
- },
659
- {
660
- "epoch": 2.5511811023622046,
661
- "grad_norm": 0.5299522876739502,
662
- "learning_rate": 4.05e-05,
663
- "loss": 1.0995,
664
- "step": 81
665
- },
666
- {
667
- "epoch": 2.5826771653543306,
668
- "grad_norm": 0.5705897808074951,
669
- "learning_rate": 4.1e-05,
670
- "loss": 1.2274,
671
- "step": 82
672
- },
673
- {
674
- "epoch": 2.6141732283464565,
675
- "grad_norm": 0.48036718368530273,
676
- "learning_rate": 4.15e-05,
677
- "loss": 1.0861,
678
- "step": 83
679
- },
680
- {
681
- "epoch": 2.6456692913385824,
682
- "grad_norm": 0.49471601843833923,
683
- "learning_rate": 4.2e-05,
684
- "loss": 1.1628,
685
- "step": 84
686
- },
687
- {
688
- "epoch": 2.677165354330709,
689
- "grad_norm": 0.49998700618743896,
690
- "learning_rate": 4.25e-05,
691
- "loss": 1.0709,
692
- "step": 85
693
- },
694
- {
695
- "epoch": 2.7086614173228347,
696
- "grad_norm": 0.5527383685112,
697
- "learning_rate": 4.3e-05,
698
- "loss": 1.201,
699
- "step": 86
700
- },
701
- {
702
- "epoch": 2.7401574803149606,
703
- "grad_norm": 0.6283419728279114,
704
- "learning_rate": 4.35e-05,
705
- "loss": 1.237,
706
- "step": 87
707
- },
708
- {
709
- "epoch": 2.7716535433070866,
710
- "grad_norm": 0.6050185561180115,
711
- "learning_rate": 4.4000000000000006e-05,
712
- "loss": 1.025,
713
- "step": 88
714
- },
715
- {
716
- "epoch": 2.7716535433070866,
717
- "eval_loss": 1.1485306024551392,
718
- "eval_runtime": 2.0753,
719
- "eval_samples_per_second": 48.185,
720
- "eval_steps_per_second": 3.373,
721
- "step": 88
722
- },
723
- {
724
- "epoch": 2.8031496062992125,
725
- "grad_norm": 0.5350843667984009,
726
- "learning_rate": 4.4500000000000004e-05,
727
- "loss": 1.1582,
728
- "step": 89
729
- },
730
- {
731
- "epoch": 2.8346456692913384,
732
- "grad_norm": 0.5085338950157166,
733
- "learning_rate": 4.5e-05,
734
- "loss": 1.254,
735
- "step": 90
736
- },
737
- {
738
- "epoch": 2.866141732283465,
739
- "grad_norm": 0.5343408584594727,
740
- "learning_rate": 4.55e-05,
741
- "loss": 1.0941,
742
- "step": 91
743
- },
744
- {
745
- "epoch": 2.8976377952755907,
746
- "grad_norm": 0.5207767486572266,
747
- "learning_rate": 4.600000000000001e-05,
748
- "loss": 1.176,
749
- "step": 92
750
- },
751
- {
752
- "epoch": 2.9291338582677167,
753
- "grad_norm": 0.6931973695755005,
754
- "learning_rate": 4.6500000000000005e-05,
755
- "loss": 1.2044,
756
- "step": 93
757
- },
758
- {
759
- "epoch": 2.9606299212598426,
760
- "grad_norm": 0.5525833964347839,
761
- "learning_rate": 4.7e-05,
762
- "loss": 1.1156,
763
- "step": 94
764
- },
765
- {
766
- "epoch": 2.9921259842519685,
767
- "grad_norm": 0.5206693410873413,
768
- "learning_rate": 4.75e-05,
769
- "loss": 1.1831,
770
- "step": 95
771
- },
772
- {
773
- "epoch": 3.0236220472440944,
774
- "grad_norm": 0.6194021105766296,
775
- "learning_rate": 4.8e-05,
776
- "loss": 1.2232,
777
- "step": 96
778
- },
779
- {
780
- "epoch": 3.0236220472440944,
781
- "eval_loss": 1.1487455368041992,
782
- "eval_runtime": 1.9591,
783
- "eval_samples_per_second": 51.044,
784
- "eval_steps_per_second": 3.573,
785
- "step": 96
786
- },
787
- {
788
- "epoch": 3.0551181102362204,
789
- "grad_norm": 0.47633522748947144,
790
- "learning_rate": 4.85e-05,
791
- "loss": 1.0667,
792
- "step": 97
793
- },
794
- {
795
- "epoch": 3.0866141732283463,
796
- "grad_norm": 0.49245792627334595,
797
- "learning_rate": 4.9e-05,
798
- "loss": 1.0812,
799
- "step": 98
800
- },
801
- {
802
- "epoch": 3.1181102362204722,
803
- "grad_norm": 0.49651336669921875,
804
- "learning_rate": 4.9500000000000004e-05,
805
- "loss": 1.1153,
806
- "step": 99
807
- },
808
- {
809
- "epoch": 3.1496062992125986,
810
- "grad_norm": 0.5725173354148865,
811
- "learning_rate": 5e-05,
812
- "loss": 1.1583,
813
- "step": 100
814
- },
815
- {
816
- "epoch": 3.1811023622047245,
817
- "grad_norm": 0.5892531871795654,
818
- "learning_rate": 4.9999567360675626e-05,
819
- "loss": 1.1323,
820
- "step": 101
821
- },
822
- {
823
- "epoch": 3.2125984251968505,
824
- "grad_norm": 0.5684159994125366,
825
- "learning_rate": 4.999826945767665e-05,
826
- "loss": 1.2006,
827
- "step": 102
828
- },
829
- {
830
- "epoch": 3.2440944881889764,
831
- "grad_norm": 0.5696210265159607,
832
- "learning_rate": 4.999610633592496e-05,
833
- "loss": 1.1272,
834
- "step": 103
835
- },
836
- {
837
- "epoch": 3.2755905511811023,
838
- "grad_norm": 0.6086538434028625,
839
- "learning_rate": 4.999307807028871e-05,
840
- "loss": 1.1621,
841
- "step": 104
842
- },
843
- {
844
- "epoch": 3.2755905511811023,
845
- "eval_loss": 1.1464533805847168,
846
- "eval_runtime": 1.9521,
847
- "eval_samples_per_second": 51.226,
848
- "eval_steps_per_second": 3.586,
849
- "step": 104
850
- },
851
- {
852
- "epoch": 3.3070866141732282,
853
- "grad_norm": 0.543544590473175,
854
- "learning_rate": 4.998918476557963e-05,
855
- "loss": 1.1049,
856
- "step": 105
857
- },
858
- {
859
- "epoch": 3.338582677165354,
860
- "grad_norm": 0.6197894811630249,
861
- "learning_rate": 4.9984426556549456e-05,
862
- "loss": 1.0589,
863
- "step": 106
864
- },
865
- {
866
- "epoch": 3.3700787401574805,
867
- "grad_norm": 0.553490400314331,
868
- "learning_rate": 4.997880360788526e-05,
869
- "loss": 1.0755,
870
- "step": 107
871
- },
872
- {
873
- "epoch": 3.4015748031496065,
874
- "grad_norm": 0.5474947094917297,
875
- "learning_rate": 4.997231611420373e-05,
876
- "loss": 0.9718,
877
- "step": 108
878
- },
879
- {
880
- "epoch": 3.4330708661417324,
881
- "grad_norm": 0.6409115791320801,
882
- "learning_rate": 4.996496430004446e-05,
883
- "loss": 1.0829,
884
- "step": 109
885
- },
886
- {
887
- "epoch": 3.4645669291338583,
888
- "grad_norm": 0.5846779346466064,
889
- "learning_rate": 4.995674841986217e-05,
890
- "loss": 1.0633,
891
- "step": 110
892
- },
893
- {
894
- "epoch": 3.4960629921259843,
895
- "grad_norm": 0.6325021982192993,
896
- "learning_rate": 4.9947668758017884e-05,
897
- "loss": 1.0721,
898
- "step": 111
899
- },
900
- {
901
- "epoch": 3.52755905511811,
902
- "grad_norm": 0.6479124426841736,
903
- "learning_rate": 4.9937725628769094e-05,
904
- "loss": 1.031,
905
- "step": 112
906
- },
907
- {
908
- "epoch": 3.52755905511811,
909
- "eval_loss": 1.1493302583694458,
910
- "eval_runtime": 2.0087,
911
- "eval_samples_per_second": 49.783,
912
- "eval_steps_per_second": 3.485,
913
- "step": 112
914
- },
915
- {
916
- "epoch": 3.559055118110236,
917
- "grad_norm": 0.663337767124176,
918
- "learning_rate": 4.9926919376258916e-05,
919
- "loss": 1.0875,
920
- "step": 113
921
- },
922
- {
923
- "epoch": 3.590551181102362,
924
- "grad_norm": 0.609626293182373,
925
- "learning_rate": 4.991525037450412e-05,
926
- "loss": 1.0576,
927
- "step": 114
928
- },
929
- {
930
- "epoch": 3.622047244094488,
931
- "grad_norm": 0.7602643966674805,
932
- "learning_rate": 4.990271902738223e-05,
933
- "loss": 1.1482,
934
- "step": 115
935
- },
936
- {
937
- "epoch": 3.653543307086614,
938
- "grad_norm": 0.7129984498023987,
939
- "learning_rate": 4.9889325768617536e-05,
940
- "loss": 1.0728,
941
- "step": 116
942
- },
943
- {
944
- "epoch": 3.6850393700787403,
945
- "grad_norm": 0.6811193823814392,
946
- "learning_rate": 4.987507106176606e-05,
947
- "loss": 1.126,
948
- "step": 117
949
- },
950
- {
951
- "epoch": 3.716535433070866,
952
- "grad_norm": 0.6760783195495605,
953
- "learning_rate": 4.985995540019955e-05,
954
- "loss": 1.3472,
955
- "step": 118
956
- },
957
- {
958
- "epoch": 3.748031496062992,
959
- "grad_norm": 0.7298296689987183,
960
- "learning_rate": 4.984397930708838e-05,
961
- "loss": 1.1568,
962
- "step": 119
963
- },
964
- {
965
- "epoch": 3.779527559055118,
966
- "grad_norm": 0.6873138546943665,
967
- "learning_rate": 4.982714333538343e-05,
968
- "loss": 1.0529,
969
- "step": 120
970
- },
971
- {
972
- "epoch": 3.779527559055118,
973
- "eval_loss": 1.1472485065460205,
974
- "eval_runtime": 2.0184,
975
- "eval_samples_per_second": 49.543,
976
- "eval_steps_per_second": 3.468,
977
- "step": 120
978
- },
979
- {
980
- "epoch": 3.811023622047244,
981
- "grad_norm": 0.5837106108665466,
982
- "learning_rate": 4.9809448067796974e-05,
983
- "loss": 1.1478,
984
- "step": 121
985
- },
986
- {
987
- "epoch": 3.84251968503937,
988
- "grad_norm": 0.721906304359436,
989
- "learning_rate": 4.9790894116782514e-05,
990
- "loss": 1.2416,
991
- "step": 122
992
- },
993
- {
994
- "epoch": 3.8740157480314963,
995
- "grad_norm": 0.5964561700820923,
996
- "learning_rate": 4.977148212451354e-05,
997
- "loss": 1.1178,
998
- "step": 123
999
- },
1000
- {
1001
- "epoch": 3.905511811023622,
1002
- "grad_norm": 0.6578372120857239,
1003
- "learning_rate": 4.975121276286136e-05,
1004
- "loss": 1.1086,
1005
- "step": 124
1006
- },
1007
- {
1008
- "epoch": 3.937007874015748,
1009
- "grad_norm": 0.7385239601135254,
1010
- "learning_rate": 4.973008673337181e-05,
1011
- "loss": 1.1771,
1012
- "step": 125
1013
- },
1014
- {
1015
- "epoch": 3.968503937007874,
1016
- "grad_norm": 0.648508608341217,
1017
- "learning_rate": 4.970810476724097e-05,
1018
- "loss": 1.1603,
1019
- "step": 126
1020
- },
1021
- {
1022
- "epoch": 4.0,
1023
- "grad_norm": 0.7525383830070496,
1024
- "learning_rate": 4.9685267625289886e-05,
1025
- "loss": 1.1405,
1026
- "step": 127
1027
- },
1028
- {
1029
- "epoch": 4.031496062992126,
1030
- "grad_norm": 0.6271669864654541,
1031
- "learning_rate": 4.96615760979382e-05,
1032
- "loss": 1.1122,
1033
- "step": 128
1034
- },
1035
- {
1036
- "epoch": 4.031496062992126,
1037
- "eval_loss": 1.1474381685256958,
1038
- "eval_runtime": 1.9734,
1039
- "eval_samples_per_second": 50.675,
1040
- "eval_steps_per_second": 3.547,
1041
- "step": 128
1042
- },
1043
- {
1044
- "epoch": 4.062992125984252,
1045
- "grad_norm": 0.6840550899505615,
1046
- "learning_rate": 4.963703100517684e-05,
1047
- "loss": 0.9849,
1048
- "step": 129
1049
- },
1050
- {
1051
- "epoch": 4.094488188976378,
1052
- "grad_norm": 0.6858397722244263,
1053
- "learning_rate": 4.9611633196539584e-05,
1054
- "loss": 1.095,
1055
- "step": 130
1056
- },
1057
- {
1058
- "epoch": 4.125984251968504,
1059
- "grad_norm": 0.8086863160133362,
1060
- "learning_rate": 4.9585383551073694e-05,
1061
- "loss": 1.0421,
1062
- "step": 131
1063
- },
1064
- {
1065
- "epoch": 4.15748031496063,
1066
- "grad_norm": 0.6306203007698059,
1067
- "learning_rate": 4.955828297730949e-05,
1068
- "loss": 1.0594,
1069
- "step": 132
1070
- },
1071
- {
1072
- "epoch": 4.188976377952756,
1073
- "grad_norm": 0.6755027174949646,
1074
- "learning_rate": 4.953033241322886e-05,
1075
- "loss": 1.1747,
1076
- "step": 133
1077
- },
1078
- {
1079
- "epoch": 4.2204724409448815,
1080
- "grad_norm": 0.7362006902694702,
1081
- "learning_rate": 4.950153282623289e-05,
1082
- "loss": 1.1125,
1083
- "step": 134
1084
- },
1085
- {
1086
- "epoch": 4.251968503937007,
1087
- "grad_norm": 0.6847573518753052,
1088
- "learning_rate": 4.9471885213108274e-05,
1089
- "loss": 1.0188,
1090
- "step": 135
1091
- },
1092
- {
1093
- "epoch": 4.283464566929134,
1094
- "grad_norm": 0.6339418292045593,
1095
- "learning_rate": 4.9441390599992864e-05,
1096
- "loss": 1.0234,
1097
- "step": 136
1098
- },
1099
- {
1100
- "epoch": 4.283464566929134,
1101
- "eval_loss": 1.1559439897537231,
1102
- "eval_runtime": 1.9613,
1103
- "eval_samples_per_second": 50.987,
1104
- "eval_steps_per_second": 3.569,
1105
- "step": 136
1106
- },
1107
- {
1108
- "epoch": 4.31496062992126,
1109
- "grad_norm": 0.7027537822723389,
1110
- "learning_rate": 4.941005004234018e-05,
1111
- "loss": 1.1565,
1112
- "step": 137
1113
- },
1114
- {
1115
- "epoch": 4.346456692913386,
1116
- "grad_norm": 0.7245253324508667,
1117
- "learning_rate": 4.937786462488284e-05,
1118
- "loss": 1.2024,
1119
- "step": 138
1120
- },
1121
- {
1122
- "epoch": 4.377952755905512,
1123
- "grad_norm": 0.7252236008644104,
1124
- "learning_rate": 4.9344835461595014e-05,
1125
- "loss": 1.0075,
1126
- "step": 139
1127
- },
1128
- {
1129
- "epoch": 4.409448818897638,
1130
- "grad_norm": 0.6999779939651489,
1131
- "learning_rate": 4.93109636956539e-05,
1132
- "loss": 0.9823,
1133
- "step": 140
1134
- },
1135
- {
1136
- "epoch": 4.440944881889764,
1137
- "grad_norm": 0.7879251837730408,
1138
- "learning_rate": 4.927625049940013e-05,
1139
- "loss": 1.0108,
1140
- "step": 141
1141
- },
1142
- {
1143
- "epoch": 4.47244094488189,
1144
- "grad_norm": 0.7063820958137512,
1145
- "learning_rate": 4.9240697074297206e-05,
1146
- "loss": 1.043,
1147
- "step": 142
1148
- },
1149
- {
1150
- "epoch": 4.503937007874016,
1151
- "grad_norm": 0.710659921169281,
1152
- "learning_rate": 4.9204304650889915e-05,
1153
- "loss": 1.0339,
1154
- "step": 143
1155
- },
1156
- {
1157
- "epoch": 4.535433070866142,
1158
- "grad_norm": 0.772346019744873,
1159
- "learning_rate": 4.9167074488761735e-05,
1160
- "loss": 0.9867,
1161
- "step": 144
1162
- },
1163
- {
1164
- "epoch": 4.535433070866142,
1165
- "eval_loss": 1.1680593490600586,
1166
- "eval_runtime": 1.9681,
1167
- "eval_samples_per_second": 50.81,
1168
- "eval_steps_per_second": 3.557,
1169
- "step": 144
1170
- },
1171
- {
1172
- "epoch": 4.566929133858268,
1173
- "grad_norm": 0.8781888484954834,
1174
- "learning_rate": 4.912900787649124e-05,
1175
- "loss": 1.0622,
1176
- "step": 145
1177
- },
1178
- {
1179
- "epoch": 4.5984251968503935,
1180
- "grad_norm": 0.9489847421646118,
1181
- "learning_rate": 4.90901061316075e-05,
1182
- "loss": 1.0378,
1183
- "step": 146
1184
- },
1185
- {
1186
- "epoch": 4.6299212598425195,
1187
- "grad_norm": 0.8057307600975037,
1188
- "learning_rate": 4.90503706005445e-05,
1189
- "loss": 1.0674,
1190
- "step": 147
1191
- },
1192
- {
1193
- "epoch": 4.661417322834645,
1194
- "grad_norm": 0.9859374761581421,
1195
- "learning_rate": 4.900980265859448e-05,
1196
- "loss": 1.0841,
1197
- "step": 148
1198
- },
1199
- {
1200
- "epoch": 4.692913385826771,
1201
- "grad_norm": 0.8140759468078613,
1202
- "learning_rate": 4.896840370986042e-05,
1203
- "loss": 1.0617,
1204
- "step": 149
1205
- },
1206
- {
1207
- "epoch": 4.724409448818898,
1208
- "grad_norm": 0.8298155665397644,
1209
- "learning_rate": 4.892617518720737e-05,
1210
- "loss": 1.0421,
1211
- "step": 150
1212
- },
1213
- {
1214
- "epoch": 4.755905511811024,
1215
- "grad_norm": 0.8481395840644836,
1216
- "learning_rate": 4.888311855221289e-05,
1217
- "loss": 1.0279,
1218
- "step": 151
1219
- },
1220
- {
1221
- "epoch": 4.78740157480315,
1222
- "grad_norm": 0.7747954726219177,
1223
- "learning_rate": 4.883923529511646e-05,
1224
- "loss": 0.9092,
1225
- "step": 152
1226
- },
1227
- {
1228
- "epoch": 4.78740157480315,
1229
- "eval_loss": 1.1637229919433594,
1230
- "eval_runtime": 1.9694,
1231
- "eval_samples_per_second": 50.777,
1232
- "eval_steps_per_second": 3.554,
1233
- "step": 152
1234
- },
1235
- {
1236
- "epoch": 4.818897637795276,
1237
- "grad_norm": 0.8691744208335876,
1238
- "learning_rate": 4.8794526934767894e-05,
1239
- "loss": 1.0147,
1240
- "step": 153
1241
- },
1242
- {
1243
- "epoch": 4.850393700787402,
1244
- "grad_norm": 0.8221555352210999,
1245
- "learning_rate": 4.874899501857477e-05,
1246
- "loss": 1.0088,
1247
- "step": 154
1248
- },
1249
- {
1250
- "epoch": 4.881889763779528,
1251
- "grad_norm": 0.8129372000694275,
1252
- "learning_rate": 4.87026411224489e-05,
1253
- "loss": 1.0464,
1254
- "step": 155
1255
- },
1256
- {
1257
- "epoch": 4.913385826771654,
1258
- "grad_norm": 0.8731322884559631,
1259
- "learning_rate": 4.865546685075174e-05,
1260
- "loss": 1.0963,
1261
- "step": 156
1262
- },
1263
- {
1264
- "epoch": 4.94488188976378,
1265
- "grad_norm": 0.9633178114891052,
1266
- "learning_rate": 4.860747383623889e-05,
1267
- "loss": 1.0273,
1268
- "step": 157
1269
- },
1270
- {
1271
- "epoch": 4.9763779527559056,
1272
- "grad_norm": 0.8986226320266724,
1273
- "learning_rate": 4.85586637400036e-05,
1274
- "loss": 1.0807,
1275
- "step": 158
1276
- },
1277
- {
1278
- "epoch": 5.0078740157480315,
1279
- "grad_norm": 0.9396881461143494,
1280
- "learning_rate": 4.8509038251419196e-05,
1281
- "loss": 0.9067,
1282
- "step": 159
1283
- },
1284
- {
1285
- "epoch": 5.039370078740157,
1286
- "grad_norm": 0.8420762419700623,
1287
- "learning_rate": 4.8458599088080735e-05,
1288
- "loss": 0.947,
1289
- "step": 160
1290
- },
1291
- {
1292
- "epoch": 5.039370078740157,
1293
- "eval_loss": 1.1647462844848633,
1294
- "eval_runtime": 1.983,
1295
- "eval_samples_per_second": 50.43,
1296
- "eval_steps_per_second": 3.53,
1297
- "step": 160
1298
- },
1299
- {
1300
- "epoch": 5.070866141732283,
1301
- "grad_norm": 0.8484461307525635,
1302
- "learning_rate": 4.840734799574546e-05,
1303
- "loss": 0.9632,
1304
- "step": 161
1305
- },
1306
- {
1307
- "epoch": 5.102362204724409,
1308
- "grad_norm": 0.8203635215759277,
1309
- "learning_rate": 4.83552867482724e-05,
1310
- "loss": 0.8487,
1311
- "step": 162
1312
- },
1313
- {
1314
- "epoch": 5.133858267716535,
1315
- "grad_norm": 0.9048157930374146,
1316
- "learning_rate": 4.830241714756099e-05,
1317
- "loss": 1.011,
1318
- "step": 163
1319
- },
1320
- {
1321
- "epoch": 5.165354330708661,
1322
- "grad_norm": 0.832145094871521,
1323
- "learning_rate": 4.82487410234887e-05,
1324
- "loss": 0.9208,
1325
- "step": 164
1326
- },
1327
- {
1328
- "epoch": 5.196850393700787,
1329
- "grad_norm": 0.8314303755760193,
1330
- "learning_rate": 4.8194260233847695e-05,
1331
- "loss": 1.048,
1332
- "step": 165
1333
- },
1334
- {
1335
- "epoch": 5.228346456692913,
1336
- "grad_norm": 0.8757374882698059,
1337
- "learning_rate": 4.8138976664280536e-05,
1338
- "loss": 0.9722,
1339
- "step": 166
1340
- },
1341
- {
1342
- "epoch": 5.259842519685039,
1343
- "grad_norm": 0.818915605545044,
1344
- "learning_rate": 4.8082892228214906e-05,
1345
- "loss": 0.9581,
1346
- "step": 167
1347
- },
1348
- {
1349
- "epoch": 5.291338582677166,
1350
- "grad_norm": 0.8921651244163513,
1351
- "learning_rate": 4.8026008866797423e-05,
1352
- "loss": 0.9651,
1353
- "step": 168
1354
- },
1355
- {
1356
- "epoch": 5.291338582677166,
1357
- "eval_loss": 1.1861763000488281,
1358
- "eval_runtime": 3.1874,
1359
- "eval_samples_per_second": 31.374,
1360
- "eval_steps_per_second": 2.196,
1361
- "step": 168
1362
- },
1363
- {
1364
- "epoch": 5.322834645669292,
1365
- "grad_norm": 0.8719169497489929,
1366
- "learning_rate": 4.79683285488264e-05,
1367
- "loss": 0.9496,
1368
- "step": 169
1369
- },
1370
- {
1371
- "epoch": 5.354330708661418,
1372
- "grad_norm": 0.8905276656150818,
1373
- "learning_rate": 4.7909853270683756e-05,
1374
- "loss": 1.0808,
1375
- "step": 170
1376
- },
1377
- {
1378
- "epoch": 5.3858267716535435,
1379
- "grad_norm": 0.9556466341018677,
1380
- "learning_rate": 4.785058505626587e-05,
1381
- "loss": 0.8845,
1382
- "step": 171
1383
- },
1384
- {
1385
- "epoch": 5.417322834645669,
1386
- "grad_norm": 0.9415910840034485,
1387
- "learning_rate": 4.779052595691355e-05,
1388
- "loss": 0.9678,
1389
- "step": 172
1390
- },
1391
- {
1392
- "epoch": 5.448818897637795,
1393
- "grad_norm": 0.8661073446273804,
1394
- "learning_rate": 4.772967805134106e-05,
1395
- "loss": 1.0017,
1396
- "step": 173
1397
- },
1398
- {
1399
- "epoch": 5.480314960629921,
1400
- "grad_norm": 0.9943151473999023,
1401
- "learning_rate": 4.7668043445564134e-05,
1402
- "loss": 0.946,
1403
- "step": 174
1404
- },
1405
- {
1406
- "epoch": 5.511811023622047,
1407
- "grad_norm": 0.9678016304969788,
1408
- "learning_rate": 4.7605624272827126e-05,
1409
- "loss": 0.9621,
1410
- "step": 175
1411
- },
1412
- {
1413
- "epoch": 5.543307086614173,
1414
- "grad_norm": 0.9622290134429932,
1415
- "learning_rate": 4.754242269352912e-05,
1416
- "loss": 0.8979,
1417
- "step": 176
1418
- },
1419
- {
1420
- "epoch": 5.543307086614173,
1421
- "eval_loss": 1.1995248794555664,
1422
- "eval_runtime": 3.0442,
1423
- "eval_samples_per_second": 32.849,
1424
- "eval_steps_per_second": 2.299,
1425
- "step": 176
1426
- },
1427
- {
1428
- "epoch": 5.574803149606299,
1429
- "grad_norm": 0.9539031386375427,
1430
- "learning_rate": 4.747844089514919e-05,
1431
- "loss": 1.0158,
1432
- "step": 177
1433
- },
1434
- {
1435
- "epoch": 5.606299212598425,
1436
- "grad_norm": 1.084650993347168,
1437
- "learning_rate": 4.7413681092170715e-05,
1438
- "loss": 0.9219,
1439
- "step": 178
1440
- },
1441
- {
1442
- "epoch": 5.637795275590551,
1443
- "grad_norm": 0.9811678528785706,
1444
- "learning_rate": 4.734814552600469e-05,
1445
- "loss": 1.0474,
1446
- "step": 179
1447
- },
1448
- {
1449
- "epoch": 5.669291338582677,
1450
- "grad_norm": 0.8820479512214661,
1451
- "learning_rate": 4.728183646491214e-05,
1452
- "loss": 0.9529,
1453
- "step": 180
1454
- },
1455
- {
1456
- "epoch": 5.700787401574803,
1457
- "grad_norm": 1.0495096445083618,
1458
- "learning_rate": 4.7214756203925676e-05,
1459
- "loss": 0.9337,
1460
- "step": 181
1461
- },
1462
- {
1463
- "epoch": 5.73228346456693,
1464
- "grad_norm": 1.0566222667694092,
1465
- "learning_rate": 4.7146907064769994e-05,
1466
- "loss": 0.9897,
1467
- "step": 182
1468
- },
1469
- {
1470
- "epoch": 5.7637795275590555,
1471
- "grad_norm": 1.2577701807022095,
1472
- "learning_rate": 4.7078291395781554e-05,
1473
- "loss": 1.1532,
1474
- "step": 183
1475
- },
1476
- {
1477
- "epoch": 5.7952755905511815,
1478
- "grad_norm": 0.9294533729553223,
1479
- "learning_rate": 4.700891157182729e-05,
1480
- "loss": 1.0119,
1481
- "step": 184
1482
- },
1483
- {
1484
- "epoch": 5.7952755905511815,
1485
- "eval_loss": 1.192854881286621,
1486
- "eval_runtime": 3.055,
1487
- "eval_samples_per_second": 32.733,
1488
- "eval_steps_per_second": 2.291,
1489
- "step": 184
1490
- },
1491
- {
1492
- "epoch": 5.826771653543307,
1493
- "grad_norm": 0.9633037447929382,
1494
- "learning_rate": 4.693876999422241e-05,
1495
- "loss": 0.9447,
1496
- "step": 185
1497
- },
1498
- {
1499
- "epoch": 5.858267716535433,
1500
- "grad_norm": 0.9411280751228333,
1501
- "learning_rate": 4.686786909064729e-05,
1502
- "loss": 0.9115,
1503
- "step": 186
1504
- },
1505
- {
1506
- "epoch": 5.889763779527559,
1507
- "grad_norm": 0.9875217080116272,
1508
- "learning_rate": 4.679621131506347e-05,
1509
- "loss": 1.0157,
1510
- "step": 187
1511
- },
1512
- {
1513
- "epoch": 5.921259842519685,
1514
- "grad_norm": 0.9757392406463623,
1515
- "learning_rate": 4.6723799147628666e-05,
1516
- "loss": 0.9835,
1517
- "step": 188
1518
- },
1519
- {
1520
- "epoch": 5.952755905511811,
1521
- "grad_norm": 0.987448513507843,
1522
- "learning_rate": 4.665063509461097e-05,
1523
- "loss": 1.0326,
1524
- "step": 189
1525
- },
1526
- {
1527
- "epoch": 5.984251968503937,
1528
- "grad_norm": 0.985375702381134,
1529
- "learning_rate": 4.6576721688302105e-05,
1530
- "loss": 0.8626,
1531
- "step": 190
1532
- },
1533
- {
1534
- "epoch": 6.015748031496063,
1535
- "grad_norm": 1.0300512313842773,
1536
- "learning_rate": 4.650206148692977e-05,
1537
- "loss": 0.9618,
1538
- "step": 191
1539
- },
1540
- {
1541
- "epoch": 6.047244094488189,
1542
- "grad_norm": 0.9991244077682495,
1543
- "learning_rate": 4.642665707456908e-05,
1544
- "loss": 0.9045,
1545
- "step": 192
1546
- },
1547
- {
1548
- "epoch": 6.047244094488189,
1549
- "eval_loss": 1.2067475318908691,
1550
- "eval_runtime": 3.6768,
1551
- "eval_samples_per_second": 27.197,
1552
- "eval_steps_per_second": 1.904,
1553
- "step": 192
1554
- },
1555
- {
1556
- "epoch": 6.078740157480315,
1557
- "grad_norm": 0.8374089598655701,
1558
- "learning_rate": 4.635051106105316e-05,
1559
- "loss": 0.867,
1560
- "step": 193
1561
- },
1562
- {
1563
- "epoch": 6.110236220472441,
1564
- "grad_norm": 1.0568664073944092,
1565
- "learning_rate": 4.6273626081882805e-05,
1566
- "loss": 0.8759,
1567
- "step": 194
1568
- },
1569
- {
1570
- "epoch": 6.141732283464567,
1571
- "grad_norm": 1.0698866844177246,
1572
- "learning_rate": 4.619600479813524e-05,
1573
- "loss": 0.8402,
1574
- "step": 195
1575
- },
1576
- {
1577
- "epoch": 6.173228346456693,
1578
- "grad_norm": 1.1773091554641724,
1579
- "learning_rate": 4.611764989637205e-05,
1580
- "loss": 0.9268,
1581
- "step": 196
1582
- },
1583
- {
1584
- "epoch": 6.2047244094488185,
1585
- "grad_norm": 1.2211183309555054,
1586
- "learning_rate": 4.603856408854618e-05,
1587
- "loss": 0.8635,
1588
- "step": 197
1589
- },
1590
- {
1591
- "epoch": 6.2362204724409445,
1592
- "grad_norm": 1.0713794231414795,
1593
- "learning_rate": 4.595875011190807e-05,
1594
- "loss": 0.8548,
1595
- "step": 198
1596
- },
1597
- {
1598
- "epoch": 6.267716535433071,
1599
- "grad_norm": 1.2533146142959595,
1600
- "learning_rate": 4.5878210728910894e-05,
1601
- "loss": 0.8742,
1602
- "step": 199
1603
- },
1604
- {
1605
- "epoch": 6.299212598425197,
1606
- "grad_norm": 1.1741169691085815,
1607
- "learning_rate": 4.579694872711501e-05,
1608
- "loss": 0.9117,
1609
- "step": 200
1610
- },
1611
- {
1612
- "epoch": 6.299212598425197,
1613
- "eval_loss": 1.2343026399612427,
1614
- "eval_runtime": 3.7714,
1615
- "eval_samples_per_second": 26.515,
1616
- "eval_steps_per_second": 1.856,
1617
- "step": 200
1618
- },
1619
- {
1620
- "epoch": 6.330708661417323,
1621
- "grad_norm": 1.1068271398544312,
1622
- "learning_rate": 4.5714966919091415e-05,
1623
- "loss": 0.8904,
1624
- "step": 201
1625
- },
1626
- {
1627
- "epoch": 6.362204724409449,
1628
- "grad_norm": 1.2471762895584106,
1629
- "learning_rate": 4.563226814232444e-05,
1630
- "loss": 0.7685,
1631
- "step": 202
1632
- },
1633
- {
1634
- "epoch": 6.393700787401575,
1635
- "grad_norm": 1.0919934511184692,
1636
- "learning_rate": 4.554885525911351e-05,
1637
- "loss": 0.7867,
1638
- "step": 203
1639
- },
1640
- {
1641
- "epoch": 6.425196850393701,
1642
- "grad_norm": 1.1424957513809204,
1643
- "learning_rate": 4.5464731156474094e-05,
1644
- "loss": 0.9329,
1645
- "step": 204
1646
- },
1647
- {
1648
- "epoch": 6.456692913385827,
1649
- "grad_norm": 1.190438985824585,
1650
- "learning_rate": 4.5379898746037804e-05,
1651
- "loss": 0.8624,
1652
- "step": 205
1653
- },
1654
- {
1655
- "epoch": 6.488188976377953,
1656
- "grad_norm": 1.210954189300537,
1657
- "learning_rate": 4.529436096395156e-05,
1658
- "loss": 0.9547,
1659
- "step": 206
1660
- },
1661
- {
1662
- "epoch": 6.519685039370079,
1663
- "grad_norm": 1.2978620529174805,
1664
- "learning_rate": 4.520812077077604e-05,
1665
- "loss": 0.8868,
1666
- "step": 207
1667
- },
1668
- {
1669
- "epoch": 6.551181102362205,
1670
- "grad_norm": 1.081527590751648,
1671
- "learning_rate": 4.5121181151383143e-05,
1672
- "loss": 0.7841,
1673
- "step": 208
1674
- },
1675
- {
1676
- "epoch": 6.551181102362205,
1677
- "eval_loss": 1.2477138042449951,
1678
- "eval_runtime": 3.8991,
1679
- "eval_samples_per_second": 25.647,
1680
- "eval_steps_per_second": 1.795,
1681
- "step": 208
1682
- },
1683
- {
1684
- "epoch": 6.582677165354331,
1685
- "grad_norm": 1.1496070623397827,
1686
- "learning_rate": 4.503354511485273e-05,
1687
- "loss": 0.9487,
1688
- "step": 209
1689
- },
1690
- {
1691
- "epoch": 6.6141732283464565,
1692
- "grad_norm": 1.188530683517456,
1693
- "learning_rate": 4.494521569436845e-05,
1694
- "loss": 0.9439,
1695
- "step": 210
1696
  }
1697
  ],
1698
  "logging_steps": 1,
1699
- "max_steps": 634,
1700
  "num_input_tokens_seen": 0,
1701
- "num_train_epochs": 21,
1702
  "save_steps": 5,
1703
  "stateful_callbacks": {
1704
  "TrainerControl": {
@@ -1712,8 +179,8 @@
1712
  "attributes": {}
1713
  }
1714
  },
1715
- "total_flos": 3.4155451121664e+16,
1716
- "train_batch_size": 15,
1717
  "trial_name": null,
1718
  "trial_params": null
1719
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.4047619047619047,
5
+ "eval_steps": 3,
6
+ "global_step": 15,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.09523809523809523,
13
+ "grad_norm": 0.5343478322029114,
14
+ "learning_rate": 2e-05,
15
+ "loss": 1.3356,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.09523809523809523,
20
+ "eval_loss": 1.2671657800674438,
21
+ "eval_runtime": 20.0994,
22
+ "eval_samples_per_second": 4.975,
23
+ "eval_steps_per_second": 2.488,
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.19047619047619047,
28
+ "grad_norm": 0.5495722889900208,
29
+ "learning_rate": 4e-05,
30
+ "loss": 1.3576,
31
  "step": 2
32
  },
33
  {
34
+ "epoch": 0.2857142857142857,
35
+ "grad_norm": 0.48776495456695557,
36
+ "learning_rate": 6e-05,
37
+ "loss": 1.2159,
38
+ "step": 3
39
+ },
40
+ {
41
+ "epoch": 0.2857142857142857,
42
+ "eval_loss": 1.2533905506134033,
43
+ "eval_runtime": 24.7126,
44
+ "eval_samples_per_second": 4.047,
45
+ "eval_steps_per_second": 2.023,
46
  "step": 3
47
  },
48
  {
49
+ "epoch": 0.38095238095238093,
50
+ "grad_norm": 0.4593156576156616,
51
+ "learning_rate": 8e-05,
52
+ "loss": 1.2203,
53
  "step": 4
54
  },
55
  {
56
+ "epoch": 0.47619047619047616,
57
+ "grad_norm": 0.3861481249332428,
58
+ "learning_rate": 0.0001,
59
+ "loss": 1.278,
60
  "step": 5
61
  },
62
  {
63
+ "epoch": 0.5714285714285714,
64
+ "grad_norm": 0.26556482911109924,
65
+ "learning_rate": 0.00012,
66
+ "loss": 1.2716,
67
  "step": 6
68
  },
69
  {
70
+ "epoch": 0.5714285714285714,
71
+ "eval_loss": 1.209787130355835,
72
+ "eval_runtime": 20.5346,
73
+ "eval_samples_per_second": 4.87,
74
+ "eval_steps_per_second": 2.435,
75
+ "step": 6
76
+ },
77
+ {
78
+ "epoch": 0.6666666666666666,
79
+ "grad_norm": 0.3640616536140442,
80
+ "learning_rate": 0.00014,
81
+ "loss": 1.2188,
82
  "step": 7
83
  },
84
  {
85
+ "epoch": 0.7619047619047619,
86
+ "grad_norm": 0.49822962284088135,
87
+ "learning_rate": 0.00016,
88
+ "loss": 1.2624,
89
  "step": 8
90
  },
91
  {
92
+ "epoch": 0.8571428571428571,
93
+ "grad_norm": 0.42707493901252747,
94
+ "learning_rate": 0.00018,
95
+ "loss": 1.2697,
96
+ "step": 9
 
97
  },
98
  {
99
+ "epoch": 0.8571428571428571,
100
+ "eval_loss": 1.1909722089767456,
101
+ "eval_runtime": 20.1688,
102
+ "eval_samples_per_second": 4.958,
103
+ "eval_steps_per_second": 2.479,
104
  "step": 9
105
  },
106
  {
107
+ "epoch": 0.9523809523809523,
108
+ "grad_norm": 0.27150899171829224,
109
+ "learning_rate": 0.0002,
110
+ "loss": 1.215,
111
  "step": 10
112
  },
113
  {
114
+ "epoch": 1.0238095238095237,
115
+ "grad_norm": 0.23839783668518066,
116
+ "learning_rate": 0.00019876883405951377,
117
+ "loss": 1.2365,
118
  "step": 11
119
  },
120
  {
121
+ "epoch": 1.119047619047619,
122
+ "grad_norm": 0.20644636452198029,
123
+ "learning_rate": 0.00019510565162951537,
124
+ "loss": 1.2243,
125
+ "step": 12
126
+ },
127
+ {
128
+ "epoch": 1.119047619047619,
129
+ "eval_loss": 1.1754947900772095,
130
+ "eval_runtime": 21.1588,
131
+ "eval_samples_per_second": 4.726,
132
+ "eval_steps_per_second": 2.363,
133
  "step": 12
134
  },
135
  {
136
+ "epoch": 1.2142857142857142,
137
+ "grad_norm": 0.23169651627540588,
138
+ "learning_rate": 0.0001891006524188368,
139
+ "loss": 1.1474,
140
  "step": 13
141
  },
142
  {
143
+ "epoch": 1.3095238095238095,
144
+ "grad_norm": 0.21209821105003357,
145
+ "learning_rate": 0.00018090169943749476,
146
+ "loss": 1.1395,
147
  "step": 14
148
  },
149
  {
150
+ "epoch": 1.4047619047619047,
151
+ "grad_norm": 0.1954081654548645,
152
+ "learning_rate": 0.00017071067811865476,
153
+ "loss": 1.1981,
154
  "step": 15
155
  },
156
  {
157
+ "epoch": 1.4047619047619047,
158
+ "eval_loss": 1.1603792905807495,
159
+ "eval_runtime": 16.71,
160
+ "eval_samples_per_second": 5.984,
161
+ "eval_steps_per_second": 2.992,
162
+ "step": 15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  }
164
  ],
165
  "logging_steps": 1,
166
+ "max_steps": 30,
167
  "num_input_tokens_seen": 0,
168
+ "num_train_epochs": 3,
169
  "save_steps": 5,
170
  "stateful_callbacks": {
171
  "TrainerControl": {
 
179
  "attributes": {}
180
  }
181
  },
182
+ "total_flos": 2835124715520000.0,
183
+ "train_batch_size": 2,
184
  "trial_name": null,
185
  "trial_params": null
186
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:441f7463c0db2e1253eb6ddf59c934fab68dc0782a39afd6c59cfb134e8542f9
3
  size 6648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c62cd4a5f5b5d98624b05b91ea66842c68cee52403c2495c0459cf13d17cfcc
3
  size 6648