alicegoesdown commited on
Commit
b724832
·
verified ·
1 Parent(s): 6c2c737

Training in progress, step 150, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -14,17 +14,15 @@
14
  "lora_dropout": 0.1,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
- "modules_to_save": [
18
- "lm_head"
19
- ],
20
  "peft_type": "LORA",
21
  "r": 8,
22
  "rank_pattern": {},
23
  "revision": null,
24
  "target_modules": [
25
- "q_proj",
26
- "o_proj",
27
  "k_proj",
 
 
28
  "v_proj"
29
  ],
30
  "task_type": "CAUSAL_LM",
 
14
  "lora_dropout": 0.1,
15
  "megatron_config": null,
16
  "megatron_core": "megatron.core",
17
+ "modules_to_save": [],
 
 
18
  "peft_type": "LORA",
19
  "r": 8,
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
 
23
  "k_proj",
24
+ "o_proj",
25
+ "q_proj",
26
  "v_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e70898adc0001cf8d3cbd88db122187cf34c72c60ab6b0d25f03fe79664bfbe6
3
- size 532169208
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58a6dac7b8788cdf686e0493d47f426ab4e6370365936cf0127fc02865db7e27
3
+ size 6832520
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba56b0e54f1167dc39906ab61863f6f7efa026a9bdc2767ad4e86e543d464ac2
3
- size 1064413498
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7192f33aec7dae1d9eb5f8b448f192cc1be6d1e1d1fce1464dfc671d4cd435f0
3
+ size 13739450
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0e8cd75e9fb00d7e1c3e742ef94f1b5f25795aee0fd9312b8f905404776c3bb
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba79f79f4200644bcde298b8ba358af98910b10cc152e720addca023a2e47a37
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2608ab80398bcdaa9fbf626bf9fbe36aaf04727adaa3a55c222330d8a358f2f7
3
  size 1256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6186f1e9836e20fd270e1ab773c83f1027d92e426fd1d0a8c7816f8a9115c5fd
3
  size 1256
last-checkpoint/tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
3
- size 17209920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52716f60c3ad328509fa37cdded9a2f1196ecae463f5480f5d38c66a25e7a7dc
3
+ size 17210019
last-checkpoint/trainer_state.json CHANGED
@@ -1,1933 +1,125 @@
1
  {
2
- "best_metric": 1.9544912576675415,
3
- "best_model_checkpoint": "./output/checkpoint-1500",
4
- "epoch": 3.256704980842912,
5
  "eval_steps": 150,
6
- "global_step": 2550,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.01277139208173691,
13
- "grad_norm": 3.200807571411133,
14
- "learning_rate": 1.4e-06,
15
- "loss": 1.9026,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.02554278416347382,
20
- "grad_norm": 3.294923782348633,
21
- "learning_rate": 2.8e-06,
22
- "loss": 1.9388,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.038314176245210725,
27
- "grad_norm": 3.206088066101074,
28
- "learning_rate": 4.2e-06,
29
- "loss": 1.9022,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.05108556832694764,
34
- "grad_norm": 3.0478761196136475,
35
- "learning_rate": 5.6e-06,
36
- "loss": 1.8711,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.06385696040868455,
41
- "grad_norm": 3.636721611022949,
42
- "learning_rate": 7e-06,
43
- "loss": 1.9732,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.07662835249042145,
48
- "grad_norm": 3.1977827548980713,
49
- "learning_rate": 8.4e-06,
50
- "loss": 1.9163,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.08939974457215837,
55
- "grad_norm": 2.9971323013305664,
56
- "learning_rate": 9.8e-06,
57
- "loss": 1.9435,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.10217113665389528,
62
- "grad_norm": 3.2469263076782227,
63
- "learning_rate": 1.12e-05,
64
- "loss": 1.9411,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.11494252873563218,
69
- "grad_norm": 3.1737616062164307,
70
- "learning_rate": 1.2599999999999998e-05,
71
- "loss": 1.9255,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.1277139208173691,
76
- "grad_norm": 3.41223406791687,
77
- "learning_rate": 1.4e-05,
78
- "loss": 1.9807,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.140485312899106,
83
- "grad_norm": 2.883166551589966,
84
- "learning_rate": 1.5399999999999998e-05,
85
- "loss": 1.9229,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.1532567049808429,
90
- "grad_norm": 2.956953287124634,
91
- "learning_rate": 1.68e-05,
92
- "loss": 1.9432,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.16602809706257982,
97
- "grad_norm": 3.3282694816589355,
98
- "learning_rate": 1.82e-05,
99
- "loss": 1.9258,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.17879948914431673,
104
- "grad_norm": 3.1125221252441406,
105
- "learning_rate": 1.96e-05,
106
- "loss": 1.9145,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.19157088122605365,
111
- "grad_norm": 3.033933639526367,
112
- "learning_rate": 2.1e-05,
113
- "loss": 1.9399,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.19157088122605365,
118
- "eval_loss": 1.9976754188537598,
119
- "eval_runtime": 24.9012,
120
- "eval_samples_per_second": 20.079,
121
- "eval_steps_per_second": 20.079,
122
  "step": 150
123
- },
124
- {
125
- "epoch": 0.20434227330779056,
126
- "grad_norm": 3.3137006759643555,
127
- "learning_rate": 2.24e-05,
128
- "loss": 1.8722,
129
- "step": 160
130
- },
131
- {
132
- "epoch": 0.21711366538952745,
133
- "grad_norm": 3.511589288711548,
134
- "learning_rate": 2.38e-05,
135
- "loss": 1.897,
136
- "step": 170
137
- },
138
- {
139
- "epoch": 0.22988505747126436,
140
- "grad_norm": 3.4723284244537354,
141
- "learning_rate": 2.5199999999999996e-05,
142
- "loss": 1.8827,
143
- "step": 180
144
- },
145
- {
146
- "epoch": 0.24265644955300128,
147
- "grad_norm": 3.296243667602539,
148
- "learning_rate": 2.66e-05,
149
- "loss": 1.9298,
150
- "step": 190
151
- },
152
- {
153
- "epoch": 0.2554278416347382,
154
- "grad_norm": 3.188915491104126,
155
- "learning_rate": 2.8e-05,
156
- "loss": 1.9129,
157
- "step": 200
158
- },
159
- {
160
- "epoch": 0.2681992337164751,
161
- "grad_norm": 3.2900846004486084,
162
- "learning_rate": 2.9399999999999996e-05,
163
- "loss": 1.8282,
164
- "step": 210
165
- },
166
- {
167
- "epoch": 0.280970625798212,
168
- "grad_norm": 3.3161473274230957,
169
- "learning_rate": 3.0799999999999996e-05,
170
- "loss": 1.9102,
171
- "step": 220
172
- },
173
- {
174
- "epoch": 0.2937420178799489,
175
- "grad_norm": 3.259005546569824,
176
- "learning_rate": 3.22e-05,
177
- "loss": 1.948,
178
- "step": 230
179
- },
180
- {
181
- "epoch": 0.3065134099616858,
182
- "grad_norm": 3.051255941390991,
183
- "learning_rate": 3.36e-05,
184
- "loss": 1.8954,
185
- "step": 240
186
- },
187
- {
188
- "epoch": 0.31928480204342274,
189
- "grad_norm": 3.3548150062561035,
190
- "learning_rate": 3.5e-05,
191
- "loss": 1.9701,
192
- "step": 250
193
- },
194
- {
195
- "epoch": 0.33205619412515963,
196
- "grad_norm": 2.9780640602111816,
197
- "learning_rate": 3.64e-05,
198
- "loss": 1.908,
199
- "step": 260
200
- },
201
- {
202
- "epoch": 0.3448275862068966,
203
- "grad_norm": 3.386885404586792,
204
- "learning_rate": 3.78e-05,
205
- "loss": 1.9511,
206
- "step": 270
207
- },
208
- {
209
- "epoch": 0.35759897828863346,
210
- "grad_norm": 3.263582229614258,
211
- "learning_rate": 3.92e-05,
212
- "loss": 1.9289,
213
- "step": 280
214
- },
215
- {
216
- "epoch": 0.37037037037037035,
217
- "grad_norm": 3.1607635021209717,
218
- "learning_rate": 4.059999999999999e-05,
219
- "loss": 1.9594,
220
- "step": 290
221
- },
222
- {
223
- "epoch": 0.3831417624521073,
224
- "grad_norm": 3.176176071166992,
225
- "learning_rate": 4.2e-05,
226
- "loss": 1.9399,
227
- "step": 300
228
- },
229
- {
230
- "epoch": 0.3831417624521073,
231
- "eval_loss": 1.9805161952972412,
232
- "eval_runtime": 28.0922,
233
- "eval_samples_per_second": 17.799,
234
- "eval_steps_per_second": 17.799,
235
- "step": 300
236
- },
237
- {
238
- "epoch": 0.3959131545338442,
239
- "grad_norm": 3.4425015449523926,
240
- "learning_rate": 4.34e-05,
241
- "loss": 1.8902,
242
- "step": 310
243
- },
244
- {
245
- "epoch": 0.4086845466155811,
246
- "grad_norm": 3.13578724861145,
247
- "learning_rate": 4.48e-05,
248
- "loss": 1.9275,
249
- "step": 320
250
- },
251
- {
252
- "epoch": 0.421455938697318,
253
- "grad_norm": 3.6584582328796387,
254
- "learning_rate": 4.62e-05,
255
- "loss": 1.9625,
256
- "step": 330
257
- },
258
- {
259
- "epoch": 0.4342273307790549,
260
- "grad_norm": 3.0934386253356934,
261
- "learning_rate": 4.76e-05,
262
- "loss": 1.882,
263
- "step": 340
264
- },
265
- {
266
- "epoch": 0.44699872286079184,
267
- "grad_norm": 3.1788082122802734,
268
- "learning_rate": 4.899999999999999e-05,
269
- "loss": 1.8866,
270
- "step": 350
271
- },
272
- {
273
- "epoch": 0.45977011494252873,
274
- "grad_norm": 3.1974220275878906,
275
- "learning_rate": 5.039999999999999e-05,
276
- "loss": 1.9472,
277
- "step": 360
278
- },
279
- {
280
- "epoch": 0.4725415070242657,
281
- "grad_norm": 3.3801984786987305,
282
- "learning_rate": 5.179999999999999e-05,
283
- "loss": 1.9249,
284
- "step": 370
285
- },
286
- {
287
- "epoch": 0.48531289910600256,
288
- "grad_norm": 3.156172037124634,
289
- "learning_rate": 5.32e-05,
290
- "loss": 1.9025,
291
- "step": 380
292
- },
293
- {
294
- "epoch": 0.49808429118773945,
295
- "grad_norm": 3.3196117877960205,
296
- "learning_rate": 5.46e-05,
297
- "loss": 1.8319,
298
- "step": 390
299
- },
300
- {
301
- "epoch": 0.5108556832694764,
302
- "grad_norm": 3.410414457321167,
303
- "learning_rate": 5.6e-05,
304
- "loss": 1.9329,
305
- "step": 400
306
- },
307
- {
308
- "epoch": 0.5236270753512133,
309
- "grad_norm": 3.050872325897217,
310
- "learning_rate": 5.739999999999999e-05,
311
- "loss": 1.936,
312
- "step": 410
313
- },
314
- {
315
- "epoch": 0.5363984674329502,
316
- "grad_norm": 3.5527184009552,
317
- "learning_rate": 5.879999999999999e-05,
318
- "loss": 1.9004,
319
- "step": 420
320
- },
321
- {
322
- "epoch": 0.5491698595146871,
323
- "grad_norm": 3.099611282348633,
324
- "learning_rate": 6.019999999999999e-05,
325
- "loss": 1.9061,
326
- "step": 430
327
- },
328
- {
329
- "epoch": 0.561941251596424,
330
- "grad_norm": 3.0915510654449463,
331
- "learning_rate": 6.159999999999999e-05,
332
- "loss": 1.9015,
333
- "step": 440
334
- },
335
- {
336
- "epoch": 0.5747126436781609,
337
- "grad_norm": 3.2416725158691406,
338
- "learning_rate": 6.3e-05,
339
- "loss": 1.8938,
340
- "step": 450
341
- },
342
- {
343
- "epoch": 0.5747126436781609,
344
- "eval_loss": 1.970125675201416,
345
- "eval_runtime": 24.7662,
346
- "eval_samples_per_second": 20.189,
347
- "eval_steps_per_second": 20.189,
348
- "step": 450
349
- },
350
- {
351
- "epoch": 0.5874840357598978,
352
- "grad_norm": 2.9722657203674316,
353
- "learning_rate": 6.44e-05,
354
- "loss": 1.869,
355
- "step": 460
356
- },
357
- {
358
- "epoch": 0.6002554278416348,
359
- "grad_norm": 3.058877468109131,
360
- "learning_rate": 6.579999999999999e-05,
361
- "loss": 1.856,
362
- "step": 470
363
- },
364
- {
365
- "epoch": 0.6130268199233716,
366
- "grad_norm": 3.3449816703796387,
367
- "learning_rate": 6.72e-05,
368
- "loss": 1.8573,
369
- "step": 480
370
- },
371
- {
372
- "epoch": 0.6257982120051085,
373
- "grad_norm": 3.0998528003692627,
374
- "learning_rate": 6.859999999999999e-05,
375
- "loss": 1.8721,
376
- "step": 490
377
- },
378
- {
379
- "epoch": 0.6385696040868455,
380
- "grad_norm": 3.2949531078338623,
381
- "learning_rate": 7e-05,
382
- "loss": 1.9653,
383
- "step": 500
384
- },
385
- {
386
- "epoch": 0.6513409961685823,
387
- "grad_norm": 2.965726137161255,
388
- "learning_rate": 6.99991470746888e-05,
389
- "loss": 1.8915,
390
- "step": 510
391
- },
392
- {
393
- "epoch": 0.6641123882503193,
394
- "grad_norm": 3.32828950881958,
395
- "learning_rate": 6.999658834032565e-05,
396
- "loss": 1.8694,
397
- "step": 520
398
- },
399
- {
400
- "epoch": 0.6768837803320562,
401
- "grad_norm": 3.0084595680236816,
402
- "learning_rate": 6.999232392161959e-05,
403
- "loss": 1.8969,
404
- "step": 530
405
- },
406
- {
407
- "epoch": 0.6896551724137931,
408
- "grad_norm": 3.276498556137085,
409
- "learning_rate": 6.99863540264124e-05,
410
- "loss": 1.9219,
411
- "step": 540
412
- },
413
- {
414
- "epoch": 0.70242656449553,
415
- "grad_norm": 3.205116033554077,
416
- "learning_rate": 6.997867894566835e-05,
417
- "loss": 1.9566,
418
- "step": 550
419
- },
420
- {
421
- "epoch": 0.7151979565772669,
422
- "grad_norm": 3.1132071018218994,
423
- "learning_rate": 6.996929905346004e-05,
424
- "loss": 1.9083,
425
- "step": 560
426
- },
427
- {
428
- "epoch": 0.7279693486590039,
429
- "grad_norm": 2.9555888175964355,
430
- "learning_rate": 6.995821480695019e-05,
431
- "loss": 1.8563,
432
- "step": 570
433
- },
434
- {
435
- "epoch": 0.7407407407407407,
436
- "grad_norm": 3.2956736087799072,
437
- "learning_rate": 6.994542674636933e-05,
438
- "loss": 1.8805,
439
- "step": 580
440
- },
441
- {
442
- "epoch": 0.7535121328224776,
443
- "grad_norm": 3.101565361022949,
444
- "learning_rate": 6.99309354949895e-05,
445
- "loss": 1.8884,
446
- "step": 590
447
- },
448
- {
449
- "epoch": 0.7662835249042146,
450
- "grad_norm": 3.5506794452667236,
451
- "learning_rate": 6.991474175909384e-05,
452
- "loss": 1.9324,
453
- "step": 600
454
- },
455
- {
456
- "epoch": 0.7662835249042146,
457
- "eval_loss": 1.9721698760986328,
458
- "eval_runtime": 27.9596,
459
- "eval_samples_per_second": 17.883,
460
- "eval_steps_per_second": 17.883,
461
- "step": 600
462
- },
463
- {
464
- "epoch": 0.7790549169859514,
465
- "grad_norm": 3.1406140327453613,
466
- "learning_rate": 6.989684632794221e-05,
467
- "loss": 1.8976,
468
- "step": 610
469
- },
470
- {
471
- "epoch": 0.7918263090676884,
472
- "grad_norm": 3.364961624145508,
473
- "learning_rate": 6.987725007373265e-05,
474
- "loss": 1.8477,
475
- "step": 620
476
- },
477
- {
478
- "epoch": 0.8045977011494253,
479
- "grad_norm": 2.9893288612365723,
480
- "learning_rate": 6.985595395155896e-05,
481
- "loss": 1.9148,
482
- "step": 630
483
- },
484
- {
485
- "epoch": 0.8173690932311622,
486
- "grad_norm": 3.107212781906128,
487
- "learning_rate": 6.983295899936408e-05,
488
- "loss": 1.8768,
489
- "step": 640
490
- },
491
- {
492
- "epoch": 0.8301404853128991,
493
- "grad_norm": 3.053358554840088,
494
- "learning_rate": 6.980826633788956e-05,
495
- "loss": 1.9073,
496
- "step": 650
497
- },
498
- {
499
- "epoch": 0.842911877394636,
500
- "grad_norm": 2.9061360359191895,
501
- "learning_rate": 6.978187717062086e-05,
502
- "loss": 1.8477,
503
- "step": 660
504
- },
505
- {
506
- "epoch": 0.855683269476373,
507
- "grad_norm": 2.9826648235321045,
508
- "learning_rate": 6.975379278372878e-05,
509
- "loss": 1.8508,
510
- "step": 670
511
- },
512
- {
513
- "epoch": 0.8684546615581098,
514
- "grad_norm": 3.2406656742095947,
515
- "learning_rate": 6.972401454600672e-05,
516
- "loss": 1.9066,
517
- "step": 680
518
- },
519
- {
520
- "epoch": 0.8812260536398467,
521
- "grad_norm": 3.206275463104248,
522
- "learning_rate": 6.969254390880395e-05,
523
- "loss": 1.9108,
524
- "step": 690
525
- },
526
- {
527
- "epoch": 0.8939974457215837,
528
- "grad_norm": 3.2281596660614014,
529
- "learning_rate": 6.965938240595496e-05,
530
- "loss": 1.9246,
531
- "step": 700
532
- },
533
- {
534
- "epoch": 0.9067688378033205,
535
- "grad_norm": 3.3281443119049072,
536
- "learning_rate": 6.962453165370459e-05,
537
- "loss": 1.9756,
538
- "step": 710
539
- },
540
- {
541
- "epoch": 0.9195402298850575,
542
- "grad_norm": 3.3063316345214844,
543
- "learning_rate": 6.958799335062934e-05,
544
- "loss": 1.8813,
545
- "step": 720
546
- },
547
- {
548
- "epoch": 0.9323116219667944,
549
- "grad_norm": 2.862231969833374,
550
- "learning_rate": 6.954976927755458e-05,
551
- "loss": 1.8586,
552
- "step": 730
553
- },
554
- {
555
- "epoch": 0.9450830140485313,
556
- "grad_norm": 3.3289144039154053,
557
- "learning_rate": 6.950986129746767e-05,
558
- "loss": 1.8932,
559
- "step": 740
560
- },
561
- {
562
- "epoch": 0.9578544061302682,
563
- "grad_norm": 2.8503053188323975,
564
- "learning_rate": 6.946827135542728e-05,
565
- "loss": 1.8778,
566
- "step": 750
567
- },
568
- {
569
- "epoch": 0.9578544061302682,
570
- "eval_loss": 1.9627635478973389,
571
- "eval_runtime": 28.7652,
572
- "eval_samples_per_second": 17.382,
573
- "eval_steps_per_second": 17.382,
574
- "step": 750
575
- },
576
- {
577
- "epoch": 0.9706257982120051,
578
- "grad_norm": 3.0852253437042236,
579
- "learning_rate": 6.94250014784685e-05,
580
- "loss": 1.9039,
581
- "step": 760
582
- },
583
- {
584
- "epoch": 0.9833971902937421,
585
- "grad_norm": 3.4453940391540527,
586
- "learning_rate": 6.93800537755041e-05,
587
- "loss": 1.8881,
588
- "step": 770
589
- },
590
- {
591
- "epoch": 0.9961685823754789,
592
- "grad_norm": 2.987304925918579,
593
- "learning_rate": 6.93334304372217e-05,
594
- "loss": 1.8828,
595
- "step": 780
596
- },
597
- {
598
- "epoch": 1.0089399744572158,
599
- "grad_norm": 2.9873013496398926,
600
- "learning_rate": 6.928513373597703e-05,
601
- "loss": 1.8152,
602
- "step": 790
603
- },
604
- {
605
- "epoch": 1.0217113665389528,
606
- "grad_norm": 3.0838310718536377,
607
- "learning_rate": 6.923516602568319e-05,
608
- "loss": 1.7415,
609
- "step": 800
610
- },
611
- {
612
- "epoch": 1.0344827586206897,
613
- "grad_norm": 3.1033427715301514,
614
- "learning_rate": 6.918352974169587e-05,
615
- "loss": 1.7952,
616
- "step": 810
617
- },
618
- {
619
- "epoch": 1.0472541507024267,
620
- "grad_norm": 3.1028897762298584,
621
- "learning_rate": 6.913022740069471e-05,
622
- "loss": 1.7518,
623
- "step": 820
624
- },
625
- {
626
- "epoch": 1.0600255427841634,
627
- "grad_norm": 3.008892297744751,
628
- "learning_rate": 6.90752616005606e-05,
629
- "loss": 1.7723,
630
- "step": 830
631
- },
632
- {
633
- "epoch": 1.0727969348659003,
634
- "grad_norm": 3.101642370223999,
635
- "learning_rate": 6.901863502024912e-05,
636
- "loss": 1.7136,
637
- "step": 840
638
- },
639
- {
640
- "epoch": 1.0855683269476373,
641
- "grad_norm": 3.0864851474761963,
642
- "learning_rate": 6.896035041965987e-05,
643
- "loss": 1.7585,
644
- "step": 850
645
- },
646
- {
647
- "epoch": 1.0983397190293742,
648
- "grad_norm": 3.0635933876037598,
649
- "learning_rate": 6.890041063950208e-05,
650
- "loss": 1.7557,
651
- "step": 860
652
- },
653
- {
654
- "epoch": 1.1111111111111112,
655
- "grad_norm": 3.0961105823516846,
656
- "learning_rate": 6.883881860115608e-05,
657
- "loss": 1.7491,
658
- "step": 870
659
- },
660
- {
661
- "epoch": 1.123882503192848,
662
- "grad_norm": 3.00886607170105,
663
- "learning_rate": 6.87755773065309e-05,
664
- "loss": 1.7299,
665
- "step": 880
666
- },
667
- {
668
- "epoch": 1.136653895274585,
669
- "grad_norm": 2.9253010749816895,
670
- "learning_rate": 6.871068983791803e-05,
671
- "loss": 1.7236,
672
- "step": 890
673
- },
674
- {
675
- "epoch": 1.1494252873563218,
676
- "grad_norm": 3.2435286045074463,
677
- "learning_rate": 6.864415935784116e-05,
678
- "loss": 1.7262,
679
- "step": 900
680
- },
681
- {
682
- "epoch": 1.1494252873563218,
683
- "eval_loss": 1.9637728929519653,
684
- "eval_runtime": 26.2765,
685
- "eval_samples_per_second": 19.028,
686
- "eval_steps_per_second": 19.028,
687
- "step": 900
688
- },
689
- {
690
- "epoch": 1.1621966794380587,
691
- "grad_norm": 2.878432512283325,
692
- "learning_rate": 6.8575989108902e-05,
693
- "loss": 1.7455,
694
- "step": 910
695
- },
696
- {
697
- "epoch": 1.1749680715197957,
698
- "grad_norm": 2.9958291053771973,
699
- "learning_rate": 6.850618241362235e-05,
700
- "loss": 1.7218,
701
- "step": 920
702
- },
703
- {
704
- "epoch": 1.1877394636015326,
705
- "grad_norm": 3.107431173324585,
706
- "learning_rate": 6.843474267428202e-05,
707
- "loss": 1.7237,
708
- "step": 930
709
- },
710
- {
711
- "epoch": 1.2005108556832695,
712
- "grad_norm": 3.1416854858398438,
713
- "learning_rate": 6.836167337275314e-05,
714
- "loss": 1.7539,
715
- "step": 940
716
- },
717
- {
718
- "epoch": 1.2132822477650063,
719
- "grad_norm": 3.1158761978149414,
720
- "learning_rate": 6.828697807033038e-05,
721
- "loss": 1.7637,
722
- "step": 950
723
- },
724
- {
725
- "epoch": 1.2260536398467432,
726
- "grad_norm": 2.7915549278259277,
727
- "learning_rate": 6.821066040755737e-05,
728
- "loss": 1.688,
729
- "step": 960
730
- },
731
- {
732
- "epoch": 1.2388250319284801,
733
- "grad_norm": 3.083923816680908,
734
- "learning_rate": 6.813272410404936e-05,
735
- "loss": 1.7765,
736
- "step": 970
737
- },
738
- {
739
- "epoch": 1.251596424010217,
740
- "grad_norm": 3.111654281616211,
741
- "learning_rate": 6.805317295831182e-05,
742
- "loss": 1.7157,
743
- "step": 980
744
- },
745
- {
746
- "epoch": 1.264367816091954,
747
- "grad_norm": 2.9396989345550537,
748
- "learning_rate": 6.797201084755538e-05,
749
- "loss": 1.7147,
750
- "step": 990
751
- },
752
- {
753
- "epoch": 1.277139208173691,
754
- "grad_norm": 3.1540298461914062,
755
- "learning_rate": 6.788924172750679e-05,
756
- "loss": 1.755,
757
- "step": 1000
758
- },
759
- {
760
- "epoch": 1.289910600255428,
761
- "grad_norm": 3.1720669269561768,
762
- "learning_rate": 6.78048696322162e-05,
763
- "loss": 1.8097,
764
- "step": 1010
765
- },
766
- {
767
- "epoch": 1.3026819923371646,
768
- "grad_norm": 3.1514675617218018,
769
- "learning_rate": 6.77188986738605e-05,
770
- "loss": 1.7045,
771
- "step": 1020
772
- },
773
- {
774
- "epoch": 1.3154533844189016,
775
- "grad_norm": 2.926434278488159,
776
- "learning_rate": 6.763133304254292e-05,
777
- "loss": 1.7047,
778
- "step": 1030
779
- },
780
- {
781
- "epoch": 1.3282247765006385,
782
- "grad_norm": 3.011573314666748,
783
- "learning_rate": 6.75421770060888e-05,
784
- "loss": 1.7246,
785
- "step": 1040
786
- },
787
- {
788
- "epoch": 1.3409961685823755,
789
- "grad_norm": 3.360208749771118,
790
- "learning_rate": 6.745143490983756e-05,
791
- "loss": 1.823,
792
- "step": 1050
793
- },
794
- {
795
- "epoch": 1.3409961685823755,
796
- "eval_loss": 1.9604660272598267,
797
- "eval_runtime": 28.9496,
798
- "eval_samples_per_second": 17.271,
799
- "eval_steps_per_second": 17.271,
800
- "step": 1050
801
- },
802
- {
803
- "epoch": 1.3537675606641124,
804
- "grad_norm": 3.0648999214172363,
805
- "learning_rate": 6.735911117643095e-05,
806
- "loss": 1.7675,
807
- "step": 1060
808
- },
809
- {
810
- "epoch": 1.3665389527458494,
811
- "grad_norm": 2.986990451812744,
812
- "learning_rate": 6.726521030559751e-05,
813
- "loss": 1.7806,
814
- "step": 1070
815
- },
816
- {
817
- "epoch": 1.3793103448275863,
818
- "grad_norm": 2.9186227321624756,
819
- "learning_rate": 6.71697368739332e-05,
820
- "loss": 1.7838,
821
- "step": 1080
822
- },
823
- {
824
- "epoch": 1.392081736909323,
825
- "grad_norm": 2.8783702850341797,
826
- "learning_rate": 6.707269553467838e-05,
827
- "loss": 1.7621,
828
- "step": 1090
829
- },
830
- {
831
- "epoch": 1.40485312899106,
832
- "grad_norm": 3.190941333770752,
833
- "learning_rate": 6.697409101749102e-05,
834
- "loss": 1.7883,
835
- "step": 1100
836
- },
837
- {
838
- "epoch": 1.417624521072797,
839
- "grad_norm": 3.060194969177246,
840
- "learning_rate": 6.687392812821619e-05,
841
- "loss": 1.7676,
842
- "step": 1110
843
- },
844
- {
845
- "epoch": 1.4303959131545338,
846
- "grad_norm": 3.0321028232574463,
847
- "learning_rate": 6.677221174865179e-05,
848
- "loss": 1.7914,
849
- "step": 1120
850
- },
851
- {
852
- "epoch": 1.4431673052362708,
853
- "grad_norm": 2.939692497253418,
854
- "learning_rate": 6.666894683631068e-05,
855
- "loss": 1.7739,
856
- "step": 1130
857
- },
858
- {
859
- "epoch": 1.4559386973180077,
860
- "grad_norm": 2.909308433532715,
861
- "learning_rate": 6.656413842417897e-05,
862
- "loss": 1.7782,
863
- "step": 1140
864
- },
865
- {
866
- "epoch": 1.4687100893997447,
867
- "grad_norm": 3.142610788345337,
868
- "learning_rate": 6.645779162047084e-05,
869
- "loss": 1.7468,
870
- "step": 1150
871
- },
872
- {
873
- "epoch": 1.4814814814814814,
874
- "grad_norm": 3.127857208251953,
875
- "learning_rate": 6.634991160837945e-05,
876
- "loss": 1.8058,
877
- "step": 1160
878
- },
879
- {
880
- "epoch": 1.4942528735632183,
881
- "grad_norm": 2.9502968788146973,
882
- "learning_rate": 6.624050364582439e-05,
883
- "loss": 1.7747,
884
- "step": 1170
885
- },
886
- {
887
- "epoch": 1.5070242656449553,
888
- "grad_norm": 3.0171525478363037,
889
- "learning_rate": 6.612957306519541e-05,
890
- "loss": 1.8112,
891
- "step": 1180
892
- },
893
- {
894
- "epoch": 1.5197956577266922,
895
- "grad_norm": 3.0701234340667725,
896
- "learning_rate": 6.60171252730925e-05,
897
- "loss": 1.7723,
898
- "step": 1190
899
- },
900
- {
901
- "epoch": 1.5325670498084292,
902
- "grad_norm": 3.009382486343384,
903
- "learning_rate": 6.590316575006243e-05,
904
- "loss": 1.815,
905
- "step": 1200
906
- },
907
- {
908
- "epoch": 1.5325670498084292,
909
- "eval_loss": 1.9615885019302368,
910
- "eval_runtime": 27.2206,
911
- "eval_samples_per_second": 18.368,
912
- "eval_steps_per_second": 18.368,
913
- "step": 1200
914
- },
915
- {
916
- "epoch": 1.545338441890166,
917
- "grad_norm": 2.813608407974243,
918
- "learning_rate": 6.578770005033157e-05,
919
- "loss": 1.7336,
920
- "step": 1210
921
- },
922
- {
923
- "epoch": 1.558109833971903,
924
- "grad_norm": 3.030839443206787,
925
- "learning_rate": 6.567073380153521e-05,
926
- "loss": 1.7409,
927
- "step": 1220
928
- },
929
- {
930
- "epoch": 1.5708812260536398,
931
- "grad_norm": 2.8138840198516846,
932
- "learning_rate": 6.555227270444334e-05,
933
- "loss": 1.7933,
934
- "step": 1230
935
- },
936
- {
937
- "epoch": 1.5836526181353767,
938
- "grad_norm": 2.8430979251861572,
939
- "learning_rate": 6.543232253268266e-05,
940
- "loss": 1.7454,
941
- "step": 1240
942
- },
943
- {
944
- "epoch": 1.5964240102171137,
945
- "grad_norm": 3.018444538116455,
946
- "learning_rate": 6.531088913245535e-05,
947
- "loss": 1.7844,
948
- "step": 1250
949
- },
950
- {
951
- "epoch": 1.6091954022988506,
952
- "grad_norm": 2.859196662902832,
953
- "learning_rate": 6.518797842225401e-05,
954
- "loss": 1.7814,
955
- "step": 1260
956
- },
957
- {
958
- "epoch": 1.6219667943805876,
959
- "grad_norm": 2.9246723651885986,
960
- "learning_rate": 6.506359639257325e-05,
961
- "loss": 1.7289,
962
- "step": 1270
963
- },
964
- {
965
- "epoch": 1.6347381864623243,
966
- "grad_norm": 3.0037896633148193,
967
- "learning_rate": 6.493774910561772e-05,
968
- "loss": 1.7259,
969
- "step": 1280
970
- },
971
- {
972
- "epoch": 1.6475095785440614,
973
- "grad_norm": 3.0603103637695312,
974
- "learning_rate": 6.481044269500665e-05,
975
- "loss": 1.6821,
976
- "step": 1290
977
- },
978
- {
979
- "epoch": 1.6602809706257982,
980
- "grad_norm": 3.065397262573242,
981
- "learning_rate": 6.46816833654749e-05,
982
- "loss": 1.8109,
983
- "step": 1300
984
- },
985
- {
986
- "epoch": 1.673052362707535,
987
- "grad_norm": 3.0101401805877686,
988
- "learning_rate": 6.455147739257053e-05,
989
- "loss": 1.7583,
990
- "step": 1310
991
- },
992
- {
993
- "epoch": 1.685823754789272,
994
- "grad_norm": 3.0239782333374023,
995
- "learning_rate": 6.441983112234894e-05,
996
- "loss": 1.759,
997
- "step": 1320
998
- },
999
- {
1000
- "epoch": 1.698595146871009,
1001
- "grad_norm": 3.068173408508301,
1002
- "learning_rate": 6.428675097106366e-05,
1003
- "loss": 1.7957,
1004
- "step": 1330
1005
- },
1006
- {
1007
- "epoch": 1.711366538952746,
1008
- "grad_norm": 3.2432665824890137,
1009
- "learning_rate": 6.415224342485348e-05,
1010
- "loss": 1.7558,
1011
- "step": 1340
1012
- },
1013
- {
1014
- "epoch": 1.7241379310344827,
1015
- "grad_norm": 3.144869327545166,
1016
- "learning_rate": 6.401631503942645e-05,
1017
- "loss": 1.7613,
1018
- "step": 1350
1019
- },
1020
- {
1021
- "epoch": 1.7241379310344827,
1022
- "eval_loss": 1.9588252305984497,
1023
- "eval_runtime": 27.7722,
1024
- "eval_samples_per_second": 18.004,
1025
- "eval_steps_per_second": 18.004,
1026
- "step": 1350
1027
- },
1028
- {
1029
- "epoch": 1.7369093231162198,
1030
- "grad_norm": 2.8430140018463135,
1031
- "learning_rate": 6.387897243974032e-05,
1032
- "loss": 1.767,
1033
- "step": 1360
1034
- },
1035
- {
1036
- "epoch": 1.7496807151979565,
1037
- "grad_norm": 3.0153555870056152,
1038
- "learning_rate": 6.374022231967963e-05,
1039
- "loss": 1.7671,
1040
- "step": 1370
1041
- },
1042
- {
1043
- "epoch": 1.7624521072796935,
1044
- "grad_norm": 2.876494884490967,
1045
- "learning_rate": 6.360007144172949e-05,
1046
- "loss": 1.7545,
1047
- "step": 1380
1048
- },
1049
- {
1050
- "epoch": 1.7752234993614304,
1051
- "grad_norm": 3.244321823120117,
1052
- "learning_rate": 6.345852663664596e-05,
1053
- "loss": 1.8216,
1054
- "step": 1390
1055
- },
1056
- {
1057
- "epoch": 1.7879948914431671,
1058
- "grad_norm": 2.9006967544555664,
1059
- "learning_rate": 6.331559480312315e-05,
1060
- "loss": 1.7792,
1061
- "step": 1400
1062
- },
1063
- {
1064
- "epoch": 1.8007662835249043,
1065
- "grad_norm": 2.7696609497070312,
1066
- "learning_rate": 6.317128290745699e-05,
1067
- "loss": 1.7941,
1068
- "step": 1410
1069
- },
1070
- {
1071
- "epoch": 1.813537675606641,
1072
- "grad_norm": 2.7425568103790283,
1073
- "learning_rate": 6.302559798320566e-05,
1074
- "loss": 1.7169,
1075
- "step": 1420
1076
- },
1077
- {
1078
- "epoch": 1.8263090676883782,
1079
- "grad_norm": 3.130302906036377,
1080
- "learning_rate": 6.287854713084686e-05,
1081
- "loss": 1.7979,
1082
- "step": 1430
1083
- },
1084
- {
1085
- "epoch": 1.839080459770115,
1086
- "grad_norm": 2.8706188201904297,
1087
- "learning_rate": 6.273013751743166e-05,
1088
- "loss": 1.7407,
1089
- "step": 1440
1090
- },
1091
- {
1092
- "epoch": 1.8518518518518519,
1093
- "grad_norm": 3.3041722774505615,
1094
- "learning_rate": 6.258037637623526e-05,
1095
- "loss": 1.7932,
1096
- "step": 1450
1097
- },
1098
- {
1099
- "epoch": 1.8646232439335888,
1100
- "grad_norm": 3.403945207595825,
1101
- "learning_rate": 6.242927100640439e-05,
1102
- "loss": 1.7524,
1103
- "step": 1460
1104
- },
1105
- {
1106
- "epoch": 1.8773946360153255,
1107
- "grad_norm": 3.074903964996338,
1108
- "learning_rate": 6.22768287726016e-05,
1109
- "loss": 1.8342,
1110
- "step": 1470
1111
- },
1112
- {
1113
- "epoch": 1.8901660280970627,
1114
- "grad_norm": 2.789820671081543,
1115
- "learning_rate": 6.212305710464628e-05,
1116
- "loss": 1.7446,
1117
- "step": 1480
1118
- },
1119
- {
1120
- "epoch": 1.9029374201787994,
1121
- "grad_norm": 3.0101871490478516,
1122
- "learning_rate": 6.196796349715262e-05,
1123
- "loss": 1.7563,
1124
- "step": 1490
1125
- },
1126
- {
1127
- "epoch": 1.9157088122605364,
1128
- "grad_norm": 2.9494054317474365,
1129
- "learning_rate": 6.181155550916422e-05,
1130
- "loss": 1.7892,
1131
- "step": 1500
1132
- },
1133
- {
1134
- "epoch": 1.9157088122605364,
1135
- "eval_loss": 1.9544912576675415,
1136
- "eval_runtime": 26.8049,
1137
- "eval_samples_per_second": 18.653,
1138
- "eval_steps_per_second": 18.653,
1139
- "step": 1500
1140
- },
1141
- {
1142
- "epoch": 1.9284802043422733,
1143
- "grad_norm": 2.912105083465576,
1144
- "learning_rate": 6.165384076378578e-05,
1145
- "loss": 1.7858,
1146
- "step": 1510
1147
- },
1148
- {
1149
- "epoch": 1.9412515964240102,
1150
- "grad_norm": 3.1038968563079834,
1151
- "learning_rate": 6.149482694781147e-05,
1152
- "loss": 1.8045,
1153
- "step": 1520
1154
- },
1155
- {
1156
- "epoch": 1.9540229885057472,
1157
- "grad_norm": 3.214099407196045,
1158
- "learning_rate": 6.133452181135035e-05,
1159
- "loss": 1.771,
1160
- "step": 1530
1161
- },
1162
- {
1163
- "epoch": 1.966794380587484,
1164
- "grad_norm": 2.9663312435150146,
1165
- "learning_rate": 6.117293316744862e-05,
1166
- "loss": 1.7892,
1167
- "step": 1540
1168
- },
1169
- {
1170
- "epoch": 1.979565772669221,
1171
- "grad_norm": 2.9719676971435547,
1172
- "learning_rate": 6.101006889170879e-05,
1173
- "loss": 1.773,
1174
- "step": 1550
1175
- },
1176
- {
1177
- "epoch": 1.9923371647509578,
1178
- "grad_norm": 3.2106828689575195,
1179
- "learning_rate": 6.0845936921905935e-05,
1180
- "loss": 1.7839,
1181
- "step": 1560
1182
- },
1183
- {
1184
- "epoch": 2.005108556832695,
1185
- "grad_norm": 2.85646390914917,
1186
- "learning_rate": 6.068054525760066e-05,
1187
- "loss": 1.6438,
1188
- "step": 1570
1189
- },
1190
- {
1191
- "epoch": 2.0178799489144317,
1192
- "grad_norm": 2.613004684448242,
1193
- "learning_rate": 6.0513901959749396e-05,
1194
- "loss": 1.594,
1195
- "step": 1580
1196
- },
1197
- {
1198
- "epoch": 2.0306513409961684,
1199
- "grad_norm": 2.919440984725952,
1200
- "learning_rate": 6.0346015150311366e-05,
1201
- "loss": 1.6577,
1202
- "step": 1590
1203
- },
1204
- {
1205
- "epoch": 2.0434227330779056,
1206
- "grad_norm": 2.744115114212036,
1207
- "learning_rate": 6.017689301185278e-05,
1208
- "loss": 1.6052,
1209
- "step": 1600
1210
- },
1211
- {
1212
- "epoch": 2.0561941251596423,
1213
- "grad_norm": 2.954751491546631,
1214
- "learning_rate": 6.000654378714811e-05,
1215
- "loss": 1.6132,
1216
- "step": 1610
1217
- },
1218
- {
1219
- "epoch": 2.0689655172413794,
1220
- "grad_norm": 2.783085823059082,
1221
- "learning_rate": 5.983497577877823e-05,
1222
- "loss": 1.6281,
1223
- "step": 1620
1224
- },
1225
- {
1226
- "epoch": 2.081736909323116,
1227
- "grad_norm": 2.958265781402588,
1228
- "learning_rate": 5.966219734872581e-05,
1229
- "loss": 1.5537,
1230
- "step": 1630
1231
- },
1232
- {
1233
- "epoch": 2.0945083014048533,
1234
- "grad_norm": 2.761991262435913,
1235
- "learning_rate": 5.9488216917967784e-05,
1236
- "loss": 1.6174,
1237
- "step": 1640
1238
- },
1239
- {
1240
- "epoch": 2.10727969348659,
1241
- "grad_norm": 2.8865253925323486,
1242
- "learning_rate": 5.9313042966064896e-05,
1243
- "loss": 1.6407,
1244
- "step": 1650
1245
- },
1246
- {
1247
- "epoch": 2.10727969348659,
1248
- "eval_loss": 1.9608972072601318,
1249
- "eval_runtime": 26.9926,
1250
- "eval_samples_per_second": 18.524,
1251
- "eval_steps_per_second": 18.524,
1252
- "step": 1650
1253
- },
1254
- {
1255
- "epoch": 2.1200510855683268,
1256
- "grad_norm": 2.831836700439453,
1257
- "learning_rate": 5.9136684030748436e-05,
1258
- "loss": 1.6225,
1259
- "step": 1660
1260
- },
1261
- {
1262
- "epoch": 2.132822477650064,
1263
- "grad_norm": 2.761368751525879,
1264
- "learning_rate": 5.89591487075041e-05,
1265
- "loss": 1.6014,
1266
- "step": 1670
1267
- },
1268
- {
1269
- "epoch": 2.1455938697318007,
1270
- "grad_norm": 2.7214226722717285,
1271
- "learning_rate": 5.8780445649153075e-05,
1272
- "loss": 1.6009,
1273
- "step": 1680
1274
- },
1275
- {
1276
- "epoch": 2.158365261813538,
1277
- "grad_norm": 3.010241746902466,
1278
- "learning_rate": 5.860058356543031e-05,
1279
- "loss": 1.5901,
1280
- "step": 1690
1281
- },
1282
- {
1283
- "epoch": 2.1711366538952745,
1284
- "grad_norm": 3.019549608230591,
1285
- "learning_rate": 5.8419571222560034e-05,
1286
- "loss": 1.5947,
1287
- "step": 1700
1288
- },
1289
- {
1290
- "epoch": 2.1839080459770113,
1291
- "grad_norm": 2.638648509979248,
1292
- "learning_rate": 5.823741744282845e-05,
1293
- "loss": 1.566,
1294
- "step": 1710
1295
- },
1296
- {
1297
- "epoch": 2.1966794380587484,
1298
- "grad_norm": 2.9191718101501465,
1299
- "learning_rate": 5.805413110415381e-05,
1300
- "loss": 1.5797,
1301
- "step": 1720
1302
- },
1303
- {
1304
- "epoch": 2.209450830140485,
1305
- "grad_norm": 3.0825226306915283,
1306
- "learning_rate": 5.786972113965369e-05,
1307
- "loss": 1.6247,
1308
- "step": 1730
1309
- },
1310
- {
1311
- "epoch": 2.2222222222222223,
1312
- "grad_norm": 3.186307191848755,
1313
- "learning_rate": 5.7684196537209574e-05,
1314
- "loss": 1.6044,
1315
- "step": 1740
1316
- },
1317
- {
1318
- "epoch": 2.234993614303959,
1319
- "grad_norm": 2.94689679145813,
1320
- "learning_rate": 5.749756633902887e-05,
1321
- "loss": 1.5743,
1322
- "step": 1750
1323
- },
1324
- {
1325
- "epoch": 2.247765006385696,
1326
- "grad_norm": 3.0586421489715576,
1327
- "learning_rate": 5.7309839641204136e-05,
1328
- "loss": 1.6237,
1329
- "step": 1760
1330
- },
1331
- {
1332
- "epoch": 2.260536398467433,
1333
- "grad_norm": 3.0845377445220947,
1334
- "learning_rate": 5.7121025593269777e-05,
1335
- "loss": 1.6194,
1336
- "step": 1770
1337
- },
1338
- {
1339
- "epoch": 2.27330779054917,
1340
- "grad_norm": 3.0147876739501953,
1341
- "learning_rate": 5.693113339775611e-05,
1342
- "loss": 1.5865,
1343
- "step": 1780
1344
- },
1345
- {
1346
- "epoch": 2.286079182630907,
1347
- "grad_norm": 3.1859536170959473,
1348
- "learning_rate": 5.674017230974085e-05,
1349
- "loss": 1.6086,
1350
- "step": 1790
1351
- },
1352
- {
1353
- "epoch": 2.2988505747126435,
1354
- "grad_norm": 3.106766939163208,
1355
- "learning_rate": 5.654815163639803e-05,
1356
- "loss": 1.5977,
1357
- "step": 1800
1358
- },
1359
- {
1360
- "epoch": 2.2988505747126435,
1361
- "eval_loss": 1.9625232219696045,
1362
- "eval_runtime": 28.5711,
1363
- "eval_samples_per_second": 17.5,
1364
- "eval_steps_per_second": 17.5,
1365
- "step": 1800
1366
- },
1367
- {
1368
- "epoch": 2.3116219667943807,
1369
- "grad_norm": 2.9333529472351074,
1370
- "learning_rate": 5.654815163639803e-07,
1371
- "loss": 1.5856,
1372
- "step": 1810
1373
- },
1374
- {
1375
- "epoch": 2.3243933588761174,
1376
- "grad_norm": 2.9890449047088623,
1377
- "learning_rate": 1.1309630327279607e-06,
1378
- "loss": 1.584,
1379
- "step": 1820
1380
- },
1381
- {
1382
- "epoch": 2.3371647509578546,
1383
- "grad_norm": 3.300159215927124,
1384
- "learning_rate": 1.6964445490919409e-06,
1385
- "loss": 1.5921,
1386
- "step": 1830
1387
- },
1388
- {
1389
- "epoch": 2.3499361430395913,
1390
- "grad_norm": 3.0725886821746826,
1391
- "learning_rate": 2.2619260654559213e-06,
1392
- "loss": 1.5305,
1393
- "step": 1840
1394
- },
1395
- {
1396
- "epoch": 2.362707535121328,
1397
- "grad_norm": 2.9211957454681396,
1398
- "learning_rate": 2.8274075818199017e-06,
1399
- "loss": 1.6247,
1400
- "step": 1850
1401
- },
1402
- {
1403
- "epoch": 2.375478927203065,
1404
- "grad_norm": 2.942033052444458,
1405
- "learning_rate": 3.3928890981838817e-06,
1406
- "loss": 1.5717,
1407
- "step": 1860
1408
- },
1409
- {
1410
- "epoch": 2.388250319284802,
1411
- "grad_norm": 2.8808839321136475,
1412
- "learning_rate": 3.958370614547863e-06,
1413
- "loss": 1.606,
1414
- "step": 1870
1415
- },
1416
- {
1417
- "epoch": 2.401021711366539,
1418
- "grad_norm": 2.9812419414520264,
1419
- "learning_rate": 4.523852130911843e-06,
1420
- "loss": 1.5825,
1421
- "step": 1880
1422
- },
1423
- {
1424
- "epoch": 2.413793103448276,
1425
- "grad_norm": 3.1213929653167725,
1426
- "learning_rate": 5.089333647275823e-06,
1427
- "loss": 1.6172,
1428
- "step": 1890
1429
- },
1430
- {
1431
- "epoch": 2.4265644955300125,
1432
- "grad_norm": 2.865854501724243,
1433
- "learning_rate": 5.6548151636398035e-06,
1434
- "loss": 1.6028,
1435
- "step": 1900
1436
- },
1437
- {
1438
- "epoch": 2.4393358876117497,
1439
- "grad_norm": 3.07641863822937,
1440
- "learning_rate": 6.220296680003784e-06,
1441
- "loss": 1.6674,
1442
- "step": 1910
1443
- },
1444
- {
1445
- "epoch": 2.4521072796934864,
1446
- "grad_norm": 2.7488229274749756,
1447
- "learning_rate": 6.7857781963677635e-06,
1448
- "loss": 1.5994,
1449
- "step": 1920
1450
- },
1451
- {
1452
- "epoch": 2.4648786717752236,
1453
- "grad_norm": 2.993471384048462,
1454
- "learning_rate": 7.351259712731745e-06,
1455
- "loss": 1.6039,
1456
- "step": 1930
1457
- },
1458
- {
1459
- "epoch": 2.4776500638569603,
1460
- "grad_norm": 2.7299869060516357,
1461
- "learning_rate": 7.916741229095726e-06,
1462
- "loss": 1.5046,
1463
- "step": 1940
1464
- },
1465
- {
1466
- "epoch": 2.4904214559386975,
1467
- "grad_norm": 3.036618947982788,
1468
- "learning_rate": 8.482222745459704e-06,
1469
- "loss": 1.6052,
1470
- "step": 1950
1471
- },
1472
- {
1473
- "epoch": 2.4904214559386975,
1474
- "eval_loss": 1.9600526094436646,
1475
- "eval_runtime": 26.6264,
1476
- "eval_samples_per_second": 18.778,
1477
- "eval_steps_per_second": 18.778,
1478
- "step": 1950
1479
- },
1480
- {
1481
- "epoch": 2.503192848020434,
1482
- "grad_norm": 3.1929354667663574,
1483
- "learning_rate": 9.047704261823685e-06,
1484
- "loss": 1.632,
1485
- "step": 1960
1486
- },
1487
- {
1488
- "epoch": 2.5159642401021713,
1489
- "grad_norm": 3.241421699523926,
1490
- "learning_rate": 9.613185778187667e-06,
1491
- "loss": 1.5757,
1492
- "step": 1970
1493
- },
1494
- {
1495
- "epoch": 2.528735632183908,
1496
- "grad_norm": 2.8949759006500244,
1497
- "learning_rate": 1.0178667294551646e-05,
1498
- "loss": 1.6781,
1499
- "step": 1980
1500
- },
1501
- {
1502
- "epoch": 2.541507024265645,
1503
- "grad_norm": 2.983260154724121,
1504
- "learning_rate": 1.0744148810915626e-05,
1505
- "loss": 1.6403,
1506
- "step": 1990
1507
- },
1508
- {
1509
- "epoch": 2.554278416347382,
1510
- "grad_norm": 3.2770559787750244,
1511
- "learning_rate": 1.1309630327279607e-05,
1512
- "loss": 1.6642,
1513
- "step": 2000
1514
- },
1515
- {
1516
- "epoch": 2.5670498084291187,
1517
- "grad_norm": 2.995584487915039,
1518
- "learning_rate": 1.1875111843643587e-05,
1519
- "loss": 1.6282,
1520
- "step": 2010
1521
- },
1522
- {
1523
- "epoch": 2.579821200510856,
1524
- "grad_norm": 3.113140106201172,
1525
- "learning_rate": 1.2440593360007568e-05,
1526
- "loss": 1.5997,
1527
- "step": 2020
1528
- },
1529
- {
1530
- "epoch": 2.5925925925925926,
1531
- "grad_norm": 2.9088339805603027,
1532
- "learning_rate": 1.3006074876371547e-05,
1533
- "loss": 1.6134,
1534
- "step": 2030
1535
- },
1536
- {
1537
- "epoch": 2.6053639846743293,
1538
- "grad_norm": 2.9703259468078613,
1539
- "learning_rate": 1.3571556392735527e-05,
1540
- "loss": 1.5381,
1541
- "step": 2040
1542
- },
1543
- {
1544
- "epoch": 2.6181353767560664,
1545
- "grad_norm": 3.0287814140319824,
1546
- "learning_rate": 1.4137037909099508e-05,
1547
- "loss": 1.6708,
1548
- "step": 2050
1549
- },
1550
- {
1551
- "epoch": 2.630906768837803,
1552
- "grad_norm": 3.0678138732910156,
1553
- "learning_rate": 1.470251942546349e-05,
1554
- "loss": 1.6232,
1555
- "step": 2060
1556
- },
1557
- {
1558
- "epoch": 2.6436781609195403,
1559
- "grad_norm": 2.8624658584594727,
1560
- "learning_rate": 1.526800094182747e-05,
1561
- "loss": 1.5495,
1562
- "step": 2070
1563
- },
1564
- {
1565
- "epoch": 2.656449553001277,
1566
- "grad_norm": 3.0500433444976807,
1567
- "learning_rate": 1.5833482458191452e-05,
1568
- "loss": 1.5992,
1569
- "step": 2080
1570
- },
1571
- {
1572
- "epoch": 2.6692209450830138,
1573
- "grad_norm": 2.9770045280456543,
1574
- "learning_rate": 1.639896397455543e-05,
1575
- "loss": 1.6213,
1576
- "step": 2090
1577
- },
1578
- {
1579
- "epoch": 2.681992337164751,
1580
- "grad_norm": 3.110860586166382,
1581
- "learning_rate": 1.6964445490919408e-05,
1582
- "loss": 1.5852,
1583
- "step": 2100
1584
- },
1585
- {
1586
- "epoch": 2.681992337164751,
1587
- "eval_loss": 1.957663893699646,
1588
- "eval_runtime": 26.8577,
1589
- "eval_samples_per_second": 18.617,
1590
- "eval_steps_per_second": 18.617,
1591
- "step": 2100
1592
- },
1593
- {
1594
- "epoch": 2.694763729246488,
1595
- "grad_norm": 2.987915515899658,
1596
- "learning_rate": 1.752992700728339e-05,
1597
- "loss": 1.613,
1598
- "step": 2110
1599
- },
1600
- {
1601
- "epoch": 2.707535121328225,
1602
- "grad_norm": 3.1154093742370605,
1603
- "learning_rate": 1.809540852364737e-05,
1604
- "loss": 1.6002,
1605
- "step": 2120
1606
- },
1607
- {
1608
- "epoch": 2.7203065134099615,
1609
- "grad_norm": 3.175532817840576,
1610
- "learning_rate": 1.8660890040011352e-05,
1611
- "loss": 1.5809,
1612
- "step": 2130
1613
- },
1614
- {
1615
- "epoch": 2.7330779054916987,
1616
- "grad_norm": 2.9489593505859375,
1617
- "learning_rate": 1.9226371556375333e-05,
1618
- "loss": 1.5821,
1619
- "step": 2140
1620
- },
1621
- {
1622
- "epoch": 2.7458492975734354,
1623
- "grad_norm": 2.855994939804077,
1624
- "learning_rate": 1.979185307273931e-05,
1625
- "loss": 1.5858,
1626
- "step": 2150
1627
- },
1628
- {
1629
- "epoch": 2.7586206896551726,
1630
- "grad_norm": 2.9239962100982666,
1631
- "learning_rate": 2.0357334589103292e-05,
1632
- "loss": 1.5816,
1633
- "step": 2160
1634
- },
1635
- {
1636
- "epoch": 2.7713920817369093,
1637
- "grad_norm": 2.9092483520507812,
1638
- "learning_rate": 2.0922816105467273e-05,
1639
- "loss": 1.6278,
1640
- "step": 2170
1641
- },
1642
- {
1643
- "epoch": 2.784163473818646,
1644
- "grad_norm": 3.0730059146881104,
1645
- "learning_rate": 2.148829762183125e-05,
1646
- "loss": 1.587,
1647
- "step": 2180
1648
- },
1649
- {
1650
- "epoch": 2.796934865900383,
1651
- "grad_norm": 2.9733469486236572,
1652
- "learning_rate": 2.2053779138195233e-05,
1653
- "loss": 1.5931,
1654
- "step": 2190
1655
- },
1656
- {
1657
- "epoch": 2.80970625798212,
1658
- "grad_norm": 2.882889986038208,
1659
- "learning_rate": 2.2619260654559214e-05,
1660
- "loss": 1.6206,
1661
- "step": 2200
1662
- },
1663
- {
1664
- "epoch": 2.822477650063857,
1665
- "grad_norm": 3.2219135761260986,
1666
- "learning_rate": 2.3184742170923192e-05,
1667
- "loss": 1.5782,
1668
- "step": 2210
1669
- },
1670
- {
1671
- "epoch": 2.835249042145594,
1672
- "grad_norm": 3.1879076957702637,
1673
- "learning_rate": 2.3750223687287173e-05,
1674
- "loss": 1.6398,
1675
- "step": 2220
1676
- },
1677
- {
1678
- "epoch": 2.8480204342273305,
1679
- "grad_norm": 3.0046067237854004,
1680
- "learning_rate": 2.4315705203651154e-05,
1681
- "loss": 1.6471,
1682
- "step": 2230
1683
- },
1684
- {
1685
- "epoch": 2.8607918263090677,
1686
- "grad_norm": 2.837573766708374,
1687
- "learning_rate": 2.4881186720015136e-05,
1688
- "loss": 1.5785,
1689
- "step": 2240
1690
- },
1691
- {
1692
- "epoch": 2.873563218390805,
1693
- "grad_norm": 2.998067855834961,
1694
- "learning_rate": 2.5446668236379117e-05,
1695
- "loss": 1.6123,
1696
- "step": 2250
1697
- },
1698
- {
1699
- "epoch": 2.873563218390805,
1700
- "eval_loss": 1.9578022956848145,
1701
- "eval_runtime": 27.614,
1702
- "eval_samples_per_second": 18.107,
1703
- "eval_steps_per_second": 18.107,
1704
- "step": 2250
1705
- },
1706
- {
1707
- "epoch": 2.8863346104725416,
1708
- "grad_norm": 3.0444719791412354,
1709
- "learning_rate": 2.6012149752743095e-05,
1710
- "loss": 1.5877,
1711
- "step": 2260
1712
- },
1713
- {
1714
- "epoch": 2.8991060025542783,
1715
- "grad_norm": 3.109495162963867,
1716
- "learning_rate": 2.6577631269107073e-05,
1717
- "loss": 1.6581,
1718
- "step": 2270
1719
- },
1720
- {
1721
- "epoch": 2.9118773946360155,
1722
- "grad_norm": 2.8879830837249756,
1723
- "learning_rate": 2.7143112785471054e-05,
1724
- "loss": 1.6034,
1725
- "step": 2280
1726
- },
1727
- {
1728
- "epoch": 2.924648786717752,
1729
- "grad_norm": 3.3881685733795166,
1730
- "learning_rate": 2.7708594301835035e-05,
1731
- "loss": 1.6445,
1732
- "step": 2290
1733
- },
1734
- {
1735
- "epoch": 2.9374201787994894,
1736
- "grad_norm": 2.869450330734253,
1737
- "learning_rate": 2.8274075818199017e-05,
1738
- "loss": 1.5976,
1739
- "step": 2300
1740
- },
1741
- {
1742
- "epoch": 2.950191570881226,
1743
- "grad_norm": 2.9405651092529297,
1744
- "learning_rate": 2.8273731308557363e-05,
1745
- "loss": 1.5806,
1746
- "step": 2310
1747
- },
1748
- {
1749
- "epoch": 2.962962962962963,
1750
- "grad_norm": 3.227057695388794,
1751
- "learning_rate": 2.8272697796423325e-05,
1752
- "loss": 1.6023,
1753
- "step": 2320
1754
- },
1755
- {
1756
- "epoch": 2.9757343550447,
1757
- "grad_norm": 3.0668394565582275,
1758
- "learning_rate": 2.8270975332168814e-05,
1759
- "loss": 1.5949,
1760
- "step": 2330
1761
- },
1762
- {
1763
- "epoch": 2.9885057471264367,
1764
- "grad_norm": 3.4752612113952637,
1765
- "learning_rate": 2.8268563999744315e-05,
1766
- "loss": 1.6142,
1767
- "step": 2340
1768
- },
1769
- {
1770
- "epoch": 3.001277139208174,
1771
- "grad_norm": 2.851846694946289,
1772
- "learning_rate": 2.8265463916674774e-05,
1773
- "loss": 1.6006,
1774
- "step": 2350
1775
- },
1776
- {
1777
- "epoch": 3.0140485312899106,
1778
- "grad_norm": 2.914916753768921,
1779
- "learning_rate": 2.8261675234053857e-05,
1780
- "loss": 1.5599,
1781
- "step": 2360
1782
- },
1783
- {
1784
- "epoch": 3.0268199233716473,
1785
- "grad_norm": 3.022381544113159,
1786
- "learning_rate": 2.825719813653661e-05,
1787
- "loss": 1.561,
1788
- "step": 2370
1789
- },
1790
- {
1791
- "epoch": 3.0395913154533845,
1792
- "grad_norm": 2.812488079071045,
1793
- "learning_rate": 2.8252032842330455e-05,
1794
- "loss": 1.5051,
1795
- "step": 2380
1796
- },
1797
- {
1798
- "epoch": 3.052362707535121,
1799
- "grad_norm": 2.879337787628174,
1800
- "learning_rate": 2.8246179603184542e-05,
1801
- "loss": 1.5499,
1802
- "step": 2390
1803
- },
1804
- {
1805
- "epoch": 3.0651340996168583,
1806
- "grad_norm": 3.1652944087982178,
1807
- "learning_rate": 2.823963870437749e-05,
1808
- "loss": 1.5814,
1809
- "step": 2400
1810
- },
1811
- {
1812
- "epoch": 3.0651340996168583,
1813
- "eval_loss": 1.9594498872756958,
1814
- "eval_runtime": 27.4191,
1815
- "eval_samples_per_second": 18.235,
1816
- "eval_steps_per_second": 18.235,
1817
- "step": 2400
1818
- },
1819
- {
1820
- "epoch": 3.077905491698595,
1821
- "grad_norm": 3.0777502059936523,
1822
- "learning_rate": 2.8239638704377494e-07,
1823
- "loss": 1.5266,
1824
- "step": 2410
1825
- },
1826
- {
1827
- "epoch": 3.0906768837803322,
1828
- "grad_norm": 2.999093770980835,
1829
- "learning_rate": 5.647927740875499e-07,
1830
- "loss": 1.5265,
1831
- "step": 2420
1832
- },
1833
- {
1834
- "epoch": 3.103448275862069,
1835
- "grad_norm": 3.03286075592041,
1836
- "learning_rate": 8.471891611313247e-07,
1837
- "loss": 1.5492,
1838
- "step": 2430
1839
- },
1840
- {
1841
- "epoch": 3.1162196679438057,
1842
- "grad_norm": 3.1120502948760986,
1843
- "learning_rate": 1.1295855481750998e-06,
1844
- "loss": 1.5494,
1845
- "step": 2440
1846
- },
1847
- {
1848
- "epoch": 3.128991060025543,
1849
- "grad_norm": 3.289019823074341,
1850
- "learning_rate": 1.4119819352188746e-06,
1851
- "loss": 1.5846,
1852
- "step": 2450
1853
- },
1854
- {
1855
- "epoch": 3.1417624521072796,
1856
- "grad_norm": 2.9034361839294434,
1857
- "learning_rate": 1.6943783222626494e-06,
1858
- "loss": 1.553,
1859
- "step": 2460
1860
- },
1861
- {
1862
- "epoch": 3.1545338441890167,
1863
- "grad_norm": 3.0615646839141846,
1864
- "learning_rate": 1.9767747093064247e-06,
1865
- "loss": 1.5219,
1866
- "step": 2470
1867
- },
1868
- {
1869
- "epoch": 3.1673052362707534,
1870
- "grad_norm": 2.944612741470337,
1871
- "learning_rate": 2.2591710963501995e-06,
1872
- "loss": 1.5853,
1873
- "step": 2480
1874
- },
1875
- {
1876
- "epoch": 3.1800766283524906,
1877
- "grad_norm": 2.9914486408233643,
1878
- "learning_rate": 2.5415674833939743e-06,
1879
- "loss": 1.5752,
1880
- "step": 2490
1881
- },
1882
- {
1883
- "epoch": 3.1928480204342273,
1884
- "grad_norm": 2.6826584339141846,
1885
- "learning_rate": 2.823963870437749e-06,
1886
- "loss": 1.5066,
1887
- "step": 2500
1888
- },
1889
- {
1890
- "epoch": 3.205619412515964,
1891
- "grad_norm": 3.1901533603668213,
1892
- "learning_rate": 3.106360257481524e-06,
1893
- "loss": 1.5461,
1894
- "step": 2510
1895
- },
1896
- {
1897
- "epoch": 3.218390804597701,
1898
- "grad_norm": 3.0013110637664795,
1899
- "learning_rate": 3.388756644525299e-06,
1900
- "loss": 1.5591,
1901
- "step": 2520
1902
- },
1903
- {
1904
- "epoch": 3.231162196679438,
1905
- "grad_norm": 3.266486406326294,
1906
- "learning_rate": 3.671153031569074e-06,
1907
- "loss": 1.5211,
1908
- "step": 2530
1909
- },
1910
- {
1911
- "epoch": 3.243933588761175,
1912
- "grad_norm": 2.939565896987915,
1913
- "learning_rate": 3.953549418612849e-06,
1914
- "loss": 1.5392,
1915
- "step": 2540
1916
- },
1917
- {
1918
- "epoch": 3.256704980842912,
1919
- "grad_norm": 2.8810532093048096,
1920
- "learning_rate": 4.235945805656624e-06,
1921
- "loss": 1.5439,
1922
- "step": 2550
1923
- },
1924
- {
1925
- "epoch": 3.256704980842912,
1926
- "eval_loss": 1.9594852924346924,
1927
- "eval_runtime": 27.7206,
1928
- "eval_samples_per_second": 18.037,
1929
- "eval_steps_per_second": 18.037,
1930
- "step": 2550
1931
  }
1932
  ],
1933
  "logging_steps": 10,
@@ -1947,7 +139,7 @@
1947
  "attributes": {}
1948
  }
1949
  },
1950
- "total_flos": 4.48659005668393e+16,
1951
  "train_batch_size": 16,
1952
  "trial_name": null,
1953
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.9780572652816772,
3
+ "best_model_checkpoint": "./output/checkpoint-150",
4
+ "epoch": 0.19157088122605365,
5
  "eval_steps": 150,
6
+ "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.01277139208173691,
13
+ "grad_norm": 2.1011273860931396,
14
+ "learning_rate": 4.666666666666666e-06,
15
+ "loss": 1.9177,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.02554278416347382,
20
+ "grad_norm": 2.0189812183380127,
21
+ "learning_rate": 9.333333333333333e-06,
22
+ "loss": 1.9419,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.038314176245210725,
27
+ "grad_norm": 2.073760509490967,
28
+ "learning_rate": 1.4e-05,
29
+ "loss": 1.9122,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.05108556832694764,
34
+ "grad_norm": 1.955664873123169,
35
+ "learning_rate": 1.8666666666666665e-05,
36
+ "loss": 1.8872,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.06385696040868455,
41
+ "grad_norm": 2.6010475158691406,
42
+ "learning_rate": 2.333333333333333e-05,
43
+ "loss": 1.9779,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.07662835249042145,
48
+ "grad_norm": 2.0808260440826416,
49
+ "learning_rate": 2.8e-05,
50
+ "loss": 1.933,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.08939974457215837,
55
+ "grad_norm": 1.969761848449707,
56
+ "learning_rate": 3.266666666666666e-05,
57
+ "loss": 1.946,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.10217113665389528,
62
+ "grad_norm": 2.136836290359497,
63
+ "learning_rate": 3.733333333333333e-05,
64
+ "loss": 1.9441,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.11494252873563218,
69
+ "grad_norm": 2.056912899017334,
70
+ "learning_rate": 4.2e-05,
71
+ "loss": 1.9262,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.1277139208173691,
76
+ "grad_norm": 2.1491384506225586,
77
+ "learning_rate": 4.666666666666666e-05,
78
+ "loss": 1.9775,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.140485312899106,
83
+ "grad_norm": 1.8882553577423096,
84
+ "learning_rate": 5.1333333333333325e-05,
85
+ "loss": 1.9233,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.1532567049808429,
90
+ "grad_norm": 2.0507898330688477,
91
+ "learning_rate": 5.6e-05,
92
+ "loss": 1.9408,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.16602809706257982,
97
+ "grad_norm": 2.2763912677764893,
98
+ "learning_rate": 6.0666666666666666e-05,
99
+ "loss": 1.9429,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.17879948914431673,
104
+ "grad_norm": 2.1040444374084473,
105
+ "learning_rate": 6.533333333333333e-05,
106
+ "loss": 1.9193,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.19157088122605365,
111
+ "grad_norm": 2.0764999389648438,
112
+ "learning_rate": 7e-05,
113
+ "loss": 1.9405,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.19157088122605365,
118
+ "eval_loss": 1.9780572652816772,
119
+ "eval_runtime": 24.3012,
120
+ "eval_samples_per_second": 20.575,
121
+ "eval_steps_per_second": 20.575,
122
  "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  }
124
  ],
125
  "logging_steps": 10,
 
139
  "attributes": {}
140
  }
141
  },
142
+ "total_flos": 2084245000224768.0,
143
  "train_batch_size": 16,
144
  "trial_name": null,
145
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d858d0509d8d3f066b40d4d22294b620c54719ac51743859316dd522404d30f7
3
  size 5496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73d63faa96a8013f73d4d225b5f62be5f6f1a8819c12a7d65e93c26570162b6b
3
  size 5496