alicegoesdown commited on
Commit
62b38fe
·
verified ·
1 Parent(s): 8e3a05f

Training in progress, step 150, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,12 +20,12 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "lm_head",
24
  "dense",
25
- "query_key_value",
26
- "dense_4h_to_h",
27
  "word_embeddings",
28
- "dense_h_to_4h"
 
 
 
29
  ],
30
  "task_type": "CAUSAL_LM",
31
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "dense",
 
 
24
  "word_embeddings",
25
+ "lm_head",
26
+ "dense_h_to_4h",
27
+ "dense_4h_to_h",
28
+ "query_key_value"
29
  ],
30
  "task_type": "CAUSAL_LM",
31
  "use_dora": false,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:55251d3351329c6bec7b5c825a125b886772390b08b82afdd2dccf743de8143a
3
  size 2083942096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16fc430ea02afa629e866a5346641b6836e8f0a1231a21e35108a75b62918c10
3
  size 2083942096
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e97f359060ec8922585016a5f533ed80c846824ad6e77834e6f73725ec3cf73
3
  size 57574138
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df6dda378d442e7a76885cd08dbfbaa9ba73630ffbd0a668c07fc0c4eb07ccf6
3
  size 57574138
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d77b24e48a4ba95487ecec041dfb397894ad303cfaa54e6b76fd5a4fd650f14c
3
- size 14308
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04d21b9a22376a76ea15a7649e8a85c3cec7dfde7d72a28d681a78aaa424be2e
3
+ size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbe22fabc76df2ede4be853b3225d4fc11a7550974320613fba1758dc4eb38bc
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acbf40b939fcabf512d1392be3cf484e890b0399ab711748f7ead69b1e6e3930
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,2159 +1,125 @@
1
  {
2
- "best_metric": 3.244508981704712,
3
- "best_model_checkpoint": "./output/checkpoint-2850",
4
- "epoch": 2.199074074074074,
5
  "eval_steps": 150,
6
- "global_step": 2850,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.007716049382716049,
13
- "grad_norm": 22.088781356811523,
14
  "learning_rate": 8.000000000000001e-06,
15
- "loss": 4.7663,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.015432098765432098,
20
- "grad_norm": 26.410573959350586,
21
  "learning_rate": 1.6000000000000003e-05,
22
- "loss": 4.8517,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.023148148148148147,
27
- "grad_norm": 18.140840530395508,
28
  "learning_rate": 2.4e-05,
29
- "loss": 4.776,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.030864197530864196,
34
- "grad_norm": 16.24273681640625,
35
  "learning_rate": 3.2000000000000005e-05,
36
- "loss": 4.5876,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.038580246913580245,
41
- "grad_norm": 24.71746063232422,
42
  "learning_rate": 4e-05,
43
- "loss": 4.5541,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.046296296296296294,
48
- "grad_norm": 28.99083137512207,
49
  "learning_rate": 4.8e-05,
50
- "loss": 4.6019,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.05401234567901234,
55
- "grad_norm": 18.820459365844727,
56
  "learning_rate": 5.6e-05,
57
- "loss": 4.5561,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.06172839506172839,
62
- "grad_norm": 20.313241958618164,
63
  "learning_rate": 6.400000000000001e-05,
64
- "loss": 4.3774,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.06944444444444445,
69
- "grad_norm": 14.407193183898926,
70
  "learning_rate": 7.2e-05,
71
- "loss": 4.3441,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.07716049382716049,
76
- "grad_norm": 18.037578582763672,
77
  "learning_rate": 8e-05,
78
- "loss": 4.4628,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.08487654320987655,
83
- "grad_norm": 18.227323532104492,
84
  "learning_rate": 7.999917787833465e-05,
85
- "loss": 4.2779,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.09259259259259259,
90
- "grad_norm": 16.788177490234375,
91
  "learning_rate": 7.999671154713278e-05,
92
- "loss": 4.31,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.10030864197530864,
97
- "grad_norm": 20.197952270507812,
98
  "learning_rate": 7.99926011077756e-05,
99
- "loss": 4.3506,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.10802469135802469,
104
- "grad_norm": 18.79179573059082,
105
  "learning_rate": 7.99868467292272e-05,
106
- "loss": 4.2644,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.11574074074074074,
111
- "grad_norm": 17.230504989624023,
112
  "learning_rate": 7.997944864802752e-05,
113
- "loss": 4.2638,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.11574074074074074,
118
- "eval_loss": 4.43131160736084,
119
- "eval_runtime": 17.6713,
120
- "eval_samples_per_second": 28.295,
121
- "eval_steps_per_second": 28.295,
122
  "step": 150
123
- },
124
- {
125
- "epoch": 0.12345679012345678,
126
- "grad_norm": 16.790706634521484,
127
- "learning_rate": 7.997040716828271e-05,
128
- "loss": 4.2613,
129
- "step": 160
130
- },
131
- {
132
- "epoch": 0.13117283950617284,
133
- "grad_norm": 18.06231117248535,
134
- "learning_rate": 7.995972266165259e-05,
135
- "loss": 4.2768,
136
- "step": 170
137
- },
138
- {
139
- "epoch": 0.1388888888888889,
140
- "grad_norm": 22.725183486938477,
141
- "learning_rate": 7.994739556733538e-05,
142
- "loss": 4.1748,
143
- "step": 180
144
- },
145
- {
146
- "epoch": 0.14660493827160495,
147
- "grad_norm": 25.361064910888672,
148
- "learning_rate": 7.993342639204965e-05,
149
- "loss": 4.2136,
150
- "step": 190
151
- },
152
- {
153
- "epoch": 0.15432098765432098,
154
- "grad_norm": 15.649086952209473,
155
- "learning_rate": 7.991781571001347e-05,
156
- "loss": 4.2655,
157
- "step": 200
158
- },
159
- {
160
- "epoch": 0.16203703703703703,
161
- "grad_norm": 13.647220611572266,
162
- "learning_rate": 7.990056416292084e-05,
163
- "loss": 4.1017,
164
- "step": 210
165
- },
166
- {
167
- "epoch": 0.1697530864197531,
168
- "grad_norm": 45.323265075683594,
169
- "learning_rate": 7.988167245991528e-05,
170
- "loss": 4.1013,
171
- "step": 220
172
- },
173
- {
174
- "epoch": 0.17746913580246915,
175
- "grad_norm": 15.971923828125,
176
- "learning_rate": 7.986114137756074e-05,
177
- "loss": 4.2096,
178
- "step": 230
179
- },
180
- {
181
- "epoch": 0.18518518518518517,
182
- "grad_norm": 16.353431701660156,
183
- "learning_rate": 7.983897175980957e-05,
184
- "loss": 4.1852,
185
- "step": 240
186
- },
187
- {
188
- "epoch": 0.19290123456790123,
189
- "grad_norm": 16.62486457824707,
190
- "learning_rate": 7.981516451796794e-05,
191
- "loss": 4.2644,
192
- "step": 250
193
- },
194
- {
195
- "epoch": 0.2006172839506173,
196
- "grad_norm": 11.310361862182617,
197
- "learning_rate": 7.97897206306583e-05,
198
- "loss": 4.1296,
199
- "step": 260
200
- },
201
- {
202
- "epoch": 0.20833333333333334,
203
- "grad_norm": 17.337621688842773,
204
- "learning_rate": 7.976264114377922e-05,
205
- "loss": 4.1704,
206
- "step": 270
207
- },
208
- {
209
- "epoch": 0.21604938271604937,
210
- "grad_norm": 15.199752807617188,
211
- "learning_rate": 7.973392717046233e-05,
212
- "loss": 4.1354,
213
- "step": 280
214
- },
215
- {
216
- "epoch": 0.22376543209876543,
217
- "grad_norm": 18.04800796508789,
218
- "learning_rate": 7.97035798910266e-05,
219
- "loss": 4.0614,
220
- "step": 290
221
- },
222
- {
223
- "epoch": 0.23148148148148148,
224
- "grad_norm": 14.9419527053833,
225
- "learning_rate": 7.967160055292984e-05,
226
- "loss": 4.3118,
227
- "step": 300
228
- },
229
- {
230
- "epoch": 0.23148148148148148,
231
- "eval_loss": 4.240542888641357,
232
- "eval_runtime": 20.5897,
233
- "eval_samples_per_second": 24.284,
234
- "eval_steps_per_second": 24.284,
235
- "step": 300
236
- },
237
- {
238
- "epoch": 0.23919753086419754,
239
- "grad_norm": 19.11252212524414,
240
- "learning_rate": 7.96379904707174e-05,
241
- "loss": 4.1267,
242
- "step": 310
243
- },
244
- {
245
- "epoch": 0.24691358024691357,
246
- "grad_norm": 12.28123950958252,
247
- "learning_rate": 7.960275102596809e-05,
248
- "loss": 4.2135,
249
- "step": 320
250
- },
251
- {
252
- "epoch": 0.25462962962962965,
253
- "grad_norm": 30.027751922607422,
254
- "learning_rate": 7.956588366723745e-05,
255
- "loss": 4.2535,
256
- "step": 330
257
- },
258
- {
259
- "epoch": 0.2623456790123457,
260
- "grad_norm": 11.198431015014648,
261
- "learning_rate": 7.952738990999824e-05,
262
- "loss": 4.2223,
263
- "step": 340
264
- },
265
- {
266
- "epoch": 0.2700617283950617,
267
- "grad_norm": 24.937889099121094,
268
- "learning_rate": 7.948727133657802e-05,
269
- "loss": 4.0935,
270
- "step": 350
271
- },
272
- {
273
- "epoch": 0.2777777777777778,
274
- "grad_norm": 14.272343635559082,
275
- "learning_rate": 7.94455295960942e-05,
276
- "loss": 4.2069,
277
- "step": 360
278
- },
279
- {
280
- "epoch": 0.2854938271604938,
281
- "grad_norm": 17.262426376342773,
282
- "learning_rate": 7.940216640438628e-05,
283
- "loss": 4.2168,
284
- "step": 370
285
- },
286
- {
287
- "epoch": 0.2932098765432099,
288
- "grad_norm": 27.980453491210938,
289
- "learning_rate": 7.93571835439452e-05,
290
- "loss": 4.1711,
291
- "step": 380
292
- },
293
- {
294
- "epoch": 0.30092592592592593,
295
- "grad_norm": 17.98458480834961,
296
- "learning_rate": 7.931058286384016e-05,
297
- "loss": 4.2067,
298
- "step": 390
299
- },
300
- {
301
- "epoch": 0.30864197530864196,
302
- "grad_norm": 21.41556167602539,
303
- "learning_rate": 7.926236627964262e-05,
304
- "loss": 4.0364,
305
- "step": 400
306
- },
307
- {
308
- "epoch": 0.31635802469135804,
309
- "grad_norm": 16.788623809814453,
310
- "learning_rate": 7.92125357733475e-05,
311
- "loss": 4.1445,
312
- "step": 410
313
- },
314
- {
315
- "epoch": 0.32407407407407407,
316
- "grad_norm": 29.33296775817871,
317
- "learning_rate": 7.916109339329173e-05,
318
- "loss": 4.0973,
319
- "step": 420
320
- },
321
- {
322
- "epoch": 0.3317901234567901,
323
- "grad_norm": 15.776802062988281,
324
- "learning_rate": 7.910804125407007e-05,
325
- "loss": 3.9347,
326
- "step": 430
327
- },
328
- {
329
- "epoch": 0.3395061728395062,
330
- "grad_norm": 18.816022872924805,
331
- "learning_rate": 7.905338153644818e-05,
332
- "loss": 4.1418,
333
- "step": 440
334
- },
335
- {
336
- "epoch": 0.3472222222222222,
337
- "grad_norm": 19.884754180908203,
338
- "learning_rate": 7.899711648727294e-05,
339
- "loss": 4.0071,
340
- "step": 450
341
- },
342
- {
343
- "epoch": 0.3472222222222222,
344
- "eval_loss": 4.1484150886535645,
345
- "eval_runtime": 18.0768,
346
- "eval_samples_per_second": 27.66,
347
- "eval_steps_per_second": 27.66,
348
- "step": 450
349
- },
350
- {
351
- "epoch": 0.3549382716049383,
352
- "grad_norm": 23.56029510498047,
353
- "learning_rate": 7.89392484193802e-05,
354
- "loss": 4.1422,
355
- "step": 460
356
- },
357
- {
358
- "epoch": 0.3626543209876543,
359
- "grad_norm": 18.750913619995117,
360
- "learning_rate": 7.887977971149952e-05,
361
- "loss": 4.114,
362
- "step": 470
363
- },
364
- {
365
- "epoch": 0.37037037037037035,
366
- "grad_norm": 17.39931297302246,
367
- "learning_rate": 7.881871280815659e-05,
368
- "loss": 4.1022,
369
- "step": 480
370
- },
371
- {
372
- "epoch": 0.37808641975308643,
373
- "grad_norm": 15.805721282958984,
374
- "learning_rate": 7.875605021957262e-05,
375
- "loss": 4.1326,
376
- "step": 490
377
- },
378
- {
379
- "epoch": 0.38580246913580246,
380
- "grad_norm": 15.227653503417969,
381
- "learning_rate": 7.869179452156118e-05,
382
- "loss": 3.9761,
383
- "step": 500
384
- },
385
- {
386
- "epoch": 0.39351851851851855,
387
- "grad_norm": 24.661602020263672,
388
- "learning_rate": 7.862594835542236e-05,
389
- "loss": 4.1084,
390
- "step": 510
391
- },
392
- {
393
- "epoch": 0.4012345679012346,
394
- "grad_norm": 18.671737670898438,
395
- "learning_rate": 7.855851442783414e-05,
396
- "loss": 3.9883,
397
- "step": 520
398
- },
399
- {
400
- "epoch": 0.4089506172839506,
401
- "grad_norm": 16.544588088989258,
402
- "learning_rate": 7.848949551074116e-05,
403
- "loss": 4.106,
404
- "step": 530
405
- },
406
- {
407
- "epoch": 0.4166666666666667,
408
- "grad_norm": 27.335189819335938,
409
- "learning_rate": 7.841889444124078e-05,
410
- "loss": 4.0624,
411
- "step": 540
412
- },
413
- {
414
- "epoch": 0.4243827160493827,
415
- "grad_norm": 21.533105850219727,
416
- "learning_rate": 7.834671412146643e-05,
417
- "loss": 4.1846,
418
- "step": 550
419
- },
420
- {
421
- "epoch": 0.43209876543209874,
422
- "grad_norm": 13.441323280334473,
423
- "learning_rate": 7.827295751846836e-05,
424
- "loss": 4.1276,
425
- "step": 560
426
- },
427
- {
428
- "epoch": 0.4398148148148148,
429
- "grad_norm": 23.776737213134766,
430
- "learning_rate": 7.819762766409162e-05,
431
- "loss": 4.0718,
432
- "step": 570
433
- },
434
- {
435
- "epoch": 0.44753086419753085,
436
- "grad_norm": 14.89003849029541,
437
- "learning_rate": 7.81207276548515e-05,
438
- "loss": 4.0092,
439
- "step": 580
440
- },
441
- {
442
- "epoch": 0.45524691358024694,
443
- "grad_norm": 18.61737632751465,
444
- "learning_rate": 7.804226065180615e-05,
445
- "loss": 3.9711,
446
- "step": 590
447
- },
448
- {
449
- "epoch": 0.46296296296296297,
450
- "grad_norm": 22.550548553466797,
451
- "learning_rate": 7.796222988042676e-05,
452
- "loss": 3.8619,
453
- "step": 600
454
- },
455
- {
456
- "epoch": 0.46296296296296297,
457
- "eval_loss": 4.064351558685303,
458
- "eval_runtime": 20.247,
459
- "eval_samples_per_second": 24.695,
460
- "eval_steps_per_second": 24.695,
461
- "step": 600
462
- },
463
- {
464
- "epoch": 0.470679012345679,
465
- "grad_norm": 24.837818145751953,
466
- "learning_rate": 7.788063863046486e-05,
467
- "loss": 4.0238,
468
- "step": 610
469
- },
470
- {
471
- "epoch": 0.4783950617283951,
472
- "grad_norm": 17.935625076293945,
473
- "learning_rate": 7.779749025581717e-05,
474
- "loss": 4.035,
475
- "step": 620
476
- },
477
- {
478
- "epoch": 0.4861111111111111,
479
- "grad_norm": 18.532503128051758,
480
- "learning_rate": 7.771278817438773e-05,
481
- "loss": 4.0214,
482
- "step": 630
483
- },
484
- {
485
- "epoch": 0.49382716049382713,
486
- "grad_norm": 18.070255279541016,
487
- "learning_rate": 7.762653586794731e-05,
488
- "loss": 4.0554,
489
- "step": 640
490
- },
491
- {
492
- "epoch": 0.5015432098765432,
493
- "grad_norm": 19.590288162231445,
494
- "learning_rate": 7.753873688199042e-05,
495
- "loss": 3.9853,
496
- "step": 650
497
- },
498
- {
499
- "epoch": 0.5092592592592593,
500
- "grad_norm": 19.89542007446289,
501
- "learning_rate": 7.74493948255895e-05,
502
- "loss": 4.0259,
503
- "step": 660
504
- },
505
- {
506
- "epoch": 0.5169753086419753,
507
- "grad_norm": 14.403157234191895,
508
- "learning_rate": 7.735851337124654e-05,
509
- "loss": 3.9273,
510
- "step": 670
511
- },
512
- {
513
- "epoch": 0.5246913580246914,
514
- "grad_norm": 22.67559051513672,
515
- "learning_rate": 7.726609625474218e-05,
516
- "loss": 4.0133,
517
- "step": 680
518
- },
519
- {
520
- "epoch": 0.5324074074074074,
521
- "grad_norm": 11.595963478088379,
522
- "learning_rate": 7.717214727498209e-05,
523
- "loss": 3.9475,
524
- "step": 690
525
- },
526
- {
527
- "epoch": 0.5401234567901234,
528
- "grad_norm": 17.296403884887695,
529
- "learning_rate": 7.707667029384088e-05,
530
- "loss": 3.8421,
531
- "step": 700
532
- },
533
- {
534
- "epoch": 0.5478395061728395,
535
- "grad_norm": 21.876863479614258,
536
- "learning_rate": 7.697966923600327e-05,
537
- "loss": 3.9917,
538
- "step": 710
539
- },
540
- {
541
- "epoch": 0.5555555555555556,
542
- "grad_norm": 17.589155197143555,
543
- "learning_rate": 7.688114808880283e-05,
544
- "loss": 3.9218,
545
- "step": 720
546
- },
547
- {
548
- "epoch": 0.5632716049382716,
549
- "grad_norm": 17.723289489746094,
550
- "learning_rate": 7.678111090205804e-05,
551
- "loss": 3.9183,
552
- "step": 730
553
- },
554
- {
555
- "epoch": 0.5709876543209876,
556
- "grad_norm": 16.053651809692383,
557
- "learning_rate": 7.667956178790582e-05,
558
- "loss": 4.0021,
559
- "step": 740
560
- },
561
- {
562
- "epoch": 0.5787037037037037,
563
- "grad_norm": 17.192163467407227,
564
- "learning_rate": 7.65765049206325e-05,
565
- "loss": 3.9775,
566
- "step": 750
567
- },
568
- {
569
- "epoch": 0.5787037037037037,
570
- "eval_loss": 3.9868149757385254,
571
- "eval_runtime": 17.6035,
572
- "eval_samples_per_second": 28.403,
573
- "eval_steps_per_second": 28.403,
574
- "step": 750
575
- },
576
- {
577
- "epoch": 0.5864197530864198,
578
- "grad_norm": 29.302610397338867,
579
- "learning_rate": 7.647194453650228e-05,
580
- "loss": 4.0921,
581
- "step": 760
582
- },
583
- {
584
- "epoch": 0.5941358024691358,
585
- "grad_norm": 19.50672721862793,
586
- "learning_rate": 7.6365884933583e-05,
587
- "loss": 3.7631,
588
- "step": 770
589
- },
590
- {
591
- "epoch": 0.6018518518518519,
592
- "grad_norm": 25.308528900146484,
593
- "learning_rate": 7.625833047156953e-05,
594
- "loss": 3.9659,
595
- "step": 780
596
- },
597
- {
598
- "epoch": 0.6095679012345679,
599
- "grad_norm": 20.864965438842773,
600
- "learning_rate": 7.614928557160454e-05,
601
- "loss": 3.902,
602
- "step": 790
603
- },
604
- {
605
- "epoch": 0.6172839506172839,
606
- "grad_norm": 26.428804397583008,
607
- "learning_rate": 7.603875471609677e-05,
608
- "loss": 3.9137,
609
- "step": 800
610
- },
611
- {
612
- "epoch": 0.625,
613
- "grad_norm": 18.294261932373047,
614
- "learning_rate": 7.592674244853676e-05,
615
- "loss": 3.9768,
616
- "step": 810
617
- },
618
- {
619
- "epoch": 0.6327160493827161,
620
- "grad_norm": 13.396001815795898,
621
- "learning_rate": 7.581325337331013e-05,
622
- "loss": 3.979,
623
- "step": 820
624
- },
625
- {
626
- "epoch": 0.6404320987654321,
627
- "grad_norm": 17.813318252563477,
628
- "learning_rate": 7.569829215550825e-05,
629
- "loss": 3.9316,
630
- "step": 830
631
- },
632
- {
633
- "epoch": 0.6481481481481481,
634
- "grad_norm": 27.784278869628906,
635
- "learning_rate": 7.558186352073648e-05,
636
- "loss": 3.9523,
637
- "step": 840
638
- },
639
- {
640
- "epoch": 0.6558641975308642,
641
- "grad_norm": 24.91390609741211,
642
- "learning_rate": 7.546397225492001e-05,
643
- "loss": 3.9183,
644
- "step": 850
645
- },
646
- {
647
- "epoch": 0.6635802469135802,
648
- "grad_norm": 18.925460815429688,
649
- "learning_rate": 7.534462320410702e-05,
650
- "loss": 4.0088,
651
- "step": 860
652
- },
653
- {
654
- "epoch": 0.6712962962962963,
655
- "grad_norm": 23.20028305053711,
656
- "learning_rate": 7.522382127426952e-05,
657
- "loss": 3.8521,
658
- "step": 870
659
- },
660
- {
661
- "epoch": 0.6790123456790124,
662
- "grad_norm": 25.306800842285156,
663
- "learning_rate": 7.510157143110172e-05,
664
- "loss": 3.931,
665
- "step": 880
666
- },
667
- {
668
- "epoch": 0.6867283950617284,
669
- "grad_norm": 14.286781311035156,
670
- "learning_rate": 7.497787869981583e-05,
671
- "loss": 3.7833,
672
- "step": 890
673
- },
674
- {
675
- "epoch": 0.6944444444444444,
676
- "grad_norm": 27.823122024536133,
677
- "learning_rate": 7.485274816493558e-05,
678
- "loss": 3.929,
679
- "step": 900
680
- },
681
- {
682
- "epoch": 0.6944444444444444,
683
- "eval_loss": 3.928668737411499,
684
- "eval_runtime": 18.7881,
685
- "eval_samples_per_second": 26.613,
686
- "eval_steps_per_second": 26.613,
687
- "step": 900
688
- },
689
- {
690
- "epoch": 0.7021604938271605,
691
- "grad_norm": 19.728225708007812,
692
- "learning_rate": 7.472618497008714e-05,
693
- "loss": 3.8653,
694
- "step": 910
695
- },
696
- {
697
- "epoch": 0.7098765432098766,
698
- "grad_norm": 18.80716896057129,
699
- "learning_rate": 7.459819431778775e-05,
700
- "loss": 3.8751,
701
- "step": 920
702
- },
703
- {
704
- "epoch": 0.7175925925925926,
705
- "grad_norm": 22.028133392333984,
706
- "learning_rate": 7.44687814692318e-05,
707
- "loss": 3.9937,
708
- "step": 930
709
- },
710
- {
711
- "epoch": 0.7253086419753086,
712
- "grad_norm": 16.667814254760742,
713
- "learning_rate": 7.433795174407465e-05,
714
- "loss": 3.8102,
715
- "step": 940
716
- },
717
- {
718
- "epoch": 0.7330246913580247,
719
- "grad_norm": 20.56685447692871,
720
- "learning_rate": 7.420571052021386e-05,
721
- "loss": 3.8229,
722
- "step": 950
723
- },
724
- {
725
- "epoch": 0.7407407407407407,
726
- "grad_norm": 19.397369384765625,
727
- "learning_rate": 7.407206323356818e-05,
728
- "loss": 3.807,
729
- "step": 960
730
- },
731
- {
732
- "epoch": 0.7484567901234568,
733
- "grad_norm": 19.5135498046875,
734
- "learning_rate": 7.393701537785411e-05,
735
- "loss": 4.049,
736
- "step": 970
737
- },
738
- {
739
- "epoch": 0.7561728395061729,
740
- "grad_norm": 15.916143417358398,
741
- "learning_rate": 7.380057250436006e-05,
742
- "loss": 3.7301,
743
- "step": 980
744
- },
745
- {
746
- "epoch": 0.7638888888888888,
747
- "grad_norm": 19.677715301513672,
748
- "learning_rate": 7.366274022171814e-05,
749
- "loss": 3.8505,
750
- "step": 990
751
- },
752
- {
753
- "epoch": 0.7716049382716049,
754
- "grad_norm": 15.923828125,
755
- "learning_rate": 7.352352419567362e-05,
756
- "loss": 3.8464,
757
- "step": 1000
758
- },
759
- {
760
- "epoch": 0.779320987654321,
761
- "grad_norm": 15.452752113342285,
762
- "learning_rate": 7.338293014885212e-05,
763
- "loss": 3.6698,
764
- "step": 1010
765
- },
766
- {
767
- "epoch": 0.7870370370370371,
768
- "grad_norm": 34.56717300415039,
769
- "learning_rate": 7.324096386052416e-05,
770
- "loss": 3.8096,
771
- "step": 1020
772
- },
773
- {
774
- "epoch": 0.7947530864197531,
775
- "grad_norm": 21.44382095336914,
776
- "learning_rate": 7.309763116636786e-05,
777
- "loss": 3.9437,
778
- "step": 1030
779
- },
780
- {
781
- "epoch": 0.8024691358024691,
782
- "grad_norm": 19.292604446411133,
783
- "learning_rate": 7.295293795822887e-05,
784
- "loss": 3.9738,
785
- "step": 1040
786
- },
787
- {
788
- "epoch": 0.8101851851851852,
789
- "grad_norm": 15.463872909545898,
790
- "learning_rate": 7.280689018387824e-05,
791
- "loss": 3.861,
792
- "step": 1050
793
- },
794
- {
795
- "epoch": 0.8101851851851852,
796
- "eval_loss": 3.859124183654785,
797
- "eval_runtime": 17.5741,
798
- "eval_samples_per_second": 28.451,
799
- "eval_steps_per_second": 28.451,
800
- "step": 1050
801
- },
802
- {
803
- "epoch": 0.8179012345679012,
804
- "grad_norm": 20.17243003845215,
805
- "learning_rate": 7.265949384676795e-05,
806
- "loss": 3.7003,
807
- "step": 1060
808
- },
809
- {
810
- "epoch": 0.8256172839506173,
811
- "grad_norm": 17.895933151245117,
812
- "learning_rate": 7.251075500578411e-05,
813
- "loss": 3.8089,
814
- "step": 1070
815
- },
816
- {
817
- "epoch": 0.8333333333333334,
818
- "grad_norm": 16.239850997924805,
819
- "learning_rate": 7.236067977499791e-05,
820
- "loss": 3.8402,
821
- "step": 1080
822
- },
823
- {
824
- "epoch": 0.8410493827160493,
825
- "grad_norm": 23.776884078979492,
826
- "learning_rate": 7.220927432341426e-05,
827
- "loss": 3.7538,
828
- "step": 1090
829
- },
830
- {
831
- "epoch": 0.8487654320987654,
832
- "grad_norm": 15.736024856567383,
833
- "learning_rate": 7.205654487471826e-05,
834
- "loss": 3.8258,
835
- "step": 1100
836
- },
837
- {
838
- "epoch": 0.8564814814814815,
839
- "grad_norm": 26.04443359375,
840
- "learning_rate": 7.190249770701939e-05,
841
- "loss": 4.0184,
842
- "step": 1110
843
- },
844
- {
845
- "epoch": 0.8641975308641975,
846
- "grad_norm": 21.772159576416016,
847
- "learning_rate": 7.174713915259331e-05,
848
- "loss": 3.7938,
849
- "step": 1120
850
- },
851
- {
852
- "epoch": 0.8719135802469136,
853
- "grad_norm": 19.530372619628906,
854
- "learning_rate": 7.15904755976217e-05,
855
- "loss": 3.6955,
856
- "step": 1130
857
- },
858
- {
859
- "epoch": 0.8796296296296297,
860
- "grad_norm": 19.027843475341797,
861
- "learning_rate": 7.143251348192971e-05,
862
- "loss": 3.8163,
863
- "step": 1140
864
- },
865
- {
866
- "epoch": 0.8873456790123457,
867
- "grad_norm": 24.277997970581055,
868
- "learning_rate": 7.12732592987212e-05,
869
- "loss": 3.7664,
870
- "step": 1150
871
- },
872
- {
873
- "epoch": 0.8950617283950617,
874
- "grad_norm": 24.98284149169922,
875
- "learning_rate": 7.111271959431189e-05,
876
- "loss": 3.5373,
877
- "step": 1160
878
- },
879
- {
880
- "epoch": 0.9027777777777778,
881
- "grad_norm": 27.04857063293457,
882
- "learning_rate": 7.095090096786027e-05,
883
- "loss": 3.6208,
884
- "step": 1170
885
- },
886
- {
887
- "epoch": 0.9104938271604939,
888
- "grad_norm": 19.242406845092773,
889
- "learning_rate": 7.078781007109625e-05,
890
- "loss": 3.8045,
891
- "step": 1180
892
- },
893
- {
894
- "epoch": 0.9182098765432098,
895
- "grad_norm": 52.25917053222656,
896
- "learning_rate": 7.062345360804779e-05,
897
- "loss": 3.8328,
898
- "step": 1190
899
- },
900
- {
901
- "epoch": 0.9259259259259259,
902
- "grad_norm": 19.197126388549805,
903
- "learning_rate": 7.045783833476538e-05,
904
- "loss": 3.6963,
905
- "step": 1200
906
- },
907
- {
908
- "epoch": 0.9259259259259259,
909
- "eval_loss": 3.787529945373535,
910
- "eval_runtime": 18.9623,
911
- "eval_samples_per_second": 26.368,
912
- "eval_steps_per_second": 26.368,
913
- "step": 1200
914
- },
915
- {
916
- "epoch": 0.933641975308642,
917
- "grad_norm": 25.777538299560547,
918
- "learning_rate": 7.029097105904422e-05,
919
- "loss": 3.8159,
920
- "step": 1210
921
- },
922
- {
923
- "epoch": 0.941358024691358,
924
- "grad_norm": 21.648229598999023,
925
- "learning_rate": 7.012285864014445e-05,
926
- "loss": 3.6991,
927
- "step": 1220
928
- },
929
- {
930
- "epoch": 0.9490740740740741,
931
- "grad_norm": 18.975547790527344,
932
- "learning_rate": 6.995350798850913e-05,
933
- "loss": 3.787,
934
- "step": 1230
935
- },
936
- {
937
- "epoch": 0.9567901234567902,
938
- "grad_norm": 17.65053939819336,
939
- "learning_rate": 6.978292606548029e-05,
940
- "loss": 3.7765,
941
- "step": 1240
942
- },
943
- {
944
- "epoch": 0.9645061728395061,
945
- "grad_norm": 17.642724990844727,
946
- "learning_rate": 6.961111988301262e-05,
947
- "loss": 3.6433,
948
- "step": 1250
949
- },
950
- {
951
- "epoch": 0.9722222222222222,
952
- "grad_norm": 31.705036163330078,
953
- "learning_rate": 6.943809650338541e-05,
954
- "loss": 3.7943,
955
- "step": 1260
956
- },
957
- {
958
- "epoch": 0.9799382716049383,
959
- "grad_norm": 15.860248565673828,
960
- "learning_rate": 6.926386303891205e-05,
961
- "loss": 3.7749,
962
- "step": 1270
963
- },
964
- {
965
- "epoch": 0.9876543209876543,
966
- "grad_norm": 20.174976348876953,
967
- "learning_rate": 6.908842665164789e-05,
968
- "loss": 3.6612,
969
- "step": 1280
970
- },
971
- {
972
- "epoch": 0.9953703703703703,
973
- "grad_norm": 24.97418785095215,
974
- "learning_rate": 6.891179455309567e-05,
975
- "loss": 3.7852,
976
- "step": 1290
977
- },
978
- {
979
- "epoch": 1.0030864197530864,
980
- "grad_norm": 26.189205169677734,
981
- "learning_rate": 6.873397400390911e-05,
982
- "loss": 3.6042,
983
- "step": 1300
984
- },
985
- {
986
- "epoch": 1.0108024691358024,
987
- "grad_norm": 15.729840278625488,
988
- "learning_rate": 6.855497231359453e-05,
989
- "loss": 3.5714,
990
- "step": 1310
991
- },
992
- {
993
- "epoch": 1.0185185185185186,
994
- "grad_norm": 16.016050338745117,
995
- "learning_rate": 6.837479684021032e-05,
996
- "loss": 3.6184,
997
- "step": 1320
998
- },
999
- {
1000
- "epoch": 1.0262345679012346,
1001
- "grad_norm": 21.100793838500977,
1002
- "learning_rate": 6.819345499006448e-05,
1003
- "loss": 3.4837,
1004
- "step": 1330
1005
- },
1006
- {
1007
- "epoch": 1.0339506172839505,
1008
- "grad_norm": 34.890785217285156,
1009
- "learning_rate": 6.80109542174102e-05,
1010
- "loss": 3.5829,
1011
- "step": 1340
1012
- },
1013
- {
1014
- "epoch": 1.0416666666666667,
1015
- "grad_norm": 22.613712310791016,
1016
- "learning_rate": 6.782730202413946e-05,
1017
- "loss": 3.4799,
1018
- "step": 1350
1019
- },
1020
- {
1021
- "epoch": 1.0416666666666667,
1022
- "eval_loss": 3.742891550064087,
1023
- "eval_runtime": 17.5683,
1024
- "eval_samples_per_second": 28.46,
1025
- "eval_steps_per_second": 28.46,
1026
- "step": 1350
1027
- },
1028
- {
1029
- "epoch": 1.0493827160493827,
1030
- "grad_norm": 20.22081184387207,
1031
- "learning_rate": 6.76425059594746e-05,
1032
- "loss": 3.6744,
1033
- "step": 1360
1034
- },
1035
- {
1036
- "epoch": 1.0570987654320987,
1037
- "grad_norm": 16.30259895324707,
1038
- "learning_rate": 6.745657361965803e-05,
1039
- "loss": 3.3617,
1040
- "step": 1370
1041
- },
1042
- {
1043
- "epoch": 1.0648148148148149,
1044
- "grad_norm": 32.47397994995117,
1045
- "learning_rate": 6.726951264763998e-05,
1046
- "loss": 3.3692,
1047
- "step": 1380
1048
- },
1049
- {
1050
- "epoch": 1.0725308641975309,
1051
- "grad_norm": 16.126924514770508,
1052
- "learning_rate": 6.70813307327644e-05,
1053
- "loss": 3.4547,
1054
- "step": 1390
1055
- },
1056
- {
1057
- "epoch": 1.0802469135802468,
1058
- "grad_norm": 35.09951400756836,
1059
- "learning_rate": 6.689203561045268e-05,
1060
- "loss": 3.4555,
1061
- "step": 1400
1062
- },
1063
- {
1064
- "epoch": 1.087962962962963,
1065
- "grad_norm": 27.35624122619629,
1066
- "learning_rate": 6.670163506188593e-05,
1067
- "loss": 3.639,
1068
- "step": 1410
1069
- },
1070
- {
1071
- "epoch": 1.095679012345679,
1072
- "grad_norm": 20.523944854736328,
1073
- "learning_rate": 6.651013691368492e-05,
1074
- "loss": 3.6066,
1075
- "step": 1420
1076
- },
1077
- {
1078
- "epoch": 1.103395061728395,
1079
- "grad_norm": 29.101484298706055,
1080
- "learning_rate": 6.631754903758851e-05,
1081
- "loss": 3.5652,
1082
- "step": 1430
1083
- },
1084
- {
1085
- "epoch": 1.1111111111111112,
1086
- "grad_norm": 24.4724063873291,
1087
- "learning_rate": 6.612387935012995e-05,
1088
- "loss": 3.4425,
1089
- "step": 1440
1090
- },
1091
- {
1092
- "epoch": 1.1188271604938271,
1093
- "grad_norm": 24.631128311157227,
1094
- "learning_rate": 6.592913581231155e-05,
1095
- "loss": 3.5492,
1096
- "step": 1450
1097
- },
1098
- {
1099
- "epoch": 1.126543209876543,
1100
- "grad_norm": 39.123775482177734,
1101
- "learning_rate": 6.573332642927737e-05,
1102
- "loss": 3.3953,
1103
- "step": 1460
1104
- },
1105
- {
1106
- "epoch": 1.1342592592592593,
1107
- "grad_norm": 30.524311065673828,
1108
- "learning_rate": 6.553645924998422e-05,
1109
- "loss": 3.3948,
1110
- "step": 1470
1111
- },
1112
- {
1113
- "epoch": 1.1419753086419753,
1114
- "grad_norm": 23.212848663330078,
1115
- "learning_rate": 6.53385423668708e-05,
1116
- "loss": 3.3809,
1117
- "step": 1480
1118
- },
1119
- {
1120
- "epoch": 1.1496913580246915,
1121
- "grad_norm": 32.31661605834961,
1122
- "learning_rate": 6.513958391552494e-05,
1123
- "loss": 3.4081,
1124
- "step": 1490
1125
- },
1126
- {
1127
- "epoch": 1.1574074074074074,
1128
- "grad_norm": 27.419097900390625,
1129
- "learning_rate": 6.493959207434934e-05,
1130
- "loss": 3.3998,
1131
- "step": 1500
1132
- },
1133
- {
1134
- "epoch": 1.1574074074074074,
1135
- "eval_loss": 3.70896053314209,
1136
- "eval_runtime": 19.7466,
1137
- "eval_samples_per_second": 25.321,
1138
- "eval_steps_per_second": 25.321,
1139
- "step": 1500
1140
- },
1141
- {
1142
- "epoch": 1.1651234567901234,
1143
- "grad_norm": 24.480224609375,
1144
- "learning_rate": 6.473857506422526e-05,
1145
- "loss": 3.3748,
1146
- "step": 1510
1147
- },
1148
- {
1149
- "epoch": 1.1728395061728394,
1150
- "grad_norm": 22.06273078918457,
1151
- "learning_rate": 6.453654114817467e-05,
1152
- "loss": 3.7085,
1153
- "step": 1520
1154
- },
1155
- {
1156
- "epoch": 1.1805555555555556,
1157
- "grad_norm": 21.491657257080078,
1158
- "learning_rate": 6.433349863102053e-05,
1159
- "loss": 3.398,
1160
- "step": 1530
1161
- },
1162
- {
1163
- "epoch": 1.1882716049382716,
1164
- "grad_norm": 23.725849151611328,
1165
- "learning_rate": 6.412945585904545e-05,
1166
- "loss": 3.4217,
1167
- "step": 1540
1168
- },
1169
- {
1170
- "epoch": 1.1959876543209877,
1171
- "grad_norm": 22.040245056152344,
1172
- "learning_rate": 6.392442121964865e-05,
1173
- "loss": 3.3477,
1174
- "step": 1550
1175
- },
1176
- {
1177
- "epoch": 1.2037037037037037,
1178
- "grad_norm": 20.66607093811035,
1179
- "learning_rate": 6.371840314100104e-05,
1180
- "loss": 3.4077,
1181
- "step": 1560
1182
- },
1183
- {
1184
- "epoch": 1.2114197530864197,
1185
- "grad_norm": 23.58853530883789,
1186
- "learning_rate": 6.351141009169893e-05,
1187
- "loss": 3.3661,
1188
- "step": 1570
1189
- },
1190
- {
1191
- "epoch": 1.2191358024691359,
1192
- "grad_norm": 24.576080322265625,
1193
- "learning_rate": 6.330345058041585e-05,
1194
- "loss": 3.4321,
1195
- "step": 1580
1196
- },
1197
- {
1198
- "epoch": 1.2268518518518519,
1199
- "grad_norm": 24.945005416870117,
1200
- "learning_rate": 6.309453315555279e-05,
1201
- "loss": 3.5472,
1202
- "step": 1590
1203
- },
1204
- {
1205
- "epoch": 1.2345679012345678,
1206
- "grad_norm": 20.998676300048828,
1207
- "learning_rate": 6.288466640488679e-05,
1208
- "loss": 3.4229,
1209
- "step": 1600
1210
- },
1211
- {
1212
- "epoch": 1.242283950617284,
1213
- "grad_norm": 25.277576446533203,
1214
- "learning_rate": 6.2673858955218e-05,
1215
- "loss": 3.5484,
1216
- "step": 1610
1217
- },
1218
- {
1219
- "epoch": 1.25,
1220
- "grad_norm": 35.780555725097656,
1221
- "learning_rate": 6.2462119472015e-05,
1222
- "loss": 3.572,
1223
- "step": 1620
1224
- },
1225
- {
1226
- "epoch": 1.257716049382716,
1227
- "grad_norm": 24.384044647216797,
1228
- "learning_rate": 6.22494566590586e-05,
1229
- "loss": 3.445,
1230
- "step": 1630
1231
- },
1232
- {
1233
- "epoch": 1.2654320987654322,
1234
- "grad_norm": 16.81711196899414,
1235
- "learning_rate": 6.20358792580841e-05,
1236
- "loss": 3.3449,
1237
- "step": 1640
1238
- },
1239
- {
1240
- "epoch": 1.2731481481481481,
1241
- "grad_norm": 19.94586181640625,
1242
- "learning_rate": 6.182139604842195e-05,
1243
- "loss": 3.4073,
1244
- "step": 1650
1245
- },
1246
- {
1247
- "epoch": 1.2731481481481481,
1248
- "eval_loss": 3.637455463409424,
1249
- "eval_runtime": 17.8365,
1250
- "eval_samples_per_second": 28.032,
1251
- "eval_steps_per_second": 28.032,
1252
- "step": 1650
1253
- },
1254
- {
1255
- "epoch": 1.2808641975308643,
1256
- "grad_norm": 28.49296760559082,
1257
- "learning_rate": 6.160601584663681e-05,
1258
- "loss": 3.4991,
1259
- "step": 1660
1260
- },
1261
- {
1262
- "epoch": 1.2885802469135803,
1263
- "grad_norm": 30.67734146118164,
1264
- "learning_rate": 6.138974750616519e-05,
1265
- "loss": 3.3618,
1266
- "step": 1670
1267
- },
1268
- {
1269
- "epoch": 1.2962962962962963,
1270
- "grad_norm": 19.116107940673828,
1271
- "learning_rate": 6.11725999169515e-05,
1272
- "loss": 3.4404,
1273
- "step": 1680
1274
- },
1275
- {
1276
- "epoch": 1.3040123456790123,
1277
- "grad_norm": 28.58710289001465,
1278
- "learning_rate": 6.0954582005082616e-05,
1279
- "loss": 3.3612,
1280
- "step": 1690
1281
- },
1282
- {
1283
- "epoch": 1.3117283950617284,
1284
- "grad_norm": 37.30510711669922,
1285
- "learning_rate": 6.0735702732421015e-05,
1286
- "loss": 3.3507,
1287
- "step": 1700
1288
- },
1289
- {
1290
- "epoch": 1.3194444444444444,
1291
- "grad_norm": 25.61612319946289,
1292
- "learning_rate": 6.0515971096236253e-05,
1293
- "loss": 3.2735,
1294
- "step": 1710
1295
- },
1296
- {
1297
- "epoch": 1.3271604938271606,
1298
- "grad_norm": 33.34403991699219,
1299
- "learning_rate": 6.029539612883529e-05,
1300
- "loss": 3.2973,
1301
- "step": 1720
1302
- },
1303
- {
1304
- "epoch": 1.3348765432098766,
1305
- "grad_norm": 31.370502471923828,
1306
- "learning_rate": 6.007398689719111e-05,
1307
- "loss": 3.327,
1308
- "step": 1730
1309
- },
1310
- {
1311
- "epoch": 1.3425925925925926,
1312
- "grad_norm": 31.45387840270996,
1313
- "learning_rate": 5.9851752502570015e-05,
1314
- "loss": 3.2595,
1315
- "step": 1740
1316
- },
1317
- {
1318
- "epoch": 1.3503086419753085,
1319
- "grad_norm": 28.973865509033203,
1320
- "learning_rate": 5.9628702080157526e-05,
1321
- "loss": 3.3104,
1322
- "step": 1750
1323
- },
1324
- {
1325
- "epoch": 1.3580246913580247,
1326
- "grad_norm": 21.37240219116211,
1327
- "learning_rate": 5.940484479868288e-05,
1328
- "loss": 3.4102,
1329
- "step": 1760
1330
- },
1331
- {
1332
- "epoch": 1.3657407407407407,
1333
- "grad_norm": 33.90608215332031,
1334
- "learning_rate": 5.918018986004216e-05,
1335
- "loss": 3.5516,
1336
- "step": 1770
1337
- },
1338
- {
1339
- "epoch": 1.373456790123457,
1340
- "grad_norm": 24.596567153930664,
1341
- "learning_rate": 5.895474649891995e-05,
1342
- "loss": 3.378,
1343
- "step": 1780
1344
- },
1345
- {
1346
- "epoch": 1.3811728395061729,
1347
- "grad_norm": 25.496883392333984,
1348
- "learning_rate": 5.872852398240984e-05,
1349
- "loss": 3.3586,
1350
- "step": 1790
1351
- },
1352
- {
1353
- "epoch": 1.3888888888888888,
1354
- "grad_norm": 27.56167221069336,
1355
- "learning_rate": 5.8501531609633424e-05,
1356
- "loss": 3.3986,
1357
- "step": 1800
1358
- },
1359
- {
1360
- "epoch": 1.3888888888888888,
1361
- "eval_loss": 3.574371576309204,
1362
- "eval_runtime": 19.0779,
1363
- "eval_samples_per_second": 26.208,
1364
- "eval_steps_per_second": 26.208,
1365
- "step": 1800
1366
- },
1367
- {
1368
- "epoch": 1.3966049382716048,
1369
- "grad_norm": 26.979446411132812,
1370
- "learning_rate": 5.827377871135807e-05,
1371
- "loss": 3.223,
1372
- "step": 1810
1373
- },
1374
- {
1375
- "epoch": 1.404320987654321,
1376
- "grad_norm": 33.906124114990234,
1377
- "learning_rate": 5.8045274649613386e-05,
1378
- "loss": 3.2157,
1379
- "step": 1820
1380
- },
1381
- {
1382
- "epoch": 1.412037037037037,
1383
- "grad_norm": 30.635848999023438,
1384
- "learning_rate": 5.781602881730637e-05,
1385
- "loss": 3.3547,
1386
- "step": 1830
1387
- },
1388
- {
1389
- "epoch": 1.4197530864197532,
1390
- "grad_norm": 18.647005081176758,
1391
- "learning_rate": 5.7586050637835295e-05,
1392
- "loss": 3.3164,
1393
- "step": 1840
1394
- },
1395
- {
1396
- "epoch": 1.4274691358024691,
1397
- "grad_norm": 28.55082130432129,
1398
- "learning_rate": 5.735534956470233e-05,
1399
- "loss": 3.392,
1400
- "step": 1850
1401
- },
1402
- {
1403
- "epoch": 1.4351851851851851,
1404
- "grad_norm": 24.287002563476562,
1405
- "learning_rate": 5.7123935081125034e-05,
1406
- "loss": 3.4614,
1407
- "step": 1860
1408
- },
1409
- {
1410
- "epoch": 1.4429012345679013,
1411
- "grad_norm": 23.30496597290039,
1412
- "learning_rate": 5.68918166996464e-05,
1413
- "loss": 3.363,
1414
- "step": 1870
1415
- },
1416
- {
1417
- "epoch": 1.4506172839506173,
1418
- "grad_norm": 25.783321380615234,
1419
- "learning_rate": 5.6659003961743965e-05,
1420
- "loss": 3.4322,
1421
- "step": 1880
1422
- },
1423
- {
1424
- "epoch": 1.4583333333333333,
1425
- "grad_norm": 22.144737243652344,
1426
- "learning_rate": 5.642550643743753e-05,
1427
- "loss": 3.3036,
1428
- "step": 1890
1429
- },
1430
- {
1431
- "epoch": 1.4660493827160495,
1432
- "grad_norm": 29.154598236083984,
1433
- "learning_rate": 5.619133372489575e-05,
1434
- "loss": 3.3028,
1435
- "step": 1900
1436
- },
1437
- {
1438
- "epoch": 1.4737654320987654,
1439
- "grad_norm": 26.780481338500977,
1440
- "learning_rate": 5.5956495450041675e-05,
1441
- "loss": 3.4062,
1442
- "step": 1910
1443
- },
1444
- {
1445
- "epoch": 1.4814814814814814,
1446
- "grad_norm": 25.359783172607422,
1447
- "learning_rate": 5.572100126615695e-05,
1448
- "loss": 3.3956,
1449
- "step": 1920
1450
- },
1451
- {
1452
- "epoch": 1.4891975308641976,
1453
- "grad_norm": 36.21287536621094,
1454
- "learning_rate": 5.5484860853485135e-05,
1455
- "loss": 3.4237,
1456
- "step": 1930
1457
- },
1458
- {
1459
- "epoch": 1.4969135802469136,
1460
- "grad_norm": 30.381282806396484,
1461
- "learning_rate": 5.524808391883367e-05,
1462
- "loss": 3.316,
1463
- "step": 1940
1464
- },
1465
- {
1466
- "epoch": 1.5046296296296298,
1467
- "grad_norm": 25.306821823120117,
1468
- "learning_rate": 5.5010680195174975e-05,
1469
- "loss": 3.3108,
1470
- "step": 1950
1471
- },
1472
- {
1473
- "epoch": 1.5046296296296298,
1474
- "eval_loss": 3.5245025157928467,
1475
- "eval_runtime": 17.6142,
1476
- "eval_samples_per_second": 28.386,
1477
- "eval_steps_per_second": 28.386,
1478
- "step": 1950
1479
- },
1480
- {
1481
- "epoch": 1.5123456790123457,
1482
- "grad_norm": 19.196184158325195,
1483
- "learning_rate": 5.477265944124626e-05,
1484
- "loss": 3.3328,
1485
- "step": 1960
1486
- },
1487
- {
1488
- "epoch": 1.5200617283950617,
1489
- "grad_norm": 23.37359619140625,
1490
- "learning_rate": 5.453403144114846e-05,
1491
- "loss": 3.2503,
1492
- "step": 1970
1493
- },
1494
- {
1495
- "epoch": 1.5277777777777777,
1496
- "grad_norm": 22.228778839111328,
1497
- "learning_rate": 5.429480600394405e-05,
1498
- "loss": 3.3659,
1499
- "step": 1980
1500
- },
1501
- {
1502
- "epoch": 1.5354938271604939,
1503
- "grad_norm": 33.1472282409668,
1504
- "learning_rate": 5.4054992963253716e-05,
1505
- "loss": 3.206,
1506
- "step": 1990
1507
- },
1508
- {
1509
- "epoch": 1.5432098765432098,
1510
- "grad_norm": 20.814178466796875,
1511
- "learning_rate": 5.381460217685231e-05,
1512
- "loss": 3.3325,
1513
- "step": 2000
1514
- },
1515
- {
1516
- "epoch": 1.550925925925926,
1517
- "grad_norm": 25.20581817626953,
1518
- "learning_rate": 5.357364352626351e-05,
1519
- "loss": 3.4038,
1520
- "step": 2010
1521
- },
1522
- {
1523
- "epoch": 1.558641975308642,
1524
- "grad_norm": 23.560279846191406,
1525
- "learning_rate": 5.333212691635368e-05,
1526
- "loss": 3.2845,
1527
- "step": 2020
1528
- },
1529
- {
1530
- "epoch": 1.566358024691358,
1531
- "grad_norm": 34.251522064208984,
1532
- "learning_rate": 5.309006227492468e-05,
1533
- "loss": 3.2339,
1534
- "step": 2030
1535
- },
1536
- {
1537
- "epoch": 1.574074074074074,
1538
- "grad_norm": 28.38330078125,
1539
- "learning_rate": 5.2847459552305834e-05,
1540
- "loss": 3.2448,
1541
- "step": 2040
1542
- },
1543
- {
1544
- "epoch": 1.5817901234567902,
1545
- "grad_norm": 26.189992904663086,
1546
- "learning_rate": 5.260432872094484e-05,
1547
- "loss": 3.3235,
1548
- "step": 2050
1549
- },
1550
- {
1551
- "epoch": 1.5895061728395061,
1552
- "grad_norm": 18.80211639404297,
1553
- "learning_rate": 5.23606797749979e-05,
1554
- "loss": 3.2259,
1555
- "step": 2060
1556
- },
1557
- {
1558
- "epoch": 1.5972222222222223,
1559
- "grad_norm": 30.651729583740234,
1560
- "learning_rate": 5.211652272991889e-05,
1561
- "loss": 3.1168,
1562
- "step": 2070
1563
- },
1564
- {
1565
- "epoch": 1.6049382716049383,
1566
- "grad_norm": 34.49848175048828,
1567
- "learning_rate": 5.1871867622047624e-05,
1568
- "loss": 3.1488,
1569
- "step": 2080
1570
- },
1571
- {
1572
- "epoch": 1.6126543209876543,
1573
- "grad_norm": 26.484848022460938,
1574
- "learning_rate": 5.1626724508197356e-05,
1575
- "loss": 3.1819,
1576
- "step": 2090
1577
- },
1578
- {
1579
- "epoch": 1.6203703703703702,
1580
- "grad_norm": 30.39811897277832,
1581
- "learning_rate": 5.13811034652413e-05,
1582
- "loss": 3.4571,
1583
- "step": 2100
1584
- },
1585
- {
1586
- "epoch": 1.6203703703703702,
1587
- "eval_loss": 3.4539551734924316,
1588
- "eval_runtime": 18.5177,
1589
- "eval_samples_per_second": 27.001,
1590
- "eval_steps_per_second": 27.001,
1591
- "step": 2100
1592
- },
1593
- {
1594
- "epoch": 1.6280864197530864,
1595
- "grad_norm": 27.10554313659668,
1596
- "learning_rate": 5.113501458969854e-05,
1597
- "loss": 3.3854,
1598
- "step": 2110
1599
- },
1600
- {
1601
- "epoch": 1.6358024691358026,
1602
- "grad_norm": 20.923534393310547,
1603
- "learning_rate": 5.088846799731885e-05,
1604
- "loss": 3.3414,
1605
- "step": 2120
1606
- },
1607
- {
1608
- "epoch": 1.6435185185185186,
1609
- "grad_norm": 29.248905181884766,
1610
- "learning_rate": 5.064147382266701e-05,
1611
- "loss": 3.2729,
1612
- "step": 2130
1613
- },
1614
- {
1615
- "epoch": 1.6512345679012346,
1616
- "grad_norm": 25.068252563476562,
1617
- "learning_rate": 5.039404221870612e-05,
1618
- "loss": 3.4848,
1619
- "step": 2140
1620
- },
1621
- {
1622
- "epoch": 1.6589506172839505,
1623
- "grad_norm": 33.346012115478516,
1624
- "learning_rate": 5.0146183356380295e-05,
1625
- "loss": 3.1731,
1626
- "step": 2150
1627
- },
1628
- {
1629
- "epoch": 1.6666666666666665,
1630
- "grad_norm": 27.35399055480957,
1631
- "learning_rate": 4.989790742419658e-05,
1632
- "loss": 3.2532,
1633
- "step": 2160
1634
- },
1635
- {
1636
- "epoch": 1.6743827160493827,
1637
- "grad_norm": 19.2067928314209,
1638
- "learning_rate": 4.96492246278061e-05,
1639
- "loss": 3.3465,
1640
- "step": 2170
1641
- },
1642
- {
1643
- "epoch": 1.682098765432099,
1644
- "grad_norm": 31.099220275878906,
1645
- "learning_rate": 4.940014518958461e-05,
1646
- "loss": 3.311,
1647
- "step": 2180
1648
- },
1649
- {
1650
- "epoch": 1.6898148148148149,
1651
- "grad_norm": 20.159053802490234,
1652
- "learning_rate": 4.915067934821222e-05,
1653
- "loss": 3.2779,
1654
- "step": 2190
1655
- },
1656
- {
1657
- "epoch": 1.6975308641975309,
1658
- "grad_norm": 27.030029296875,
1659
- "learning_rate": 4.890083735825258e-05,
1660
- "loss": 2.936,
1661
- "step": 2200
1662
- },
1663
- {
1664
- "epoch": 1.7052469135802468,
1665
- "grad_norm": 32.07747268676758,
1666
- "learning_rate": 4.865062948973134e-05,
1667
- "loss": 3.2353,
1668
- "step": 2210
1669
- },
1670
- {
1671
- "epoch": 1.7129629629629628,
1672
- "grad_norm": 23.422794342041016,
1673
- "learning_rate": 4.8400066027713974e-05,
1674
- "loss": 3.1703,
1675
- "step": 2220
1676
- },
1677
- {
1678
- "epoch": 1.720679012345679,
1679
- "grad_norm": 26.94754981994629,
1680
- "learning_rate": 4.8149157271883026e-05,
1681
- "loss": 3.2279,
1682
- "step": 2230
1683
- },
1684
- {
1685
- "epoch": 1.7283950617283952,
1686
- "grad_norm": 26.43851089477539,
1687
- "learning_rate": 4.789791353611469e-05,
1688
- "loss": 3.1315,
1689
- "step": 2240
1690
- },
1691
- {
1692
- "epoch": 1.7361111111111112,
1693
- "grad_norm": 34.339378356933594,
1694
- "learning_rate": 4.76463451480549e-05,
1695
- "loss": 3.2606,
1696
- "step": 2250
1697
- },
1698
- {
1699
- "epoch": 1.7361111111111112,
1700
- "eval_loss": 3.3989486694335938,
1701
- "eval_runtime": 17.7398,
1702
- "eval_samples_per_second": 28.185,
1703
- "eval_steps_per_second": 28.185,
1704
- "step": 2250
1705
- },
1706
- {
1707
- "epoch": 1.7438271604938271,
1708
- "grad_norm": 28.98934555053711,
1709
- "learning_rate": 4.7394462448694756e-05,
1710
- "loss": 3.33,
1711
- "step": 2260
1712
- },
1713
- {
1714
- "epoch": 1.751543209876543,
1715
- "grad_norm": 29.47772216796875,
1716
- "learning_rate": 4.714227579194547e-05,
1717
- "loss": 3.3261,
1718
- "step": 2270
1719
- },
1720
- {
1721
- "epoch": 1.7592592592592593,
1722
- "grad_norm": 30.95444107055664,
1723
- "learning_rate": 4.688979554421276e-05,
1724
- "loss": 3.3706,
1725
- "step": 2280
1726
- },
1727
- {
1728
- "epoch": 1.7669753086419753,
1729
- "grad_norm": 30.03018569946289,
1730
- "learning_rate": 4.663703208397072e-05,
1731
- "loss": 3.0371,
1732
- "step": 2290
1733
- },
1734
- {
1735
- "epoch": 1.7746913580246915,
1736
- "grad_norm": 26.282629013061523,
1737
- "learning_rate": 4.6383995801335176e-05,
1738
- "loss": 3.3524,
1739
- "step": 2300
1740
- },
1741
- {
1742
- "epoch": 1.7824074074074074,
1743
- "grad_norm": 55.37343978881836,
1744
- "learning_rate": 4.6130697097636634e-05,
1745
- "loss": 3.4364,
1746
- "step": 2310
1747
- },
1748
- {
1749
- "epoch": 1.7901234567901234,
1750
- "grad_norm": 32.71061325073242,
1751
- "learning_rate": 4.5877146384992725e-05,
1752
- "loss": 3.3357,
1753
- "step": 2320
1754
- },
1755
- {
1756
- "epoch": 1.7978395061728394,
1757
- "grad_norm": 27.23622703552246,
1758
- "learning_rate": 4.562335408588012e-05,
1759
- "loss": 3.1002,
1760
- "step": 2330
1761
- },
1762
- {
1763
- "epoch": 1.8055555555555556,
1764
- "grad_norm": 38.7041015625,
1765
- "learning_rate": 4.5369330632706223e-05,
1766
- "loss": 3.1582,
1767
- "step": 2340
1768
- },
1769
- {
1770
- "epoch": 1.8132716049382716,
1771
- "grad_norm": 35.14750671386719,
1772
- "learning_rate": 4.5115086467380244e-05,
1773
- "loss": 3.1838,
1774
- "step": 2350
1775
- },
1776
- {
1777
- "epoch": 1.8209876543209877,
1778
- "grad_norm": 22.170042037963867,
1779
- "learning_rate": 4.486063204088402e-05,
1780
- "loss": 3.2643,
1781
- "step": 2360
1782
- },
1783
- {
1784
- "epoch": 1.8287037037037037,
1785
- "grad_norm": 26.56646156311035,
1786
- "learning_rate": 4.4605977812842384e-05,
1787
- "loss": 3.105,
1788
- "step": 2370
1789
- },
1790
- {
1791
- "epoch": 1.8364197530864197,
1792
- "grad_norm": 24.756046295166016,
1793
- "learning_rate": 4.435113425109324e-05,
1794
- "loss": 3.1878,
1795
- "step": 2380
1796
- },
1797
- {
1798
- "epoch": 1.8441358024691357,
1799
- "grad_norm": 34.98149108886719,
1800
- "learning_rate": 4.409611183125725e-05,
1801
- "loss": 3.2356,
1802
- "step": 2390
1803
- },
1804
- {
1805
- "epoch": 1.8518518518518519,
1806
- "grad_norm": 32.97663497924805,
1807
- "learning_rate": 4.3840921036307274e-05,
1808
- "loss": 3.1842,
1809
- "step": 2400
1810
- },
1811
- {
1812
- "epoch": 1.8518518518518519,
1813
- "eval_loss": 3.3580269813537598,
1814
- "eval_runtime": 19.1402,
1815
- "eval_samples_per_second": 26.123,
1816
- "eval_steps_per_second": 26.123,
1817
- "step": 2400
1818
- },
1819
- {
1820
- "epoch": 1.859567901234568,
1821
- "grad_norm": 32.32783889770508,
1822
- "learning_rate": 4.358557235613734e-05,
1823
- "loss": 3.0993,
1824
- "step": 2410
1825
- },
1826
- {
1827
- "epoch": 1.867283950617284,
1828
- "grad_norm": 36.21072006225586,
1829
- "learning_rate": 4.333007628713158e-05,
1830
- "loss": 3.2311,
1831
- "step": 2420
1832
- },
1833
- {
1834
- "epoch": 1.875,
1835
- "grad_norm": 20.204273223876953,
1836
- "learning_rate": 4.3074443331732674e-05,
1837
- "loss": 3.0291,
1838
- "step": 2430
1839
- },
1840
- {
1841
- "epoch": 1.882716049382716,
1842
- "grad_norm": 30.630725860595703,
1843
- "learning_rate": 4.281868399801016e-05,
1844
- "loss": 3.3228,
1845
- "step": 2440
1846
- },
1847
- {
1848
- "epoch": 1.890432098765432,
1849
- "grad_norm": 23.674013137817383,
1850
- "learning_rate": 4.256280879922852e-05,
1851
- "loss": 3.1248,
1852
- "step": 2450
1853
- },
1854
- {
1855
- "epoch": 1.8981481481481481,
1856
- "grad_norm": 30.178142547607422,
1857
- "learning_rate": 4.230682825341498e-05,
1858
- "loss": 3.132,
1859
- "step": 2460
1860
- },
1861
- {
1862
- "epoch": 1.9058641975308643,
1863
- "grad_norm": 49.78887939453125,
1864
- "learning_rate": 4.205075288292717e-05,
1865
- "loss": 3.2772,
1866
- "step": 2470
1867
- },
1868
- {
1869
- "epoch": 1.9135802469135803,
1870
- "grad_norm": 22.746013641357422,
1871
- "learning_rate": 4.17945932140206e-05,
1872
- "loss": 3.257,
1873
- "step": 2480
1874
- },
1875
- {
1876
- "epoch": 1.9212962962962963,
1877
- "grad_norm": 29.011247634887695,
1878
- "learning_rate": 4.1538359776415936e-05,
1879
- "loss": 2.9449,
1880
- "step": 2490
1881
- },
1882
- {
1883
- "epoch": 1.9290123456790123,
1884
- "grad_norm": 29.394569396972656,
1885
- "learning_rate": 4.128206310286622e-05,
1886
- "loss": 3.1816,
1887
- "step": 2500
1888
- },
1889
- {
1890
- "epoch": 1.9367283950617284,
1891
- "grad_norm": 41.661983489990234,
1892
- "learning_rate": 4.102571372872382e-05,
1893
- "loss": 3.1338,
1894
- "step": 2510
1895
- },
1896
- {
1897
- "epoch": 1.9444444444444444,
1898
- "grad_norm": 24.220867156982422,
1899
- "learning_rate": 4.0769322191507485e-05,
1900
- "loss": 3.0679,
1901
- "step": 2520
1902
- },
1903
- {
1904
- "epoch": 1.9521604938271606,
1905
- "grad_norm": 30.152782440185547,
1906
- "learning_rate": 4.051289903046909e-05,
1907
- "loss": 3.0104,
1908
- "step": 2530
1909
- },
1910
- {
1911
- "epoch": 1.9598765432098766,
1912
- "grad_norm": 50.978118896484375,
1913
- "learning_rate": 4.025645478616045e-05,
1914
- "loss": 3.1816,
1915
- "step": 2540
1916
- },
1917
- {
1918
- "epoch": 1.9675925925925926,
1919
- "grad_norm": 30.627073287963867,
1920
- "learning_rate": 4e-05,
1921
- "loss": 3.2438,
1922
- "step": 2550
1923
- },
1924
- {
1925
- "epoch": 1.9675925925925926,
1926
- "eval_loss": 3.288115978240967,
1927
- "eval_runtime": 17.6092,
1928
- "eval_samples_per_second": 28.394,
1929
- "eval_steps_per_second": 28.394,
1930
- "step": 2550
1931
- },
1932
- {
1933
- "epoch": 1.9753086419753085,
1934
- "grad_norm": 43.21097183227539,
1935
- "learning_rate": 3.974354521383956e-05,
1936
- "loss": 2.9912,
1937
- "step": 2560
1938
- },
1939
- {
1940
- "epoch": 1.9830246913580247,
1941
- "grad_norm": 29.048534393310547,
1942
- "learning_rate": 3.948710096953091e-05,
1943
- "loss": 3.2144,
1944
- "step": 2570
1945
- },
1946
- {
1947
- "epoch": 1.9907407407407407,
1948
- "grad_norm": 30.724382400512695,
1949
- "learning_rate": 3.923067780849252e-05,
1950
- "loss": 3.1849,
1951
- "step": 2580
1952
- },
1953
- {
1954
- "epoch": 1.998456790123457,
1955
- "grad_norm": 32.7701530456543,
1956
- "learning_rate": 3.8974286271276185e-05,
1957
- "loss": 3.2466,
1958
- "step": 2590
1959
- },
1960
- {
1961
- "epoch": 2.006172839506173,
1962
- "grad_norm": 25.29627799987793,
1963
- "learning_rate": 3.87179368971338e-05,
1964
- "loss": 3.0169,
1965
- "step": 2600
1966
- },
1967
- {
1968
- "epoch": 2.013888888888889,
1969
- "grad_norm": 33.91135787963867,
1970
- "learning_rate": 3.8461640223584064e-05,
1971
- "loss": 2.8224,
1972
- "step": 2610
1973
- },
1974
- {
1975
- "epoch": 2.021604938271605,
1976
- "grad_norm": 26.389068603515625,
1977
- "learning_rate": 3.820540678597942e-05,
1978
- "loss": 2.6429,
1979
- "step": 2620
1980
- },
1981
- {
1982
- "epoch": 2.029320987654321,
1983
- "grad_norm": 23.78996467590332,
1984
- "learning_rate": 3.794924711707284e-05,
1985
- "loss": 2.7442,
1986
- "step": 2630
1987
- },
1988
- {
1989
- "epoch": 2.037037037037037,
1990
- "grad_norm": 30.783973693847656,
1991
- "learning_rate": 3.769317174658503e-05,
1992
- "loss": 2.9359,
1993
- "step": 2640
1994
- },
1995
- {
1996
- "epoch": 2.044753086419753,
1997
- "grad_norm": 23.419214248657227,
1998
- "learning_rate": 3.743719120077149e-05,
1999
- "loss": 2.9642,
2000
- "step": 2650
2001
- },
2002
- {
2003
- "epoch": 2.052469135802469,
2004
- "grad_norm": 43.545475006103516,
2005
- "learning_rate": 3.718131600198984e-05,
2006
- "loss": 2.7393,
2007
- "step": 2660
2008
- },
2009
- {
2010
- "epoch": 2.060185185185185,
2011
- "grad_norm": 27.782304763793945,
2012
- "learning_rate": 3.6925556668267346e-05,
2013
- "loss": 3.1423,
2014
- "step": 2670
2015
- },
2016
- {
2017
- "epoch": 2.067901234567901,
2018
- "grad_norm": 31.327932357788086,
2019
- "learning_rate": 3.666992371286843e-05,
2020
- "loss": 2.8856,
2021
- "step": 2680
2022
- },
2023
- {
2024
- "epoch": 2.075617283950617,
2025
- "grad_norm": 27.207284927368164,
2026
- "learning_rate": 3.6414427643862664e-05,
2027
- "loss": 3.0525,
2028
- "step": 2690
2029
- },
2030
- {
2031
- "epoch": 2.0833333333333335,
2032
- "grad_norm": 27.2406005859375,
2033
- "learning_rate": 3.615907896369273e-05,
2034
- "loss": 2.7817,
2035
- "step": 2700
2036
- },
2037
- {
2038
- "epoch": 2.0833333333333335,
2039
- "eval_loss": 3.2747247219085693,
2040
- "eval_runtime": 20.4314,
2041
- "eval_samples_per_second": 24.472,
2042
- "eval_steps_per_second": 24.472,
2043
- "step": 2700
2044
- },
2045
- {
2046
- "epoch": 2.0910493827160495,
2047
- "grad_norm": 33.61459732055664,
2048
- "learning_rate": 3.5903888168742754e-05,
2049
- "loss": 2.8164,
2050
- "step": 2710
2051
- },
2052
- {
2053
- "epoch": 2.0987654320987654,
2054
- "grad_norm": 34.80902099609375,
2055
- "learning_rate": 3.564886574890677e-05,
2056
- "loss": 2.6692,
2057
- "step": 2720
2058
- },
2059
- {
2060
- "epoch": 2.1064814814814814,
2061
- "grad_norm": 40.73919677734375,
2062
- "learning_rate": 3.539402218715763e-05,
2063
- "loss": 2.723,
2064
- "step": 2730
2065
- },
2066
- {
2067
- "epoch": 2.1141975308641974,
2068
- "grad_norm": 31.271671295166016,
2069
- "learning_rate": 3.5139367959115986e-05,
2070
- "loss": 2.7958,
2071
- "step": 2740
2072
- },
2073
- {
2074
- "epoch": 2.121913580246914,
2075
- "grad_norm": 29.13555908203125,
2076
- "learning_rate": 3.4884913532619756e-05,
2077
- "loss": 2.8345,
2078
- "step": 2750
2079
- },
2080
- {
2081
- "epoch": 2.1296296296296298,
2082
- "grad_norm": 24.78423309326172,
2083
- "learning_rate": 3.4630669367293797e-05,
2084
- "loss": 2.6955,
2085
- "step": 2760
2086
- },
2087
- {
2088
- "epoch": 2.1373456790123457,
2089
- "grad_norm": 33.64024353027344,
2090
- "learning_rate": 3.43766459141199e-05,
2091
- "loss": 2.801,
2092
- "step": 2770
2093
- },
2094
- {
2095
- "epoch": 2.1450617283950617,
2096
- "grad_norm": 34.713993072509766,
2097
- "learning_rate": 3.412285361500729e-05,
2098
- "loss": 2.7948,
2099
- "step": 2780
2100
- },
2101
- {
2102
- "epoch": 2.1527777777777777,
2103
- "grad_norm": 158.35733032226562,
2104
- "learning_rate": 3.386930290236336e-05,
2105
- "loss": 2.8969,
2106
- "step": 2790
2107
- },
2108
- {
2109
- "epoch": 2.1604938271604937,
2110
- "grad_norm": 23.90937614440918,
2111
- "learning_rate": 3.3616004198664845e-05,
2112
- "loss": 2.7715,
2113
- "step": 2800
2114
- },
2115
- {
2116
- "epoch": 2.16820987654321,
2117
- "grad_norm": 36.69210433959961,
2118
- "learning_rate": 3.33629679160293e-05,
2119
- "loss": 3.0046,
2120
- "step": 2810
2121
- },
2122
- {
2123
- "epoch": 2.175925925925926,
2124
- "grad_norm": 32.34378433227539,
2125
- "learning_rate": 3.311020445578725e-05,
2126
- "loss": 2.7916,
2127
- "step": 2820
2128
- },
2129
- {
2130
- "epoch": 2.183641975308642,
2131
- "grad_norm": 27.218441009521484,
2132
- "learning_rate": 3.285772420805454e-05,
2133
- "loss": 2.8711,
2134
- "step": 2830
2135
- },
2136
- {
2137
- "epoch": 2.191358024691358,
2138
- "grad_norm": 20.864200592041016,
2139
- "learning_rate": 3.260553755130525e-05,
2140
- "loss": 2.8068,
2141
- "step": 2840
2142
- },
2143
- {
2144
- "epoch": 2.199074074074074,
2145
- "grad_norm": 33.19184494018555,
2146
- "learning_rate": 3.235365485194512e-05,
2147
- "loss": 2.6901,
2148
- "step": 2850
2149
- },
2150
- {
2151
- "epoch": 2.199074074074074,
2152
- "eval_loss": 3.244508981704712,
2153
- "eval_runtime": 18.2444,
2154
- "eval_samples_per_second": 27.406,
2155
- "eval_steps_per_second": 27.406,
2156
- "step": 2850
2157
  }
2158
  ],
2159
  "logging_steps": 10,
@@ -2173,7 +139,7 @@
2173
  "attributes": {}
2174
  }
2175
  },
2176
- "total_flos": 4341743839150080.0,
2177
  "train_batch_size": 8,
2178
  "trial_name": null,
2179
  "trial_params": null
 
1
  {
2
+ "best_metric": 4.406219959259033,
3
+ "best_model_checkpoint": "./output/checkpoint-150",
4
+ "epoch": 0.11574074074074074,
5
  "eval_steps": 150,
6
+ "global_step": 150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.007716049382716049,
13
+ "grad_norm": 25.134618759155273,
14
  "learning_rate": 8.000000000000001e-06,
15
+ "loss": 5.0051,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.015432098765432098,
20
+ "grad_norm": 21.143909454345703,
21
  "learning_rate": 1.6000000000000003e-05,
22
+ "loss": 4.7303,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.023148148148148147,
27
+ "grad_norm": 21.639801025390625,
28
  "learning_rate": 2.4e-05,
29
+ "loss": 4.6905,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.030864197530864196,
34
+ "grad_norm": 21.367475509643555,
35
  "learning_rate": 3.2000000000000005e-05,
36
+ "loss": 4.6481,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.038580246913580245,
41
+ "grad_norm": 22.380338668823242,
42
  "learning_rate": 4e-05,
43
+ "loss": 4.6698,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.046296296296296294,
48
+ "grad_norm": 30.744077682495117,
49
  "learning_rate": 4.8e-05,
50
+ "loss": 4.5466,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.05401234567901234,
55
+ "grad_norm": 21.133874893188477,
56
  "learning_rate": 5.6e-05,
57
+ "loss": 4.5706,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.06172839506172839,
62
+ "grad_norm": 19.068326950073242,
63
  "learning_rate": 6.400000000000001e-05,
64
+ "loss": 4.3699,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.06944444444444445,
69
+ "grad_norm": 20.487281799316406,
70
  "learning_rate": 7.2e-05,
71
+ "loss": 4.4229,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.07716049382716049,
76
+ "grad_norm": 24.00373649597168,
77
  "learning_rate": 8e-05,
78
+ "loss": 4.4247,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.08487654320987655,
83
+ "grad_norm": 20.140514373779297,
84
  "learning_rate": 7.999917787833465e-05,
85
+ "loss": 4.2552,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.09259259259259259,
90
+ "grad_norm": 22.779335021972656,
91
  "learning_rate": 7.999671154713278e-05,
92
+ "loss": 4.2996,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.10030864197530864,
97
+ "grad_norm": 15.848005294799805,
98
  "learning_rate": 7.99926011077756e-05,
99
+ "loss": 4.359,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.10802469135802469,
104
+ "grad_norm": 19.472209930419922,
105
  "learning_rate": 7.99868467292272e-05,
106
+ "loss": 4.3415,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.11574074074074074,
111
+ "grad_norm": 13.642756462097168,
112
  "learning_rate": 7.997944864802752e-05,
113
+ "loss": 4.4707,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.11574074074074074,
118
+ "eval_loss": 4.406219959259033,
119
+ "eval_runtime": 17.8179,
120
+ "eval_samples_per_second": 28.062,
121
+ "eval_steps_per_second": 28.062,
122
  "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  }
124
  ],
125
  "logging_steps": 10,
 
139
  "attributes": {}
140
  }
141
  },
142
+ "total_flos": 232102392496128.0,
143
  "train_batch_size": 8,
144
  "trial_name": null,
145
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e0c7b06e2298807b4ae6a80ea5b1558c4b50bb1932aa8c42633bcd3de63d6458
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6605c2de7a844ad29d351b6c36722a29860e2329761ad63ddc14664ce30af171
3
  size 5368