error577 commited on
Commit
adba30e
·
verified ·
1 Parent(s): c9738c0

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -20,12 +20,12 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "up_proj",
24
  "gate_proj",
25
- "v_proj",
26
- "k_proj",
27
- "q_proj",
28
  "o_proj",
 
 
 
 
29
  "down_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
23
  "gate_proj",
 
 
 
24
  "o_proj",
25
+ "q_proj",
26
+ "k_proj",
27
+ "up_proj",
28
+ "v_proj",
29
  "down_proj"
30
  ],
31
  "task_type": "CAUSAL_LM",
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:27fd9d96fe970ba05e3d0a1187867e9aa0f91a35a9b7e5e06afe41cda5ffe858
3
  size 180385008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db5cce6156c4621517be68ed6604412d1e180059ddcba2665cbdb58955f9bb05
3
  size 180385008
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19fb996b6d924aaf741f7960cca2cac6d95b543b45fd8cbb1841f51822eae0d3
3
  size 137651322
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfcc971a71e688b4db954d5f9e261787333eb8279ea692ae9db960cb16db16c5
3
  size 137651322
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2bc53698ebe0612a88894cc5efe6ed9ebe6574153de26ad3dbff99f5c01de187
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11309a88af1da04c34187de7c9fa4eeb4751eebe97a4effc8b29c06633b89aa3
3
  size 14244
last-checkpoint/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "best_metric": 1.4612168073654175,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-200",
4
- "epoch": 0.0008042690601713897,
5
  "eval_steps": 200,
6
  "global_step": 200,
7
  "is_hyper_param_search": false,
@@ -9,1424 +9,1424 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 4.021345300856949e-06,
13
- "grad_norm": 2.1250829696655273,
14
  "learning_rate": 2e-05,
15
- "loss": 0.5581,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 4.021345300856949e-06,
20
- "eval_loss": 0.4672188460826874,
21
- "eval_runtime": 32.0568,
22
- "eval_samples_per_second": 7.767,
23
- "eval_steps_per_second": 7.767,
24
  "step": 1
25
  },
26
  {
27
- "epoch": 8.042690601713897e-06,
28
- "grad_norm": 1.9934287071228027,
29
  "learning_rate": 4e-05,
30
- "loss": 0.5831,
31
  "step": 2
32
  },
33
  {
34
- "epoch": 1.2064035902570847e-05,
35
- "grad_norm": 1.3228237628936768,
36
  "learning_rate": 6e-05,
37
- "loss": 0.4479,
38
  "step": 3
39
  },
40
  {
41
- "epoch": 1.6085381203427795e-05,
42
- "grad_norm": 1.273181438446045,
43
  "learning_rate": 8e-05,
44
- "loss": 0.4714,
45
  "step": 4
46
  },
47
  {
48
- "epoch": 2.0106726504284744e-05,
49
- "grad_norm": 1.606476902961731,
50
  "learning_rate": 0.0001,
51
- "loss": 0.8307,
52
  "step": 5
53
  },
54
  {
55
- "epoch": 2.4128071805141694e-05,
56
- "grad_norm": 1.6689647436141968,
57
  "learning_rate": 0.00012,
58
- "loss": 0.374,
59
  "step": 6
60
  },
61
  {
62
- "epoch": 2.814941710599864e-05,
63
- "grad_norm": 4.524862289428711,
64
  "learning_rate": 0.00014,
65
- "loss": 0.9419,
66
  "step": 7
67
  },
68
  {
69
- "epoch": 3.217076240685559e-05,
70
- "grad_norm": 4.146886348724365,
71
  "learning_rate": 0.00016,
72
- "loss": 0.6234,
73
  "step": 8
74
  },
75
  {
76
- "epoch": 3.6192107707712536e-05,
77
- "grad_norm": 5.241947650909424,
78
  "learning_rate": 0.00018,
79
- "loss": 0.8098,
80
  "step": 9
81
  },
82
  {
83
- "epoch": 4.021345300856949e-05,
84
- "grad_norm": 7.22183084487915,
85
  "learning_rate": 0.0002,
86
- "loss": 0.6312,
87
  "step": 10
88
  },
89
  {
90
- "epoch": 4.4234798309426435e-05,
91
- "grad_norm": 6.737246990203857,
92
  "learning_rate": 0.0002,
93
- "loss": 0.6996,
94
  "step": 11
95
  },
96
  {
97
- "epoch": 4.825614361028339e-05,
98
- "grad_norm": 6.468095779418945,
99
  "learning_rate": 0.0002,
100
- "loss": 0.8823,
101
  "step": 12
102
  },
103
  {
104
- "epoch": 5.2277488911140334e-05,
105
- "grad_norm": 16.900236129760742,
106
  "learning_rate": 0.0002,
107
- "loss": 0.6476,
108
  "step": 13
109
  },
110
  {
111
- "epoch": 5.629883421199728e-05,
112
- "grad_norm": 8.813634872436523,
113
  "learning_rate": 0.0002,
114
- "loss": 0.7912,
115
  "step": 14
116
  },
117
  {
118
- "epoch": 6.032017951285423e-05,
119
- "grad_norm": 18.69288444519043,
120
  "learning_rate": 0.0002,
121
- "loss": 0.8103,
122
  "step": 15
123
  },
124
  {
125
- "epoch": 6.434152481371118e-05,
126
- "grad_norm": 15.614603996276855,
127
  "learning_rate": 0.0002,
128
- "loss": 0.9055,
129
  "step": 16
130
  },
131
  {
132
- "epoch": 6.836287011456813e-05,
133
- "grad_norm": 12.850824356079102,
134
  "learning_rate": 0.0002,
135
- "loss": 1.3363,
136
  "step": 17
137
  },
138
  {
139
- "epoch": 7.238421541542507e-05,
140
- "grad_norm": 8.7499418258667,
141
  "learning_rate": 0.0002,
142
- "loss": 0.9585,
143
  "step": 18
144
  },
145
  {
146
- "epoch": 7.640556071628202e-05,
147
- "grad_norm": 6.852315425872803,
148
  "learning_rate": 0.0002,
149
- "loss": 1.0607,
150
  "step": 19
151
  },
152
  {
153
- "epoch": 8.042690601713898e-05,
154
- "grad_norm": 11.972504615783691,
155
  "learning_rate": 0.0002,
156
- "loss": 1.0609,
157
  "step": 20
158
  },
159
  {
160
- "epoch": 8.444825131799592e-05,
161
- "grad_norm": 5.352021217346191,
162
  "learning_rate": 0.0002,
163
- "loss": 0.7703,
164
  "step": 21
165
  },
166
  {
167
- "epoch": 8.846959661885287e-05,
168
- "grad_norm": 4.236023426055908,
169
  "learning_rate": 0.0002,
170
- "loss": 0.8208,
171
  "step": 22
172
  },
173
  {
174
- "epoch": 9.249094191970982e-05,
175
- "grad_norm": 6.339336395263672,
176
  "learning_rate": 0.0002,
177
- "loss": 0.9896,
178
  "step": 23
179
  },
180
  {
181
- "epoch": 9.651228722056678e-05,
182
- "grad_norm": 6.273009777069092,
183
  "learning_rate": 0.0002,
184
- "loss": 0.9005,
185
  "step": 24
186
  },
187
  {
188
- "epoch": 0.00010053363252142371,
189
- "grad_norm": 10.484920501708984,
190
  "learning_rate": 0.0002,
191
- "loss": 0.7364,
192
  "step": 25
193
  },
194
  {
195
- "epoch": 0.00010455497782228067,
196
- "grad_norm": 17.437223434448242,
197
  "learning_rate": 0.0002,
198
- "loss": 0.9995,
199
  "step": 26
200
  },
201
  {
202
- "epoch": 0.00010857632312313762,
203
- "grad_norm": 10.151628494262695,
204
  "learning_rate": 0.0002,
205
- "loss": 0.8712,
206
  "step": 27
207
  },
208
  {
209
- "epoch": 0.00011259766842399456,
210
- "grad_norm": 9.369444847106934,
211
  "learning_rate": 0.0002,
212
- "loss": 1.8323,
213
  "step": 28
214
  },
215
  {
216
- "epoch": 0.00011661901372485151,
217
- "grad_norm": 8.75271987915039,
218
  "learning_rate": 0.0002,
219
- "loss": 0.9926,
220
  "step": 29
221
  },
222
  {
223
- "epoch": 0.00012064035902570847,
224
- "grad_norm": 9.381953239440918,
225
  "learning_rate": 0.0002,
226
- "loss": 1.2458,
227
  "step": 30
228
  },
229
  {
230
- "epoch": 0.00012466170432656542,
231
- "grad_norm": 6.763002395629883,
232
  "learning_rate": 0.0002,
233
- "loss": 1.1212,
234
  "step": 31
235
  },
236
  {
237
- "epoch": 0.00012868304962742236,
238
- "grad_norm": 6.601625919342041,
239
  "learning_rate": 0.0002,
240
- "loss": 1.3255,
241
  "step": 32
242
  },
243
  {
244
- "epoch": 0.0001327043949282793,
245
- "grad_norm": 7.443782806396484,
246
  "learning_rate": 0.0002,
247
- "loss": 0.9829,
248
  "step": 33
249
  },
250
  {
251
- "epoch": 0.00013672574022913626,
252
- "grad_norm": 12.476454734802246,
253
  "learning_rate": 0.0002,
254
- "loss": 1.3845,
255
  "step": 34
256
  },
257
  {
258
- "epoch": 0.0001407470855299932,
259
- "grad_norm": 17.0606689453125,
260
  "learning_rate": 0.0002,
261
- "loss": 1.99,
262
  "step": 35
263
  },
264
  {
265
- "epoch": 0.00014476843083085014,
266
- "grad_norm": 6.719250679016113,
267
  "learning_rate": 0.0002,
268
- "loss": 1.135,
269
  "step": 36
270
  },
271
  {
272
- "epoch": 0.0001487897761317071,
273
- "grad_norm": 5.461545944213867,
274
  "learning_rate": 0.0002,
275
- "loss": 1.4928,
276
  "step": 37
277
  },
278
  {
279
- "epoch": 0.00015281112143256405,
280
- "grad_norm": 5.649755001068115,
281
  "learning_rate": 0.0002,
282
- "loss": 0.839,
283
  "step": 38
284
  },
285
  {
286
- "epoch": 0.000156832466733421,
287
- "grad_norm": 12.541187286376953,
288
  "learning_rate": 0.0002,
289
- "loss": 1.2483,
290
  "step": 39
291
  },
292
  {
293
- "epoch": 0.00016085381203427795,
294
- "grad_norm": 15.279743194580078,
295
  "learning_rate": 0.0002,
296
- "loss": 0.84,
297
  "step": 40
298
  },
299
  {
300
- "epoch": 0.0001648751573351349,
301
- "grad_norm": 14.897642135620117,
302
  "learning_rate": 0.0002,
303
- "loss": 1.4763,
304
  "step": 41
305
  },
306
  {
307
- "epoch": 0.00016889650263599183,
308
- "grad_norm": 23.705480575561523,
309
  "learning_rate": 0.0002,
310
- "loss": 1.7018,
311
  "step": 42
312
  },
313
  {
314
- "epoch": 0.0001729178479368488,
315
- "grad_norm": 23.132944107055664,
316
  "learning_rate": 0.0002,
317
- "loss": 1.579,
318
  "step": 43
319
  },
320
  {
321
- "epoch": 0.00017693919323770574,
322
- "grad_norm": 13.163228034973145,
323
  "learning_rate": 0.0002,
324
- "loss": 1.1921,
325
  "step": 44
326
  },
327
  {
328
- "epoch": 0.00018096053853856268,
329
- "grad_norm": 11.848477363586426,
330
  "learning_rate": 0.0002,
331
- "loss": 1.3981,
332
  "step": 45
333
  },
334
  {
335
- "epoch": 0.00018498188383941964,
336
- "grad_norm": 11.794342041015625,
337
  "learning_rate": 0.0002,
338
- "loss": 1.2867,
339
  "step": 46
340
  },
341
  {
342
- "epoch": 0.00018900322914027658,
343
- "grad_norm": 13.43369197845459,
344
  "learning_rate": 0.0002,
345
- "loss": 1.7953,
346
  "step": 47
347
  },
348
  {
349
- "epoch": 0.00019302457444113355,
350
- "grad_norm": 10.306236267089844,
351
  "learning_rate": 0.0002,
352
- "loss": 1.2805,
353
  "step": 48
354
  },
355
  {
356
- "epoch": 0.0001970459197419905,
357
- "grad_norm": 7.8810715675354,
358
  "learning_rate": 0.0002,
359
- "loss": 1.4666,
360
  "step": 49
361
  },
362
  {
363
- "epoch": 0.00020106726504284743,
364
- "grad_norm": 10.786847114562988,
365
  "learning_rate": 0.0002,
366
- "loss": 0.9801,
367
  "step": 50
368
  },
369
  {
370
- "epoch": 0.0002050886103437044,
371
- "grad_norm": 18.616641998291016,
372
  "learning_rate": 0.0002,
373
- "loss": 1.5865,
374
  "step": 51
375
  },
376
  {
377
- "epoch": 0.00020910995564456133,
378
- "grad_norm": 5.640084743499756,
379
  "learning_rate": 0.0002,
380
- "loss": 1.2003,
381
  "step": 52
382
  },
383
  {
384
- "epoch": 0.00021313130094541827,
385
- "grad_norm": 7.515094757080078,
386
  "learning_rate": 0.0002,
387
- "loss": 1.1607,
388
  "step": 53
389
  },
390
  {
391
- "epoch": 0.00021715264624627524,
392
- "grad_norm": 8.680420875549316,
393
  "learning_rate": 0.0002,
394
- "loss": 1.0826,
395
  "step": 54
396
  },
397
  {
398
- "epoch": 0.00022117399154713218,
399
- "grad_norm": 10.082337379455566,
400
  "learning_rate": 0.0002,
401
- "loss": 1.102,
402
  "step": 55
403
  },
404
  {
405
- "epoch": 0.00022519533684798912,
406
- "grad_norm": 7.704357147216797,
407
  "learning_rate": 0.0002,
408
- "loss": 1.7538,
409
  "step": 56
410
  },
411
  {
412
- "epoch": 0.00022921668214884609,
413
- "grad_norm": 13.682107925415039,
414
  "learning_rate": 0.0002,
415
- "loss": 1.566,
416
  "step": 57
417
  },
418
  {
419
- "epoch": 0.00023323802744970302,
420
- "grad_norm": 8.392353057861328,
421
  "learning_rate": 0.0002,
422
- "loss": 1.0485,
423
  "step": 58
424
  },
425
  {
426
- "epoch": 0.00023725937275055996,
427
- "grad_norm": 10.200504302978516,
428
  "learning_rate": 0.0002,
429
- "loss": 1.8267,
430
  "step": 59
431
  },
432
  {
433
- "epoch": 0.00024128071805141693,
434
- "grad_norm": 10.44006633758545,
435
  "learning_rate": 0.0002,
436
- "loss": 1.6873,
437
  "step": 60
438
  },
439
  {
440
- "epoch": 0.00024530206335227387,
441
- "grad_norm": 11.430456161499023,
442
  "learning_rate": 0.0002,
443
- "loss": 1.1932,
444
  "step": 61
445
  },
446
  {
447
- "epoch": 0.00024932340865313084,
448
- "grad_norm": 9.302727699279785,
449
  "learning_rate": 0.0002,
450
- "loss": 1.2453,
451
  "step": 62
452
  },
453
  {
454
- "epoch": 0.00025334475395398775,
455
- "grad_norm": 10.85357666015625,
456
  "learning_rate": 0.0002,
457
- "loss": 1.0684,
458
  "step": 63
459
  },
460
  {
461
- "epoch": 0.0002573660992548447,
462
- "grad_norm": 12.542272567749023,
463
  "learning_rate": 0.0002,
464
- "loss": 1.5268,
465
  "step": 64
466
  },
467
  {
468
- "epoch": 0.0002613874445557017,
469
- "grad_norm": 8.385772705078125,
470
  "learning_rate": 0.0002,
471
- "loss": 1.6718,
472
  "step": 65
473
  },
474
  {
475
- "epoch": 0.0002654087898565586,
476
- "grad_norm": 17.25252342224121,
477
  "learning_rate": 0.0002,
478
- "loss": 1.1325,
479
  "step": 66
480
  },
481
  {
482
- "epoch": 0.00026943013515741556,
483
- "grad_norm": 10.651689529418945,
484
  "learning_rate": 0.0002,
485
- "loss": 1.1901,
486
  "step": 67
487
  },
488
  {
489
- "epoch": 0.0002734514804582725,
490
- "grad_norm": 6.3180084228515625,
491
  "learning_rate": 0.0002,
492
- "loss": 1.1154,
493
  "step": 68
494
  },
495
  {
496
- "epoch": 0.00027747282575912944,
497
- "grad_norm": 7.142451286315918,
498
  "learning_rate": 0.0002,
499
- "loss": 1.293,
500
  "step": 69
501
  },
502
  {
503
- "epoch": 0.0002814941710599864,
504
- "grad_norm": 8.585003852844238,
505
  "learning_rate": 0.0002,
506
- "loss": 1.4001,
507
  "step": 70
508
  },
509
  {
510
- "epoch": 0.00028551551636084337,
511
- "grad_norm": 12.69324016571045,
512
  "learning_rate": 0.0002,
513
- "loss": 1.0116,
514
  "step": 71
515
  },
516
  {
517
- "epoch": 0.0002895368616617003,
518
- "grad_norm": 13.226308822631836,
519
  "learning_rate": 0.0002,
520
- "loss": 1.2387,
521
  "step": 72
522
  },
523
  {
524
- "epoch": 0.00029355820696255725,
525
- "grad_norm": 15.932046890258789,
526
  "learning_rate": 0.0002,
527
- "loss": 1.4289,
528
  "step": 73
529
  },
530
  {
531
- "epoch": 0.0002975795522634142,
532
- "grad_norm": 14.791518211364746,
533
  "learning_rate": 0.0002,
534
- "loss": 1.4636,
535
  "step": 74
536
  },
537
  {
538
- "epoch": 0.00030160089756427113,
539
- "grad_norm": 9.525426864624023,
540
  "learning_rate": 0.0002,
541
- "loss": 1.2034,
542
  "step": 75
543
  },
544
  {
545
- "epoch": 0.0003056222428651281,
546
- "grad_norm": 9.078874588012695,
547
  "learning_rate": 0.0002,
548
- "loss": 1.087,
549
  "step": 76
550
  },
551
  {
552
- "epoch": 0.00030964358816598506,
553
- "grad_norm": 11.860196113586426,
554
  "learning_rate": 0.0002,
555
- "loss": 1.5046,
556
  "step": 77
557
  },
558
  {
559
- "epoch": 0.000313664933466842,
560
- "grad_norm": 15.652862548828125,
561
  "learning_rate": 0.0002,
562
- "loss": 1.3219,
563
  "step": 78
564
  },
565
  {
566
- "epoch": 0.00031768627876769894,
567
- "grad_norm": 9.072121620178223,
568
  "learning_rate": 0.0002,
569
- "loss": 0.9932,
570
  "step": 79
571
  },
572
  {
573
- "epoch": 0.0003217076240685559,
574
- "grad_norm": 11.151254653930664,
575
  "learning_rate": 0.0002,
576
- "loss": 1.0352,
577
  "step": 80
578
  },
579
  {
580
- "epoch": 0.0003257289693694128,
581
- "grad_norm": 8.298866271972656,
582
  "learning_rate": 0.0002,
583
- "loss": 1.3864,
584
  "step": 81
585
  },
586
  {
587
- "epoch": 0.0003297503146702698,
588
- "grad_norm": 11.593015670776367,
589
  "learning_rate": 0.0002,
590
- "loss": 1.1893,
591
  "step": 82
592
  },
593
  {
594
- "epoch": 0.00033377165997112675,
595
- "grad_norm": 7.286454677581787,
596
  "learning_rate": 0.0002,
597
- "loss": 1.161,
598
  "step": 83
599
  },
600
  {
601
- "epoch": 0.00033779300527198367,
602
- "grad_norm": 8.644365310668945,
603
  "learning_rate": 0.0002,
604
- "loss": 1.0684,
605
  "step": 84
606
  },
607
  {
608
- "epoch": 0.00034181435057284063,
609
- "grad_norm": 6.340452194213867,
610
  "learning_rate": 0.0002,
611
- "loss": 1.026,
612
  "step": 85
613
  },
614
  {
615
- "epoch": 0.0003458356958736976,
616
- "grad_norm": 14.825886726379395,
617
  "learning_rate": 0.0002,
618
- "loss": 1.428,
619
  "step": 86
620
  },
621
  {
622
- "epoch": 0.0003498570411745545,
623
- "grad_norm": 10.448280334472656,
624
  "learning_rate": 0.0002,
625
- "loss": 1.4677,
626
  "step": 87
627
  },
628
  {
629
- "epoch": 0.0003538783864754115,
630
- "grad_norm": 11.790903091430664,
631
  "learning_rate": 0.0002,
632
- "loss": 1.6509,
633
  "step": 88
634
  },
635
  {
636
- "epoch": 0.00035789973177626844,
637
- "grad_norm": 15.971644401550293,
638
  "learning_rate": 0.0002,
639
- "loss": 0.9585,
640
  "step": 89
641
  },
642
  {
643
- "epoch": 0.00036192107707712536,
644
- "grad_norm": 8.853199005126953,
645
  "learning_rate": 0.0002,
646
- "loss": 0.9822,
647
  "step": 90
648
  },
649
  {
650
- "epoch": 0.0003659424223779823,
651
- "grad_norm": 16.233217239379883,
652
  "learning_rate": 0.0002,
653
- "loss": 1.8514,
654
  "step": 91
655
  },
656
  {
657
- "epoch": 0.0003699637676788393,
658
- "grad_norm": 10.900402069091797,
659
  "learning_rate": 0.0002,
660
- "loss": 1.4084,
661
  "step": 92
662
  },
663
  {
664
- "epoch": 0.0003739851129796962,
665
- "grad_norm": 14.19662857055664,
666
  "learning_rate": 0.0002,
667
- "loss": 1.4282,
668
  "step": 93
669
  },
670
  {
671
- "epoch": 0.00037800645828055317,
672
- "grad_norm": 9.281522750854492,
673
  "learning_rate": 0.0002,
674
- "loss": 1.0121,
675
  "step": 94
676
  },
677
  {
678
- "epoch": 0.00038202780358141013,
679
- "grad_norm": 9.312750816345215,
680
  "learning_rate": 0.0002,
681
- "loss": 0.8598,
682
  "step": 95
683
  },
684
  {
685
- "epoch": 0.0003860491488822671,
686
- "grad_norm": 33.047245025634766,
687
  "learning_rate": 0.0002,
688
- "loss": 1.1154,
689
  "step": 96
690
  },
691
  {
692
- "epoch": 0.000390070494183124,
693
- "grad_norm": 17.734811782836914,
694
  "learning_rate": 0.0002,
695
- "loss": 1.2669,
696
  "step": 97
697
  },
698
  {
699
- "epoch": 0.000394091839483981,
700
- "grad_norm": 9.97078800201416,
701
  "learning_rate": 0.0002,
702
- "loss": 0.9161,
703
  "step": 98
704
  },
705
  {
706
- "epoch": 0.00039811318478483795,
707
- "grad_norm": 12.870509147644043,
708
  "learning_rate": 0.0002,
709
- "loss": 0.9081,
710
  "step": 99
711
  },
712
  {
713
- "epoch": 0.00040213453008569486,
714
- "grad_norm": 12.00430965423584,
715
  "learning_rate": 0.0002,
716
- "loss": 1.3502,
717
  "step": 100
718
  },
719
  {
720
- "epoch": 0.0004061558753865518,
721
- "grad_norm": 29.870113372802734,
722
  "learning_rate": 0.0002,
723
- "loss": 1.3471,
724
  "step": 101
725
  },
726
  {
727
- "epoch": 0.0004101772206874088,
728
- "grad_norm": 12.41720962524414,
729
  "learning_rate": 0.0002,
730
- "loss": 1.6199,
731
  "step": 102
732
  },
733
  {
734
- "epoch": 0.0004141985659882657,
735
- "grad_norm": 17.7427978515625,
736
  "learning_rate": 0.0002,
737
- "loss": 1.2317,
738
  "step": 103
739
  },
740
  {
741
- "epoch": 0.00041821991128912267,
742
- "grad_norm": 10.443873405456543,
743
  "learning_rate": 0.0002,
744
- "loss": 1.352,
745
  "step": 104
746
  },
747
  {
748
- "epoch": 0.00042224125658997964,
749
- "grad_norm": 10.916038513183594,
750
  "learning_rate": 0.0002,
751
- "loss": 1.2256,
752
  "step": 105
753
  },
754
  {
755
- "epoch": 0.00042626260189083655,
756
- "grad_norm": 22.543190002441406,
757
  "learning_rate": 0.0002,
758
- "loss": 1.9063,
759
  "step": 106
760
  },
761
  {
762
- "epoch": 0.0004302839471916935,
763
- "grad_norm": 13.181563377380371,
764
  "learning_rate": 0.0002,
765
- "loss": 1.3225,
766
  "step": 107
767
  },
768
  {
769
- "epoch": 0.0004343052924925505,
770
- "grad_norm": 7.7589898109436035,
771
  "learning_rate": 0.0002,
772
- "loss": 1.1763,
773
  "step": 108
774
  },
775
  {
776
- "epoch": 0.0004383266377934074,
777
- "grad_norm": 42.25303268432617,
778
  "learning_rate": 0.0002,
779
- "loss": 1.6523,
780
  "step": 109
781
  },
782
  {
783
- "epoch": 0.00044234798309426436,
784
- "grad_norm": 11.621317863464355,
785
  "learning_rate": 0.0002,
786
- "loss": 1.06,
787
  "step": 110
788
  },
789
  {
790
- "epoch": 0.0004463693283951213,
791
- "grad_norm": 19.160158157348633,
792
  "learning_rate": 0.0002,
793
- "loss": 1.9047,
794
  "step": 111
795
  },
796
  {
797
- "epoch": 0.00045039067369597824,
798
- "grad_norm": 8.714892387390137,
799
  "learning_rate": 0.0002,
800
- "loss": 1.1184,
801
  "step": 112
802
  },
803
  {
804
- "epoch": 0.0004544120189968352,
805
- "grad_norm": 24.73910140991211,
806
  "learning_rate": 0.0002,
807
- "loss": 2.4336,
808
  "step": 113
809
  },
810
  {
811
- "epoch": 0.00045843336429769217,
812
- "grad_norm": 11.352472305297852,
813
  "learning_rate": 0.0002,
814
- "loss": 1.5641,
815
  "step": 114
816
  },
817
  {
818
- "epoch": 0.0004624547095985491,
819
- "grad_norm": 9.388741493225098,
820
  "learning_rate": 0.0002,
821
- "loss": 1.0202,
822
  "step": 115
823
  },
824
  {
825
- "epoch": 0.00046647605489940605,
826
- "grad_norm": 10.607665061950684,
827
  "learning_rate": 0.0002,
828
- "loss": 1.1882,
829
  "step": 116
830
  },
831
  {
832
- "epoch": 0.000470497400200263,
833
- "grad_norm": 11.667326927185059,
834
  "learning_rate": 0.0002,
835
- "loss": 0.9898,
836
  "step": 117
837
  },
838
  {
839
- "epoch": 0.00047451874550111993,
840
- "grad_norm": 13.278824806213379,
841
  "learning_rate": 0.0002,
842
- "loss": 1.6049,
843
  "step": 118
844
  },
845
  {
846
- "epoch": 0.0004785400908019769,
847
- "grad_norm": 19.007776260375977,
848
  "learning_rate": 0.0002,
849
- "loss": 1.3523,
850
  "step": 119
851
  },
852
  {
853
- "epoch": 0.00048256143610283386,
854
- "grad_norm": 25.230680465698242,
855
  "learning_rate": 0.0002,
856
- "loss": 1.3257,
857
  "step": 120
858
  },
859
  {
860
- "epoch": 0.0004865827814036908,
861
- "grad_norm": 7.6431450843811035,
862
  "learning_rate": 0.0002,
863
- "loss": 0.9766,
864
  "step": 121
865
  },
866
  {
867
- "epoch": 0.0004906041267045477,
868
- "grad_norm": 7.645576477050781,
869
  "learning_rate": 0.0002,
870
- "loss": 0.9987,
871
  "step": 122
872
  },
873
  {
874
- "epoch": 0.0004946254720054047,
875
- "grad_norm": 13.02873420715332,
876
  "learning_rate": 0.0002,
877
- "loss": 1.0794,
878
  "step": 123
879
  },
880
  {
881
- "epoch": 0.0004986468173062617,
882
- "grad_norm": 57.77375411987305,
883
  "learning_rate": 0.0002,
884
- "loss": 1.8805,
885
  "step": 124
886
  },
887
  {
888
- "epoch": 0.0005026681626071186,
889
- "grad_norm": 18.01174545288086,
890
  "learning_rate": 0.0002,
891
- "loss": 1.351,
892
  "step": 125
893
  },
894
  {
895
- "epoch": 0.0005066895079079755,
896
- "grad_norm": 16.48109245300293,
897
  "learning_rate": 0.0002,
898
- "loss": 1.5716,
899
  "step": 126
900
  },
901
  {
902
- "epoch": 0.0005107108532088325,
903
- "grad_norm": 15.327945709228516,
904
  "learning_rate": 0.0002,
905
- "loss": 1.1796,
906
  "step": 127
907
  },
908
  {
909
- "epoch": 0.0005147321985096894,
910
- "grad_norm": 56.11573028564453,
911
  "learning_rate": 0.0002,
912
- "loss": 2.877,
913
  "step": 128
914
  },
915
  {
916
- "epoch": 0.0005187535438105463,
917
- "grad_norm": 23.577686309814453,
918
  "learning_rate": 0.0002,
919
- "loss": 1.2067,
920
  "step": 129
921
  },
922
  {
923
- "epoch": 0.0005227748891114034,
924
- "grad_norm": 38.37621307373047,
925
  "learning_rate": 0.0002,
926
- "loss": 1.402,
927
  "step": 130
928
  },
929
  {
930
- "epoch": 0.0005267962344122603,
931
- "grad_norm": 9.98384952545166,
932
  "learning_rate": 0.0002,
933
- "loss": 1.3932,
934
  "step": 131
935
  },
936
  {
937
- "epoch": 0.0005308175797131172,
938
- "grad_norm": 9.067256927490234,
939
  "learning_rate": 0.0002,
940
- "loss": 0.9654,
941
  "step": 132
942
  },
943
  {
944
- "epoch": 0.0005348389250139742,
945
- "grad_norm": 9.063508033752441,
946
  "learning_rate": 0.0002,
947
- "loss": 1.413,
948
  "step": 133
949
  },
950
  {
951
- "epoch": 0.0005388602703148311,
952
- "grad_norm": 13.763749122619629,
953
  "learning_rate": 0.0002,
954
- "loss": 1.5151,
955
  "step": 134
956
  },
957
  {
958
- "epoch": 0.000542881615615688,
959
- "grad_norm": 35.761844635009766,
960
  "learning_rate": 0.0002,
961
- "loss": 2.3071,
962
  "step": 135
963
  },
964
  {
965
- "epoch": 0.000546902960916545,
966
- "grad_norm": 10.740913391113281,
967
  "learning_rate": 0.0002,
968
- "loss": 1.3204,
969
  "step": 136
970
  },
971
  {
972
- "epoch": 0.000550924306217402,
973
- "grad_norm": 29.596393585205078,
974
  "learning_rate": 0.0002,
975
- "loss": 1.6114,
976
  "step": 137
977
  },
978
  {
979
- "epoch": 0.0005549456515182589,
980
- "grad_norm": 11.534493446350098,
981
  "learning_rate": 0.0002,
982
- "loss": 1.584,
983
  "step": 138
984
  },
985
  {
986
- "epoch": 0.0005589669968191159,
987
- "grad_norm": 14.048515319824219,
988
  "learning_rate": 0.0002,
989
- "loss": 1.8239,
990
  "step": 139
991
  },
992
  {
993
- "epoch": 0.0005629883421199728,
994
- "grad_norm": 15.496759414672852,
995
  "learning_rate": 0.0002,
996
- "loss": 1.4984,
997
  "step": 140
998
  },
999
  {
1000
- "epoch": 0.0005670096874208297,
1001
- "grad_norm": 20.077861785888672,
1002
  "learning_rate": 0.0002,
1003
- "loss": 1.3176,
1004
  "step": 141
1005
  },
1006
  {
1007
- "epoch": 0.0005710310327216867,
1008
- "grad_norm": 13.013651847839355,
1009
  "learning_rate": 0.0002,
1010
- "loss": 1.5781,
1011
  "step": 142
1012
  },
1013
  {
1014
- "epoch": 0.0005750523780225437,
1015
- "grad_norm": 8.42491340637207,
1016
  "learning_rate": 0.0002,
1017
- "loss": 1.175,
1018
  "step": 143
1019
  },
1020
  {
1021
- "epoch": 0.0005790737233234006,
1022
- "grad_norm": 13.043536186218262,
1023
  "learning_rate": 0.0002,
1024
- "loss": 1.6594,
1025
  "step": 144
1026
  },
1027
  {
1028
- "epoch": 0.0005830950686242576,
1029
- "grad_norm": 8.459278106689453,
1030
  "learning_rate": 0.0002,
1031
- "loss": 1.2524,
1032
  "step": 145
1033
  },
1034
  {
1035
- "epoch": 0.0005871164139251145,
1036
- "grad_norm": 16.37969207763672,
1037
  "learning_rate": 0.0002,
1038
- "loss": 1.0042,
1039
  "step": 146
1040
  },
1041
  {
1042
- "epoch": 0.0005911377592259714,
1043
- "grad_norm": 11.152143478393555,
1044
  "learning_rate": 0.0002,
1045
- "loss": 1.5675,
1046
  "step": 147
1047
  },
1048
  {
1049
- "epoch": 0.0005951591045268284,
1050
- "grad_norm": 24.192337036132812,
1051
  "learning_rate": 0.0002,
1052
- "loss": 1.4515,
1053
  "step": 148
1054
  },
1055
  {
1056
- "epoch": 0.0005991804498276853,
1057
- "grad_norm": 14.054618835449219,
1058
  "learning_rate": 0.0002,
1059
- "loss": 1.4857,
1060
  "step": 149
1061
  },
1062
  {
1063
- "epoch": 0.0006032017951285423,
1064
- "grad_norm": 11.309020042419434,
1065
  "learning_rate": 0.0002,
1066
- "loss": 1.2878,
1067
  "step": 150
1068
  },
1069
  {
1070
- "epoch": 0.0006072231404293993,
1071
- "grad_norm": 16.008554458618164,
1072
  "learning_rate": 0.0002,
1073
- "loss": 1.2043,
1074
  "step": 151
1075
  },
1076
  {
1077
- "epoch": 0.0006112444857302562,
1078
- "grad_norm": 9.693023681640625,
1079
  "learning_rate": 0.0002,
1080
- "loss": 1.1373,
1081
  "step": 152
1082
  },
1083
  {
1084
- "epoch": 0.0006152658310311131,
1085
- "grad_norm": 18.5133056640625,
1086
  "learning_rate": 0.0002,
1087
- "loss": 2.0182,
1088
  "step": 153
1089
  },
1090
  {
1091
- "epoch": 0.0006192871763319701,
1092
- "grad_norm": 13.03020191192627,
1093
  "learning_rate": 0.0002,
1094
- "loss": 1.1401,
1095
  "step": 154
1096
  },
1097
  {
1098
- "epoch": 0.000623308521632827,
1099
- "grad_norm": 18.04163932800293,
1100
  "learning_rate": 0.0002,
1101
- "loss": 1.2094,
1102
  "step": 155
1103
  },
1104
  {
1105
- "epoch": 0.000627329866933684,
1106
- "grad_norm": 27.854990005493164,
1107
  "learning_rate": 0.0002,
1108
- "loss": 1.6173,
1109
  "step": 156
1110
  },
1111
  {
1112
- "epoch": 0.000631351212234541,
1113
- "grad_norm": 10.695880889892578,
1114
  "learning_rate": 0.0002,
1115
- "loss": 1.5821,
1116
  "step": 157
1117
  },
1118
  {
1119
- "epoch": 0.0006353725575353979,
1120
- "grad_norm": 65.75477600097656,
1121
  "learning_rate": 0.0002,
1122
- "loss": 4.1151,
1123
  "step": 158
1124
  },
1125
  {
1126
- "epoch": 0.0006393939028362548,
1127
- "grad_norm": 51.57217025756836,
1128
  "learning_rate": 0.0002,
1129
- "loss": 2.5532,
1130
  "step": 159
1131
  },
1132
  {
1133
- "epoch": 0.0006434152481371118,
1134
- "grad_norm": 12.791463851928711,
1135
  "learning_rate": 0.0002,
1136
- "loss": 1.438,
1137
  "step": 160
1138
  },
1139
  {
1140
- "epoch": 0.0006474365934379687,
1141
- "grad_norm": 12.390244483947754,
1142
  "learning_rate": 0.0002,
1143
- "loss": 1.7736,
1144
  "step": 161
1145
  },
1146
  {
1147
- "epoch": 0.0006514579387388256,
1148
- "grad_norm": 32.154598236083984,
1149
  "learning_rate": 0.0002,
1150
- "loss": 1.7178,
1151
  "step": 162
1152
  },
1153
  {
1154
- "epoch": 0.0006554792840396827,
1155
- "grad_norm": 16.198659896850586,
1156
  "learning_rate": 0.0002,
1157
- "loss": 1.8435,
1158
  "step": 163
1159
  },
1160
  {
1161
- "epoch": 0.0006595006293405396,
1162
- "grad_norm": 21.361989974975586,
1163
  "learning_rate": 0.0002,
1164
- "loss": 1.7035,
1165
  "step": 164
1166
  },
1167
  {
1168
- "epoch": 0.0006635219746413965,
1169
- "grad_norm": 9.898756980895996,
1170
  "learning_rate": 0.0002,
1171
- "loss": 1.3707,
1172
  "step": 165
1173
  },
1174
  {
1175
- "epoch": 0.0006675433199422535,
1176
- "grad_norm": 16.237110137939453,
1177
  "learning_rate": 0.0002,
1178
- "loss": 1.5044,
1179
  "step": 166
1180
  },
1181
  {
1182
- "epoch": 0.0006715646652431104,
1183
- "grad_norm": 22.132568359375,
1184
  "learning_rate": 0.0002,
1185
- "loss": 1.491,
1186
  "step": 167
1187
  },
1188
  {
1189
- "epoch": 0.0006755860105439673,
1190
- "grad_norm": 13.93227481842041,
1191
  "learning_rate": 0.0002,
1192
- "loss": 1.0854,
1193
  "step": 168
1194
  },
1195
  {
1196
- "epoch": 0.0006796073558448244,
1197
- "grad_norm": 15.528178215026855,
1198
  "learning_rate": 0.0002,
1199
- "loss": 1.4709,
1200
  "step": 169
1201
  },
1202
  {
1203
- "epoch": 0.0006836287011456813,
1204
- "grad_norm": 16.82071876525879,
1205
  "learning_rate": 0.0002,
1206
- "loss": 1.8743,
1207
  "step": 170
1208
  },
1209
  {
1210
- "epoch": 0.0006876500464465382,
1211
- "grad_norm": 28.489633560180664,
1212
  "learning_rate": 0.0002,
1213
- "loss": 1.432,
1214
  "step": 171
1215
  },
1216
  {
1217
- "epoch": 0.0006916713917473952,
1218
- "grad_norm": 13.395151138305664,
1219
  "learning_rate": 0.0002,
1220
- "loss": 1.3452,
1221
  "step": 172
1222
  },
1223
  {
1224
- "epoch": 0.0006956927370482521,
1225
- "grad_norm": 12.414864540100098,
1226
  "learning_rate": 0.0002,
1227
- "loss": 1.5916,
1228
  "step": 173
1229
  },
1230
  {
1231
- "epoch": 0.000699714082349109,
1232
- "grad_norm": 12.605962753295898,
1233
  "learning_rate": 0.0002,
1234
- "loss": 1.5301,
1235
  "step": 174
1236
  },
1237
  {
1238
- "epoch": 0.000703735427649966,
1239
- "grad_norm": 8.837152481079102,
1240
  "learning_rate": 0.0002,
1241
- "loss": 1.4052,
1242
  "step": 175
1243
  },
1244
  {
1245
- "epoch": 0.000707756772950823,
1246
- "grad_norm": 15.721978187561035,
1247
  "learning_rate": 0.0002,
1248
- "loss": 2.087,
1249
  "step": 176
1250
  },
1251
  {
1252
- "epoch": 0.0007117781182516799,
1253
- "grad_norm": 14.101908683776855,
1254
  "learning_rate": 0.0002,
1255
- "loss": 0.9505,
1256
  "step": 177
1257
  },
1258
  {
1259
- "epoch": 0.0007157994635525369,
1260
- "grad_norm": 11.634686470031738,
1261
  "learning_rate": 0.0002,
1262
- "loss": 1.6044,
1263
  "step": 178
1264
  },
1265
  {
1266
- "epoch": 0.0007198208088533938,
1267
- "grad_norm": 20.826696395874023,
1268
  "learning_rate": 0.0002,
1269
- "loss": 2.5433,
1270
  "step": 179
1271
  },
1272
  {
1273
- "epoch": 0.0007238421541542507,
1274
- "grad_norm": 17.852861404418945,
1275
  "learning_rate": 0.0002,
1276
- "loss": 1.6406,
1277
  "step": 180
1278
  },
1279
  {
1280
- "epoch": 0.0007278634994551077,
1281
- "grad_norm": 11.486310958862305,
1282
  "learning_rate": 0.0002,
1283
- "loss": 1.1016,
1284
  "step": 181
1285
  },
1286
  {
1287
- "epoch": 0.0007318848447559646,
1288
- "grad_norm": 13.983698844909668,
1289
  "learning_rate": 0.0002,
1290
- "loss": 1.5827,
1291
  "step": 182
1292
  },
1293
  {
1294
- "epoch": 0.0007359061900568216,
1295
- "grad_norm": 16.355886459350586,
1296
  "learning_rate": 0.0002,
1297
- "loss": 1.9982,
1298
  "step": 183
1299
  },
1300
  {
1301
- "epoch": 0.0007399275353576786,
1302
- "grad_norm": 10.875386238098145,
1303
  "learning_rate": 0.0002,
1304
- "loss": 0.8674,
1305
  "step": 184
1306
  },
1307
  {
1308
- "epoch": 0.0007439488806585355,
1309
- "grad_norm": 14.478775024414062,
1310
  "learning_rate": 0.0002,
1311
- "loss": 1.8666,
1312
  "step": 185
1313
  },
1314
  {
1315
- "epoch": 0.0007479702259593924,
1316
- "grad_norm": 20.979000091552734,
1317
  "learning_rate": 0.0002,
1318
- "loss": 1.8665,
1319
  "step": 186
1320
  },
1321
  {
1322
- "epoch": 0.0007519915712602494,
1323
- "grad_norm": 11.725519180297852,
1324
  "learning_rate": 0.0002,
1325
- "loss": 1.291,
1326
  "step": 187
1327
  },
1328
  {
1329
- "epoch": 0.0007560129165611063,
1330
- "grad_norm": 12.288352012634277,
1331
  "learning_rate": 0.0002,
1332
- "loss": 1.1208,
1333
  "step": 188
1334
  },
1335
  {
1336
- "epoch": 0.0007600342618619634,
1337
- "grad_norm": 23.79787826538086,
1338
  "learning_rate": 0.0002,
1339
- "loss": 1.9853,
1340
  "step": 189
1341
  },
1342
  {
1343
- "epoch": 0.0007640556071628203,
1344
- "grad_norm": 28.1628475189209,
1345
  "learning_rate": 0.0002,
1346
- "loss": 1.5199,
1347
  "step": 190
1348
  },
1349
  {
1350
- "epoch": 0.0007680769524636772,
1351
- "grad_norm": 21.436002731323242,
1352
  "learning_rate": 0.0002,
1353
- "loss": 1.9569,
1354
  "step": 191
1355
  },
1356
  {
1357
- "epoch": 0.0007720982977645342,
1358
- "grad_norm": 12.12016773223877,
1359
  "learning_rate": 0.0002,
1360
- "loss": 1.3846,
1361
  "step": 192
1362
  },
1363
  {
1364
- "epoch": 0.0007761196430653911,
1365
- "grad_norm": 24.61306381225586,
1366
  "learning_rate": 0.0002,
1367
- "loss": 2.4186,
1368
  "step": 193
1369
  },
1370
  {
1371
- "epoch": 0.000780140988366248,
1372
- "grad_norm": 11.681290626525879,
1373
  "learning_rate": 0.0002,
1374
- "loss": 1.7408,
1375
  "step": 194
1376
  },
1377
  {
1378
- "epoch": 0.000784162333667105,
1379
- "grad_norm": 21.627490997314453,
1380
  "learning_rate": 0.0002,
1381
- "loss": 1.3919,
1382
  "step": 195
1383
  },
1384
  {
1385
- "epoch": 0.000788183678967962,
1386
- "grad_norm": 11.827309608459473,
1387
  "learning_rate": 0.0002,
1388
- "loss": 1.8474,
1389
  "step": 196
1390
  },
1391
  {
1392
- "epoch": 0.0007922050242688189,
1393
- "grad_norm": 13.740830421447754,
1394
  "learning_rate": 0.0002,
1395
- "loss": 1.2792,
1396
  "step": 197
1397
  },
1398
  {
1399
- "epoch": 0.0007962263695696759,
1400
- "grad_norm": 10.612765312194824,
1401
  "learning_rate": 0.0002,
1402
- "loss": 1.3391,
1403
  "step": 198
1404
  },
1405
  {
1406
- "epoch": 0.0008002477148705328,
1407
- "grad_norm": 23.91811180114746,
1408
  "learning_rate": 0.0002,
1409
- "loss": 1.5853,
1410
  "step": 199
1411
  },
1412
  {
1413
- "epoch": 0.0008042690601713897,
1414
- "grad_norm": 13.433571815490723,
1415
  "learning_rate": 0.0002,
1416
- "loss": 1.0399,
1417
  "step": 200
1418
  },
1419
  {
1420
- "epoch": 0.0008042690601713897,
1421
- "eval_loss": 1.4612168073654175,
1422
- "eval_runtime": 32.2018,
1423
- "eval_samples_per_second": 7.732,
1424
- "eval_steps_per_second": 7.732,
1425
  "step": 200
1426
  }
1427
  ],
1428
  "logging_steps": 1,
1429
- "max_steps": 746019,
1430
  "num_input_tokens_seen": 0,
1431
  "num_train_epochs": 3,
1432
  "save_steps": 200,
@@ -1451,7 +1451,7 @@
1451
  "attributes": {}
1452
  }
1453
  },
1454
- "total_flos": 1476424284241920.0,
1455
  "train_batch_size": 1,
1456
  "trial_name": null,
1457
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7864285707473755,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-200",
4
+ "epoch": 0.003217076240685559,
5
  "eval_steps": 200,
6
  "global_step": 200,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 1.6085381203427795e-05,
13
+ "grad_norm": 0.9673656821250916,
14
  "learning_rate": 2e-05,
15
+ "loss": 0.5297,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 1.6085381203427795e-05,
20
+ "eval_loss": 0.46725764870643616,
21
+ "eval_runtime": 25.4799,
22
+ "eval_samples_per_second": 9.772,
23
+ "eval_steps_per_second": 9.772,
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 3.217076240685559e-05,
28
+ "grad_norm": 0.9954096078872681,
29
  "learning_rate": 4e-05,
30
+ "loss": 0.6221,
31
  "step": 2
32
  },
33
  {
34
+ "epoch": 4.825614361028339e-05,
35
+ "grad_norm": 0.8733547329902649,
36
  "learning_rate": 6e-05,
37
+ "loss": 0.4143,
38
  "step": 3
39
  },
40
  {
41
+ "epoch": 6.434152481371118e-05,
42
+ "grad_norm": 0.8365621566772461,
43
  "learning_rate": 8e-05,
44
+ "loss": 0.2579,
45
  "step": 4
46
  },
47
  {
48
+ "epoch": 8.042690601713898e-05,
49
+ "grad_norm": 1.2265547513961792,
50
  "learning_rate": 0.0001,
51
+ "loss": 0.4713,
52
  "step": 5
53
  },
54
  {
55
+ "epoch": 9.651228722056678e-05,
56
+ "grad_norm": 1.1219959259033203,
57
  "learning_rate": 0.00012,
58
+ "loss": 0.4131,
59
  "step": 6
60
  },
61
  {
62
+ "epoch": 0.00011259766842399456,
63
+ "grad_norm": 1.630370855331421,
64
  "learning_rate": 0.00014,
65
+ "loss": 0.5203,
66
  "step": 7
67
  },
68
  {
69
+ "epoch": 0.00012868304962742236,
70
+ "grad_norm": 1.959912896156311,
71
  "learning_rate": 0.00016,
72
+ "loss": 0.6619,
73
  "step": 8
74
  },
75
  {
76
+ "epoch": 0.00014476843083085014,
77
+ "grad_norm": 2.2232961654663086,
78
  "learning_rate": 0.00018,
79
+ "loss": 0.5758,
80
  "step": 9
81
  },
82
  {
83
+ "epoch": 0.00016085381203427795,
84
+ "grad_norm": 2.4021875858306885,
85
  "learning_rate": 0.0002,
86
+ "loss": 0.5578,
87
  "step": 10
88
  },
89
  {
90
+ "epoch": 0.00017693919323770574,
91
+ "grad_norm": 2.4358997344970703,
92
  "learning_rate": 0.0002,
93
+ "loss": 0.5025,
94
  "step": 11
95
  },
96
  {
97
+ "epoch": 0.00019302457444113355,
98
+ "grad_norm": 2.9442031383514404,
99
  "learning_rate": 0.0002,
100
+ "loss": 0.6399,
101
  "step": 12
102
  },
103
  {
104
+ "epoch": 0.00020910995564456133,
105
+ "grad_norm": 3.2934744358062744,
106
  "learning_rate": 0.0002,
107
+ "loss": 0.7238,
108
  "step": 13
109
  },
110
  {
111
+ "epoch": 0.00022519533684798912,
112
+ "grad_norm": 2.135126829147339,
113
  "learning_rate": 0.0002,
114
+ "loss": 0.6701,
115
  "step": 14
116
  },
117
  {
118
+ "epoch": 0.00024128071805141693,
119
+ "grad_norm": 3.4425387382507324,
120
  "learning_rate": 0.0002,
121
+ "loss": 0.7285,
122
  "step": 15
123
  },
124
  {
125
+ "epoch": 0.0002573660992548447,
126
+ "grad_norm": 4.053037166595459,
127
  "learning_rate": 0.0002,
128
+ "loss": 0.6067,
129
  "step": 16
130
  },
131
  {
132
+ "epoch": 0.0002734514804582725,
133
+ "grad_norm": 1.9749451875686646,
134
  "learning_rate": 0.0002,
135
+ "loss": 0.6545,
136
  "step": 17
137
  },
138
  {
139
+ "epoch": 0.0002895368616617003,
140
+ "grad_norm": 2.6539998054504395,
141
  "learning_rate": 0.0002,
142
+ "loss": 0.5582,
143
  "step": 18
144
  },
145
  {
146
+ "epoch": 0.0003056222428651281,
147
+ "grad_norm": 4.1893205642700195,
148
  "learning_rate": 0.0002,
149
+ "loss": 0.6518,
150
  "step": 19
151
  },
152
  {
153
+ "epoch": 0.0003217076240685559,
154
+ "grad_norm": 2.7660045623779297,
155
  "learning_rate": 0.0002,
156
+ "loss": 0.619,
157
  "step": 20
158
  },
159
  {
160
+ "epoch": 0.00033779300527198367,
161
+ "grad_norm": 3.1297731399536133,
162
  "learning_rate": 0.0002,
163
+ "loss": 0.6169,
164
  "step": 21
165
  },
166
  {
167
+ "epoch": 0.0003538783864754115,
168
+ "grad_norm": 2.4766297340393066,
169
  "learning_rate": 0.0002,
170
+ "loss": 0.671,
171
  "step": 22
172
  },
173
  {
174
+ "epoch": 0.0003699637676788393,
175
+ "grad_norm": 1.840955376625061,
176
  "learning_rate": 0.0002,
177
+ "loss": 0.5704,
178
  "step": 23
179
  },
180
  {
181
+ "epoch": 0.0003860491488822671,
182
+ "grad_norm": 2.017615556716919,
183
  "learning_rate": 0.0002,
184
+ "loss": 0.4936,
185
  "step": 24
186
  },
187
  {
188
+ "epoch": 0.00040213453008569486,
189
+ "grad_norm": 2.527812957763672,
190
  "learning_rate": 0.0002,
191
+ "loss": 0.436,
192
  "step": 25
193
  },
194
  {
195
+ "epoch": 0.00041821991128912267,
196
+ "grad_norm": 2.738335132598877,
197
  "learning_rate": 0.0002,
198
+ "loss": 0.6511,
199
  "step": 26
200
  },
201
  {
202
+ "epoch": 0.0004343052924925505,
203
+ "grad_norm": 2.6857173442840576,
204
  "learning_rate": 0.0002,
205
+ "loss": 0.8459,
206
  "step": 27
207
  },
208
  {
209
+ "epoch": 0.00045039067369597824,
210
+ "grad_norm": 3.223954200744629,
211
  "learning_rate": 0.0002,
212
+ "loss": 0.5558,
213
  "step": 28
214
  },
215
  {
216
+ "epoch": 0.00046647605489940605,
217
+ "grad_norm": 2.828322649002075,
218
  "learning_rate": 0.0002,
219
+ "loss": 0.7201,
220
  "step": 29
221
  },
222
  {
223
+ "epoch": 0.00048256143610283386,
224
+ "grad_norm": 3.2195804119110107,
225
  "learning_rate": 0.0002,
226
+ "loss": 0.5933,
227
  "step": 30
228
  },
229
  {
230
+ "epoch": 0.0004986468173062617,
231
+ "grad_norm": 2.4919071197509766,
232
  "learning_rate": 0.0002,
233
+ "loss": 0.5764,
234
  "step": 31
235
  },
236
  {
237
+ "epoch": 0.0005147321985096894,
238
+ "grad_norm": 4.92438268661499,
239
  "learning_rate": 0.0002,
240
+ "loss": 0.9201,
241
  "step": 32
242
  },
243
  {
244
+ "epoch": 0.0005308175797131172,
245
+ "grad_norm": 2.232290267944336,
246
  "learning_rate": 0.0002,
247
+ "loss": 0.5863,
248
  "step": 33
249
  },
250
  {
251
+ "epoch": 0.000546902960916545,
252
+ "grad_norm": 3.7385706901550293,
253
  "learning_rate": 0.0002,
254
+ "loss": 0.9086,
255
  "step": 34
256
  },
257
  {
258
+ "epoch": 0.0005629883421199728,
259
+ "grad_norm": 3.262006998062134,
260
  "learning_rate": 0.0002,
261
+ "loss": 0.6261,
262
  "step": 35
263
  },
264
  {
265
+ "epoch": 0.0005790737233234006,
266
+ "grad_norm": 2.7973763942718506,
267
  "learning_rate": 0.0002,
268
+ "loss": 0.6955,
269
  "step": 36
270
  },
271
  {
272
+ "epoch": 0.0005951591045268284,
273
+ "grad_norm": 3.127302885055542,
274
  "learning_rate": 0.0002,
275
+ "loss": 0.7446,
276
  "step": 37
277
  },
278
  {
279
+ "epoch": 0.0006112444857302562,
280
+ "grad_norm": 2.1533172130584717,
281
  "learning_rate": 0.0002,
282
+ "loss": 0.5484,
283
  "step": 38
284
  },
285
  {
286
+ "epoch": 0.000627329866933684,
287
+ "grad_norm": 4.116796016693115,
288
  "learning_rate": 0.0002,
289
+ "loss": 0.7521,
290
  "step": 39
291
  },
292
  {
293
+ "epoch": 0.0006434152481371118,
294
+ "grad_norm": 4.400921821594238,
295
  "learning_rate": 0.0002,
296
+ "loss": 0.9317,
297
  "step": 40
298
  },
299
  {
300
+ "epoch": 0.0006595006293405396,
301
+ "grad_norm": 2.6137619018554688,
302
  "learning_rate": 0.0002,
303
+ "loss": 0.7086,
304
  "step": 41
305
  },
306
  {
307
+ "epoch": 0.0006755860105439673,
308
+ "grad_norm": 2.341974973678589,
309
  "learning_rate": 0.0002,
310
+ "loss": 0.5551,
311
  "step": 42
312
  },
313
  {
314
+ "epoch": 0.0006916713917473952,
315
+ "grad_norm": 2.7685954570770264,
316
  "learning_rate": 0.0002,
317
+ "loss": 0.7665,
318
  "step": 43
319
  },
320
  {
321
+ "epoch": 0.000707756772950823,
322
+ "grad_norm": 3.1898794174194336,
323
  "learning_rate": 0.0002,
324
+ "loss": 0.8037,
325
  "step": 44
326
  },
327
  {
328
+ "epoch": 0.0007238421541542507,
329
+ "grad_norm": 3.215623617172241,
330
  "learning_rate": 0.0002,
331
+ "loss": 0.9811,
332
  "step": 45
333
  },
334
  {
335
+ "epoch": 0.0007399275353576786,
336
+ "grad_norm": 3.3365135192871094,
337
  "learning_rate": 0.0002,
338
+ "loss": 0.7127,
339
  "step": 46
340
  },
341
  {
342
+ "epoch": 0.0007560129165611063,
343
+ "grad_norm": 4.518591403961182,
344
  "learning_rate": 0.0002,
345
+ "loss": 0.797,
346
  "step": 47
347
  },
348
  {
349
+ "epoch": 0.0007720982977645342,
350
+ "grad_norm": 2.179842948913574,
351
  "learning_rate": 0.0002,
352
+ "loss": 0.7091,
353
  "step": 48
354
  },
355
  {
356
+ "epoch": 0.000788183678967962,
357
+ "grad_norm": 2.5702974796295166,
358
  "learning_rate": 0.0002,
359
+ "loss": 0.8829,
360
  "step": 49
361
  },
362
  {
363
+ "epoch": 0.0008042690601713897,
364
+ "grad_norm": 2.2742362022399902,
365
  "learning_rate": 0.0002,
366
+ "loss": 0.5818,
367
  "step": 50
368
  },
369
  {
370
+ "epoch": 0.0008203544413748176,
371
+ "grad_norm": 3.2687766551971436,
372
  "learning_rate": 0.0002,
373
+ "loss": 0.7228,
374
  "step": 51
375
  },
376
  {
377
+ "epoch": 0.0008364398225782453,
378
+ "grad_norm": 3.5674126148223877,
379
  "learning_rate": 0.0002,
380
+ "loss": 0.8874,
381
  "step": 52
382
  },
383
  {
384
+ "epoch": 0.0008525252037816731,
385
+ "grad_norm": 2.703923225402832,
386
  "learning_rate": 0.0002,
387
+ "loss": 0.6596,
388
  "step": 53
389
  },
390
  {
391
+ "epoch": 0.000868610584985101,
392
+ "grad_norm": 2.3442795276641846,
393
  "learning_rate": 0.0002,
394
+ "loss": 0.8213,
395
  "step": 54
396
  },
397
  {
398
+ "epoch": 0.0008846959661885287,
399
+ "grad_norm": 3.142275094985962,
400
  "learning_rate": 0.0002,
401
+ "loss": 0.8181,
402
  "step": 55
403
  },
404
  {
405
+ "epoch": 0.0009007813473919565,
406
+ "grad_norm": 4.0531487464904785,
407
  "learning_rate": 0.0002,
408
+ "loss": 0.5939,
409
  "step": 56
410
  },
411
  {
412
+ "epoch": 0.0009168667285953843,
413
+ "grad_norm": 4.309750556945801,
414
  "learning_rate": 0.0002,
415
+ "loss": 0.867,
416
  "step": 57
417
  },
418
  {
419
+ "epoch": 0.0009329521097988121,
420
+ "grad_norm": 3.4528746604919434,
421
  "learning_rate": 0.0002,
422
+ "loss": 0.5944,
423
  "step": 58
424
  },
425
  {
426
+ "epoch": 0.0009490374910022399,
427
+ "grad_norm": 3.531193494796753,
428
  "learning_rate": 0.0002,
429
+ "loss": 0.7985,
430
  "step": 59
431
  },
432
  {
433
+ "epoch": 0.0009651228722056677,
434
+ "grad_norm": 3.000215768814087,
435
  "learning_rate": 0.0002,
436
+ "loss": 0.7939,
437
  "step": 60
438
  },
439
  {
440
+ "epoch": 0.0009812082534090955,
441
+ "grad_norm": 4.317079067230225,
442
  "learning_rate": 0.0002,
443
+ "loss": 0.6823,
444
  "step": 61
445
  },
446
  {
447
+ "epoch": 0.0009972936346125233,
448
+ "grad_norm": 3.4617133140563965,
449
  "learning_rate": 0.0002,
450
+ "loss": 0.7672,
451
  "step": 62
452
  },
453
  {
454
+ "epoch": 0.001013379015815951,
455
+ "grad_norm": 3.625797986984253,
456
  "learning_rate": 0.0002,
457
+ "loss": 0.7985,
458
  "step": 63
459
  },
460
  {
461
+ "epoch": 0.0010294643970193789,
462
+ "grad_norm": 4.261772632598877,
463
  "learning_rate": 0.0002,
464
+ "loss": 0.8154,
465
  "step": 64
466
  },
467
  {
468
+ "epoch": 0.0010455497782228067,
469
+ "grad_norm": 3.3078057765960693,
470
  "learning_rate": 0.0002,
471
+ "loss": 0.7663,
472
  "step": 65
473
  },
474
  {
475
+ "epoch": 0.0010616351594262344,
476
+ "grad_norm": 2.1908516883850098,
477
  "learning_rate": 0.0002,
478
+ "loss": 0.6996,
479
  "step": 66
480
  },
481
  {
482
+ "epoch": 0.0010777205406296622,
483
+ "grad_norm": 2.491776943206787,
484
  "learning_rate": 0.0002,
485
+ "loss": 0.659,
486
  "step": 67
487
  },
488
  {
489
+ "epoch": 0.00109380592183309,
490
+ "grad_norm": 2.7965214252471924,
491
  "learning_rate": 0.0002,
492
+ "loss": 0.6798,
493
  "step": 68
494
  },
495
  {
496
+ "epoch": 0.0011098913030365178,
497
+ "grad_norm": 3.3033552169799805,
498
  "learning_rate": 0.0002,
499
+ "loss": 0.9425,
500
  "step": 69
501
  },
502
  {
503
+ "epoch": 0.0011259766842399456,
504
+ "grad_norm": 2.6152732372283936,
505
  "learning_rate": 0.0002,
506
+ "loss": 0.9675,
507
  "step": 70
508
  },
509
  {
510
+ "epoch": 0.0011420620654433735,
511
+ "grad_norm": 2.942465305328369,
512
  "learning_rate": 0.0002,
513
+ "loss": 0.8886,
514
  "step": 71
515
  },
516
  {
517
+ "epoch": 0.0011581474466468011,
518
+ "grad_norm": 3.2040352821350098,
519
  "learning_rate": 0.0002,
520
+ "loss": 0.7208,
521
  "step": 72
522
  },
523
  {
524
+ "epoch": 0.001174232827850229,
525
+ "grad_norm": 5.6633501052856445,
526
  "learning_rate": 0.0002,
527
+ "loss": 0.9701,
528
  "step": 73
529
  },
530
  {
531
+ "epoch": 0.0011903182090536569,
532
+ "grad_norm": 2.924656867980957,
533
  "learning_rate": 0.0002,
534
+ "loss": 0.6366,
535
  "step": 74
536
  },
537
  {
538
+ "epoch": 0.0012064035902570845,
539
+ "grad_norm": 3.251835584640503,
540
  "learning_rate": 0.0002,
541
+ "loss": 0.8638,
542
  "step": 75
543
  },
544
  {
545
+ "epoch": 0.0012224889714605124,
546
+ "grad_norm": 3.145000696182251,
547
  "learning_rate": 0.0002,
548
+ "loss": 0.6692,
549
  "step": 76
550
  },
551
  {
552
+ "epoch": 0.0012385743526639402,
553
+ "grad_norm": 2.7392325401306152,
554
  "learning_rate": 0.0002,
555
+ "loss": 0.7459,
556
  "step": 77
557
  },
558
  {
559
+ "epoch": 0.001254659733867368,
560
+ "grad_norm": 2.8011040687561035,
561
  "learning_rate": 0.0002,
562
+ "loss": 0.7722,
563
  "step": 78
564
  },
565
  {
566
+ "epoch": 0.0012707451150707958,
567
+ "grad_norm": 3.5295469760894775,
568
  "learning_rate": 0.0002,
569
+ "loss": 0.7733,
570
  "step": 79
571
  },
572
  {
573
+ "epoch": 0.0012868304962742236,
574
+ "grad_norm": 2.9453213214874268,
575
  "learning_rate": 0.0002,
576
+ "loss": 0.6945,
577
  "step": 80
578
  },
579
  {
580
+ "epoch": 0.0013029158774776513,
581
+ "grad_norm": 3.2154369354248047,
582
  "learning_rate": 0.0002,
583
+ "loss": 0.8776,
584
  "step": 81
585
  },
586
  {
587
+ "epoch": 0.0013190012586810791,
588
+ "grad_norm": 3.536776065826416,
589
  "learning_rate": 0.0002,
590
+ "loss": 0.8774,
591
  "step": 82
592
  },
593
  {
594
+ "epoch": 0.001335086639884507,
595
+ "grad_norm": 2.8547418117523193,
596
  "learning_rate": 0.0002,
597
+ "loss": 0.7109,
598
  "step": 83
599
  },
600
  {
601
+ "epoch": 0.0013511720210879347,
602
+ "grad_norm": 3.4063565731048584,
603
  "learning_rate": 0.0002,
604
+ "loss": 0.8466,
605
  "step": 84
606
  },
607
  {
608
+ "epoch": 0.0013672574022913625,
609
+ "grad_norm": 5.920643329620361,
610
  "learning_rate": 0.0002,
611
+ "loss": 0.8423,
612
  "step": 85
613
  },
614
  {
615
+ "epoch": 0.0013833427834947904,
616
+ "grad_norm": 4.299768924713135,
617
  "learning_rate": 0.0002,
618
+ "loss": 1.0802,
619
  "step": 86
620
  },
621
  {
622
+ "epoch": 0.001399428164698218,
623
+ "grad_norm": 3.5304558277130127,
624
  "learning_rate": 0.0002,
625
+ "loss": 0.8542,
626
  "step": 87
627
  },
628
  {
629
+ "epoch": 0.001415513545901646,
630
+ "grad_norm": 3.0248117446899414,
631
  "learning_rate": 0.0002,
632
+ "loss": 0.6346,
633
  "step": 88
634
  },
635
  {
636
+ "epoch": 0.0014315989271050738,
637
+ "grad_norm": 3.5863444805145264,
638
  "learning_rate": 0.0002,
639
+ "loss": 0.9679,
640
  "step": 89
641
  },
642
  {
643
+ "epoch": 0.0014476843083085014,
644
+ "grad_norm": 3.6556644439697266,
645
  "learning_rate": 0.0002,
646
+ "loss": 0.7355,
647
  "step": 90
648
  },
649
  {
650
+ "epoch": 0.0014637696895119293,
651
+ "grad_norm": 3.691444158554077,
652
  "learning_rate": 0.0002,
653
+ "loss": 0.8556,
654
  "step": 91
655
  },
656
  {
657
+ "epoch": 0.0014798550707153572,
658
+ "grad_norm": 3.8535704612731934,
659
  "learning_rate": 0.0002,
660
+ "loss": 1.0531,
661
  "step": 92
662
  },
663
  {
664
+ "epoch": 0.0014959404519187848,
665
+ "grad_norm": 3.402984619140625,
666
  "learning_rate": 0.0002,
667
+ "loss": 0.7127,
668
  "step": 93
669
  },
670
  {
671
+ "epoch": 0.0015120258331222127,
672
+ "grad_norm": 2.967519760131836,
673
  "learning_rate": 0.0002,
674
+ "loss": 0.7416,
675
  "step": 94
676
  },
677
  {
678
+ "epoch": 0.0015281112143256405,
679
+ "grad_norm": 4.5817718505859375,
680
  "learning_rate": 0.0002,
681
+ "loss": 0.6667,
682
  "step": 95
683
  },
684
  {
685
+ "epoch": 0.0015441965955290684,
686
+ "grad_norm": 4.2193379402160645,
687
  "learning_rate": 0.0002,
688
+ "loss": 0.6914,
689
  "step": 96
690
  },
691
  {
692
+ "epoch": 0.001560281976732496,
693
+ "grad_norm": 4.412436485290527,
694
  "learning_rate": 0.0002,
695
+ "loss": 0.6476,
696
  "step": 97
697
  },
698
  {
699
+ "epoch": 0.001576367357935924,
700
+ "grad_norm": 3.960810661315918,
701
  "learning_rate": 0.0002,
702
+ "loss": 0.6829,
703
  "step": 98
704
  },
705
  {
706
+ "epoch": 0.0015924527391393518,
707
+ "grad_norm": 4.494846343994141,
708
  "learning_rate": 0.0002,
709
+ "loss": 0.899,
710
  "step": 99
711
  },
712
  {
713
+ "epoch": 0.0016085381203427794,
714
+ "grad_norm": 5.150880813598633,
715
  "learning_rate": 0.0002,
716
+ "loss": 0.8743,
717
  "step": 100
718
  },
719
  {
720
+ "epoch": 0.0016246235015462073,
721
+ "grad_norm": 3.156965970993042,
722
  "learning_rate": 0.0002,
723
+ "loss": 0.754,
724
  "step": 101
725
  },
726
  {
727
+ "epoch": 0.0016407088827496352,
728
+ "grad_norm": 3.00789213180542,
729
  "learning_rate": 0.0002,
730
+ "loss": 0.8606,
731
  "step": 102
732
  },
733
  {
734
+ "epoch": 0.0016567942639530628,
735
+ "grad_norm": 3.9045052528381348,
736
  "learning_rate": 0.0002,
737
+ "loss": 0.833,
738
  "step": 103
739
  },
740
  {
741
+ "epoch": 0.0016728796451564907,
742
+ "grad_norm": 3.0179498195648193,
743
  "learning_rate": 0.0002,
744
+ "loss": 0.6971,
745
  "step": 104
746
  },
747
  {
748
+ "epoch": 0.0016889650263599185,
749
+ "grad_norm": 3.441555976867676,
750
  "learning_rate": 0.0002,
751
+ "loss": 0.9697,
752
  "step": 105
753
  },
754
  {
755
+ "epoch": 0.0017050504075633462,
756
+ "grad_norm": 3.4271888732910156,
757
  "learning_rate": 0.0002,
758
+ "loss": 0.8264,
759
  "step": 106
760
  },
761
  {
762
+ "epoch": 0.001721135788766774,
763
+ "grad_norm": 3.3394598960876465,
764
  "learning_rate": 0.0002,
765
+ "loss": 0.7529,
766
  "step": 107
767
  },
768
  {
769
+ "epoch": 0.001737221169970202,
770
+ "grad_norm": 4.098421573638916,
771
  "learning_rate": 0.0002,
772
+ "loss": 0.7967,
773
  "step": 108
774
  },
775
  {
776
+ "epoch": 0.0017533065511736296,
777
+ "grad_norm": 5.323544979095459,
778
  "learning_rate": 0.0002,
779
+ "loss": 0.9429,
780
  "step": 109
781
  },
782
  {
783
+ "epoch": 0.0017693919323770574,
784
+ "grad_norm": 3.8546035289764404,
785
  "learning_rate": 0.0002,
786
+ "loss": 0.8392,
787
  "step": 110
788
  },
789
  {
790
+ "epoch": 0.0017854773135804853,
791
+ "grad_norm": 3.514596939086914,
792
  "learning_rate": 0.0002,
793
+ "loss": 0.904,
794
  "step": 111
795
  },
796
  {
797
+ "epoch": 0.001801562694783913,
798
+ "grad_norm": 4.436436653137207,
799
  "learning_rate": 0.0002,
800
+ "loss": 0.8841,
801
  "step": 112
802
  },
803
  {
804
+ "epoch": 0.0018176480759873408,
805
+ "grad_norm": 3.042628049850464,
806
  "learning_rate": 0.0002,
807
+ "loss": 0.6856,
808
  "step": 113
809
  },
810
  {
811
+ "epoch": 0.0018337334571907687,
812
+ "grad_norm": 3.558793306350708,
813
  "learning_rate": 0.0002,
814
+ "loss": 0.9463,
815
  "step": 114
816
  },
817
  {
818
+ "epoch": 0.0018498188383941963,
819
+ "grad_norm": 3.0797207355499268,
820
  "learning_rate": 0.0002,
821
+ "loss": 0.7813,
822
  "step": 115
823
  },
824
  {
825
+ "epoch": 0.0018659042195976242,
826
+ "grad_norm": 3.2403101921081543,
827
  "learning_rate": 0.0002,
828
+ "loss": 0.9499,
829
  "step": 116
830
  },
831
  {
832
+ "epoch": 0.001881989600801052,
833
+ "grad_norm": 3.385939121246338,
834
  "learning_rate": 0.0002,
835
+ "loss": 0.6545,
836
  "step": 117
837
  },
838
  {
839
+ "epoch": 0.0018980749820044797,
840
+ "grad_norm": 3.525153636932373,
841
  "learning_rate": 0.0002,
842
+ "loss": 0.9449,
843
  "step": 118
844
  },
845
  {
846
+ "epoch": 0.0019141603632079076,
847
+ "grad_norm": 2.670220375061035,
848
  "learning_rate": 0.0002,
849
+ "loss": 0.6208,
850
  "step": 119
851
  },
852
  {
853
+ "epoch": 0.0019302457444113354,
854
+ "grad_norm": 3.3499555587768555,
855
  "learning_rate": 0.0002,
856
+ "loss": 0.833,
857
  "step": 120
858
  },
859
  {
860
+ "epoch": 0.001946331125614763,
861
+ "grad_norm": 5.413862705230713,
862
  "learning_rate": 0.0002,
863
+ "loss": 1.2186,
864
  "step": 121
865
  },
866
  {
867
+ "epoch": 0.001962416506818191,
868
+ "grad_norm": 3.637068271636963,
869
  "learning_rate": 0.0002,
870
+ "loss": 0.8746,
871
  "step": 122
872
  },
873
  {
874
+ "epoch": 0.0019785018880216186,
875
+ "grad_norm": 6.209028244018555,
876
  "learning_rate": 0.0002,
877
+ "loss": 1.1379,
878
  "step": 123
879
  },
880
  {
881
+ "epoch": 0.0019945872692250467,
882
+ "grad_norm": 4.2924418449401855,
883
  "learning_rate": 0.0002,
884
+ "loss": 1.0075,
885
  "step": 124
886
  },
887
  {
888
+ "epoch": 0.0020106726504284743,
889
+ "grad_norm": 2.749718427658081,
890
  "learning_rate": 0.0002,
891
+ "loss": 0.694,
892
  "step": 125
893
  },
894
  {
895
+ "epoch": 0.002026758031631902,
896
+ "grad_norm": 4.217276573181152,
897
  "learning_rate": 0.0002,
898
+ "loss": 0.778,
899
  "step": 126
900
  },
901
  {
902
+ "epoch": 0.00204284341283533,
903
+ "grad_norm": 3.031771421432495,
904
  "learning_rate": 0.0002,
905
+ "loss": 0.9696,
906
  "step": 127
907
  },
908
  {
909
+ "epoch": 0.0020589287940387577,
910
+ "grad_norm": 3.4838218688964844,
911
  "learning_rate": 0.0002,
912
+ "loss": 0.6629,
913
  "step": 128
914
  },
915
  {
916
+ "epoch": 0.0020750141752421854,
917
+ "grad_norm": 3.218451738357544,
918
  "learning_rate": 0.0002,
919
+ "loss": 0.6899,
920
  "step": 129
921
  },
922
  {
923
+ "epoch": 0.0020910995564456135,
924
+ "grad_norm": 3.4607691764831543,
925
  "learning_rate": 0.0002,
926
+ "loss": 0.6832,
927
  "step": 130
928
  },
929
  {
930
+ "epoch": 0.002107184937649041,
931
+ "grad_norm": 3.70224666595459,
932
  "learning_rate": 0.0002,
933
+ "loss": 0.7241,
934
  "step": 131
935
  },
936
  {
937
+ "epoch": 0.0021232703188524688,
938
+ "grad_norm": 4.122409820556641,
939
  "learning_rate": 0.0002,
940
+ "loss": 0.8109,
941
  "step": 132
942
  },
943
  {
944
+ "epoch": 0.002139355700055897,
945
+ "grad_norm": 3.3417394161224365,
946
  "learning_rate": 0.0002,
947
+ "loss": 0.6684,
948
  "step": 133
949
  },
950
  {
951
+ "epoch": 0.0021554410812593245,
952
+ "grad_norm": 3.019958972930908,
953
  "learning_rate": 0.0002,
954
+ "loss": 0.7826,
955
  "step": 134
956
  },
957
  {
958
+ "epoch": 0.002171526462462752,
959
+ "grad_norm": 3.201491117477417,
960
  "learning_rate": 0.0002,
961
+ "loss": 0.7875,
962
  "step": 135
963
  },
964
  {
965
+ "epoch": 0.00218761184366618,
966
+ "grad_norm": 5.85605525970459,
967
  "learning_rate": 0.0002,
968
+ "loss": 1.1128,
969
  "step": 136
970
  },
971
  {
972
+ "epoch": 0.002203697224869608,
973
+ "grad_norm": 3.976530075073242,
974
  "learning_rate": 0.0002,
975
+ "loss": 0.8679,
976
  "step": 137
977
  },
978
  {
979
+ "epoch": 0.0022197826060730355,
980
+ "grad_norm": 3.621382713317871,
981
  "learning_rate": 0.0002,
982
+ "loss": 0.7601,
983
  "step": 138
984
  },
985
  {
986
+ "epoch": 0.0022358679872764636,
987
+ "grad_norm": 18.2700252532959,
988
  "learning_rate": 0.0002,
989
+ "loss": 0.9312,
990
  "step": 139
991
  },
992
  {
993
+ "epoch": 0.0022519533684798912,
994
+ "grad_norm": 3.050555467605591,
995
  "learning_rate": 0.0002,
996
+ "loss": 0.9431,
997
  "step": 140
998
  },
999
  {
1000
+ "epoch": 0.002268038749683319,
1001
+ "grad_norm": 4.187278747558594,
1002
  "learning_rate": 0.0002,
1003
+ "loss": 1.16,
1004
  "step": 141
1005
  },
1006
  {
1007
+ "epoch": 0.002284124130886747,
1008
+ "grad_norm": 2.9168365001678467,
1009
  "learning_rate": 0.0002,
1010
+ "loss": 0.7853,
1011
  "step": 142
1012
  },
1013
  {
1014
+ "epoch": 0.0023002095120901746,
1015
+ "grad_norm": 118.312744140625,
1016
  "learning_rate": 0.0002,
1017
+ "loss": 1.1003,
1018
  "step": 143
1019
  },
1020
  {
1021
+ "epoch": 0.0023162948932936023,
1022
+ "grad_norm": 4.7243971824646,
1023
  "learning_rate": 0.0002,
1024
+ "loss": 0.694,
1025
  "step": 144
1026
  },
1027
  {
1028
+ "epoch": 0.0023323802744970304,
1029
+ "grad_norm": 4.773429870605469,
1030
  "learning_rate": 0.0002,
1031
+ "loss": 0.7167,
1032
  "step": 145
1033
  },
1034
  {
1035
+ "epoch": 0.002348465655700458,
1036
+ "grad_norm": 6.2195868492126465,
1037
  "learning_rate": 0.0002,
1038
+ "loss": 0.7979,
1039
  "step": 146
1040
  },
1041
  {
1042
+ "epoch": 0.0023645510369038857,
1043
+ "grad_norm": 12.494455337524414,
1044
  "learning_rate": 0.0002,
1045
+ "loss": 1.2257,
1046
  "step": 147
1047
  },
1048
  {
1049
+ "epoch": 0.0023806364181073137,
1050
+ "grad_norm": 6.841114521026611,
1051
  "learning_rate": 0.0002,
1052
+ "loss": 1.28,
1053
  "step": 148
1054
  },
1055
  {
1056
+ "epoch": 0.0023967217993107414,
1057
+ "grad_norm": 5.901433944702148,
1058
  "learning_rate": 0.0002,
1059
+ "loss": 0.826,
1060
  "step": 149
1061
  },
1062
  {
1063
+ "epoch": 0.002412807180514169,
1064
+ "grad_norm": 7.198768615722656,
1065
  "learning_rate": 0.0002,
1066
+ "loss": 0.7969,
1067
  "step": 150
1068
  },
1069
  {
1070
+ "epoch": 0.002428892561717597,
1071
+ "grad_norm": 9.673176765441895,
1072
  "learning_rate": 0.0002,
1073
+ "loss": 0.8828,
1074
  "step": 151
1075
  },
1076
  {
1077
+ "epoch": 0.0024449779429210248,
1078
+ "grad_norm": 10.305676460266113,
1079
  "learning_rate": 0.0002,
1080
+ "loss": 0.8668,
1081
  "step": 152
1082
  },
1083
  {
1084
+ "epoch": 0.0024610633241244524,
1085
+ "grad_norm": 14.00606632232666,
1086
  "learning_rate": 0.0002,
1087
+ "loss": 0.9462,
1088
  "step": 153
1089
  },
1090
  {
1091
+ "epoch": 0.0024771487053278805,
1092
+ "grad_norm": 6.559825897216797,
1093
  "learning_rate": 0.0002,
1094
+ "loss": 0.7042,
1095
  "step": 154
1096
  },
1097
  {
1098
+ "epoch": 0.002493234086531308,
1099
+ "grad_norm": 3.9966037273406982,
1100
  "learning_rate": 0.0002,
1101
+ "loss": 0.8798,
1102
  "step": 155
1103
  },
1104
  {
1105
+ "epoch": 0.002509319467734736,
1106
+ "grad_norm": 5.800797462463379,
1107
  "learning_rate": 0.0002,
1108
+ "loss": 0.7377,
1109
  "step": 156
1110
  },
1111
  {
1112
+ "epoch": 0.002525404848938164,
1113
+ "grad_norm": 7.694753646850586,
1114
  "learning_rate": 0.0002,
1115
+ "loss": 0.9589,
1116
  "step": 157
1117
  },
1118
  {
1119
+ "epoch": 0.0025414902301415915,
1120
+ "grad_norm": 4.698418617248535,
1121
  "learning_rate": 0.0002,
1122
+ "loss": 0.826,
1123
  "step": 158
1124
  },
1125
  {
1126
+ "epoch": 0.002557575611345019,
1127
+ "grad_norm": 3.7439236640930176,
1128
  "learning_rate": 0.0002,
1129
+ "loss": 0.874,
1130
  "step": 159
1131
  },
1132
  {
1133
+ "epoch": 0.0025736609925484473,
1134
+ "grad_norm": 4.441625118255615,
1135
  "learning_rate": 0.0002,
1136
+ "loss": 0.8844,
1137
  "step": 160
1138
  },
1139
  {
1140
+ "epoch": 0.002589746373751875,
1141
+ "grad_norm": 4.822892665863037,
1142
  "learning_rate": 0.0002,
1143
+ "loss": 0.9741,
1144
  "step": 161
1145
  },
1146
  {
1147
+ "epoch": 0.0026058317549553026,
1148
+ "grad_norm": 5.727447986602783,
1149
  "learning_rate": 0.0002,
1150
+ "loss": 1.228,
1151
  "step": 162
1152
  },
1153
  {
1154
+ "epoch": 0.0026219171361587306,
1155
+ "grad_norm": 4.084842681884766,
1156
  "learning_rate": 0.0002,
1157
+ "loss": 0.8113,
1158
  "step": 163
1159
  },
1160
  {
1161
+ "epoch": 0.0026380025173621583,
1162
+ "grad_norm": 4.884864330291748,
1163
  "learning_rate": 0.0002,
1164
+ "loss": 0.9853,
1165
  "step": 164
1166
  },
1167
  {
1168
+ "epoch": 0.002654087898565586,
1169
+ "grad_norm": 4.315978527069092,
1170
  "learning_rate": 0.0002,
1171
+ "loss": 0.7985,
1172
  "step": 165
1173
  },
1174
  {
1175
+ "epoch": 0.002670173279769014,
1176
+ "grad_norm": 3.958301544189453,
1177
  "learning_rate": 0.0002,
1178
+ "loss": 0.8639,
1179
  "step": 166
1180
  },
1181
  {
1182
+ "epoch": 0.0026862586609724417,
1183
+ "grad_norm": 5.930337905883789,
1184
  "learning_rate": 0.0002,
1185
+ "loss": 0.9575,
1186
  "step": 167
1187
  },
1188
  {
1189
+ "epoch": 0.0027023440421758693,
1190
+ "grad_norm": 3.374218702316284,
1191
  "learning_rate": 0.0002,
1192
+ "loss": 0.5752,
1193
  "step": 168
1194
  },
1195
  {
1196
+ "epoch": 0.0027184294233792974,
1197
+ "grad_norm": 7.738460063934326,
1198
  "learning_rate": 0.0002,
1199
+ "loss": 1.1104,
1200
  "step": 169
1201
  },
1202
  {
1203
+ "epoch": 0.002734514804582725,
1204
+ "grad_norm": 6.493184566497803,
1205
  "learning_rate": 0.0002,
1206
+ "loss": 0.9614,
1207
  "step": 170
1208
  },
1209
  {
1210
+ "epoch": 0.0027506001857861527,
1211
+ "grad_norm": 7.904129981994629,
1212
  "learning_rate": 0.0002,
1213
+ "loss": 1.1735,
1214
  "step": 171
1215
  },
1216
  {
1217
+ "epoch": 0.002766685566989581,
1218
+ "grad_norm": 6.135262489318848,
1219
  "learning_rate": 0.0002,
1220
+ "loss": 1.1976,
1221
  "step": 172
1222
  },
1223
  {
1224
+ "epoch": 0.0027827709481930084,
1225
+ "grad_norm": 6.674580097198486,
1226
  "learning_rate": 0.0002,
1227
+ "loss": 0.7546,
1228
  "step": 173
1229
  },
1230
  {
1231
+ "epoch": 0.002798856329396436,
1232
+ "grad_norm": 3.6253364086151123,
1233
  "learning_rate": 0.0002,
1234
+ "loss": 0.8027,
1235
  "step": 174
1236
  },
1237
  {
1238
+ "epoch": 0.002814941710599864,
1239
+ "grad_norm": 3.2293593883514404,
1240
  "learning_rate": 0.0002,
1241
+ "loss": 0.8404,
1242
  "step": 175
1243
  },
1244
  {
1245
+ "epoch": 0.002831027091803292,
1246
+ "grad_norm": 4.404852867126465,
1247
  "learning_rate": 0.0002,
1248
+ "loss": 0.8233,
1249
  "step": 176
1250
  },
1251
  {
1252
+ "epoch": 0.0028471124730067195,
1253
+ "grad_norm": 9.036417007446289,
1254
  "learning_rate": 0.0002,
1255
+ "loss": 1.2197,
1256
  "step": 177
1257
  },
1258
  {
1259
+ "epoch": 0.0028631978542101475,
1260
+ "grad_norm": 3.6753194332122803,
1261
  "learning_rate": 0.0002,
1262
+ "loss": 0.8155,
1263
  "step": 178
1264
  },
1265
  {
1266
+ "epoch": 0.002879283235413575,
1267
+ "grad_norm": 4.148676872253418,
1268
  "learning_rate": 0.0002,
1269
+ "loss": 1.0028,
1270
  "step": 179
1271
  },
1272
  {
1273
+ "epoch": 0.002895368616617003,
1274
+ "grad_norm": 10.267266273498535,
1275
  "learning_rate": 0.0002,
1276
+ "loss": 0.8078,
1277
  "step": 180
1278
  },
1279
  {
1280
+ "epoch": 0.002911453997820431,
1281
+ "grad_norm": 5.570545673370361,
1282
  "learning_rate": 0.0002,
1283
+ "loss": 0.9974,
1284
  "step": 181
1285
  },
1286
  {
1287
+ "epoch": 0.0029275393790238586,
1288
+ "grad_norm": 6.258678436279297,
1289
  "learning_rate": 0.0002,
1290
+ "loss": 1.1986,
1291
  "step": 182
1292
  },
1293
  {
1294
+ "epoch": 0.0029436247602272862,
1295
+ "grad_norm": 11.766939163208008,
1296
  "learning_rate": 0.0002,
1297
+ "loss": 0.8153,
1298
  "step": 183
1299
  },
1300
  {
1301
+ "epoch": 0.0029597101414307143,
1302
+ "grad_norm": 4.668914318084717,
1303
  "learning_rate": 0.0002,
1304
+ "loss": 0.7482,
1305
  "step": 184
1306
  },
1307
  {
1308
+ "epoch": 0.002975795522634142,
1309
+ "grad_norm": 3.728922128677368,
1310
  "learning_rate": 0.0002,
1311
+ "loss": 0.7389,
1312
  "step": 185
1313
  },
1314
  {
1315
+ "epoch": 0.0029918809038375696,
1316
+ "grad_norm": 3.9253530502319336,
1317
  "learning_rate": 0.0002,
1318
+ "loss": 0.8526,
1319
  "step": 186
1320
  },
1321
  {
1322
+ "epoch": 0.0030079662850409977,
1323
+ "grad_norm": 4.449740409851074,
1324
  "learning_rate": 0.0002,
1325
+ "loss": 0.8117,
1326
  "step": 187
1327
  },
1328
  {
1329
+ "epoch": 0.0030240516662444253,
1330
+ "grad_norm": 3.856152296066284,
1331
  "learning_rate": 0.0002,
1332
+ "loss": 0.6481,
1333
  "step": 188
1334
  },
1335
  {
1336
+ "epoch": 0.0030401370474478534,
1337
+ "grad_norm": 140.99961853027344,
1338
  "learning_rate": 0.0002,
1339
+ "loss": 2.8234,
1340
  "step": 189
1341
  },
1342
  {
1343
+ "epoch": 0.003056222428651281,
1344
+ "grad_norm": 4.190764904022217,
1345
  "learning_rate": 0.0002,
1346
+ "loss": 0.7266,
1347
  "step": 190
1348
  },
1349
  {
1350
+ "epoch": 0.0030723078098547087,
1351
+ "grad_norm": 3.9606616497039795,
1352
  "learning_rate": 0.0002,
1353
+ "loss": 0.8465,
1354
  "step": 191
1355
  },
1356
  {
1357
+ "epoch": 0.003088393191058137,
1358
+ "grad_norm": 4.197356700897217,
1359
  "learning_rate": 0.0002,
1360
+ "loss": 0.7764,
1361
  "step": 192
1362
  },
1363
  {
1364
+ "epoch": 0.0031044785722615644,
1365
+ "grad_norm": 4.308269023895264,
1366
  "learning_rate": 0.0002,
1367
+ "loss": 0.6308,
1368
  "step": 193
1369
  },
1370
  {
1371
+ "epoch": 0.003120563953464992,
1372
+ "grad_norm": 7.85593843460083,
1373
  "learning_rate": 0.0002,
1374
+ "loss": 1.2231,
1375
  "step": 194
1376
  },
1377
  {
1378
+ "epoch": 0.00313664933466842,
1379
+ "grad_norm": 5.271966934204102,
1380
  "learning_rate": 0.0002,
1381
+ "loss": 0.6263,
1382
  "step": 195
1383
  },
1384
  {
1385
+ "epoch": 0.003152734715871848,
1386
+ "grad_norm": 4.99168062210083,
1387
  "learning_rate": 0.0002,
1388
+ "loss": 0.8379,
1389
  "step": 196
1390
  },
1391
  {
1392
+ "epoch": 0.0031688200970752755,
1393
+ "grad_norm": 4.923642635345459,
1394
  "learning_rate": 0.0002,
1395
+ "loss": 0.7982,
1396
  "step": 197
1397
  },
1398
  {
1399
+ "epoch": 0.0031849054782787036,
1400
+ "grad_norm": 8.511445999145508,
1401
  "learning_rate": 0.0002,
1402
+ "loss": 0.8379,
1403
  "step": 198
1404
  },
1405
  {
1406
+ "epoch": 0.003200990859482131,
1407
+ "grad_norm": 6.066445350646973,
1408
  "learning_rate": 0.0002,
1409
+ "loss": 0.7347,
1410
  "step": 199
1411
  },
1412
  {
1413
+ "epoch": 0.003217076240685559,
1414
+ "grad_norm": 6.310784339904785,
1415
  "learning_rate": 0.0002,
1416
+ "loss": 0.9526,
1417
  "step": 200
1418
  },
1419
  {
1420
+ "epoch": 0.003217076240685559,
1421
+ "eval_loss": 0.7864285707473755,
1422
+ "eval_runtime": 25.6512,
1423
+ "eval_samples_per_second": 9.707,
1424
+ "eval_steps_per_second": 9.707,
1425
  "step": 200
1426
  }
1427
  ],
1428
  "logging_steps": 1,
1429
+ "max_steps": 186504,
1430
  "num_input_tokens_seen": 0,
1431
  "num_train_epochs": 3,
1432
  "save_steps": 200,
 
1451
  "attributes": {}
1452
  }
1453
  },
1454
+ "total_flos": 5911953172070400.0,
1455
  "train_batch_size": 1,
1456
  "trial_name": null,
1457
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c0a0728d73b11fa5133595eca73629f5bdb909de86993e399061d0cc95c785cc
3
  size 6776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5bfb8f9ee0d17252ff4577fc9c15127560771b7e188338420238d872618fd3b
3
  size 6776