Kudod commited on
Commit
a373d4a
·
verified ·
1 Parent(s): 0cfad86

Training in progress, step 40000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5615f5c89d013fb49a779606b4469fbfcb42b508a247fb0de81f625f228ffd29
3
  size 641630264
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:195846c2c0d0878f568436856fcb31115980b919fbde547b24fd8f1b6904de93
3
  size 641630264
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58d8d2bf92817390f11f67b44f65ad4a6149f1dc2868ba468dfdda581838aee0
3
  size 1283324282
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:569405fc74cee783b05c60cf3d0725086154f7c45fc14f63f78350b6f91fba50
3
  size 1283324282
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:496c96b90a901bbe63b7d97d586df49c4bac3dd99421ba33e49cfafd2c9c454f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbe7fbcb6a233a005419711cf60f8760e1079db7e2f205f22b817a7876d6841c
3
  size 14244
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be749ce6c2e646403dd9eb54cb2041d931d8fa4ed3faa66f55822cd781662848
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:996e482d7113d1552c854585970963491dff89aace5a33b5bc82a19f32014cec
3
  size 988
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f09268d48053182ee7910b62b4a5ba89f63f97f1266f9d30b5fbad29e0d8898a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:433bb109a71cc77a49d19d0f78be4460568488238c75b6770695e9c96428597d
3
  size 1064
last-checkpoint/tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3326df9ec64ce5af70e1eff5ba7070351a3c08ddec98cac0c7b843f58fad66ec
3
  size 10959617
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d95d07520d0d46f70d42b3d0908ce844501ca2d13d2b479131e76473cc6b3bf
3
  size 10959617
last-checkpoint/trainer_state.json CHANGED
@@ -2,954 +2,642 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.4898688915375446,
6
  "eval_steps": 5000,
7
- "global_step": 60000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.012415574096146206,
14
- "grad_norm": 2.5686280727386475,
15
- "learning_rate": 0.00025,
16
- "loss": 7.7629,
17
  "step": 500
18
  },
19
  {
20
  "epoch": 0.024831148192292412,
21
- "grad_norm": 2.2654430866241455,
22
- "learning_rate": 0.0005,
23
- "loss": 6.0856,
24
  "step": 1000
25
  },
26
  {
27
  "epoch": 0.037246722288438616,
28
- "grad_norm": 2.7833986282348633,
29
- "learning_rate": 0.0004987522459572769,
30
- "loss": 5.6864,
31
  "step": 1500
32
  },
33
  {
34
  "epoch": 0.049662296384584824,
35
- "grad_norm": 2.7468013763427734,
36
- "learning_rate": 0.0004975044919145539,
37
- "loss": 5.3895,
38
  "step": 2000
39
  },
40
  {
41
  "epoch": 0.06207787048073103,
42
- "grad_norm": 2.8749406337738037,
43
- "learning_rate": 0.0004962567378718307,
44
- "loss": 5.186,
45
  "step": 2500
46
  },
47
  {
48
  "epoch": 0.07449344457687723,
49
- "grad_norm": 2.867882490158081,
50
- "learning_rate": 0.0004950089838291076,
51
- "loss": 4.977,
52
  "step": 3000
53
  },
54
  {
55
  "epoch": 0.08690901867302345,
56
- "grad_norm": 3.050034523010254,
57
- "learning_rate": 0.0004937612297863845,
58
- "loss": 4.7687,
59
  "step": 3500
60
  },
61
  {
62
  "epoch": 0.09932459276916965,
63
- "grad_norm": 4.819639682769775,
64
- "learning_rate": 0.0004925134757436613,
65
- "loss": 4.4337,
66
  "step": 4000
67
  },
68
  {
69
  "epoch": 0.11174016686531585,
70
- "grad_norm": 2.481872081756592,
71
- "learning_rate": 0.0004912657217009383,
72
- "loss": 4.0712,
73
  "step": 4500
74
  },
75
  {
76
  "epoch": 0.12415574096146206,
77
- "grad_norm": 2.837972402572632,
78
- "learning_rate": 0.0004900179676582152,
79
- "loss": 3.915,
80
  "step": 5000
81
  },
82
  {
83
  "epoch": 0.12415574096146206,
84
- "eval_loss": 2.701279878616333,
85
- "eval_runtime": 3666.7471,
86
- "eval_samples_per_second": 351.453,
87
- "eval_steps_per_second": 10.983,
88
  "step": 5000
89
  },
90
  {
91
  "epoch": 0.13657131505760825,
92
- "grad_norm": 3.349369525909424,
93
- "learning_rate": 0.0004887702136154921,
94
- "loss": 3.7757,
95
  "step": 5500
96
  },
97
  {
98
  "epoch": 0.14898688915375446,
99
- "grad_norm": 3.129056930541992,
100
- "learning_rate": 0.00048752245957276905,
101
- "loss": 3.7123,
102
  "step": 6000
103
  },
104
  {
105
  "epoch": 0.16140246324990068,
106
- "grad_norm": 3.715869426727295,
107
- "learning_rate": 0.00048627969654621686,
108
- "loss": 3.6079,
109
  "step": 6500
110
  },
111
  {
112
  "epoch": 0.1738180373460469,
113
- "grad_norm": 2.6967413425445557,
114
- "learning_rate": 0.0004850319425034937,
115
- "loss": 3.5496,
116
  "step": 7000
117
  },
118
  {
119
  "epoch": 0.18623361144219308,
120
- "grad_norm": 2.7264204025268555,
121
- "learning_rate": 0.0004837841884607706,
122
- "loss": 3.431,
123
  "step": 7500
124
  },
125
  {
126
  "epoch": 0.1986491855383393,
127
- "grad_norm": 2.5806965827941895,
128
- "learning_rate": 0.00048253643441804753,
129
- "loss": 3.384,
130
  "step": 8000
131
  },
132
  {
133
  "epoch": 0.2110647596344855,
134
- "grad_norm": 3.236356019973755,
135
- "learning_rate": 0.0004812886803753244,
136
- "loss": 3.3063,
137
  "step": 8500
138
  },
139
  {
140
  "epoch": 0.2234803337306317,
141
- "grad_norm": 3.126569986343384,
142
- "learning_rate": 0.00048004342184068677,
143
- "loss": 3.2365,
144
  "step": 9000
145
  },
146
  {
147
  "epoch": 0.2358959078267779,
148
- "grad_norm": 2.903918981552124,
149
- "learning_rate": 0.0004787956677979637,
150
- "loss": 3.1995,
151
  "step": 9500
152
  },
153
  {
154
  "epoch": 0.24831148192292413,
155
- "grad_norm": 2.595036745071411,
156
- "learning_rate": 0.00047754791375524057,
157
- "loss": 3.1525,
158
  "step": 10000
159
  },
160
  {
161
  "epoch": 0.24831148192292413,
162
- "eval_loss": 2.1151881217956543,
163
- "eval_runtime": 3661.5842,
164
- "eval_samples_per_second": 351.949,
165
- "eval_steps_per_second": 10.999,
166
  "step": 10000
167
  },
168
  {
169
  "epoch": 0.2607270560190703,
170
- "grad_norm": 2.7406485080718994,
171
- "learning_rate": 0.0004763001597125175,
172
- "loss": 3.1375,
173
  "step": 10500
174
  },
175
  {
176
  "epoch": 0.2731426301152165,
177
- "grad_norm": 3.198718309402466,
178
- "learning_rate": 0.0004750549011778798,
179
- "loss": 3.0598,
180
  "step": 11000
181
  },
182
  {
183
  "epoch": 0.28555820421136274,
184
- "grad_norm": 2.6438426971435547,
185
- "learning_rate": 0.00047380714713515673,
186
- "loss": 3.0657,
187
  "step": 11500
188
  },
189
  {
190
  "epoch": 0.29797377830750893,
191
- "grad_norm": 3.145714282989502,
192
- "learning_rate": 0.0004725593930924336,
193
- "loss": 3.003,
194
  "step": 12000
195
  },
196
  {
197
  "epoch": 0.31038935240365517,
198
- "grad_norm": 3.4619131088256836,
199
- "learning_rate": 0.00047131413455779597,
200
- "loss": 2.9664,
201
  "step": 12500
202
  },
203
  {
204
  "epoch": 0.32280492649980136,
205
- "grad_norm": 2.8198635578155518,
206
- "learning_rate": 0.0004700663805150729,
207
- "loss": 2.9595,
208
  "step": 13000
209
  },
210
  {
211
  "epoch": 0.33522050059594755,
212
- "grad_norm": 3.9205424785614014,
213
- "learning_rate": 0.0004688186264723498,
214
- "loss": 2.9262,
215
  "step": 13500
216
  },
217
  {
218
  "epoch": 0.3476360746920938,
219
- "grad_norm": 3.130042791366577,
220
- "learning_rate": 0.0004675708724296267,
221
- "loss": 2.8829,
222
  "step": 14000
223
  },
224
  {
225
  "epoch": 0.36005164878824,
226
- "grad_norm": 3.2414395809173584,
227
- "learning_rate": 0.0004663231183869036,
228
- "loss": 2.8598,
229
  "step": 14500
230
  },
231
  {
232
  "epoch": 0.37246722288438616,
233
- "grad_norm": 3.659555196762085,
234
- "learning_rate": 0.00046507536434418044,
235
- "loss": 2.8592,
236
  "step": 15000
237
  },
238
  {
239
  "epoch": 0.37246722288438616,
240
- "eval_loss": 1.8211588859558105,
241
- "eval_runtime": 3647.4211,
242
- "eval_samples_per_second": 353.316,
243
- "eval_steps_per_second": 11.041,
244
  "step": 15000
245
  },
246
  {
247
  "epoch": 0.3848827969805324,
248
- "grad_norm": 3.733859062194824,
249
- "learning_rate": 0.0004638301058095428,
250
- "loss": 2.8209,
251
  "step": 15500
252
  },
253
  {
254
  "epoch": 0.3972983710766786,
255
- "grad_norm": 2.9978246688842773,
256
- "learning_rate": 0.00046258235176681973,
257
- "loss": 2.7854,
258
  "step": 16000
259
  },
260
  {
261
  "epoch": 0.4097139451728248,
262
- "grad_norm": 2.694765567779541,
263
- "learning_rate": 0.00046133459772409666,
264
- "loss": 2.789,
265
  "step": 16500
266
  },
267
  {
268
  "epoch": 0.422129519268971,
269
- "grad_norm": 3.022148370742798,
270
- "learning_rate": 0.00046008684368137353,
271
- "loss": 2.7819,
272
  "step": 17000
273
  },
274
  {
275
  "epoch": 0.4345450933651172,
276
- "grad_norm": 2.756038188934326,
277
- "learning_rate": 0.00045883908963865045,
278
- "loss": 2.7496,
279
  "step": 17500
280
  },
281
  {
282
  "epoch": 0.4469606674612634,
283
- "grad_norm": 3.0430989265441895,
284
- "learning_rate": 0.0004575913355959274,
285
- "loss": 2.7558,
286
  "step": 18000
287
  },
288
  {
289
  "epoch": 0.45937624155740964,
290
- "grad_norm": 2.710583209991455,
291
- "learning_rate": 0.0004563460770612897,
292
- "loss": 2.7214,
293
  "step": 18500
294
  },
295
  {
296
  "epoch": 0.4717918156535558,
297
- "grad_norm": 4.814529895782471,
298
- "learning_rate": 0.00045509832301856656,
299
- "loss": 2.6977,
300
  "step": 19000
301
  },
302
  {
303
  "epoch": 0.484207389749702,
304
- "grad_norm": 2.782024621963501,
305
- "learning_rate": 0.0004538505689758435,
306
- "loss": 2.7066,
307
  "step": 19500
308
  },
309
  {
310
  "epoch": 0.49662296384584825,
311
- "grad_norm": 2.9942479133605957,
312
- "learning_rate": 0.00045260281493312036,
313
- "loss": 2.6744,
314
  "step": 20000
315
  },
316
  {
317
  "epoch": 0.49662296384584825,
318
- "eval_loss": 1.681386947631836,
319
- "eval_runtime": 3650.0754,
320
- "eval_samples_per_second": 353.059,
321
- "eval_steps_per_second": 11.033,
322
  "step": 20000
323
  },
324
  {
325
  "epoch": 0.5090385379419944,
326
- "grad_norm": 2.6107559204101562,
327
- "learning_rate": 0.00045135755639848273,
328
- "loss": 2.6627,
329
  "step": 20500
330
  },
331
  {
332
  "epoch": 0.5214541120381406,
333
- "grad_norm": 3.603623390197754,
334
- "learning_rate": 0.00045010980235575966,
335
- "loss": 2.6374,
336
  "step": 21000
337
  },
338
  {
339
  "epoch": 0.5338696861342869,
340
- "grad_norm": 2.804776668548584,
341
- "learning_rate": 0.0004488620483130366,
342
- "loss": 2.6477,
343
  "step": 21500
344
  },
345
  {
346
  "epoch": 0.546285260230433,
347
- "grad_norm": 3.2368860244750977,
348
- "learning_rate": 0.00044761429427031345,
349
- "loss": 2.6416,
350
  "step": 22000
351
  },
352
  {
353
  "epoch": 0.5587008343265792,
354
- "grad_norm": 2.6095378398895264,
355
- "learning_rate": 0.0004463665402275903,
356
- "loss": 2.6248,
357
  "step": 22500
358
  },
359
  {
360
  "epoch": 0.5711164084227255,
361
- "grad_norm": 2.9860754013061523,
362
- "learning_rate": 0.0004451212816929527,
363
- "loss": 2.6254,
364
  "step": 23000
365
  },
366
  {
367
  "epoch": 0.5835319825188717,
368
- "grad_norm": 3.114459276199341,
369
- "learning_rate": 0.0004438735276502296,
370
- "loss": 2.6257,
371
  "step": 23500
372
  },
373
  {
374
  "epoch": 0.5959475566150179,
375
- "grad_norm": 2.812556028366089,
376
- "learning_rate": 0.0004426257736075065,
377
- "loss": 2.5746,
378
  "step": 24000
379
  },
380
  {
381
  "epoch": 0.6083631307111641,
382
- "grad_norm": 3.2355823516845703,
383
- "learning_rate": 0.0004413780195647834,
384
- "loss": 2.5919,
385
  "step": 24500
386
  },
387
  {
388
  "epoch": 0.6207787048073103,
389
- "grad_norm": 2.5201354026794434,
390
- "learning_rate": 0.0004401302655220603,
391
- "loss": 2.5754,
392
  "step": 25000
393
  },
394
  {
395
  "epoch": 0.6207787048073103,
396
- "eval_loss": 1.5900601148605347,
397
- "eval_runtime": 4174.4112,
398
- "eval_samples_per_second": 308.712,
399
- "eval_steps_per_second": 9.647,
400
  "step": 25000
401
  },
402
  {
403
  "epoch": 0.6331942789034565,
404
- "grad_norm": 2.8540596961975098,
405
- "learning_rate": 0.0004388825114793372,
406
- "loss": 2.5705,
407
  "step": 25500
408
  },
409
  {
410
  "epoch": 0.6456098529996027,
411
- "grad_norm": 2.603358507156372,
412
- "learning_rate": 0.00043763475743661414,
413
- "loss": 2.5342,
414
  "step": 26000
415
  },
416
  {
417
  "epoch": 0.658025427095749,
418
- "grad_norm": 2.7852208614349365,
419
- "learning_rate": 0.00043638700339389096,
420
- "loss": 2.5463,
421
  "step": 26500
422
  },
423
  {
424
  "epoch": 0.6704410011918951,
425
- "grad_norm": 2.7578940391540527,
426
- "learning_rate": 0.0004351392493511679,
427
- "loss": 2.5372,
428
  "step": 27000
429
  },
430
  {
431
  "epoch": 0.6828565752880413,
432
- "grad_norm": 2.941049337387085,
433
- "learning_rate": 0.00043389399081653025,
434
- "loss": 2.5207,
435
  "step": 27500
436
  },
437
  {
438
  "epoch": 0.6952721493841876,
439
- "grad_norm": 2.7455787658691406,
440
- "learning_rate": 0.0004326462367738072,
441
- "loss": 2.5233,
442
  "step": 28000
443
  },
444
  {
445
  "epoch": 0.7076877234803337,
446
- "grad_norm": 2.4482600688934326,
447
- "learning_rate": 0.00043139848273108405,
448
- "loss": 2.5105,
449
  "step": 28500
450
  },
451
  {
452
  "epoch": 0.72010329757648,
453
- "grad_norm": 2.8398752212524414,
454
- "learning_rate": 0.000430150728688361,
455
- "loss": 2.531,
456
  "step": 29000
457
  },
458
  {
459
  "epoch": 0.7325188716726262,
460
- "grad_norm": 2.608999013900757,
461
- "learning_rate": 0.00042890547015372334,
462
- "loss": 2.4864,
463
  "step": 29500
464
  },
465
  {
466
  "epoch": 0.7449344457687723,
467
- "grad_norm": 2.071620225906372,
468
- "learning_rate": 0.00042765771611100016,
469
- "loss": 2.4574,
470
  "step": 30000
471
  },
472
  {
473
  "epoch": 0.7449344457687723,
474
- "eval_loss": 1.5331941843032837,
475
- "eval_runtime": 4179.1286,
476
- "eval_samples_per_second": 308.364,
477
- "eval_steps_per_second": 9.636,
478
  "step": 30000
479
  },
480
  {
481
  "epoch": 0.7573500198649186,
482
- "grad_norm": 3.0172479152679443,
483
- "learning_rate": 0.0004264099620682771,
484
- "loss": 2.4733,
485
  "step": 30500
486
  },
487
  {
488
  "epoch": 0.7697655939610648,
489
- "grad_norm": 2.6325442790985107,
490
- "learning_rate": 0.000425162208025554,
491
- "loss": 2.4721,
492
  "step": 31000
493
  },
494
  {
495
  "epoch": 0.7821811680572109,
496
- "grad_norm": 2.826345682144165,
497
- "learning_rate": 0.0004239144539828309,
498
- "loss": 2.4692,
499
  "step": 31500
500
  },
501
  {
502
  "epoch": 0.7945967421533572,
503
- "grad_norm": 2.456289291381836,
504
- "learning_rate": 0.00042266919544819325,
505
- "loss": 2.4385,
506
  "step": 32000
507
  },
508
  {
509
  "epoch": 0.8070123162495034,
510
- "grad_norm": 2.4803292751312256,
511
- "learning_rate": 0.0004214214414054702,
512
- "loss": 2.439,
513
  "step": 32500
514
  },
515
  {
516
  "epoch": 0.8194278903456496,
517
- "grad_norm": 2.6469247341156006,
518
- "learning_rate": 0.00042017618287083254,
519
- "loss": 2.4729,
520
  "step": 33000
521
  },
522
  {
523
  "epoch": 0.8318434644417958,
524
- "grad_norm": 2.7024786472320557,
525
- "learning_rate": 0.0004189284288281094,
526
- "loss": 2.4244,
527
  "step": 33500
528
  },
529
  {
530
  "epoch": 0.844259038537942,
531
- "grad_norm": 2.847285270690918,
532
- "learning_rate": 0.0004176806747853863,
533
- "loss": 2.4636,
534
  "step": 34000
535
  },
536
  {
537
  "epoch": 0.8566746126340882,
538
- "grad_norm": 2.453200340270996,
539
- "learning_rate": 0.0004164329207426632,
540
- "loss": 2.4524,
541
  "step": 34500
542
  },
543
  {
544
  "epoch": 0.8690901867302344,
545
- "grad_norm": 2.49642276763916,
546
- "learning_rate": 0.0004151876622080256,
547
- "loss": 2.4457,
548
  "step": 35000
549
  },
550
  {
551
  "epoch": 0.8690901867302344,
552
- "eval_loss": 1.4776599407196045,
553
- "eval_runtime": 4154.52,
554
- "eval_samples_per_second": 310.19,
555
- "eval_steps_per_second": 9.694,
556
  "step": 35000
557
  },
558
  {
559
  "epoch": 0.8815057608263807,
560
- "grad_norm": 2.576984405517578,
561
- "learning_rate": 0.00041393990816530245,
562
- "loss": 2.4149,
563
  "step": 35500
564
  },
565
  {
566
  "epoch": 0.8939213349225268,
567
- "grad_norm": 3.0729165077209473,
568
- "learning_rate": 0.0004126921541225794,
569
- "loss": 2.4067,
570
  "step": 36000
571
  },
572
  {
573
  "epoch": 0.906336909018673,
574
- "grad_norm": 2.7619829177856445,
575
- "learning_rate": 0.0004114444000798563,
576
- "loss": 2.4121,
577
  "step": 36500
578
  },
579
  {
580
  "epoch": 0.9187524831148193,
581
- "grad_norm": 3.5316452980041504,
582
- "learning_rate": 0.0004101991415452186,
583
- "loss": 2.3781,
584
  "step": 37000
585
  },
586
  {
587
  "epoch": 0.9311680572109654,
588
- "grad_norm": 2.7174599170684814,
589
- "learning_rate": 0.0004089538830105809,
590
- "loss": 2.4013,
591
  "step": 37500
592
  },
593
  {
594
  "epoch": 0.9435836313071116,
595
- "grad_norm": 13.372625350952148,
596
- "learning_rate": 0.00040771611100019967,
597
- "loss": 2.5449,
598
  "step": 38000
599
  },
600
  {
601
  "epoch": 0.9559992054032579,
602
- "grad_norm": 11.173745155334473,
603
- "learning_rate": 0.0004064883210221601,
604
- "loss": 6.7867,
605
  "step": 38500
606
  },
607
  {
608
  "epoch": 0.968414779499404,
609
- "grad_norm": 0.9608703255653381,
610
- "learning_rate": 0.000405240566979437,
611
- "loss": 7.7097,
612
  "step": 39000
613
  },
614
  {
615
  "epoch": 0.9808303535955503,
616
- "grad_norm": 8286.201171875,
617
- "learning_rate": 0.0004039928129367139,
618
- "loss": 7.7291,
619
  "step": 39500
620
  },
621
  {
622
  "epoch": 0.9932459276916965,
623
- "grad_norm": 2.1965973377227783,
624
- "learning_rate": 0.00040274505889399085,
625
- "loss": 7.7205,
626
  "step": 40000
627
  },
628
  {
629
  "epoch": 0.9932459276916965,
630
- "eval_loss": 7.322892665863037,
631
- "eval_runtime": 4170.5156,
632
- "eval_samples_per_second": 309.0,
633
- "eval_steps_per_second": 9.656,
634
  "step": 40000
635
- },
636
- {
637
- "epoch": 1.0056615017878427,
638
- "grad_norm": 4.976968288421631,
639
- "learning_rate": 0.0004014973048512677,
640
- "loss": 7.671,
641
- "step": 40500
642
- },
643
- {
644
- "epoch": 1.0180770758839888,
645
- "grad_norm": 430.2960510253906,
646
- "learning_rate": 0.00040024955080854464,
647
- "loss": 7.6052,
648
- "step": 41000
649
- },
650
- {
651
- "epoch": 1.030492649980135,
652
- "grad_norm": 76.06390380859375,
653
- "learning_rate": 0.00039900179676582157,
654
- "loss": 7.6507,
655
- "step": 41500
656
- },
657
- {
658
- "epoch": 1.0429082240762813,
659
- "grad_norm": 75.03479766845703,
660
- "learning_rate": 0.00039776152924735477,
661
- "loss": 7.6393,
662
- "step": 42000
663
- },
664
- {
665
- "epoch": 1.0553237981724275,
666
- "grad_norm": 99.86361694335938,
667
- "learning_rate": 0.00039651876622080257,
668
- "loss": 7.6949,
669
- "step": 42500
670
- },
671
- {
672
- "epoch": 1.0677393722685737,
673
- "grad_norm": 1295.1463623046875,
674
- "learning_rate": 0.0003952710121780795,
675
- "loss": 8.0558,
676
- "step": 43000
677
- },
678
- {
679
- "epoch": 1.08015494636472,
680
- "grad_norm": 3921.91357421875,
681
- "learning_rate": 0.00039402325813535637,
682
- "loss": 8.1452,
683
- "step": 43500
684
- },
685
- {
686
- "epoch": 1.0925705204608662,
687
- "grad_norm": 2463.47265625,
688
- "learning_rate": 0.0003927755040926333,
689
- "loss": 8.6431,
690
- "step": 44000
691
- },
692
- {
693
- "epoch": 1.1049860945570122,
694
- "grad_norm": 259.3988952636719,
695
- "learning_rate": 0.0003915302455579956,
696
- "loss": 8.3183,
697
- "step": 44500
698
- },
699
- {
700
- "epoch": 1.1174016686531585,
701
- "grad_norm": 1.8363823890686035,
702
- "learning_rate": 0.00039028249151527254,
703
- "loss": 8.5829,
704
- "step": 45000
705
- },
706
- {
707
- "epoch": 1.1174016686531585,
708
- "eval_loss": 8.133321762084961,
709
- "eval_runtime": 4160.7924,
710
- "eval_samples_per_second": 309.722,
711
- "eval_steps_per_second": 9.679,
712
- "step": 45000
713
- },
714
- {
715
- "epoch": 1.1298172427493047,
716
- "grad_norm": 18.962547302246094,
717
- "learning_rate": 0.0003890347374725494,
718
- "loss": 8.762,
719
- "step": 45500
720
- },
721
- {
722
- "epoch": 1.142232816845451,
723
- "grad_norm": 41.705020904541016,
724
- "learning_rate": 0.00038778698342982633,
725
- "loss": 8.8213,
726
- "step": 46000
727
- },
728
- {
729
- "epoch": 1.1546483909415972,
730
- "grad_norm": 17.964086532592773,
731
- "learning_rate": 0.0003865392293871032,
732
- "loss": 8.9416,
733
- "step": 46500
734
- },
735
- {
736
- "epoch": 1.1670639650377432,
737
- "grad_norm": 363.1439208984375,
738
- "learning_rate": 0.00038529147534438013,
739
- "loss": 8.9453,
740
- "step": 47000
741
- },
742
- {
743
- "epoch": 1.1794795391338895,
744
- "grad_norm": 5445.716796875,
745
- "learning_rate": 0.0003840462168097425,
746
- "loss": 8.9454,
747
- "step": 47500
748
- },
749
- {
750
- "epoch": 1.1918951132300357,
751
- "grad_norm": 6571.228515625,
752
- "learning_rate": 0.00038279846276701937,
753
- "loss": 8.9851,
754
- "step": 48000
755
- },
756
- {
757
- "epoch": 1.204310687326182,
758
- "grad_norm": 4.845749378204346,
759
- "learning_rate": 0.00038155070872429624,
760
- "loss": 9.1135,
761
- "step": 48500
762
- },
763
- {
764
- "epoch": 1.2167262614223282,
765
- "grad_norm": 5.5228142738342285,
766
- "learning_rate": 0.00038030295468157317,
767
- "loss": 9.5186,
768
- "step": 49000
769
- },
770
- {
771
- "epoch": 1.2291418355184744,
772
- "grad_norm": 3.5042662620544434,
773
- "learning_rate": 0.0003790552006388501,
774
- "loss": 9.8645,
775
- "step": 49500
776
- },
777
- {
778
- "epoch": 1.2415574096146207,
779
- "grad_norm": 15.4348726272583,
780
- "learning_rate": 0.00037780744659612696,
781
- "loss": 9.7882,
782
- "step": 50000
783
- },
784
- {
785
- "epoch": 1.2415574096146207,
786
- "eval_loss": 9.363153457641602,
787
- "eval_runtime": 4159.7498,
788
- "eval_samples_per_second": 309.8,
789
- "eval_steps_per_second": 9.681,
790
- "step": 50000
791
- },
792
- {
793
- "epoch": 1.2539729837107667,
794
- "grad_norm": 4.296143054962158,
795
- "learning_rate": 0.0003765596925534039,
796
- "loss": 9.7354,
797
- "step": 50500
798
- },
799
- {
800
- "epoch": 1.266388557806913,
801
- "grad_norm": 4.641263484954834,
802
- "learning_rate": 0.0003753119385106808,
803
- "loss": 9.7796,
804
- "step": 51000
805
- },
806
- {
807
- "epoch": 1.2788041319030592,
808
- "grad_norm": 4.560667514801025,
809
- "learning_rate": 0.0003740641844679577,
810
- "loss": 9.8683,
811
- "step": 51500
812
- },
813
- {
814
- "epoch": 1.2912197059992054,
815
- "grad_norm": 4.787716388702393,
816
- "learning_rate": 0.0003728164304252346,
817
- "loss": 9.7906,
818
- "step": 52000
819
- },
820
- {
821
- "epoch": 1.3036352800953517,
822
- "grad_norm": 4.268510818481445,
823
- "learning_rate": 0.0003715686763825115,
824
- "loss": 9.8016,
825
- "step": 52500
826
- },
827
- {
828
- "epoch": 1.3160508541914977,
829
- "grad_norm": 4.434477806091309,
830
- "learning_rate": 0.00037032092233978836,
831
- "loss": 9.7858,
832
- "step": 53000
833
- },
834
- {
835
- "epoch": 1.328466428287644,
836
- "grad_norm": 25.949443817138672,
837
- "learning_rate": 0.0003690731682970653,
838
- "loss": 9.7222,
839
- "step": 53500
840
- },
841
- {
842
- "epoch": 1.3408820023837902,
843
- "grad_norm": 61.10898208618164,
844
- "learning_rate": 0.0003678254142543422,
845
- "loss": 9.8307,
846
- "step": 54000
847
- },
848
- {
849
- "epoch": 1.3532975764799364,
850
- "grad_norm": 7.8839430809021,
851
- "learning_rate": 0.0003665776602116191,
852
- "loss": 9.6664,
853
- "step": 54500
854
- },
855
- {
856
- "epoch": 1.3657131505760827,
857
- "grad_norm": 27.914098739624023,
858
- "learning_rate": 0.000365329906168896,
859
- "loss": 9.5602,
860
- "step": 55000
861
- },
862
- {
863
- "epoch": 1.3657131505760827,
864
- "eval_loss": 9.656608581542969,
865
- "eval_runtime": 4188.0736,
866
- "eval_samples_per_second": 307.705,
867
- "eval_steps_per_second": 9.616,
868
- "step": 55000
869
- },
870
- {
871
- "epoch": 1.378128724672229,
872
- "grad_norm": 259.0064392089844,
873
- "learning_rate": 0.0003640821521261729,
874
- "loss": 9.6525,
875
- "step": 55500
876
- },
877
- {
878
- "epoch": 1.3905442987683752,
879
- "grad_norm": 7.888017654418945,
880
- "learning_rate": 0.0003628343980834498,
881
- "loss": 9.5104,
882
- "step": 56000
883
- },
884
- {
885
- "epoch": 1.4029598728645212,
886
- "grad_norm": 7.025816440582275,
887
- "learning_rate": 0.00036158664404072673,
888
- "loss": 9.6077,
889
- "step": 56500
890
- },
891
- {
892
- "epoch": 1.4153754469606674,
893
- "grad_norm": 41.72800827026367,
894
- "learning_rate": 0.0003603388899980036,
895
- "loss": 9.6558,
896
- "step": 57000
897
- },
898
- {
899
- "epoch": 1.4277910210568137,
900
- "grad_norm": 289.7976989746094,
901
- "learning_rate": 0.0003590911359552805,
902
- "loss": 9.6123,
903
- "step": 57500
904
- },
905
- {
906
- "epoch": 1.44020659515296,
907
- "grad_norm": 9.890583038330078,
908
- "learning_rate": 0.0003578433819125574,
909
- "loss": 9.4366,
910
- "step": 58000
911
- },
912
- {
913
- "epoch": 1.4526221692491061,
914
- "grad_norm": 405.3219299316406,
915
- "learning_rate": 0.00035659812337791977,
916
- "loss": 9.3668,
917
- "step": 58500
918
- },
919
- {
920
- "epoch": 1.4650377433452522,
921
- "grad_norm": 6.443118572235107,
922
- "learning_rate": 0.00035535036933519664,
923
- "loss": 9.4349,
924
- "step": 59000
925
- },
926
- {
927
- "epoch": 1.4774533174413986,
928
- "grad_norm": 8.514484405517578,
929
- "learning_rate": 0.00035410261529247357,
930
- "loss": 9.3594,
931
- "step": 59500
932
- },
933
- {
934
- "epoch": 1.4898688915375446,
935
- "grad_norm": 11.674294471740723,
936
- "learning_rate": 0.0003528548612497505,
937
- "loss": 9.3252,
938
- "step": 60000
939
- },
940
- {
941
- "epoch": 1.4898688915375446,
942
- "eval_loss": 9.275012969970703,
943
- "eval_runtime": 4173.4545,
944
- "eval_samples_per_second": 308.783,
945
- "eval_steps_per_second": 9.65,
946
- "step": 60000
947
  }
948
  ],
949
  "logging_steps": 500,
950
- "max_steps": 201360,
951
  "num_input_tokens_seen": 0,
952
- "num_train_epochs": 5,
953
  "save_steps": 10000,
954
  "stateful_callbacks": {
955
  "TrainerControl": {
@@ -963,7 +651,7 @@
963
  "attributes": {}
964
  }
965
  },
966
- "total_flos": 2.5209002073613517e+17,
967
  "train_batch_size": 32,
968
  "trial_name": null,
969
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9932459276916965,
6
  "eval_steps": 5000,
7
+ "global_step": 40000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.012415574096146206,
14
+ "grad_norm": 34.79914474487305,
15
+ "learning_rate": 0.000246,
16
+ "loss": 8.4873,
17
  "step": 500
18
  },
19
  {
20
  "epoch": 0.024831148192292412,
21
+ "grad_norm": 6.984533786773682,
22
+ "learning_rate": 0.0004955,
23
+ "loss": 8.152,
24
  "step": 1000
25
  },
26
  {
27
  "epoch": 0.037246722288438616,
28
+ "grad_norm": 18.275636672973633,
29
+ "learning_rate": 0.0004984664684423567,
30
+ "loss": 8.3503,
31
  "step": 1500
32
  },
33
  {
34
  "epoch": 0.049662296384584824,
35
+ "grad_norm": 8.112704277038574,
36
+ "learning_rate": 0.0004969141971915447,
37
+ "loss": 8.1588,
38
  "step": 2000
39
  },
40
  {
41
  "epoch": 0.06207787048073103,
42
+ "grad_norm": 371.90625,
43
+ "learning_rate": 0.0004953525560941483,
44
+ "loss": 7.9764,
45
  "step": 2500
46
  },
47
  {
48
  "epoch": 0.07449344457687723,
49
+ "grad_norm": 164.8943634033203,
50
+ "learning_rate": 0.0004937940382789466,
51
+ "loss": 7.8648,
52
  "step": 3000
53
  },
54
  {
55
  "epoch": 0.08690901867302345,
56
+ "grad_norm": 119.9703140258789,
57
+ "learning_rate": 0.0004922323971815501,
58
+ "loss": 8.0131,
59
  "step": 3500
60
  },
61
  {
62
  "epoch": 0.09932459276916965,
63
+ "grad_norm": 1504.8216552734375,
64
+ "learning_rate": 0.0004906738793663485,
65
+ "loss": 8.0822,
66
  "step": 4000
67
  },
68
  {
69
  "epoch": 0.11174016686531585,
70
+ "grad_norm": NaN,
71
+ "learning_rate": 0.0004900742091849483,
72
+ "loss": 10.2917,
73
  "step": 4500
74
  },
75
  {
76
  "epoch": 0.12415574096146206,
77
+ "grad_norm": NaN,
78
+ "learning_rate": 0.0004900742091849483,
79
+ "loss": 0.0,
80
  "step": 5000
81
  },
82
  {
83
  "epoch": 0.12415574096146206,
84
+ "eval_loss": NaN,
85
+ "eval_runtime": 3598.9752,
86
+ "eval_samples_per_second": 358.072,
87
+ "eval_steps_per_second": 11.19,
88
  "step": 5000
89
  },
90
  {
91
  "epoch": 0.13657131505760825,
92
+ "grad_norm": NaN,
93
+ "learning_rate": 0.0004900742091849483,
94
+ "loss": 0.0,
95
  "step": 5500
96
  },
97
  {
98
  "epoch": 0.14898688915375446,
99
+ "grad_norm": NaN,
100
+ "learning_rate": 0.0004900742091849483,
101
+ "loss": 0.0,
102
  "step": 6000
103
  },
104
  {
105
  "epoch": 0.16140246324990068,
106
+ "grad_norm": NaN,
107
+ "learning_rate": 0.0004900742091849483,
108
+ "loss": 0.0,
109
  "step": 6500
110
  },
111
  {
112
  "epoch": 0.1738180373460469,
113
+ "grad_norm": NaN,
114
+ "learning_rate": 0.0004900742091849483,
115
+ "loss": 0.0,
116
  "step": 7000
117
  },
118
  {
119
  "epoch": 0.18623361144219308,
120
+ "grad_norm": NaN,
121
+ "learning_rate": 0.0004900742091849483,
122
+ "loss": 0.0,
123
  "step": 7500
124
  },
125
  {
126
  "epoch": 0.1986491855383393,
127
+ "grad_norm": NaN,
128
+ "learning_rate": 0.0004900742091849483,
129
+ "loss": 0.0,
130
  "step": 8000
131
  },
132
  {
133
  "epoch": 0.2110647596344855,
134
+ "grad_norm": NaN,
135
+ "learning_rate": 0.0004900742091849483,
136
+ "loss": 0.0,
137
  "step": 8500
138
  },
139
  {
140
  "epoch": 0.2234803337306317,
141
+ "grad_norm": NaN,
142
+ "learning_rate": 0.0004900742091849483,
143
+ "loss": 0.0,
144
  "step": 9000
145
  },
146
  {
147
  "epoch": 0.2358959078267779,
148
+ "grad_norm": NaN,
149
+ "learning_rate": 0.0004900742091849483,
150
+ "loss": 0.0,
151
  "step": 9500
152
  },
153
  {
154
  "epoch": 0.24831148192292413,
155
+ "grad_norm": NaN,
156
+ "learning_rate": 0.0004900742091849483,
157
+ "loss": 0.0,
158
  "step": 10000
159
  },
160
  {
161
  "epoch": 0.24831148192292413,
162
+ "eval_loss": NaN,
163
+ "eval_runtime": 3702.896,
164
+ "eval_samples_per_second": 348.022,
165
+ "eval_steps_per_second": 10.876,
166
  "step": 10000
167
  },
168
  {
169
  "epoch": 0.2607270560190703,
170
+ "grad_norm": NaN,
171
+ "learning_rate": 0.0004900742091849483,
172
+ "loss": 0.0,
173
  "step": 10500
174
  },
175
  {
176
  "epoch": 0.2731426301152165,
177
+ "grad_norm": NaN,
178
+ "learning_rate": 0.0004900742091849483,
179
+ "loss": 0.0,
180
  "step": 11000
181
  },
182
  {
183
  "epoch": 0.28555820421136274,
184
+ "grad_norm": NaN,
185
+ "learning_rate": 0.0004900742091849483,
186
+ "loss": 0.0,
187
  "step": 11500
188
  },
189
  {
190
  "epoch": 0.29797377830750893,
191
+ "grad_norm": NaN,
192
+ "learning_rate": 0.0004900742091849483,
193
+ "loss": 0.0,
194
  "step": 12000
195
  },
196
  {
197
  "epoch": 0.31038935240365517,
198
+ "grad_norm": NaN,
199
+ "learning_rate": 0.0004900742091849483,
200
+ "loss": 0.0,
201
  "step": 12500
202
  },
203
  {
204
  "epoch": 0.32280492649980136,
205
+ "grad_norm": NaN,
206
+ "learning_rate": 0.0004900742091849483,
207
+ "loss": 0.0,
208
  "step": 13000
209
  },
210
  {
211
  "epoch": 0.33522050059594755,
212
+ "grad_norm": NaN,
213
+ "learning_rate": 0.0004900742091849483,
214
+ "loss": 0.0,
215
  "step": 13500
216
  },
217
  {
218
  "epoch": 0.3476360746920938,
219
+ "grad_norm": NaN,
220
+ "learning_rate": 0.0004900742091849483,
221
+ "loss": 0.0,
222
  "step": 14000
223
  },
224
  {
225
  "epoch": 0.36005164878824,
226
+ "grad_norm": NaN,
227
+ "learning_rate": 0.0004900742091849483,
228
+ "loss": 0.0,
229
  "step": 14500
230
  },
231
  {
232
  "epoch": 0.37246722288438616,
233
+ "grad_norm": NaN,
234
+ "learning_rate": 0.0004900742091849483,
235
+ "loss": 0.0,
236
  "step": 15000
237
  },
238
  {
239
  "epoch": 0.37246722288438616,
240
+ "eval_loss": NaN,
241
+ "eval_runtime": 3721.6303,
242
+ "eval_samples_per_second": 346.271,
243
+ "eval_steps_per_second": 10.821,
244
  "step": 15000
245
  },
246
  {
247
  "epoch": 0.3848827969805324,
248
+ "grad_norm": NaN,
249
+ "learning_rate": 0.0004900742091849483,
250
+ "loss": 0.0,
251
  "step": 15500
252
  },
253
  {
254
  "epoch": 0.3972983710766786,
255
+ "grad_norm": NaN,
256
+ "learning_rate": 0.0004900742091849483,
257
+ "loss": 0.0,
258
  "step": 16000
259
  },
260
  {
261
  "epoch": 0.4097139451728248,
262
+ "grad_norm": NaN,
263
+ "learning_rate": 0.0004900742091849483,
264
+ "loss": 0.0,
265
  "step": 16500
266
  },
267
  {
268
  "epoch": 0.422129519268971,
269
+ "grad_norm": NaN,
270
+ "learning_rate": 0.0004900742091849483,
271
+ "loss": 0.0,
272
  "step": 17000
273
  },
274
  {
275
  "epoch": 0.4345450933651172,
276
+ "grad_norm": NaN,
277
+ "learning_rate": 0.0004900742091849483,
278
+ "loss": 0.0,
279
  "step": 17500
280
  },
281
  {
282
  "epoch": 0.4469606674612634,
283
+ "grad_norm": NaN,
284
+ "learning_rate": 0.0004900742091849483,
285
+ "loss": 0.0,
286
  "step": 18000
287
  },
288
  {
289
  "epoch": 0.45937624155740964,
290
+ "grad_norm": NaN,
291
+ "learning_rate": 0.0004900742091849483,
292
+ "loss": 0.0,
293
  "step": 18500
294
  },
295
  {
296
  "epoch": 0.4717918156535558,
297
+ "grad_norm": NaN,
298
+ "learning_rate": 0.0004900742091849483,
299
+ "loss": 0.0,
300
  "step": 19000
301
  },
302
  {
303
  "epoch": 0.484207389749702,
304
+ "grad_norm": NaN,
305
+ "learning_rate": 0.0004900742091849483,
306
+ "loss": 0.0,
307
  "step": 19500
308
  },
309
  {
310
  "epoch": 0.49662296384584825,
311
+ "grad_norm": NaN,
312
+ "learning_rate": 0.0004900742091849483,
313
+ "loss": 0.0,
314
  "step": 20000
315
  },
316
  {
317
  "epoch": 0.49662296384584825,
318
+ "eval_loss": NaN,
319
+ "eval_runtime": 3334.043,
320
+ "eval_samples_per_second": 386.525,
321
+ "eval_steps_per_second": 12.079,
322
  "step": 20000
323
  },
324
  {
325
  "epoch": 0.5090385379419944,
326
+ "grad_norm": NaN,
327
+ "learning_rate": 0.0004900742091849483,
328
+ "loss": 0.0,
329
  "step": 20500
330
  },
331
  {
332
  "epoch": 0.5214541120381406,
333
+ "grad_norm": NaN,
334
+ "learning_rate": 0.0004900742091849483,
335
+ "loss": 0.0,
336
  "step": 21000
337
  },
338
  {
339
  "epoch": 0.5338696861342869,
340
+ "grad_norm": NaN,
341
+ "learning_rate": 0.0004900742091849483,
342
+ "loss": 0.0,
343
  "step": 21500
344
  },
345
  {
346
  "epoch": 0.546285260230433,
347
+ "grad_norm": NaN,
348
+ "learning_rate": 0.0004900742091849483,
349
+ "loss": 0.0,
350
  "step": 22000
351
  },
352
  {
353
  "epoch": 0.5587008343265792,
354
+ "grad_norm": NaN,
355
+ "learning_rate": 0.0004900742091849483,
356
+ "loss": 0.0,
357
  "step": 22500
358
  },
359
  {
360
  "epoch": 0.5711164084227255,
361
+ "grad_norm": NaN,
362
+ "learning_rate": 0.0004900742091849483,
363
+ "loss": 0.0,
364
  "step": 23000
365
  },
366
  {
367
  "epoch": 0.5835319825188717,
368
+ "grad_norm": NaN,
369
+ "learning_rate": 0.0004900742091849483,
370
+ "loss": 0.0,
371
  "step": 23500
372
  },
373
  {
374
  "epoch": 0.5959475566150179,
375
+ "grad_norm": NaN,
376
+ "learning_rate": 0.0004900742091849483,
377
+ "loss": 0.0,
378
  "step": 24000
379
  },
380
  {
381
  "epoch": 0.6083631307111641,
382
+ "grad_norm": NaN,
383
+ "learning_rate": 0.0004900742091849483,
384
+ "loss": 0.0,
385
  "step": 24500
386
  },
387
  {
388
  "epoch": 0.6207787048073103,
389
+ "grad_norm": NaN,
390
+ "learning_rate": 0.0004900742091849483,
391
+ "loss": 0.0,
392
  "step": 25000
393
  },
394
  {
395
  "epoch": 0.6207787048073103,
396
+ "eval_loss": NaN,
397
+ "eval_runtime": 3318.6898,
398
+ "eval_samples_per_second": 388.313,
399
+ "eval_steps_per_second": 12.135,
400
  "step": 25000
401
  },
402
  {
403
  "epoch": 0.6331942789034565,
404
+ "grad_norm": NaN,
405
+ "learning_rate": 0.0004900742091849483,
406
+ "loss": 0.0,
407
  "step": 25500
408
  },
409
  {
410
  "epoch": 0.6456098529996027,
411
+ "grad_norm": NaN,
412
+ "learning_rate": 0.0004900742091849483,
413
+ "loss": 0.0,
414
  "step": 26000
415
  },
416
  {
417
  "epoch": 0.658025427095749,
418
+ "grad_norm": NaN,
419
+ "learning_rate": 0.0004900742091849483,
420
+ "loss": 0.0,
421
  "step": 26500
422
  },
423
  {
424
  "epoch": 0.6704410011918951,
425
+ "grad_norm": NaN,
426
+ "learning_rate": 0.0004900742091849483,
427
+ "loss": 0.0,
428
  "step": 27000
429
  },
430
  {
431
  "epoch": 0.6828565752880413,
432
+ "grad_norm": NaN,
433
+ "learning_rate": 0.0004900742091849483,
434
+ "loss": 0.0,
435
  "step": 27500
436
  },
437
  {
438
  "epoch": 0.6952721493841876,
439
+ "grad_norm": NaN,
440
+ "learning_rate": 0.0004900742091849483,
441
+ "loss": 0.0,
442
  "step": 28000
443
  },
444
  {
445
  "epoch": 0.7076877234803337,
446
+ "grad_norm": NaN,
447
+ "learning_rate": 0.0004900742091849483,
448
+ "loss": 0.0,
449
  "step": 28500
450
  },
451
  {
452
  "epoch": 0.72010329757648,
453
+ "grad_norm": NaN,
454
+ "learning_rate": 0.0004900742091849483,
455
+ "loss": 0.0,
456
  "step": 29000
457
  },
458
  {
459
  "epoch": 0.7325188716726262,
460
+ "grad_norm": NaN,
461
+ "learning_rate": 0.0004900742091849483,
462
+ "loss": 0.0,
463
  "step": 29500
464
  },
465
  {
466
  "epoch": 0.7449344457687723,
467
+ "grad_norm": NaN,
468
+ "learning_rate": 0.0004900742091849483,
469
+ "loss": 0.0,
470
  "step": 30000
471
  },
472
  {
473
  "epoch": 0.7449344457687723,
474
+ "eval_loss": NaN,
475
+ "eval_runtime": 3329.5874,
476
+ "eval_samples_per_second": 387.042,
477
+ "eval_steps_per_second": 12.095,
478
  "step": 30000
479
  },
480
  {
481
  "epoch": 0.7573500198649186,
482
+ "grad_norm": NaN,
483
+ "learning_rate": 0.0004900742091849483,
484
+ "loss": 0.0,
485
  "step": 30500
486
  },
487
  {
488
  "epoch": 0.7697655939610648,
489
+ "grad_norm": NaN,
490
+ "learning_rate": 0.0004900742091849483,
491
+ "loss": 0.0,
492
  "step": 31000
493
  },
494
  {
495
  "epoch": 0.7821811680572109,
496
+ "grad_norm": NaN,
497
+ "learning_rate": 0.0004900742091849483,
498
+ "loss": 0.0,
499
  "step": 31500
500
  },
501
  {
502
  "epoch": 0.7945967421533572,
503
+ "grad_norm": NaN,
504
+ "learning_rate": 0.0004900742091849483,
505
+ "loss": 0.0,
506
  "step": 32000
507
  },
508
  {
509
  "epoch": 0.8070123162495034,
510
+ "grad_norm": NaN,
511
+ "learning_rate": 0.0004900742091849483,
512
+ "loss": 0.0,
513
  "step": 32500
514
  },
515
  {
516
  "epoch": 0.8194278903456496,
517
+ "grad_norm": NaN,
518
+ "learning_rate": 0.0004900742091849483,
519
+ "loss": 0.0,
520
  "step": 33000
521
  },
522
  {
523
  "epoch": 0.8318434644417958,
524
+ "grad_norm": NaN,
525
+ "learning_rate": 0.0004900742091849483,
526
+ "loss": 0.0,
527
  "step": 33500
528
  },
529
  {
530
  "epoch": 0.844259038537942,
531
+ "grad_norm": NaN,
532
+ "learning_rate": 0.0004900742091849483,
533
+ "loss": 0.0,
534
  "step": 34000
535
  },
536
  {
537
  "epoch": 0.8566746126340882,
538
+ "grad_norm": NaN,
539
+ "learning_rate": 0.0004900742091849483,
540
+ "loss": 0.0,
541
  "step": 34500
542
  },
543
  {
544
  "epoch": 0.8690901867302344,
545
+ "grad_norm": NaN,
546
+ "learning_rate": 0.0004900742091849483,
547
+ "loss": 0.0,
548
  "step": 35000
549
  },
550
  {
551
  "epoch": 0.8690901867302344,
552
+ "eval_loss": NaN,
553
+ "eval_runtime": 3340.4008,
554
+ "eval_samples_per_second": 385.789,
555
+ "eval_steps_per_second": 12.056,
556
  "step": 35000
557
  },
558
  {
559
  "epoch": 0.8815057608263807,
560
+ "grad_norm": NaN,
561
+ "learning_rate": 0.0004900742091849483,
562
+ "loss": 0.0,
563
  "step": 35500
564
  },
565
  {
566
  "epoch": 0.8939213349225268,
567
+ "grad_norm": NaN,
568
+ "learning_rate": 0.0004900742091849483,
569
+ "loss": 0.0,
570
  "step": 36000
571
  },
572
  {
573
  "epoch": 0.906336909018673,
574
+ "grad_norm": NaN,
575
+ "learning_rate": 0.0004900742091849483,
576
+ "loss": 0.0,
577
  "step": 36500
578
  },
579
  {
580
  "epoch": 0.9187524831148193,
581
+ "grad_norm": NaN,
582
+ "learning_rate": 0.0004900742091849483,
583
+ "loss": 0.0,
584
  "step": 37000
585
  },
586
  {
587
  "epoch": 0.9311680572109654,
588
+ "grad_norm": NaN,
589
+ "learning_rate": 0.0004900742091849483,
590
+ "loss": 0.0,
591
  "step": 37500
592
  },
593
  {
594
  "epoch": 0.9435836313071116,
595
+ "grad_norm": NaN,
596
+ "learning_rate": 0.0004900742091849483,
597
+ "loss": 0.0,
598
  "step": 38000
599
  },
600
  {
601
  "epoch": 0.9559992054032579,
602
+ "grad_norm": NaN,
603
+ "learning_rate": 0.0004900742091849483,
604
+ "loss": 0.0,
605
  "step": 38500
606
  },
607
  {
608
  "epoch": 0.968414779499404,
609
+ "grad_norm": NaN,
610
+ "learning_rate": 0.0004900742091849483,
611
+ "loss": 0.0,
612
  "step": 39000
613
  },
614
  {
615
  "epoch": 0.9808303535955503,
616
+ "grad_norm": NaN,
617
+ "learning_rate": 0.0004900742091849483,
618
+ "loss": 0.0,
619
  "step": 39500
620
  },
621
  {
622
  "epoch": 0.9932459276916965,
623
+ "grad_norm": NaN,
624
+ "learning_rate": 0.0004900742091849483,
625
+ "loss": 0.0,
626
  "step": 40000
627
  },
628
  {
629
  "epoch": 0.9932459276916965,
630
+ "eval_loss": NaN,
631
+ "eval_runtime": 3403.8564,
632
+ "eval_samples_per_second": 378.597,
633
+ "eval_steps_per_second": 11.831,
634
  "step": 40000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
635
  }
636
  ],
637
  "logging_steps": 500,
638
+ "max_steps": 161088,
639
  "num_input_tokens_seen": 0,
640
+ "num_train_epochs": 4,
641
  "save_steps": 10000,
642
  "stateful_callbacks": {
643
  "TrainerControl": {
 
651
  "attributes": {}
652
  }
653
  },
654
+ "total_flos": 8.494745877441331e+16,
655
  "train_batch_size": 32,
656
  "trial_name": null,
657
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3113451d9893929f8ff3855bcb2647209eee528a14890b35d215742603e4dc5a
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4801bd5b148520d0075cf5afe1e0c45a70e3939c841013d91a0d072e265ffbd
3
  size 5368