Wilsonwin commited on
Commit
28988ca
·
verified ·
1 Parent(s): e0e8b6e

Training in progress, step 5000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a6b52d2c4e6f1dc1fc53e1df4ec08ffe7a50c1b6037cc45122a1b5264d5c4b91
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29ab3bcbd54c5e63c4e604ac4ad2f368ae42aa766977dc0340b7b8e0814fb858
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13a58a7f728d5913709f013bfd6cbcb991064242e3075f2b5e93d9b5b184b9f7
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5104f05c76008a8cc4ebab2ab5f343ccdca71dafda81e126d612fe143dbfa54
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf45e6f5a33d99139eae20e5be76bd3bf9589da43c06744e1ac55dde6dda87db
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a04575953c998a8fd3197b1b8249c8e72c33f4bb7c27b036788a4d9e537cf3cd
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:839b4043be0c777e952526844484b5d7c9eb08d95c6a855198a76f2eb1f08d84
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a29280eedf28bde93a8485de1b90963ca69c84125cea86695b5935449e18f453
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7602635580334516,
6
  "eval_steps": 500,
7
- "global_step": 4500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3237,6 +3237,364 @@
3237
  "eval_samples_per_second": 280.631,
3238
  "eval_steps_per_second": 5.893,
3239
  "step": 4500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3240
  }
3241
  ],
3242
  "logging_steps": 10,
@@ -3256,7 +3614,7 @@
3256
  "attributes": {}
3257
  }
3258
  },
3259
- "total_flos": 1.50505569386496e+17,
3260
  "train_batch_size": 48,
3261
  "trial_name": null,
3262
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.8447372867038351,
6
  "eval_steps": 500,
7
+ "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3237
  "eval_samples_per_second": 280.631,
3238
  "eval_steps_per_second": 5.893,
3239
  "step": 4500
3240
+ },
3241
+ {
3242
+ "epoch": 0.7619530326068593,
3243
+ "grad_norm": 0.5667482614517212,
3244
+ "learning_rate": 0.0002543762555494541,
3245
+ "loss": 4.658942794799804,
3246
+ "step": 4510
3247
+ },
3248
+ {
3249
+ "epoch": 0.7636425071802669,
3250
+ "grad_norm": 0.5182068347930908,
3251
+ "learning_rate": 0.0002540317094124131,
3252
+ "loss": 4.675426483154297,
3253
+ "step": 4520
3254
+ },
3255
+ {
3256
+ "epoch": 0.7653319817536746,
3257
+ "grad_norm": 0.5856271982192993,
3258
+ "learning_rate": 0.0002536861024314936,
3259
+ "loss": 4.647851181030274,
3260
+ "step": 4530
3261
+ },
3262
+ {
3263
+ "epoch": 0.7670214563270823,
3264
+ "grad_norm": 0.5433441400527954,
3265
+ "learning_rate": 0.0002533394381309583,
3266
+ "loss": 4.628954315185547,
3267
+ "step": 4540
3268
+ },
3269
+ {
3270
+ "epoch": 0.7687109309004899,
3271
+ "grad_norm": 0.5408143401145935,
3272
+ "learning_rate": 0.00025299172004585144,
3273
+ "loss": 4.679843139648438,
3274
+ "step": 4550
3275
+ },
3276
+ {
3277
+ "epoch": 0.7704004054738977,
3278
+ "grad_norm": 0.5055237412452698,
3279
+ "learning_rate": 0.00025264295172196304,
3280
+ "loss": 4.667029190063476,
3281
+ "step": 4560
3282
+ },
3283
+ {
3284
+ "epoch": 0.7720898800473053,
3285
+ "grad_norm": 0.5375385284423828,
3286
+ "learning_rate": 0.0002522931367157928,
3287
+ "loss": 4.6552692413330075,
3288
+ "step": 4570
3289
+ },
3290
+ {
3291
+ "epoch": 0.773779354620713,
3292
+ "grad_norm": 0.5234900712966919,
3293
+ "learning_rate": 0.00025194227859451384,
3294
+ "loss": 4.664133071899414,
3295
+ "step": 4580
3296
+ },
3297
+ {
3298
+ "epoch": 0.7754688291941206,
3299
+ "grad_norm": 0.5033290386199951,
3300
+ "learning_rate": 0.00025159038093593606,
3301
+ "loss": 4.677631759643555,
3302
+ "step": 4590
3303
+ },
3304
+ {
3305
+ "epoch": 0.7771583037675283,
3306
+ "grad_norm": 0.5198631286621094,
3307
+ "learning_rate": 0.0002512374473284699,
3308
+ "loss": 4.641722106933594,
3309
+ "step": 4600
3310
+ },
3311
+ {
3312
+ "epoch": 0.778847778340936,
3313
+ "grad_norm": 0.5315260291099548,
3314
+ "learning_rate": 0.00025088348137108983,
3315
+ "loss": 4.641604614257813,
3316
+ "step": 4610
3317
+ },
3318
+ {
3319
+ "epoch": 0.7805372529143436,
3320
+ "grad_norm": 0.5272190570831299,
3321
+ "learning_rate": 0.0002505284866732974,
3322
+ "loss": 4.667778778076172,
3323
+ "step": 4620
3324
+ },
3325
+ {
3326
+ "epoch": 0.7822267274877513,
3327
+ "grad_norm": 0.5185366868972778,
3328
+ "learning_rate": 0.0002501724668550846,
3329
+ "loss": 4.631634902954102,
3330
+ "step": 4630
3331
+ },
3332
+ {
3333
+ "epoch": 0.783916202061159,
3334
+ "grad_norm": 0.5354645252227783,
3335
+ "learning_rate": 0.00024981542554689684,
3336
+ "loss": 4.678403091430664,
3337
+ "step": 4640
3338
+ },
3339
+ {
3340
+ "epoch": 0.7856056766345666,
3341
+ "grad_norm": 0.5226261019706726,
3342
+ "learning_rate": 0.000249457366389596,
3343
+ "loss": 4.658837890625,
3344
+ "step": 4650
3345
+ },
3346
+ {
3347
+ "epoch": 0.7872951512079743,
3348
+ "grad_norm": 0.558031439781189,
3349
+ "learning_rate": 0.0002490982930344233,
3350
+ "loss": 4.646864318847657,
3351
+ "step": 4660
3352
+ },
3353
+ {
3354
+ "epoch": 0.788984625781382,
3355
+ "grad_norm": 0.519002377986908,
3356
+ "learning_rate": 0.0002487382091429621,
3357
+ "loss": 4.6644752502441404,
3358
+ "step": 4670
3359
+ },
3360
+ {
3361
+ "epoch": 0.7906741003547897,
3362
+ "grad_norm": 0.5250281095504761,
3363
+ "learning_rate": 0.00024837711838710035,
3364
+ "loss": 4.6212821960449215,
3365
+ "step": 4680
3366
+ },
3367
+ {
3368
+ "epoch": 0.7923635749281973,
3369
+ "grad_norm": 0.5656465291976929,
3370
+ "learning_rate": 0.00024801502444899353,
3371
+ "loss": 4.661688995361328,
3372
+ "step": 4690
3373
+ },
3374
+ {
3375
+ "epoch": 0.794053049501605,
3376
+ "grad_norm": 0.5460257530212402,
3377
+ "learning_rate": 0.00024765193102102676,
3378
+ "loss": 4.65002555847168,
3379
+ "step": 4700
3380
+ },
3381
+ {
3382
+ "epoch": 0.7957425240750127,
3383
+ "grad_norm": 0.5244697332382202,
3384
+ "learning_rate": 0.0002472878418057772,
3385
+ "loss": 4.6698455810546875,
3386
+ "step": 4710
3387
+ },
3388
+ {
3389
+ "epoch": 0.7974319986484203,
3390
+ "grad_norm": 0.5033484697341919,
3391
+ "learning_rate": 0.0002469227605159766,
3392
+ "loss": 4.634893798828125,
3393
+ "step": 4720
3394
+ },
3395
+ {
3396
+ "epoch": 0.799121473221828,
3397
+ "grad_norm": 0.5970498323440552,
3398
+ "learning_rate": 0.0002465566908744729,
3399
+ "loss": 4.61572494506836,
3400
+ "step": 4730
3401
+ },
3402
+ {
3403
+ "epoch": 0.8008109477952357,
3404
+ "grad_norm": 0.5307066440582275,
3405
+ "learning_rate": 0.00024618963661419285,
3406
+ "loss": 4.648424530029297,
3407
+ "step": 4740
3408
+ },
3409
+ {
3410
+ "epoch": 0.8025004223686434,
3411
+ "grad_norm": 0.48402100801467896,
3412
+ "learning_rate": 0.0002458216014781035,
3413
+ "loss": 4.620497131347657,
3414
+ "step": 4750
3415
+ },
3416
+ {
3417
+ "epoch": 0.804189896942051,
3418
+ "grad_norm": 0.5049648880958557,
3419
+ "learning_rate": 0.00024545258921917416,
3420
+ "loss": 4.6289928436279295,
3421
+ "step": 4760
3422
+ },
3423
+ {
3424
+ "epoch": 0.8058793715154587,
3425
+ "grad_norm": 0.4983990788459778,
3426
+ "learning_rate": 0.0002450826036003384,
3427
+ "loss": 4.63318977355957,
3428
+ "step": 4770
3429
+ },
3430
+ {
3431
+ "epoch": 0.8075688460888664,
3432
+ "grad_norm": 0.5099707245826721,
3433
+ "learning_rate": 0.00024471164839445526,
3434
+ "loss": 4.635572814941407,
3435
+ "step": 4780
3436
+ },
3437
+ {
3438
+ "epoch": 0.809258320662274,
3439
+ "grad_norm": 0.5057718753814697,
3440
+ "learning_rate": 0.0002443397273842709,
3441
+ "loss": 4.644168090820313,
3442
+ "step": 4790
3443
+ },
3444
+ {
3445
+ "epoch": 0.8109477952356817,
3446
+ "grad_norm": 0.5193650126457214,
3447
+ "learning_rate": 0.00024396684436238025,
3448
+ "loss": 4.605130386352539,
3449
+ "step": 4800
3450
+ },
3451
+ {
3452
+ "epoch": 0.8126372698090893,
3453
+ "grad_norm": 0.5483851432800293,
3454
+ "learning_rate": 0.00024359300313118814,
3455
+ "loss": 4.638274002075195,
3456
+ "step": 4810
3457
+ },
3458
+ {
3459
+ "epoch": 0.8143267443824971,
3460
+ "grad_norm": 0.5573034882545471,
3461
+ "learning_rate": 0.00024321820750287045,
3462
+ "loss": 4.6438957214355465,
3463
+ "step": 4820
3464
+ },
3465
+ {
3466
+ "epoch": 0.8160162189559047,
3467
+ "grad_norm": 0.5450712442398071,
3468
+ "learning_rate": 0.00024284246129933543,
3469
+ "loss": 4.602296447753906,
3470
+ "step": 4830
3471
+ },
3472
+ {
3473
+ "epoch": 0.8177056935293124,
3474
+ "grad_norm": 0.5248677730560303,
3475
+ "learning_rate": 0.0002424657683521847,
3476
+ "loss": 4.624288558959961,
3477
+ "step": 4840
3478
+ },
3479
+ {
3480
+ "epoch": 0.8193951681027201,
3481
+ "grad_norm": 0.5236571431159973,
3482
+ "learning_rate": 0.00024208813250267404,
3483
+ "loss": 4.620320510864258,
3484
+ "step": 4850
3485
+ },
3486
+ {
3487
+ "epoch": 0.8210846426761277,
3488
+ "grad_norm": 0.47448018193244934,
3489
+ "learning_rate": 0.00024170955760167436,
3490
+ "loss": 4.633553314208984,
3491
+ "step": 4860
3492
+ },
3493
+ {
3494
+ "epoch": 0.8227741172495354,
3495
+ "grad_norm": 0.5197002291679382,
3496
+ "learning_rate": 0.0002413300475096322,
3497
+ "loss": 4.628173828125,
3498
+ "step": 4870
3499
+ },
3500
+ {
3501
+ "epoch": 0.824463591822943,
3502
+ "grad_norm": 0.5387418270111084,
3503
+ "learning_rate": 0.00024094960609653078,
3504
+ "loss": 4.629827880859375,
3505
+ "step": 4880
3506
+ },
3507
+ {
3508
+ "epoch": 0.8261530663963508,
3509
+ "grad_norm": 0.5657356977462769,
3510
+ "learning_rate": 0.00024056823724185014,
3511
+ "loss": 4.612607955932617,
3512
+ "step": 4890
3513
+ },
3514
+ {
3515
+ "epoch": 0.8278425409697584,
3516
+ "grad_norm": 0.5203890204429626,
3517
+ "learning_rate": 0.00024018594483452783,
3518
+ "loss": 4.595291519165039,
3519
+ "step": 4900
3520
+ },
3521
+ {
3522
+ "epoch": 0.829532015543166,
3523
+ "grad_norm": 0.5515291094779968,
3524
+ "learning_rate": 0.00023980273277291893,
3525
+ "loss": 4.62861213684082,
3526
+ "step": 4910
3527
+ },
3528
+ {
3529
+ "epoch": 0.8312214901165738,
3530
+ "grad_norm": 0.5310600399971008,
3531
+ "learning_rate": 0.00023941860496475687,
3532
+ "loss": 4.633145141601562,
3533
+ "step": 4920
3534
+ },
3535
+ {
3536
+ "epoch": 0.8329109646899814,
3537
+ "grad_norm": 0.5451234579086304,
3538
+ "learning_rate": 0.00023903356532711296,
3539
+ "loss": 4.614830780029297,
3540
+ "step": 4930
3541
+ },
3542
+ {
3543
+ "epoch": 0.8346004392633891,
3544
+ "grad_norm": 0.47833251953125,
3545
+ "learning_rate": 0.0002386476177863568,
3546
+ "loss": 4.622224807739258,
3547
+ "step": 4940
3548
+ },
3549
+ {
3550
+ "epoch": 0.8362899138367967,
3551
+ "grad_norm": 0.5025030374526978,
3552
+ "learning_rate": 0.00023826076627811628,
3553
+ "loss": 4.607464599609375,
3554
+ "step": 4950
3555
+ },
3556
+ {
3557
+ "epoch": 0.8379793884102045,
3558
+ "grad_norm": 0.5535337328910828,
3559
+ "learning_rate": 0.0002378730147472371,
3560
+ "loss": 4.580402374267578,
3561
+ "step": 4960
3562
+ },
3563
+ {
3564
+ "epoch": 0.8396688629836121,
3565
+ "grad_norm": 0.5151374340057373,
3566
+ "learning_rate": 0.00023748436714774294,
3567
+ "loss": 4.648463439941406,
3568
+ "step": 4970
3569
+ },
3570
+ {
3571
+ "epoch": 0.8413583375570197,
3572
+ "grad_norm": 0.5126184821128845,
3573
+ "learning_rate": 0.00023709482744279492,
3574
+ "loss": 4.621094512939453,
3575
+ "step": 4980
3576
+ },
3577
+ {
3578
+ "epoch": 0.8430478121304275,
3579
+ "grad_norm": 0.5208641886711121,
3580
+ "learning_rate": 0.00023670439960465128,
3581
+ "loss": 4.607065582275391,
3582
+ "step": 4990
3583
+ },
3584
+ {
3585
+ "epoch": 0.8447372867038351,
3586
+ "grad_norm": 0.5431861877441406,
3587
+ "learning_rate": 0.00023631308761462677,
3588
+ "loss": 4.6144451141357425,
3589
+ "step": 5000
3590
+ },
3591
+ {
3592
+ "epoch": 0.8447372867038351,
3593
+ "eval_loss": 4.566016674041748,
3594
+ "eval_runtime": 3.5736,
3595
+ "eval_samples_per_second": 279.83,
3596
+ "eval_steps_per_second": 5.876,
3597
+ "step": 5000
3598
  }
3599
  ],
3600
  "logging_steps": 10,
 
3614
  "attributes": {}
3615
  }
3616
  },
3617
+ "total_flos": 1.6722841042944e+17,
3618
  "train_batch_size": 48,
3619
  "trial_name": null,
3620
  "trial_params": null