Wilsonwin commited on
Commit
b2131e2
·
verified ·
1 Parent(s): 9320830

Training in progress, step 5000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71e25cdc8697039d1202fb4440876be16955562540fb206d7cbbcfc37a7f33da
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9bdb004cc12734dde986cb14fdf851cce0f063e2d6a2ac9c9566bb962bc0873
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a082fad0106d612afcdf0e9dbf262fd1aa3ca7c9a2ef45f2a14751b1d80d165
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68b278926ebe3e854059774715cf944c796b018b9ed04789c02ad5bd2ddb56db
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3343121e0ab3aeb674ab29d872307564462c4bd82cdd92e6577a4ff26999fc00
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5948a5161f7923aa0acf66b01adf35dc2196a8acf5bd2c21227561e5bff45666
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:839b4043be0c777e952526844484b5d7c9eb08d95c6a855198a76f2eb1f08d84
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a29280eedf28bde93a8485de1b90963ca69c84125cea86695b5935449e18f453
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7602635580334516,
6
  "eval_steps": 500,
7
- "global_step": 4500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3237,6 +3237,364 @@
3237
  "eval_samples_per_second": 276.482,
3238
  "eval_steps_per_second": 5.806,
3239
  "step": 4500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3240
  }
3241
  ],
3242
  "logging_steps": 10,
@@ -3256,7 +3614,7 @@
3256
  "attributes": {}
3257
  }
3258
  },
3259
- "total_flos": 1.50505569386496e+17,
3260
  "train_batch_size": 48,
3261
  "trial_name": null,
3262
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.8447372867038351,
6
  "eval_steps": 500,
7
+ "global_step": 5000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3237
  "eval_samples_per_second": 276.482,
3238
  "eval_steps_per_second": 5.806,
3239
  "step": 4500
3240
+ },
3241
+ {
3242
+ "epoch": 0.7619530326068593,
3243
+ "grad_norm": 0.5751690864562988,
3244
+ "learning_rate": 0.0002543762555494541,
3245
+ "loss": 4.659806823730468,
3246
+ "step": 4510
3247
+ },
3248
+ {
3249
+ "epoch": 0.7636425071802669,
3250
+ "grad_norm": 0.5488387942314148,
3251
+ "learning_rate": 0.0002540317094124131,
3252
+ "loss": 4.675619888305664,
3253
+ "step": 4520
3254
+ },
3255
+ {
3256
+ "epoch": 0.7653319817536746,
3257
+ "grad_norm": 0.5706210136413574,
3258
+ "learning_rate": 0.0002536861024314936,
3259
+ "loss": 4.647731018066406,
3260
+ "step": 4530
3261
+ },
3262
+ {
3263
+ "epoch": 0.7670214563270823,
3264
+ "grad_norm": 0.5262100100517273,
3265
+ "learning_rate": 0.0002533394381309583,
3266
+ "loss": 4.629973220825195,
3267
+ "step": 4540
3268
+ },
3269
+ {
3270
+ "epoch": 0.7687109309004899,
3271
+ "grad_norm": 0.5438910126686096,
3272
+ "learning_rate": 0.00025299172004585144,
3273
+ "loss": 4.680305099487304,
3274
+ "step": 4550
3275
+ },
3276
+ {
3277
+ "epoch": 0.7704004054738977,
3278
+ "grad_norm": 0.5125553011894226,
3279
+ "learning_rate": 0.00025264295172196304,
3280
+ "loss": 4.6679943084716795,
3281
+ "step": 4560
3282
+ },
3283
+ {
3284
+ "epoch": 0.7720898800473053,
3285
+ "grad_norm": 0.5525355339050293,
3286
+ "learning_rate": 0.0002522931367157928,
3287
+ "loss": 4.6561134338378904,
3288
+ "step": 4570
3289
+ },
3290
+ {
3291
+ "epoch": 0.773779354620713,
3292
+ "grad_norm": 0.5133577585220337,
3293
+ "learning_rate": 0.00025194227859451384,
3294
+ "loss": 4.66561279296875,
3295
+ "step": 4580
3296
+ },
3297
+ {
3298
+ "epoch": 0.7754688291941206,
3299
+ "grad_norm": 0.5095699429512024,
3300
+ "learning_rate": 0.00025159038093593606,
3301
+ "loss": 4.678707122802734,
3302
+ "step": 4590
3303
+ },
3304
+ {
3305
+ "epoch": 0.7771583037675283,
3306
+ "grad_norm": 0.5241293907165527,
3307
+ "learning_rate": 0.0002512374473284699,
3308
+ "loss": 4.642659759521484,
3309
+ "step": 4600
3310
+ },
3311
+ {
3312
+ "epoch": 0.778847778340936,
3313
+ "grad_norm": 0.557011067867279,
3314
+ "learning_rate": 0.00025088348137108983,
3315
+ "loss": 4.642984771728516,
3316
+ "step": 4610
3317
+ },
3318
+ {
3319
+ "epoch": 0.7805372529143436,
3320
+ "grad_norm": 0.5290088653564453,
3321
+ "learning_rate": 0.0002505284866732974,
3322
+ "loss": 4.668995666503906,
3323
+ "step": 4620
3324
+ },
3325
+ {
3326
+ "epoch": 0.7822267274877513,
3327
+ "grad_norm": 0.519223153591156,
3328
+ "learning_rate": 0.0002501724668550846,
3329
+ "loss": 4.627962112426758,
3330
+ "step": 4630
3331
+ },
3332
+ {
3333
+ "epoch": 0.783916202061159,
3334
+ "grad_norm": 0.5338088274002075,
3335
+ "learning_rate": 0.00024981542554689684,
3336
+ "loss": 4.67579231262207,
3337
+ "step": 4640
3338
+ },
3339
+ {
3340
+ "epoch": 0.7856056766345666,
3341
+ "grad_norm": 0.5252251625061035,
3342
+ "learning_rate": 0.000249457366389596,
3343
+ "loss": 4.656952285766602,
3344
+ "step": 4650
3345
+ },
3346
+ {
3347
+ "epoch": 0.7872951512079743,
3348
+ "grad_norm": 0.5428206324577332,
3349
+ "learning_rate": 0.0002490982930344233,
3350
+ "loss": 4.646731185913086,
3351
+ "step": 4660
3352
+ },
3353
+ {
3354
+ "epoch": 0.788984625781382,
3355
+ "grad_norm": 0.5392381548881531,
3356
+ "learning_rate": 0.0002487382091429621,
3357
+ "loss": 4.663632583618164,
3358
+ "step": 4670
3359
+ },
3360
+ {
3361
+ "epoch": 0.7906741003547897,
3362
+ "grad_norm": 0.51649409532547,
3363
+ "learning_rate": 0.00024837711838710035,
3364
+ "loss": 4.620084762573242,
3365
+ "step": 4680
3366
+ },
3367
+ {
3368
+ "epoch": 0.7923635749281973,
3369
+ "grad_norm": 0.5248917937278748,
3370
+ "learning_rate": 0.00024801502444899353,
3371
+ "loss": 4.66024169921875,
3372
+ "step": 4690
3373
+ },
3374
+ {
3375
+ "epoch": 0.794053049501605,
3376
+ "grad_norm": 0.5321633219718933,
3377
+ "learning_rate": 0.00024765193102102676,
3378
+ "loss": 4.647469329833984,
3379
+ "step": 4700
3380
+ },
3381
+ {
3382
+ "epoch": 0.7957425240750127,
3383
+ "grad_norm": 0.5236574411392212,
3384
+ "learning_rate": 0.0002472878418057772,
3385
+ "loss": 4.6667522430419925,
3386
+ "step": 4710
3387
+ },
3388
+ {
3389
+ "epoch": 0.7974319986484203,
3390
+ "grad_norm": 0.5166000127792358,
3391
+ "learning_rate": 0.0002469227605159766,
3392
+ "loss": 4.6316486358642575,
3393
+ "step": 4720
3394
+ },
3395
+ {
3396
+ "epoch": 0.799121473221828,
3397
+ "grad_norm": 0.6069431304931641,
3398
+ "learning_rate": 0.0002465566908744729,
3399
+ "loss": 4.614125442504883,
3400
+ "step": 4730
3401
+ },
3402
+ {
3403
+ "epoch": 0.8008109477952357,
3404
+ "grad_norm": 0.5319153666496277,
3405
+ "learning_rate": 0.00024618963661419285,
3406
+ "loss": 4.649255752563477,
3407
+ "step": 4740
3408
+ },
3409
+ {
3410
+ "epoch": 0.8025004223686434,
3411
+ "grad_norm": 0.4894997477531433,
3412
+ "learning_rate": 0.0002458216014781035,
3413
+ "loss": 4.621485900878906,
3414
+ "step": 4750
3415
+ },
3416
+ {
3417
+ "epoch": 0.804189896942051,
3418
+ "grad_norm": 0.516018807888031,
3419
+ "learning_rate": 0.00024545258921917416,
3420
+ "loss": 4.630000305175781,
3421
+ "step": 4760
3422
+ },
3423
+ {
3424
+ "epoch": 0.8058793715154587,
3425
+ "grad_norm": 0.5458150506019592,
3426
+ "learning_rate": 0.0002450826036003384,
3427
+ "loss": 4.635307312011719,
3428
+ "step": 4770
3429
+ },
3430
+ {
3431
+ "epoch": 0.8075688460888664,
3432
+ "grad_norm": 0.5067882537841797,
3433
+ "learning_rate": 0.00024471164839445526,
3434
+ "loss": 4.636883163452149,
3435
+ "step": 4780
3436
+ },
3437
+ {
3438
+ "epoch": 0.809258320662274,
3439
+ "grad_norm": 0.4767204523086548,
3440
+ "learning_rate": 0.0002443397273842709,
3441
+ "loss": 4.645626831054687,
3442
+ "step": 4790
3443
+ },
3444
+ {
3445
+ "epoch": 0.8109477952356817,
3446
+ "grad_norm": 0.5159788727760315,
3447
+ "learning_rate": 0.00024396684436238025,
3448
+ "loss": 4.605623626708985,
3449
+ "step": 4800
3450
+ },
3451
+ {
3452
+ "epoch": 0.8126372698090893,
3453
+ "grad_norm": 0.5320490598678589,
3454
+ "learning_rate": 0.00024359300313118814,
3455
+ "loss": 4.638732147216797,
3456
+ "step": 4810
3457
+ },
3458
+ {
3459
+ "epoch": 0.8143267443824971,
3460
+ "grad_norm": 0.5451418161392212,
3461
+ "learning_rate": 0.00024321820750287045,
3462
+ "loss": 4.6449028015136715,
3463
+ "step": 4820
3464
+ },
3465
+ {
3466
+ "epoch": 0.8160162189559047,
3467
+ "grad_norm": 0.5369979739189148,
3468
+ "learning_rate": 0.00024284246129933543,
3469
+ "loss": 4.602875518798828,
3470
+ "step": 4830
3471
+ },
3472
+ {
3473
+ "epoch": 0.8177056935293124,
3474
+ "grad_norm": 0.5349618196487427,
3475
+ "learning_rate": 0.0002424657683521847,
3476
+ "loss": 4.624568939208984,
3477
+ "step": 4840
3478
+ },
3479
+ {
3480
+ "epoch": 0.8193951681027201,
3481
+ "grad_norm": 0.5187742114067078,
3482
+ "learning_rate": 0.00024208813250267404,
3483
+ "loss": 4.621414566040039,
3484
+ "step": 4850
3485
+ },
3486
+ {
3487
+ "epoch": 0.8210846426761277,
3488
+ "grad_norm": 0.49689674377441406,
3489
+ "learning_rate": 0.00024170955760167436,
3490
+ "loss": 4.63438606262207,
3491
+ "step": 4860
3492
+ },
3493
+ {
3494
+ "epoch": 0.8227741172495354,
3495
+ "grad_norm": 0.5191966891288757,
3496
+ "learning_rate": 0.0002413300475096322,
3497
+ "loss": 4.629247665405273,
3498
+ "step": 4870
3499
+ },
3500
+ {
3501
+ "epoch": 0.824463591822943,
3502
+ "grad_norm": 0.5321470499038696,
3503
+ "learning_rate": 0.00024094960609653078,
3504
+ "loss": 4.630535507202149,
3505
+ "step": 4880
3506
+ },
3507
+ {
3508
+ "epoch": 0.8261530663963508,
3509
+ "grad_norm": 0.577171802520752,
3510
+ "learning_rate": 0.00024056823724185014,
3511
+ "loss": 4.614957809448242,
3512
+ "step": 4890
3513
+ },
3514
+ {
3515
+ "epoch": 0.8278425409697584,
3516
+ "grad_norm": 0.5203391313552856,
3517
+ "learning_rate": 0.00024018594483452783,
3518
+ "loss": 4.597796630859375,
3519
+ "step": 4900
3520
+ },
3521
+ {
3522
+ "epoch": 0.829532015543166,
3523
+ "grad_norm": 0.568663477897644,
3524
+ "learning_rate": 0.00023980273277291893,
3525
+ "loss": 4.630698394775391,
3526
+ "step": 4910
3527
+ },
3528
+ {
3529
+ "epoch": 0.8312214901165738,
3530
+ "grad_norm": 0.5214170813560486,
3531
+ "learning_rate": 0.00023941860496475687,
3532
+ "loss": 4.6348930358886715,
3533
+ "step": 4920
3534
+ },
3535
+ {
3536
+ "epoch": 0.8329109646899814,
3537
+ "grad_norm": 0.5391976237297058,
3538
+ "learning_rate": 0.00023903356532711296,
3539
+ "loss": 4.6155132293701175,
3540
+ "step": 4930
3541
+ },
3542
+ {
3543
+ "epoch": 0.8346004392633891,
3544
+ "grad_norm": 0.4739229381084442,
3545
+ "learning_rate": 0.0002386476177863568,
3546
+ "loss": 4.622202301025391,
3547
+ "step": 4940
3548
+ },
3549
+ {
3550
+ "epoch": 0.8362899138367967,
3551
+ "grad_norm": 0.5011942386627197,
3552
+ "learning_rate": 0.00023826076627811628,
3553
+ "loss": 4.608601379394531,
3554
+ "step": 4950
3555
+ },
3556
+ {
3557
+ "epoch": 0.8379793884102045,
3558
+ "grad_norm": 0.5716709494590759,
3559
+ "learning_rate": 0.0002378730147472371,
3560
+ "loss": 4.581511306762695,
3561
+ "step": 4960
3562
+ },
3563
+ {
3564
+ "epoch": 0.8396688629836121,
3565
+ "grad_norm": 0.5052880644798279,
3566
+ "learning_rate": 0.00023748436714774294,
3567
+ "loss": 4.649203491210938,
3568
+ "step": 4970
3569
+ },
3570
+ {
3571
+ "epoch": 0.8413583375570197,
3572
+ "grad_norm": 0.512668788433075,
3573
+ "learning_rate": 0.00023709482744279492,
3574
+ "loss": 4.621175765991211,
3575
+ "step": 4980
3576
+ },
3577
+ {
3578
+ "epoch": 0.8430478121304275,
3579
+ "grad_norm": 0.5231815576553345,
3580
+ "learning_rate": 0.00023670439960465128,
3581
+ "loss": 4.606881713867187,
3582
+ "step": 4990
3583
+ },
3584
+ {
3585
+ "epoch": 0.8447372867038351,
3586
+ "grad_norm": 0.5233691930770874,
3587
+ "learning_rate": 0.00023631308761462677,
3588
+ "loss": 4.614410018920898,
3589
+ "step": 5000
3590
+ },
3591
+ {
3592
+ "epoch": 0.8447372867038351,
3593
+ "eval_loss": 4.584611415863037,
3594
+ "eval_runtime": 3.6357,
3595
+ "eval_samples_per_second": 275.05,
3596
+ "eval_steps_per_second": 5.776,
3597
+ "step": 5000
3598
  }
3599
  ],
3600
  "logging_steps": 10,
 
3614
  "attributes": {}
3615
  }
3616
  },
3617
+ "total_flos": 1.6722841042944e+17,
3618
  "train_batch_size": 48,
3619
  "trial_name": null,
3620
  "trial_params": null