mifeng09 commited on
Commit
24959ac
·
verified ·
1 Parent(s): e240b83

Training in progress, step 2600, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a4a6235d767e0a6ef201b5455fcbbfeaa92b51edf50758fd8b882cd7af2d72ea
3
  size 257609792
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80575a517640befeb13d8a45ac64f043e3f42763e6b44469f20fcd33684343d7
3
  size 257609792
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a554d918bde3d725bedd8ee6820330007e6d0156885de419dc0031a85ecb2b4c
3
  size 515278091
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ccd6a8136d4f05e4f1eceb89bbc99924cd37d33d944767e7fa5a2668db41ad0
3
  size 515278091
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0fd078c9e4932d689548ec9345ac18c4dd42503c4dbc8344b47f249dcb9160e9
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4f88a3ed70692d8f05b97617079ec0b41dc17b927a833bcbaa62616274bebe6
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7da7c5085795b13d2bf0030671cbddb9f62ae43221bf1424a3830d4cf8c19012
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aea2b6a5675bb9dc7d6d847844f168cbc539a3493d586a8e2634d29c173b0f88
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f1ce0482198d251e67920fc79c9074aa1dba87f00828be5bfd635b348dbb078
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64cd2b31c96e17fa70d4680796990915078e28651e7693ec8503c4c01869ff59
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7380073800738007,
6
  "eval_steps": 500,
7
- "global_step": 2400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3400,6 +3400,294 @@
3400
  "learning_rate": 7.00125e-05,
3401
  "loss": 4.224,
3402
  "step": 2400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3403
  }
3404
  ],
3405
  "logging_steps": 5,
@@ -3419,7 +3707,7 @@
3419
  "attributes": {}
3420
  }
3421
  },
3422
- "total_flos": 2.26567309492224e+16,
3423
  "train_batch_size": 8,
3424
  "trial_name": null,
3425
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.7995079950799509,
6
  "eval_steps": 500,
7
+ "global_step": 2600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3400
  "learning_rate": 7.00125e-05,
3401
  "loss": 4.224,
3402
  "step": 2400
3403
+ },
3404
+ {
3405
+ "epoch": 0.7395448954489545,
3406
+ "grad_norm": 0.35414189100265503,
3407
+ "learning_rate": 6.995e-05,
3408
+ "loss": 4.267,
3409
+ "step": 2405
3410
+ },
3411
+ {
3412
+ "epoch": 0.7410824108241082,
3413
+ "grad_norm": 0.3554304838180542,
3414
+ "learning_rate": 6.988750000000001e-05,
3415
+ "loss": 4.1949,
3416
+ "step": 2410
3417
+ },
3418
+ {
3419
+ "epoch": 0.742619926199262,
3420
+ "grad_norm": 0.3534418046474457,
3421
+ "learning_rate": 6.9825e-05,
3422
+ "loss": 4.2337,
3423
+ "step": 2415
3424
+ },
3425
+ {
3426
+ "epoch": 0.7441574415744158,
3427
+ "grad_norm": 0.36624109745025635,
3428
+ "learning_rate": 6.976250000000001e-05,
3429
+ "loss": 4.2385,
3430
+ "step": 2420
3431
+ },
3432
+ {
3433
+ "epoch": 0.7456949569495694,
3434
+ "grad_norm": 0.3525283932685852,
3435
+ "learning_rate": 6.97e-05,
3436
+ "loss": 4.2799,
3437
+ "step": 2425
3438
+ },
3439
+ {
3440
+ "epoch": 0.7472324723247232,
3441
+ "grad_norm": 0.3583906590938568,
3442
+ "learning_rate": 6.96375e-05,
3443
+ "loss": 4.1917,
3444
+ "step": 2430
3445
+ },
3446
+ {
3447
+ "epoch": 0.748769987699877,
3448
+ "grad_norm": 0.355895459651947,
3449
+ "learning_rate": 6.9575e-05,
3450
+ "loss": 4.2309,
3451
+ "step": 2435
3452
+ },
3453
+ {
3454
+ "epoch": 0.7503075030750308,
3455
+ "grad_norm": 0.3442673981189728,
3456
+ "learning_rate": 6.95125e-05,
3457
+ "loss": 4.2104,
3458
+ "step": 2440
3459
+ },
3460
+ {
3461
+ "epoch": 0.7518450184501845,
3462
+ "grad_norm": 0.35169875621795654,
3463
+ "learning_rate": 6.945000000000001e-05,
3464
+ "loss": 4.2586,
3465
+ "step": 2445
3466
+ },
3467
+ {
3468
+ "epoch": 0.7533825338253383,
3469
+ "grad_norm": 0.36030516028404236,
3470
+ "learning_rate": 6.93875e-05,
3471
+ "loss": 4.2897,
3472
+ "step": 2450
3473
+ },
3474
+ {
3475
+ "epoch": 0.754920049200492,
3476
+ "grad_norm": 0.3696916997432709,
3477
+ "learning_rate": 6.9325e-05,
3478
+ "loss": 4.2314,
3479
+ "step": 2455
3480
+ },
3481
+ {
3482
+ "epoch": 0.7564575645756457,
3483
+ "grad_norm": 0.3628195822238922,
3484
+ "learning_rate": 6.926250000000001e-05,
3485
+ "loss": 4.1903,
3486
+ "step": 2460
3487
+ },
3488
+ {
3489
+ "epoch": 0.7579950799507995,
3490
+ "grad_norm": 0.37186235189437866,
3491
+ "learning_rate": 6.92e-05,
3492
+ "loss": 4.2523,
3493
+ "step": 2465
3494
+ },
3495
+ {
3496
+ "epoch": 0.7595325953259533,
3497
+ "grad_norm": 0.35027140378952026,
3498
+ "learning_rate": 6.91375e-05,
3499
+ "loss": 4.2943,
3500
+ "step": 2470
3501
+ },
3502
+ {
3503
+ "epoch": 0.761070110701107,
3504
+ "grad_norm": 0.3844810128211975,
3505
+ "learning_rate": 6.9075e-05,
3506
+ "loss": 4.2552,
3507
+ "step": 2475
3508
+ },
3509
+ {
3510
+ "epoch": 0.7626076260762608,
3511
+ "grad_norm": 0.35497698187828064,
3512
+ "learning_rate": 6.90125e-05,
3513
+ "loss": 4.2048,
3514
+ "step": 2480
3515
+ },
3516
+ {
3517
+ "epoch": 0.7641451414514145,
3518
+ "grad_norm": 0.35539621114730835,
3519
+ "learning_rate": 6.895000000000001e-05,
3520
+ "loss": 4.3195,
3521
+ "step": 2485
3522
+ },
3523
+ {
3524
+ "epoch": 0.7656826568265682,
3525
+ "grad_norm": 0.36448633670806885,
3526
+ "learning_rate": 6.88875e-05,
3527
+ "loss": 4.2099,
3528
+ "step": 2490
3529
+ },
3530
+ {
3531
+ "epoch": 0.767220172201722,
3532
+ "grad_norm": 0.3572072982788086,
3533
+ "learning_rate": 6.8825e-05,
3534
+ "loss": 4.2637,
3535
+ "step": 2495
3536
+ },
3537
+ {
3538
+ "epoch": 0.7687576875768758,
3539
+ "grad_norm": 0.3543466031551361,
3540
+ "learning_rate": 6.876250000000001e-05,
3541
+ "loss": 4.2106,
3542
+ "step": 2500
3543
+ },
3544
+ {
3545
+ "epoch": 0.7687576875768758,
3546
+ "eval_loss": 4.25692892074585,
3547
+ "eval_runtime": 15.9135,
3548
+ "eval_samples_per_second": 62.84,
3549
+ "eval_steps_per_second": 3.959,
3550
+ "step": 2500
3551
+ },
3552
+ {
3553
+ "epoch": 0.7702952029520295,
3554
+ "grad_norm": 0.370313823223114,
3555
+ "learning_rate": 6.87e-05,
3556
+ "loss": 4.3354,
3557
+ "step": 2505
3558
+ },
3559
+ {
3560
+ "epoch": 0.7718327183271833,
3561
+ "grad_norm": 0.3540562391281128,
3562
+ "learning_rate": 6.86375e-05,
3563
+ "loss": 4.2414,
3564
+ "step": 2510
3565
+ },
3566
+ {
3567
+ "epoch": 0.773370233702337,
3568
+ "grad_norm": 0.3575718402862549,
3569
+ "learning_rate": 6.8575e-05,
3570
+ "loss": 4.1738,
3571
+ "step": 2515
3572
+ },
3573
+ {
3574
+ "epoch": 0.7749077490774908,
3575
+ "grad_norm": 0.36341428756713867,
3576
+ "learning_rate": 6.85125e-05,
3577
+ "loss": 4.2806,
3578
+ "step": 2520
3579
+ },
3580
+ {
3581
+ "epoch": 0.7764452644526445,
3582
+ "grad_norm": 0.3755071759223938,
3583
+ "learning_rate": 6.845e-05,
3584
+ "loss": 4.2433,
3585
+ "step": 2525
3586
+ },
3587
+ {
3588
+ "epoch": 0.7779827798277983,
3589
+ "grad_norm": 0.3563622236251831,
3590
+ "learning_rate": 6.83875e-05,
3591
+ "loss": 4.2373,
3592
+ "step": 2530
3593
+ },
3594
+ {
3595
+ "epoch": 0.7795202952029521,
3596
+ "grad_norm": 0.3693353235721588,
3597
+ "learning_rate": 6.832500000000001e-05,
3598
+ "loss": 4.2185,
3599
+ "step": 2535
3600
+ },
3601
+ {
3602
+ "epoch": 0.7810578105781057,
3603
+ "grad_norm": 0.3789558708667755,
3604
+ "learning_rate": 6.826250000000001e-05,
3605
+ "loss": 4.2421,
3606
+ "step": 2540
3607
+ },
3608
+ {
3609
+ "epoch": 0.7825953259532595,
3610
+ "grad_norm": 0.36780837178230286,
3611
+ "learning_rate": 6.82e-05,
3612
+ "loss": 4.258,
3613
+ "step": 2545
3614
+ },
3615
+ {
3616
+ "epoch": 0.7841328413284133,
3617
+ "grad_norm": 0.3676084280014038,
3618
+ "learning_rate": 6.81375e-05,
3619
+ "loss": 4.3125,
3620
+ "step": 2550
3621
+ },
3622
+ {
3623
+ "epoch": 0.785670356703567,
3624
+ "grad_norm": 0.3575945496559143,
3625
+ "learning_rate": 6.8075e-05,
3626
+ "loss": 4.2462,
3627
+ "step": 2555
3628
+ },
3629
+ {
3630
+ "epoch": 0.7872078720787208,
3631
+ "grad_norm": 0.36073750257492065,
3632
+ "learning_rate": 6.80125e-05,
3633
+ "loss": 4.1678,
3634
+ "step": 2560
3635
+ },
3636
+ {
3637
+ "epoch": 0.7887453874538746,
3638
+ "grad_norm": 0.35818690061569214,
3639
+ "learning_rate": 6.795e-05,
3640
+ "loss": 4.3244,
3641
+ "step": 2565
3642
+ },
3643
+ {
3644
+ "epoch": 0.7902829028290282,
3645
+ "grad_norm": 0.353287935256958,
3646
+ "learning_rate": 6.78875e-05,
3647
+ "loss": 4.2217,
3648
+ "step": 2570
3649
+ },
3650
+ {
3651
+ "epoch": 0.791820418204182,
3652
+ "grad_norm": 0.3640352785587311,
3653
+ "learning_rate": 6.782500000000001e-05,
3654
+ "loss": 4.2434,
3655
+ "step": 2575
3656
+ },
3657
+ {
3658
+ "epoch": 0.7933579335793358,
3659
+ "grad_norm": 0.37529149651527405,
3660
+ "learning_rate": 6.77625e-05,
3661
+ "loss": 4.2322,
3662
+ "step": 2580
3663
+ },
3664
+ {
3665
+ "epoch": 0.7948954489544895,
3666
+ "grad_norm": 0.3544490337371826,
3667
+ "learning_rate": 6.77e-05,
3668
+ "loss": 4.2592,
3669
+ "step": 2585
3670
+ },
3671
+ {
3672
+ "epoch": 0.7964329643296433,
3673
+ "grad_norm": 0.3656282424926758,
3674
+ "learning_rate": 6.76375e-05,
3675
+ "loss": 4.2829,
3676
+ "step": 2590
3677
+ },
3678
+ {
3679
+ "epoch": 0.7979704797047971,
3680
+ "grad_norm": 0.3591875731945038,
3681
+ "learning_rate": 6.7575e-05,
3682
+ "loss": 4.3061,
3683
+ "step": 2595
3684
+ },
3685
+ {
3686
+ "epoch": 0.7995079950799509,
3687
+ "grad_norm": 0.3518073260784149,
3688
+ "learning_rate": 6.75125e-05,
3689
+ "loss": 4.2296,
3690
+ "step": 2600
3691
  }
3692
  ],
3693
  "logging_steps": 5,
 
3707
  "attributes": {}
3708
  }
3709
  },
3710
+ "total_flos": 2.45447918616576e+16,
3711
  "train_batch_size": 8,
3712
  "trial_name": null,
3713
  "trial_params": null