aghatage commited on
Commit
9180cc8
·
verified ·
1 Parent(s): bf2ac85

Training in progress, step 9500, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:569ec90df3be5964c1b097a10b54e52c90a472078772ad8e6f9345b6a9133db3
3
  size 12017472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4258719dcecebc1c0843dab41638ebbcc2b7c072f2a468a899ab44b463281542
3
  size 12017472
last-checkpoint/global_step9500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87bfb3658af051bd5168f18c84a6c737b4d026357d647dd16623952cf7279a80
3
+ size 71982309
last-checkpoint/global_step9500/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc2da02708dd37de2715719da80cd634ddff717ca29c5c0b45229054bf35b33f
3
+ size 146356645
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step9000
 
1
+ global_step9500
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1aab311800534a13a9d7022b8b11928b19d22b45728daa1e473cebeb217f764
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:800aef772fd8643d6401fb8f1a4a953f0b1f6d395b08356d3c11c489a5ab7481
3
  size 14709
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 9000,
3
- "best_metric": 0.5540527105331421,
4
- "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-9000",
5
- "epoch": 6.540992546809671,
6
  "eval_steps": 250,
7
- "global_step": 9000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3617,6 +3617,206 @@
3617
  "eval_samples_per_second": 43.556,
3618
  "eval_steps_per_second": 5.451,
3619
  "step": 9000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3620
  }
3621
  ],
3622
  "logging_steps": 25,
@@ -3636,7 +3836,7 @@
3636
  "attributes": {}
3637
  }
3638
  },
3639
- "total_flos": 4.9990258137235456e+17,
3640
  "train_batch_size": 4,
3641
  "trial_name": null,
3642
  "trial_params": null
 
1
  {
2
+ "best_global_step": 9500,
3
+ "best_metric": 0.5516709685325623,
4
+ "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-9500",
5
+ "epoch": 6.9045628067624065,
6
  "eval_steps": 250,
7
+ "global_step": 9500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3617
  "eval_samples_per_second": 43.556,
3618
  "eval_steps_per_second": 5.451,
3619
  "step": 9000
3620
+ },
3621
+ {
3622
+ "epoch": 6.559171059807308,
3623
+ "grad_norm": 0.8078477382659912,
3624
+ "learning_rate": 3.4512111433749765e-05,
3625
+ "loss": 0.5355,
3626
+ "mean_token_accuracy": 0.8331593692302703,
3627
+ "num_tokens": 198783104.0,
3628
+ "step": 9025
3629
+ },
3630
+ {
3631
+ "epoch": 6.577349572804945,
3632
+ "grad_norm": 0.7993205189704895,
3633
+ "learning_rate": 3.432256465223894e-05,
3634
+ "loss": 0.5402,
3635
+ "mean_token_accuracy": 0.831983962059021,
3636
+ "num_tokens": 199329947.0,
3637
+ "step": 9050
3638
+ },
3639
+ {
3640
+ "epoch": 6.595528085802581,
3641
+ "grad_norm": 0.8338197469711304,
3642
+ "learning_rate": 3.413314789011529e-05,
3643
+ "loss": 0.5374,
3644
+ "mean_token_accuracy": 0.8321248868107796,
3645
+ "num_tokens": 199881416.0,
3646
+ "step": 9075
3647
+ },
3648
+ {
3649
+ "epoch": 6.613706598800218,
3650
+ "grad_norm": 0.8037586808204651,
3651
+ "learning_rate": 3.394386548522676e-05,
3652
+ "loss": 0.546,
3653
+ "mean_token_accuracy": 0.8305454310774804,
3654
+ "num_tokens": 200430769.0,
3655
+ "step": 9100
3656
+ },
3657
+ {
3658
+ "epoch": 6.631885111797855,
3659
+ "grad_norm": 0.8201845288276672,
3660
+ "learning_rate": 3.375472177234437e-05,
3661
+ "loss": 0.5399,
3662
+ "mean_token_accuracy": 0.8318346044421197,
3663
+ "num_tokens": 200978715.0,
3664
+ "step": 9125
3665
+ },
3666
+ {
3667
+ "epoch": 6.650063624795492,
3668
+ "grad_norm": 0.797753095626831,
3669
+ "learning_rate": 3.356572108306296e-05,
3670
+ "loss": 0.5328,
3671
+ "mean_token_accuracy": 0.8343482685089111,
3672
+ "num_tokens": 201521009.0,
3673
+ "step": 9150
3674
+ },
3675
+ {
3676
+ "epoch": 6.668242137793128,
3677
+ "grad_norm": 0.8221126794815063,
3678
+ "learning_rate": 3.337686774570198e-05,
3679
+ "loss": 0.5374,
3680
+ "mean_token_accuracy": 0.8342596819996834,
3681
+ "num_tokens": 202050884.0,
3682
+ "step": 9175
3683
+ },
3684
+ {
3685
+ "epoch": 6.686420650790765,
3686
+ "grad_norm": 0.7923447489738464,
3687
+ "learning_rate": 3.318816608520636e-05,
3688
+ "loss": 0.5394,
3689
+ "mean_token_accuracy": 0.832169133424759,
3690
+ "num_tokens": 202609530.0,
3691
+ "step": 9200
3692
+ },
3693
+ {
3694
+ "epoch": 6.704599163788402,
3695
+ "grad_norm": 0.8144327402114868,
3696
+ "learning_rate": 3.2999620423047444e-05,
3697
+ "loss": 0.5349,
3698
+ "mean_token_accuracy": 0.8340477573871613,
3699
+ "num_tokens": 203151783.0,
3700
+ "step": 9225
3701
+ },
3702
+ {
3703
+ "epoch": 6.7227776767860385,
3704
+ "grad_norm": 0.808747410774231,
3705
+ "learning_rate": 3.281123507712407e-05,
3706
+ "loss": 0.5435,
3707
+ "mean_token_accuracy": 0.8309207037091255,
3708
+ "num_tokens": 203691329.0,
3709
+ "step": 9250
3710
+ },
3711
+ {
3712
+ "epoch": 6.7227776767860385,
3713
+ "eval_loss": 0.5527983903884888,
3714
+ "eval_mean_token_accuracy": 0.827514678532002,
3715
+ "eval_num_tokens": 203691329.0,
3716
+ "eval_runtime": 112.8236,
3717
+ "eval_samples_per_second": 43.342,
3718
+ "eval_steps_per_second": 5.424,
3719
+ "step": 9250
3720
+ },
3721
+ {
3722
+ "epoch": 6.740956189783676,
3723
+ "grad_norm": 0.7522659301757812,
3724
+ "learning_rate": 3.2623014361663655e-05,
3725
+ "loss": 0.5355,
3726
+ "mean_token_accuracy": 0.8326027810573577,
3727
+ "num_tokens": 204249043.0,
3728
+ "step": 9275
3729
+ },
3730
+ {
3731
+ "epoch": 6.759134702781313,
3732
+ "grad_norm": 0.8386163711547852,
3733
+ "learning_rate": 3.2434962587123394e-05,
3734
+ "loss": 0.5466,
3735
+ "mean_token_accuracy": 0.8312074172496796,
3736
+ "num_tokens": 204801326.0,
3737
+ "step": 9300
3738
+ },
3739
+ {
3740
+ "epoch": 6.77731321577895,
3741
+ "grad_norm": 0.8563323616981506,
3742
+ "learning_rate": 3.2247084060091554e-05,
3743
+ "loss": 0.5401,
3744
+ "mean_token_accuracy": 0.8324404546618461,
3745
+ "num_tokens": 205365857.0,
3746
+ "step": 9325
3747
+ },
3748
+ {
3749
+ "epoch": 6.795491728776586,
3750
+ "grad_norm": 0.8469668030738831,
3751
+ "learning_rate": 3.205938308318887e-05,
3752
+ "loss": 0.5365,
3753
+ "mean_token_accuracy": 0.8328872221708298,
3754
+ "num_tokens": 205919525.0,
3755
+ "step": 9350
3756
+ },
3757
+ {
3758
+ "epoch": 6.813670241774223,
3759
+ "grad_norm": 0.8146055340766907,
3760
+ "learning_rate": 3.187186395496996e-05,
3761
+ "loss": 0.5374,
3762
+ "mean_token_accuracy": 0.8319051578640938,
3763
+ "num_tokens": 206468920.0,
3764
+ "step": 9375
3765
+ },
3766
+ {
3767
+ "epoch": 6.83184875477186,
3768
+ "grad_norm": 0.7568976283073425,
3769
+ "learning_rate": 3.1684530969824895e-05,
3770
+ "loss": 0.5285,
3771
+ "mean_token_accuracy": 0.8365351390838623,
3772
+ "num_tokens": 207011308.0,
3773
+ "step": 9400
3774
+ },
3775
+ {
3776
+ "epoch": 6.850027267769496,
3777
+ "grad_norm": 0.7712914943695068,
3778
+ "learning_rate": 3.1497388417880935e-05,
3779
+ "loss": 0.5426,
3780
+ "mean_token_accuracy": 0.8314177936315537,
3781
+ "num_tokens": 207557360.0,
3782
+ "step": 9425
3783
+ },
3784
+ {
3785
+ "epoch": 6.868205780767133,
3786
+ "grad_norm": 0.8087723851203918,
3787
+ "learning_rate": 3.131044058490415e-05,
3788
+ "loss": 0.5428,
3789
+ "mean_token_accuracy": 0.8317897391319274,
3790
+ "num_tokens": 208118474.0,
3791
+ "step": 9450
3792
+ },
3793
+ {
3794
+ "epoch": 6.88638429376477,
3795
+ "grad_norm": 0.8176565766334534,
3796
+ "learning_rate": 3.112369175220138e-05,
3797
+ "loss": 0.5455,
3798
+ "mean_token_accuracy": 0.8312569990754127,
3799
+ "num_tokens": 208649698.0,
3800
+ "step": 9475
3801
+ },
3802
+ {
3803
+ "epoch": 6.9045628067624065,
3804
+ "grad_norm": 0.8005684614181519,
3805
+ "learning_rate": 3.093714619652211e-05,
3806
+ "loss": 0.5438,
3807
+ "mean_token_accuracy": 0.8314751309156417,
3808
+ "num_tokens": 209201375.0,
3809
+ "step": 9500
3810
+ },
3811
+ {
3812
+ "epoch": 6.9045628067624065,
3813
+ "eval_loss": 0.5516709685325623,
3814
+ "eval_mean_token_accuracy": 0.8279460390019261,
3815
+ "eval_num_tokens": 209201375.0,
3816
+ "eval_runtime": 114.1031,
3817
+ "eval_samples_per_second": 42.856,
3818
+ "eval_steps_per_second": 5.364,
3819
+ "step": 9500
3820
  }
3821
  ],
3822
  "logging_steps": 25,
 
3836
  "attributes": {}
3837
  }
3838
  },
3839
+ "total_flos": 5.276420653375816e+17,
3840
  "train_batch_size": 4,
3841
  "trial_name": null,
3842
  "trial_params": null