shulijia commited on
Commit
71d1711
·
verified ·
1 Parent(s): 62fb848

Training in progress, step 9963, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:21dfe2ee8a06dc5ec848e866be280ee76107c1cedbfa1071f376a458a04a60d5
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67b1f8f2070aa722256f17caddf76eccc4633099da20d1cc2d61bdf981a76af8
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7df3209f9ba7a30bd7850a80a1fd6686500517ffa3fb89677230076abf146df1
3
  size 4768663315
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b36278ea5a4363a66e19662925b0de521702174b32536c12eb455816bf17796c
3
  size 4768663315
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7bc6c7300807cf30f1442e05d88e3a8f3c8b9c282aa3836c6b57202974e90680
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca5d742b0ea9db6ebea78c7225beca171b7914b0e5ee83796c299293cd2c7879
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.860778555831639,
6
  "eval_steps": 100,
7
- "global_step": 9500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8558,6 +8558,420 @@
8558
  "mean_token_accuracy": 0.7814946219325065,
8559
  "num_tokens": 77811712.0,
8560
  "step": 9500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8561
  }
8562
  ],
8563
  "logging_steps": 10,
@@ -8572,12 +8986,12 @@
8572
  "should_evaluate": false,
8573
  "should_log": false,
8574
  "should_save": true,
8575
- "should_training_stop": false
8576
  },
8577
  "attributes": {}
8578
  }
8579
  },
8580
- "total_flos": 2.0564116098986803e+17,
8581
  "train_batch_size": 2,
8582
  "trial_name": null,
8583
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
  "eval_steps": 100,
7
+ "global_step": 9963,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8558
  "mean_token_accuracy": 0.7814946219325065,
8559
  "num_tokens": 77811712.0,
8560
  "step": 9500
8561
+ },
8562
+ {
8563
+ "epoch": 2.86379037723063,
8564
+ "grad_norm": 1.1517497301101685,
8565
+ "learning_rate": 5.063573499888468e-07,
8566
+ "loss": 0.1073,
8567
+ "mean_token_accuracy": 0.8048312108963728,
8568
+ "num_tokens": 77893632.0,
8569
+ "step": 9510
8570
+ },
8571
+ {
8572
+ "epoch": 2.866802198629621,
8573
+ "grad_norm": 1.5399608612060547,
8574
+ "learning_rate": 4.952041043943788e-07,
8575
+ "loss": 0.0834,
8576
+ "mean_token_accuracy": 0.8002446211874485,
8577
+ "num_tokens": 77975552.0,
8578
+ "step": 9520
8579
+ },
8580
+ {
8581
+ "epoch": 2.8698140200286124,
8582
+ "grad_norm": 1.0819060802459717,
8583
+ "learning_rate": 4.840508587999108e-07,
8584
+ "loss": 0.1048,
8585
+ "mean_token_accuracy": 0.789444712176919,
8586
+ "num_tokens": 78057472.0,
8587
+ "step": 9530
8588
+ },
8589
+ {
8590
+ "epoch": 2.8728258414276033,
8591
+ "grad_norm": 1.1191598176956177,
8592
+ "learning_rate": 4.7289761320544284e-07,
8593
+ "loss": 0.101,
8594
+ "mean_token_accuracy": 0.8058096900582313,
8595
+ "num_tokens": 78139392.0,
8596
+ "step": 9540
8597
+ },
8598
+ {
8599
+ "epoch": 2.8758376628265943,
8600
+ "grad_norm": 1.1794003248214722,
8601
+ "learning_rate": 4.617443676109748e-07,
8602
+ "loss": 0.0946,
8603
+ "mean_token_accuracy": 0.804403131455183,
8604
+ "num_tokens": 78221312.0,
8605
+ "step": 9550
8606
+ },
8607
+ {
8608
+ "epoch": 2.8788494842255856,
8609
+ "grad_norm": 1.377267599105835,
8610
+ "learning_rate": 4.505911220165068e-07,
8611
+ "loss": 0.1017,
8612
+ "mean_token_accuracy": 0.7931017633527517,
8613
+ "num_tokens": 78303232.0,
8614
+ "step": 9560
8615
+ },
8616
+ {
8617
+ "epoch": 2.8818613056245765,
8618
+ "grad_norm": 1.4520881175994873,
8619
+ "learning_rate": 4.394378764220388e-07,
8620
+ "loss": 0.1217,
8621
+ "mean_token_accuracy": 0.7699119374155998,
8622
+ "num_tokens": 78385152.0,
8623
+ "step": 9570
8624
+ },
8625
+ {
8626
+ "epoch": 2.8848731270235675,
8627
+ "grad_norm": 1.2226645946502686,
8628
+ "learning_rate": 4.2828463082757086e-07,
8629
+ "loss": 0.1191,
8630
+ "mean_token_accuracy": 0.7835861057043075,
8631
+ "num_tokens": 78467072.0,
8632
+ "step": 9580
8633
+ },
8634
+ {
8635
+ "epoch": 2.8878849484225584,
8636
+ "grad_norm": 1.7126904726028442,
8637
+ "learning_rate": 4.1713138523310286e-07,
8638
+ "loss": 0.1196,
8639
+ "mean_token_accuracy": 0.7943126205354929,
8640
+ "num_tokens": 78548992.0,
8641
+ "step": 9590
8642
+ },
8643
+ {
8644
+ "epoch": 2.8908967698215493,
8645
+ "grad_norm": 1.312665343284607,
8646
+ "learning_rate": 4.059781396386349e-07,
8647
+ "loss": 0.1047,
8648
+ "mean_token_accuracy": 0.7909001961350441,
8649
+ "num_tokens": 78630912.0,
8650
+ "step": 9600
8651
+ },
8652
+ {
8653
+ "epoch": 2.8939085912205407,
8654
+ "grad_norm": 1.339685320854187,
8655
+ "learning_rate": 3.9482489404416684e-07,
8656
+ "loss": 0.1053,
8657
+ "mean_token_accuracy": 0.786631602421403,
8658
+ "num_tokens": 78712832.0,
8659
+ "step": 9610
8660
+ },
8661
+ {
8662
+ "epoch": 2.8969204126195316,
8663
+ "grad_norm": 2.0059938430786133,
8664
+ "learning_rate": 3.836716484496989e-07,
8665
+ "loss": 0.1082,
8666
+ "mean_token_accuracy": 0.7798312120139599,
8667
+ "num_tokens": 78794752.0,
8668
+ "step": 9620
8669
+ },
8670
+ {
8671
+ "epoch": 2.8999322340185225,
8672
+ "grad_norm": 1.2985539436340332,
8673
+ "learning_rate": 3.725184028552309e-07,
8674
+ "loss": 0.0949,
8675
+ "mean_token_accuracy": 0.7817759312689304,
8676
+ "num_tokens": 78876672.0,
8677
+ "step": 9630
8678
+ },
8679
+ {
8680
+ "epoch": 2.902944055417514,
8681
+ "grad_norm": 1.9637115001678467,
8682
+ "learning_rate": 3.613651572607629e-07,
8683
+ "loss": 0.1199,
8684
+ "mean_token_accuracy": 0.7892734818160534,
8685
+ "num_tokens": 78958592.0,
8686
+ "step": 9640
8687
+ },
8688
+ {
8689
+ "epoch": 2.905955876816505,
8690
+ "grad_norm": 1.2397360801696777,
8691
+ "learning_rate": 3.502119116662949e-07,
8692
+ "loss": 0.1069,
8693
+ "mean_token_accuracy": 0.7752935424447059,
8694
+ "num_tokens": 79040512.0,
8695
+ "step": 9650
8696
+ },
8697
+ {
8698
+ "epoch": 2.9089676982154957,
8699
+ "grad_norm": 1.2161389589309692,
8700
+ "learning_rate": 3.3905866607182696e-07,
8701
+ "loss": 0.0987,
8702
+ "mean_token_accuracy": 0.7955479428172112,
8703
+ "num_tokens": 79122432.0,
8704
+ "step": 9660
8705
+ },
8706
+ {
8707
+ "epoch": 2.911979519614487,
8708
+ "grad_norm": 0.8789703845977783,
8709
+ "learning_rate": 3.2790542047735896e-07,
8710
+ "loss": 0.1042,
8711
+ "mean_token_accuracy": 0.8027641840279103,
8712
+ "num_tokens": 79204352.0,
8713
+ "step": 9670
8714
+ },
8715
+ {
8716
+ "epoch": 2.914991341013478,
8717
+ "grad_norm": 0.981950581073761,
8718
+ "learning_rate": 3.16752174882891e-07,
8719
+ "loss": 0.1272,
8720
+ "mean_token_accuracy": 0.7878424659371376,
8721
+ "num_tokens": 79286272.0,
8722
+ "step": 9680
8723
+ },
8724
+ {
8725
+ "epoch": 2.918003162412469,
8726
+ "grad_norm": 1.3362120389938354,
8727
+ "learning_rate": 3.0559892928842294e-07,
8728
+ "loss": 0.1049,
8729
+ "mean_token_accuracy": 0.800146771967411,
8730
+ "num_tokens": 79368192.0,
8731
+ "step": 9690
8732
+ },
8733
+ {
8734
+ "epoch": 2.92101498381146,
8735
+ "grad_norm": 0.9886929988861084,
8736
+ "learning_rate": 2.94445683693955e-07,
8737
+ "loss": 0.1229,
8738
+ "mean_token_accuracy": 0.7738380614668131,
8739
+ "num_tokens": 79450112.0,
8740
+ "step": 9700
8741
+ },
8742
+ {
8743
+ "epoch": 2.9240268052104508,
8744
+ "grad_norm": 1.2238775491714478,
8745
+ "learning_rate": 2.83292438099487e-07,
8746
+ "loss": 0.1047,
8747
+ "mean_token_accuracy": 0.788319468870759,
8748
+ "num_tokens": 79532032.0,
8749
+ "step": 9710
8750
+ },
8751
+ {
8752
+ "epoch": 2.927038626609442,
8753
+ "grad_norm": 1.005550742149353,
8754
+ "learning_rate": 2.7213919250501897e-07,
8755
+ "loss": 0.1215,
8756
+ "mean_token_accuracy": 0.7744006853550672,
8757
+ "num_tokens": 79613952.0,
8758
+ "step": 9720
8759
+ },
8760
+ {
8761
+ "epoch": 2.930050448008433,
8762
+ "grad_norm": 1.1485919952392578,
8763
+ "learning_rate": 2.60985946910551e-07,
8764
+ "loss": 0.1218,
8765
+ "mean_token_accuracy": 0.7930772993713617,
8766
+ "num_tokens": 79695872.0,
8767
+ "step": 9730
8768
+ },
8769
+ {
8770
+ "epoch": 2.933062269407424,
8771
+ "grad_norm": 1.2947425842285156,
8772
+ "learning_rate": 2.49832701316083e-07,
8773
+ "loss": 0.1003,
8774
+ "mean_token_accuracy": 0.7740215234458446,
8775
+ "num_tokens": 79777792.0,
8776
+ "step": 9740
8777
+ },
8778
+ {
8779
+ "epoch": 2.9360740908064153,
8780
+ "grad_norm": 1.7832204103469849,
8781
+ "learning_rate": 2.38679455721615e-07,
8782
+ "loss": 0.1055,
8783
+ "mean_token_accuracy": 0.7808341465890407,
8784
+ "num_tokens": 79859712.0,
8785
+ "step": 9750
8786
+ },
8787
+ {
8788
+ "epoch": 2.9390859122054063,
8789
+ "grad_norm": 1.1873085498809814,
8790
+ "learning_rate": 2.2752621012714705e-07,
8791
+ "loss": 0.1288,
8792
+ "mean_token_accuracy": 0.7696917787194252,
8793
+ "num_tokens": 79941632.0,
8794
+ "step": 9760
8795
+ },
8796
+ {
8797
+ "epoch": 2.942097733604397,
8798
+ "grad_norm": 1.529731035232544,
8799
+ "learning_rate": 2.1637296453267904e-07,
8800
+ "loss": 0.1186,
8801
+ "mean_token_accuracy": 0.791890898346901,
8802
+ "num_tokens": 80023552.0,
8803
+ "step": 9770
8804
+ },
8805
+ {
8806
+ "epoch": 2.9451095550033886,
8807
+ "grad_norm": 1.333554983139038,
8808
+ "learning_rate": 2.0521971893821103e-07,
8809
+ "loss": 0.1051,
8810
+ "mean_token_accuracy": 0.7844911962747574,
8811
+ "num_tokens": 80105472.0,
8812
+ "step": 9780
8813
+ },
8814
+ {
8815
+ "epoch": 2.9481213764023795,
8816
+ "grad_norm": 1.4663509130477905,
8817
+ "learning_rate": 1.9406647334374302e-07,
8818
+ "loss": 0.1041,
8819
+ "mean_token_accuracy": 0.7966854199767113,
8820
+ "num_tokens": 80187392.0,
8821
+ "step": 9790
8822
+ },
8823
+ {
8824
+ "epoch": 2.9511331978013704,
8825
+ "grad_norm": 1.002288579940796,
8826
+ "learning_rate": 1.8291322774927504e-07,
8827
+ "loss": 0.0909,
8828
+ "mean_token_accuracy": 0.7919275924563408,
8829
+ "num_tokens": 80269312.0,
8830
+ "step": 9800
8831
+ },
8832
+ {
8833
+ "epoch": 2.9541450192003613,
8834
+ "grad_norm": 1.2249246835708618,
8835
+ "learning_rate": 1.7175998215480706e-07,
8836
+ "loss": 0.0977,
8837
+ "mean_token_accuracy": 0.7821917802095413,
8838
+ "num_tokens": 80351232.0,
8839
+ "step": 9810
8840
+ },
8841
+ {
8842
+ "epoch": 2.9571568405993522,
8843
+ "grad_norm": 1.3539292812347412,
8844
+ "learning_rate": 1.6060673656033905e-07,
8845
+ "loss": 0.1107,
8846
+ "mean_token_accuracy": 0.7898727986961603,
8847
+ "num_tokens": 80433152.0,
8848
+ "step": 9820
8849
+ },
8850
+ {
8851
+ "epoch": 2.9601686619983436,
8852
+ "grad_norm": 1.2705157995224,
8853
+ "learning_rate": 1.4945349096587107e-07,
8854
+ "loss": 0.0916,
8855
+ "mean_token_accuracy": 0.8011252459138631,
8856
+ "num_tokens": 80515072.0,
8857
+ "step": 9830
8858
+ },
8859
+ {
8860
+ "epoch": 2.9631804833973345,
8861
+ "grad_norm": 1.3075294494628906,
8862
+ "learning_rate": 1.383002453714031e-07,
8863
+ "loss": 0.111,
8864
+ "mean_token_accuracy": 0.7639554768800736,
8865
+ "num_tokens": 80596992.0,
8866
+ "step": 9840
8867
+ },
8868
+ {
8869
+ "epoch": 2.9661923047963255,
8870
+ "grad_norm": 1.1203222274780273,
8871
+ "learning_rate": 1.271469997769351e-07,
8872
+ "loss": 0.1212,
8873
+ "mean_token_accuracy": 0.7945205442607403,
8874
+ "num_tokens": 80678912.0,
8875
+ "step": 9850
8876
+ },
8877
+ {
8878
+ "epoch": 2.969204126195317,
8879
+ "grad_norm": 1.466186285018921,
8880
+ "learning_rate": 1.1599375418246712e-07,
8881
+ "loss": 0.1254,
8882
+ "mean_token_accuracy": 0.7881849348545075,
8883
+ "num_tokens": 80760832.0,
8884
+ "step": 9860
8885
+ },
8886
+ {
8887
+ "epoch": 2.9722159475943077,
8888
+ "grad_norm": 1.363336205482483,
8889
+ "learning_rate": 1.0484050858799912e-07,
8890
+ "loss": 0.1191,
8891
+ "mean_token_accuracy": 0.7713184926658869,
8892
+ "num_tokens": 80842752.0,
8893
+ "step": 9870
8894
+ },
8895
+ {
8896
+ "epoch": 2.9752277689932987,
8897
+ "grad_norm": 0.9907705783843994,
8898
+ "learning_rate": 9.368726299353113e-08,
8899
+ "loss": 0.0936,
8900
+ "mean_token_accuracy": 0.794019079580903,
8901
+ "num_tokens": 80924672.0,
8902
+ "step": 9880
8903
+ },
8904
+ {
8905
+ "epoch": 2.9782395903922896,
8906
+ "grad_norm": 1.0468392372131348,
8907
+ "learning_rate": 8.253401739906312e-08,
8908
+ "loss": 0.1124,
8909
+ "mean_token_accuracy": 0.7831335622817278,
8910
+ "num_tokens": 81006592.0,
8911
+ "step": 9890
8912
+ },
8913
+ {
8914
+ "epoch": 2.9812514117912805,
8915
+ "grad_norm": 1.0683683156967163,
8916
+ "learning_rate": 7.138077180459515e-08,
8917
+ "loss": 0.0994,
8918
+ "mean_token_accuracy": 0.7868028357625008,
8919
+ "num_tokens": 81088512.0,
8920
+ "step": 9900
8921
+ },
8922
+ {
8923
+ "epoch": 2.984263233190272,
8924
+ "grad_norm": 1.7116000652313232,
8925
+ "learning_rate": 6.022752621012715e-08,
8926
+ "loss": 0.0879,
8927
+ "mean_token_accuracy": 0.7907044999301434,
8928
+ "num_tokens": 81170432.0,
8929
+ "step": 9910
8930
+ },
8931
+ {
8932
+ "epoch": 2.987275054589263,
8933
+ "grad_norm": 1.3669886589050293,
8934
+ "learning_rate": 4.9074280615659164e-08,
8935
+ "loss": 0.1216,
8936
+ "mean_token_accuracy": 0.7789016645401716,
8937
+ "num_tokens": 81252352.0,
8938
+ "step": 9920
8939
+ },
8940
+ {
8941
+ "epoch": 2.9902868759882537,
8942
+ "grad_norm": 0.9469903707504272,
8943
+ "learning_rate": 3.792103502119117e-08,
8944
+ "loss": 0.0938,
8945
+ "mean_token_accuracy": 0.7959637988358736,
8946
+ "num_tokens": 81334272.0,
8947
+ "step": 9930
8948
+ },
8949
+ {
8950
+ "epoch": 2.993298697387245,
8951
+ "grad_norm": 1.380719780921936,
8952
+ "learning_rate": 2.676778942672318e-08,
8953
+ "loss": 0.089,
8954
+ "mean_token_accuracy": 0.7933586109429598,
8955
+ "num_tokens": 81416192.0,
8956
+ "step": 9940
8957
+ },
8958
+ {
8959
+ "epoch": 2.996310518786236,
8960
+ "grad_norm": 1.1363697052001953,
8961
+ "learning_rate": 1.5614543832255188e-08,
8962
+ "loss": 0.1048,
8963
+ "mean_token_accuracy": 0.8045499000698328,
8964
+ "num_tokens": 81498112.0,
8965
+ "step": 9950
8966
+ },
8967
+ {
8968
+ "epoch": 2.999322340185227,
8969
+ "grad_norm": 1.5490316152572632,
8970
+ "learning_rate": 4.461298237787197e-09,
8971
+ "loss": 0.1019,
8972
+ "mean_token_accuracy": 0.790349805355072,
8973
+ "num_tokens": 81580032.0,
8974
+ "step": 9960
8975
  }
8976
  ],
8977
  "logging_steps": 10,
 
8986
  "should_evaluate": false,
8987
  "should_log": false,
8988
  "should_save": true,
8989
+ "should_training_stop": true
8990
  },
8991
  "attributes": {}
8992
  }
8993
  },
8994
+ "total_flos": 2.156488071095255e+17,
8995
  "train_batch_size": 2,
8996
  "trial_name": null,
8997
  "trial_params": null