shulijia commited on
Commit
d005165
·
verified ·
1 Parent(s): 0bb1a6a

Training in progress, step 10000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:070b5b7acfb870eafcd0bf40ce133115da39bd3236dee84b8493ea73e863aebf
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21e9c11e02543045a52d1d10e85b29deee320e577ed8c40299be1aac88002bab
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6986d123949d70dafc8db16862d29980777537d8be6a72c449522a071032d5c
3
  size 4768663315
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c87bdbbf96a91780aaf4a58c008036f2bfda78e91f3d428d63005f735fe1e0c
3
  size 4768663315
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40beb3dc5129ab4ac6babe96012ebdd87569ab488ea6742096d9d349a8d4cd73
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33e6b43d263edc3fb19dbc74c4a7ae9df523ccc7c2602c8a0c606ae6abf92007
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.6904804049987607,
6
  "eval_steps": 100,
7
- "global_step": 9500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8558,6 +8558,456 @@
8558
  "mean_token_accuracy": 0.7248899202793837,
8559
  "num_tokens": 77821952.0,
8560
  "step": 9500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8561
  }
8562
  ],
8563
  "logging_steps": 10,
@@ -8577,7 +9027,7 @@
8577
  "attributes": {}
8578
  }
8579
  },
8580
- "total_flos": 2.0566822331036467e+17,
8581
  "train_batch_size": 2,
8582
  "trial_name": null,
8583
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.8320883633660214,
6
  "eval_steps": 100,
7
+ "global_step": 10000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8558
  "mean_token_accuracy": 0.7248899202793837,
8559
  "num_tokens": 77821952.0,
8560
  "step": 9500
8561
+ },
8562
+ {
8563
+ "epoch": 2.693312564166106,
8564
+ "grad_norm": 1.6963791847229004,
8565
+ "learning_rate": 1.137102695898458e-06,
8566
+ "loss": 0.1448,
8567
+ "mean_token_accuracy": 0.7517612528055906,
8568
+ "num_tokens": 77903872.0,
8569
+ "step": 9510
8570
+ },
8571
+ {
8572
+ "epoch": 2.6961447233334512,
8573
+ "grad_norm": 1.5691133737564087,
8574
+ "learning_rate": 1.1266128186300221e-06,
8575
+ "loss": 0.1062,
8576
+ "mean_token_accuracy": 0.7805895309895277,
8577
+ "num_tokens": 77985792.0,
8578
+ "step": 9520
8579
+ },
8580
+ {
8581
+ "epoch": 2.6989768825007965,
8582
+ "grad_norm": 1.3455393314361572,
8583
+ "learning_rate": 1.1161229413615862e-06,
8584
+ "loss": 0.1181,
8585
+ "mean_token_accuracy": 0.7727250501513481,
8586
+ "num_tokens": 78067712.0,
8587
+ "step": 9530
8588
+ },
8589
+ {
8590
+ "epoch": 2.7018090416681417,
8591
+ "grad_norm": 1.7499293088912964,
8592
+ "learning_rate": 1.1056330640931503e-06,
8593
+ "loss": 0.1621,
8594
+ "mean_token_accuracy": 0.7321550864726305,
8595
+ "num_tokens": 78149632.0,
8596
+ "step": 9540
8597
+ },
8598
+ {
8599
+ "epoch": 2.704641200835487,
8600
+ "grad_norm": 1.078167200088501,
8601
+ "learning_rate": 1.0951431868247141e-06,
8602
+ "loss": 0.1142,
8603
+ "mean_token_accuracy": 0.7804916825145483,
8604
+ "num_tokens": 78231552.0,
8605
+ "step": 9550
8606
+ },
8607
+ {
8608
+ "epoch": 2.7074733600028322,
8609
+ "grad_norm": 1.411314845085144,
8610
+ "learning_rate": 1.0846533095562784e-06,
8611
+ "loss": 0.1143,
8612
+ "mean_token_accuracy": 0.7715141884982586,
8613
+ "num_tokens": 78313472.0,
8614
+ "step": 9560
8615
+ },
8616
+ {
8617
+ "epoch": 2.7103055191701775,
8618
+ "grad_norm": 1.734834909439087,
8619
+ "learning_rate": 1.0741634322878423e-06,
8620
+ "loss": 0.1425,
8621
+ "mean_token_accuracy": 0.7414505925029516,
8622
+ "num_tokens": 78395392.0,
8623
+ "step": 9570
8624
+ },
8625
+ {
8626
+ "epoch": 2.7131376783375227,
8627
+ "grad_norm": 1.7494261264801025,
8628
+ "learning_rate": 1.0636735550194063e-06,
8629
+ "loss": 0.1213,
8630
+ "mean_token_accuracy": 0.759222112223506,
8631
+ "num_tokens": 78477312.0,
8632
+ "step": 9580
8633
+ },
8634
+ {
8635
+ "epoch": 2.7159698375048675,
8636
+ "grad_norm": 1.2814098596572876,
8637
+ "learning_rate": 1.0531836777509704e-06,
8638
+ "loss": 0.1335,
8639
+ "mean_token_accuracy": 0.7782045040279627,
8640
+ "num_tokens": 78559232.0,
8641
+ "step": 9590
8642
+ },
8643
+ {
8644
+ "epoch": 2.7188019966722132,
8645
+ "grad_norm": 1.2416023015975952,
8646
+ "learning_rate": 1.0426938004825345e-06,
8647
+ "loss": 0.131,
8648
+ "mean_token_accuracy": 0.7627446163445711,
8649
+ "num_tokens": 78641152.0,
8650
+ "step": 9600
8651
+ },
8652
+ {
8653
+ "epoch": 2.721634155839558,
8654
+ "grad_norm": 1.2916755676269531,
8655
+ "learning_rate": 1.0322039232140984e-06,
8656
+ "loss": 0.1292,
8657
+ "mean_token_accuracy": 0.7665728945285082,
8658
+ "num_tokens": 78723072.0,
8659
+ "step": 9610
8660
+ },
8661
+ {
8662
+ "epoch": 2.7244663150069033,
8663
+ "grad_norm": 0.9685536026954651,
8664
+ "learning_rate": 1.0217140459456624e-06,
8665
+ "loss": 0.0966,
8666
+ "mean_token_accuracy": 0.7933586113154888,
8667
+ "num_tokens": 78804992.0,
8668
+ "step": 9620
8669
+ },
8670
+ {
8671
+ "epoch": 2.7272984741742485,
8672
+ "grad_norm": 1.0701133012771606,
8673
+ "learning_rate": 1.0112241686772265e-06,
8674
+ "loss": 0.0929,
8675
+ "mean_token_accuracy": 0.768480920419097,
8676
+ "num_tokens": 78886912.0,
8677
+ "step": 9630
8678
+ },
8679
+ {
8680
+ "epoch": 2.730130633341594,
8681
+ "grad_norm": 1.155450701713562,
8682
+ "learning_rate": 1.0007342914087906e-06,
8683
+ "loss": 0.1217,
8684
+ "mean_token_accuracy": 0.7712084148079157,
8685
+ "num_tokens": 78968832.0,
8686
+ "step": 9640
8687
+ },
8688
+ {
8689
+ "epoch": 2.732962792508939,
8690
+ "grad_norm": 1.2108891010284424,
8691
+ "learning_rate": 9.902444141403547e-07,
8692
+ "loss": 0.1269,
8693
+ "mean_token_accuracy": 0.759784734621644,
8694
+ "num_tokens": 79050752.0,
8695
+ "step": 9650
8696
+ },
8697
+ {
8698
+ "epoch": 2.7357949516762843,
8699
+ "grad_norm": 1.3404109477996826,
8700
+ "learning_rate": 9.797545368719187e-07,
8701
+ "loss": 0.115,
8702
+ "mean_token_accuracy": 0.7742294497787953,
8703
+ "num_tokens": 79132672.0,
8704
+ "step": 9660
8705
+ },
8706
+ {
8707
+ "epoch": 2.7386271108436295,
8708
+ "grad_norm": 0.9352473616600037,
8709
+ "learning_rate": 9.692646596034828e-07,
8710
+ "loss": 0.1153,
8711
+ "mean_token_accuracy": 0.7558341480791568,
8712
+ "num_tokens": 79214592.0,
8713
+ "step": 9670
8714
+ },
8715
+ {
8716
+ "epoch": 2.741459270010975,
8717
+ "grad_norm": 1.2585588693618774,
8718
+ "learning_rate": 9.587747823350467e-07,
8719
+ "loss": 0.1447,
8720
+ "mean_token_accuracy": 0.7387475546449422,
8721
+ "num_tokens": 79296512.0,
8722
+ "step": 9680
8723
+ },
8724
+ {
8725
+ "epoch": 2.74429142917832,
8726
+ "grad_norm": 1.4785575866699219,
8727
+ "learning_rate": 9.482849050666109e-07,
8728
+ "loss": 0.1194,
8729
+ "mean_token_accuracy": 0.7637353233993054,
8730
+ "num_tokens": 79378432.0,
8731
+ "step": 9690
8732
+ },
8733
+ {
8734
+ "epoch": 2.747123588345665,
8735
+ "grad_norm": 0.9869931936264038,
8736
+ "learning_rate": 9.377950277981748e-07,
8737
+ "loss": 0.1237,
8738
+ "mean_token_accuracy": 0.7830968666821718,
8739
+ "num_tokens": 79460352.0,
8740
+ "step": 9700
8741
+ },
8742
+ {
8743
+ "epoch": 2.74995574751301,
8744
+ "grad_norm": 1.2523363828659058,
8745
+ "learning_rate": 9.273051505297388e-07,
8746
+ "loss": 0.1295,
8747
+ "mean_token_accuracy": 0.7593199610710144,
8748
+ "num_tokens": 79542272.0,
8749
+ "step": 9710
8750
+ },
8751
+ {
8752
+ "epoch": 2.7527879066803553,
8753
+ "grad_norm": 1.2600061893463135,
8754
+ "learning_rate": 9.16815273261303e-07,
8755
+ "loss": 0.1209,
8756
+ "mean_token_accuracy": 0.7813111554831267,
8757
+ "num_tokens": 79624192.0,
8758
+ "step": 9720
8759
+ },
8760
+ {
8761
+ "epoch": 2.7556200658477006,
8762
+ "grad_norm": 0.9577277898788452,
8763
+ "learning_rate": 9.063253959928669e-07,
8764
+ "loss": 0.1156,
8765
+ "mean_token_accuracy": 0.7740337550640106,
8766
+ "num_tokens": 79706112.0,
8767
+ "step": 9730
8768
+ },
8769
+ {
8770
+ "epoch": 2.758452225015046,
8771
+ "grad_norm": 1.1340205669403076,
8772
+ "learning_rate": 8.958355187244309e-07,
8773
+ "loss": 0.1038,
8774
+ "mean_token_accuracy": 0.7865215256810189,
8775
+ "num_tokens": 79788032.0,
8776
+ "step": 9740
8777
+ },
8778
+ {
8779
+ "epoch": 2.761284384182391,
8780
+ "grad_norm": 1.5387784242630005,
8781
+ "learning_rate": 8.853456414559951e-07,
8782
+ "loss": 0.1328,
8783
+ "mean_token_accuracy": 0.7656678043305873,
8784
+ "num_tokens": 79869952.0,
8785
+ "step": 9750
8786
+ },
8787
+ {
8788
+ "epoch": 2.7641165433497363,
8789
+ "grad_norm": 1.7430437803268433,
8790
+ "learning_rate": 8.748557641875591e-07,
8791
+ "loss": 0.095,
8792
+ "mean_token_accuracy": 0.7949119359254837,
8793
+ "num_tokens": 79951872.0,
8794
+ "step": 9760
8795
+ },
8796
+ {
8797
+ "epoch": 2.7669487025170816,
8798
+ "grad_norm": 1.7460997104644775,
8799
+ "learning_rate": 8.64365886919123e-07,
8800
+ "loss": 0.1196,
8801
+ "mean_token_accuracy": 0.775464779511094,
8802
+ "num_tokens": 80033792.0,
8803
+ "step": 9770
8804
+ },
8805
+ {
8806
+ "epoch": 2.769780861684427,
8807
+ "grad_norm": 1.1114528179168701,
8808
+ "learning_rate": 8.538760096506872e-07,
8809
+ "loss": 0.1293,
8810
+ "mean_token_accuracy": 0.7551736798137426,
8811
+ "num_tokens": 80115712.0,
8812
+ "step": 9780
8813
+ },
8814
+ {
8815
+ "epoch": 2.7726130208517716,
8816
+ "grad_norm": 1.3568215370178223,
8817
+ "learning_rate": 8.433861323822512e-07,
8818
+ "loss": 0.0965,
8819
+ "mean_token_accuracy": 0.7976272024214268,
8820
+ "num_tokens": 80197632.0,
8821
+ "step": 9790
8822
+ },
8823
+ {
8824
+ "epoch": 2.7754451800191173,
8825
+ "grad_norm": 1.039504885673523,
8826
+ "learning_rate": 8.328962551138151e-07,
8827
+ "loss": 0.1181,
8828
+ "mean_token_accuracy": 0.7570694729685783,
8829
+ "num_tokens": 80279552.0,
8830
+ "step": 9800
8831
+ },
8832
+ {
8833
+ "epoch": 2.778277339186462,
8834
+ "grad_norm": 0.9073276519775391,
8835
+ "learning_rate": 8.224063778453793e-07,
8836
+ "loss": 0.1327,
8837
+ "mean_token_accuracy": 0.7564946163445712,
8838
+ "num_tokens": 80361472.0,
8839
+ "step": 9810
8840
+ },
8841
+ {
8842
+ "epoch": 2.7811094983538074,
8843
+ "grad_norm": 2.061521291732788,
8844
+ "learning_rate": 8.119165005769433e-07,
8845
+ "loss": 0.1195,
8846
+ "mean_token_accuracy": 0.7596379648894072,
8847
+ "num_tokens": 80443392.0,
8848
+ "step": 9820
8849
+ },
8850
+ {
8851
+ "epoch": 2.7839416575211526,
8852
+ "grad_norm": 1.21349036693573,
8853
+ "learning_rate": 8.014266233085073e-07,
8854
+ "loss": 0.126,
8855
+ "mean_token_accuracy": 0.7478473570197821,
8856
+ "num_tokens": 80525312.0,
8857
+ "step": 9830
8858
+ },
8859
+ {
8860
+ "epoch": 2.786773816688498,
8861
+ "grad_norm": 1.4586316347122192,
8862
+ "learning_rate": 7.909367460400715e-07,
8863
+ "loss": 0.1223,
8864
+ "mean_token_accuracy": 0.770731408149004,
8865
+ "num_tokens": 80607232.0,
8866
+ "step": 9840
8867
+ },
8868
+ {
8869
+ "epoch": 2.789605975855843,
8870
+ "grad_norm": 1.3496206998825073,
8871
+ "learning_rate": 7.804468687716354e-07,
8872
+ "loss": 0.1015,
8873
+ "mean_token_accuracy": 0.7758072383701802,
8874
+ "num_tokens": 80689152.0,
8875
+ "step": 9850
8876
+ },
8877
+ {
8878
+ "epoch": 2.7924381350231884,
8879
+ "grad_norm": 1.2071694135665894,
8880
+ "learning_rate": 7.699569915031994e-07,
8881
+ "loss": 0.1146,
8882
+ "mean_token_accuracy": 0.774987768009305,
8883
+ "num_tokens": 80771072.0,
8884
+ "step": 9860
8885
+ },
8886
+ {
8887
+ "epoch": 2.7952702941905336,
8888
+ "grad_norm": 1.2012773752212524,
8889
+ "learning_rate": 7.594671142347636e-07,
8890
+ "loss": 0.1262,
8891
+ "mean_token_accuracy": 0.7779476504772902,
8892
+ "num_tokens": 80852992.0,
8893
+ "step": 9870
8894
+ },
8895
+ {
8896
+ "epoch": 2.7981024533578784,
8897
+ "grad_norm": 1.2166376113891602,
8898
+ "learning_rate": 7.489772369663275e-07,
8899
+ "loss": 0.1141,
8900
+ "mean_token_accuracy": 0.779562134295702,
8901
+ "num_tokens": 80934912.0,
8902
+ "step": 9880
8903
+ },
8904
+ {
8905
+ "epoch": 2.800934612525224,
8906
+ "grad_norm": 1.269511103630066,
8907
+ "learning_rate": 7.384873596978916e-07,
8908
+ "loss": 0.1247,
8909
+ "mean_token_accuracy": 0.7734099797904491,
8910
+ "num_tokens": 81016832.0,
8911
+ "step": 9890
8912
+ },
8913
+ {
8914
+ "epoch": 2.803766771692569,
8915
+ "grad_norm": 1.0128493309020996,
8916
+ "learning_rate": 7.279974824294557e-07,
8917
+ "loss": 0.1261,
8918
+ "mean_token_accuracy": 0.7762353252619505,
8919
+ "num_tokens": 81098752.0,
8920
+ "step": 9900
8921
+ },
8922
+ {
8923
+ "epoch": 2.806598930859914,
8924
+ "grad_norm": 1.538405179977417,
8925
+ "learning_rate": 7.175076051610197e-07,
8926
+ "loss": 0.1291,
8927
+ "mean_token_accuracy": 0.7826198644936084,
8928
+ "num_tokens": 81180672.0,
8929
+ "step": 9910
8930
+ },
8931
+ {
8932
+ "epoch": 2.8094310900272594,
8933
+ "grad_norm": 1.5747365951538086,
8934
+ "learning_rate": 7.070177278925837e-07,
8935
+ "loss": 0.1306,
8936
+ "mean_token_accuracy": 0.776382091268897,
8937
+ "num_tokens": 81262592.0,
8938
+ "step": 9920
8939
+ },
8940
+ {
8941
+ "epoch": 2.8122632491946047,
8942
+ "grad_norm": 1.071977972984314,
8943
+ "learning_rate": 6.965278506241478e-07,
8944
+ "loss": 0.1108,
8945
+ "mean_token_accuracy": 0.7729818969964981,
8946
+ "num_tokens": 81344512.0,
8947
+ "step": 9930
8948
+ },
8949
+ {
8950
+ "epoch": 2.81509540836195,
8951
+ "grad_norm": 1.172013282775879,
8952
+ "learning_rate": 6.860379733557118e-07,
8953
+ "loss": 0.1414,
8954
+ "mean_token_accuracy": 0.7524706482887268,
8955
+ "num_tokens": 81426432.0,
8956
+ "step": 9940
8957
+ },
8958
+ {
8959
+ "epoch": 2.817927567529295,
8960
+ "grad_norm": 1.3133201599121094,
8961
+ "learning_rate": 6.755480960872759e-07,
8962
+ "loss": 0.1192,
8963
+ "mean_token_accuracy": 0.7713796466588974,
8964
+ "num_tokens": 81508352.0,
8965
+ "step": 9950
8966
+ },
8967
+ {
8968
+ "epoch": 2.8207597266966404,
8969
+ "grad_norm": 1.6226385831832886,
8970
+ "learning_rate": 6.650582188188398e-07,
8971
+ "loss": 0.1244,
8972
+ "mean_token_accuracy": 0.7702299427241087,
8973
+ "num_tokens": 81590272.0,
8974
+ "step": 9960
8975
+ },
8976
+ {
8977
+ "epoch": 2.8235918858639857,
8978
+ "grad_norm": 1.5946696996688843,
8979
+ "learning_rate": 6.545683415504039e-07,
8980
+ "loss": 0.1186,
8981
+ "mean_token_accuracy": 0.7681873787194491,
8982
+ "num_tokens": 81672192.0,
8983
+ "step": 9970
8984
+ },
8985
+ {
8986
+ "epoch": 2.826424045031331,
8987
+ "grad_norm": 1.3367503881454468,
8988
+ "learning_rate": 6.44078464281968e-07,
8989
+ "loss": 0.129,
8990
+ "mean_token_accuracy": 0.7627201572060585,
8991
+ "num_tokens": 81754112.0,
8992
+ "step": 9980
8993
+ },
8994
+ {
8995
+ "epoch": 2.8292562041986757,
8996
+ "grad_norm": 1.6041656732559204,
8997
+ "learning_rate": 6.335885870135319e-07,
8998
+ "loss": 0.0955,
8999
+ "mean_token_accuracy": 0.7912304297089576,
9000
+ "num_tokens": 81836032.0,
9001
+ "step": 9990
9002
+ },
9003
+ {
9004
+ "epoch": 2.8320883633660214,
9005
+ "grad_norm": 1.589345097541809,
9006
+ "learning_rate": 6.23098709745096e-07,
9007
+ "loss": 0.1391,
9008
+ "mean_token_accuracy": 0.7658023487776517,
9009
+ "num_tokens": 81917952.0,
9010
+ "step": 10000
9011
  }
9012
  ],
9013
  "logging_steps": 10,
 
9027
  "attributes": {}
9028
  }
9029
  },
9030
+ "total_flos": 2.1649315150902067e+17,
9031
  "train_batch_size": 2,
9032
  "trial_name": null,
9033
  "trial_params": null