minpeter commited on
Commit
ccda333
·
verified ·
1 Parent(s): 6a84eef

Training in progress, step 900, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd0e106749ec154eecd3ebb9fe7474cf3444291df427cce5d1d61cb2679e8088
3
  size 373077376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9aebe450af3dcf6059e1b801cdd5a67740a0c029d7ba82ba825b46c2d795a19e
3
  size 373077376
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e6982c3f836b1c5917d64c3c2c07418fb1042a731f48fa53830cf50384b985a7
3
  size 422377675
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:824987b0f600e49cd5f52186eb43c3ac0c51f149db79c1e96ef00960b794dcc3
3
  size 422377675
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2933fa623da5d83a2ffe4eddaad982ac15c82f5c890445e228adf894e89f9290
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afa414c61f15afda9f56c15a8085d690966f41ac14b2bf137cdb676429126c8c
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4958966c61d8eed22eb0bdc6e0a1efc61ae912a801cb91fc7888b1951205081b
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e6fb65c7e63adbfd3a5e44825c99a68e4df715cad837906f505fdf50341150c
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.8307372793354102,
6
  "eval_steps": 100,
7
- "global_step": 800,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5672,6 +5672,714 @@
5672
  "eval_samples_per_second": 9.727,
5673
  "eval_steps_per_second": 1.216,
5674
  "step": 800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5675
  }
5676
  ],
5677
  "logging_steps": 1,
@@ -5691,7 +6399,7 @@
5691
  "attributes": {}
5692
  }
5693
  },
5694
- "total_flos": 1.018894554759168e+17,
5695
  "train_batch_size": 16,
5696
  "trial_name": null,
5697
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9345794392523364,
6
  "eval_steps": 100,
7
+ "global_step": 900,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5672
  "eval_samples_per_second": 9.727,
5673
  "eval_steps_per_second": 1.216,
5674
  "step": 800
5675
+ },
5676
+ {
5677
+ "epoch": 0.8317757009345794,
5678
+ "grad_norm": 1.3515625,
5679
+ "learning_rate": 7.644208353416704e-05,
5680
+ "loss": 6.1918,
5681
+ "step": 801
5682
+ },
5683
+ {
5684
+ "epoch": 0.8328141225337488,
5685
+ "grad_norm": 2.328125,
5686
+ "learning_rate": 7.553131143566822e-05,
5687
+ "loss": 6.2873,
5688
+ "step": 802
5689
+ },
5690
+ {
5691
+ "epoch": 0.833852544132918,
5692
+ "grad_norm": 1.390625,
5693
+ "learning_rate": 7.462555412526062e-05,
5694
+ "loss": 6.1316,
5695
+ "step": 803
5696
+ },
5697
+ {
5698
+ "epoch": 0.8348909657320872,
5699
+ "grad_norm": 2.0,
5700
+ "learning_rate": 7.372482230380656e-05,
5701
+ "loss": 6.2054,
5702
+ "step": 804
5703
+ },
5704
+ {
5705
+ "epoch": 0.8359293873312564,
5706
+ "grad_norm": 1.7578125,
5707
+ "learning_rate": 7.282912661279584e-05,
5708
+ "loss": 6.1334,
5709
+ "step": 805
5710
+ },
5711
+ {
5712
+ "epoch": 0.8369678089304258,
5713
+ "grad_norm": 1.328125,
5714
+ "learning_rate": 7.19384776342199e-05,
5715
+ "loss": 6.1515,
5716
+ "step": 806
5717
+ },
5718
+ {
5719
+ "epoch": 0.838006230529595,
5720
+ "grad_norm": 1.6328125,
5721
+ "learning_rate": 7.105288589044723e-05,
5722
+ "loss": 6.044,
5723
+ "step": 807
5724
+ },
5725
+ {
5726
+ "epoch": 0.8390446521287642,
5727
+ "grad_norm": 1.265625,
5728
+ "learning_rate": 7.017236184409858e-05,
5729
+ "loss": 6.4329,
5730
+ "step": 808
5731
+ },
5732
+ {
5733
+ "epoch": 0.8400830737279336,
5734
+ "grad_norm": 1.203125,
5735
+ "learning_rate": 6.929691589792358e-05,
5736
+ "loss": 6.2727,
5737
+ "step": 809
5738
+ },
5739
+ {
5740
+ "epoch": 0.8411214953271028,
5741
+ "grad_norm": 1.6015625,
5742
+ "learning_rate": 6.842655839467788e-05,
5743
+ "loss": 6.1676,
5744
+ "step": 810
5745
+ },
5746
+ {
5747
+ "epoch": 0.842159916926272,
5748
+ "grad_norm": 1.546875,
5749
+ "learning_rate": 6.756129961700075e-05,
5750
+ "loss": 5.9781,
5751
+ "step": 811
5752
+ },
5753
+ {
5754
+ "epoch": 0.8431983385254413,
5755
+ "grad_norm": 1.8515625,
5756
+ "learning_rate": 6.670114978729391e-05,
5757
+ "loss": 5.9659,
5758
+ "step": 812
5759
+ },
5760
+ {
5761
+ "epoch": 0.8442367601246106,
5762
+ "grad_norm": 1.6640625,
5763
+ "learning_rate": 6.584611906760035e-05,
5764
+ "loss": 5.9971,
5765
+ "step": 813
5766
+ },
5767
+ {
5768
+ "epoch": 0.8452751817237798,
5769
+ "grad_norm": 1.7734375,
5770
+ "learning_rate": 6.499621755948487e-05,
5771
+ "loss": 6.3746,
5772
+ "step": 814
5773
+ },
5774
+ {
5775
+ "epoch": 0.8463136033229491,
5776
+ "grad_norm": 1.296875,
5777
+ "learning_rate": 6.415145530391403e-05,
5778
+ "loss": 5.9012,
5779
+ "step": 815
5780
+ },
5781
+ {
5782
+ "epoch": 0.8473520249221184,
5783
+ "grad_norm": 1.15625,
5784
+ "learning_rate": 6.331184228113802e-05,
5785
+ "loss": 6.3141,
5786
+ "step": 816
5787
+ },
5788
+ {
5789
+ "epoch": 0.8483904465212876,
5790
+ "grad_norm": 1.5078125,
5791
+ "learning_rate": 6.247738841057255e-05,
5792
+ "loss": 6.2155,
5793
+ "step": 817
5794
+ },
5795
+ {
5796
+ "epoch": 0.8494288681204569,
5797
+ "grad_norm": 1.65625,
5798
+ "learning_rate": 6.164810355068179e-05,
5799
+ "loss": 6.1955,
5800
+ "step": 818
5801
+ },
5802
+ {
5803
+ "epoch": 0.8504672897196262,
5804
+ "grad_norm": 1.515625,
5805
+ "learning_rate": 6.082399749886169e-05,
5806
+ "loss": 6.3145,
5807
+ "step": 819
5808
+ },
5809
+ {
5810
+ "epoch": 0.8515057113187954,
5811
+ "grad_norm": 1.3984375,
5812
+ "learning_rate": 6.000507999132443e-05,
5813
+ "loss": 6.173,
5814
+ "step": 820
5815
+ },
5816
+ {
5817
+ "epoch": 0.8525441329179647,
5818
+ "grad_norm": 1.484375,
5819
+ "learning_rate": 5.919136070298342e-05,
5820
+ "loss": 6.0277,
5821
+ "step": 821
5822
+ },
5823
+ {
5824
+ "epoch": 0.8535825545171339,
5825
+ "grad_norm": 1.4609375,
5826
+ "learning_rate": 5.838284924733866e-05,
5827
+ "loss": 6.163,
5828
+ "step": 822
5829
+ },
5830
+ {
5831
+ "epoch": 0.8546209761163032,
5832
+ "grad_norm": 1.390625,
5833
+ "learning_rate": 5.7579555176363654e-05,
5834
+ "loss": 6.324,
5835
+ "step": 823
5836
+ },
5837
+ {
5838
+ "epoch": 0.8556593977154725,
5839
+ "grad_norm": 1.5390625,
5840
+ "learning_rate": 5.678148798039212e-05,
5841
+ "loss": 6.3801,
5842
+ "step": 824
5843
+ },
5844
+ {
5845
+ "epoch": 0.8566978193146417,
5846
+ "grad_norm": 1.5625,
5847
+ "learning_rate": 5.598865708800616e-05,
5848
+ "loss": 5.8392,
5849
+ "step": 825
5850
+ },
5851
+ {
5852
+ "epoch": 0.857736240913811,
5853
+ "grad_norm": 1.609375,
5854
+ "learning_rate": 5.520107186592477e-05,
5855
+ "loss": 6.3217,
5856
+ "step": 826
5857
+ },
5858
+ {
5859
+ "epoch": 0.8587746625129803,
5860
+ "grad_norm": 1.4609375,
5861
+ "learning_rate": 5.441874161889304e-05,
5862
+ "loss": 6.2988,
5863
+ "step": 827
5864
+ },
5865
+ {
5866
+ "epoch": 0.8598130841121495,
5867
+ "grad_norm": 1.328125,
5868
+ "learning_rate": 5.364167558957267e-05,
5869
+ "loss": 6.3745,
5870
+ "step": 828
5871
+ },
5872
+ {
5873
+ "epoch": 0.8608515057113187,
5874
+ "grad_norm": 1.296875,
5875
+ "learning_rate": 5.286988295843215e-05,
5876
+ "loss": 6.2074,
5877
+ "step": 829
5878
+ },
5879
+ {
5880
+ "epoch": 0.8618899273104881,
5881
+ "grad_norm": 1.4296875,
5882
+ "learning_rate": 5.2103372843638754e-05,
5883
+ "loss": 6.2226,
5884
+ "step": 830
5885
+ },
5886
+ {
5887
+ "epoch": 0.8629283489096573,
5888
+ "grad_norm": 1.59375,
5889
+ "learning_rate": 5.134215430095068e-05,
5890
+ "loss": 5.8549,
5891
+ "step": 831
5892
+ },
5893
+ {
5894
+ "epoch": 0.8639667705088265,
5895
+ "grad_norm": 1.71875,
5896
+ "learning_rate": 5.0586236323610034e-05,
5897
+ "loss": 6.3173,
5898
+ "step": 832
5899
+ },
5900
+ {
5901
+ "epoch": 0.8650051921079959,
5902
+ "grad_norm": 1.234375,
5903
+ "learning_rate": 4.983562784223644e-05,
5904
+ "loss": 6.2721,
5905
+ "step": 833
5906
+ },
5907
+ {
5908
+ "epoch": 0.8660436137071651,
5909
+ "grad_norm": 1.40625,
5910
+ "learning_rate": 4.909033772472205e-05,
5911
+ "loss": 6.1832,
5912
+ "step": 834
5913
+ },
5914
+ {
5915
+ "epoch": 0.8670820353063343,
5916
+ "grad_norm": 1.6640625,
5917
+ "learning_rate": 4.835037477612619e-05,
5918
+ "loss": 6.2962,
5919
+ "step": 835
5920
+ },
5921
+ {
5922
+ "epoch": 0.8681204569055037,
5923
+ "grad_norm": 1.5625,
5924
+ "learning_rate": 4.761574773857163e-05,
5925
+ "loss": 6.3508,
5926
+ "step": 836
5927
+ },
5928
+ {
5929
+ "epoch": 0.8691588785046729,
5930
+ "grad_norm": 1.5859375,
5931
+ "learning_rate": 4.688646529114121e-05,
5932
+ "loss": 6.0014,
5933
+ "step": 837
5934
+ },
5935
+ {
5936
+ "epoch": 0.8701973001038421,
5937
+ "grad_norm": 1.5,
5938
+ "learning_rate": 4.6162536049775385e-05,
5939
+ "loss": 5.8404,
5940
+ "step": 838
5941
+ },
5942
+ {
5943
+ "epoch": 0.8712357217030114,
5944
+ "grad_norm": 1.359375,
5945
+ "learning_rate": 4.5443968567170314e-05,
5946
+ "loss": 5.7522,
5947
+ "step": 839
5948
+ },
5949
+ {
5950
+ "epoch": 0.8722741433021807,
5951
+ "grad_norm": 1.2890625,
5952
+ "learning_rate": 4.4730771332676835e-05,
5953
+ "loss": 6.0892,
5954
+ "step": 840
5955
+ },
5956
+ {
5957
+ "epoch": 0.8733125649013499,
5958
+ "grad_norm": 1.328125,
5959
+ "learning_rate": 4.402295277220048e-05,
5960
+ "loss": 6.2618,
5961
+ "step": 841
5962
+ },
5963
+ {
5964
+ "epoch": 0.8743509865005192,
5965
+ "grad_norm": 1.5390625,
5966
+ "learning_rate": 4.3320521248101484e-05,
5967
+ "loss": 5.9419,
5968
+ "step": 842
5969
+ },
5970
+ {
5971
+ "epoch": 0.8753894080996885,
5972
+ "grad_norm": 1.75,
5973
+ "learning_rate": 4.262348505909608e-05,
5974
+ "loss": 5.9647,
5975
+ "step": 843
5976
+ },
5977
+ {
5978
+ "epoch": 0.8764278296988577,
5979
+ "grad_norm": 1.703125,
5980
+ "learning_rate": 4.1931852440158794e-05,
5981
+ "loss": 6.2304,
5982
+ "step": 844
5983
+ },
5984
+ {
5985
+ "epoch": 0.877466251298027,
5986
+ "grad_norm": 1.9765625,
5987
+ "learning_rate": 4.124563156242467e-05,
5988
+ "loss": 6.3681,
5989
+ "step": 845
5990
+ },
5991
+ {
5992
+ "epoch": 0.8785046728971962,
5993
+ "grad_norm": 1.3828125,
5994
+ "learning_rate": 4.056483053309301e-05,
5995
+ "loss": 6.222,
5996
+ "step": 846
5997
+ },
5998
+ {
5999
+ "epoch": 0.8795430944963655,
6000
+ "grad_norm": 1.625,
6001
+ "learning_rate": 3.988945739533173e-05,
6002
+ "loss": 6.1925,
6003
+ "step": 847
6004
+ },
6005
+ {
6006
+ "epoch": 0.8805815160955348,
6007
+ "grad_norm": 1.4453125,
6008
+ "learning_rate": 3.9219520128182086e-05,
6009
+ "loss": 6.0481,
6010
+ "step": 848
6011
+ },
6012
+ {
6013
+ "epoch": 0.881619937694704,
6014
+ "grad_norm": 1.3203125,
6015
+ "learning_rate": 3.855502664646443e-05,
6016
+ "loss": 6.1119,
6017
+ "step": 849
6018
+ },
6019
+ {
6020
+ "epoch": 0.8826583592938733,
6021
+ "grad_norm": 1.40625,
6022
+ "learning_rate": 3.789598480068479e-05,
6023
+ "loss": 6.2874,
6024
+ "step": 850
6025
+ },
6026
+ {
6027
+ "epoch": 0.8836967808930426,
6028
+ "grad_norm": 1.453125,
6029
+ "learning_rate": 3.7242402376942096e-05,
6030
+ "loss": 6.1814,
6031
+ "step": 851
6032
+ },
6033
+ {
6034
+ "epoch": 0.8847352024922118,
6035
+ "grad_norm": 1.7734375,
6036
+ "learning_rate": 3.659428709683621e-05,
6037
+ "loss": 6.3491,
6038
+ "step": 852
6039
+ },
6040
+ {
6041
+ "epoch": 0.885773624091381,
6042
+ "grad_norm": 1.171875,
6043
+ "learning_rate": 3.59516466173766e-05,
6044
+ "loss": 6.186,
6045
+ "step": 853
6046
+ },
6047
+ {
6048
+ "epoch": 0.8868120456905504,
6049
+ "grad_norm": 1.390625,
6050
+ "learning_rate": 3.531448853089192e-05,
6051
+ "loss": 6.1761,
6052
+ "step": 854
6053
+ },
6054
+ {
6055
+ "epoch": 0.8878504672897196,
6056
+ "grad_norm": 1.6640625,
6057
+ "learning_rate": 3.4682820364940636e-05,
6058
+ "loss": 6.235,
6059
+ "step": 855
6060
+ },
6061
+ {
6062
+ "epoch": 0.8888888888888888,
6063
+ "grad_norm": 1.3046875,
6064
+ "learning_rate": 3.40566495822216e-05,
6065
+ "loss": 6.3703,
6066
+ "step": 856
6067
+ },
6068
+ {
6069
+ "epoch": 0.8899273104880582,
6070
+ "grad_norm": 1.53125,
6071
+ "learning_rate": 3.343598358048594e-05,
6072
+ "loss": 6.2765,
6073
+ "step": 857
6074
+ },
6075
+ {
6076
+ "epoch": 0.8909657320872274,
6077
+ "grad_norm": 1.703125,
6078
+ "learning_rate": 3.2820829692449985e-05,
6079
+ "loss": 6.4258,
6080
+ "step": 858
6081
+ },
6082
+ {
6083
+ "epoch": 0.8920041536863966,
6084
+ "grad_norm": 1.6484375,
6085
+ "learning_rate": 3.221119518570848e-05,
6086
+ "loss": 5.9581,
6087
+ "step": 859
6088
+ },
6089
+ {
6090
+ "epoch": 0.893042575285566,
6091
+ "grad_norm": 1.609375,
6092
+ "learning_rate": 3.160708726264855e-05,
6093
+ "loss": 6.1848,
6094
+ "step": 860
6095
+ },
6096
+ {
6097
+ "epoch": 0.8940809968847352,
6098
+ "grad_norm": 1.640625,
6099
+ "learning_rate": 3.100851306036512e-05,
6100
+ "loss": 6.2575,
6101
+ "step": 861
6102
+ },
6103
+ {
6104
+ "epoch": 0.8951194184839044,
6105
+ "grad_norm": 1.5859375,
6106
+ "learning_rate": 3.0415479650575784e-05,
6107
+ "loss": 5.9909,
6108
+ "step": 862
6109
+ },
6110
+ {
6111
+ "epoch": 0.8961578400830738,
6112
+ "grad_norm": 1.3515625,
6113
+ "learning_rate": 2.982799403953801e-05,
6114
+ "loss": 5.9893,
6115
+ "step": 863
6116
+ },
6117
+ {
6118
+ "epoch": 0.897196261682243,
6119
+ "grad_norm": 1.4921875,
6120
+ "learning_rate": 2.9246063167965962e-05,
6121
+ "loss": 6.1746,
6122
+ "step": 864
6123
+ },
6124
+ {
6125
+ "epoch": 0.8982346832814122,
6126
+ "grad_norm": 1.4453125,
6127
+ "learning_rate": 2.8669693910948646e-05,
6128
+ "loss": 6.1166,
6129
+ "step": 865
6130
+ },
6131
+ {
6132
+ "epoch": 0.8992731048805815,
6133
+ "grad_norm": 1.5,
6134
+ "learning_rate": 2.809889307786856e-05,
6135
+ "loss": 6.2788,
6136
+ "step": 866
6137
+ },
6138
+ {
6139
+ "epoch": 0.9003115264797508,
6140
+ "grad_norm": 1.71875,
6141
+ "learning_rate": 2.7533667412321385e-05,
6142
+ "loss": 5.8872,
6143
+ "step": 867
6144
+ },
6145
+ {
6146
+ "epoch": 0.90134994807892,
6147
+ "grad_norm": 1.75,
6148
+ "learning_rate": 2.6974023592036378e-05,
6149
+ "loss": 6.292,
6150
+ "step": 868
6151
+ },
6152
+ {
6153
+ "epoch": 0.9023883696780893,
6154
+ "grad_norm": 1.5390625,
6155
+ "learning_rate": 2.6419968228797274e-05,
6156
+ "loss": 6.03,
6157
+ "step": 869
6158
+ },
6159
+ {
6160
+ "epoch": 0.9034267912772586,
6161
+ "grad_norm": 1.34375,
6162
+ "learning_rate": 2.5871507868364063e-05,
6163
+ "loss": 6.2342,
6164
+ "step": 870
6165
+ },
6166
+ {
6167
+ "epoch": 0.9044652128764278,
6168
+ "grad_norm": 1.453125,
6169
+ "learning_rate": 2.532864899039622e-05,
6170
+ "loss": 6.2455,
6171
+ "step": 871
6172
+ },
6173
+ {
6174
+ "epoch": 0.9055036344755971,
6175
+ "grad_norm": 1.8125,
6176
+ "learning_rate": 2.4791398008375542e-05,
6177
+ "loss": 5.9972,
6178
+ "step": 872
6179
+ },
6180
+ {
6181
+ "epoch": 0.9065420560747663,
6182
+ "grad_norm": 1.2421875,
6183
+ "learning_rate": 2.4259761269530666e-05,
6184
+ "loss": 6.218,
6185
+ "step": 873
6186
+ },
6187
+ {
6188
+ "epoch": 0.9075804776739356,
6189
+ "grad_norm": 1.6796875,
6190
+ "learning_rate": 2.3733745054762058e-05,
6191
+ "loss": 5.6284,
6192
+ "step": 874
6193
+ },
6194
+ {
6195
+ "epoch": 0.9086188992731049,
6196
+ "grad_norm": 1.3671875,
6197
+ "learning_rate": 2.321335557856791e-05,
6198
+ "loss": 6.2292,
6199
+ "step": 875
6200
+ },
6201
+ {
6202
+ "epoch": 0.9096573208722741,
6203
+ "grad_norm": 1.3671875,
6204
+ "learning_rate": 2.2698598988970422e-05,
6205
+ "loss": 6.3235,
6206
+ "step": 876
6207
+ },
6208
+ {
6209
+ "epoch": 0.9106957424714434,
6210
+ "grad_norm": 1.484375,
6211
+ "learning_rate": 2.2189481367443366e-05,
6212
+ "loss": 6.2901,
6213
+ "step": 877
6214
+ },
6215
+ {
6216
+ "epoch": 0.9117341640706127,
6217
+ "grad_norm": 1.3828125,
6218
+ "learning_rate": 2.16860087288403e-05,
6219
+ "loss": 6.1946,
6220
+ "step": 878
6221
+ },
6222
+ {
6223
+ "epoch": 0.9127725856697819,
6224
+ "grad_norm": 1.390625,
6225
+ "learning_rate": 2.1188187021323413e-05,
6226
+ "loss": 5.9255,
6227
+ "step": 879
6228
+ },
6229
+ {
6230
+ "epoch": 0.9138110072689511,
6231
+ "grad_norm": 1.6171875,
6232
+ "learning_rate": 2.0696022126293124e-05,
6233
+ "loss": 5.9947,
6234
+ "step": 880
6235
+ },
6236
+ {
6237
+ "epoch": 0.9148494288681205,
6238
+ "grad_norm": 1.3828125,
6239
+ "learning_rate": 2.0209519858319037e-05,
6240
+ "loss": 6.0533,
6241
+ "step": 881
6242
+ },
6243
+ {
6244
+ "epoch": 0.9158878504672897,
6245
+ "grad_norm": 1.296875,
6246
+ "learning_rate": 1.9728685965070602e-05,
6247
+ "loss": 6.1479,
6248
+ "step": 882
6249
+ },
6250
+ {
6251
+ "epoch": 0.9169262720664589,
6252
+ "grad_norm": 1.1875,
6253
+ "learning_rate": 1.925352612724979e-05,
6254
+ "loss": 6.1287,
6255
+ "step": 883
6256
+ },
6257
+ {
6258
+ "epoch": 0.9179646936656283,
6259
+ "grad_norm": 1.4609375,
6260
+ "learning_rate": 1.878404595852362e-05,
6261
+ "loss": 6.1971,
6262
+ "step": 884
6263
+ },
6264
+ {
6265
+ "epoch": 0.9190031152647975,
6266
+ "grad_norm": 2.34375,
6267
+ "learning_rate": 1.8320251005457976e-05,
6268
+ "loss": 6.3691,
6269
+ "step": 885
6270
+ },
6271
+ {
6272
+ "epoch": 0.9200415368639667,
6273
+ "grad_norm": 1.4296875,
6274
+ "learning_rate": 1.7862146747452178e-05,
6275
+ "loss": 6.3665,
6276
+ "step": 886
6277
+ },
6278
+ {
6279
+ "epoch": 0.9210799584631361,
6280
+ "grad_norm": 1.5625,
6281
+ "learning_rate": 1.740973859667394e-05,
6282
+ "loss": 6.278,
6283
+ "step": 887
6284
+ },
6285
+ {
6286
+ "epoch": 0.9221183800623053,
6287
+ "grad_norm": 1.8125,
6288
+ "learning_rate": 1.6963031897995862e-05,
6289
+ "loss": 6.1343,
6290
+ "step": 888
6291
+ },
6292
+ {
6293
+ "epoch": 0.9231568016614745,
6294
+ "grad_norm": 1.3046875,
6295
+ "learning_rate": 1.652203192893187e-05,
6296
+ "loss": 6.3423,
6297
+ "step": 889
6298
+ },
6299
+ {
6300
+ "epoch": 0.9241952232606438,
6301
+ "grad_norm": 1.78125,
6302
+ "learning_rate": 1.6086743899575042e-05,
6303
+ "loss": 6.2271,
6304
+ "step": 890
6305
+ },
6306
+ {
6307
+ "epoch": 0.9252336448598131,
6308
+ "grad_norm": 1.578125,
6309
+ "learning_rate": 1.56571729525361e-05,
6310
+ "loss": 6.1616,
6311
+ "step": 891
6312
+ },
6313
+ {
6314
+ "epoch": 0.9262720664589823,
6315
+ "grad_norm": 1.1875,
6316
+ "learning_rate": 1.5233324162882589e-05,
6317
+ "loss": 6.3031,
6318
+ "step": 892
6319
+ },
6320
+ {
6321
+ "epoch": 0.9273104880581516,
6322
+ "grad_norm": 1.3828125,
6323
+ "learning_rate": 1.4815202538078998e-05,
6324
+ "loss": 6.2343,
6325
+ "step": 893
6326
+ },
6327
+ {
6328
+ "epoch": 0.9283489096573209,
6329
+ "grad_norm": 1.9609375,
6330
+ "learning_rate": 1.4402813017927396e-05,
6331
+ "loss": 6.1985,
6332
+ "step": 894
6333
+ },
6334
+ {
6335
+ "epoch": 0.9293873312564901,
6336
+ "grad_norm": 1.3046875,
6337
+ "learning_rate": 1.3996160474509411e-05,
6338
+ "loss": 6.2852,
6339
+ "step": 895
6340
+ },
6341
+ {
6342
+ "epoch": 0.9304257528556594,
6343
+ "grad_norm": 1.8828125,
6344
+ "learning_rate": 1.3595249712128333e-05,
6345
+ "loss": 6.39,
6346
+ "step": 896
6347
+ },
6348
+ {
6349
+ "epoch": 0.9314641744548287,
6350
+ "grad_norm": 1.5078125,
6351
+ "learning_rate": 1.3200085467252488e-05,
6352
+ "loss": 6.1291,
6353
+ "step": 897
6354
+ },
6355
+ {
6356
+ "epoch": 0.9325025960539979,
6357
+ "grad_norm": 1.734375,
6358
+ "learning_rate": 1.28106724084594e-05,
6359
+ "loss": 6.2431,
6360
+ "step": 898
6361
+ },
6362
+ {
6363
+ "epoch": 0.9335410176531672,
6364
+ "grad_norm": 1.1953125,
6365
+ "learning_rate": 1.2427015136380393e-05,
6366
+ "loss": 6.2799,
6367
+ "step": 899
6368
+ },
6369
+ {
6370
+ "epoch": 0.9345794392523364,
6371
+ "grad_norm": 1.3359375,
6372
+ "learning_rate": 1.2049118183646401e-05,
6373
+ "loss": 6.259,
6374
+ "step": 900
6375
+ },
6376
+ {
6377
+ "epoch": 0.9345794392523364,
6378
+ "eval_loss": 6.254264831542969,
6379
+ "eval_runtime": 1.7188,
6380
+ "eval_samples_per_second": 9.309,
6381
+ "eval_steps_per_second": 1.164,
6382
+ "step": 900
6383
  }
6384
  ],
6385
  "logging_steps": 1,
 
6399
  "attributes": {}
6400
  }
6401
  },
6402
+ "total_flos": 1.146256374104064e+17,
6403
  "train_batch_size": 16,
6404
  "trial_name": null,
6405
  "trial_params": null