minpeter commited on
Commit
60f5bdd
·
verified ·
1 Parent(s): d728900

Training in progress, step 2300, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34e7dd2d9b6f0970cb6393fa01c4d5b46e08a118b2ca11c92398326a18aca9b6
3
  size 2066752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d403900b6e4e06d1060ea96c9f9125452e44e25ecb0fe98a4888dab20918096a
3
  size 2066752
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ea6c445c08f665e093952010dc41c9cfe5bc6fd09fae8a9ddc99dbd25132738
3
  size 4121235
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27d100c984f8d641346f5da5c506557f408847bc183236db48c58f564b4d2d81
3
  size 4121235
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:606f8ae83137b5e17dffec803b5eb8d484f9023ac65a91db2b3909da806f7963
3
  size 14391
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60fe173f9860062ebc60b002a64ae72dc915d76f9849b9cb85632a7a607221b5
3
  size 14391
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:93112c230b7ca5a82c24435d90248d5e745b06d96f80a988308b962666674dd0
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e16073f3e3321f4e6a7e2a6eca78556f1d80fe032e948e4721dde289f8623b3
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.09501597996026605,
6
  "eval_steps": 100,
7
- "global_step": 2200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -15584,6 +15584,714 @@
15584
  "eval_samples_per_second": 1.719,
15585
  "eval_steps_per_second": 0.215,
15586
  "step": 2200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15587
  }
15588
  ],
15589
  "logging_steps": 1,
@@ -15603,7 +16311,7 @@
15603
  "attributes": {}
15604
  }
15605
  },
15606
- "total_flos": 7032412569600.0,
15607
  "train_batch_size": 1,
15608
  "trial_name": null,
15609
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.09933488814027813,
6
  "eval_steps": 100,
7
+ "global_step": 2300,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
15584
  "eval_samples_per_second": 1.719,
15585
  "eval_steps_per_second": 0.215,
15586
  "step": 2200
15587
+ },
15588
+ {
15589
+ "epoch": 0.09505916904206617,
15590
+ "grad_norm": 0.9140625,
15591
+ "learning_rate": 0.0009944730436502519,
15592
+ "loss": 8.6376,
15593
+ "step": 2201
15594
+ },
15595
+ {
15596
+ "epoch": 0.09510235812386629,
15597
+ "grad_norm": 0.515625,
15598
+ "learning_rate": 0.0009944624498326824,
15599
+ "loss": 8.4071,
15600
+ "step": 2202
15601
+ },
15602
+ {
15603
+ "epoch": 0.09514554720566641,
15604
+ "grad_norm": 0.484375,
15605
+ "learning_rate": 0.0009944518459284934,
15606
+ "loss": 8.4233,
15607
+ "step": 2203
15608
+ },
15609
+ {
15610
+ "epoch": 0.09518873628746653,
15611
+ "grad_norm": 0.70703125,
15612
+ "learning_rate": 0.000994441231937901,
15613
+ "loss": 8.4952,
15614
+ "step": 2204
15615
+ },
15616
+ {
15617
+ "epoch": 0.09523192536926665,
15618
+ "grad_norm": 0.67578125,
15619
+ "learning_rate": 0.0009944306078611223,
15620
+ "loss": 8.4472,
15621
+ "step": 2205
15622
+ },
15623
+ {
15624
+ "epoch": 0.09527511445106678,
15625
+ "grad_norm": 0.8359375,
15626
+ "learning_rate": 0.0009944199736983733,
15627
+ "loss": 8.186,
15628
+ "step": 2206
15629
+ },
15630
+ {
15631
+ "epoch": 0.0953183035328669,
15632
+ "grad_norm": 0.455078125,
15633
+ "learning_rate": 0.0009944093294498714,
15634
+ "loss": 8.4121,
15635
+ "step": 2207
15636
+ },
15637
+ {
15638
+ "epoch": 0.09536149261466702,
15639
+ "grad_norm": 0.58984375,
15640
+ "learning_rate": 0.0009943986751158335,
15641
+ "loss": 8.2663,
15642
+ "step": 2208
15643
+ },
15644
+ {
15645
+ "epoch": 0.09540468169646714,
15646
+ "grad_norm": 0.640625,
15647
+ "learning_rate": 0.000994388010696477,
15648
+ "loss": 8.202,
15649
+ "step": 2209
15650
+ },
15651
+ {
15652
+ "epoch": 0.09544787077826726,
15653
+ "grad_norm": 0.59375,
15654
+ "learning_rate": 0.0009943773361920198,
15655
+ "loss": 8.2491,
15656
+ "step": 2210
15657
+ },
15658
+ {
15659
+ "epoch": 0.09549105986006738,
15660
+ "grad_norm": 0.515625,
15661
+ "learning_rate": 0.000994366651602679,
15662
+ "loss": 8.4205,
15663
+ "step": 2211
15664
+ },
15665
+ {
15666
+ "epoch": 0.0955342489418675,
15667
+ "grad_norm": 0.6640625,
15668
+ "learning_rate": 0.0009943559569286732,
15669
+ "loss": 8.2114,
15670
+ "step": 2212
15671
+ },
15672
+ {
15673
+ "epoch": 0.09557743802366761,
15674
+ "grad_norm": 0.65234375,
15675
+ "learning_rate": 0.0009943452521702198,
15676
+ "loss": 8.2514,
15677
+ "step": 2213
15678
+ },
15679
+ {
15680
+ "epoch": 0.09562062710546773,
15681
+ "grad_norm": 0.8046875,
15682
+ "learning_rate": 0.000994334537327538,
15683
+ "loss": 8.3806,
15684
+ "step": 2214
15685
+ },
15686
+ {
15687
+ "epoch": 0.09566381618726785,
15688
+ "grad_norm": 0.65625,
15689
+ "learning_rate": 0.000994323812400846,
15690
+ "loss": 8.4841,
15691
+ "step": 2215
15692
+ },
15693
+ {
15694
+ "epoch": 0.09570700526906797,
15695
+ "grad_norm": 0.6015625,
15696
+ "learning_rate": 0.0009943130773903623,
15697
+ "loss": 8.4155,
15698
+ "step": 2216
15699
+ },
15700
+ {
15701
+ "epoch": 0.0957501943508681,
15702
+ "grad_norm": 0.4609375,
15703
+ "learning_rate": 0.0009943023322963062,
15704
+ "loss": 8.2637,
15705
+ "step": 2217
15706
+ },
15707
+ {
15708
+ "epoch": 0.09579338343266822,
15709
+ "grad_norm": 0.60546875,
15710
+ "learning_rate": 0.000994291577118897,
15711
+ "loss": 8.4826,
15712
+ "step": 2218
15713
+ },
15714
+ {
15715
+ "epoch": 0.09583657251446834,
15716
+ "grad_norm": 0.5390625,
15717
+ "learning_rate": 0.000994280811858354,
15718
+ "loss": 8.322,
15719
+ "step": 2219
15720
+ },
15721
+ {
15722
+ "epoch": 0.09587976159626846,
15723
+ "grad_norm": 0.56640625,
15724
+ "learning_rate": 0.0009942700365148964,
15725
+ "loss": 8.1471,
15726
+ "step": 2220
15727
+ },
15728
+ {
15729
+ "epoch": 0.09592295067806858,
15730
+ "grad_norm": 0.625,
15731
+ "learning_rate": 0.0009942592510887448,
15732
+ "loss": 8.4959,
15733
+ "step": 2221
15734
+ },
15735
+ {
15736
+ "epoch": 0.0959661397598687,
15737
+ "grad_norm": 0.61328125,
15738
+ "learning_rate": 0.0009942484555801184,
15739
+ "loss": 8.2918,
15740
+ "step": 2222
15741
+ },
15742
+ {
15743
+ "epoch": 0.09600932884166882,
15744
+ "grad_norm": 1.1796875,
15745
+ "learning_rate": 0.000994237649989238,
15746
+ "loss": 8.5336,
15747
+ "step": 2223
15748
+ },
15749
+ {
15750
+ "epoch": 0.09605251792346894,
15751
+ "grad_norm": 0.494140625,
15752
+ "learning_rate": 0.0009942268343163237,
15753
+ "loss": 8.5094,
15754
+ "step": 2224
15755
+ },
15756
+ {
15757
+ "epoch": 0.09609570700526907,
15758
+ "grad_norm": 0.6171875,
15759
+ "learning_rate": 0.0009942160085615961,
15760
+ "loss": 8.1301,
15761
+ "step": 2225
15762
+ },
15763
+ {
15764
+ "epoch": 0.09613889608706919,
15765
+ "grad_norm": 0.60546875,
15766
+ "learning_rate": 0.0009942051727252765,
15767
+ "loss": 8.2714,
15768
+ "step": 2226
15769
+ },
15770
+ {
15771
+ "epoch": 0.09618208516886931,
15772
+ "grad_norm": 0.416015625,
15773
+ "learning_rate": 0.0009941943268075854,
15774
+ "loss": 8.4242,
15775
+ "step": 2227
15776
+ },
15777
+ {
15778
+ "epoch": 0.09622527425066943,
15779
+ "grad_norm": 0.6640625,
15780
+ "learning_rate": 0.0009941834708087445,
15781
+ "loss": 8.463,
15782
+ "step": 2228
15783
+ },
15784
+ {
15785
+ "epoch": 0.09626846333246955,
15786
+ "grad_norm": 0.4453125,
15787
+ "learning_rate": 0.0009941726047289748,
15788
+ "loss": 8.305,
15789
+ "step": 2229
15790
+ },
15791
+ {
15792
+ "epoch": 0.09631165241426967,
15793
+ "grad_norm": 0.58984375,
15794
+ "learning_rate": 0.0009941617285684982,
15795
+ "loss": 8.2656,
15796
+ "step": 2230
15797
+ },
15798
+ {
15799
+ "epoch": 0.09635484149606979,
15800
+ "grad_norm": 0.41796875,
15801
+ "learning_rate": 0.0009941508423275366,
15802
+ "loss": 8.3025,
15803
+ "step": 2231
15804
+ },
15805
+ {
15806
+ "epoch": 0.09639803057786991,
15807
+ "grad_norm": 0.5625,
15808
+ "learning_rate": 0.000994139946006312,
15809
+ "loss": 8.2322,
15810
+ "step": 2232
15811
+ },
15812
+ {
15813
+ "epoch": 0.09644121965967004,
15814
+ "grad_norm": 0.51171875,
15815
+ "learning_rate": 0.0009941290396050467,
15816
+ "loss": 8.2482,
15817
+ "step": 2233
15818
+ },
15819
+ {
15820
+ "epoch": 0.09648440874147016,
15821
+ "grad_norm": 0.56640625,
15822
+ "learning_rate": 0.000994118123123963,
15823
+ "loss": 8.2431,
15824
+ "step": 2234
15825
+ },
15826
+ {
15827
+ "epoch": 0.09652759782327028,
15828
+ "grad_norm": 0.8203125,
15829
+ "learning_rate": 0.000994107196563284,
15830
+ "loss": 8.7743,
15831
+ "step": 2235
15832
+ },
15833
+ {
15834
+ "epoch": 0.0965707869050704,
15835
+ "grad_norm": 1.046875,
15836
+ "learning_rate": 0.0009940962599232323,
15837
+ "loss": 7.9271,
15838
+ "step": 2236
15839
+ },
15840
+ {
15841
+ "epoch": 0.09661397598687052,
15842
+ "grad_norm": 0.5859375,
15843
+ "learning_rate": 0.000994085313204031,
15844
+ "loss": 8.279,
15845
+ "step": 2237
15846
+ },
15847
+ {
15848
+ "epoch": 0.09665716506867064,
15849
+ "grad_norm": 0.60546875,
15850
+ "learning_rate": 0.0009940743564059037,
15851
+ "loss": 8.4154,
15852
+ "step": 2238
15853
+ },
15854
+ {
15855
+ "epoch": 0.09670035415047076,
15856
+ "grad_norm": 0.79296875,
15857
+ "learning_rate": 0.0009940633895290732,
15858
+ "loss": 8.3311,
15859
+ "step": 2239
15860
+ },
15861
+ {
15862
+ "epoch": 0.09674354323227088,
15863
+ "grad_norm": 0.66796875,
15864
+ "learning_rate": 0.000994052412573764,
15865
+ "loss": 8.6155,
15866
+ "step": 2240
15867
+ },
15868
+ {
15869
+ "epoch": 0.096786732314071,
15870
+ "grad_norm": 0.56640625,
15871
+ "learning_rate": 0.0009940414255401996,
15872
+ "loss": 8.4563,
15873
+ "step": 2241
15874
+ },
15875
+ {
15876
+ "epoch": 0.09682992139587113,
15877
+ "grad_norm": 0.60546875,
15878
+ "learning_rate": 0.0009940304284286044,
15879
+ "loss": 8.4334,
15880
+ "step": 2242
15881
+ },
15882
+ {
15883
+ "epoch": 0.09687311047767125,
15884
+ "grad_norm": 0.7109375,
15885
+ "learning_rate": 0.0009940194212392022,
15886
+ "loss": 8.3314,
15887
+ "step": 2243
15888
+ },
15889
+ {
15890
+ "epoch": 0.09691629955947137,
15891
+ "grad_norm": 0.47265625,
15892
+ "learning_rate": 0.000994008403972218,
15893
+ "loss": 8.7214,
15894
+ "step": 2244
15895
+ },
15896
+ {
15897
+ "epoch": 0.09695948864127149,
15898
+ "grad_norm": 1.0625,
15899
+ "learning_rate": 0.0009939973766278766,
15900
+ "loss": 8.297,
15901
+ "step": 2245
15902
+ },
15903
+ {
15904
+ "epoch": 0.09700267772307161,
15905
+ "grad_norm": 0.59765625,
15906
+ "learning_rate": 0.0009939863392064029,
15907
+ "loss": 8.4925,
15908
+ "step": 2246
15909
+ },
15910
+ {
15911
+ "epoch": 0.09704586680487173,
15912
+ "grad_norm": 0.447265625,
15913
+ "learning_rate": 0.0009939752917080217,
15914
+ "loss": 8.2892,
15915
+ "step": 2247
15916
+ },
15917
+ {
15918
+ "epoch": 0.09708905588667185,
15919
+ "grad_norm": 0.69140625,
15920
+ "learning_rate": 0.0009939642341329586,
15921
+ "loss": 8.5275,
15922
+ "step": 2248
15923
+ },
15924
+ {
15925
+ "epoch": 0.09713224496847198,
15926
+ "grad_norm": 2.390625,
15927
+ "learning_rate": 0.0009939531664814392,
15928
+ "loss": 8.694,
15929
+ "step": 2249
15930
+ },
15931
+ {
15932
+ "epoch": 0.0971754340502721,
15933
+ "grad_norm": 0.453125,
15934
+ "learning_rate": 0.0009939420887536893,
15935
+ "loss": 8.4036,
15936
+ "step": 2250
15937
+ },
15938
+ {
15939
+ "epoch": 0.09721862313207222,
15940
+ "grad_norm": 0.66796875,
15941
+ "learning_rate": 0.0009939310009499348,
15942
+ "loss": 8.3543,
15943
+ "step": 2251
15944
+ },
15945
+ {
15946
+ "epoch": 0.09726181221387234,
15947
+ "grad_norm": 0.474609375,
15948
+ "learning_rate": 0.0009939199030704019,
15949
+ "loss": 8.3396,
15950
+ "step": 2252
15951
+ },
15952
+ {
15953
+ "epoch": 0.09730500129567246,
15954
+ "grad_norm": 0.84765625,
15955
+ "learning_rate": 0.0009939087951153168,
15956
+ "loss": 8.3102,
15957
+ "step": 2253
15958
+ },
15959
+ {
15960
+ "epoch": 0.09734819037747257,
15961
+ "grad_norm": 0.4921875,
15962
+ "learning_rate": 0.0009938976770849065,
15963
+ "loss": 8.236,
15964
+ "step": 2254
15965
+ },
15966
+ {
15967
+ "epoch": 0.09739137945927269,
15968
+ "grad_norm": 0.61328125,
15969
+ "learning_rate": 0.0009938865489793976,
15970
+ "loss": 8.1535,
15971
+ "step": 2255
15972
+ },
15973
+ {
15974
+ "epoch": 0.09743456854107281,
15975
+ "grad_norm": 0.4765625,
15976
+ "learning_rate": 0.000993875410799017,
15977
+ "loss": 8.3056,
15978
+ "step": 2256
15979
+ },
15980
+ {
15981
+ "epoch": 0.09747775762287293,
15982
+ "grad_norm": 0.53125,
15983
+ "learning_rate": 0.0009938642625439917,
15984
+ "loss": 8.2276,
15985
+ "step": 2257
15986
+ },
15987
+ {
15988
+ "epoch": 0.09752094670467305,
15989
+ "grad_norm": 0.482421875,
15990
+ "learning_rate": 0.0009938531042145498,
15991
+ "loss": 8.4162,
15992
+ "step": 2258
15993
+ },
15994
+ {
15995
+ "epoch": 0.09756413578647317,
15996
+ "grad_norm": 0.640625,
15997
+ "learning_rate": 0.0009938419358109182,
15998
+ "loss": 8.2504,
15999
+ "step": 2259
16000
+ },
16001
+ {
16002
+ "epoch": 0.0976073248682733,
16003
+ "grad_norm": 0.6171875,
16004
+ "learning_rate": 0.0009938307573333254,
16005
+ "loss": 8.595,
16006
+ "step": 2260
16007
+ },
16008
+ {
16009
+ "epoch": 0.09765051395007342,
16010
+ "grad_norm": 0.49609375,
16011
+ "learning_rate": 0.0009938195687819989,
16012
+ "loss": 8.3727,
16013
+ "step": 2261
16014
+ },
16015
+ {
16016
+ "epoch": 0.09769370303187354,
16017
+ "grad_norm": 0.48828125,
16018
+ "learning_rate": 0.0009938083701571672,
16019
+ "loss": 8.4693,
16020
+ "step": 2262
16021
+ },
16022
+ {
16023
+ "epoch": 0.09773689211367366,
16024
+ "grad_norm": 0.859375,
16025
+ "learning_rate": 0.0009937971614590586,
16026
+ "loss": 8.383,
16027
+ "step": 2263
16028
+ },
16029
+ {
16030
+ "epoch": 0.09778008119547378,
16031
+ "grad_norm": 1.0078125,
16032
+ "learning_rate": 0.0009937859426879018,
16033
+ "loss": 8.1874,
16034
+ "step": 2264
16035
+ },
16036
+ {
16037
+ "epoch": 0.0978232702772739,
16038
+ "grad_norm": 0.4453125,
16039
+ "learning_rate": 0.0009937747138439256,
16040
+ "loss": 8.5885,
16041
+ "step": 2265
16042
+ },
16043
+ {
16044
+ "epoch": 0.09786645935907402,
16045
+ "grad_norm": 0.69140625,
16046
+ "learning_rate": 0.000993763474927359,
16047
+ "loss": 8.2082,
16048
+ "step": 2266
16049
+ },
16050
+ {
16051
+ "epoch": 0.09790964844087414,
16052
+ "grad_norm": 0.515625,
16053
+ "learning_rate": 0.0009937522259384317,
16054
+ "loss": 8.4838,
16055
+ "step": 2267
16056
+ },
16057
+ {
16058
+ "epoch": 0.09795283752267427,
16059
+ "grad_norm": 0.48046875,
16060
+ "learning_rate": 0.0009937409668773728,
16061
+ "loss": 8.301,
16062
+ "step": 2268
16063
+ },
16064
+ {
16065
+ "epoch": 0.09799602660447439,
16066
+ "grad_norm": 0.58984375,
16067
+ "learning_rate": 0.000993729697744412,
16068
+ "loss": 8.5776,
16069
+ "step": 2269
16070
+ },
16071
+ {
16072
+ "epoch": 0.09803921568627451,
16073
+ "grad_norm": 0.91015625,
16074
+ "learning_rate": 0.000993718418539779,
16075
+ "loss": 8.8736,
16076
+ "step": 2270
16077
+ },
16078
+ {
16079
+ "epoch": 0.09808240476807463,
16080
+ "grad_norm": 0.61328125,
16081
+ "learning_rate": 0.0009937071292637043,
16082
+ "loss": 8.0556,
16083
+ "step": 2271
16084
+ },
16085
+ {
16086
+ "epoch": 0.09812559384987475,
16087
+ "grad_norm": 0.90625,
16088
+ "learning_rate": 0.000993695829916418,
16089
+ "loss": 7.9003,
16090
+ "step": 2272
16091
+ },
16092
+ {
16093
+ "epoch": 0.09816878293167487,
16094
+ "grad_norm": 0.63671875,
16095
+ "learning_rate": 0.0009936845204981505,
16096
+ "loss": 8.4324,
16097
+ "step": 2273
16098
+ },
16099
+ {
16100
+ "epoch": 0.098211972013475,
16101
+ "grad_norm": 0.51953125,
16102
+ "learning_rate": 0.0009936732010091328,
16103
+ "loss": 8.5961,
16104
+ "step": 2274
16105
+ },
16106
+ {
16107
+ "epoch": 0.09825516109527511,
16108
+ "grad_norm": 0.51171875,
16109
+ "learning_rate": 0.0009936618714495953,
16110
+ "loss": 8.6324,
16111
+ "step": 2275
16112
+ },
16113
+ {
16114
+ "epoch": 0.09829835017707524,
16115
+ "grad_norm": 0.70703125,
16116
+ "learning_rate": 0.0009936505318197694,
16117
+ "loss": 8.0815,
16118
+ "step": 2276
16119
+ },
16120
+ {
16121
+ "epoch": 0.09834153925887536,
16122
+ "grad_norm": 0.48828125,
16123
+ "learning_rate": 0.0009936391821198866,
16124
+ "loss": 8.584,
16125
+ "step": 2277
16126
+ },
16127
+ {
16128
+ "epoch": 0.09838472834067548,
16129
+ "grad_norm": 0.63671875,
16130
+ "learning_rate": 0.0009936278223501782,
16131
+ "loss": 8.3207,
16132
+ "step": 2278
16133
+ },
16134
+ {
16135
+ "epoch": 0.0984279174224756,
16136
+ "grad_norm": 0.70703125,
16137
+ "learning_rate": 0.0009936164525108761,
16138
+ "loss": 8.617,
16139
+ "step": 2279
16140
+ },
16141
+ {
16142
+ "epoch": 0.09847110650427572,
16143
+ "grad_norm": 0.50390625,
16144
+ "learning_rate": 0.000993605072602212,
16145
+ "loss": 8.7178,
16146
+ "step": 2280
16147
+ },
16148
+ {
16149
+ "epoch": 0.09851429558607584,
16150
+ "grad_norm": 0.515625,
16151
+ "learning_rate": 0.0009935936826244182,
16152
+ "loss": 8.3508,
16153
+ "step": 2281
16154
+ },
16155
+ {
16156
+ "epoch": 0.09855748466787596,
16157
+ "grad_norm": 0.7890625,
16158
+ "learning_rate": 0.000993582282577727,
16159
+ "loss": 9.0207,
16160
+ "step": 2282
16161
+ },
16162
+ {
16163
+ "epoch": 0.09860067374967608,
16164
+ "grad_norm": 0.5234375,
16165
+ "learning_rate": 0.0009935708724623708,
16166
+ "loss": 8.4822,
16167
+ "step": 2283
16168
+ },
16169
+ {
16170
+ "epoch": 0.0986438628314762,
16171
+ "grad_norm": 0.462890625,
16172
+ "learning_rate": 0.0009935594522785826,
16173
+ "loss": 8.537,
16174
+ "step": 2284
16175
+ },
16176
+ {
16177
+ "epoch": 0.09868705191327633,
16178
+ "grad_norm": 0.78515625,
16179
+ "learning_rate": 0.0009935480220265955,
16180
+ "loss": 7.9235,
16181
+ "step": 2285
16182
+ },
16183
+ {
16184
+ "epoch": 0.09873024099507645,
16185
+ "grad_norm": 0.66015625,
16186
+ "learning_rate": 0.0009935365817066422,
16187
+ "loss": 8.3552,
16188
+ "step": 2286
16189
+ },
16190
+ {
16191
+ "epoch": 0.09877343007687657,
16192
+ "grad_norm": 0.64453125,
16193
+ "learning_rate": 0.0009935251313189565,
16194
+ "loss": 8.199,
16195
+ "step": 2287
16196
+ },
16197
+ {
16198
+ "epoch": 0.09881661915867669,
16199
+ "grad_norm": 0.50390625,
16200
+ "learning_rate": 0.0009935136708637716,
16201
+ "loss": 8.3347,
16202
+ "step": 2288
16203
+ },
16204
+ {
16205
+ "epoch": 0.09885980824047681,
16206
+ "grad_norm": 0.61328125,
16207
+ "learning_rate": 0.0009935022003413217,
16208
+ "loss": 8.1595,
16209
+ "step": 2289
16210
+ },
16211
+ {
16212
+ "epoch": 0.09890299732227693,
16213
+ "grad_norm": 0.4453125,
16214
+ "learning_rate": 0.0009934907197518405,
16215
+ "loss": 8.3218,
16216
+ "step": 2290
16217
+ },
16218
+ {
16219
+ "epoch": 0.09894618640407706,
16220
+ "grad_norm": 0.4921875,
16221
+ "learning_rate": 0.0009934792290955622,
16222
+ "loss": 8.3675,
16223
+ "step": 2291
16224
+ },
16225
+ {
16226
+ "epoch": 0.09898937548587718,
16227
+ "grad_norm": 0.6953125,
16228
+ "learning_rate": 0.0009934677283727211,
16229
+ "loss": 8.1193,
16230
+ "step": 2292
16231
+ },
16232
+ {
16233
+ "epoch": 0.0990325645676773,
16234
+ "grad_norm": 0.5390625,
16235
+ "learning_rate": 0.000993456217583552,
16236
+ "loss": 8.5143,
16237
+ "step": 2293
16238
+ },
16239
+ {
16240
+ "epoch": 0.09907575364947742,
16241
+ "grad_norm": 0.72265625,
16242
+ "learning_rate": 0.0009934446967282899,
16243
+ "loss": 7.9545,
16244
+ "step": 2294
16245
+ },
16246
+ {
16247
+ "epoch": 0.09911894273127753,
16248
+ "grad_norm": 0.6796875,
16249
+ "learning_rate": 0.0009934331658071694,
16250
+ "loss": 8.4399,
16251
+ "step": 2295
16252
+ },
16253
+ {
16254
+ "epoch": 0.09916213181307765,
16255
+ "grad_norm": 0.66796875,
16256
+ "learning_rate": 0.000993421624820426,
16257
+ "loss": 8.3492,
16258
+ "step": 2296
16259
+ },
16260
+ {
16261
+ "epoch": 0.09920532089487777,
16262
+ "grad_norm": 0.75,
16263
+ "learning_rate": 0.0009934100737682952,
16264
+ "loss": 8.4884,
16265
+ "step": 2297
16266
+ },
16267
+ {
16268
+ "epoch": 0.09924850997667789,
16269
+ "grad_norm": 0.5703125,
16270
+ "learning_rate": 0.0009933985126510123,
16271
+ "loss": 8.0807,
16272
+ "step": 2298
16273
+ },
16274
+ {
16275
+ "epoch": 0.09929169905847801,
16276
+ "grad_norm": 0.6171875,
16277
+ "learning_rate": 0.0009933869414688132,
16278
+ "loss": 8.3986,
16279
+ "step": 2299
16280
+ },
16281
+ {
16282
+ "epoch": 0.09933488814027813,
16283
+ "grad_norm": 0.478515625,
16284
+ "learning_rate": 0.0009933753602219342,
16285
+ "loss": 8.331,
16286
+ "step": 2300
16287
+ },
16288
+ {
16289
+ "epoch": 0.09933488814027813,
16290
+ "eval_loss": 8.394790649414062,
16291
+ "eval_runtime": 14.132,
16292
+ "eval_samples_per_second": 1.698,
16293
+ "eval_steps_per_second": 0.212,
16294
+ "step": 2300
16295
  }
16296
  ],
16297
  "logging_steps": 1,
 
16311
  "attributes": {}
16312
  }
16313
  },
16314
+ "total_flos": 7352067686400.0,
16315
  "train_batch_size": 1,
16316
  "trial_name": null,
16317
  "trial_params": null