error577 commited on
Commit
09ef8c8
·
verified ·
1 Parent(s): cef5fda

Training in progress, step 1900, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96c7361330baef883a1cefd1a914f4dfd222e1942ded5aa53b0020e1631fa6a0
3
  size 671149168
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a97696a92cabb745ba6cadf2fcc30794ffbc03c91397a016c61944f979c42d0e
3
  size 671149168
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:607344293b376b43b5ea9dcc72b5b6a4a1aec6d2f1618a8d50a1ba12a30b5cac
3
  size 179316182
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad721c15307c50724acd6ac4b18952a24ff74a5cf21f65080c486168ba9fcdd8
3
  size 179316182
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:744d24dfd442d827b2147e9ae7ffb1ebb0081316f3e80b03e398055f18dbb8c1
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:431a2334446ca149d087ee53da191415d4c95211b38709b7a8404d805dee4327
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d92677bc2060a4e9fdf63252b2a62db0c583efdb9e75a15d0327906fb2aa5af3
3
  size 2080
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:536e3a5de3004cb57f6febbf7190ef605673677b3a7bfb0620dc7718281289d6
3
  size 2080
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.4691164972634871,
5
  "eval_steps": 100,
6
- "global_step": 1800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -12759,6 +12759,714 @@
12759
  "eval_samples_per_second": 2.868,
12760
  "eval_steps_per_second": 1.434,
12761
  "step": 1800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12762
  }
12763
  ],
12764
  "logging_steps": 1,
@@ -12778,7 +13486,7 @@
12778
  "attributes": {}
12779
  }
12780
  },
12781
- "total_flos": 3.218854769338614e+17,
12782
  "train_batch_size": 2,
12783
  "trial_name": null,
12784
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.49517852488923636,
5
  "eval_steps": 100,
6
+ "global_step": 1900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
12759
  "eval_samples_per_second": 2.868,
12760
  "eval_steps_per_second": 1.434,
12761
  "step": 1800
12762
+ },
12763
+ {
12764
+ "epoch": 0.4693771175397446,
12765
+ "grad_norm": 10.37785816192627,
12766
+ "learning_rate": 0.00018848585023079067,
12767
+ "loss": 5.1423,
12768
+ "step": 1801
12769
+ },
12770
+ {
12771
+ "epoch": 0.4696377378160021,
12772
+ "grad_norm": 13.957540512084961,
12773
+ "learning_rate": 0.00018847310275305063,
12774
+ "loss": 4.527,
12775
+ "step": 1802
12776
+ },
12777
+ {
12778
+ "epoch": 0.4698983580922596,
12779
+ "grad_norm": 16.043025970458984,
12780
+ "learning_rate": 0.00018846034072339535,
12781
+ "loss": 5.4377,
12782
+ "step": 1803
12783
+ },
12784
+ {
12785
+ "epoch": 0.47015897836851706,
12786
+ "grad_norm": 12.010665893554688,
12787
+ "learning_rate": 0.00018844757869374007,
12788
+ "loss": 4.7116,
12789
+ "step": 1804
12790
+ },
12791
+ {
12792
+ "epoch": 0.47041959864477456,
12793
+ "grad_norm": 14.77712631225586,
12794
+ "learning_rate": 0.0001884348166640848,
12795
+ "loss": 5.9034,
12796
+ "step": 1805
12797
+ },
12798
+ {
12799
+ "epoch": 0.47068021892103207,
12800
+ "grad_norm": 10.91681957244873,
12801
+ "learning_rate": 0.00018842202553059906,
12802
+ "loss": 5.0634,
12803
+ "step": 1806
12804
+ },
12805
+ {
12806
+ "epoch": 0.4709408391972896,
12807
+ "grad_norm": 12.498564720153809,
12808
+ "learning_rate": 0.00018840924894902855,
12809
+ "loss": 5.6897,
12810
+ "step": 1807
12811
+ },
12812
+ {
12813
+ "epoch": 0.471201459473547,
12814
+ "grad_norm": 15.501856803894043,
12815
+ "learning_rate": 0.00018839645781554282,
12816
+ "loss": 3.1788,
12817
+ "step": 1808
12818
+ },
12819
+ {
12820
+ "epoch": 0.47146207974980453,
12821
+ "grad_norm": 13.191068649291992,
12822
+ "learning_rate": 0.00018838365213014185,
12823
+ "loss": 4.8908,
12824
+ "step": 1809
12825
+ },
12826
+ {
12827
+ "epoch": 0.47172270002606204,
12828
+ "grad_norm": 18.41266441345215,
12829
+ "learning_rate": 0.00018837086099665612,
12830
+ "loss": 5.3714,
12831
+ "step": 1810
12832
+ },
12833
+ {
12834
+ "epoch": 0.47198332030231954,
12835
+ "grad_norm": 15.97313117980957,
12836
+ "learning_rate": 0.00018835805531125516,
12837
+ "loss": 5.7231,
12838
+ "step": 1811
12839
+ },
12840
+ {
12841
+ "epoch": 0.472243940578577,
12842
+ "grad_norm": 12.212700843811035,
12843
+ "learning_rate": 0.00018834523507393897,
12844
+ "loss": 5.0301,
12845
+ "step": 1812
12846
+ },
12847
+ {
12848
+ "epoch": 0.4725045608548345,
12849
+ "grad_norm": 218.50746154785156,
12850
+ "learning_rate": 0.00018833241483662277,
12851
+ "loss": 6.6723,
12852
+ "step": 1813
12853
+ },
12854
+ {
12855
+ "epoch": 0.472765181131092,
12856
+ "grad_norm": 12.388707160949707,
12857
+ "learning_rate": 0.00018831958004739136,
12858
+ "loss": 5.1241,
12859
+ "step": 1814
12860
+ },
12861
+ {
12862
+ "epoch": 0.4730258014073495,
12863
+ "grad_norm": 18.206995010375977,
12864
+ "learning_rate": 0.00018830674525815994,
12865
+ "loss": 4.8303,
12866
+ "step": 1815
12867
+ },
12868
+ {
12869
+ "epoch": 0.47328642168360696,
12870
+ "grad_norm": 72.17833709716797,
12871
+ "learning_rate": 0.0001882938959170133,
12872
+ "loss": 4.8661,
12873
+ "step": 1816
12874
+ },
12875
+ {
12876
+ "epoch": 0.47354704195986447,
12877
+ "grad_norm": 18.09402847290039,
12878
+ "learning_rate": 0.00018828104657586664,
12879
+ "loss": 5.473,
12880
+ "step": 1817
12881
+ },
12882
+ {
12883
+ "epoch": 0.473807662236122,
12884
+ "grad_norm": 241.85777282714844,
12885
+ "learning_rate": 0.00018826819723472,
12886
+ "loss": 5.6158,
12887
+ "step": 1818
12888
+ },
12889
+ {
12890
+ "epoch": 0.4740682825123795,
12891
+ "grad_norm": 12.830618858337402,
12892
+ "learning_rate": 0.00018825533334165812,
12893
+ "loss": 4.5027,
12894
+ "step": 1819
12895
+ },
12896
+ {
12897
+ "epoch": 0.47432890278863693,
12898
+ "grad_norm": 18.599620819091797,
12899
+ "learning_rate": 0.000188242454896681,
12900
+ "loss": 5.6648,
12901
+ "step": 1820
12902
+ },
12903
+ {
12904
+ "epoch": 0.47458952306489444,
12905
+ "grad_norm": 13.098050117492676,
12906
+ "learning_rate": 0.0001882295764517039,
12907
+ "loss": 5.0379,
12908
+ "step": 1821
12909
+ },
12910
+ {
12911
+ "epoch": 0.47485014334115194,
12912
+ "grad_norm": 9.492166519165039,
12913
+ "learning_rate": 0.0001882166980067268,
12914
+ "loss": 5.198,
12915
+ "step": 1822
12916
+ },
12917
+ {
12918
+ "epoch": 0.47511076361740945,
12919
+ "grad_norm": 10.473576545715332,
12920
+ "learning_rate": 0.00018820380500983447,
12921
+ "loss": 5.303,
12922
+ "step": 1823
12923
+ },
12924
+ {
12925
+ "epoch": 0.47537138389366695,
12926
+ "grad_norm": 9.241297721862793,
12927
+ "learning_rate": 0.00018819091201294214,
12928
+ "loss": 5.0623,
12929
+ "step": 1824
12930
+ },
12931
+ {
12932
+ "epoch": 0.4756320041699244,
12933
+ "grad_norm": 17.09585189819336,
12934
+ "learning_rate": 0.00018817800446413457,
12935
+ "loss": 4.8895,
12936
+ "step": 1825
12937
+ },
12938
+ {
12939
+ "epoch": 0.4758926244461819,
12940
+ "grad_norm": 10.225650787353516,
12941
+ "learning_rate": 0.000188165096915327,
12942
+ "loss": 4.9848,
12943
+ "step": 1826
12944
+ },
12945
+ {
12946
+ "epoch": 0.4761532447224394,
12947
+ "grad_norm": 10.421326637268066,
12948
+ "learning_rate": 0.00018815218936651945,
12949
+ "loss": 4.5205,
12950
+ "step": 1827
12951
+ },
12952
+ {
12953
+ "epoch": 0.4764138649986969,
12954
+ "grad_norm": 13.143168449401855,
12955
+ "learning_rate": 0.00018813925271388143,
12956
+ "loss": 5.2166,
12957
+ "step": 1828
12958
+ },
12959
+ {
12960
+ "epoch": 0.4766744852749544,
12961
+ "grad_norm": 12.645442962646484,
12962
+ "learning_rate": 0.00018812633061315864,
12963
+ "loss": 5.1552,
12964
+ "step": 1829
12965
+ },
12966
+ {
12967
+ "epoch": 0.4769351055512119,
12968
+ "grad_norm": 11.635645866394043,
12969
+ "learning_rate": 0.00018811339396052063,
12970
+ "loss": 4.7997,
12971
+ "step": 1830
12972
+ },
12973
+ {
12974
+ "epoch": 0.4771957258274694,
12975
+ "grad_norm": 13.22658920288086,
12976
+ "learning_rate": 0.0001881004573078826,
12977
+ "loss": 4.2591,
12978
+ "step": 1831
12979
+ },
12980
+ {
12981
+ "epoch": 0.4774563461037269,
12982
+ "grad_norm": 14.026827812194824,
12983
+ "learning_rate": 0.00018808750610332936,
12984
+ "loss": 5.8879,
12985
+ "step": 1832
12986
+ },
12987
+ {
12988
+ "epoch": 0.47771696637998434,
12989
+ "grad_norm": 12.296269416809082,
12990
+ "learning_rate": 0.00018807455489877611,
12991
+ "loss": 4.9644,
12992
+ "step": 1833
12993
+ },
12994
+ {
12995
+ "epoch": 0.47797758665624185,
12996
+ "grad_norm": 10.826774597167969,
12997
+ "learning_rate": 0.00018806158914230764,
12998
+ "loss": 4.0063,
12999
+ "step": 1834
13000
+ },
13001
+ {
13002
+ "epoch": 0.47823820693249935,
13003
+ "grad_norm": 11.467571258544922,
13004
+ "learning_rate": 0.00018804862338583916,
13005
+ "loss": 4.6543,
13006
+ "step": 1835
13007
+ },
13008
+ {
13009
+ "epoch": 0.47849882720875686,
13010
+ "grad_norm": 11.653413772583008,
13011
+ "learning_rate": 0.00018803564307745546,
13012
+ "loss": 5.013,
13013
+ "step": 1836
13014
+ },
13015
+ {
13016
+ "epoch": 0.4787594474850143,
13017
+ "grad_norm": 10.89033317565918,
13018
+ "learning_rate": 0.00018802266276907176,
13019
+ "loss": 4.8319,
13020
+ "step": 1837
13021
+ },
13022
+ {
13023
+ "epoch": 0.4790200677612718,
13024
+ "grad_norm": 12.89120101928711,
13025
+ "learning_rate": 0.00018800966790877283,
13026
+ "loss": 4.5967,
13027
+ "step": 1838
13028
+ },
13029
+ {
13030
+ "epoch": 0.4792806880375293,
13031
+ "grad_norm": 12.160778045654297,
13032
+ "learning_rate": 0.00018799668760038912,
13033
+ "loss": 5.7447,
13034
+ "step": 1839
13035
+ },
13036
+ {
13037
+ "epoch": 0.47954130831378683,
13038
+ "grad_norm": 13.569367408752441,
13039
+ "learning_rate": 0.00018798367818817496,
13040
+ "loss": 5.7456,
13041
+ "step": 1840
13042
+ },
13043
+ {
13044
+ "epoch": 0.4798019285900443,
13045
+ "grad_norm": 16.512102127075195,
13046
+ "learning_rate": 0.0001879706687759608,
13047
+ "loss": 5.2688,
13048
+ "step": 1841
13049
+ },
13050
+ {
13051
+ "epoch": 0.4800625488663018,
13052
+ "grad_norm": 15.978020668029785,
13053
+ "learning_rate": 0.00018795764481183141,
13054
+ "loss": 5.4303,
13055
+ "step": 1842
13056
+ },
13057
+ {
13058
+ "epoch": 0.4803231691425593,
13059
+ "grad_norm": 9.993230819702148,
13060
+ "learning_rate": 0.00018794463539961725,
13061
+ "loss": 5.0611,
13062
+ "step": 1843
13063
+ },
13064
+ {
13065
+ "epoch": 0.4805837894188168,
13066
+ "grad_norm": 16.74565315246582,
13067
+ "learning_rate": 0.00018793161143548787,
13068
+ "loss": 5.321,
13069
+ "step": 1844
13070
+ },
13071
+ {
13072
+ "epoch": 0.4808444096950743,
13073
+ "grad_norm": 12.556840896606445,
13074
+ "learning_rate": 0.00018791855836752802,
13075
+ "loss": 5.2619,
13076
+ "step": 1845
13077
+ },
13078
+ {
13079
+ "epoch": 0.48110502997133175,
13080
+ "grad_norm": 11.444070816040039,
13081
+ "learning_rate": 0.0001879055198514834,
13082
+ "loss": 5.0738,
13083
+ "step": 1846
13084
+ },
13085
+ {
13086
+ "epoch": 0.48136565024758926,
13087
+ "grad_norm": 23.41417121887207,
13088
+ "learning_rate": 0.0001878924813354388,
13089
+ "loss": 4.5046,
13090
+ "step": 1847
13091
+ },
13092
+ {
13093
+ "epoch": 0.48162627052384677,
13094
+ "grad_norm": 10.137743949890137,
13095
+ "learning_rate": 0.00018787942826747894,
13096
+ "loss": 4.5996,
13097
+ "step": 1848
13098
+ },
13099
+ {
13100
+ "epoch": 0.4818868908001043,
13101
+ "grad_norm": 11.530888557434082,
13102
+ "learning_rate": 0.00018786636064760387,
13103
+ "loss": 4.9895,
13104
+ "step": 1849
13105
+ },
13106
+ {
13107
+ "epoch": 0.4821475110763617,
13108
+ "grad_norm": 10.694371223449707,
13109
+ "learning_rate": 0.0001878532930277288,
13110
+ "loss": 4.4927,
13111
+ "step": 1850
13112
+ },
13113
+ {
13114
+ "epoch": 0.48240813135261923,
13115
+ "grad_norm": 11.96599292755127,
13116
+ "learning_rate": 0.00018784022540785372,
13117
+ "loss": 4.7774,
13118
+ "step": 1851
13119
+ },
13120
+ {
13121
+ "epoch": 0.48266875162887674,
13122
+ "grad_norm": 9.961639404296875,
13123
+ "learning_rate": 0.00018782714323606342,
13124
+ "loss": 4.7172,
13125
+ "step": 1852
13126
+ },
13127
+ {
13128
+ "epoch": 0.48292937190513424,
13129
+ "grad_norm": 16.102052688598633,
13130
+ "learning_rate": 0.0001878140465123579,
13131
+ "loss": 5.5745,
13132
+ "step": 1853
13133
+ },
13134
+ {
13135
+ "epoch": 0.4831899921813917,
13136
+ "grad_norm": 13.830510139465332,
13137
+ "learning_rate": 0.00018780094978865236,
13138
+ "loss": 5.3226,
13139
+ "step": 1854
13140
+ },
13141
+ {
13142
+ "epoch": 0.4834506124576492,
13143
+ "grad_norm": 13.367227554321289,
13144
+ "learning_rate": 0.00018778785306494683,
13145
+ "loss": 4.7316,
13146
+ "step": 1855
13147
+ },
13148
+ {
13149
+ "epoch": 0.4837112327339067,
13150
+ "grad_norm": 17.57742691040039,
13151
+ "learning_rate": 0.00018777474178932607,
13152
+ "loss": 5.3159,
13153
+ "step": 1856
13154
+ },
13155
+ {
13156
+ "epoch": 0.4839718530101642,
13157
+ "grad_norm": 9.331389427185059,
13158
+ "learning_rate": 0.0001877616305137053,
13159
+ "loss": 5.1851,
13160
+ "step": 1857
13161
+ },
13162
+ {
13163
+ "epoch": 0.48423247328642166,
13164
+ "grad_norm": 14.18066120147705,
13165
+ "learning_rate": 0.00018774850468616933,
13166
+ "loss": 4.0067,
13167
+ "step": 1858
13168
+ },
13169
+ {
13170
+ "epoch": 0.48449309356267917,
13171
+ "grad_norm": 16.757022857666016,
13172
+ "learning_rate": 0.00018773537885863334,
13173
+ "loss": 5.3286,
13174
+ "step": 1859
13175
+ },
13176
+ {
13177
+ "epoch": 0.4847537138389367,
13178
+ "grad_norm": 9.797025680541992,
13179
+ "learning_rate": 0.00018772225303109735,
13180
+ "loss": 4.3471,
13181
+ "step": 1860
13182
+ },
13183
+ {
13184
+ "epoch": 0.4850143341151942,
13185
+ "grad_norm": 10.527436256408691,
13186
+ "learning_rate": 0.00018770911265164614,
13187
+ "loss": 4.6839,
13188
+ "step": 1861
13189
+ },
13190
+ {
13191
+ "epoch": 0.48527495439145163,
13192
+ "grad_norm": 12.43220329284668,
13193
+ "learning_rate": 0.0001876959577202797,
13194
+ "loss": 4.8527,
13195
+ "step": 1862
13196
+ },
13197
+ {
13198
+ "epoch": 0.48553557466770914,
13199
+ "grad_norm": 28.21805763244629,
13200
+ "learning_rate": 0.00018768281734082848,
13201
+ "loss": 5.3501,
13202
+ "step": 1863
13203
+ },
13204
+ {
13205
+ "epoch": 0.48579619494396664,
13206
+ "grad_norm": 11.163954734802246,
13207
+ "learning_rate": 0.0001876696478575468,
13208
+ "loss": 4.8881,
13209
+ "step": 1864
13210
+ },
13211
+ {
13212
+ "epoch": 0.48605681522022415,
13213
+ "grad_norm": 10.474588394165039,
13214
+ "learning_rate": 0.00018765649292618036,
13215
+ "loss": 5.016,
13216
+ "step": 1865
13217
+ },
13218
+ {
13219
+ "epoch": 0.48631743549648165,
13220
+ "grad_norm": 9.17603588104248,
13221
+ "learning_rate": 0.0001876433234428987,
13222
+ "loss": 5.0946,
13223
+ "step": 1866
13224
+ },
13225
+ {
13226
+ "epoch": 0.4865780557727391,
13227
+ "grad_norm": 9.54752254486084,
13228
+ "learning_rate": 0.0001876301394077018,
13229
+ "loss": 5.6084,
13230
+ "step": 1867
13231
+ },
13232
+ {
13233
+ "epoch": 0.4868386760489966,
13234
+ "grad_norm": 14.107392311096191,
13235
+ "learning_rate": 0.0001876169553725049,
13236
+ "loss": 5.037,
13237
+ "step": 1868
13238
+ },
13239
+ {
13240
+ "epoch": 0.4870992963252541,
13241
+ "grad_norm": 10.658012390136719,
13242
+ "learning_rate": 0.00018760375678539276,
13243
+ "loss": 5.0839,
13244
+ "step": 1869
13245
+ },
13246
+ {
13247
+ "epoch": 0.4873599166015116,
13248
+ "grad_norm": 14.332069396972656,
13249
+ "learning_rate": 0.00018759055819828063,
13250
+ "loss": 5.2982,
13251
+ "step": 1870
13252
+ },
13253
+ {
13254
+ "epoch": 0.4876205368777691,
13255
+ "grad_norm": 10.385351181030273,
13256
+ "learning_rate": 0.00018757734505925328,
13257
+ "loss": 5.3114,
13258
+ "step": 1871
13259
+ },
13260
+ {
13261
+ "epoch": 0.4878811571540266,
13262
+ "grad_norm": 10.018989562988281,
13263
+ "learning_rate": 0.00018756414647214115,
13264
+ "loss": 5.7433,
13265
+ "step": 1872
13266
+ },
13267
+ {
13268
+ "epoch": 0.4881417774302841,
13269
+ "grad_norm": 14.237773895263672,
13270
+ "learning_rate": 0.00018755091878119856,
13271
+ "loss": 5.8772,
13272
+ "step": 1873
13273
+ },
13274
+ {
13275
+ "epoch": 0.4884023977065416,
13276
+ "grad_norm": 12.690217971801758,
13277
+ "learning_rate": 0.0001875377056421712,
13278
+ "loss": 5.134,
13279
+ "step": 1874
13280
+ },
13281
+ {
13282
+ "epoch": 0.48866301798279904,
13283
+ "grad_norm": 13.357400894165039,
13284
+ "learning_rate": 0.0001875244633993134,
13285
+ "loss": 5.1887,
13286
+ "step": 1875
13287
+ },
13288
+ {
13289
+ "epoch": 0.48892363825905655,
13290
+ "grad_norm": 13.582716941833496,
13291
+ "learning_rate": 0.00018751122115645558,
13292
+ "loss": 4.3156,
13293
+ "step": 1876
13294
+ },
13295
+ {
13296
+ "epoch": 0.48918425853531405,
13297
+ "grad_norm": 15.98849868774414,
13298
+ "learning_rate": 0.00018749797891359776,
13299
+ "loss": 5.4427,
13300
+ "step": 1877
13301
+ },
13302
+ {
13303
+ "epoch": 0.48944487881157156,
13304
+ "grad_norm": 11.24950122833252,
13305
+ "learning_rate": 0.00018748472211882472,
13306
+ "loss": 3.6912,
13307
+ "step": 1878
13308
+ },
13309
+ {
13310
+ "epoch": 0.489705499087829,
13311
+ "grad_norm": 12.458173751831055,
13312
+ "learning_rate": 0.00018747146532405168,
13313
+ "loss": 4.5879,
13314
+ "step": 1879
13315
+ },
13316
+ {
13317
+ "epoch": 0.4899661193640865,
13318
+ "grad_norm": 11.879009246826172,
13319
+ "learning_rate": 0.00018745820852927864,
13320
+ "loss": 5.0259,
13321
+ "step": 1880
13322
+ },
13323
+ {
13324
+ "epoch": 0.490226739640344,
13325
+ "grad_norm": 12.405200958251953,
13326
+ "learning_rate": 0.00018744493718259037,
13327
+ "loss": 4.2239,
13328
+ "step": 1881
13329
+ },
13330
+ {
13331
+ "epoch": 0.49048735991660153,
13332
+ "grad_norm": 16.421504974365234,
13333
+ "learning_rate": 0.00018743165128398687,
13334
+ "loss": 4.807,
13335
+ "step": 1882
13336
+ },
13337
+ {
13338
+ "epoch": 0.490747980192859,
13339
+ "grad_norm": 14.629226684570312,
13340
+ "learning_rate": 0.0001874183799372986,
13341
+ "loss": 5.0214,
13342
+ "step": 1883
13343
+ },
13344
+ {
13345
+ "epoch": 0.4910086004691165,
13346
+ "grad_norm": 14.704447746276855,
13347
+ "learning_rate": 0.00018740507948677987,
13348
+ "loss": 5.0339,
13349
+ "step": 1884
13350
+ },
13351
+ {
13352
+ "epoch": 0.491269220745374,
13353
+ "grad_norm": 12.018187522888184,
13354
+ "learning_rate": 0.00018739179358817637,
13355
+ "loss": 5.4619,
13356
+ "step": 1885
13357
+ },
13358
+ {
13359
+ "epoch": 0.4915298410216315,
13360
+ "grad_norm": 19.699617385864258,
13361
+ "learning_rate": 0.00018737847858574241,
13362
+ "loss": 5.2781,
13363
+ "step": 1886
13364
+ },
13365
+ {
13366
+ "epoch": 0.491790461297889,
13367
+ "grad_norm": 13.180678367614746,
13368
+ "learning_rate": 0.00018736516358330846,
13369
+ "loss": 5.0445,
13370
+ "step": 1887
13371
+ },
13372
+ {
13373
+ "epoch": 0.49205108157414645,
13374
+ "grad_norm": 15.855171203613281,
13375
+ "learning_rate": 0.0001873518485808745,
13376
+ "loss": 4.6027,
13377
+ "step": 1888
13378
+ },
13379
+ {
13380
+ "epoch": 0.49231170185040396,
13381
+ "grad_norm": 11.354763984680176,
13382
+ "learning_rate": 0.00018733851902652532,
13383
+ "loss": 5.392,
13384
+ "step": 1889
13385
+ },
13386
+ {
13387
+ "epoch": 0.49257232212666147,
13388
+ "grad_norm": 10.087592124938965,
13389
+ "learning_rate": 0.00018732520402409136,
13390
+ "loss": 4.3135,
13391
+ "step": 1890
13392
+ },
13393
+ {
13394
+ "epoch": 0.49283294240291897,
13395
+ "grad_norm": 11.550101280212402,
13396
+ "learning_rate": 0.00018731185991782695,
13397
+ "loss": 4.9187,
13398
+ "step": 1891
13399
+ },
13400
+ {
13401
+ "epoch": 0.4930935626791764,
13402
+ "grad_norm": 13.736188888549805,
13403
+ "learning_rate": 0.00018729851581156254,
13404
+ "loss": 5.687,
13405
+ "step": 1892
13406
+ },
13407
+ {
13408
+ "epoch": 0.49335418295543393,
13409
+ "grad_norm": 12.104930877685547,
13410
+ "learning_rate": 0.00018728517170529813,
13411
+ "loss": 5.2789,
13412
+ "step": 1893
13413
+ },
13414
+ {
13415
+ "epoch": 0.49361480323169143,
13416
+ "grad_norm": 12.98553466796875,
13417
+ "learning_rate": 0.00018727181304711848,
13418
+ "loss": 4.5616,
13419
+ "step": 1894
13420
+ },
13421
+ {
13422
+ "epoch": 0.49387542350794894,
13423
+ "grad_norm": 11.326464653015137,
13424
+ "learning_rate": 0.00018725845438893884,
13425
+ "loss": 4.837,
13426
+ "step": 1895
13427
+ },
13428
+ {
13429
+ "epoch": 0.4941360437842064,
13430
+ "grad_norm": 12.19218921661377,
13431
+ "learning_rate": 0.00018724508117884398,
13432
+ "loss": 5.0118,
13433
+ "step": 1896
13434
+ },
13435
+ {
13436
+ "epoch": 0.4943966640604639,
13437
+ "grad_norm": 9.611741065979004,
13438
+ "learning_rate": 0.0001872317079687491,
13439
+ "loss": 5.421,
13440
+ "step": 1897
13441
+ },
13442
+ {
13443
+ "epoch": 0.4946572843367214,
13444
+ "grad_norm": 20.134742736816406,
13445
+ "learning_rate": 0.00018721833475865424,
13446
+ "loss": 4.8459,
13447
+ "step": 1898
13448
+ },
13449
+ {
13450
+ "epoch": 0.4949179046129789,
13451
+ "grad_norm": 18.743206024169922,
13452
+ "learning_rate": 0.00018720494699664414,
13453
+ "loss": 4.4803,
13454
+ "step": 1899
13455
+ },
13456
+ {
13457
+ "epoch": 0.49517852488923636,
13458
+ "grad_norm": 12.631272315979004,
13459
+ "learning_rate": 0.0001871915446827188,
13460
+ "loss": 3.5087,
13461
+ "step": 1900
13462
+ },
13463
+ {
13464
+ "epoch": 0.49517852488923636,
13465
+ "eval_loss": 2.3786075115203857,
13466
+ "eval_runtime": 27.2055,
13467
+ "eval_samples_per_second": 2.867,
13468
+ "eval_steps_per_second": 1.434,
13469
+ "step": 1900
13470
  }
13471
  ],
13472
  "logging_steps": 1,
 
13486
  "attributes": {}
13487
  }
13488
  },
13489
+ "total_flos": 3.397729721788662e+17,
13490
  "train_batch_size": 2,
13491
  "trial_name": null,
13492
  "trial_params": null