{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 127.95905310300704, "eval_steps": 500, "global_step": 200000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3198976327575176, "grad_norm": 4.1601386070251465, "learning_rate": 5e-06, "loss": 10.3279, "step": 500 }, { "epoch": 0.6397952655150352, "grad_norm": 4.366061687469482, "learning_rate": 1e-05, "loss": 9.3834, "step": 1000 }, { "epoch": 0.9596928982725528, "grad_norm": 4.784337043762207, "learning_rate": 1.5e-05, "loss": 8.8888, "step": 1500 }, { "epoch": 1.2795905310300704, "grad_norm": 3.9968652725219727, "learning_rate": 2e-05, "loss": 8.6568, "step": 2000 }, { "epoch": 1.599488163787588, "grad_norm": 4.402552127838135, "learning_rate": 2.5e-05, "loss": 8.5473, "step": 2500 }, { "epoch": 1.9193857965451055, "grad_norm": 4.639041423797607, "learning_rate": 3e-05, "loss": 8.4044, "step": 3000 }, { "epoch": 2.239283429302623, "grad_norm": 5.651747226715088, "learning_rate": 3.5e-05, "loss": 8.2868, "step": 3500 }, { "epoch": 2.5591810620601407, "grad_norm": 4.6999359130859375, "learning_rate": 4e-05, "loss": 8.1766, "step": 4000 }, { "epoch": 2.8790786948176583, "grad_norm": 4.838181495666504, "learning_rate": 4.499e-05, "loss": 8.1118, "step": 4500 }, { "epoch": 3.198976327575176, "grad_norm": 4.238831996917725, "learning_rate": 4.999e-05, "loss": 8.0038, "step": 5000 }, { "epoch": 3.5188739603326935, "grad_norm": 4.455530643463135, "learning_rate": 5.499000000000001e-05, "loss": 7.9014, "step": 5500 }, { "epoch": 3.838771593090211, "grad_norm": 5.811736583709717, "learning_rate": 5.999e-05, "loss": 7.8352, "step": 6000 }, { "epoch": 4.158669225847729, "grad_norm": 4.998301982879639, "learning_rate": 6.498e-05, "loss": 7.7613, "step": 6500 }, { "epoch": 4.478566858605246, "grad_norm": 5.011510848999023, "learning_rate": 6.998e-05, "loss": 7.6554, "step": 7000 }, { "epoch": 4.798464491362764, "grad_norm": 4.750300884246826, "learning_rate": 7.498e-05, "loss": 7.6109, "step": 7500 }, { "epoch": 5.1183621241202815, "grad_norm": 6.24017858505249, "learning_rate": 7.998e-05, "loss": 7.5186, "step": 8000 }, { "epoch": 5.438259756877799, "grad_norm": 6.061458587646484, "learning_rate": 8.497000000000001e-05, "loss": 7.3966, "step": 8500 }, { "epoch": 5.758157389635317, "grad_norm": 7.151447772979736, "learning_rate": 8.997000000000001e-05, "loss": 7.2877, "step": 9000 }, { "epoch": 6.078055022392834, "grad_norm": 7.578985214233398, "learning_rate": 9.497000000000001e-05, "loss": 7.1542, "step": 9500 }, { "epoch": 6.397952655150352, "grad_norm": 5.948920726776123, "learning_rate": 9.997e-05, "loss": 7.0008, "step": 10000 }, { "epoch": 6.717850287907869, "grad_norm": 8.036959648132324, "learning_rate": 9.982896551724137e-05, "loss": 6.8966, "step": 10500 }, { "epoch": 7.037747920665387, "grad_norm": 7.160433292388916, "learning_rate": 9.965655172413794e-05, "loss": 6.7509, "step": 11000 }, { "epoch": 7.357645553422905, "grad_norm": 5.934999465942383, "learning_rate": 9.948413793103449e-05, "loss": 6.5833, "step": 11500 }, { "epoch": 7.677543186180422, "grad_norm": 7.745622634887695, "learning_rate": 9.931172413793104e-05, "loss": 6.4975, "step": 12000 }, { "epoch": 7.99744081893794, "grad_norm": 7.0418477058410645, "learning_rate": 9.91393103448276e-05, "loss": 6.4261, "step": 12500 }, { "epoch": 8.317338451695457, "grad_norm": 6.101259708404541, "learning_rate": 9.896689655172414e-05, "loss": 6.2092, "step": 13000 }, { "epoch": 8.637236084452976, "grad_norm": 7.289799213409424, "learning_rate": 9.87944827586207e-05, "loss": 6.1436, "step": 13500 }, { "epoch": 8.957133717210493, "grad_norm": 8.126811027526855, "learning_rate": 9.862206896551725e-05, "loss": 6.0456, "step": 14000 }, { "epoch": 9.277031349968011, "grad_norm": 8.221816062927246, "learning_rate": 9.845000000000001e-05, "loss": 5.9141, "step": 14500 }, { "epoch": 9.596928982725528, "grad_norm": 7.361550331115723, "learning_rate": 9.827793103448277e-05, "loss": 5.8326, "step": 15000 }, { "epoch": 9.916826615483046, "grad_norm": 7.1737775802612305, "learning_rate": 9.810551724137932e-05, "loss": 5.7974, "step": 15500 }, { "epoch": 10.236724248240563, "grad_norm": 9.80185604095459, "learning_rate": 9.793310344827586e-05, "loss": 5.6282, "step": 16000 }, { "epoch": 10.556621880998081, "grad_norm": 7.2062153816223145, "learning_rate": 9.776068965517242e-05, "loss": 5.5619, "step": 16500 }, { "epoch": 10.876519513755598, "grad_norm": 10.801878929138184, "learning_rate": 9.758827586206896e-05, "loss": 5.5155, "step": 17000 }, { "epoch": 11.196417146513117, "grad_norm": 8.48509693145752, "learning_rate": 9.741586206896553e-05, "loss": 5.4259, "step": 17500 }, { "epoch": 11.516314779270633, "grad_norm": 8.47572135925293, "learning_rate": 9.724344827586207e-05, "loss": 5.3205, "step": 18000 }, { "epoch": 11.836212412028152, "grad_norm": 6.122796535491943, "learning_rate": 9.707103448275863e-05, "loss": 5.3025, "step": 18500 }, { "epoch": 12.156110044785668, "grad_norm": 8.210710525512695, "learning_rate": 9.689896551724139e-05, "loss": 5.2264, "step": 19000 }, { "epoch": 12.476007677543187, "grad_norm": 7.857537746429443, "learning_rate": 9.672655172413794e-05, "loss": 5.1395, "step": 19500 }, { "epoch": 12.795905310300704, "grad_norm": 7.743075370788574, "learning_rate": 9.655413793103448e-05, "loss": 5.1109, "step": 20000 }, { "epoch": 13.115802943058222, "grad_norm": 10.574569702148438, "learning_rate": 9.638172413793104e-05, "loss": 5.0794, "step": 20500 }, { "epoch": 13.435700575815739, "grad_norm": 8.313858985900879, "learning_rate": 9.620931034482758e-05, "loss": 4.921, "step": 21000 }, { "epoch": 13.755598208573257, "grad_norm": 9.096057891845703, "learning_rate": 9.603689655172414e-05, "loss": 4.96, "step": 21500 }, { "epoch": 14.075495841330774, "grad_norm": 8.402993202209473, "learning_rate": 9.58644827586207e-05, "loss": 4.9062, "step": 22000 }, { "epoch": 14.395393474088293, "grad_norm": 8.110074996948242, "learning_rate": 9.569206896551725e-05, "loss": 4.8026, "step": 22500 }, { "epoch": 14.71529110684581, "grad_norm": 7.908292293548584, "learning_rate": 9.552000000000001e-05, "loss": 4.82, "step": 23000 }, { "epoch": 15.035188739603328, "grad_norm": 7.991878986358643, "learning_rate": 9.534758620689655e-05, "loss": 4.7397, "step": 23500 }, { "epoch": 15.355086372360844, "grad_norm": 8.696029663085938, "learning_rate": 9.517551724137932e-05, "loss": 4.6656, "step": 24000 }, { "epoch": 15.674984005118363, "grad_norm": 9.421612739562988, "learning_rate": 9.500310344827586e-05, "loss": 4.6412, "step": 24500 }, { "epoch": 15.99488163787588, "grad_norm": 9.747482299804688, "learning_rate": 9.483068965517242e-05, "loss": 4.6048, "step": 25000 }, { "epoch": 16.314779270633398, "grad_norm": 10.389492988586426, "learning_rate": 9.465827586206897e-05, "loss": 4.481, "step": 25500 }, { "epoch": 16.634676903390915, "grad_norm": 8.661949157714844, "learning_rate": 9.448586206896553e-05, "loss": 4.4923, "step": 26000 }, { "epoch": 16.95457453614843, "grad_norm": 12.681297302246094, "learning_rate": 9.431344827586207e-05, "loss": 4.4816, "step": 26500 }, { "epoch": 17.27447216890595, "grad_norm": 8.993134498596191, "learning_rate": 9.414103448275863e-05, "loss": 4.3512, "step": 27000 }, { "epoch": 17.59436980166347, "grad_norm": 10.020146369934082, "learning_rate": 9.396862068965517e-05, "loss": 4.3447, "step": 27500 }, { "epoch": 17.914267434420985, "grad_norm": 9.514701843261719, "learning_rate": 9.379655172413794e-05, "loss": 4.3376, "step": 28000 }, { "epoch": 18.234165067178502, "grad_norm": 10.324498176574707, "learning_rate": 9.362413793103448e-05, "loss": 4.2612, "step": 28500 }, { "epoch": 18.554062699936022, "grad_norm": 10.682856559753418, "learning_rate": 9.345172413793104e-05, "loss": 4.226, "step": 29000 }, { "epoch": 18.87396033269354, "grad_norm": 7.883260726928711, "learning_rate": 9.327931034482758e-05, "loss": 4.19, "step": 29500 }, { "epoch": 19.193857965451055, "grad_norm": 12.470623016357422, "learning_rate": 9.310724137931035e-05, "loss": 4.1881, "step": 30000 }, { "epoch": 19.513755598208572, "grad_norm": 9.932331085205078, "learning_rate": 9.29348275862069e-05, "loss": 4.0853, "step": 30500 }, { "epoch": 19.833653230966092, "grad_norm": 8.153782844543457, "learning_rate": 9.276241379310345e-05, "loss": 4.1087, "step": 31000 }, { "epoch": 20.15355086372361, "grad_norm": 8.214093208312988, "learning_rate": 9.258999999999999e-05, "loss": 4.0751, "step": 31500 }, { "epoch": 20.473448496481126, "grad_norm": 11.927350044250488, "learning_rate": 9.241758620689656e-05, "loss": 3.9686, "step": 32000 }, { "epoch": 20.793346129238643, "grad_norm": 9.67835807800293, "learning_rate": 9.224551724137932e-05, "loss": 3.9745, "step": 32500 }, { "epoch": 21.113243761996163, "grad_norm": 9.911735534667969, "learning_rate": 9.207310344827586e-05, "loss": 3.9308, "step": 33000 }, { "epoch": 21.43314139475368, "grad_norm": 9.05053424835205, "learning_rate": 9.190068965517242e-05, "loss": 3.8718, "step": 33500 }, { "epoch": 21.753039027511196, "grad_norm": 9.588044166564941, "learning_rate": 9.172827586206897e-05, "loss": 3.8425, "step": 34000 }, { "epoch": 22.072936660268713, "grad_norm": 8.788230895996094, "learning_rate": 9.155620689655173e-05, "loss": 3.8617, "step": 34500 }, { "epoch": 22.392834293026233, "grad_norm": 9.435895919799805, "learning_rate": 9.138379310344827e-05, "loss": 3.7524, "step": 35000 }, { "epoch": 22.71273192578375, "grad_norm": 9.870182037353516, "learning_rate": 9.121137931034483e-05, "loss": 3.7916, "step": 35500 }, { "epoch": 23.032629558541267, "grad_norm": 9.612881660461426, "learning_rate": 9.103896551724139e-05, "loss": 3.8011, "step": 36000 }, { "epoch": 23.352527191298783, "grad_norm": 9.643827438354492, "learning_rate": 9.086689655172414e-05, "loss": 3.6478, "step": 36500 }, { "epoch": 23.672424824056304, "grad_norm": 14.105424880981445, "learning_rate": 9.069448275862069e-05, "loss": 3.6671, "step": 37000 }, { "epoch": 23.99232245681382, "grad_norm": 10.427962303161621, "learning_rate": 9.052206896551724e-05, "loss": 3.6809, "step": 37500 }, { "epoch": 24.312220089571337, "grad_norm": 11.505946159362793, "learning_rate": 9.03496551724138e-05, "loss": 3.553, "step": 38000 }, { "epoch": 24.632117722328854, "grad_norm": 10.393635749816895, "learning_rate": 9.017724137931035e-05, "loss": 3.5408, "step": 38500 }, { "epoch": 24.952015355086374, "grad_norm": 9.023842811584473, "learning_rate": 9.00051724137931e-05, "loss": 3.5915, "step": 39000 }, { "epoch": 25.27191298784389, "grad_norm": 10.69048023223877, "learning_rate": 8.983275862068967e-05, "loss": 3.4896, "step": 39500 }, { "epoch": 25.591810620601407, "grad_norm": 10.803936958312988, "learning_rate": 8.966034482758621e-05, "loss": 3.4854, "step": 40000 }, { "epoch": 25.911708253358924, "grad_norm": 10.489801406860352, "learning_rate": 8.948793103448276e-05, "loss": 3.4871, "step": 40500 }, { "epoch": 26.231605886116444, "grad_norm": 10.558309555053711, "learning_rate": 8.931586206896552e-05, "loss": 3.4186, "step": 41000 }, { "epoch": 26.55150351887396, "grad_norm": 12.186748504638672, "learning_rate": 8.914344827586208e-05, "loss": 3.4027, "step": 41500 }, { "epoch": 26.871401151631478, "grad_norm": 9.8623046875, "learning_rate": 8.897103448275862e-05, "loss": 3.4191, "step": 42000 }, { "epoch": 27.191298784388994, "grad_norm": 11.407792091369629, "learning_rate": 8.879862068965518e-05, "loss": 3.341, "step": 42500 }, { "epoch": 27.511196417146515, "grad_norm": 13.37617301940918, "learning_rate": 8.862655172413794e-05, "loss": 3.3137, "step": 43000 }, { "epoch": 27.83109404990403, "grad_norm": 10.30826187133789, "learning_rate": 8.845413793103449e-05, "loss": 3.3036, "step": 43500 }, { "epoch": 28.150991682661548, "grad_norm": 12.024778366088867, "learning_rate": 8.828172413793105e-05, "loss": 3.2678, "step": 44000 }, { "epoch": 28.470889315419065, "grad_norm": 9.730340957641602, "learning_rate": 8.810931034482759e-05, "loss": 3.1949, "step": 44500 }, { "epoch": 28.790786948176585, "grad_norm": 9.700602531433105, "learning_rate": 8.793689655172414e-05, "loss": 3.2541, "step": 45000 }, { "epoch": 29.1106845809341, "grad_norm": 12.359143257141113, "learning_rate": 8.77648275862069e-05, "loss": 3.2456, "step": 45500 }, { "epoch": 29.43058221369162, "grad_norm": 11.989018440246582, "learning_rate": 8.759241379310346e-05, "loss": 3.1154, "step": 46000 }, { "epoch": 29.750479846449135, "grad_norm": 10.904190063476562, "learning_rate": 8.742e-05, "loss": 3.175, "step": 46500 }, { "epoch": 30.070377479206655, "grad_norm": 11.253949165344238, "learning_rate": 8.724758620689656e-05, "loss": 3.1478, "step": 47000 }, { "epoch": 30.390275111964172, "grad_norm": 12.229791641235352, "learning_rate": 8.707517241379311e-05, "loss": 3.0632, "step": 47500 }, { "epoch": 30.71017274472169, "grad_norm": 9.516524314880371, "learning_rate": 8.690275862068967e-05, "loss": 3.0843, "step": 48000 }, { "epoch": 31.030070377479205, "grad_norm": 13.730731010437012, "learning_rate": 8.673034482758621e-05, "loss": 3.098, "step": 48500 }, { "epoch": 31.349968010236726, "grad_norm": 9.73539924621582, "learning_rate": 8.655827586206897e-05, "loss": 2.9611, "step": 49000 }, { "epoch": 31.669865642994242, "grad_norm": 12.066815376281738, "learning_rate": 8.638586206896552e-05, "loss": 2.9943, "step": 49500 }, { "epoch": 31.98976327575176, "grad_norm": 11.028585433959961, "learning_rate": 8.621344827586208e-05, "loss": 3.0424, "step": 50000 }, { "epoch": 32.30966090850928, "grad_norm": 11.2380952835083, "learning_rate": 8.604103448275862e-05, "loss": 2.9023, "step": 50500 }, { "epoch": 32.629558541266796, "grad_norm": 9.345772743225098, "learning_rate": 8.586862068965518e-05, "loss": 2.9586, "step": 51000 }, { "epoch": 32.94945617402431, "grad_norm": 10.239849090576172, "learning_rate": 8.569655172413793e-05, "loss": 2.9461, "step": 51500 }, { "epoch": 33.26935380678183, "grad_norm": 11.058523178100586, "learning_rate": 8.552413793103449e-05, "loss": 2.8453, "step": 52000 }, { "epoch": 33.589251439539346, "grad_norm": 12.131317138671875, "learning_rate": 8.535172413793105e-05, "loss": 2.8603, "step": 52500 }, { "epoch": 33.90914907229686, "grad_norm": 10.392476081848145, "learning_rate": 8.517931034482759e-05, "loss": 2.8817, "step": 53000 }, { "epoch": 34.22904670505438, "grad_norm": 10.749021530151367, "learning_rate": 8.500724137931036e-05, "loss": 2.8073, "step": 53500 }, { "epoch": 34.5489443378119, "grad_norm": 12.33171558380127, "learning_rate": 8.48348275862069e-05, "loss": 2.7793, "step": 54000 }, { "epoch": 34.86884197056942, "grad_norm": 12.961758613586426, "learning_rate": 8.466241379310346e-05, "loss": 2.8066, "step": 54500 }, { "epoch": 35.18873960332694, "grad_norm": 13.320075035095215, "learning_rate": 8.449e-05, "loss": 2.7459, "step": 55000 }, { "epoch": 35.50863723608445, "grad_norm": 14.416489601135254, "learning_rate": 8.431758620689655e-05, "loss": 2.7321, "step": 55500 }, { "epoch": 35.82853486884197, "grad_norm": 11.203073501586914, "learning_rate": 8.414551724137931e-05, "loss": 2.7486, "step": 56000 }, { "epoch": 36.14843250159949, "grad_norm": 10.463476181030273, "learning_rate": 8.397310344827587e-05, "loss": 2.7086, "step": 56500 }, { "epoch": 36.468330134357004, "grad_norm": 11.375761985778809, "learning_rate": 8.380068965517241e-05, "loss": 2.6387, "step": 57000 }, { "epoch": 36.78822776711452, "grad_norm": 11.649105072021484, "learning_rate": 8.362827586206897e-05, "loss": 2.6746, "step": 57500 }, { "epoch": 37.108125399872044, "grad_norm": 12.708244323730469, "learning_rate": 8.345586206896552e-05, "loss": 2.6454, "step": 58000 }, { "epoch": 37.42802303262956, "grad_norm": 12.876201629638672, "learning_rate": 8.328344827586208e-05, "loss": 2.5798, "step": 58500 }, { "epoch": 37.74792066538708, "grad_norm": 11.92346477508545, "learning_rate": 8.311103448275862e-05, "loss": 2.6589, "step": 59000 }, { "epoch": 38.067818298144594, "grad_norm": 10.742238998413086, "learning_rate": 8.293896551724138e-05, "loss": 2.5868, "step": 59500 }, { "epoch": 38.38771593090211, "grad_norm": 11.399048805236816, "learning_rate": 8.276655172413793e-05, "loss": 2.5124, "step": 60000 }, { "epoch": 38.70761356365963, "grad_norm": 13.563875198364258, "learning_rate": 8.259413793103449e-05, "loss": 2.576, "step": 60500 }, { "epoch": 39.027511196417144, "grad_norm": 11.297135353088379, "learning_rate": 8.242172413793103e-05, "loss": 2.5671, "step": 61000 }, { "epoch": 39.34740882917466, "grad_norm": 11.336121559143066, "learning_rate": 8.22496551724138e-05, "loss": 2.4445, "step": 61500 }, { "epoch": 39.667306461932185, "grad_norm": 9.477692604064941, "learning_rate": 8.207724137931035e-05, "loss": 2.4981, "step": 62000 }, { "epoch": 39.9872040946897, "grad_norm": 11.597848892211914, "learning_rate": 8.19048275862069e-05, "loss": 2.5382, "step": 62500 }, { "epoch": 40.30710172744722, "grad_norm": 14.910037994384766, "learning_rate": 8.173241379310346e-05, "loss": 2.4158, "step": 63000 }, { "epoch": 40.626999360204735, "grad_norm": 11.870673179626465, "learning_rate": 8.156e-05, "loss": 2.4395, "step": 63500 }, { "epoch": 40.94689699296225, "grad_norm": 15.279576301574707, "learning_rate": 8.138758620689655e-05, "loss": 2.4653, "step": 64000 }, { "epoch": 41.26679462571977, "grad_norm": 11.710406303405762, "learning_rate": 8.121551724137931e-05, "loss": 2.3567, "step": 64500 }, { "epoch": 41.586692258477285, "grad_norm": 10.663411140441895, "learning_rate": 8.104310344827587e-05, "loss": 2.3655, "step": 65000 }, { "epoch": 41.9065898912348, "grad_norm": 13.946629524230957, "learning_rate": 8.087068965517241e-05, "loss": 2.4336, "step": 65500 }, { "epoch": 42.226487523992326, "grad_norm": 13.782262802124023, "learning_rate": 8.069827586206898e-05, "loss": 2.3351, "step": 66000 }, { "epoch": 42.54638515674984, "grad_norm": 11.177961349487305, "learning_rate": 8.052586206896552e-05, "loss": 2.359, "step": 66500 }, { "epoch": 42.86628278950736, "grad_norm": 15.120301246643066, "learning_rate": 8.035344827586208e-05, "loss": 2.355, "step": 67000 }, { "epoch": 43.186180422264876, "grad_norm": 10.805267333984375, "learning_rate": 8.018103448275862e-05, "loss": 2.2905, "step": 67500 }, { "epoch": 43.50607805502239, "grad_norm": 11.777176856994629, "learning_rate": 8.000862068965517e-05, "loss": 2.2906, "step": 68000 }, { "epoch": 43.82597568777991, "grad_norm": 15.457807540893555, "learning_rate": 7.983689655172414e-05, "loss": 2.3269, "step": 68500 }, { "epoch": 44.145873320537426, "grad_norm": 11.639357566833496, "learning_rate": 7.966448275862069e-05, "loss": 2.2371, "step": 69000 }, { "epoch": 44.46577095329494, "grad_norm": 11.710591316223145, "learning_rate": 7.949206896551725e-05, "loss": 2.2248, "step": 69500 }, { "epoch": 44.785668586052466, "grad_norm": 12.675103187561035, "learning_rate": 7.93196551724138e-05, "loss": 2.2586, "step": 70000 }, { "epoch": 45.10556621880998, "grad_norm": 12.752120971679688, "learning_rate": 7.914724137931034e-05, "loss": 2.2489, "step": 70500 }, { "epoch": 45.4254638515675, "grad_norm": 11.379339218139648, "learning_rate": 7.89751724137931e-05, "loss": 2.1774, "step": 71000 }, { "epoch": 45.74536148432502, "grad_norm": 12.76633358001709, "learning_rate": 7.880275862068966e-05, "loss": 2.2007, "step": 71500 }, { "epoch": 46.06525911708253, "grad_norm": 11.421367645263672, "learning_rate": 7.863034482758621e-05, "loss": 2.2262, "step": 72000 }, { "epoch": 46.38515674984005, "grad_norm": 14.81748104095459, "learning_rate": 7.845793103448277e-05, "loss": 2.101, "step": 72500 }, { "epoch": 46.70505438259757, "grad_norm": 12.902971267700195, "learning_rate": 7.828551724137931e-05, "loss": 2.1568, "step": 73000 }, { "epoch": 47.02495201535508, "grad_norm": 10.685113906860352, "learning_rate": 7.811310344827587e-05, "loss": 2.1655, "step": 73500 }, { "epoch": 47.34484964811261, "grad_norm": 15.892518043518066, "learning_rate": 7.794068965517242e-05, "loss": 2.0551, "step": 74000 }, { "epoch": 47.664747280870124, "grad_norm": 13.730358123779297, "learning_rate": 7.776862068965518e-05, "loss": 2.1053, "step": 74500 }, { "epoch": 47.98464491362764, "grad_norm": 13.635787963867188, "learning_rate": 7.759620689655172e-05, "loss": 2.1408, "step": 75000 }, { "epoch": 48.30454254638516, "grad_norm": 12.861611366271973, "learning_rate": 7.742379310344828e-05, "loss": 2.0104, "step": 75500 }, { "epoch": 48.624440179142674, "grad_norm": 11.84931468963623, "learning_rate": 7.725137931034483e-05, "loss": 2.0555, "step": 76000 }, { "epoch": 48.94433781190019, "grad_norm": 15.812765121459961, "learning_rate": 7.70793103448276e-05, "loss": 2.1087, "step": 76500 }, { "epoch": 49.26423544465771, "grad_norm": 14.233431816101074, "learning_rate": 7.690689655172414e-05, "loss": 1.9998, "step": 77000 }, { "epoch": 49.584133077415224, "grad_norm": 14.329803466796875, "learning_rate": 7.673448275862069e-05, "loss": 2.0189, "step": 77500 }, { "epoch": 49.90403071017275, "grad_norm": 11.00400161743164, "learning_rate": 7.656206896551725e-05, "loss": 2.059, "step": 78000 }, { "epoch": 50.223928342930265, "grad_norm": 13.582133293151855, "learning_rate": 7.63896551724138e-05, "loss": 1.9753, "step": 78500 }, { "epoch": 50.54382597568778, "grad_norm": 12.560907363891602, "learning_rate": 7.621724137931034e-05, "loss": 1.9759, "step": 79000 }, { "epoch": 50.8637236084453, "grad_norm": 12.169915199279785, "learning_rate": 7.60451724137931e-05, "loss": 1.9944, "step": 79500 }, { "epoch": 51.183621241202815, "grad_norm": 13.5604248046875, "learning_rate": 7.587275862068966e-05, "loss": 1.9323, "step": 80000 }, { "epoch": 51.50351887396033, "grad_norm": 15.892741203308105, "learning_rate": 7.570034482758621e-05, "loss": 1.9095, "step": 80500 }, { "epoch": 51.82341650671785, "grad_norm": 13.435209274291992, "learning_rate": 7.552793103448276e-05, "loss": 1.9508, "step": 81000 }, { "epoch": 52.143314139475365, "grad_norm": 11.180010795593262, "learning_rate": 7.535586206896551e-05, "loss": 1.9216, "step": 81500 }, { "epoch": 52.46321177223289, "grad_norm": 12.792661666870117, "learning_rate": 7.518344827586207e-05, "loss": 1.8817, "step": 82000 }, { "epoch": 52.783109404990405, "grad_norm": 11.785886764526367, "learning_rate": 7.501103448275863e-05, "loss": 1.9121, "step": 82500 }, { "epoch": 53.10300703774792, "grad_norm": 10.568120002746582, "learning_rate": 7.483862068965518e-05, "loss": 1.8885, "step": 83000 }, { "epoch": 53.42290467050544, "grad_norm": 14.641459465026855, "learning_rate": 7.466620689655172e-05, "loss": 1.8357, "step": 83500 }, { "epoch": 53.742802303262955, "grad_norm": 13.5363187789917, "learning_rate": 7.449379310344828e-05, "loss": 1.8515, "step": 84000 }, { "epoch": 54.06269993602047, "grad_norm": 12.997908592224121, "learning_rate": 7.432172413793104e-05, "loss": 1.8681, "step": 84500 }, { "epoch": 54.38259756877799, "grad_norm": 12.53503131866455, "learning_rate": 7.414931034482759e-05, "loss": 1.7785, "step": 85000 }, { "epoch": 54.702495201535505, "grad_norm": 11.986194610595703, "learning_rate": 7.397689655172413e-05, "loss": 1.8507, "step": 85500 }, { "epoch": 55.02239283429303, "grad_norm": 12.089723587036133, "learning_rate": 7.380448275862069e-05, "loss": 1.8537, "step": 86000 }, { "epoch": 55.342290467050546, "grad_norm": 13.552453994750977, "learning_rate": 7.363206896551725e-05, "loss": 1.7622, "step": 86500 }, { "epoch": 55.66218809980806, "grad_norm": 12.03878116607666, "learning_rate": 7.346e-05, "loss": 1.8076, "step": 87000 }, { "epoch": 55.98208573256558, "grad_norm": 11.187782287597656, "learning_rate": 7.328758620689655e-05, "loss": 1.8241, "step": 87500 }, { "epoch": 56.301983365323096, "grad_norm": 14.924737930297852, "learning_rate": 7.311517241379312e-05, "loss": 1.7077, "step": 88000 }, { "epoch": 56.62188099808061, "grad_norm": 12.302467346191406, "learning_rate": 7.294275862068966e-05, "loss": 1.74, "step": 88500 }, { "epoch": 56.94177863083813, "grad_norm": 10.834394454956055, "learning_rate": 7.277034482758621e-05, "loss": 1.7827, "step": 89000 }, { "epoch": 57.261676263595646, "grad_norm": 14.356012344360352, "learning_rate": 7.259793103448276e-05, "loss": 1.7082, "step": 89500 }, { "epoch": 57.58157389635317, "grad_norm": 14.632678031921387, "learning_rate": 7.242551724137931e-05, "loss": 1.7045, "step": 90000 }, { "epoch": 57.90147152911069, "grad_norm": 13.501043319702148, "learning_rate": 7.225344827586207e-05, "loss": 1.7522, "step": 90500 }, { "epoch": 58.2213691618682, "grad_norm": 18.839614868164062, "learning_rate": 7.208103448275862e-05, "loss": 1.6801, "step": 91000 }, { "epoch": 58.54126679462572, "grad_norm": 13.41618824005127, "learning_rate": 7.190862068965517e-05, "loss": 1.7005, "step": 91500 }, { "epoch": 58.86116442738324, "grad_norm": 12.56169605255127, "learning_rate": 7.173620689655172e-05, "loss": 1.7018, "step": 92000 }, { "epoch": 59.18106206014075, "grad_norm": 13.447467803955078, "learning_rate": 7.15641379310345e-05, "loss": 1.6691, "step": 92500 }, { "epoch": 59.50095969289827, "grad_norm": 12.452493667602539, "learning_rate": 7.139172413793104e-05, "loss": 1.651, "step": 93000 }, { "epoch": 59.82085732565579, "grad_norm": 10.552214622497559, "learning_rate": 7.121931034482759e-05, "loss": 1.6916, "step": 93500 }, { "epoch": 60.14075495841331, "grad_norm": 11.099422454833984, "learning_rate": 7.104689655172413e-05, "loss": 1.6716, "step": 94000 }, { "epoch": 60.46065259117083, "grad_norm": 13.663276672363281, "learning_rate": 7.08744827586207e-05, "loss": 1.6105, "step": 94500 }, { "epoch": 60.780550223928344, "grad_norm": 11.783934593200684, "learning_rate": 7.070206896551725e-05, "loss": 1.6399, "step": 95000 }, { "epoch": 61.10044785668586, "grad_norm": 12.881340026855469, "learning_rate": 7.053e-05, "loss": 1.6058, "step": 95500 }, { "epoch": 61.42034548944338, "grad_norm": 12.405476570129395, "learning_rate": 7.035758620689656e-05, "loss": 1.5703, "step": 96000 }, { "epoch": 61.740243122200894, "grad_norm": 11.660452842712402, "learning_rate": 7.018517241379311e-05, "loss": 1.6002, "step": 96500 }, { "epoch": 62.06014075495841, "grad_norm": 11.69723892211914, "learning_rate": 7.001275862068966e-05, "loss": 1.6186, "step": 97000 }, { "epoch": 62.38003838771593, "grad_norm": 16.210947036743164, "learning_rate": 6.984034482758621e-05, "loss": 1.5663, "step": 97500 }, { "epoch": 62.69993602047345, "grad_norm": 11.853803634643555, "learning_rate": 6.966827586206897e-05, "loss": 1.5744, "step": 98000 }, { "epoch": 63.01983365323097, "grad_norm": 10.565818786621094, "learning_rate": 6.949586206896553e-05, "loss": 1.5829, "step": 98500 }, { "epoch": 63.339731285988485, "grad_norm": 11.621013641357422, "learning_rate": 6.932344827586207e-05, "loss": 1.5213, "step": 99000 }, { "epoch": 63.659628918746, "grad_norm": 10.182308197021484, "learning_rate": 6.915103448275862e-05, "loss": 1.5291, "step": 99500 }, { "epoch": 63.97952655150352, "grad_norm": 14.434243202209473, "learning_rate": 6.897862068965517e-05, "loss": 1.5612, "step": 100000 }, { "epoch": 64.29942418426104, "grad_norm": 12.513864517211914, "learning_rate": 6.880655172413794e-05, "loss": 1.5041, "step": 100500 }, { "epoch": 64.61932181701856, "grad_norm": 13.189037322998047, "learning_rate": 6.863413793103448e-05, "loss": 1.5097, "step": 101000 }, { "epoch": 64.93921944977608, "grad_norm": 11.867232322692871, "learning_rate": 6.846172413793104e-05, "loss": 1.5401, "step": 101500 }, { "epoch": 65.25911708253359, "grad_norm": 13.00894832611084, "learning_rate": 6.828931034482758e-05, "loss": 1.4581, "step": 102000 }, { "epoch": 65.57901471529111, "grad_norm": 11.719345092773438, "learning_rate": 6.811689655172415e-05, "loss": 1.4807, "step": 102500 }, { "epoch": 65.89891234804863, "grad_norm": 11.355063438415527, "learning_rate": 6.79444827586207e-05, "loss": 1.4892, "step": 103000 }, { "epoch": 66.21880998080614, "grad_norm": 13.351948738098145, "learning_rate": 6.777241379310345e-05, "loss": 1.4636, "step": 103500 }, { "epoch": 66.53870761356366, "grad_norm": 15.406342506408691, "learning_rate": 6.76e-05, "loss": 1.4678, "step": 104000 }, { "epoch": 66.85860524632118, "grad_norm": 14.357329368591309, "learning_rate": 6.742758620689656e-05, "loss": 1.4936, "step": 104500 }, { "epoch": 67.17850287907869, "grad_norm": 12.275686264038086, "learning_rate": 6.725517241379311e-05, "loss": 1.4566, "step": 105000 }, { "epoch": 67.49840051183621, "grad_norm": 13.198380470275879, "learning_rate": 6.708310344827586e-05, "loss": 1.4326, "step": 105500 }, { "epoch": 67.81829814459373, "grad_norm": 13.365631103515625, "learning_rate": 6.691068965517242e-05, "loss": 1.426, "step": 106000 }, { "epoch": 68.13819577735124, "grad_norm": 14.106985092163086, "learning_rate": 6.673827586206897e-05, "loss": 1.4289, "step": 106500 }, { "epoch": 68.45809341010876, "grad_norm": 10.076281547546387, "learning_rate": 6.656586206896553e-05, "loss": 1.396, "step": 107000 }, { "epoch": 68.77799104286628, "grad_norm": 14.63807201385498, "learning_rate": 6.639344827586207e-05, "loss": 1.3979, "step": 107500 }, { "epoch": 69.0978886756238, "grad_norm": 13.643959045410156, "learning_rate": 6.622137931034483e-05, "loss": 1.4274, "step": 108000 }, { "epoch": 69.41778630838132, "grad_norm": 11.819470405578613, "learning_rate": 6.604896551724138e-05, "loss": 1.3769, "step": 108500 }, { "epoch": 69.73768394113884, "grad_norm": 14.33261775970459, "learning_rate": 6.587655172413794e-05, "loss": 1.3825, "step": 109000 }, { "epoch": 70.05758157389636, "grad_norm": 10.918536186218262, "learning_rate": 6.570413793103448e-05, "loss": 1.3913, "step": 109500 }, { "epoch": 70.37747920665387, "grad_norm": 13.519926071166992, "learning_rate": 6.553172413793104e-05, "loss": 1.3341, "step": 110000 }, { "epoch": 70.69737683941139, "grad_norm": 12.5425386428833, "learning_rate": 6.535931034482759e-05, "loss": 1.3828, "step": 110500 }, { "epoch": 71.0172744721689, "grad_norm": 11.435805320739746, "learning_rate": 6.518724137931035e-05, "loss": 1.3821, "step": 111000 }, { "epoch": 71.33717210492642, "grad_norm": 12.65505313873291, "learning_rate": 6.501482758620689e-05, "loss": 1.3083, "step": 111500 }, { "epoch": 71.65706973768394, "grad_norm": 15.489115715026855, "learning_rate": 6.484241379310345e-05, "loss": 1.341, "step": 112000 }, { "epoch": 71.97696737044146, "grad_norm": 14.14395809173584, "learning_rate": 6.467e-05, "loss": 1.3579, "step": 112500 }, { "epoch": 72.29686500319897, "grad_norm": 13.708014488220215, "learning_rate": 6.449758620689656e-05, "loss": 1.3032, "step": 113000 }, { "epoch": 72.61676263595649, "grad_norm": 10.75635814666748, "learning_rate": 6.432551724137932e-05, "loss": 1.3045, "step": 113500 }, { "epoch": 72.93666026871401, "grad_norm": 12.12192440032959, "learning_rate": 6.415310344827586e-05, "loss": 1.3248, "step": 114000 }, { "epoch": 73.25655790147152, "grad_norm": 13.368456840515137, "learning_rate": 6.398068965517241e-05, "loss": 1.287, "step": 114500 }, { "epoch": 73.57645553422904, "grad_norm": 12.584633827209473, "learning_rate": 6.380827586206897e-05, "loss": 1.3015, "step": 115000 }, { "epoch": 73.89635316698656, "grad_norm": 13.863194465637207, "learning_rate": 6.363620689655173e-05, "loss": 1.2898, "step": 115500 }, { "epoch": 74.21625079974409, "grad_norm": 12.937112808227539, "learning_rate": 6.346379310344827e-05, "loss": 1.2456, "step": 116000 }, { "epoch": 74.5361484325016, "grad_norm": 11.274981498718262, "learning_rate": 6.329137931034484e-05, "loss": 1.2689, "step": 116500 }, { "epoch": 74.85604606525912, "grad_norm": 14.425061225891113, "learning_rate": 6.311896551724138e-05, "loss": 1.2752, "step": 117000 }, { "epoch": 75.17594369801664, "grad_norm": 11.654635429382324, "learning_rate": 6.294655172413794e-05, "loss": 1.2442, "step": 117500 }, { "epoch": 75.49584133077416, "grad_norm": 10.00129222869873, "learning_rate": 6.277413793103448e-05, "loss": 1.2506, "step": 118000 }, { "epoch": 75.81573896353167, "grad_norm": 11.665295600891113, "learning_rate": 6.260206896551725e-05, "loss": 1.2541, "step": 118500 }, { "epoch": 76.13563659628919, "grad_norm": 10.555766105651855, "learning_rate": 6.24296551724138e-05, "loss": 1.2486, "step": 119000 }, { "epoch": 76.4555342290467, "grad_norm": 14.879280090332031, "learning_rate": 6.225724137931035e-05, "loss": 1.2124, "step": 119500 }, { "epoch": 76.77543186180422, "grad_norm": 15.131136894226074, "learning_rate": 6.208482758620689e-05, "loss": 1.2549, "step": 120000 }, { "epoch": 77.09532949456174, "grad_norm": 9.889472961425781, "learning_rate": 6.191241379310345e-05, "loss": 1.2376, "step": 120500 }, { "epoch": 77.41522712731926, "grad_norm": 11.307145118713379, "learning_rate": 6.174e-05, "loss": 1.1958, "step": 121000 }, { "epoch": 77.73512476007677, "grad_norm": 14.303799629211426, "learning_rate": 6.156758620689656e-05, "loss": 1.2009, "step": 121500 }, { "epoch": 78.05502239283429, "grad_norm": 11.318217277526855, "learning_rate": 6.139517241379311e-05, "loss": 1.2215, "step": 122000 }, { "epoch": 78.3749200255918, "grad_norm": 13.979291915893555, "learning_rate": 6.122310344827586e-05, "loss": 1.1713, "step": 122500 }, { "epoch": 78.69481765834932, "grad_norm": 12.78084945678711, "learning_rate": 6.105103448275863e-05, "loss": 1.1901, "step": 123000 }, { "epoch": 79.01471529110684, "grad_norm": 10.332459449768066, "learning_rate": 6.087862068965517e-05, "loss": 1.2141, "step": 123500 }, { "epoch": 79.33461292386437, "grad_norm": 11.179670333862305, "learning_rate": 6.0706206896551735e-05, "loss": 1.1641, "step": 124000 }, { "epoch": 79.65451055662189, "grad_norm": 12.706995964050293, "learning_rate": 6.053379310344828e-05, "loss": 1.1887, "step": 124500 }, { "epoch": 79.9744081893794, "grad_norm": 12.575511932373047, "learning_rate": 6.036137931034483e-05, "loss": 1.1994, "step": 125000 }, { "epoch": 80.29430582213692, "grad_norm": 11.299592971801758, "learning_rate": 6.0189310344827584e-05, "loss": 1.1535, "step": 125500 }, { "epoch": 80.61420345489444, "grad_norm": 9.741961479187012, "learning_rate": 6.0016896551724147e-05, "loss": 1.1508, "step": 126000 }, { "epoch": 80.93410108765195, "grad_norm": 13.86517333984375, "learning_rate": 5.984448275862069e-05, "loss": 1.1453, "step": 126500 }, { "epoch": 81.25399872040947, "grad_norm": 12.810471534729004, "learning_rate": 5.9672068965517244e-05, "loss": 1.131, "step": 127000 }, { "epoch": 81.57389635316699, "grad_norm": 11.828211784362793, "learning_rate": 5.949965517241379e-05, "loss": 1.1462, "step": 127500 }, { "epoch": 81.8937939859245, "grad_norm": 13.588178634643555, "learning_rate": 5.932724137931035e-05, "loss": 1.1519, "step": 128000 }, { "epoch": 82.21369161868202, "grad_norm": 13.903426170349121, "learning_rate": 5.91548275862069e-05, "loss": 1.1222, "step": 128500 }, { "epoch": 82.53358925143954, "grad_norm": 13.447443962097168, "learning_rate": 5.898275862068966e-05, "loss": 1.1173, "step": 129000 }, { "epoch": 82.85348688419705, "grad_norm": 12.132195472717285, "learning_rate": 5.8810344827586205e-05, "loss": 1.1262, "step": 129500 }, { "epoch": 83.17338451695457, "grad_norm": 11.170686721801758, "learning_rate": 5.863793103448276e-05, "loss": 1.0957, "step": 130000 }, { "epoch": 83.49328214971209, "grad_norm": 12.57539176940918, "learning_rate": 5.846551724137931e-05, "loss": 1.0862, "step": 130500 }, { "epoch": 83.8131797824696, "grad_norm": 14.212547302246094, "learning_rate": 5.8293448275862074e-05, "loss": 1.0929, "step": 131000 }, { "epoch": 84.13307741522712, "grad_norm": 14.803600311279297, "learning_rate": 5.8121034482758616e-05, "loss": 1.0948, "step": 131500 }, { "epoch": 84.45297504798465, "grad_norm": 19.55899429321289, "learning_rate": 5.794862068965518e-05, "loss": 1.0788, "step": 132000 }, { "epoch": 84.77287268074217, "grad_norm": 11.086203575134277, "learning_rate": 5.7776206896551734e-05, "loss": 1.098, "step": 132500 }, { "epoch": 85.09277031349968, "grad_norm": 10.74999713897705, "learning_rate": 5.7603793103448276e-05, "loss": 1.0649, "step": 133000 }, { "epoch": 85.4126679462572, "grad_norm": 14.409449577331543, "learning_rate": 5.743137931034484e-05, "loss": 1.0592, "step": 133500 }, { "epoch": 85.73256557901472, "grad_norm": 10.215742111206055, "learning_rate": 5.725931034482759e-05, "loss": 1.0765, "step": 134000 }, { "epoch": 86.05246321177223, "grad_norm": 12.911944389343262, "learning_rate": 5.7086896551724146e-05, "loss": 1.0504, "step": 134500 }, { "epoch": 86.37236084452975, "grad_norm": 14.987035751342773, "learning_rate": 5.691448275862069e-05, "loss": 1.0141, "step": 135000 }, { "epoch": 86.69225847728727, "grad_norm": 11.989995002746582, "learning_rate": 5.674206896551725e-05, "loss": 1.0431, "step": 135500 }, { "epoch": 87.01215611004478, "grad_norm": 12.771849632263184, "learning_rate": 5.657e-05, "loss": 1.054, "step": 136000 }, { "epoch": 87.3320537428023, "grad_norm": 13.398333549499512, "learning_rate": 5.639758620689656e-05, "loss": 0.9984, "step": 136500 }, { "epoch": 87.65195137555982, "grad_norm": 10.814030647277832, "learning_rate": 5.6225172413793106e-05, "loss": 1.0283, "step": 137000 }, { "epoch": 87.97184900831734, "grad_norm": 12.13095760345459, "learning_rate": 5.605275862068966e-05, "loss": 1.0414, "step": 137500 }, { "epoch": 88.29174664107485, "grad_norm": 12.733049392700195, "learning_rate": 5.5880344827586204e-05, "loss": 1.0087, "step": 138000 }, { "epoch": 88.61164427383237, "grad_norm": 16.555213928222656, "learning_rate": 5.570827586206897e-05, "loss": 1.0062, "step": 138500 }, { "epoch": 88.93154190658989, "grad_norm": 11.025595664978027, "learning_rate": 5.553586206896552e-05, "loss": 1.0212, "step": 139000 }, { "epoch": 89.2514395393474, "grad_norm": 9.93308162689209, "learning_rate": 5.5363448275862074e-05, "loss": 0.998, "step": 139500 }, { "epoch": 89.57133717210493, "grad_norm": 14.131500244140625, "learning_rate": 5.519103448275862e-05, "loss": 0.9797, "step": 140000 }, { "epoch": 89.89123480486245, "grad_norm": 13.041298866271973, "learning_rate": 5.501862068965518e-05, "loss": 1.0205, "step": 140500 }, { "epoch": 90.21113243761997, "grad_norm": 10.885424613952637, "learning_rate": 5.484655172413793e-05, "loss": 0.9763, "step": 141000 }, { "epoch": 90.53103007037748, "grad_norm": 17.75884437561035, "learning_rate": 5.4674137931034485e-05, "loss": 0.9791, "step": 141500 }, { "epoch": 90.850927703135, "grad_norm": 15.903059005737305, "learning_rate": 5.4501724137931034e-05, "loss": 0.9823, "step": 142000 }, { "epoch": 91.17082533589252, "grad_norm": 12.409110069274902, "learning_rate": 5.432931034482759e-05, "loss": 0.9732, "step": 142500 }, { "epoch": 91.49072296865003, "grad_norm": 14.427364349365234, "learning_rate": 5.4157241379310355e-05, "loss": 0.9594, "step": 143000 }, { "epoch": 91.81062060140755, "grad_norm": 10.96267032623291, "learning_rate": 5.39848275862069e-05, "loss": 0.9852, "step": 143500 }, { "epoch": 92.13051823416507, "grad_norm": 9.344204902648926, "learning_rate": 5.381241379310345e-05, "loss": 0.9776, "step": 144000 }, { "epoch": 92.45041586692258, "grad_norm": 13.800095558166504, "learning_rate": 5.364e-05, "loss": 0.9477, "step": 144500 }, { "epoch": 92.7703134996801, "grad_norm": 14.4652099609375, "learning_rate": 5.346758620689656e-05, "loss": 0.9521, "step": 145000 }, { "epoch": 93.09021113243762, "grad_norm": 10.197824478149414, "learning_rate": 5.3295172413793106e-05, "loss": 0.9508, "step": 145500 }, { "epoch": 93.41010876519513, "grad_norm": 14.248830795288086, "learning_rate": 5.312310344827587e-05, "loss": 0.9298, "step": 146000 }, { "epoch": 93.73000639795265, "grad_norm": 15.136180877685547, "learning_rate": 5.295068965517241e-05, "loss": 0.9363, "step": 146500 }, { "epoch": 94.04990403071017, "grad_norm": 14.999555587768555, "learning_rate": 5.277862068965518e-05, "loss": 0.9574, "step": 147000 }, { "epoch": 94.36980166346768, "grad_norm": 10.511527061462402, "learning_rate": 5.260620689655172e-05, "loss": 0.9429, "step": 147500 }, { "epoch": 94.68969929622521, "grad_norm": 12.433847427368164, "learning_rate": 5.243379310344828e-05, "loss": 0.9327, "step": 148000 }, { "epoch": 95.00959692898273, "grad_norm": 11.800546646118164, "learning_rate": 5.2261379310344825e-05, "loss": 0.9363, "step": 148500 }, { "epoch": 95.32949456174025, "grad_norm": 11.03012466430664, "learning_rate": 5.208896551724138e-05, "loss": 0.9172, "step": 149000 }, { "epoch": 95.64939219449776, "grad_norm": 13.628169059753418, "learning_rate": 5.191655172413793e-05, "loss": 0.9101, "step": 149500 }, { "epoch": 95.96928982725528, "grad_norm": 11.726004600524902, "learning_rate": 5.1744137931034485e-05, "loss": 0.9292, "step": 150000 }, { "epoch": 96.2891874600128, "grad_norm": 9.179962158203125, "learning_rate": 5.1571724137931033e-05, "loss": 0.8977, "step": 150500 }, { "epoch": 96.60908509277031, "grad_norm": 11.146485328674316, "learning_rate": 5.139931034482759e-05, "loss": 0.9037, "step": 151000 }, { "epoch": 96.92898272552783, "grad_norm": 14.070140838623047, "learning_rate": 5.122724137931034e-05, "loss": 0.9075, "step": 151500 }, { "epoch": 97.24888035828535, "grad_norm": 12.702670097351074, "learning_rate": 5.1054827586206897e-05, "loss": 0.8721, "step": 152000 }, { "epoch": 97.56877799104286, "grad_norm": 11.813859939575195, "learning_rate": 5.088241379310346e-05, "loss": 0.8876, "step": 152500 }, { "epoch": 97.88867562380038, "grad_norm": 14.402729034423828, "learning_rate": 5.071e-05, "loss": 0.8986, "step": 153000 }, { "epoch": 98.2085732565579, "grad_norm": 11.126707077026367, "learning_rate": 5.0537586206896556e-05, "loss": 0.8569, "step": 153500 }, { "epoch": 98.52847088931541, "grad_norm": 13.64499282836914, "learning_rate": 5.0365172413793105e-05, "loss": 0.8774, "step": 154000 }, { "epoch": 98.84836852207293, "grad_norm": 13.022969245910645, "learning_rate": 5.019275862068966e-05, "loss": 0.8801, "step": 154500 }, { "epoch": 99.16826615483045, "grad_norm": 12.945636749267578, "learning_rate": 5.002034482758621e-05, "loss": 0.8597, "step": 155000 }, { "epoch": 99.48816378758798, "grad_norm": 12.05784797668457, "learning_rate": 4.984827586206897e-05, "loss": 0.8477, "step": 155500 }, { "epoch": 99.8080614203455, "grad_norm": 11.149604797363281, "learning_rate": 4.967586206896552e-05, "loss": 0.8533, "step": 156000 }, { "epoch": 100.12795905310301, "grad_norm": 12.993003845214844, "learning_rate": 4.950344827586207e-05, "loss": 0.8722, "step": 156500 }, { "epoch": 100.44785668586053, "grad_norm": 12.33105754852295, "learning_rate": 4.933103448275863e-05, "loss": 0.8397, "step": 157000 }, { "epoch": 100.76775431861805, "grad_norm": 12.321619987487793, "learning_rate": 4.9158965517241387e-05, "loss": 0.8548, "step": 157500 }, { "epoch": 101.08765195137556, "grad_norm": 13.47990894317627, "learning_rate": 4.8986896551724145e-05, "loss": 0.8493, "step": 158000 }, { "epoch": 101.40754958413308, "grad_norm": 10.382761001586914, "learning_rate": 4.8814482758620694e-05, "loss": 0.8434, "step": 158500 }, { "epoch": 101.7274472168906, "grad_norm": 8.643112182617188, "learning_rate": 4.864206896551724e-05, "loss": 0.8408, "step": 159000 }, { "epoch": 102.04734484964811, "grad_norm": 10.81409740447998, "learning_rate": 4.84696551724138e-05, "loss": 0.8415, "step": 159500 }, { "epoch": 102.36724248240563, "grad_norm": 11.696605682373047, "learning_rate": 4.829758620689656e-05, "loss": 0.8305, "step": 160000 }, { "epoch": 102.68714011516315, "grad_norm": 13.345202445983887, "learning_rate": 4.8125172413793106e-05, "loss": 0.8271, "step": 160500 }, { "epoch": 103.00703774792066, "grad_norm": 11.675226211547852, "learning_rate": 4.795275862068966e-05, "loss": 0.8491, "step": 161000 }, { "epoch": 103.32693538067818, "grad_norm": 12.083547592163086, "learning_rate": 4.778034482758621e-05, "loss": 0.8052, "step": 161500 }, { "epoch": 103.6468330134357, "grad_norm": 9.721264839172363, "learning_rate": 4.760793103448276e-05, "loss": 0.8128, "step": 162000 }, { "epoch": 103.96673064619321, "grad_norm": 13.526360511779785, "learning_rate": 4.743586206896552e-05, "loss": 0.8225, "step": 162500 }, { "epoch": 104.28662827895073, "grad_norm": 14.503246307373047, "learning_rate": 4.726344827586207e-05, "loss": 0.8014, "step": 163000 }, { "epoch": 104.60652591170825, "grad_norm": 12.239891052246094, "learning_rate": 4.709103448275862e-05, "loss": 0.8032, "step": 163500 }, { "epoch": 104.92642354446578, "grad_norm": 12.229057312011719, "learning_rate": 4.691862068965517e-05, "loss": 0.8073, "step": 164000 }, { "epoch": 105.2463211772233, "grad_norm": 11.960144996643066, "learning_rate": 4.6746206896551726e-05, "loss": 0.7956, "step": 164500 }, { "epoch": 105.56621880998081, "grad_norm": 13.662198066711426, "learning_rate": 4.6573793103448275e-05, "loss": 0.7875, "step": 165000 }, { "epoch": 105.88611644273833, "grad_norm": 13.428373336791992, "learning_rate": 4.6401724137931034e-05, "loss": 0.7967, "step": 165500 }, { "epoch": 106.20601407549584, "grad_norm": 13.212657928466797, "learning_rate": 4.622931034482759e-05, "loss": 0.7734, "step": 166000 }, { "epoch": 106.52591170825336, "grad_norm": 13.421162605285645, "learning_rate": 4.605689655172414e-05, "loss": 0.7691, "step": 166500 }, { "epoch": 106.84580934101088, "grad_norm": 10.209512710571289, "learning_rate": 4.588448275862069e-05, "loss": 0.7952, "step": 167000 }, { "epoch": 107.1657069737684, "grad_norm": 10.482810020446777, "learning_rate": 4.571206896551725e-05, "loss": 0.7837, "step": 167500 }, { "epoch": 107.48560460652591, "grad_norm": 13.598471641540527, "learning_rate": 4.55396551724138e-05, "loss": 0.778, "step": 168000 }, { "epoch": 107.80550223928343, "grad_norm": 12.402639389038086, "learning_rate": 4.5367241379310346e-05, "loss": 0.7711, "step": 168500 }, { "epoch": 108.12539987204094, "grad_norm": 12.243593215942383, "learning_rate": 4.51948275862069e-05, "loss": 0.773, "step": 169000 }, { "epoch": 108.44529750479846, "grad_norm": 10.736000061035156, "learning_rate": 4.502275862068966e-05, "loss": 0.7477, "step": 169500 }, { "epoch": 108.76519513755598, "grad_norm": 11.13589096069336, "learning_rate": 4.485034482758621e-05, "loss": 0.7669, "step": 170000 }, { "epoch": 109.0850927703135, "grad_norm": 10.14847183227539, "learning_rate": 4.4677931034482765e-05, "loss": 0.7618, "step": 170500 }, { "epoch": 109.40499040307101, "grad_norm": 10.636765480041504, "learning_rate": 4.4505517241379314e-05, "loss": 0.7362, "step": 171000 }, { "epoch": 109.72488803582854, "grad_norm": 12.350906372070312, "learning_rate": 4.433344827586207e-05, "loss": 0.7578, "step": 171500 }, { "epoch": 110.04478566858606, "grad_norm": 13.237043380737305, "learning_rate": 4.416103448275862e-05, "loss": 0.7662, "step": 172000 }, { "epoch": 110.36468330134358, "grad_norm": 8.747899055480957, "learning_rate": 4.398862068965518e-05, "loss": 0.7306, "step": 172500 }, { "epoch": 110.68458093410109, "grad_norm": 11.915460586547852, "learning_rate": 4.3816206896551725e-05, "loss": 0.7604, "step": 173000 }, { "epoch": 111.00447856685861, "grad_norm": 10.675039291381836, "learning_rate": 4.3643793103448274e-05, "loss": 0.7348, "step": 173500 }, { "epoch": 111.32437619961613, "grad_norm": 11.528447151184082, "learning_rate": 4.347172413793103e-05, "loss": 0.7297, "step": 174000 }, { "epoch": 111.64427383237364, "grad_norm": 11.974442481994629, "learning_rate": 4.329931034482759e-05, "loss": 0.7398, "step": 174500 }, { "epoch": 111.96417146513116, "grad_norm": 10.059257507324219, "learning_rate": 4.312689655172414e-05, "loss": 0.7337, "step": 175000 }, { "epoch": 112.28406909788868, "grad_norm": 11.215494155883789, "learning_rate": 4.295448275862069e-05, "loss": 0.7154, "step": 175500 }, { "epoch": 112.60396673064619, "grad_norm": 11.685689926147461, "learning_rate": 4.278241379310345e-05, "loss": 0.7268, "step": 176000 }, { "epoch": 112.92386436340371, "grad_norm": 12.056086540222168, "learning_rate": 4.261e-05, "loss": 0.7214, "step": 176500 }, { "epoch": 113.24376199616123, "grad_norm": 10.17962646484375, "learning_rate": 4.243758620689655e-05, "loss": 0.7309, "step": 177000 }, { "epoch": 113.56365962891874, "grad_norm": 13.576325416564941, "learning_rate": 4.226517241379311e-05, "loss": 0.7047, "step": 177500 }, { "epoch": 113.88355726167626, "grad_norm": 10.385096549987793, "learning_rate": 4.209275862068966e-05, "loss": 0.7298, "step": 178000 }, { "epoch": 114.20345489443378, "grad_norm": 10.976679801940918, "learning_rate": 4.192068965517242e-05, "loss": 0.7079, "step": 178500 }, { "epoch": 114.52335252719129, "grad_norm": 14.960927963256836, "learning_rate": 4.174827586206897e-05, "loss": 0.7004, "step": 179000 }, { "epoch": 114.84325015994882, "grad_norm": 11.473701477050781, "learning_rate": 4.157586206896552e-05, "loss": 0.7137, "step": 179500 }, { "epoch": 115.16314779270634, "grad_norm": 11.255741119384766, "learning_rate": 4.140344827586207e-05, "loss": 0.6994, "step": 180000 }, { "epoch": 115.48304542546386, "grad_norm": 11.247090339660645, "learning_rate": 4.123103448275862e-05, "loss": 0.6822, "step": 180500 }, { "epoch": 115.80294305822137, "grad_norm": 10.883934020996094, "learning_rate": 4.1058620689655176e-05, "loss": 0.6902, "step": 181000 }, { "epoch": 116.12284069097889, "grad_norm": 12.544395446777344, "learning_rate": 4.0886551724137935e-05, "loss": 0.6853, "step": 181500 }, { "epoch": 116.4427383237364, "grad_norm": 10.791812896728516, "learning_rate": 4.0714137931034484e-05, "loss": 0.6785, "step": 182000 }, { "epoch": 116.76263595649392, "grad_norm": 13.567925453186035, "learning_rate": 4.054172413793104e-05, "loss": 0.6904, "step": 182500 }, { "epoch": 117.08253358925144, "grad_norm": 10.842440605163574, "learning_rate": 4.036931034482759e-05, "loss": 0.6851, "step": 183000 }, { "epoch": 117.40243122200896, "grad_norm": 13.358929634094238, "learning_rate": 4.0196896551724136e-05, "loss": 0.6629, "step": 183500 }, { "epoch": 117.72232885476647, "grad_norm": 12.40263557434082, "learning_rate": 4.002448275862069e-05, "loss": 0.666, "step": 184000 }, { "epoch": 118.04222648752399, "grad_norm": 12.171306610107422, "learning_rate": 3.985241379310345e-05, "loss": 0.6875, "step": 184500 }, { "epoch": 118.3621241202815, "grad_norm": 11.05837631225586, "learning_rate": 3.968e-05, "loss": 0.6598, "step": 185000 }, { "epoch": 118.68202175303902, "grad_norm": 13.27622127532959, "learning_rate": 3.9507586206896555e-05, "loss": 0.6735, "step": 185500 }, { "epoch": 119.00191938579654, "grad_norm": 10.379920959472656, "learning_rate": 3.9335172413793104e-05, "loss": 0.6849, "step": 186000 }, { "epoch": 119.32181701855406, "grad_norm": 13.972020149230957, "learning_rate": 3.916275862068965e-05, "loss": 0.6641, "step": 186500 }, { "epoch": 119.64171465131157, "grad_norm": 11.595196723937988, "learning_rate": 3.899068965517241e-05, "loss": 0.6577, "step": 187000 }, { "epoch": 119.9616122840691, "grad_norm": 11.496007919311523, "learning_rate": 3.881827586206897e-05, "loss": 0.6601, "step": 187500 }, { "epoch": 120.28150991682662, "grad_norm": 10.272443771362305, "learning_rate": 3.864586206896552e-05, "loss": 0.6365, "step": 188000 }, { "epoch": 120.60140754958414, "grad_norm": 11.38764762878418, "learning_rate": 3.847344827586207e-05, "loss": 0.6484, "step": 188500 }, { "epoch": 120.92130518234165, "grad_norm": 14.054584503173828, "learning_rate": 3.8301034482758627e-05, "loss": 0.6662, "step": 189000 }, { "epoch": 121.24120281509917, "grad_norm": 10.214823722839355, "learning_rate": 3.8128620689655175e-05, "loss": 0.6378, "step": 189500 }, { "epoch": 121.56110044785669, "grad_norm": 10.101186752319336, "learning_rate": 3.7956551724137934e-05, "loss": 0.6458, "step": 190000 }, { "epoch": 121.8809980806142, "grad_norm": 9.963375091552734, "learning_rate": 3.778413793103448e-05, "loss": 0.6518, "step": 190500 }, { "epoch": 122.20089571337172, "grad_norm": 12.217530250549316, "learning_rate": 3.761172413793104e-05, "loss": 0.6382, "step": 191000 }, { "epoch": 122.52079334612924, "grad_norm": 13.154520034790039, "learning_rate": 3.743931034482759e-05, "loss": 0.6272, "step": 191500 }, { "epoch": 122.84069097888676, "grad_norm": 12.079745292663574, "learning_rate": 3.7267241379310346e-05, "loss": 0.6496, "step": 192000 }, { "epoch": 123.16058861164427, "grad_norm": 9.851350784301758, "learning_rate": 3.70948275862069e-05, "loss": 0.6253, "step": 192500 }, { "epoch": 123.48048624440179, "grad_norm": 10.869081497192383, "learning_rate": 3.692241379310345e-05, "loss": 0.6307, "step": 193000 }, { "epoch": 123.8003838771593, "grad_norm": 16.297130584716797, "learning_rate": 3.675034482758621e-05, "loss": 0.6336, "step": 193500 }, { "epoch": 124.12028150991682, "grad_norm": 11.86780834197998, "learning_rate": 3.657793103448276e-05, "loss": 0.626, "step": 194000 }, { "epoch": 124.44017914267434, "grad_norm": 12.359445571899414, "learning_rate": 3.640551724137931e-05, "loss": 0.6068, "step": 194500 }, { "epoch": 124.76007677543186, "grad_norm": 10.589780807495117, "learning_rate": 3.623310344827586e-05, "loss": 0.6388, "step": 195000 }, { "epoch": 125.07997440818939, "grad_norm": 11.763408660888672, "learning_rate": 3.606068965517241e-05, "loss": 0.6132, "step": 195500 }, { "epoch": 125.3998720409469, "grad_norm": 11.668269157409668, "learning_rate": 3.5888275862068966e-05, "loss": 0.602, "step": 196000 }, { "epoch": 125.71976967370442, "grad_norm": 11.628352165222168, "learning_rate": 3.5715862068965515e-05, "loss": 0.6196, "step": 196500 }, { "epoch": 126.03966730646194, "grad_norm": 11.364988327026367, "learning_rate": 3.554344827586207e-05, "loss": 0.6197, "step": 197000 }, { "epoch": 126.35956493921945, "grad_norm": 10.588140487670898, "learning_rate": 3.5371034482758626e-05, "loss": 0.6017, "step": 197500 }, { "epoch": 126.67946257197697, "grad_norm": 13.472407341003418, "learning_rate": 3.5198965517241385e-05, "loss": 0.6145, "step": 198000 }, { "epoch": 126.99936020473449, "grad_norm": 10.752822875976562, "learning_rate": 3.502655172413793e-05, "loss": 0.6211, "step": 198500 }, { "epoch": 127.319257837492, "grad_norm": 11.411170959472656, "learning_rate": 3.485413793103449e-05, "loss": 0.5919, "step": 199000 }, { "epoch": 127.63915547024952, "grad_norm": 12.288487434387207, "learning_rate": 3.468172413793104e-05, "loss": 0.6032, "step": 199500 }, { "epoch": 127.95905310300704, "grad_norm": 8.647520065307617, "learning_rate": 3.4509310344827586e-05, "loss": 0.6228, "step": 200000 } ], "logging_steps": 500, "max_steps": 300000, "num_input_tokens_seen": 0, "num_train_epochs": 192, "save_steps": 20000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.2457759656933786e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }