| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 89.57133717210493, |
| "eval_steps": 500, |
| "global_step": 140000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.3198976327575176, |
| "grad_norm": 4.1601386070251465, |
| "learning_rate": 5e-06, |
| "loss": 10.3279, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.6397952655150352, |
| "grad_norm": 4.366061687469482, |
| "learning_rate": 1e-05, |
| "loss": 9.3834, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.9596928982725528, |
| "grad_norm": 4.784337043762207, |
| "learning_rate": 1.5e-05, |
| "loss": 8.8888, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.2795905310300704, |
| "grad_norm": 3.9968652725219727, |
| "learning_rate": 2e-05, |
| "loss": 8.6568, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.599488163787588, |
| "grad_norm": 4.402552127838135, |
| "learning_rate": 2.5e-05, |
| "loss": 8.5473, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.9193857965451055, |
| "grad_norm": 4.639041423797607, |
| "learning_rate": 3e-05, |
| "loss": 8.4044, |
| "step": 3000 |
| }, |
| { |
| "epoch": 2.239283429302623, |
| "grad_norm": 5.651747226715088, |
| "learning_rate": 3.5e-05, |
| "loss": 8.2868, |
| "step": 3500 |
| }, |
| { |
| "epoch": 2.5591810620601407, |
| "grad_norm": 4.6999359130859375, |
| "learning_rate": 4e-05, |
| "loss": 8.1766, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.8790786948176583, |
| "grad_norm": 4.838181495666504, |
| "learning_rate": 4.499e-05, |
| "loss": 8.1118, |
| "step": 4500 |
| }, |
| { |
| "epoch": 3.198976327575176, |
| "grad_norm": 4.238831996917725, |
| "learning_rate": 4.999e-05, |
| "loss": 8.0038, |
| "step": 5000 |
| }, |
| { |
| "epoch": 3.5188739603326935, |
| "grad_norm": 4.455530643463135, |
| "learning_rate": 5.499000000000001e-05, |
| "loss": 7.9014, |
| "step": 5500 |
| }, |
| { |
| "epoch": 3.838771593090211, |
| "grad_norm": 5.811736583709717, |
| "learning_rate": 5.999e-05, |
| "loss": 7.8352, |
| "step": 6000 |
| }, |
| { |
| "epoch": 4.158669225847729, |
| "grad_norm": 4.998301982879639, |
| "learning_rate": 6.498e-05, |
| "loss": 7.7613, |
| "step": 6500 |
| }, |
| { |
| "epoch": 4.478566858605246, |
| "grad_norm": 5.011510848999023, |
| "learning_rate": 6.998e-05, |
| "loss": 7.6554, |
| "step": 7000 |
| }, |
| { |
| "epoch": 4.798464491362764, |
| "grad_norm": 4.750300884246826, |
| "learning_rate": 7.498e-05, |
| "loss": 7.6109, |
| "step": 7500 |
| }, |
| { |
| "epoch": 5.1183621241202815, |
| "grad_norm": 6.24017858505249, |
| "learning_rate": 7.998e-05, |
| "loss": 7.5186, |
| "step": 8000 |
| }, |
| { |
| "epoch": 5.438259756877799, |
| "grad_norm": 6.061458587646484, |
| "learning_rate": 8.497000000000001e-05, |
| "loss": 7.3966, |
| "step": 8500 |
| }, |
| { |
| "epoch": 5.758157389635317, |
| "grad_norm": 7.151447772979736, |
| "learning_rate": 8.997000000000001e-05, |
| "loss": 7.2877, |
| "step": 9000 |
| }, |
| { |
| "epoch": 6.078055022392834, |
| "grad_norm": 7.578985214233398, |
| "learning_rate": 9.497000000000001e-05, |
| "loss": 7.1542, |
| "step": 9500 |
| }, |
| { |
| "epoch": 6.397952655150352, |
| "grad_norm": 5.948920726776123, |
| "learning_rate": 9.997e-05, |
| "loss": 7.0008, |
| "step": 10000 |
| }, |
| { |
| "epoch": 6.717850287907869, |
| "grad_norm": 8.036959648132324, |
| "learning_rate": 9.982896551724137e-05, |
| "loss": 6.8966, |
| "step": 10500 |
| }, |
| { |
| "epoch": 7.037747920665387, |
| "grad_norm": 7.160433292388916, |
| "learning_rate": 9.965655172413794e-05, |
| "loss": 6.7509, |
| "step": 11000 |
| }, |
| { |
| "epoch": 7.357645553422905, |
| "grad_norm": 5.934999465942383, |
| "learning_rate": 9.948413793103449e-05, |
| "loss": 6.5833, |
| "step": 11500 |
| }, |
| { |
| "epoch": 7.677543186180422, |
| "grad_norm": 7.745622634887695, |
| "learning_rate": 9.931172413793104e-05, |
| "loss": 6.4975, |
| "step": 12000 |
| }, |
| { |
| "epoch": 7.99744081893794, |
| "grad_norm": 7.0418477058410645, |
| "learning_rate": 9.91393103448276e-05, |
| "loss": 6.4261, |
| "step": 12500 |
| }, |
| { |
| "epoch": 8.317338451695457, |
| "grad_norm": 6.101259708404541, |
| "learning_rate": 9.896689655172414e-05, |
| "loss": 6.2092, |
| "step": 13000 |
| }, |
| { |
| "epoch": 8.637236084452976, |
| "grad_norm": 7.289799213409424, |
| "learning_rate": 9.87944827586207e-05, |
| "loss": 6.1436, |
| "step": 13500 |
| }, |
| { |
| "epoch": 8.957133717210493, |
| "grad_norm": 8.126811027526855, |
| "learning_rate": 9.862206896551725e-05, |
| "loss": 6.0456, |
| "step": 14000 |
| }, |
| { |
| "epoch": 9.277031349968011, |
| "grad_norm": 8.221816062927246, |
| "learning_rate": 9.845000000000001e-05, |
| "loss": 5.9141, |
| "step": 14500 |
| }, |
| { |
| "epoch": 9.596928982725528, |
| "grad_norm": 7.361550331115723, |
| "learning_rate": 9.827793103448277e-05, |
| "loss": 5.8326, |
| "step": 15000 |
| }, |
| { |
| "epoch": 9.916826615483046, |
| "grad_norm": 7.1737775802612305, |
| "learning_rate": 9.810551724137932e-05, |
| "loss": 5.7974, |
| "step": 15500 |
| }, |
| { |
| "epoch": 10.236724248240563, |
| "grad_norm": 9.80185604095459, |
| "learning_rate": 9.793310344827586e-05, |
| "loss": 5.6282, |
| "step": 16000 |
| }, |
| { |
| "epoch": 10.556621880998081, |
| "grad_norm": 7.2062153816223145, |
| "learning_rate": 9.776068965517242e-05, |
| "loss": 5.5619, |
| "step": 16500 |
| }, |
| { |
| "epoch": 10.876519513755598, |
| "grad_norm": 10.801878929138184, |
| "learning_rate": 9.758827586206896e-05, |
| "loss": 5.5155, |
| "step": 17000 |
| }, |
| { |
| "epoch": 11.196417146513117, |
| "grad_norm": 8.48509693145752, |
| "learning_rate": 9.741586206896553e-05, |
| "loss": 5.4259, |
| "step": 17500 |
| }, |
| { |
| "epoch": 11.516314779270633, |
| "grad_norm": 8.47572135925293, |
| "learning_rate": 9.724344827586207e-05, |
| "loss": 5.3205, |
| "step": 18000 |
| }, |
| { |
| "epoch": 11.836212412028152, |
| "grad_norm": 6.122796535491943, |
| "learning_rate": 9.707103448275863e-05, |
| "loss": 5.3025, |
| "step": 18500 |
| }, |
| { |
| "epoch": 12.156110044785668, |
| "grad_norm": 8.210710525512695, |
| "learning_rate": 9.689896551724139e-05, |
| "loss": 5.2264, |
| "step": 19000 |
| }, |
| { |
| "epoch": 12.476007677543187, |
| "grad_norm": 7.857537746429443, |
| "learning_rate": 9.672655172413794e-05, |
| "loss": 5.1395, |
| "step": 19500 |
| }, |
| { |
| "epoch": 12.795905310300704, |
| "grad_norm": 7.743075370788574, |
| "learning_rate": 9.655413793103448e-05, |
| "loss": 5.1109, |
| "step": 20000 |
| }, |
| { |
| "epoch": 13.115802943058222, |
| "grad_norm": 10.574569702148438, |
| "learning_rate": 9.638172413793104e-05, |
| "loss": 5.0794, |
| "step": 20500 |
| }, |
| { |
| "epoch": 13.435700575815739, |
| "grad_norm": 8.313858985900879, |
| "learning_rate": 9.620931034482758e-05, |
| "loss": 4.921, |
| "step": 21000 |
| }, |
| { |
| "epoch": 13.755598208573257, |
| "grad_norm": 9.096057891845703, |
| "learning_rate": 9.603689655172414e-05, |
| "loss": 4.96, |
| "step": 21500 |
| }, |
| { |
| "epoch": 14.075495841330774, |
| "grad_norm": 8.402993202209473, |
| "learning_rate": 9.58644827586207e-05, |
| "loss": 4.9062, |
| "step": 22000 |
| }, |
| { |
| "epoch": 14.395393474088293, |
| "grad_norm": 8.110074996948242, |
| "learning_rate": 9.569206896551725e-05, |
| "loss": 4.8026, |
| "step": 22500 |
| }, |
| { |
| "epoch": 14.71529110684581, |
| "grad_norm": 7.908292293548584, |
| "learning_rate": 9.552000000000001e-05, |
| "loss": 4.82, |
| "step": 23000 |
| }, |
| { |
| "epoch": 15.035188739603328, |
| "grad_norm": 7.991878986358643, |
| "learning_rate": 9.534758620689655e-05, |
| "loss": 4.7397, |
| "step": 23500 |
| }, |
| { |
| "epoch": 15.355086372360844, |
| "grad_norm": 8.696029663085938, |
| "learning_rate": 9.517551724137932e-05, |
| "loss": 4.6656, |
| "step": 24000 |
| }, |
| { |
| "epoch": 15.674984005118363, |
| "grad_norm": 9.421612739562988, |
| "learning_rate": 9.500310344827586e-05, |
| "loss": 4.6412, |
| "step": 24500 |
| }, |
| { |
| "epoch": 15.99488163787588, |
| "grad_norm": 9.747482299804688, |
| "learning_rate": 9.483068965517242e-05, |
| "loss": 4.6048, |
| "step": 25000 |
| }, |
| { |
| "epoch": 16.314779270633398, |
| "grad_norm": 10.389492988586426, |
| "learning_rate": 9.465827586206897e-05, |
| "loss": 4.481, |
| "step": 25500 |
| }, |
| { |
| "epoch": 16.634676903390915, |
| "grad_norm": 8.661949157714844, |
| "learning_rate": 9.448586206896553e-05, |
| "loss": 4.4923, |
| "step": 26000 |
| }, |
| { |
| "epoch": 16.95457453614843, |
| "grad_norm": 12.681297302246094, |
| "learning_rate": 9.431344827586207e-05, |
| "loss": 4.4816, |
| "step": 26500 |
| }, |
| { |
| "epoch": 17.27447216890595, |
| "grad_norm": 8.993134498596191, |
| "learning_rate": 9.414103448275863e-05, |
| "loss": 4.3512, |
| "step": 27000 |
| }, |
| { |
| "epoch": 17.59436980166347, |
| "grad_norm": 10.020146369934082, |
| "learning_rate": 9.396862068965517e-05, |
| "loss": 4.3447, |
| "step": 27500 |
| }, |
| { |
| "epoch": 17.914267434420985, |
| "grad_norm": 9.514701843261719, |
| "learning_rate": 9.379655172413794e-05, |
| "loss": 4.3376, |
| "step": 28000 |
| }, |
| { |
| "epoch": 18.234165067178502, |
| "grad_norm": 10.324498176574707, |
| "learning_rate": 9.362413793103448e-05, |
| "loss": 4.2612, |
| "step": 28500 |
| }, |
| { |
| "epoch": 18.554062699936022, |
| "grad_norm": 10.682856559753418, |
| "learning_rate": 9.345172413793104e-05, |
| "loss": 4.226, |
| "step": 29000 |
| }, |
| { |
| "epoch": 18.87396033269354, |
| "grad_norm": 7.883260726928711, |
| "learning_rate": 9.327931034482758e-05, |
| "loss": 4.19, |
| "step": 29500 |
| }, |
| { |
| "epoch": 19.193857965451055, |
| "grad_norm": 12.470623016357422, |
| "learning_rate": 9.310724137931035e-05, |
| "loss": 4.1881, |
| "step": 30000 |
| }, |
| { |
| "epoch": 19.513755598208572, |
| "grad_norm": 9.932331085205078, |
| "learning_rate": 9.29348275862069e-05, |
| "loss": 4.0853, |
| "step": 30500 |
| }, |
| { |
| "epoch": 19.833653230966092, |
| "grad_norm": 8.153782844543457, |
| "learning_rate": 9.276241379310345e-05, |
| "loss": 4.1087, |
| "step": 31000 |
| }, |
| { |
| "epoch": 20.15355086372361, |
| "grad_norm": 8.214093208312988, |
| "learning_rate": 9.258999999999999e-05, |
| "loss": 4.0751, |
| "step": 31500 |
| }, |
| { |
| "epoch": 20.473448496481126, |
| "grad_norm": 11.927350044250488, |
| "learning_rate": 9.241758620689656e-05, |
| "loss": 3.9686, |
| "step": 32000 |
| }, |
| { |
| "epoch": 20.793346129238643, |
| "grad_norm": 9.67835807800293, |
| "learning_rate": 9.224551724137932e-05, |
| "loss": 3.9745, |
| "step": 32500 |
| }, |
| { |
| "epoch": 21.113243761996163, |
| "grad_norm": 9.911735534667969, |
| "learning_rate": 9.207310344827586e-05, |
| "loss": 3.9308, |
| "step": 33000 |
| }, |
| { |
| "epoch": 21.43314139475368, |
| "grad_norm": 9.05053424835205, |
| "learning_rate": 9.190068965517242e-05, |
| "loss": 3.8718, |
| "step": 33500 |
| }, |
| { |
| "epoch": 21.753039027511196, |
| "grad_norm": 9.588044166564941, |
| "learning_rate": 9.172827586206897e-05, |
| "loss": 3.8425, |
| "step": 34000 |
| }, |
| { |
| "epoch": 22.072936660268713, |
| "grad_norm": 8.788230895996094, |
| "learning_rate": 9.155620689655173e-05, |
| "loss": 3.8617, |
| "step": 34500 |
| }, |
| { |
| "epoch": 22.392834293026233, |
| "grad_norm": 9.435895919799805, |
| "learning_rate": 9.138379310344827e-05, |
| "loss": 3.7524, |
| "step": 35000 |
| }, |
| { |
| "epoch": 22.71273192578375, |
| "grad_norm": 9.870182037353516, |
| "learning_rate": 9.121137931034483e-05, |
| "loss": 3.7916, |
| "step": 35500 |
| }, |
| { |
| "epoch": 23.032629558541267, |
| "grad_norm": 9.612881660461426, |
| "learning_rate": 9.103896551724139e-05, |
| "loss": 3.8011, |
| "step": 36000 |
| }, |
| { |
| "epoch": 23.352527191298783, |
| "grad_norm": 9.643827438354492, |
| "learning_rate": 9.086689655172414e-05, |
| "loss": 3.6478, |
| "step": 36500 |
| }, |
| { |
| "epoch": 23.672424824056304, |
| "grad_norm": 14.105424880981445, |
| "learning_rate": 9.069448275862069e-05, |
| "loss": 3.6671, |
| "step": 37000 |
| }, |
| { |
| "epoch": 23.99232245681382, |
| "grad_norm": 10.427962303161621, |
| "learning_rate": 9.052206896551724e-05, |
| "loss": 3.6809, |
| "step": 37500 |
| }, |
| { |
| "epoch": 24.312220089571337, |
| "grad_norm": 11.505946159362793, |
| "learning_rate": 9.03496551724138e-05, |
| "loss": 3.553, |
| "step": 38000 |
| }, |
| { |
| "epoch": 24.632117722328854, |
| "grad_norm": 10.393635749816895, |
| "learning_rate": 9.017724137931035e-05, |
| "loss": 3.5408, |
| "step": 38500 |
| }, |
| { |
| "epoch": 24.952015355086374, |
| "grad_norm": 9.023842811584473, |
| "learning_rate": 9.00051724137931e-05, |
| "loss": 3.5915, |
| "step": 39000 |
| }, |
| { |
| "epoch": 25.27191298784389, |
| "grad_norm": 10.69048023223877, |
| "learning_rate": 8.983275862068967e-05, |
| "loss": 3.4896, |
| "step": 39500 |
| }, |
| { |
| "epoch": 25.591810620601407, |
| "grad_norm": 10.803936958312988, |
| "learning_rate": 8.966034482758621e-05, |
| "loss": 3.4854, |
| "step": 40000 |
| }, |
| { |
| "epoch": 25.911708253358924, |
| "grad_norm": 10.489801406860352, |
| "learning_rate": 8.948793103448276e-05, |
| "loss": 3.4871, |
| "step": 40500 |
| }, |
| { |
| "epoch": 26.231605886116444, |
| "grad_norm": 10.558309555053711, |
| "learning_rate": 8.931586206896552e-05, |
| "loss": 3.4186, |
| "step": 41000 |
| }, |
| { |
| "epoch": 26.55150351887396, |
| "grad_norm": 12.186748504638672, |
| "learning_rate": 8.914344827586208e-05, |
| "loss": 3.4027, |
| "step": 41500 |
| }, |
| { |
| "epoch": 26.871401151631478, |
| "grad_norm": 9.8623046875, |
| "learning_rate": 8.897103448275862e-05, |
| "loss": 3.4191, |
| "step": 42000 |
| }, |
| { |
| "epoch": 27.191298784388994, |
| "grad_norm": 11.407792091369629, |
| "learning_rate": 8.879862068965518e-05, |
| "loss": 3.341, |
| "step": 42500 |
| }, |
| { |
| "epoch": 27.511196417146515, |
| "grad_norm": 13.37617301940918, |
| "learning_rate": 8.862655172413794e-05, |
| "loss": 3.3137, |
| "step": 43000 |
| }, |
| { |
| "epoch": 27.83109404990403, |
| "grad_norm": 10.30826187133789, |
| "learning_rate": 8.845413793103449e-05, |
| "loss": 3.3036, |
| "step": 43500 |
| }, |
| { |
| "epoch": 28.150991682661548, |
| "grad_norm": 12.024778366088867, |
| "learning_rate": 8.828172413793105e-05, |
| "loss": 3.2678, |
| "step": 44000 |
| }, |
| { |
| "epoch": 28.470889315419065, |
| "grad_norm": 9.730340957641602, |
| "learning_rate": 8.810931034482759e-05, |
| "loss": 3.1949, |
| "step": 44500 |
| }, |
| { |
| "epoch": 28.790786948176585, |
| "grad_norm": 9.700602531433105, |
| "learning_rate": 8.793689655172414e-05, |
| "loss": 3.2541, |
| "step": 45000 |
| }, |
| { |
| "epoch": 29.1106845809341, |
| "grad_norm": 12.359143257141113, |
| "learning_rate": 8.77648275862069e-05, |
| "loss": 3.2456, |
| "step": 45500 |
| }, |
| { |
| "epoch": 29.43058221369162, |
| "grad_norm": 11.989018440246582, |
| "learning_rate": 8.759241379310346e-05, |
| "loss": 3.1154, |
| "step": 46000 |
| }, |
| { |
| "epoch": 29.750479846449135, |
| "grad_norm": 10.904190063476562, |
| "learning_rate": 8.742e-05, |
| "loss": 3.175, |
| "step": 46500 |
| }, |
| { |
| "epoch": 30.070377479206655, |
| "grad_norm": 11.253949165344238, |
| "learning_rate": 8.724758620689656e-05, |
| "loss": 3.1478, |
| "step": 47000 |
| }, |
| { |
| "epoch": 30.390275111964172, |
| "grad_norm": 12.229791641235352, |
| "learning_rate": 8.707517241379311e-05, |
| "loss": 3.0632, |
| "step": 47500 |
| }, |
| { |
| "epoch": 30.71017274472169, |
| "grad_norm": 9.516524314880371, |
| "learning_rate": 8.690275862068967e-05, |
| "loss": 3.0843, |
| "step": 48000 |
| }, |
| { |
| "epoch": 31.030070377479205, |
| "grad_norm": 13.730731010437012, |
| "learning_rate": 8.673034482758621e-05, |
| "loss": 3.098, |
| "step": 48500 |
| }, |
| { |
| "epoch": 31.349968010236726, |
| "grad_norm": 9.73539924621582, |
| "learning_rate": 8.655827586206897e-05, |
| "loss": 2.9611, |
| "step": 49000 |
| }, |
| { |
| "epoch": 31.669865642994242, |
| "grad_norm": 12.066815376281738, |
| "learning_rate": 8.638586206896552e-05, |
| "loss": 2.9943, |
| "step": 49500 |
| }, |
| { |
| "epoch": 31.98976327575176, |
| "grad_norm": 11.028585433959961, |
| "learning_rate": 8.621344827586208e-05, |
| "loss": 3.0424, |
| "step": 50000 |
| }, |
| { |
| "epoch": 32.30966090850928, |
| "grad_norm": 11.2380952835083, |
| "learning_rate": 8.604103448275862e-05, |
| "loss": 2.9023, |
| "step": 50500 |
| }, |
| { |
| "epoch": 32.629558541266796, |
| "grad_norm": 9.345772743225098, |
| "learning_rate": 8.586862068965518e-05, |
| "loss": 2.9586, |
| "step": 51000 |
| }, |
| { |
| "epoch": 32.94945617402431, |
| "grad_norm": 10.239849090576172, |
| "learning_rate": 8.569655172413793e-05, |
| "loss": 2.9461, |
| "step": 51500 |
| }, |
| { |
| "epoch": 33.26935380678183, |
| "grad_norm": 11.058523178100586, |
| "learning_rate": 8.552413793103449e-05, |
| "loss": 2.8453, |
| "step": 52000 |
| }, |
| { |
| "epoch": 33.589251439539346, |
| "grad_norm": 12.131317138671875, |
| "learning_rate": 8.535172413793105e-05, |
| "loss": 2.8603, |
| "step": 52500 |
| }, |
| { |
| "epoch": 33.90914907229686, |
| "grad_norm": 10.392476081848145, |
| "learning_rate": 8.517931034482759e-05, |
| "loss": 2.8817, |
| "step": 53000 |
| }, |
| { |
| "epoch": 34.22904670505438, |
| "grad_norm": 10.749021530151367, |
| "learning_rate": 8.500724137931036e-05, |
| "loss": 2.8073, |
| "step": 53500 |
| }, |
| { |
| "epoch": 34.5489443378119, |
| "grad_norm": 12.33171558380127, |
| "learning_rate": 8.48348275862069e-05, |
| "loss": 2.7793, |
| "step": 54000 |
| }, |
| { |
| "epoch": 34.86884197056942, |
| "grad_norm": 12.961758613586426, |
| "learning_rate": 8.466241379310346e-05, |
| "loss": 2.8066, |
| "step": 54500 |
| }, |
| { |
| "epoch": 35.18873960332694, |
| "grad_norm": 13.320075035095215, |
| "learning_rate": 8.449e-05, |
| "loss": 2.7459, |
| "step": 55000 |
| }, |
| { |
| "epoch": 35.50863723608445, |
| "grad_norm": 14.416489601135254, |
| "learning_rate": 8.431758620689655e-05, |
| "loss": 2.7321, |
| "step": 55500 |
| }, |
| { |
| "epoch": 35.82853486884197, |
| "grad_norm": 11.203073501586914, |
| "learning_rate": 8.414551724137931e-05, |
| "loss": 2.7486, |
| "step": 56000 |
| }, |
| { |
| "epoch": 36.14843250159949, |
| "grad_norm": 10.463476181030273, |
| "learning_rate": 8.397310344827587e-05, |
| "loss": 2.7086, |
| "step": 56500 |
| }, |
| { |
| "epoch": 36.468330134357004, |
| "grad_norm": 11.375761985778809, |
| "learning_rate": 8.380068965517241e-05, |
| "loss": 2.6387, |
| "step": 57000 |
| }, |
| { |
| "epoch": 36.78822776711452, |
| "grad_norm": 11.649105072021484, |
| "learning_rate": 8.362827586206897e-05, |
| "loss": 2.6746, |
| "step": 57500 |
| }, |
| { |
| "epoch": 37.108125399872044, |
| "grad_norm": 12.708244323730469, |
| "learning_rate": 8.345586206896552e-05, |
| "loss": 2.6454, |
| "step": 58000 |
| }, |
| { |
| "epoch": 37.42802303262956, |
| "grad_norm": 12.876201629638672, |
| "learning_rate": 8.328344827586208e-05, |
| "loss": 2.5798, |
| "step": 58500 |
| }, |
| { |
| "epoch": 37.74792066538708, |
| "grad_norm": 11.92346477508545, |
| "learning_rate": 8.311103448275862e-05, |
| "loss": 2.6589, |
| "step": 59000 |
| }, |
| { |
| "epoch": 38.067818298144594, |
| "grad_norm": 10.742238998413086, |
| "learning_rate": 8.293896551724138e-05, |
| "loss": 2.5868, |
| "step": 59500 |
| }, |
| { |
| "epoch": 38.38771593090211, |
| "grad_norm": 11.399048805236816, |
| "learning_rate": 8.276655172413793e-05, |
| "loss": 2.5124, |
| "step": 60000 |
| }, |
| { |
| "epoch": 38.70761356365963, |
| "grad_norm": 13.563875198364258, |
| "learning_rate": 8.259413793103449e-05, |
| "loss": 2.576, |
| "step": 60500 |
| }, |
| { |
| "epoch": 39.027511196417144, |
| "grad_norm": 11.297135353088379, |
| "learning_rate": 8.242172413793103e-05, |
| "loss": 2.5671, |
| "step": 61000 |
| }, |
| { |
| "epoch": 39.34740882917466, |
| "grad_norm": 11.336121559143066, |
| "learning_rate": 8.22496551724138e-05, |
| "loss": 2.4445, |
| "step": 61500 |
| }, |
| { |
| "epoch": 39.667306461932185, |
| "grad_norm": 9.477692604064941, |
| "learning_rate": 8.207724137931035e-05, |
| "loss": 2.4981, |
| "step": 62000 |
| }, |
| { |
| "epoch": 39.9872040946897, |
| "grad_norm": 11.597848892211914, |
| "learning_rate": 8.19048275862069e-05, |
| "loss": 2.5382, |
| "step": 62500 |
| }, |
| { |
| "epoch": 40.30710172744722, |
| "grad_norm": 14.910037994384766, |
| "learning_rate": 8.173241379310346e-05, |
| "loss": 2.4158, |
| "step": 63000 |
| }, |
| { |
| "epoch": 40.626999360204735, |
| "grad_norm": 11.870673179626465, |
| "learning_rate": 8.156e-05, |
| "loss": 2.4395, |
| "step": 63500 |
| }, |
| { |
| "epoch": 40.94689699296225, |
| "grad_norm": 15.279576301574707, |
| "learning_rate": 8.138758620689655e-05, |
| "loss": 2.4653, |
| "step": 64000 |
| }, |
| { |
| "epoch": 41.26679462571977, |
| "grad_norm": 11.710406303405762, |
| "learning_rate": 8.121551724137931e-05, |
| "loss": 2.3567, |
| "step": 64500 |
| }, |
| { |
| "epoch": 41.586692258477285, |
| "grad_norm": 10.663411140441895, |
| "learning_rate": 8.104310344827587e-05, |
| "loss": 2.3655, |
| "step": 65000 |
| }, |
| { |
| "epoch": 41.9065898912348, |
| "grad_norm": 13.946629524230957, |
| "learning_rate": 8.087068965517241e-05, |
| "loss": 2.4336, |
| "step": 65500 |
| }, |
| { |
| "epoch": 42.226487523992326, |
| "grad_norm": 13.782262802124023, |
| "learning_rate": 8.069827586206898e-05, |
| "loss": 2.3351, |
| "step": 66000 |
| }, |
| { |
| "epoch": 42.54638515674984, |
| "grad_norm": 11.177961349487305, |
| "learning_rate": 8.052586206896552e-05, |
| "loss": 2.359, |
| "step": 66500 |
| }, |
| { |
| "epoch": 42.86628278950736, |
| "grad_norm": 15.120301246643066, |
| "learning_rate": 8.035344827586208e-05, |
| "loss": 2.355, |
| "step": 67000 |
| }, |
| { |
| "epoch": 43.186180422264876, |
| "grad_norm": 10.805267333984375, |
| "learning_rate": 8.018103448275862e-05, |
| "loss": 2.2905, |
| "step": 67500 |
| }, |
| { |
| "epoch": 43.50607805502239, |
| "grad_norm": 11.777176856994629, |
| "learning_rate": 8.000862068965517e-05, |
| "loss": 2.2906, |
| "step": 68000 |
| }, |
| { |
| "epoch": 43.82597568777991, |
| "grad_norm": 15.457807540893555, |
| "learning_rate": 7.983689655172414e-05, |
| "loss": 2.3269, |
| "step": 68500 |
| }, |
| { |
| "epoch": 44.145873320537426, |
| "grad_norm": 11.639357566833496, |
| "learning_rate": 7.966448275862069e-05, |
| "loss": 2.2371, |
| "step": 69000 |
| }, |
| { |
| "epoch": 44.46577095329494, |
| "grad_norm": 11.710591316223145, |
| "learning_rate": 7.949206896551725e-05, |
| "loss": 2.2248, |
| "step": 69500 |
| }, |
| { |
| "epoch": 44.785668586052466, |
| "grad_norm": 12.675103187561035, |
| "learning_rate": 7.93196551724138e-05, |
| "loss": 2.2586, |
| "step": 70000 |
| }, |
| { |
| "epoch": 45.10556621880998, |
| "grad_norm": 12.752120971679688, |
| "learning_rate": 7.914724137931034e-05, |
| "loss": 2.2489, |
| "step": 70500 |
| }, |
| { |
| "epoch": 45.4254638515675, |
| "grad_norm": 11.379339218139648, |
| "learning_rate": 7.89751724137931e-05, |
| "loss": 2.1774, |
| "step": 71000 |
| }, |
| { |
| "epoch": 45.74536148432502, |
| "grad_norm": 12.76633358001709, |
| "learning_rate": 7.880275862068966e-05, |
| "loss": 2.2007, |
| "step": 71500 |
| }, |
| { |
| "epoch": 46.06525911708253, |
| "grad_norm": 11.421367645263672, |
| "learning_rate": 7.863034482758621e-05, |
| "loss": 2.2262, |
| "step": 72000 |
| }, |
| { |
| "epoch": 46.38515674984005, |
| "grad_norm": 14.81748104095459, |
| "learning_rate": 7.845793103448277e-05, |
| "loss": 2.101, |
| "step": 72500 |
| }, |
| { |
| "epoch": 46.70505438259757, |
| "grad_norm": 12.902971267700195, |
| "learning_rate": 7.828551724137931e-05, |
| "loss": 2.1568, |
| "step": 73000 |
| }, |
| { |
| "epoch": 47.02495201535508, |
| "grad_norm": 10.685113906860352, |
| "learning_rate": 7.811310344827587e-05, |
| "loss": 2.1655, |
| "step": 73500 |
| }, |
| { |
| "epoch": 47.34484964811261, |
| "grad_norm": 15.892518043518066, |
| "learning_rate": 7.794068965517242e-05, |
| "loss": 2.0551, |
| "step": 74000 |
| }, |
| { |
| "epoch": 47.664747280870124, |
| "grad_norm": 13.730358123779297, |
| "learning_rate": 7.776862068965518e-05, |
| "loss": 2.1053, |
| "step": 74500 |
| }, |
| { |
| "epoch": 47.98464491362764, |
| "grad_norm": 13.635787963867188, |
| "learning_rate": 7.759620689655172e-05, |
| "loss": 2.1408, |
| "step": 75000 |
| }, |
| { |
| "epoch": 48.30454254638516, |
| "grad_norm": 12.861611366271973, |
| "learning_rate": 7.742379310344828e-05, |
| "loss": 2.0104, |
| "step": 75500 |
| }, |
| { |
| "epoch": 48.624440179142674, |
| "grad_norm": 11.84931468963623, |
| "learning_rate": 7.725137931034483e-05, |
| "loss": 2.0555, |
| "step": 76000 |
| }, |
| { |
| "epoch": 48.94433781190019, |
| "grad_norm": 15.812765121459961, |
| "learning_rate": 7.70793103448276e-05, |
| "loss": 2.1087, |
| "step": 76500 |
| }, |
| { |
| "epoch": 49.26423544465771, |
| "grad_norm": 14.233431816101074, |
| "learning_rate": 7.690689655172414e-05, |
| "loss": 1.9998, |
| "step": 77000 |
| }, |
| { |
| "epoch": 49.584133077415224, |
| "grad_norm": 14.329803466796875, |
| "learning_rate": 7.673448275862069e-05, |
| "loss": 2.0189, |
| "step": 77500 |
| }, |
| { |
| "epoch": 49.90403071017275, |
| "grad_norm": 11.00400161743164, |
| "learning_rate": 7.656206896551725e-05, |
| "loss": 2.059, |
| "step": 78000 |
| }, |
| { |
| "epoch": 50.223928342930265, |
| "grad_norm": 13.582133293151855, |
| "learning_rate": 7.63896551724138e-05, |
| "loss": 1.9753, |
| "step": 78500 |
| }, |
| { |
| "epoch": 50.54382597568778, |
| "grad_norm": 12.560907363891602, |
| "learning_rate": 7.621724137931034e-05, |
| "loss": 1.9759, |
| "step": 79000 |
| }, |
| { |
| "epoch": 50.8637236084453, |
| "grad_norm": 12.169915199279785, |
| "learning_rate": 7.60451724137931e-05, |
| "loss": 1.9944, |
| "step": 79500 |
| }, |
| { |
| "epoch": 51.183621241202815, |
| "grad_norm": 13.5604248046875, |
| "learning_rate": 7.587275862068966e-05, |
| "loss": 1.9323, |
| "step": 80000 |
| }, |
| { |
| "epoch": 51.50351887396033, |
| "grad_norm": 15.892741203308105, |
| "learning_rate": 7.570034482758621e-05, |
| "loss": 1.9095, |
| "step": 80500 |
| }, |
| { |
| "epoch": 51.82341650671785, |
| "grad_norm": 13.435209274291992, |
| "learning_rate": 7.552793103448276e-05, |
| "loss": 1.9508, |
| "step": 81000 |
| }, |
| { |
| "epoch": 52.143314139475365, |
| "grad_norm": 11.180010795593262, |
| "learning_rate": 7.535586206896551e-05, |
| "loss": 1.9216, |
| "step": 81500 |
| }, |
| { |
| "epoch": 52.46321177223289, |
| "grad_norm": 12.792661666870117, |
| "learning_rate": 7.518344827586207e-05, |
| "loss": 1.8817, |
| "step": 82000 |
| }, |
| { |
| "epoch": 52.783109404990405, |
| "grad_norm": 11.785886764526367, |
| "learning_rate": 7.501103448275863e-05, |
| "loss": 1.9121, |
| "step": 82500 |
| }, |
| { |
| "epoch": 53.10300703774792, |
| "grad_norm": 10.568120002746582, |
| "learning_rate": 7.483862068965518e-05, |
| "loss": 1.8885, |
| "step": 83000 |
| }, |
| { |
| "epoch": 53.42290467050544, |
| "grad_norm": 14.641459465026855, |
| "learning_rate": 7.466620689655172e-05, |
| "loss": 1.8357, |
| "step": 83500 |
| }, |
| { |
| "epoch": 53.742802303262955, |
| "grad_norm": 13.5363187789917, |
| "learning_rate": 7.449379310344828e-05, |
| "loss": 1.8515, |
| "step": 84000 |
| }, |
| { |
| "epoch": 54.06269993602047, |
| "grad_norm": 12.997908592224121, |
| "learning_rate": 7.432172413793104e-05, |
| "loss": 1.8681, |
| "step": 84500 |
| }, |
| { |
| "epoch": 54.38259756877799, |
| "grad_norm": 12.53503131866455, |
| "learning_rate": 7.414931034482759e-05, |
| "loss": 1.7785, |
| "step": 85000 |
| }, |
| { |
| "epoch": 54.702495201535505, |
| "grad_norm": 11.986194610595703, |
| "learning_rate": 7.397689655172413e-05, |
| "loss": 1.8507, |
| "step": 85500 |
| }, |
| { |
| "epoch": 55.02239283429303, |
| "grad_norm": 12.089723587036133, |
| "learning_rate": 7.380448275862069e-05, |
| "loss": 1.8537, |
| "step": 86000 |
| }, |
| { |
| "epoch": 55.342290467050546, |
| "grad_norm": 13.552453994750977, |
| "learning_rate": 7.363206896551725e-05, |
| "loss": 1.7622, |
| "step": 86500 |
| }, |
| { |
| "epoch": 55.66218809980806, |
| "grad_norm": 12.03878116607666, |
| "learning_rate": 7.346e-05, |
| "loss": 1.8076, |
| "step": 87000 |
| }, |
| { |
| "epoch": 55.98208573256558, |
| "grad_norm": 11.187782287597656, |
| "learning_rate": 7.328758620689655e-05, |
| "loss": 1.8241, |
| "step": 87500 |
| }, |
| { |
| "epoch": 56.301983365323096, |
| "grad_norm": 14.924737930297852, |
| "learning_rate": 7.311517241379312e-05, |
| "loss": 1.7077, |
| "step": 88000 |
| }, |
| { |
| "epoch": 56.62188099808061, |
| "grad_norm": 12.302467346191406, |
| "learning_rate": 7.294275862068966e-05, |
| "loss": 1.74, |
| "step": 88500 |
| }, |
| { |
| "epoch": 56.94177863083813, |
| "grad_norm": 10.834394454956055, |
| "learning_rate": 7.277034482758621e-05, |
| "loss": 1.7827, |
| "step": 89000 |
| }, |
| { |
| "epoch": 57.261676263595646, |
| "grad_norm": 14.356012344360352, |
| "learning_rate": 7.259793103448276e-05, |
| "loss": 1.7082, |
| "step": 89500 |
| }, |
| { |
| "epoch": 57.58157389635317, |
| "grad_norm": 14.632678031921387, |
| "learning_rate": 7.242551724137931e-05, |
| "loss": 1.7045, |
| "step": 90000 |
| }, |
| { |
| "epoch": 57.90147152911069, |
| "grad_norm": 13.501043319702148, |
| "learning_rate": 7.225344827586207e-05, |
| "loss": 1.7522, |
| "step": 90500 |
| }, |
| { |
| "epoch": 58.2213691618682, |
| "grad_norm": 18.839614868164062, |
| "learning_rate": 7.208103448275862e-05, |
| "loss": 1.6801, |
| "step": 91000 |
| }, |
| { |
| "epoch": 58.54126679462572, |
| "grad_norm": 13.41618824005127, |
| "learning_rate": 7.190862068965517e-05, |
| "loss": 1.7005, |
| "step": 91500 |
| }, |
| { |
| "epoch": 58.86116442738324, |
| "grad_norm": 12.56169605255127, |
| "learning_rate": 7.173620689655172e-05, |
| "loss": 1.7018, |
| "step": 92000 |
| }, |
| { |
| "epoch": 59.18106206014075, |
| "grad_norm": 13.447467803955078, |
| "learning_rate": 7.15641379310345e-05, |
| "loss": 1.6691, |
| "step": 92500 |
| }, |
| { |
| "epoch": 59.50095969289827, |
| "grad_norm": 12.452493667602539, |
| "learning_rate": 7.139172413793104e-05, |
| "loss": 1.651, |
| "step": 93000 |
| }, |
| { |
| "epoch": 59.82085732565579, |
| "grad_norm": 10.552214622497559, |
| "learning_rate": 7.121931034482759e-05, |
| "loss": 1.6916, |
| "step": 93500 |
| }, |
| { |
| "epoch": 60.14075495841331, |
| "grad_norm": 11.099422454833984, |
| "learning_rate": 7.104689655172413e-05, |
| "loss": 1.6716, |
| "step": 94000 |
| }, |
| { |
| "epoch": 60.46065259117083, |
| "grad_norm": 13.663276672363281, |
| "learning_rate": 7.08744827586207e-05, |
| "loss": 1.6105, |
| "step": 94500 |
| }, |
| { |
| "epoch": 60.780550223928344, |
| "grad_norm": 11.783934593200684, |
| "learning_rate": 7.070206896551725e-05, |
| "loss": 1.6399, |
| "step": 95000 |
| }, |
| { |
| "epoch": 61.10044785668586, |
| "grad_norm": 12.881340026855469, |
| "learning_rate": 7.053e-05, |
| "loss": 1.6058, |
| "step": 95500 |
| }, |
| { |
| "epoch": 61.42034548944338, |
| "grad_norm": 12.405476570129395, |
| "learning_rate": 7.035758620689656e-05, |
| "loss": 1.5703, |
| "step": 96000 |
| }, |
| { |
| "epoch": 61.740243122200894, |
| "grad_norm": 11.660452842712402, |
| "learning_rate": 7.018517241379311e-05, |
| "loss": 1.6002, |
| "step": 96500 |
| }, |
| { |
| "epoch": 62.06014075495841, |
| "grad_norm": 11.69723892211914, |
| "learning_rate": 7.001275862068966e-05, |
| "loss": 1.6186, |
| "step": 97000 |
| }, |
| { |
| "epoch": 62.38003838771593, |
| "grad_norm": 16.210947036743164, |
| "learning_rate": 6.984034482758621e-05, |
| "loss": 1.5663, |
| "step": 97500 |
| }, |
| { |
| "epoch": 62.69993602047345, |
| "grad_norm": 11.853803634643555, |
| "learning_rate": 6.966827586206897e-05, |
| "loss": 1.5744, |
| "step": 98000 |
| }, |
| { |
| "epoch": 63.01983365323097, |
| "grad_norm": 10.565818786621094, |
| "learning_rate": 6.949586206896553e-05, |
| "loss": 1.5829, |
| "step": 98500 |
| }, |
| { |
| "epoch": 63.339731285988485, |
| "grad_norm": 11.621013641357422, |
| "learning_rate": 6.932344827586207e-05, |
| "loss": 1.5213, |
| "step": 99000 |
| }, |
| { |
| "epoch": 63.659628918746, |
| "grad_norm": 10.182308197021484, |
| "learning_rate": 6.915103448275862e-05, |
| "loss": 1.5291, |
| "step": 99500 |
| }, |
| { |
| "epoch": 63.97952655150352, |
| "grad_norm": 14.434243202209473, |
| "learning_rate": 6.897862068965517e-05, |
| "loss": 1.5612, |
| "step": 100000 |
| }, |
| { |
| "epoch": 64.29942418426104, |
| "grad_norm": 12.513864517211914, |
| "learning_rate": 6.880655172413794e-05, |
| "loss": 1.5041, |
| "step": 100500 |
| }, |
| { |
| "epoch": 64.61932181701856, |
| "grad_norm": 13.189037322998047, |
| "learning_rate": 6.863413793103448e-05, |
| "loss": 1.5097, |
| "step": 101000 |
| }, |
| { |
| "epoch": 64.93921944977608, |
| "grad_norm": 11.867232322692871, |
| "learning_rate": 6.846172413793104e-05, |
| "loss": 1.5401, |
| "step": 101500 |
| }, |
| { |
| "epoch": 65.25911708253359, |
| "grad_norm": 13.00894832611084, |
| "learning_rate": 6.828931034482758e-05, |
| "loss": 1.4581, |
| "step": 102000 |
| }, |
| { |
| "epoch": 65.57901471529111, |
| "grad_norm": 11.719345092773438, |
| "learning_rate": 6.811689655172415e-05, |
| "loss": 1.4807, |
| "step": 102500 |
| }, |
| { |
| "epoch": 65.89891234804863, |
| "grad_norm": 11.355063438415527, |
| "learning_rate": 6.79444827586207e-05, |
| "loss": 1.4892, |
| "step": 103000 |
| }, |
| { |
| "epoch": 66.21880998080614, |
| "grad_norm": 13.351948738098145, |
| "learning_rate": 6.777241379310345e-05, |
| "loss": 1.4636, |
| "step": 103500 |
| }, |
| { |
| "epoch": 66.53870761356366, |
| "grad_norm": 15.406342506408691, |
| "learning_rate": 6.76e-05, |
| "loss": 1.4678, |
| "step": 104000 |
| }, |
| { |
| "epoch": 66.85860524632118, |
| "grad_norm": 14.357329368591309, |
| "learning_rate": 6.742758620689656e-05, |
| "loss": 1.4936, |
| "step": 104500 |
| }, |
| { |
| "epoch": 67.17850287907869, |
| "grad_norm": 12.275686264038086, |
| "learning_rate": 6.725517241379311e-05, |
| "loss": 1.4566, |
| "step": 105000 |
| }, |
| { |
| "epoch": 67.49840051183621, |
| "grad_norm": 13.198380470275879, |
| "learning_rate": 6.708310344827586e-05, |
| "loss": 1.4326, |
| "step": 105500 |
| }, |
| { |
| "epoch": 67.81829814459373, |
| "grad_norm": 13.365631103515625, |
| "learning_rate": 6.691068965517242e-05, |
| "loss": 1.426, |
| "step": 106000 |
| }, |
| { |
| "epoch": 68.13819577735124, |
| "grad_norm": 14.106985092163086, |
| "learning_rate": 6.673827586206897e-05, |
| "loss": 1.4289, |
| "step": 106500 |
| }, |
| { |
| "epoch": 68.45809341010876, |
| "grad_norm": 10.076281547546387, |
| "learning_rate": 6.656586206896553e-05, |
| "loss": 1.396, |
| "step": 107000 |
| }, |
| { |
| "epoch": 68.77799104286628, |
| "grad_norm": 14.63807201385498, |
| "learning_rate": 6.639344827586207e-05, |
| "loss": 1.3979, |
| "step": 107500 |
| }, |
| { |
| "epoch": 69.0978886756238, |
| "grad_norm": 13.643959045410156, |
| "learning_rate": 6.622137931034483e-05, |
| "loss": 1.4274, |
| "step": 108000 |
| }, |
| { |
| "epoch": 69.41778630838132, |
| "grad_norm": 11.819470405578613, |
| "learning_rate": 6.604896551724138e-05, |
| "loss": 1.3769, |
| "step": 108500 |
| }, |
| { |
| "epoch": 69.73768394113884, |
| "grad_norm": 14.33261775970459, |
| "learning_rate": 6.587655172413794e-05, |
| "loss": 1.3825, |
| "step": 109000 |
| }, |
| { |
| "epoch": 70.05758157389636, |
| "grad_norm": 10.918536186218262, |
| "learning_rate": 6.570413793103448e-05, |
| "loss": 1.3913, |
| "step": 109500 |
| }, |
| { |
| "epoch": 70.37747920665387, |
| "grad_norm": 13.519926071166992, |
| "learning_rate": 6.553172413793104e-05, |
| "loss": 1.3341, |
| "step": 110000 |
| }, |
| { |
| "epoch": 70.69737683941139, |
| "grad_norm": 12.5425386428833, |
| "learning_rate": 6.535931034482759e-05, |
| "loss": 1.3828, |
| "step": 110500 |
| }, |
| { |
| "epoch": 71.0172744721689, |
| "grad_norm": 11.435805320739746, |
| "learning_rate": 6.518724137931035e-05, |
| "loss": 1.3821, |
| "step": 111000 |
| }, |
| { |
| "epoch": 71.33717210492642, |
| "grad_norm": 12.65505313873291, |
| "learning_rate": 6.501482758620689e-05, |
| "loss": 1.3083, |
| "step": 111500 |
| }, |
| { |
| "epoch": 71.65706973768394, |
| "grad_norm": 15.489115715026855, |
| "learning_rate": 6.484241379310345e-05, |
| "loss": 1.341, |
| "step": 112000 |
| }, |
| { |
| "epoch": 71.97696737044146, |
| "grad_norm": 14.14395809173584, |
| "learning_rate": 6.467e-05, |
| "loss": 1.3579, |
| "step": 112500 |
| }, |
| { |
| "epoch": 72.29686500319897, |
| "grad_norm": 13.708014488220215, |
| "learning_rate": 6.449758620689656e-05, |
| "loss": 1.3032, |
| "step": 113000 |
| }, |
| { |
| "epoch": 72.61676263595649, |
| "grad_norm": 10.75635814666748, |
| "learning_rate": 6.432551724137932e-05, |
| "loss": 1.3045, |
| "step": 113500 |
| }, |
| { |
| "epoch": 72.93666026871401, |
| "grad_norm": 12.12192440032959, |
| "learning_rate": 6.415310344827586e-05, |
| "loss": 1.3248, |
| "step": 114000 |
| }, |
| { |
| "epoch": 73.25655790147152, |
| "grad_norm": 13.368456840515137, |
| "learning_rate": 6.398068965517241e-05, |
| "loss": 1.287, |
| "step": 114500 |
| }, |
| { |
| "epoch": 73.57645553422904, |
| "grad_norm": 12.584633827209473, |
| "learning_rate": 6.380827586206897e-05, |
| "loss": 1.3015, |
| "step": 115000 |
| }, |
| { |
| "epoch": 73.89635316698656, |
| "grad_norm": 13.863194465637207, |
| "learning_rate": 6.363620689655173e-05, |
| "loss": 1.2898, |
| "step": 115500 |
| }, |
| { |
| "epoch": 74.21625079974409, |
| "grad_norm": 12.937112808227539, |
| "learning_rate": 6.346379310344827e-05, |
| "loss": 1.2456, |
| "step": 116000 |
| }, |
| { |
| "epoch": 74.5361484325016, |
| "grad_norm": 11.274981498718262, |
| "learning_rate": 6.329137931034484e-05, |
| "loss": 1.2689, |
| "step": 116500 |
| }, |
| { |
| "epoch": 74.85604606525912, |
| "grad_norm": 14.425061225891113, |
| "learning_rate": 6.311896551724138e-05, |
| "loss": 1.2752, |
| "step": 117000 |
| }, |
| { |
| "epoch": 75.17594369801664, |
| "grad_norm": 11.654635429382324, |
| "learning_rate": 6.294655172413794e-05, |
| "loss": 1.2442, |
| "step": 117500 |
| }, |
| { |
| "epoch": 75.49584133077416, |
| "grad_norm": 10.00129222869873, |
| "learning_rate": 6.277413793103448e-05, |
| "loss": 1.2506, |
| "step": 118000 |
| }, |
| { |
| "epoch": 75.81573896353167, |
| "grad_norm": 11.665295600891113, |
| "learning_rate": 6.260206896551725e-05, |
| "loss": 1.2541, |
| "step": 118500 |
| }, |
| { |
| "epoch": 76.13563659628919, |
| "grad_norm": 10.555766105651855, |
| "learning_rate": 6.24296551724138e-05, |
| "loss": 1.2486, |
| "step": 119000 |
| }, |
| { |
| "epoch": 76.4555342290467, |
| "grad_norm": 14.879280090332031, |
| "learning_rate": 6.225724137931035e-05, |
| "loss": 1.2124, |
| "step": 119500 |
| }, |
| { |
| "epoch": 76.77543186180422, |
| "grad_norm": 15.131136894226074, |
| "learning_rate": 6.208482758620689e-05, |
| "loss": 1.2549, |
| "step": 120000 |
| }, |
| { |
| "epoch": 77.09532949456174, |
| "grad_norm": 9.889472961425781, |
| "learning_rate": 6.191241379310345e-05, |
| "loss": 1.2376, |
| "step": 120500 |
| }, |
| { |
| "epoch": 77.41522712731926, |
| "grad_norm": 11.307145118713379, |
| "learning_rate": 6.174e-05, |
| "loss": 1.1958, |
| "step": 121000 |
| }, |
| { |
| "epoch": 77.73512476007677, |
| "grad_norm": 14.303799629211426, |
| "learning_rate": 6.156758620689656e-05, |
| "loss": 1.2009, |
| "step": 121500 |
| }, |
| { |
| "epoch": 78.05502239283429, |
| "grad_norm": 11.318217277526855, |
| "learning_rate": 6.139517241379311e-05, |
| "loss": 1.2215, |
| "step": 122000 |
| }, |
| { |
| "epoch": 78.3749200255918, |
| "grad_norm": 13.979291915893555, |
| "learning_rate": 6.122310344827586e-05, |
| "loss": 1.1713, |
| "step": 122500 |
| }, |
| { |
| "epoch": 78.69481765834932, |
| "grad_norm": 12.78084945678711, |
| "learning_rate": 6.105103448275863e-05, |
| "loss": 1.1901, |
| "step": 123000 |
| }, |
| { |
| "epoch": 79.01471529110684, |
| "grad_norm": 10.332459449768066, |
| "learning_rate": 6.087862068965517e-05, |
| "loss": 1.2141, |
| "step": 123500 |
| }, |
| { |
| "epoch": 79.33461292386437, |
| "grad_norm": 11.179670333862305, |
| "learning_rate": 6.0706206896551735e-05, |
| "loss": 1.1641, |
| "step": 124000 |
| }, |
| { |
| "epoch": 79.65451055662189, |
| "grad_norm": 12.706995964050293, |
| "learning_rate": 6.053379310344828e-05, |
| "loss": 1.1887, |
| "step": 124500 |
| }, |
| { |
| "epoch": 79.9744081893794, |
| "grad_norm": 12.575511932373047, |
| "learning_rate": 6.036137931034483e-05, |
| "loss": 1.1994, |
| "step": 125000 |
| }, |
| { |
| "epoch": 80.29430582213692, |
| "grad_norm": 11.299592971801758, |
| "learning_rate": 6.0189310344827584e-05, |
| "loss": 1.1535, |
| "step": 125500 |
| }, |
| { |
| "epoch": 80.61420345489444, |
| "grad_norm": 9.741961479187012, |
| "learning_rate": 6.0016896551724147e-05, |
| "loss": 1.1508, |
| "step": 126000 |
| }, |
| { |
| "epoch": 80.93410108765195, |
| "grad_norm": 13.86517333984375, |
| "learning_rate": 5.984448275862069e-05, |
| "loss": 1.1453, |
| "step": 126500 |
| }, |
| { |
| "epoch": 81.25399872040947, |
| "grad_norm": 12.810471534729004, |
| "learning_rate": 5.9672068965517244e-05, |
| "loss": 1.131, |
| "step": 127000 |
| }, |
| { |
| "epoch": 81.57389635316699, |
| "grad_norm": 11.828211784362793, |
| "learning_rate": 5.949965517241379e-05, |
| "loss": 1.1462, |
| "step": 127500 |
| }, |
| { |
| "epoch": 81.8937939859245, |
| "grad_norm": 13.588178634643555, |
| "learning_rate": 5.932724137931035e-05, |
| "loss": 1.1519, |
| "step": 128000 |
| }, |
| { |
| "epoch": 82.21369161868202, |
| "grad_norm": 13.903426170349121, |
| "learning_rate": 5.91548275862069e-05, |
| "loss": 1.1222, |
| "step": 128500 |
| }, |
| { |
| "epoch": 82.53358925143954, |
| "grad_norm": 13.447443962097168, |
| "learning_rate": 5.898275862068966e-05, |
| "loss": 1.1173, |
| "step": 129000 |
| }, |
| { |
| "epoch": 82.85348688419705, |
| "grad_norm": 12.132195472717285, |
| "learning_rate": 5.8810344827586205e-05, |
| "loss": 1.1262, |
| "step": 129500 |
| }, |
| { |
| "epoch": 83.17338451695457, |
| "grad_norm": 11.170686721801758, |
| "learning_rate": 5.863793103448276e-05, |
| "loss": 1.0957, |
| "step": 130000 |
| }, |
| { |
| "epoch": 83.49328214971209, |
| "grad_norm": 12.57539176940918, |
| "learning_rate": 5.846551724137931e-05, |
| "loss": 1.0862, |
| "step": 130500 |
| }, |
| { |
| "epoch": 83.8131797824696, |
| "grad_norm": 14.212547302246094, |
| "learning_rate": 5.8293448275862074e-05, |
| "loss": 1.0929, |
| "step": 131000 |
| }, |
| { |
| "epoch": 84.13307741522712, |
| "grad_norm": 14.803600311279297, |
| "learning_rate": 5.8121034482758616e-05, |
| "loss": 1.0948, |
| "step": 131500 |
| }, |
| { |
| "epoch": 84.45297504798465, |
| "grad_norm": 19.55899429321289, |
| "learning_rate": 5.794862068965518e-05, |
| "loss": 1.0788, |
| "step": 132000 |
| }, |
| { |
| "epoch": 84.77287268074217, |
| "grad_norm": 11.086203575134277, |
| "learning_rate": 5.7776206896551734e-05, |
| "loss": 1.098, |
| "step": 132500 |
| }, |
| { |
| "epoch": 85.09277031349968, |
| "grad_norm": 10.74999713897705, |
| "learning_rate": 5.7603793103448276e-05, |
| "loss": 1.0649, |
| "step": 133000 |
| }, |
| { |
| "epoch": 85.4126679462572, |
| "grad_norm": 14.409449577331543, |
| "learning_rate": 5.743137931034484e-05, |
| "loss": 1.0592, |
| "step": 133500 |
| }, |
| { |
| "epoch": 85.73256557901472, |
| "grad_norm": 10.215742111206055, |
| "learning_rate": 5.725931034482759e-05, |
| "loss": 1.0765, |
| "step": 134000 |
| }, |
| { |
| "epoch": 86.05246321177223, |
| "grad_norm": 12.911944389343262, |
| "learning_rate": 5.7086896551724146e-05, |
| "loss": 1.0504, |
| "step": 134500 |
| }, |
| { |
| "epoch": 86.37236084452975, |
| "grad_norm": 14.987035751342773, |
| "learning_rate": 5.691448275862069e-05, |
| "loss": 1.0141, |
| "step": 135000 |
| }, |
| { |
| "epoch": 86.69225847728727, |
| "grad_norm": 11.989995002746582, |
| "learning_rate": 5.674206896551725e-05, |
| "loss": 1.0431, |
| "step": 135500 |
| }, |
| { |
| "epoch": 87.01215611004478, |
| "grad_norm": 12.771849632263184, |
| "learning_rate": 5.657e-05, |
| "loss": 1.054, |
| "step": 136000 |
| }, |
| { |
| "epoch": 87.3320537428023, |
| "grad_norm": 13.398333549499512, |
| "learning_rate": 5.639758620689656e-05, |
| "loss": 0.9984, |
| "step": 136500 |
| }, |
| { |
| "epoch": 87.65195137555982, |
| "grad_norm": 10.814030647277832, |
| "learning_rate": 5.6225172413793106e-05, |
| "loss": 1.0283, |
| "step": 137000 |
| }, |
| { |
| "epoch": 87.97184900831734, |
| "grad_norm": 12.13095760345459, |
| "learning_rate": 5.605275862068966e-05, |
| "loss": 1.0414, |
| "step": 137500 |
| }, |
| { |
| "epoch": 88.29174664107485, |
| "grad_norm": 12.733049392700195, |
| "learning_rate": 5.5880344827586204e-05, |
| "loss": 1.0087, |
| "step": 138000 |
| }, |
| { |
| "epoch": 88.61164427383237, |
| "grad_norm": 16.555213928222656, |
| "learning_rate": 5.570827586206897e-05, |
| "loss": 1.0062, |
| "step": 138500 |
| }, |
| { |
| "epoch": 88.93154190658989, |
| "grad_norm": 11.025595664978027, |
| "learning_rate": 5.553586206896552e-05, |
| "loss": 1.0212, |
| "step": 139000 |
| }, |
| { |
| "epoch": 89.2514395393474, |
| "grad_norm": 9.93308162689209, |
| "learning_rate": 5.5363448275862074e-05, |
| "loss": 0.998, |
| "step": 139500 |
| }, |
| { |
| "epoch": 89.57133717210493, |
| "grad_norm": 14.131500244140625, |
| "learning_rate": 5.519103448275862e-05, |
| "loss": 0.9797, |
| "step": 140000 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 300000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 192, |
| "save_steps": 20000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.5720679109463245e+17, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|