| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 51.183621241202815, |
| "eval_steps": 500, |
| "global_step": 80000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.3198976327575176, |
| "grad_norm": 4.1601386070251465, |
| "learning_rate": 5e-06, |
| "loss": 10.3279, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.6397952655150352, |
| "grad_norm": 4.366061687469482, |
| "learning_rate": 1e-05, |
| "loss": 9.3834, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.9596928982725528, |
| "grad_norm": 4.784337043762207, |
| "learning_rate": 1.5e-05, |
| "loss": 8.8888, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.2795905310300704, |
| "grad_norm": 3.9968652725219727, |
| "learning_rate": 2e-05, |
| "loss": 8.6568, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.599488163787588, |
| "grad_norm": 4.402552127838135, |
| "learning_rate": 2.5e-05, |
| "loss": 8.5473, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.9193857965451055, |
| "grad_norm": 4.639041423797607, |
| "learning_rate": 3e-05, |
| "loss": 8.4044, |
| "step": 3000 |
| }, |
| { |
| "epoch": 2.239283429302623, |
| "grad_norm": 5.651747226715088, |
| "learning_rate": 3.5e-05, |
| "loss": 8.2868, |
| "step": 3500 |
| }, |
| { |
| "epoch": 2.5591810620601407, |
| "grad_norm": 4.6999359130859375, |
| "learning_rate": 4e-05, |
| "loss": 8.1766, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.8790786948176583, |
| "grad_norm": 4.838181495666504, |
| "learning_rate": 4.499e-05, |
| "loss": 8.1118, |
| "step": 4500 |
| }, |
| { |
| "epoch": 3.198976327575176, |
| "grad_norm": 4.238831996917725, |
| "learning_rate": 4.999e-05, |
| "loss": 8.0038, |
| "step": 5000 |
| }, |
| { |
| "epoch": 3.5188739603326935, |
| "grad_norm": 4.455530643463135, |
| "learning_rate": 5.499000000000001e-05, |
| "loss": 7.9014, |
| "step": 5500 |
| }, |
| { |
| "epoch": 3.838771593090211, |
| "grad_norm": 5.811736583709717, |
| "learning_rate": 5.999e-05, |
| "loss": 7.8352, |
| "step": 6000 |
| }, |
| { |
| "epoch": 4.158669225847729, |
| "grad_norm": 4.998301982879639, |
| "learning_rate": 6.498e-05, |
| "loss": 7.7613, |
| "step": 6500 |
| }, |
| { |
| "epoch": 4.478566858605246, |
| "grad_norm": 5.011510848999023, |
| "learning_rate": 6.998e-05, |
| "loss": 7.6554, |
| "step": 7000 |
| }, |
| { |
| "epoch": 4.798464491362764, |
| "grad_norm": 4.750300884246826, |
| "learning_rate": 7.498e-05, |
| "loss": 7.6109, |
| "step": 7500 |
| }, |
| { |
| "epoch": 5.1183621241202815, |
| "grad_norm": 6.24017858505249, |
| "learning_rate": 7.998e-05, |
| "loss": 7.5186, |
| "step": 8000 |
| }, |
| { |
| "epoch": 5.438259756877799, |
| "grad_norm": 6.061458587646484, |
| "learning_rate": 8.497000000000001e-05, |
| "loss": 7.3966, |
| "step": 8500 |
| }, |
| { |
| "epoch": 5.758157389635317, |
| "grad_norm": 7.151447772979736, |
| "learning_rate": 8.997000000000001e-05, |
| "loss": 7.2877, |
| "step": 9000 |
| }, |
| { |
| "epoch": 6.078055022392834, |
| "grad_norm": 7.578985214233398, |
| "learning_rate": 9.497000000000001e-05, |
| "loss": 7.1542, |
| "step": 9500 |
| }, |
| { |
| "epoch": 6.397952655150352, |
| "grad_norm": 5.948920726776123, |
| "learning_rate": 9.997e-05, |
| "loss": 7.0008, |
| "step": 10000 |
| }, |
| { |
| "epoch": 6.717850287907869, |
| "grad_norm": 8.036959648132324, |
| "learning_rate": 9.982896551724137e-05, |
| "loss": 6.8966, |
| "step": 10500 |
| }, |
| { |
| "epoch": 7.037747920665387, |
| "grad_norm": 7.160433292388916, |
| "learning_rate": 9.965655172413794e-05, |
| "loss": 6.7509, |
| "step": 11000 |
| }, |
| { |
| "epoch": 7.357645553422905, |
| "grad_norm": 5.934999465942383, |
| "learning_rate": 9.948413793103449e-05, |
| "loss": 6.5833, |
| "step": 11500 |
| }, |
| { |
| "epoch": 7.677543186180422, |
| "grad_norm": 7.745622634887695, |
| "learning_rate": 9.931172413793104e-05, |
| "loss": 6.4975, |
| "step": 12000 |
| }, |
| { |
| "epoch": 7.99744081893794, |
| "grad_norm": 7.0418477058410645, |
| "learning_rate": 9.91393103448276e-05, |
| "loss": 6.4261, |
| "step": 12500 |
| }, |
| { |
| "epoch": 8.317338451695457, |
| "grad_norm": 6.101259708404541, |
| "learning_rate": 9.896689655172414e-05, |
| "loss": 6.2092, |
| "step": 13000 |
| }, |
| { |
| "epoch": 8.637236084452976, |
| "grad_norm": 7.289799213409424, |
| "learning_rate": 9.87944827586207e-05, |
| "loss": 6.1436, |
| "step": 13500 |
| }, |
| { |
| "epoch": 8.957133717210493, |
| "grad_norm": 8.126811027526855, |
| "learning_rate": 9.862206896551725e-05, |
| "loss": 6.0456, |
| "step": 14000 |
| }, |
| { |
| "epoch": 9.277031349968011, |
| "grad_norm": 8.221816062927246, |
| "learning_rate": 9.845000000000001e-05, |
| "loss": 5.9141, |
| "step": 14500 |
| }, |
| { |
| "epoch": 9.596928982725528, |
| "grad_norm": 7.361550331115723, |
| "learning_rate": 9.827793103448277e-05, |
| "loss": 5.8326, |
| "step": 15000 |
| }, |
| { |
| "epoch": 9.916826615483046, |
| "grad_norm": 7.1737775802612305, |
| "learning_rate": 9.810551724137932e-05, |
| "loss": 5.7974, |
| "step": 15500 |
| }, |
| { |
| "epoch": 10.236724248240563, |
| "grad_norm": 9.80185604095459, |
| "learning_rate": 9.793310344827586e-05, |
| "loss": 5.6282, |
| "step": 16000 |
| }, |
| { |
| "epoch": 10.556621880998081, |
| "grad_norm": 7.2062153816223145, |
| "learning_rate": 9.776068965517242e-05, |
| "loss": 5.5619, |
| "step": 16500 |
| }, |
| { |
| "epoch": 10.876519513755598, |
| "grad_norm": 10.801878929138184, |
| "learning_rate": 9.758827586206896e-05, |
| "loss": 5.5155, |
| "step": 17000 |
| }, |
| { |
| "epoch": 11.196417146513117, |
| "grad_norm": 8.48509693145752, |
| "learning_rate": 9.741586206896553e-05, |
| "loss": 5.4259, |
| "step": 17500 |
| }, |
| { |
| "epoch": 11.516314779270633, |
| "grad_norm": 8.47572135925293, |
| "learning_rate": 9.724344827586207e-05, |
| "loss": 5.3205, |
| "step": 18000 |
| }, |
| { |
| "epoch": 11.836212412028152, |
| "grad_norm": 6.122796535491943, |
| "learning_rate": 9.707103448275863e-05, |
| "loss": 5.3025, |
| "step": 18500 |
| }, |
| { |
| "epoch": 12.156110044785668, |
| "grad_norm": 8.210710525512695, |
| "learning_rate": 9.689896551724139e-05, |
| "loss": 5.2264, |
| "step": 19000 |
| }, |
| { |
| "epoch": 12.476007677543187, |
| "grad_norm": 7.857537746429443, |
| "learning_rate": 9.672655172413794e-05, |
| "loss": 5.1395, |
| "step": 19500 |
| }, |
| { |
| "epoch": 12.795905310300704, |
| "grad_norm": 7.743075370788574, |
| "learning_rate": 9.655413793103448e-05, |
| "loss": 5.1109, |
| "step": 20000 |
| }, |
| { |
| "epoch": 13.115802943058222, |
| "grad_norm": 10.574569702148438, |
| "learning_rate": 9.638172413793104e-05, |
| "loss": 5.0794, |
| "step": 20500 |
| }, |
| { |
| "epoch": 13.435700575815739, |
| "grad_norm": 8.313858985900879, |
| "learning_rate": 9.620931034482758e-05, |
| "loss": 4.921, |
| "step": 21000 |
| }, |
| { |
| "epoch": 13.755598208573257, |
| "grad_norm": 9.096057891845703, |
| "learning_rate": 9.603689655172414e-05, |
| "loss": 4.96, |
| "step": 21500 |
| }, |
| { |
| "epoch": 14.075495841330774, |
| "grad_norm": 8.402993202209473, |
| "learning_rate": 9.58644827586207e-05, |
| "loss": 4.9062, |
| "step": 22000 |
| }, |
| { |
| "epoch": 14.395393474088293, |
| "grad_norm": 8.110074996948242, |
| "learning_rate": 9.569206896551725e-05, |
| "loss": 4.8026, |
| "step": 22500 |
| }, |
| { |
| "epoch": 14.71529110684581, |
| "grad_norm": 7.908292293548584, |
| "learning_rate": 9.552000000000001e-05, |
| "loss": 4.82, |
| "step": 23000 |
| }, |
| { |
| "epoch": 15.035188739603328, |
| "grad_norm": 7.991878986358643, |
| "learning_rate": 9.534758620689655e-05, |
| "loss": 4.7397, |
| "step": 23500 |
| }, |
| { |
| "epoch": 15.355086372360844, |
| "grad_norm": 8.696029663085938, |
| "learning_rate": 9.517551724137932e-05, |
| "loss": 4.6656, |
| "step": 24000 |
| }, |
| { |
| "epoch": 15.674984005118363, |
| "grad_norm": 9.421612739562988, |
| "learning_rate": 9.500310344827586e-05, |
| "loss": 4.6412, |
| "step": 24500 |
| }, |
| { |
| "epoch": 15.99488163787588, |
| "grad_norm": 9.747482299804688, |
| "learning_rate": 9.483068965517242e-05, |
| "loss": 4.6048, |
| "step": 25000 |
| }, |
| { |
| "epoch": 16.314779270633398, |
| "grad_norm": 10.389492988586426, |
| "learning_rate": 9.465827586206897e-05, |
| "loss": 4.481, |
| "step": 25500 |
| }, |
| { |
| "epoch": 16.634676903390915, |
| "grad_norm": 8.661949157714844, |
| "learning_rate": 9.448586206896553e-05, |
| "loss": 4.4923, |
| "step": 26000 |
| }, |
| { |
| "epoch": 16.95457453614843, |
| "grad_norm": 12.681297302246094, |
| "learning_rate": 9.431344827586207e-05, |
| "loss": 4.4816, |
| "step": 26500 |
| }, |
| { |
| "epoch": 17.27447216890595, |
| "grad_norm": 8.993134498596191, |
| "learning_rate": 9.414103448275863e-05, |
| "loss": 4.3512, |
| "step": 27000 |
| }, |
| { |
| "epoch": 17.59436980166347, |
| "grad_norm": 10.020146369934082, |
| "learning_rate": 9.396862068965517e-05, |
| "loss": 4.3447, |
| "step": 27500 |
| }, |
| { |
| "epoch": 17.914267434420985, |
| "grad_norm": 9.514701843261719, |
| "learning_rate": 9.379655172413794e-05, |
| "loss": 4.3376, |
| "step": 28000 |
| }, |
| { |
| "epoch": 18.234165067178502, |
| "grad_norm": 10.324498176574707, |
| "learning_rate": 9.362413793103448e-05, |
| "loss": 4.2612, |
| "step": 28500 |
| }, |
| { |
| "epoch": 18.554062699936022, |
| "grad_norm": 10.682856559753418, |
| "learning_rate": 9.345172413793104e-05, |
| "loss": 4.226, |
| "step": 29000 |
| }, |
| { |
| "epoch": 18.87396033269354, |
| "grad_norm": 7.883260726928711, |
| "learning_rate": 9.327931034482758e-05, |
| "loss": 4.19, |
| "step": 29500 |
| }, |
| { |
| "epoch": 19.193857965451055, |
| "grad_norm": 12.470623016357422, |
| "learning_rate": 9.310724137931035e-05, |
| "loss": 4.1881, |
| "step": 30000 |
| }, |
| { |
| "epoch": 19.513755598208572, |
| "grad_norm": 9.932331085205078, |
| "learning_rate": 9.29348275862069e-05, |
| "loss": 4.0853, |
| "step": 30500 |
| }, |
| { |
| "epoch": 19.833653230966092, |
| "grad_norm": 8.153782844543457, |
| "learning_rate": 9.276241379310345e-05, |
| "loss": 4.1087, |
| "step": 31000 |
| }, |
| { |
| "epoch": 20.15355086372361, |
| "grad_norm": 8.214093208312988, |
| "learning_rate": 9.258999999999999e-05, |
| "loss": 4.0751, |
| "step": 31500 |
| }, |
| { |
| "epoch": 20.473448496481126, |
| "grad_norm": 11.927350044250488, |
| "learning_rate": 9.241758620689656e-05, |
| "loss": 3.9686, |
| "step": 32000 |
| }, |
| { |
| "epoch": 20.793346129238643, |
| "grad_norm": 9.67835807800293, |
| "learning_rate": 9.224551724137932e-05, |
| "loss": 3.9745, |
| "step": 32500 |
| }, |
| { |
| "epoch": 21.113243761996163, |
| "grad_norm": 9.911735534667969, |
| "learning_rate": 9.207310344827586e-05, |
| "loss": 3.9308, |
| "step": 33000 |
| }, |
| { |
| "epoch": 21.43314139475368, |
| "grad_norm": 9.05053424835205, |
| "learning_rate": 9.190068965517242e-05, |
| "loss": 3.8718, |
| "step": 33500 |
| }, |
| { |
| "epoch": 21.753039027511196, |
| "grad_norm": 9.588044166564941, |
| "learning_rate": 9.172827586206897e-05, |
| "loss": 3.8425, |
| "step": 34000 |
| }, |
| { |
| "epoch": 22.072936660268713, |
| "grad_norm": 8.788230895996094, |
| "learning_rate": 9.155620689655173e-05, |
| "loss": 3.8617, |
| "step": 34500 |
| }, |
| { |
| "epoch": 22.392834293026233, |
| "grad_norm": 9.435895919799805, |
| "learning_rate": 9.138379310344827e-05, |
| "loss": 3.7524, |
| "step": 35000 |
| }, |
| { |
| "epoch": 22.71273192578375, |
| "grad_norm": 9.870182037353516, |
| "learning_rate": 9.121137931034483e-05, |
| "loss": 3.7916, |
| "step": 35500 |
| }, |
| { |
| "epoch": 23.032629558541267, |
| "grad_norm": 9.612881660461426, |
| "learning_rate": 9.103896551724139e-05, |
| "loss": 3.8011, |
| "step": 36000 |
| }, |
| { |
| "epoch": 23.352527191298783, |
| "grad_norm": 9.643827438354492, |
| "learning_rate": 9.086689655172414e-05, |
| "loss": 3.6478, |
| "step": 36500 |
| }, |
| { |
| "epoch": 23.672424824056304, |
| "grad_norm": 14.105424880981445, |
| "learning_rate": 9.069448275862069e-05, |
| "loss": 3.6671, |
| "step": 37000 |
| }, |
| { |
| "epoch": 23.99232245681382, |
| "grad_norm": 10.427962303161621, |
| "learning_rate": 9.052206896551724e-05, |
| "loss": 3.6809, |
| "step": 37500 |
| }, |
| { |
| "epoch": 24.312220089571337, |
| "grad_norm": 11.505946159362793, |
| "learning_rate": 9.03496551724138e-05, |
| "loss": 3.553, |
| "step": 38000 |
| }, |
| { |
| "epoch": 24.632117722328854, |
| "grad_norm": 10.393635749816895, |
| "learning_rate": 9.017724137931035e-05, |
| "loss": 3.5408, |
| "step": 38500 |
| }, |
| { |
| "epoch": 24.952015355086374, |
| "grad_norm": 9.023842811584473, |
| "learning_rate": 9.00051724137931e-05, |
| "loss": 3.5915, |
| "step": 39000 |
| }, |
| { |
| "epoch": 25.27191298784389, |
| "grad_norm": 10.69048023223877, |
| "learning_rate": 8.983275862068967e-05, |
| "loss": 3.4896, |
| "step": 39500 |
| }, |
| { |
| "epoch": 25.591810620601407, |
| "grad_norm": 10.803936958312988, |
| "learning_rate": 8.966034482758621e-05, |
| "loss": 3.4854, |
| "step": 40000 |
| }, |
| { |
| "epoch": 25.911708253358924, |
| "grad_norm": 10.489801406860352, |
| "learning_rate": 8.948793103448276e-05, |
| "loss": 3.4871, |
| "step": 40500 |
| }, |
| { |
| "epoch": 26.231605886116444, |
| "grad_norm": 10.558309555053711, |
| "learning_rate": 8.931586206896552e-05, |
| "loss": 3.4186, |
| "step": 41000 |
| }, |
| { |
| "epoch": 26.55150351887396, |
| "grad_norm": 12.186748504638672, |
| "learning_rate": 8.914344827586208e-05, |
| "loss": 3.4027, |
| "step": 41500 |
| }, |
| { |
| "epoch": 26.871401151631478, |
| "grad_norm": 9.8623046875, |
| "learning_rate": 8.897103448275862e-05, |
| "loss": 3.4191, |
| "step": 42000 |
| }, |
| { |
| "epoch": 27.191298784388994, |
| "grad_norm": 11.407792091369629, |
| "learning_rate": 8.879862068965518e-05, |
| "loss": 3.341, |
| "step": 42500 |
| }, |
| { |
| "epoch": 27.511196417146515, |
| "grad_norm": 13.37617301940918, |
| "learning_rate": 8.862655172413794e-05, |
| "loss": 3.3137, |
| "step": 43000 |
| }, |
| { |
| "epoch": 27.83109404990403, |
| "grad_norm": 10.30826187133789, |
| "learning_rate": 8.845413793103449e-05, |
| "loss": 3.3036, |
| "step": 43500 |
| }, |
| { |
| "epoch": 28.150991682661548, |
| "grad_norm": 12.024778366088867, |
| "learning_rate": 8.828172413793105e-05, |
| "loss": 3.2678, |
| "step": 44000 |
| }, |
| { |
| "epoch": 28.470889315419065, |
| "grad_norm": 9.730340957641602, |
| "learning_rate": 8.810931034482759e-05, |
| "loss": 3.1949, |
| "step": 44500 |
| }, |
| { |
| "epoch": 28.790786948176585, |
| "grad_norm": 9.700602531433105, |
| "learning_rate": 8.793689655172414e-05, |
| "loss": 3.2541, |
| "step": 45000 |
| }, |
| { |
| "epoch": 29.1106845809341, |
| "grad_norm": 12.359143257141113, |
| "learning_rate": 8.77648275862069e-05, |
| "loss": 3.2456, |
| "step": 45500 |
| }, |
| { |
| "epoch": 29.43058221369162, |
| "grad_norm": 11.989018440246582, |
| "learning_rate": 8.759241379310346e-05, |
| "loss": 3.1154, |
| "step": 46000 |
| }, |
| { |
| "epoch": 29.750479846449135, |
| "grad_norm": 10.904190063476562, |
| "learning_rate": 8.742e-05, |
| "loss": 3.175, |
| "step": 46500 |
| }, |
| { |
| "epoch": 30.070377479206655, |
| "grad_norm": 11.253949165344238, |
| "learning_rate": 8.724758620689656e-05, |
| "loss": 3.1478, |
| "step": 47000 |
| }, |
| { |
| "epoch": 30.390275111964172, |
| "grad_norm": 12.229791641235352, |
| "learning_rate": 8.707517241379311e-05, |
| "loss": 3.0632, |
| "step": 47500 |
| }, |
| { |
| "epoch": 30.71017274472169, |
| "grad_norm": 9.516524314880371, |
| "learning_rate": 8.690275862068967e-05, |
| "loss": 3.0843, |
| "step": 48000 |
| }, |
| { |
| "epoch": 31.030070377479205, |
| "grad_norm": 13.730731010437012, |
| "learning_rate": 8.673034482758621e-05, |
| "loss": 3.098, |
| "step": 48500 |
| }, |
| { |
| "epoch": 31.349968010236726, |
| "grad_norm": 9.73539924621582, |
| "learning_rate": 8.655827586206897e-05, |
| "loss": 2.9611, |
| "step": 49000 |
| }, |
| { |
| "epoch": 31.669865642994242, |
| "grad_norm": 12.066815376281738, |
| "learning_rate": 8.638586206896552e-05, |
| "loss": 2.9943, |
| "step": 49500 |
| }, |
| { |
| "epoch": 31.98976327575176, |
| "grad_norm": 11.028585433959961, |
| "learning_rate": 8.621344827586208e-05, |
| "loss": 3.0424, |
| "step": 50000 |
| }, |
| { |
| "epoch": 32.30966090850928, |
| "grad_norm": 11.2380952835083, |
| "learning_rate": 8.604103448275862e-05, |
| "loss": 2.9023, |
| "step": 50500 |
| }, |
| { |
| "epoch": 32.629558541266796, |
| "grad_norm": 9.345772743225098, |
| "learning_rate": 8.586862068965518e-05, |
| "loss": 2.9586, |
| "step": 51000 |
| }, |
| { |
| "epoch": 32.94945617402431, |
| "grad_norm": 10.239849090576172, |
| "learning_rate": 8.569655172413793e-05, |
| "loss": 2.9461, |
| "step": 51500 |
| }, |
| { |
| "epoch": 33.26935380678183, |
| "grad_norm": 11.058523178100586, |
| "learning_rate": 8.552413793103449e-05, |
| "loss": 2.8453, |
| "step": 52000 |
| }, |
| { |
| "epoch": 33.589251439539346, |
| "grad_norm": 12.131317138671875, |
| "learning_rate": 8.535172413793105e-05, |
| "loss": 2.8603, |
| "step": 52500 |
| }, |
| { |
| "epoch": 33.90914907229686, |
| "grad_norm": 10.392476081848145, |
| "learning_rate": 8.517931034482759e-05, |
| "loss": 2.8817, |
| "step": 53000 |
| }, |
| { |
| "epoch": 34.22904670505438, |
| "grad_norm": 10.749021530151367, |
| "learning_rate": 8.500724137931036e-05, |
| "loss": 2.8073, |
| "step": 53500 |
| }, |
| { |
| "epoch": 34.5489443378119, |
| "grad_norm": 12.33171558380127, |
| "learning_rate": 8.48348275862069e-05, |
| "loss": 2.7793, |
| "step": 54000 |
| }, |
| { |
| "epoch": 34.86884197056942, |
| "grad_norm": 12.961758613586426, |
| "learning_rate": 8.466241379310346e-05, |
| "loss": 2.8066, |
| "step": 54500 |
| }, |
| { |
| "epoch": 35.18873960332694, |
| "grad_norm": 13.320075035095215, |
| "learning_rate": 8.449e-05, |
| "loss": 2.7459, |
| "step": 55000 |
| }, |
| { |
| "epoch": 35.50863723608445, |
| "grad_norm": 14.416489601135254, |
| "learning_rate": 8.431758620689655e-05, |
| "loss": 2.7321, |
| "step": 55500 |
| }, |
| { |
| "epoch": 35.82853486884197, |
| "grad_norm": 11.203073501586914, |
| "learning_rate": 8.414551724137931e-05, |
| "loss": 2.7486, |
| "step": 56000 |
| }, |
| { |
| "epoch": 36.14843250159949, |
| "grad_norm": 10.463476181030273, |
| "learning_rate": 8.397310344827587e-05, |
| "loss": 2.7086, |
| "step": 56500 |
| }, |
| { |
| "epoch": 36.468330134357004, |
| "grad_norm": 11.375761985778809, |
| "learning_rate": 8.380068965517241e-05, |
| "loss": 2.6387, |
| "step": 57000 |
| }, |
| { |
| "epoch": 36.78822776711452, |
| "grad_norm": 11.649105072021484, |
| "learning_rate": 8.362827586206897e-05, |
| "loss": 2.6746, |
| "step": 57500 |
| }, |
| { |
| "epoch": 37.108125399872044, |
| "grad_norm": 12.708244323730469, |
| "learning_rate": 8.345586206896552e-05, |
| "loss": 2.6454, |
| "step": 58000 |
| }, |
| { |
| "epoch": 37.42802303262956, |
| "grad_norm": 12.876201629638672, |
| "learning_rate": 8.328344827586208e-05, |
| "loss": 2.5798, |
| "step": 58500 |
| }, |
| { |
| "epoch": 37.74792066538708, |
| "grad_norm": 11.92346477508545, |
| "learning_rate": 8.311103448275862e-05, |
| "loss": 2.6589, |
| "step": 59000 |
| }, |
| { |
| "epoch": 38.067818298144594, |
| "grad_norm": 10.742238998413086, |
| "learning_rate": 8.293896551724138e-05, |
| "loss": 2.5868, |
| "step": 59500 |
| }, |
| { |
| "epoch": 38.38771593090211, |
| "grad_norm": 11.399048805236816, |
| "learning_rate": 8.276655172413793e-05, |
| "loss": 2.5124, |
| "step": 60000 |
| }, |
| { |
| "epoch": 38.70761356365963, |
| "grad_norm": 13.563875198364258, |
| "learning_rate": 8.259413793103449e-05, |
| "loss": 2.576, |
| "step": 60500 |
| }, |
| { |
| "epoch": 39.027511196417144, |
| "grad_norm": 11.297135353088379, |
| "learning_rate": 8.242172413793103e-05, |
| "loss": 2.5671, |
| "step": 61000 |
| }, |
| { |
| "epoch": 39.34740882917466, |
| "grad_norm": 11.336121559143066, |
| "learning_rate": 8.22496551724138e-05, |
| "loss": 2.4445, |
| "step": 61500 |
| }, |
| { |
| "epoch": 39.667306461932185, |
| "grad_norm": 9.477692604064941, |
| "learning_rate": 8.207724137931035e-05, |
| "loss": 2.4981, |
| "step": 62000 |
| }, |
| { |
| "epoch": 39.9872040946897, |
| "grad_norm": 11.597848892211914, |
| "learning_rate": 8.19048275862069e-05, |
| "loss": 2.5382, |
| "step": 62500 |
| }, |
| { |
| "epoch": 40.30710172744722, |
| "grad_norm": 14.910037994384766, |
| "learning_rate": 8.173241379310346e-05, |
| "loss": 2.4158, |
| "step": 63000 |
| }, |
| { |
| "epoch": 40.626999360204735, |
| "grad_norm": 11.870673179626465, |
| "learning_rate": 8.156e-05, |
| "loss": 2.4395, |
| "step": 63500 |
| }, |
| { |
| "epoch": 40.94689699296225, |
| "grad_norm": 15.279576301574707, |
| "learning_rate": 8.138758620689655e-05, |
| "loss": 2.4653, |
| "step": 64000 |
| }, |
| { |
| "epoch": 41.26679462571977, |
| "grad_norm": 11.710406303405762, |
| "learning_rate": 8.121551724137931e-05, |
| "loss": 2.3567, |
| "step": 64500 |
| }, |
| { |
| "epoch": 41.586692258477285, |
| "grad_norm": 10.663411140441895, |
| "learning_rate": 8.104310344827587e-05, |
| "loss": 2.3655, |
| "step": 65000 |
| }, |
| { |
| "epoch": 41.9065898912348, |
| "grad_norm": 13.946629524230957, |
| "learning_rate": 8.087068965517241e-05, |
| "loss": 2.4336, |
| "step": 65500 |
| }, |
| { |
| "epoch": 42.226487523992326, |
| "grad_norm": 13.782262802124023, |
| "learning_rate": 8.069827586206898e-05, |
| "loss": 2.3351, |
| "step": 66000 |
| }, |
| { |
| "epoch": 42.54638515674984, |
| "grad_norm": 11.177961349487305, |
| "learning_rate": 8.052586206896552e-05, |
| "loss": 2.359, |
| "step": 66500 |
| }, |
| { |
| "epoch": 42.86628278950736, |
| "grad_norm": 15.120301246643066, |
| "learning_rate": 8.035344827586208e-05, |
| "loss": 2.355, |
| "step": 67000 |
| }, |
| { |
| "epoch": 43.186180422264876, |
| "grad_norm": 10.805267333984375, |
| "learning_rate": 8.018103448275862e-05, |
| "loss": 2.2905, |
| "step": 67500 |
| }, |
| { |
| "epoch": 43.50607805502239, |
| "grad_norm": 11.777176856994629, |
| "learning_rate": 8.000862068965517e-05, |
| "loss": 2.2906, |
| "step": 68000 |
| }, |
| { |
| "epoch": 43.82597568777991, |
| "grad_norm": 15.457807540893555, |
| "learning_rate": 7.983689655172414e-05, |
| "loss": 2.3269, |
| "step": 68500 |
| }, |
| { |
| "epoch": 44.145873320537426, |
| "grad_norm": 11.639357566833496, |
| "learning_rate": 7.966448275862069e-05, |
| "loss": 2.2371, |
| "step": 69000 |
| }, |
| { |
| "epoch": 44.46577095329494, |
| "grad_norm": 11.710591316223145, |
| "learning_rate": 7.949206896551725e-05, |
| "loss": 2.2248, |
| "step": 69500 |
| }, |
| { |
| "epoch": 44.785668586052466, |
| "grad_norm": 12.675103187561035, |
| "learning_rate": 7.93196551724138e-05, |
| "loss": 2.2586, |
| "step": 70000 |
| }, |
| { |
| "epoch": 45.10556621880998, |
| "grad_norm": 12.752120971679688, |
| "learning_rate": 7.914724137931034e-05, |
| "loss": 2.2489, |
| "step": 70500 |
| }, |
| { |
| "epoch": 45.4254638515675, |
| "grad_norm": 11.379339218139648, |
| "learning_rate": 7.89751724137931e-05, |
| "loss": 2.1774, |
| "step": 71000 |
| }, |
| { |
| "epoch": 45.74536148432502, |
| "grad_norm": 12.76633358001709, |
| "learning_rate": 7.880275862068966e-05, |
| "loss": 2.2007, |
| "step": 71500 |
| }, |
| { |
| "epoch": 46.06525911708253, |
| "grad_norm": 11.421367645263672, |
| "learning_rate": 7.863034482758621e-05, |
| "loss": 2.2262, |
| "step": 72000 |
| }, |
| { |
| "epoch": 46.38515674984005, |
| "grad_norm": 14.81748104095459, |
| "learning_rate": 7.845793103448277e-05, |
| "loss": 2.101, |
| "step": 72500 |
| }, |
| { |
| "epoch": 46.70505438259757, |
| "grad_norm": 12.902971267700195, |
| "learning_rate": 7.828551724137931e-05, |
| "loss": 2.1568, |
| "step": 73000 |
| }, |
| { |
| "epoch": 47.02495201535508, |
| "grad_norm": 10.685113906860352, |
| "learning_rate": 7.811310344827587e-05, |
| "loss": 2.1655, |
| "step": 73500 |
| }, |
| { |
| "epoch": 47.34484964811261, |
| "grad_norm": 15.892518043518066, |
| "learning_rate": 7.794068965517242e-05, |
| "loss": 2.0551, |
| "step": 74000 |
| }, |
| { |
| "epoch": 47.664747280870124, |
| "grad_norm": 13.730358123779297, |
| "learning_rate": 7.776862068965518e-05, |
| "loss": 2.1053, |
| "step": 74500 |
| }, |
| { |
| "epoch": 47.98464491362764, |
| "grad_norm": 13.635787963867188, |
| "learning_rate": 7.759620689655172e-05, |
| "loss": 2.1408, |
| "step": 75000 |
| }, |
| { |
| "epoch": 48.30454254638516, |
| "grad_norm": 12.861611366271973, |
| "learning_rate": 7.742379310344828e-05, |
| "loss": 2.0104, |
| "step": 75500 |
| }, |
| { |
| "epoch": 48.624440179142674, |
| "grad_norm": 11.84931468963623, |
| "learning_rate": 7.725137931034483e-05, |
| "loss": 2.0555, |
| "step": 76000 |
| }, |
| { |
| "epoch": 48.94433781190019, |
| "grad_norm": 15.812765121459961, |
| "learning_rate": 7.70793103448276e-05, |
| "loss": 2.1087, |
| "step": 76500 |
| }, |
| { |
| "epoch": 49.26423544465771, |
| "grad_norm": 14.233431816101074, |
| "learning_rate": 7.690689655172414e-05, |
| "loss": 1.9998, |
| "step": 77000 |
| }, |
| { |
| "epoch": 49.584133077415224, |
| "grad_norm": 14.329803466796875, |
| "learning_rate": 7.673448275862069e-05, |
| "loss": 2.0189, |
| "step": 77500 |
| }, |
| { |
| "epoch": 49.90403071017275, |
| "grad_norm": 11.00400161743164, |
| "learning_rate": 7.656206896551725e-05, |
| "loss": 2.059, |
| "step": 78000 |
| }, |
| { |
| "epoch": 50.223928342930265, |
| "grad_norm": 13.582133293151855, |
| "learning_rate": 7.63896551724138e-05, |
| "loss": 1.9753, |
| "step": 78500 |
| }, |
| { |
| "epoch": 50.54382597568778, |
| "grad_norm": 12.560907363891602, |
| "learning_rate": 7.621724137931034e-05, |
| "loss": 1.9759, |
| "step": 79000 |
| }, |
| { |
| "epoch": 50.8637236084453, |
| "grad_norm": 12.169915199279785, |
| "learning_rate": 7.60451724137931e-05, |
| "loss": 1.9944, |
| "step": 79500 |
| }, |
| { |
| "epoch": 51.183621241202815, |
| "grad_norm": 13.5604248046875, |
| "learning_rate": 7.587275862068966e-05, |
| "loss": 1.9323, |
| "step": 80000 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 300000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 192, |
| "save_steps": 20000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 8.984666210441626e+16, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |