| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 127.95905310300704, | |
| "eval_steps": 500, | |
| "global_step": 200000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.3198976327575176, | |
| "grad_norm": 4.1601386070251465, | |
| "learning_rate": 5e-06, | |
| "loss": 10.3279, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6397952655150352, | |
| "grad_norm": 4.366061687469482, | |
| "learning_rate": 1e-05, | |
| "loss": 9.3834, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.9596928982725528, | |
| "grad_norm": 4.784337043762207, | |
| "learning_rate": 1.5e-05, | |
| "loss": 8.8888, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.2795905310300704, | |
| "grad_norm": 3.9968652725219727, | |
| "learning_rate": 2e-05, | |
| "loss": 8.6568, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.599488163787588, | |
| "grad_norm": 4.402552127838135, | |
| "learning_rate": 2.5e-05, | |
| "loss": 8.5473, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.9193857965451055, | |
| "grad_norm": 4.639041423797607, | |
| "learning_rate": 3e-05, | |
| "loss": 8.4044, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.239283429302623, | |
| "grad_norm": 5.651747226715088, | |
| "learning_rate": 3.5e-05, | |
| "loss": 8.2868, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.5591810620601407, | |
| "grad_norm": 4.6999359130859375, | |
| "learning_rate": 4e-05, | |
| "loss": 8.1766, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.8790786948176583, | |
| "grad_norm": 4.838181495666504, | |
| "learning_rate": 4.499e-05, | |
| "loss": 8.1118, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 3.198976327575176, | |
| "grad_norm": 4.238831996917725, | |
| "learning_rate": 4.999e-05, | |
| "loss": 8.0038, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 3.5188739603326935, | |
| "grad_norm": 4.455530643463135, | |
| "learning_rate": 5.499000000000001e-05, | |
| "loss": 7.9014, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 3.838771593090211, | |
| "grad_norm": 5.811736583709717, | |
| "learning_rate": 5.999e-05, | |
| "loss": 7.8352, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 4.158669225847729, | |
| "grad_norm": 4.998301982879639, | |
| "learning_rate": 6.498e-05, | |
| "loss": 7.7613, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 4.478566858605246, | |
| "grad_norm": 5.011510848999023, | |
| "learning_rate": 6.998e-05, | |
| "loss": 7.6554, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 4.798464491362764, | |
| "grad_norm": 4.750300884246826, | |
| "learning_rate": 7.498e-05, | |
| "loss": 7.6109, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 5.1183621241202815, | |
| "grad_norm": 6.24017858505249, | |
| "learning_rate": 7.998e-05, | |
| "loss": 7.5186, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 5.438259756877799, | |
| "grad_norm": 6.061458587646484, | |
| "learning_rate": 8.497000000000001e-05, | |
| "loss": 7.3966, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 5.758157389635317, | |
| "grad_norm": 7.151447772979736, | |
| "learning_rate": 8.997000000000001e-05, | |
| "loss": 7.2877, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 6.078055022392834, | |
| "grad_norm": 7.578985214233398, | |
| "learning_rate": 9.497000000000001e-05, | |
| "loss": 7.1542, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 6.397952655150352, | |
| "grad_norm": 5.948920726776123, | |
| "learning_rate": 9.997e-05, | |
| "loss": 7.0008, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 6.717850287907869, | |
| "grad_norm": 8.036959648132324, | |
| "learning_rate": 9.982896551724137e-05, | |
| "loss": 6.8966, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 7.037747920665387, | |
| "grad_norm": 7.160433292388916, | |
| "learning_rate": 9.965655172413794e-05, | |
| "loss": 6.7509, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 7.357645553422905, | |
| "grad_norm": 5.934999465942383, | |
| "learning_rate": 9.948413793103449e-05, | |
| "loss": 6.5833, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 7.677543186180422, | |
| "grad_norm": 7.745622634887695, | |
| "learning_rate": 9.931172413793104e-05, | |
| "loss": 6.4975, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 7.99744081893794, | |
| "grad_norm": 7.0418477058410645, | |
| "learning_rate": 9.91393103448276e-05, | |
| "loss": 6.4261, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 8.317338451695457, | |
| "grad_norm": 6.101259708404541, | |
| "learning_rate": 9.896689655172414e-05, | |
| "loss": 6.2092, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 8.637236084452976, | |
| "grad_norm": 7.289799213409424, | |
| "learning_rate": 9.87944827586207e-05, | |
| "loss": 6.1436, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 8.957133717210493, | |
| "grad_norm": 8.126811027526855, | |
| "learning_rate": 9.862206896551725e-05, | |
| "loss": 6.0456, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 9.277031349968011, | |
| "grad_norm": 8.221816062927246, | |
| "learning_rate": 9.845000000000001e-05, | |
| "loss": 5.9141, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 9.596928982725528, | |
| "grad_norm": 7.361550331115723, | |
| "learning_rate": 9.827793103448277e-05, | |
| "loss": 5.8326, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 9.916826615483046, | |
| "grad_norm": 7.1737775802612305, | |
| "learning_rate": 9.810551724137932e-05, | |
| "loss": 5.7974, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 10.236724248240563, | |
| "grad_norm": 9.80185604095459, | |
| "learning_rate": 9.793310344827586e-05, | |
| "loss": 5.6282, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 10.556621880998081, | |
| "grad_norm": 7.2062153816223145, | |
| "learning_rate": 9.776068965517242e-05, | |
| "loss": 5.5619, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 10.876519513755598, | |
| "grad_norm": 10.801878929138184, | |
| "learning_rate": 9.758827586206896e-05, | |
| "loss": 5.5155, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 11.196417146513117, | |
| "grad_norm": 8.48509693145752, | |
| "learning_rate": 9.741586206896553e-05, | |
| "loss": 5.4259, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 11.516314779270633, | |
| "grad_norm": 8.47572135925293, | |
| "learning_rate": 9.724344827586207e-05, | |
| "loss": 5.3205, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 11.836212412028152, | |
| "grad_norm": 6.122796535491943, | |
| "learning_rate": 9.707103448275863e-05, | |
| "loss": 5.3025, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 12.156110044785668, | |
| "grad_norm": 8.210710525512695, | |
| "learning_rate": 9.689896551724139e-05, | |
| "loss": 5.2264, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 12.476007677543187, | |
| "grad_norm": 7.857537746429443, | |
| "learning_rate": 9.672655172413794e-05, | |
| "loss": 5.1395, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 12.795905310300704, | |
| "grad_norm": 7.743075370788574, | |
| "learning_rate": 9.655413793103448e-05, | |
| "loss": 5.1109, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 13.115802943058222, | |
| "grad_norm": 10.574569702148438, | |
| "learning_rate": 9.638172413793104e-05, | |
| "loss": 5.0794, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 13.435700575815739, | |
| "grad_norm": 8.313858985900879, | |
| "learning_rate": 9.620931034482758e-05, | |
| "loss": 4.921, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 13.755598208573257, | |
| "grad_norm": 9.096057891845703, | |
| "learning_rate": 9.603689655172414e-05, | |
| "loss": 4.96, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 14.075495841330774, | |
| "grad_norm": 8.402993202209473, | |
| "learning_rate": 9.58644827586207e-05, | |
| "loss": 4.9062, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 14.395393474088293, | |
| "grad_norm": 8.110074996948242, | |
| "learning_rate": 9.569206896551725e-05, | |
| "loss": 4.8026, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 14.71529110684581, | |
| "grad_norm": 7.908292293548584, | |
| "learning_rate": 9.552000000000001e-05, | |
| "loss": 4.82, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 15.035188739603328, | |
| "grad_norm": 7.991878986358643, | |
| "learning_rate": 9.534758620689655e-05, | |
| "loss": 4.7397, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 15.355086372360844, | |
| "grad_norm": 8.696029663085938, | |
| "learning_rate": 9.517551724137932e-05, | |
| "loss": 4.6656, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 15.674984005118363, | |
| "grad_norm": 9.421612739562988, | |
| "learning_rate": 9.500310344827586e-05, | |
| "loss": 4.6412, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 15.99488163787588, | |
| "grad_norm": 9.747482299804688, | |
| "learning_rate": 9.483068965517242e-05, | |
| "loss": 4.6048, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 16.314779270633398, | |
| "grad_norm": 10.389492988586426, | |
| "learning_rate": 9.465827586206897e-05, | |
| "loss": 4.481, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 16.634676903390915, | |
| "grad_norm": 8.661949157714844, | |
| "learning_rate": 9.448586206896553e-05, | |
| "loss": 4.4923, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 16.95457453614843, | |
| "grad_norm": 12.681297302246094, | |
| "learning_rate": 9.431344827586207e-05, | |
| "loss": 4.4816, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 17.27447216890595, | |
| "grad_norm": 8.993134498596191, | |
| "learning_rate": 9.414103448275863e-05, | |
| "loss": 4.3512, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 17.59436980166347, | |
| "grad_norm": 10.020146369934082, | |
| "learning_rate": 9.396862068965517e-05, | |
| "loss": 4.3447, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 17.914267434420985, | |
| "grad_norm": 9.514701843261719, | |
| "learning_rate": 9.379655172413794e-05, | |
| "loss": 4.3376, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 18.234165067178502, | |
| "grad_norm": 10.324498176574707, | |
| "learning_rate": 9.362413793103448e-05, | |
| "loss": 4.2612, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 18.554062699936022, | |
| "grad_norm": 10.682856559753418, | |
| "learning_rate": 9.345172413793104e-05, | |
| "loss": 4.226, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 18.87396033269354, | |
| "grad_norm": 7.883260726928711, | |
| "learning_rate": 9.327931034482758e-05, | |
| "loss": 4.19, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 19.193857965451055, | |
| "grad_norm": 12.470623016357422, | |
| "learning_rate": 9.310724137931035e-05, | |
| "loss": 4.1881, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 19.513755598208572, | |
| "grad_norm": 9.932331085205078, | |
| "learning_rate": 9.29348275862069e-05, | |
| "loss": 4.0853, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 19.833653230966092, | |
| "grad_norm": 8.153782844543457, | |
| "learning_rate": 9.276241379310345e-05, | |
| "loss": 4.1087, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 20.15355086372361, | |
| "grad_norm": 8.214093208312988, | |
| "learning_rate": 9.258999999999999e-05, | |
| "loss": 4.0751, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 20.473448496481126, | |
| "grad_norm": 11.927350044250488, | |
| "learning_rate": 9.241758620689656e-05, | |
| "loss": 3.9686, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 20.793346129238643, | |
| "grad_norm": 9.67835807800293, | |
| "learning_rate": 9.224551724137932e-05, | |
| "loss": 3.9745, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 21.113243761996163, | |
| "grad_norm": 9.911735534667969, | |
| "learning_rate": 9.207310344827586e-05, | |
| "loss": 3.9308, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 21.43314139475368, | |
| "grad_norm": 9.05053424835205, | |
| "learning_rate": 9.190068965517242e-05, | |
| "loss": 3.8718, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 21.753039027511196, | |
| "grad_norm": 9.588044166564941, | |
| "learning_rate": 9.172827586206897e-05, | |
| "loss": 3.8425, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 22.072936660268713, | |
| "grad_norm": 8.788230895996094, | |
| "learning_rate": 9.155620689655173e-05, | |
| "loss": 3.8617, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 22.392834293026233, | |
| "grad_norm": 9.435895919799805, | |
| "learning_rate": 9.138379310344827e-05, | |
| "loss": 3.7524, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 22.71273192578375, | |
| "grad_norm": 9.870182037353516, | |
| "learning_rate": 9.121137931034483e-05, | |
| "loss": 3.7916, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 23.032629558541267, | |
| "grad_norm": 9.612881660461426, | |
| "learning_rate": 9.103896551724139e-05, | |
| "loss": 3.8011, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 23.352527191298783, | |
| "grad_norm": 9.643827438354492, | |
| "learning_rate": 9.086689655172414e-05, | |
| "loss": 3.6478, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 23.672424824056304, | |
| "grad_norm": 14.105424880981445, | |
| "learning_rate": 9.069448275862069e-05, | |
| "loss": 3.6671, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 23.99232245681382, | |
| "grad_norm": 10.427962303161621, | |
| "learning_rate": 9.052206896551724e-05, | |
| "loss": 3.6809, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 24.312220089571337, | |
| "grad_norm": 11.505946159362793, | |
| "learning_rate": 9.03496551724138e-05, | |
| "loss": 3.553, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 24.632117722328854, | |
| "grad_norm": 10.393635749816895, | |
| "learning_rate": 9.017724137931035e-05, | |
| "loss": 3.5408, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 24.952015355086374, | |
| "grad_norm": 9.023842811584473, | |
| "learning_rate": 9.00051724137931e-05, | |
| "loss": 3.5915, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 25.27191298784389, | |
| "grad_norm": 10.69048023223877, | |
| "learning_rate": 8.983275862068967e-05, | |
| "loss": 3.4896, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 25.591810620601407, | |
| "grad_norm": 10.803936958312988, | |
| "learning_rate": 8.966034482758621e-05, | |
| "loss": 3.4854, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 25.911708253358924, | |
| "grad_norm": 10.489801406860352, | |
| "learning_rate": 8.948793103448276e-05, | |
| "loss": 3.4871, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 26.231605886116444, | |
| "grad_norm": 10.558309555053711, | |
| "learning_rate": 8.931586206896552e-05, | |
| "loss": 3.4186, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 26.55150351887396, | |
| "grad_norm": 12.186748504638672, | |
| "learning_rate": 8.914344827586208e-05, | |
| "loss": 3.4027, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 26.871401151631478, | |
| "grad_norm": 9.8623046875, | |
| "learning_rate": 8.897103448275862e-05, | |
| "loss": 3.4191, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 27.191298784388994, | |
| "grad_norm": 11.407792091369629, | |
| "learning_rate": 8.879862068965518e-05, | |
| "loss": 3.341, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 27.511196417146515, | |
| "grad_norm": 13.37617301940918, | |
| "learning_rate": 8.862655172413794e-05, | |
| "loss": 3.3137, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 27.83109404990403, | |
| "grad_norm": 10.30826187133789, | |
| "learning_rate": 8.845413793103449e-05, | |
| "loss": 3.3036, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 28.150991682661548, | |
| "grad_norm": 12.024778366088867, | |
| "learning_rate": 8.828172413793105e-05, | |
| "loss": 3.2678, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 28.470889315419065, | |
| "grad_norm": 9.730340957641602, | |
| "learning_rate": 8.810931034482759e-05, | |
| "loss": 3.1949, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 28.790786948176585, | |
| "grad_norm": 9.700602531433105, | |
| "learning_rate": 8.793689655172414e-05, | |
| "loss": 3.2541, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 29.1106845809341, | |
| "grad_norm": 12.359143257141113, | |
| "learning_rate": 8.77648275862069e-05, | |
| "loss": 3.2456, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 29.43058221369162, | |
| "grad_norm": 11.989018440246582, | |
| "learning_rate": 8.759241379310346e-05, | |
| "loss": 3.1154, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 29.750479846449135, | |
| "grad_norm": 10.904190063476562, | |
| "learning_rate": 8.742e-05, | |
| "loss": 3.175, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 30.070377479206655, | |
| "grad_norm": 11.253949165344238, | |
| "learning_rate": 8.724758620689656e-05, | |
| "loss": 3.1478, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 30.390275111964172, | |
| "grad_norm": 12.229791641235352, | |
| "learning_rate": 8.707517241379311e-05, | |
| "loss": 3.0632, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 30.71017274472169, | |
| "grad_norm": 9.516524314880371, | |
| "learning_rate": 8.690275862068967e-05, | |
| "loss": 3.0843, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 31.030070377479205, | |
| "grad_norm": 13.730731010437012, | |
| "learning_rate": 8.673034482758621e-05, | |
| "loss": 3.098, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 31.349968010236726, | |
| "grad_norm": 9.73539924621582, | |
| "learning_rate": 8.655827586206897e-05, | |
| "loss": 2.9611, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 31.669865642994242, | |
| "grad_norm": 12.066815376281738, | |
| "learning_rate": 8.638586206896552e-05, | |
| "loss": 2.9943, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 31.98976327575176, | |
| "grad_norm": 11.028585433959961, | |
| "learning_rate": 8.621344827586208e-05, | |
| "loss": 3.0424, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 32.30966090850928, | |
| "grad_norm": 11.2380952835083, | |
| "learning_rate": 8.604103448275862e-05, | |
| "loss": 2.9023, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 32.629558541266796, | |
| "grad_norm": 9.345772743225098, | |
| "learning_rate": 8.586862068965518e-05, | |
| "loss": 2.9586, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 32.94945617402431, | |
| "grad_norm": 10.239849090576172, | |
| "learning_rate": 8.569655172413793e-05, | |
| "loss": 2.9461, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 33.26935380678183, | |
| "grad_norm": 11.058523178100586, | |
| "learning_rate": 8.552413793103449e-05, | |
| "loss": 2.8453, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 33.589251439539346, | |
| "grad_norm": 12.131317138671875, | |
| "learning_rate": 8.535172413793105e-05, | |
| "loss": 2.8603, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 33.90914907229686, | |
| "grad_norm": 10.392476081848145, | |
| "learning_rate": 8.517931034482759e-05, | |
| "loss": 2.8817, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 34.22904670505438, | |
| "grad_norm": 10.749021530151367, | |
| "learning_rate": 8.500724137931036e-05, | |
| "loss": 2.8073, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 34.5489443378119, | |
| "grad_norm": 12.33171558380127, | |
| "learning_rate": 8.48348275862069e-05, | |
| "loss": 2.7793, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 34.86884197056942, | |
| "grad_norm": 12.961758613586426, | |
| "learning_rate": 8.466241379310346e-05, | |
| "loss": 2.8066, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 35.18873960332694, | |
| "grad_norm": 13.320075035095215, | |
| "learning_rate": 8.449e-05, | |
| "loss": 2.7459, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 35.50863723608445, | |
| "grad_norm": 14.416489601135254, | |
| "learning_rate": 8.431758620689655e-05, | |
| "loss": 2.7321, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 35.82853486884197, | |
| "grad_norm": 11.203073501586914, | |
| "learning_rate": 8.414551724137931e-05, | |
| "loss": 2.7486, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 36.14843250159949, | |
| "grad_norm": 10.463476181030273, | |
| "learning_rate": 8.397310344827587e-05, | |
| "loss": 2.7086, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 36.468330134357004, | |
| "grad_norm": 11.375761985778809, | |
| "learning_rate": 8.380068965517241e-05, | |
| "loss": 2.6387, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 36.78822776711452, | |
| "grad_norm": 11.649105072021484, | |
| "learning_rate": 8.362827586206897e-05, | |
| "loss": 2.6746, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 37.108125399872044, | |
| "grad_norm": 12.708244323730469, | |
| "learning_rate": 8.345586206896552e-05, | |
| "loss": 2.6454, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 37.42802303262956, | |
| "grad_norm": 12.876201629638672, | |
| "learning_rate": 8.328344827586208e-05, | |
| "loss": 2.5798, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 37.74792066538708, | |
| "grad_norm": 11.92346477508545, | |
| "learning_rate": 8.311103448275862e-05, | |
| "loss": 2.6589, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 38.067818298144594, | |
| "grad_norm": 10.742238998413086, | |
| "learning_rate": 8.293896551724138e-05, | |
| "loss": 2.5868, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 38.38771593090211, | |
| "grad_norm": 11.399048805236816, | |
| "learning_rate": 8.276655172413793e-05, | |
| "loss": 2.5124, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 38.70761356365963, | |
| "grad_norm": 13.563875198364258, | |
| "learning_rate": 8.259413793103449e-05, | |
| "loss": 2.576, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 39.027511196417144, | |
| "grad_norm": 11.297135353088379, | |
| "learning_rate": 8.242172413793103e-05, | |
| "loss": 2.5671, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 39.34740882917466, | |
| "grad_norm": 11.336121559143066, | |
| "learning_rate": 8.22496551724138e-05, | |
| "loss": 2.4445, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 39.667306461932185, | |
| "grad_norm": 9.477692604064941, | |
| "learning_rate": 8.207724137931035e-05, | |
| "loss": 2.4981, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 39.9872040946897, | |
| "grad_norm": 11.597848892211914, | |
| "learning_rate": 8.19048275862069e-05, | |
| "loss": 2.5382, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 40.30710172744722, | |
| "grad_norm": 14.910037994384766, | |
| "learning_rate": 8.173241379310346e-05, | |
| "loss": 2.4158, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 40.626999360204735, | |
| "grad_norm": 11.870673179626465, | |
| "learning_rate": 8.156e-05, | |
| "loss": 2.4395, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 40.94689699296225, | |
| "grad_norm": 15.279576301574707, | |
| "learning_rate": 8.138758620689655e-05, | |
| "loss": 2.4653, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 41.26679462571977, | |
| "grad_norm": 11.710406303405762, | |
| "learning_rate": 8.121551724137931e-05, | |
| "loss": 2.3567, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 41.586692258477285, | |
| "grad_norm": 10.663411140441895, | |
| "learning_rate": 8.104310344827587e-05, | |
| "loss": 2.3655, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 41.9065898912348, | |
| "grad_norm": 13.946629524230957, | |
| "learning_rate": 8.087068965517241e-05, | |
| "loss": 2.4336, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 42.226487523992326, | |
| "grad_norm": 13.782262802124023, | |
| "learning_rate": 8.069827586206898e-05, | |
| "loss": 2.3351, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 42.54638515674984, | |
| "grad_norm": 11.177961349487305, | |
| "learning_rate": 8.052586206896552e-05, | |
| "loss": 2.359, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 42.86628278950736, | |
| "grad_norm": 15.120301246643066, | |
| "learning_rate": 8.035344827586208e-05, | |
| "loss": 2.355, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 43.186180422264876, | |
| "grad_norm": 10.805267333984375, | |
| "learning_rate": 8.018103448275862e-05, | |
| "loss": 2.2905, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 43.50607805502239, | |
| "grad_norm": 11.777176856994629, | |
| "learning_rate": 8.000862068965517e-05, | |
| "loss": 2.2906, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 43.82597568777991, | |
| "grad_norm": 15.457807540893555, | |
| "learning_rate": 7.983689655172414e-05, | |
| "loss": 2.3269, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 44.145873320537426, | |
| "grad_norm": 11.639357566833496, | |
| "learning_rate": 7.966448275862069e-05, | |
| "loss": 2.2371, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 44.46577095329494, | |
| "grad_norm": 11.710591316223145, | |
| "learning_rate": 7.949206896551725e-05, | |
| "loss": 2.2248, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 44.785668586052466, | |
| "grad_norm": 12.675103187561035, | |
| "learning_rate": 7.93196551724138e-05, | |
| "loss": 2.2586, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 45.10556621880998, | |
| "grad_norm": 12.752120971679688, | |
| "learning_rate": 7.914724137931034e-05, | |
| "loss": 2.2489, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 45.4254638515675, | |
| "grad_norm": 11.379339218139648, | |
| "learning_rate": 7.89751724137931e-05, | |
| "loss": 2.1774, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 45.74536148432502, | |
| "grad_norm": 12.76633358001709, | |
| "learning_rate": 7.880275862068966e-05, | |
| "loss": 2.2007, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 46.06525911708253, | |
| "grad_norm": 11.421367645263672, | |
| "learning_rate": 7.863034482758621e-05, | |
| "loss": 2.2262, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 46.38515674984005, | |
| "grad_norm": 14.81748104095459, | |
| "learning_rate": 7.845793103448277e-05, | |
| "loss": 2.101, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 46.70505438259757, | |
| "grad_norm": 12.902971267700195, | |
| "learning_rate": 7.828551724137931e-05, | |
| "loss": 2.1568, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 47.02495201535508, | |
| "grad_norm": 10.685113906860352, | |
| "learning_rate": 7.811310344827587e-05, | |
| "loss": 2.1655, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 47.34484964811261, | |
| "grad_norm": 15.892518043518066, | |
| "learning_rate": 7.794068965517242e-05, | |
| "loss": 2.0551, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 47.664747280870124, | |
| "grad_norm": 13.730358123779297, | |
| "learning_rate": 7.776862068965518e-05, | |
| "loss": 2.1053, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 47.98464491362764, | |
| "grad_norm": 13.635787963867188, | |
| "learning_rate": 7.759620689655172e-05, | |
| "loss": 2.1408, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 48.30454254638516, | |
| "grad_norm": 12.861611366271973, | |
| "learning_rate": 7.742379310344828e-05, | |
| "loss": 2.0104, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 48.624440179142674, | |
| "grad_norm": 11.84931468963623, | |
| "learning_rate": 7.725137931034483e-05, | |
| "loss": 2.0555, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 48.94433781190019, | |
| "grad_norm": 15.812765121459961, | |
| "learning_rate": 7.70793103448276e-05, | |
| "loss": 2.1087, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 49.26423544465771, | |
| "grad_norm": 14.233431816101074, | |
| "learning_rate": 7.690689655172414e-05, | |
| "loss": 1.9998, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 49.584133077415224, | |
| "grad_norm": 14.329803466796875, | |
| "learning_rate": 7.673448275862069e-05, | |
| "loss": 2.0189, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 49.90403071017275, | |
| "grad_norm": 11.00400161743164, | |
| "learning_rate": 7.656206896551725e-05, | |
| "loss": 2.059, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 50.223928342930265, | |
| "grad_norm": 13.582133293151855, | |
| "learning_rate": 7.63896551724138e-05, | |
| "loss": 1.9753, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 50.54382597568778, | |
| "grad_norm": 12.560907363891602, | |
| "learning_rate": 7.621724137931034e-05, | |
| "loss": 1.9759, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 50.8637236084453, | |
| "grad_norm": 12.169915199279785, | |
| "learning_rate": 7.60451724137931e-05, | |
| "loss": 1.9944, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 51.183621241202815, | |
| "grad_norm": 13.5604248046875, | |
| "learning_rate": 7.587275862068966e-05, | |
| "loss": 1.9323, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 51.50351887396033, | |
| "grad_norm": 15.892741203308105, | |
| "learning_rate": 7.570034482758621e-05, | |
| "loss": 1.9095, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 51.82341650671785, | |
| "grad_norm": 13.435209274291992, | |
| "learning_rate": 7.552793103448276e-05, | |
| "loss": 1.9508, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 52.143314139475365, | |
| "grad_norm": 11.180010795593262, | |
| "learning_rate": 7.535586206896551e-05, | |
| "loss": 1.9216, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 52.46321177223289, | |
| "grad_norm": 12.792661666870117, | |
| "learning_rate": 7.518344827586207e-05, | |
| "loss": 1.8817, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 52.783109404990405, | |
| "grad_norm": 11.785886764526367, | |
| "learning_rate": 7.501103448275863e-05, | |
| "loss": 1.9121, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 53.10300703774792, | |
| "grad_norm": 10.568120002746582, | |
| "learning_rate": 7.483862068965518e-05, | |
| "loss": 1.8885, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 53.42290467050544, | |
| "grad_norm": 14.641459465026855, | |
| "learning_rate": 7.466620689655172e-05, | |
| "loss": 1.8357, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 53.742802303262955, | |
| "grad_norm": 13.5363187789917, | |
| "learning_rate": 7.449379310344828e-05, | |
| "loss": 1.8515, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 54.06269993602047, | |
| "grad_norm": 12.997908592224121, | |
| "learning_rate": 7.432172413793104e-05, | |
| "loss": 1.8681, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 54.38259756877799, | |
| "grad_norm": 12.53503131866455, | |
| "learning_rate": 7.414931034482759e-05, | |
| "loss": 1.7785, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 54.702495201535505, | |
| "grad_norm": 11.986194610595703, | |
| "learning_rate": 7.397689655172413e-05, | |
| "loss": 1.8507, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 55.02239283429303, | |
| "grad_norm": 12.089723587036133, | |
| "learning_rate": 7.380448275862069e-05, | |
| "loss": 1.8537, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 55.342290467050546, | |
| "grad_norm": 13.552453994750977, | |
| "learning_rate": 7.363206896551725e-05, | |
| "loss": 1.7622, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 55.66218809980806, | |
| "grad_norm": 12.03878116607666, | |
| "learning_rate": 7.346e-05, | |
| "loss": 1.8076, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 55.98208573256558, | |
| "grad_norm": 11.187782287597656, | |
| "learning_rate": 7.328758620689655e-05, | |
| "loss": 1.8241, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 56.301983365323096, | |
| "grad_norm": 14.924737930297852, | |
| "learning_rate": 7.311517241379312e-05, | |
| "loss": 1.7077, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 56.62188099808061, | |
| "grad_norm": 12.302467346191406, | |
| "learning_rate": 7.294275862068966e-05, | |
| "loss": 1.74, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 56.94177863083813, | |
| "grad_norm": 10.834394454956055, | |
| "learning_rate": 7.277034482758621e-05, | |
| "loss": 1.7827, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 57.261676263595646, | |
| "grad_norm": 14.356012344360352, | |
| "learning_rate": 7.259793103448276e-05, | |
| "loss": 1.7082, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 57.58157389635317, | |
| "grad_norm": 14.632678031921387, | |
| "learning_rate": 7.242551724137931e-05, | |
| "loss": 1.7045, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 57.90147152911069, | |
| "grad_norm": 13.501043319702148, | |
| "learning_rate": 7.225344827586207e-05, | |
| "loss": 1.7522, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 58.2213691618682, | |
| "grad_norm": 18.839614868164062, | |
| "learning_rate": 7.208103448275862e-05, | |
| "loss": 1.6801, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 58.54126679462572, | |
| "grad_norm": 13.41618824005127, | |
| "learning_rate": 7.190862068965517e-05, | |
| "loss": 1.7005, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 58.86116442738324, | |
| "grad_norm": 12.56169605255127, | |
| "learning_rate": 7.173620689655172e-05, | |
| "loss": 1.7018, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 59.18106206014075, | |
| "grad_norm": 13.447467803955078, | |
| "learning_rate": 7.15641379310345e-05, | |
| "loss": 1.6691, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 59.50095969289827, | |
| "grad_norm": 12.452493667602539, | |
| "learning_rate": 7.139172413793104e-05, | |
| "loss": 1.651, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 59.82085732565579, | |
| "grad_norm": 10.552214622497559, | |
| "learning_rate": 7.121931034482759e-05, | |
| "loss": 1.6916, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 60.14075495841331, | |
| "grad_norm": 11.099422454833984, | |
| "learning_rate": 7.104689655172413e-05, | |
| "loss": 1.6716, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 60.46065259117083, | |
| "grad_norm": 13.663276672363281, | |
| "learning_rate": 7.08744827586207e-05, | |
| "loss": 1.6105, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 60.780550223928344, | |
| "grad_norm": 11.783934593200684, | |
| "learning_rate": 7.070206896551725e-05, | |
| "loss": 1.6399, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 61.10044785668586, | |
| "grad_norm": 12.881340026855469, | |
| "learning_rate": 7.053e-05, | |
| "loss": 1.6058, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 61.42034548944338, | |
| "grad_norm": 12.405476570129395, | |
| "learning_rate": 7.035758620689656e-05, | |
| "loss": 1.5703, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 61.740243122200894, | |
| "grad_norm": 11.660452842712402, | |
| "learning_rate": 7.018517241379311e-05, | |
| "loss": 1.6002, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 62.06014075495841, | |
| "grad_norm": 11.69723892211914, | |
| "learning_rate": 7.001275862068966e-05, | |
| "loss": 1.6186, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 62.38003838771593, | |
| "grad_norm": 16.210947036743164, | |
| "learning_rate": 6.984034482758621e-05, | |
| "loss": 1.5663, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 62.69993602047345, | |
| "grad_norm": 11.853803634643555, | |
| "learning_rate": 6.966827586206897e-05, | |
| "loss": 1.5744, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 63.01983365323097, | |
| "grad_norm": 10.565818786621094, | |
| "learning_rate": 6.949586206896553e-05, | |
| "loss": 1.5829, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 63.339731285988485, | |
| "grad_norm": 11.621013641357422, | |
| "learning_rate": 6.932344827586207e-05, | |
| "loss": 1.5213, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 63.659628918746, | |
| "grad_norm": 10.182308197021484, | |
| "learning_rate": 6.915103448275862e-05, | |
| "loss": 1.5291, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 63.97952655150352, | |
| "grad_norm": 14.434243202209473, | |
| "learning_rate": 6.897862068965517e-05, | |
| "loss": 1.5612, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 64.29942418426104, | |
| "grad_norm": 12.513864517211914, | |
| "learning_rate": 6.880655172413794e-05, | |
| "loss": 1.5041, | |
| "step": 100500 | |
| }, | |
| { | |
| "epoch": 64.61932181701856, | |
| "grad_norm": 13.189037322998047, | |
| "learning_rate": 6.863413793103448e-05, | |
| "loss": 1.5097, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 64.93921944977608, | |
| "grad_norm": 11.867232322692871, | |
| "learning_rate": 6.846172413793104e-05, | |
| "loss": 1.5401, | |
| "step": 101500 | |
| }, | |
| { | |
| "epoch": 65.25911708253359, | |
| "grad_norm": 13.00894832611084, | |
| "learning_rate": 6.828931034482758e-05, | |
| "loss": 1.4581, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 65.57901471529111, | |
| "grad_norm": 11.719345092773438, | |
| "learning_rate": 6.811689655172415e-05, | |
| "loss": 1.4807, | |
| "step": 102500 | |
| }, | |
| { | |
| "epoch": 65.89891234804863, | |
| "grad_norm": 11.355063438415527, | |
| "learning_rate": 6.79444827586207e-05, | |
| "loss": 1.4892, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 66.21880998080614, | |
| "grad_norm": 13.351948738098145, | |
| "learning_rate": 6.777241379310345e-05, | |
| "loss": 1.4636, | |
| "step": 103500 | |
| }, | |
| { | |
| "epoch": 66.53870761356366, | |
| "grad_norm": 15.406342506408691, | |
| "learning_rate": 6.76e-05, | |
| "loss": 1.4678, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 66.85860524632118, | |
| "grad_norm": 14.357329368591309, | |
| "learning_rate": 6.742758620689656e-05, | |
| "loss": 1.4936, | |
| "step": 104500 | |
| }, | |
| { | |
| "epoch": 67.17850287907869, | |
| "grad_norm": 12.275686264038086, | |
| "learning_rate": 6.725517241379311e-05, | |
| "loss": 1.4566, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 67.49840051183621, | |
| "grad_norm": 13.198380470275879, | |
| "learning_rate": 6.708310344827586e-05, | |
| "loss": 1.4326, | |
| "step": 105500 | |
| }, | |
| { | |
| "epoch": 67.81829814459373, | |
| "grad_norm": 13.365631103515625, | |
| "learning_rate": 6.691068965517242e-05, | |
| "loss": 1.426, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 68.13819577735124, | |
| "grad_norm": 14.106985092163086, | |
| "learning_rate": 6.673827586206897e-05, | |
| "loss": 1.4289, | |
| "step": 106500 | |
| }, | |
| { | |
| "epoch": 68.45809341010876, | |
| "grad_norm": 10.076281547546387, | |
| "learning_rate": 6.656586206896553e-05, | |
| "loss": 1.396, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 68.77799104286628, | |
| "grad_norm": 14.63807201385498, | |
| "learning_rate": 6.639344827586207e-05, | |
| "loss": 1.3979, | |
| "step": 107500 | |
| }, | |
| { | |
| "epoch": 69.0978886756238, | |
| "grad_norm": 13.643959045410156, | |
| "learning_rate": 6.622137931034483e-05, | |
| "loss": 1.4274, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 69.41778630838132, | |
| "grad_norm": 11.819470405578613, | |
| "learning_rate": 6.604896551724138e-05, | |
| "loss": 1.3769, | |
| "step": 108500 | |
| }, | |
| { | |
| "epoch": 69.73768394113884, | |
| "grad_norm": 14.33261775970459, | |
| "learning_rate": 6.587655172413794e-05, | |
| "loss": 1.3825, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 70.05758157389636, | |
| "grad_norm": 10.918536186218262, | |
| "learning_rate": 6.570413793103448e-05, | |
| "loss": 1.3913, | |
| "step": 109500 | |
| }, | |
| { | |
| "epoch": 70.37747920665387, | |
| "grad_norm": 13.519926071166992, | |
| "learning_rate": 6.553172413793104e-05, | |
| "loss": 1.3341, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 70.69737683941139, | |
| "grad_norm": 12.5425386428833, | |
| "learning_rate": 6.535931034482759e-05, | |
| "loss": 1.3828, | |
| "step": 110500 | |
| }, | |
| { | |
| "epoch": 71.0172744721689, | |
| "grad_norm": 11.435805320739746, | |
| "learning_rate": 6.518724137931035e-05, | |
| "loss": 1.3821, | |
| "step": 111000 | |
| }, | |
| { | |
| "epoch": 71.33717210492642, | |
| "grad_norm": 12.65505313873291, | |
| "learning_rate": 6.501482758620689e-05, | |
| "loss": 1.3083, | |
| "step": 111500 | |
| }, | |
| { | |
| "epoch": 71.65706973768394, | |
| "grad_norm": 15.489115715026855, | |
| "learning_rate": 6.484241379310345e-05, | |
| "loss": 1.341, | |
| "step": 112000 | |
| }, | |
| { | |
| "epoch": 71.97696737044146, | |
| "grad_norm": 14.14395809173584, | |
| "learning_rate": 6.467e-05, | |
| "loss": 1.3579, | |
| "step": 112500 | |
| }, | |
| { | |
| "epoch": 72.29686500319897, | |
| "grad_norm": 13.708014488220215, | |
| "learning_rate": 6.449758620689656e-05, | |
| "loss": 1.3032, | |
| "step": 113000 | |
| }, | |
| { | |
| "epoch": 72.61676263595649, | |
| "grad_norm": 10.75635814666748, | |
| "learning_rate": 6.432551724137932e-05, | |
| "loss": 1.3045, | |
| "step": 113500 | |
| }, | |
| { | |
| "epoch": 72.93666026871401, | |
| "grad_norm": 12.12192440032959, | |
| "learning_rate": 6.415310344827586e-05, | |
| "loss": 1.3248, | |
| "step": 114000 | |
| }, | |
| { | |
| "epoch": 73.25655790147152, | |
| "grad_norm": 13.368456840515137, | |
| "learning_rate": 6.398068965517241e-05, | |
| "loss": 1.287, | |
| "step": 114500 | |
| }, | |
| { | |
| "epoch": 73.57645553422904, | |
| "grad_norm": 12.584633827209473, | |
| "learning_rate": 6.380827586206897e-05, | |
| "loss": 1.3015, | |
| "step": 115000 | |
| }, | |
| { | |
| "epoch": 73.89635316698656, | |
| "grad_norm": 13.863194465637207, | |
| "learning_rate": 6.363620689655173e-05, | |
| "loss": 1.2898, | |
| "step": 115500 | |
| }, | |
| { | |
| "epoch": 74.21625079974409, | |
| "grad_norm": 12.937112808227539, | |
| "learning_rate": 6.346379310344827e-05, | |
| "loss": 1.2456, | |
| "step": 116000 | |
| }, | |
| { | |
| "epoch": 74.5361484325016, | |
| "grad_norm": 11.274981498718262, | |
| "learning_rate": 6.329137931034484e-05, | |
| "loss": 1.2689, | |
| "step": 116500 | |
| }, | |
| { | |
| "epoch": 74.85604606525912, | |
| "grad_norm": 14.425061225891113, | |
| "learning_rate": 6.311896551724138e-05, | |
| "loss": 1.2752, | |
| "step": 117000 | |
| }, | |
| { | |
| "epoch": 75.17594369801664, | |
| "grad_norm": 11.654635429382324, | |
| "learning_rate": 6.294655172413794e-05, | |
| "loss": 1.2442, | |
| "step": 117500 | |
| }, | |
| { | |
| "epoch": 75.49584133077416, | |
| "grad_norm": 10.00129222869873, | |
| "learning_rate": 6.277413793103448e-05, | |
| "loss": 1.2506, | |
| "step": 118000 | |
| }, | |
| { | |
| "epoch": 75.81573896353167, | |
| "grad_norm": 11.665295600891113, | |
| "learning_rate": 6.260206896551725e-05, | |
| "loss": 1.2541, | |
| "step": 118500 | |
| }, | |
| { | |
| "epoch": 76.13563659628919, | |
| "grad_norm": 10.555766105651855, | |
| "learning_rate": 6.24296551724138e-05, | |
| "loss": 1.2486, | |
| "step": 119000 | |
| }, | |
| { | |
| "epoch": 76.4555342290467, | |
| "grad_norm": 14.879280090332031, | |
| "learning_rate": 6.225724137931035e-05, | |
| "loss": 1.2124, | |
| "step": 119500 | |
| }, | |
| { | |
| "epoch": 76.77543186180422, | |
| "grad_norm": 15.131136894226074, | |
| "learning_rate": 6.208482758620689e-05, | |
| "loss": 1.2549, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 77.09532949456174, | |
| "grad_norm": 9.889472961425781, | |
| "learning_rate": 6.191241379310345e-05, | |
| "loss": 1.2376, | |
| "step": 120500 | |
| }, | |
| { | |
| "epoch": 77.41522712731926, | |
| "grad_norm": 11.307145118713379, | |
| "learning_rate": 6.174e-05, | |
| "loss": 1.1958, | |
| "step": 121000 | |
| }, | |
| { | |
| "epoch": 77.73512476007677, | |
| "grad_norm": 14.303799629211426, | |
| "learning_rate": 6.156758620689656e-05, | |
| "loss": 1.2009, | |
| "step": 121500 | |
| }, | |
| { | |
| "epoch": 78.05502239283429, | |
| "grad_norm": 11.318217277526855, | |
| "learning_rate": 6.139517241379311e-05, | |
| "loss": 1.2215, | |
| "step": 122000 | |
| }, | |
| { | |
| "epoch": 78.3749200255918, | |
| "grad_norm": 13.979291915893555, | |
| "learning_rate": 6.122310344827586e-05, | |
| "loss": 1.1713, | |
| "step": 122500 | |
| }, | |
| { | |
| "epoch": 78.69481765834932, | |
| "grad_norm": 12.78084945678711, | |
| "learning_rate": 6.105103448275863e-05, | |
| "loss": 1.1901, | |
| "step": 123000 | |
| }, | |
| { | |
| "epoch": 79.01471529110684, | |
| "grad_norm": 10.332459449768066, | |
| "learning_rate": 6.087862068965517e-05, | |
| "loss": 1.2141, | |
| "step": 123500 | |
| }, | |
| { | |
| "epoch": 79.33461292386437, | |
| "grad_norm": 11.179670333862305, | |
| "learning_rate": 6.0706206896551735e-05, | |
| "loss": 1.1641, | |
| "step": 124000 | |
| }, | |
| { | |
| "epoch": 79.65451055662189, | |
| "grad_norm": 12.706995964050293, | |
| "learning_rate": 6.053379310344828e-05, | |
| "loss": 1.1887, | |
| "step": 124500 | |
| }, | |
| { | |
| "epoch": 79.9744081893794, | |
| "grad_norm": 12.575511932373047, | |
| "learning_rate": 6.036137931034483e-05, | |
| "loss": 1.1994, | |
| "step": 125000 | |
| }, | |
| { | |
| "epoch": 80.29430582213692, | |
| "grad_norm": 11.299592971801758, | |
| "learning_rate": 6.0189310344827584e-05, | |
| "loss": 1.1535, | |
| "step": 125500 | |
| }, | |
| { | |
| "epoch": 80.61420345489444, | |
| "grad_norm": 9.741961479187012, | |
| "learning_rate": 6.0016896551724147e-05, | |
| "loss": 1.1508, | |
| "step": 126000 | |
| }, | |
| { | |
| "epoch": 80.93410108765195, | |
| "grad_norm": 13.86517333984375, | |
| "learning_rate": 5.984448275862069e-05, | |
| "loss": 1.1453, | |
| "step": 126500 | |
| }, | |
| { | |
| "epoch": 81.25399872040947, | |
| "grad_norm": 12.810471534729004, | |
| "learning_rate": 5.9672068965517244e-05, | |
| "loss": 1.131, | |
| "step": 127000 | |
| }, | |
| { | |
| "epoch": 81.57389635316699, | |
| "grad_norm": 11.828211784362793, | |
| "learning_rate": 5.949965517241379e-05, | |
| "loss": 1.1462, | |
| "step": 127500 | |
| }, | |
| { | |
| "epoch": 81.8937939859245, | |
| "grad_norm": 13.588178634643555, | |
| "learning_rate": 5.932724137931035e-05, | |
| "loss": 1.1519, | |
| "step": 128000 | |
| }, | |
| { | |
| "epoch": 82.21369161868202, | |
| "grad_norm": 13.903426170349121, | |
| "learning_rate": 5.91548275862069e-05, | |
| "loss": 1.1222, | |
| "step": 128500 | |
| }, | |
| { | |
| "epoch": 82.53358925143954, | |
| "grad_norm": 13.447443962097168, | |
| "learning_rate": 5.898275862068966e-05, | |
| "loss": 1.1173, | |
| "step": 129000 | |
| }, | |
| { | |
| "epoch": 82.85348688419705, | |
| "grad_norm": 12.132195472717285, | |
| "learning_rate": 5.8810344827586205e-05, | |
| "loss": 1.1262, | |
| "step": 129500 | |
| }, | |
| { | |
| "epoch": 83.17338451695457, | |
| "grad_norm": 11.170686721801758, | |
| "learning_rate": 5.863793103448276e-05, | |
| "loss": 1.0957, | |
| "step": 130000 | |
| }, | |
| { | |
| "epoch": 83.49328214971209, | |
| "grad_norm": 12.57539176940918, | |
| "learning_rate": 5.846551724137931e-05, | |
| "loss": 1.0862, | |
| "step": 130500 | |
| }, | |
| { | |
| "epoch": 83.8131797824696, | |
| "grad_norm": 14.212547302246094, | |
| "learning_rate": 5.8293448275862074e-05, | |
| "loss": 1.0929, | |
| "step": 131000 | |
| }, | |
| { | |
| "epoch": 84.13307741522712, | |
| "grad_norm": 14.803600311279297, | |
| "learning_rate": 5.8121034482758616e-05, | |
| "loss": 1.0948, | |
| "step": 131500 | |
| }, | |
| { | |
| "epoch": 84.45297504798465, | |
| "grad_norm": 19.55899429321289, | |
| "learning_rate": 5.794862068965518e-05, | |
| "loss": 1.0788, | |
| "step": 132000 | |
| }, | |
| { | |
| "epoch": 84.77287268074217, | |
| "grad_norm": 11.086203575134277, | |
| "learning_rate": 5.7776206896551734e-05, | |
| "loss": 1.098, | |
| "step": 132500 | |
| }, | |
| { | |
| "epoch": 85.09277031349968, | |
| "grad_norm": 10.74999713897705, | |
| "learning_rate": 5.7603793103448276e-05, | |
| "loss": 1.0649, | |
| "step": 133000 | |
| }, | |
| { | |
| "epoch": 85.4126679462572, | |
| "grad_norm": 14.409449577331543, | |
| "learning_rate": 5.743137931034484e-05, | |
| "loss": 1.0592, | |
| "step": 133500 | |
| }, | |
| { | |
| "epoch": 85.73256557901472, | |
| "grad_norm": 10.215742111206055, | |
| "learning_rate": 5.725931034482759e-05, | |
| "loss": 1.0765, | |
| "step": 134000 | |
| }, | |
| { | |
| "epoch": 86.05246321177223, | |
| "grad_norm": 12.911944389343262, | |
| "learning_rate": 5.7086896551724146e-05, | |
| "loss": 1.0504, | |
| "step": 134500 | |
| }, | |
| { | |
| "epoch": 86.37236084452975, | |
| "grad_norm": 14.987035751342773, | |
| "learning_rate": 5.691448275862069e-05, | |
| "loss": 1.0141, | |
| "step": 135000 | |
| }, | |
| { | |
| "epoch": 86.69225847728727, | |
| "grad_norm": 11.989995002746582, | |
| "learning_rate": 5.674206896551725e-05, | |
| "loss": 1.0431, | |
| "step": 135500 | |
| }, | |
| { | |
| "epoch": 87.01215611004478, | |
| "grad_norm": 12.771849632263184, | |
| "learning_rate": 5.657e-05, | |
| "loss": 1.054, | |
| "step": 136000 | |
| }, | |
| { | |
| "epoch": 87.3320537428023, | |
| "grad_norm": 13.398333549499512, | |
| "learning_rate": 5.639758620689656e-05, | |
| "loss": 0.9984, | |
| "step": 136500 | |
| }, | |
| { | |
| "epoch": 87.65195137555982, | |
| "grad_norm": 10.814030647277832, | |
| "learning_rate": 5.6225172413793106e-05, | |
| "loss": 1.0283, | |
| "step": 137000 | |
| }, | |
| { | |
| "epoch": 87.97184900831734, | |
| "grad_norm": 12.13095760345459, | |
| "learning_rate": 5.605275862068966e-05, | |
| "loss": 1.0414, | |
| "step": 137500 | |
| }, | |
| { | |
| "epoch": 88.29174664107485, | |
| "grad_norm": 12.733049392700195, | |
| "learning_rate": 5.5880344827586204e-05, | |
| "loss": 1.0087, | |
| "step": 138000 | |
| }, | |
| { | |
| "epoch": 88.61164427383237, | |
| "grad_norm": 16.555213928222656, | |
| "learning_rate": 5.570827586206897e-05, | |
| "loss": 1.0062, | |
| "step": 138500 | |
| }, | |
| { | |
| "epoch": 88.93154190658989, | |
| "grad_norm": 11.025595664978027, | |
| "learning_rate": 5.553586206896552e-05, | |
| "loss": 1.0212, | |
| "step": 139000 | |
| }, | |
| { | |
| "epoch": 89.2514395393474, | |
| "grad_norm": 9.93308162689209, | |
| "learning_rate": 5.5363448275862074e-05, | |
| "loss": 0.998, | |
| "step": 139500 | |
| }, | |
| { | |
| "epoch": 89.57133717210493, | |
| "grad_norm": 14.131500244140625, | |
| "learning_rate": 5.519103448275862e-05, | |
| "loss": 0.9797, | |
| "step": 140000 | |
| }, | |
| { | |
| "epoch": 89.89123480486245, | |
| "grad_norm": 13.041298866271973, | |
| "learning_rate": 5.501862068965518e-05, | |
| "loss": 1.0205, | |
| "step": 140500 | |
| }, | |
| { | |
| "epoch": 90.21113243761997, | |
| "grad_norm": 10.885424613952637, | |
| "learning_rate": 5.484655172413793e-05, | |
| "loss": 0.9763, | |
| "step": 141000 | |
| }, | |
| { | |
| "epoch": 90.53103007037748, | |
| "grad_norm": 17.75884437561035, | |
| "learning_rate": 5.4674137931034485e-05, | |
| "loss": 0.9791, | |
| "step": 141500 | |
| }, | |
| { | |
| "epoch": 90.850927703135, | |
| "grad_norm": 15.903059005737305, | |
| "learning_rate": 5.4501724137931034e-05, | |
| "loss": 0.9823, | |
| "step": 142000 | |
| }, | |
| { | |
| "epoch": 91.17082533589252, | |
| "grad_norm": 12.409110069274902, | |
| "learning_rate": 5.432931034482759e-05, | |
| "loss": 0.9732, | |
| "step": 142500 | |
| }, | |
| { | |
| "epoch": 91.49072296865003, | |
| "grad_norm": 14.427364349365234, | |
| "learning_rate": 5.4157241379310355e-05, | |
| "loss": 0.9594, | |
| "step": 143000 | |
| }, | |
| { | |
| "epoch": 91.81062060140755, | |
| "grad_norm": 10.96267032623291, | |
| "learning_rate": 5.39848275862069e-05, | |
| "loss": 0.9852, | |
| "step": 143500 | |
| }, | |
| { | |
| "epoch": 92.13051823416507, | |
| "grad_norm": 9.344204902648926, | |
| "learning_rate": 5.381241379310345e-05, | |
| "loss": 0.9776, | |
| "step": 144000 | |
| }, | |
| { | |
| "epoch": 92.45041586692258, | |
| "grad_norm": 13.800095558166504, | |
| "learning_rate": 5.364e-05, | |
| "loss": 0.9477, | |
| "step": 144500 | |
| }, | |
| { | |
| "epoch": 92.7703134996801, | |
| "grad_norm": 14.4652099609375, | |
| "learning_rate": 5.346758620689656e-05, | |
| "loss": 0.9521, | |
| "step": 145000 | |
| }, | |
| { | |
| "epoch": 93.09021113243762, | |
| "grad_norm": 10.197824478149414, | |
| "learning_rate": 5.3295172413793106e-05, | |
| "loss": 0.9508, | |
| "step": 145500 | |
| }, | |
| { | |
| "epoch": 93.41010876519513, | |
| "grad_norm": 14.248830795288086, | |
| "learning_rate": 5.312310344827587e-05, | |
| "loss": 0.9298, | |
| "step": 146000 | |
| }, | |
| { | |
| "epoch": 93.73000639795265, | |
| "grad_norm": 15.136180877685547, | |
| "learning_rate": 5.295068965517241e-05, | |
| "loss": 0.9363, | |
| "step": 146500 | |
| }, | |
| { | |
| "epoch": 94.04990403071017, | |
| "grad_norm": 14.999555587768555, | |
| "learning_rate": 5.277862068965518e-05, | |
| "loss": 0.9574, | |
| "step": 147000 | |
| }, | |
| { | |
| "epoch": 94.36980166346768, | |
| "grad_norm": 10.511527061462402, | |
| "learning_rate": 5.260620689655172e-05, | |
| "loss": 0.9429, | |
| "step": 147500 | |
| }, | |
| { | |
| "epoch": 94.68969929622521, | |
| "grad_norm": 12.433847427368164, | |
| "learning_rate": 5.243379310344828e-05, | |
| "loss": 0.9327, | |
| "step": 148000 | |
| }, | |
| { | |
| "epoch": 95.00959692898273, | |
| "grad_norm": 11.800546646118164, | |
| "learning_rate": 5.2261379310344825e-05, | |
| "loss": 0.9363, | |
| "step": 148500 | |
| }, | |
| { | |
| "epoch": 95.32949456174025, | |
| "grad_norm": 11.03012466430664, | |
| "learning_rate": 5.208896551724138e-05, | |
| "loss": 0.9172, | |
| "step": 149000 | |
| }, | |
| { | |
| "epoch": 95.64939219449776, | |
| "grad_norm": 13.628169059753418, | |
| "learning_rate": 5.191655172413793e-05, | |
| "loss": 0.9101, | |
| "step": 149500 | |
| }, | |
| { | |
| "epoch": 95.96928982725528, | |
| "grad_norm": 11.726004600524902, | |
| "learning_rate": 5.1744137931034485e-05, | |
| "loss": 0.9292, | |
| "step": 150000 | |
| }, | |
| { | |
| "epoch": 96.2891874600128, | |
| "grad_norm": 9.179962158203125, | |
| "learning_rate": 5.1571724137931033e-05, | |
| "loss": 0.8977, | |
| "step": 150500 | |
| }, | |
| { | |
| "epoch": 96.60908509277031, | |
| "grad_norm": 11.146485328674316, | |
| "learning_rate": 5.139931034482759e-05, | |
| "loss": 0.9037, | |
| "step": 151000 | |
| }, | |
| { | |
| "epoch": 96.92898272552783, | |
| "grad_norm": 14.070140838623047, | |
| "learning_rate": 5.122724137931034e-05, | |
| "loss": 0.9075, | |
| "step": 151500 | |
| }, | |
| { | |
| "epoch": 97.24888035828535, | |
| "grad_norm": 12.702670097351074, | |
| "learning_rate": 5.1054827586206897e-05, | |
| "loss": 0.8721, | |
| "step": 152000 | |
| }, | |
| { | |
| "epoch": 97.56877799104286, | |
| "grad_norm": 11.813859939575195, | |
| "learning_rate": 5.088241379310346e-05, | |
| "loss": 0.8876, | |
| "step": 152500 | |
| }, | |
| { | |
| "epoch": 97.88867562380038, | |
| "grad_norm": 14.402729034423828, | |
| "learning_rate": 5.071e-05, | |
| "loss": 0.8986, | |
| "step": 153000 | |
| }, | |
| { | |
| "epoch": 98.2085732565579, | |
| "grad_norm": 11.126707077026367, | |
| "learning_rate": 5.0537586206896556e-05, | |
| "loss": 0.8569, | |
| "step": 153500 | |
| }, | |
| { | |
| "epoch": 98.52847088931541, | |
| "grad_norm": 13.64499282836914, | |
| "learning_rate": 5.0365172413793105e-05, | |
| "loss": 0.8774, | |
| "step": 154000 | |
| }, | |
| { | |
| "epoch": 98.84836852207293, | |
| "grad_norm": 13.022969245910645, | |
| "learning_rate": 5.019275862068966e-05, | |
| "loss": 0.8801, | |
| "step": 154500 | |
| }, | |
| { | |
| "epoch": 99.16826615483045, | |
| "grad_norm": 12.945636749267578, | |
| "learning_rate": 5.002034482758621e-05, | |
| "loss": 0.8597, | |
| "step": 155000 | |
| }, | |
| { | |
| "epoch": 99.48816378758798, | |
| "grad_norm": 12.05784797668457, | |
| "learning_rate": 4.984827586206897e-05, | |
| "loss": 0.8477, | |
| "step": 155500 | |
| }, | |
| { | |
| "epoch": 99.8080614203455, | |
| "grad_norm": 11.149604797363281, | |
| "learning_rate": 4.967586206896552e-05, | |
| "loss": 0.8533, | |
| "step": 156000 | |
| }, | |
| { | |
| "epoch": 100.12795905310301, | |
| "grad_norm": 12.993003845214844, | |
| "learning_rate": 4.950344827586207e-05, | |
| "loss": 0.8722, | |
| "step": 156500 | |
| }, | |
| { | |
| "epoch": 100.44785668586053, | |
| "grad_norm": 12.33105754852295, | |
| "learning_rate": 4.933103448275863e-05, | |
| "loss": 0.8397, | |
| "step": 157000 | |
| }, | |
| { | |
| "epoch": 100.76775431861805, | |
| "grad_norm": 12.321619987487793, | |
| "learning_rate": 4.9158965517241387e-05, | |
| "loss": 0.8548, | |
| "step": 157500 | |
| }, | |
| { | |
| "epoch": 101.08765195137556, | |
| "grad_norm": 13.47990894317627, | |
| "learning_rate": 4.8986896551724145e-05, | |
| "loss": 0.8493, | |
| "step": 158000 | |
| }, | |
| { | |
| "epoch": 101.40754958413308, | |
| "grad_norm": 10.382761001586914, | |
| "learning_rate": 4.8814482758620694e-05, | |
| "loss": 0.8434, | |
| "step": 158500 | |
| }, | |
| { | |
| "epoch": 101.7274472168906, | |
| "grad_norm": 8.643112182617188, | |
| "learning_rate": 4.864206896551724e-05, | |
| "loss": 0.8408, | |
| "step": 159000 | |
| }, | |
| { | |
| "epoch": 102.04734484964811, | |
| "grad_norm": 10.81409740447998, | |
| "learning_rate": 4.84696551724138e-05, | |
| "loss": 0.8415, | |
| "step": 159500 | |
| }, | |
| { | |
| "epoch": 102.36724248240563, | |
| "grad_norm": 11.696605682373047, | |
| "learning_rate": 4.829758620689656e-05, | |
| "loss": 0.8305, | |
| "step": 160000 | |
| }, | |
| { | |
| "epoch": 102.68714011516315, | |
| "grad_norm": 13.345202445983887, | |
| "learning_rate": 4.8125172413793106e-05, | |
| "loss": 0.8271, | |
| "step": 160500 | |
| }, | |
| { | |
| "epoch": 103.00703774792066, | |
| "grad_norm": 11.675226211547852, | |
| "learning_rate": 4.795275862068966e-05, | |
| "loss": 0.8491, | |
| "step": 161000 | |
| }, | |
| { | |
| "epoch": 103.32693538067818, | |
| "grad_norm": 12.083547592163086, | |
| "learning_rate": 4.778034482758621e-05, | |
| "loss": 0.8052, | |
| "step": 161500 | |
| }, | |
| { | |
| "epoch": 103.6468330134357, | |
| "grad_norm": 9.721264839172363, | |
| "learning_rate": 4.760793103448276e-05, | |
| "loss": 0.8128, | |
| "step": 162000 | |
| }, | |
| { | |
| "epoch": 103.96673064619321, | |
| "grad_norm": 13.526360511779785, | |
| "learning_rate": 4.743586206896552e-05, | |
| "loss": 0.8225, | |
| "step": 162500 | |
| }, | |
| { | |
| "epoch": 104.28662827895073, | |
| "grad_norm": 14.503246307373047, | |
| "learning_rate": 4.726344827586207e-05, | |
| "loss": 0.8014, | |
| "step": 163000 | |
| }, | |
| { | |
| "epoch": 104.60652591170825, | |
| "grad_norm": 12.239891052246094, | |
| "learning_rate": 4.709103448275862e-05, | |
| "loss": 0.8032, | |
| "step": 163500 | |
| }, | |
| { | |
| "epoch": 104.92642354446578, | |
| "grad_norm": 12.229057312011719, | |
| "learning_rate": 4.691862068965517e-05, | |
| "loss": 0.8073, | |
| "step": 164000 | |
| }, | |
| { | |
| "epoch": 105.2463211772233, | |
| "grad_norm": 11.960144996643066, | |
| "learning_rate": 4.6746206896551726e-05, | |
| "loss": 0.7956, | |
| "step": 164500 | |
| }, | |
| { | |
| "epoch": 105.56621880998081, | |
| "grad_norm": 13.662198066711426, | |
| "learning_rate": 4.6573793103448275e-05, | |
| "loss": 0.7875, | |
| "step": 165000 | |
| }, | |
| { | |
| "epoch": 105.88611644273833, | |
| "grad_norm": 13.428373336791992, | |
| "learning_rate": 4.6401724137931034e-05, | |
| "loss": 0.7967, | |
| "step": 165500 | |
| }, | |
| { | |
| "epoch": 106.20601407549584, | |
| "grad_norm": 13.212657928466797, | |
| "learning_rate": 4.622931034482759e-05, | |
| "loss": 0.7734, | |
| "step": 166000 | |
| }, | |
| { | |
| "epoch": 106.52591170825336, | |
| "grad_norm": 13.421162605285645, | |
| "learning_rate": 4.605689655172414e-05, | |
| "loss": 0.7691, | |
| "step": 166500 | |
| }, | |
| { | |
| "epoch": 106.84580934101088, | |
| "grad_norm": 10.209512710571289, | |
| "learning_rate": 4.588448275862069e-05, | |
| "loss": 0.7952, | |
| "step": 167000 | |
| }, | |
| { | |
| "epoch": 107.1657069737684, | |
| "grad_norm": 10.482810020446777, | |
| "learning_rate": 4.571206896551725e-05, | |
| "loss": 0.7837, | |
| "step": 167500 | |
| }, | |
| { | |
| "epoch": 107.48560460652591, | |
| "grad_norm": 13.598471641540527, | |
| "learning_rate": 4.55396551724138e-05, | |
| "loss": 0.778, | |
| "step": 168000 | |
| }, | |
| { | |
| "epoch": 107.80550223928343, | |
| "grad_norm": 12.402639389038086, | |
| "learning_rate": 4.5367241379310346e-05, | |
| "loss": 0.7711, | |
| "step": 168500 | |
| }, | |
| { | |
| "epoch": 108.12539987204094, | |
| "grad_norm": 12.243593215942383, | |
| "learning_rate": 4.51948275862069e-05, | |
| "loss": 0.773, | |
| "step": 169000 | |
| }, | |
| { | |
| "epoch": 108.44529750479846, | |
| "grad_norm": 10.736000061035156, | |
| "learning_rate": 4.502275862068966e-05, | |
| "loss": 0.7477, | |
| "step": 169500 | |
| }, | |
| { | |
| "epoch": 108.76519513755598, | |
| "grad_norm": 11.13589096069336, | |
| "learning_rate": 4.485034482758621e-05, | |
| "loss": 0.7669, | |
| "step": 170000 | |
| }, | |
| { | |
| "epoch": 109.0850927703135, | |
| "grad_norm": 10.14847183227539, | |
| "learning_rate": 4.4677931034482765e-05, | |
| "loss": 0.7618, | |
| "step": 170500 | |
| }, | |
| { | |
| "epoch": 109.40499040307101, | |
| "grad_norm": 10.636765480041504, | |
| "learning_rate": 4.4505517241379314e-05, | |
| "loss": 0.7362, | |
| "step": 171000 | |
| }, | |
| { | |
| "epoch": 109.72488803582854, | |
| "grad_norm": 12.350906372070312, | |
| "learning_rate": 4.433344827586207e-05, | |
| "loss": 0.7578, | |
| "step": 171500 | |
| }, | |
| { | |
| "epoch": 110.04478566858606, | |
| "grad_norm": 13.237043380737305, | |
| "learning_rate": 4.416103448275862e-05, | |
| "loss": 0.7662, | |
| "step": 172000 | |
| }, | |
| { | |
| "epoch": 110.36468330134358, | |
| "grad_norm": 8.747899055480957, | |
| "learning_rate": 4.398862068965518e-05, | |
| "loss": 0.7306, | |
| "step": 172500 | |
| }, | |
| { | |
| "epoch": 110.68458093410109, | |
| "grad_norm": 11.915460586547852, | |
| "learning_rate": 4.3816206896551725e-05, | |
| "loss": 0.7604, | |
| "step": 173000 | |
| }, | |
| { | |
| "epoch": 111.00447856685861, | |
| "grad_norm": 10.675039291381836, | |
| "learning_rate": 4.3643793103448274e-05, | |
| "loss": 0.7348, | |
| "step": 173500 | |
| }, | |
| { | |
| "epoch": 111.32437619961613, | |
| "grad_norm": 11.528447151184082, | |
| "learning_rate": 4.347172413793103e-05, | |
| "loss": 0.7297, | |
| "step": 174000 | |
| }, | |
| { | |
| "epoch": 111.64427383237364, | |
| "grad_norm": 11.974442481994629, | |
| "learning_rate": 4.329931034482759e-05, | |
| "loss": 0.7398, | |
| "step": 174500 | |
| }, | |
| { | |
| "epoch": 111.96417146513116, | |
| "grad_norm": 10.059257507324219, | |
| "learning_rate": 4.312689655172414e-05, | |
| "loss": 0.7337, | |
| "step": 175000 | |
| }, | |
| { | |
| "epoch": 112.28406909788868, | |
| "grad_norm": 11.215494155883789, | |
| "learning_rate": 4.295448275862069e-05, | |
| "loss": 0.7154, | |
| "step": 175500 | |
| }, | |
| { | |
| "epoch": 112.60396673064619, | |
| "grad_norm": 11.685689926147461, | |
| "learning_rate": 4.278241379310345e-05, | |
| "loss": 0.7268, | |
| "step": 176000 | |
| }, | |
| { | |
| "epoch": 112.92386436340371, | |
| "grad_norm": 12.056086540222168, | |
| "learning_rate": 4.261e-05, | |
| "loss": 0.7214, | |
| "step": 176500 | |
| }, | |
| { | |
| "epoch": 113.24376199616123, | |
| "grad_norm": 10.17962646484375, | |
| "learning_rate": 4.243758620689655e-05, | |
| "loss": 0.7309, | |
| "step": 177000 | |
| }, | |
| { | |
| "epoch": 113.56365962891874, | |
| "grad_norm": 13.576325416564941, | |
| "learning_rate": 4.226517241379311e-05, | |
| "loss": 0.7047, | |
| "step": 177500 | |
| }, | |
| { | |
| "epoch": 113.88355726167626, | |
| "grad_norm": 10.385096549987793, | |
| "learning_rate": 4.209275862068966e-05, | |
| "loss": 0.7298, | |
| "step": 178000 | |
| }, | |
| { | |
| "epoch": 114.20345489443378, | |
| "grad_norm": 10.976679801940918, | |
| "learning_rate": 4.192068965517242e-05, | |
| "loss": 0.7079, | |
| "step": 178500 | |
| }, | |
| { | |
| "epoch": 114.52335252719129, | |
| "grad_norm": 14.960927963256836, | |
| "learning_rate": 4.174827586206897e-05, | |
| "loss": 0.7004, | |
| "step": 179000 | |
| }, | |
| { | |
| "epoch": 114.84325015994882, | |
| "grad_norm": 11.473701477050781, | |
| "learning_rate": 4.157586206896552e-05, | |
| "loss": 0.7137, | |
| "step": 179500 | |
| }, | |
| { | |
| "epoch": 115.16314779270634, | |
| "grad_norm": 11.255741119384766, | |
| "learning_rate": 4.140344827586207e-05, | |
| "loss": 0.6994, | |
| "step": 180000 | |
| }, | |
| { | |
| "epoch": 115.48304542546386, | |
| "grad_norm": 11.247090339660645, | |
| "learning_rate": 4.123103448275862e-05, | |
| "loss": 0.6822, | |
| "step": 180500 | |
| }, | |
| { | |
| "epoch": 115.80294305822137, | |
| "grad_norm": 10.883934020996094, | |
| "learning_rate": 4.1058620689655176e-05, | |
| "loss": 0.6902, | |
| "step": 181000 | |
| }, | |
| { | |
| "epoch": 116.12284069097889, | |
| "grad_norm": 12.544395446777344, | |
| "learning_rate": 4.0886551724137935e-05, | |
| "loss": 0.6853, | |
| "step": 181500 | |
| }, | |
| { | |
| "epoch": 116.4427383237364, | |
| "grad_norm": 10.791812896728516, | |
| "learning_rate": 4.0714137931034484e-05, | |
| "loss": 0.6785, | |
| "step": 182000 | |
| }, | |
| { | |
| "epoch": 116.76263595649392, | |
| "grad_norm": 13.567925453186035, | |
| "learning_rate": 4.054172413793104e-05, | |
| "loss": 0.6904, | |
| "step": 182500 | |
| }, | |
| { | |
| "epoch": 117.08253358925144, | |
| "grad_norm": 10.842440605163574, | |
| "learning_rate": 4.036931034482759e-05, | |
| "loss": 0.6851, | |
| "step": 183000 | |
| }, | |
| { | |
| "epoch": 117.40243122200896, | |
| "grad_norm": 13.358929634094238, | |
| "learning_rate": 4.0196896551724136e-05, | |
| "loss": 0.6629, | |
| "step": 183500 | |
| }, | |
| { | |
| "epoch": 117.72232885476647, | |
| "grad_norm": 12.40263557434082, | |
| "learning_rate": 4.002448275862069e-05, | |
| "loss": 0.666, | |
| "step": 184000 | |
| }, | |
| { | |
| "epoch": 118.04222648752399, | |
| "grad_norm": 12.171306610107422, | |
| "learning_rate": 3.985241379310345e-05, | |
| "loss": 0.6875, | |
| "step": 184500 | |
| }, | |
| { | |
| "epoch": 118.3621241202815, | |
| "grad_norm": 11.05837631225586, | |
| "learning_rate": 3.968e-05, | |
| "loss": 0.6598, | |
| "step": 185000 | |
| }, | |
| { | |
| "epoch": 118.68202175303902, | |
| "grad_norm": 13.27622127532959, | |
| "learning_rate": 3.9507586206896555e-05, | |
| "loss": 0.6735, | |
| "step": 185500 | |
| }, | |
| { | |
| "epoch": 119.00191938579654, | |
| "grad_norm": 10.379920959472656, | |
| "learning_rate": 3.9335172413793104e-05, | |
| "loss": 0.6849, | |
| "step": 186000 | |
| }, | |
| { | |
| "epoch": 119.32181701855406, | |
| "grad_norm": 13.972020149230957, | |
| "learning_rate": 3.916275862068965e-05, | |
| "loss": 0.6641, | |
| "step": 186500 | |
| }, | |
| { | |
| "epoch": 119.64171465131157, | |
| "grad_norm": 11.595196723937988, | |
| "learning_rate": 3.899068965517241e-05, | |
| "loss": 0.6577, | |
| "step": 187000 | |
| }, | |
| { | |
| "epoch": 119.9616122840691, | |
| "grad_norm": 11.496007919311523, | |
| "learning_rate": 3.881827586206897e-05, | |
| "loss": 0.6601, | |
| "step": 187500 | |
| }, | |
| { | |
| "epoch": 120.28150991682662, | |
| "grad_norm": 10.272443771362305, | |
| "learning_rate": 3.864586206896552e-05, | |
| "loss": 0.6365, | |
| "step": 188000 | |
| }, | |
| { | |
| "epoch": 120.60140754958414, | |
| "grad_norm": 11.38764762878418, | |
| "learning_rate": 3.847344827586207e-05, | |
| "loss": 0.6484, | |
| "step": 188500 | |
| }, | |
| { | |
| "epoch": 120.92130518234165, | |
| "grad_norm": 14.054584503173828, | |
| "learning_rate": 3.8301034482758627e-05, | |
| "loss": 0.6662, | |
| "step": 189000 | |
| }, | |
| { | |
| "epoch": 121.24120281509917, | |
| "grad_norm": 10.214823722839355, | |
| "learning_rate": 3.8128620689655175e-05, | |
| "loss": 0.6378, | |
| "step": 189500 | |
| }, | |
| { | |
| "epoch": 121.56110044785669, | |
| "grad_norm": 10.101186752319336, | |
| "learning_rate": 3.7956551724137934e-05, | |
| "loss": 0.6458, | |
| "step": 190000 | |
| }, | |
| { | |
| "epoch": 121.8809980806142, | |
| "grad_norm": 9.963375091552734, | |
| "learning_rate": 3.778413793103448e-05, | |
| "loss": 0.6518, | |
| "step": 190500 | |
| }, | |
| { | |
| "epoch": 122.20089571337172, | |
| "grad_norm": 12.217530250549316, | |
| "learning_rate": 3.761172413793104e-05, | |
| "loss": 0.6382, | |
| "step": 191000 | |
| }, | |
| { | |
| "epoch": 122.52079334612924, | |
| "grad_norm": 13.154520034790039, | |
| "learning_rate": 3.743931034482759e-05, | |
| "loss": 0.6272, | |
| "step": 191500 | |
| }, | |
| { | |
| "epoch": 122.84069097888676, | |
| "grad_norm": 12.079745292663574, | |
| "learning_rate": 3.7267241379310346e-05, | |
| "loss": 0.6496, | |
| "step": 192000 | |
| }, | |
| { | |
| "epoch": 123.16058861164427, | |
| "grad_norm": 9.851350784301758, | |
| "learning_rate": 3.70948275862069e-05, | |
| "loss": 0.6253, | |
| "step": 192500 | |
| }, | |
| { | |
| "epoch": 123.48048624440179, | |
| "grad_norm": 10.869081497192383, | |
| "learning_rate": 3.692241379310345e-05, | |
| "loss": 0.6307, | |
| "step": 193000 | |
| }, | |
| { | |
| "epoch": 123.8003838771593, | |
| "grad_norm": 16.297130584716797, | |
| "learning_rate": 3.675034482758621e-05, | |
| "loss": 0.6336, | |
| "step": 193500 | |
| }, | |
| { | |
| "epoch": 124.12028150991682, | |
| "grad_norm": 11.86780834197998, | |
| "learning_rate": 3.657793103448276e-05, | |
| "loss": 0.626, | |
| "step": 194000 | |
| }, | |
| { | |
| "epoch": 124.44017914267434, | |
| "grad_norm": 12.359445571899414, | |
| "learning_rate": 3.640551724137931e-05, | |
| "loss": 0.6068, | |
| "step": 194500 | |
| }, | |
| { | |
| "epoch": 124.76007677543186, | |
| "grad_norm": 10.589780807495117, | |
| "learning_rate": 3.623310344827586e-05, | |
| "loss": 0.6388, | |
| "step": 195000 | |
| }, | |
| { | |
| "epoch": 125.07997440818939, | |
| "grad_norm": 11.763408660888672, | |
| "learning_rate": 3.606068965517241e-05, | |
| "loss": 0.6132, | |
| "step": 195500 | |
| }, | |
| { | |
| "epoch": 125.3998720409469, | |
| "grad_norm": 11.668269157409668, | |
| "learning_rate": 3.5888275862068966e-05, | |
| "loss": 0.602, | |
| "step": 196000 | |
| }, | |
| { | |
| "epoch": 125.71976967370442, | |
| "grad_norm": 11.628352165222168, | |
| "learning_rate": 3.5715862068965515e-05, | |
| "loss": 0.6196, | |
| "step": 196500 | |
| }, | |
| { | |
| "epoch": 126.03966730646194, | |
| "grad_norm": 11.364988327026367, | |
| "learning_rate": 3.554344827586207e-05, | |
| "loss": 0.6197, | |
| "step": 197000 | |
| }, | |
| { | |
| "epoch": 126.35956493921945, | |
| "grad_norm": 10.588140487670898, | |
| "learning_rate": 3.5371034482758626e-05, | |
| "loss": 0.6017, | |
| "step": 197500 | |
| }, | |
| { | |
| "epoch": 126.67946257197697, | |
| "grad_norm": 13.472407341003418, | |
| "learning_rate": 3.5198965517241385e-05, | |
| "loss": 0.6145, | |
| "step": 198000 | |
| }, | |
| { | |
| "epoch": 126.99936020473449, | |
| "grad_norm": 10.752822875976562, | |
| "learning_rate": 3.502655172413793e-05, | |
| "loss": 0.6211, | |
| "step": 198500 | |
| }, | |
| { | |
| "epoch": 127.319257837492, | |
| "grad_norm": 11.411170959472656, | |
| "learning_rate": 3.485413793103449e-05, | |
| "loss": 0.5919, | |
| "step": 199000 | |
| }, | |
| { | |
| "epoch": 127.63915547024952, | |
| "grad_norm": 12.288487434387207, | |
| "learning_rate": 3.468172413793104e-05, | |
| "loss": 0.6032, | |
| "step": 199500 | |
| }, | |
| { | |
| "epoch": 127.95905310300704, | |
| "grad_norm": 8.647520065307617, | |
| "learning_rate": 3.4509310344827586e-05, | |
| "loss": 0.6228, | |
| "step": 200000 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 300000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 192, | |
| "save_steps": 20000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.2457759656933786e+17, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |