| { |
| "best_global_step": 33000, |
| "best_metric": 0.6409004926681519, |
| "best_model_checkpoint": "/home/ubuntu/deepseek-math-b200-resumed/checkpoint-33000", |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 63609, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 4.7163276314897404e-05, |
| "grad_norm": 2.6875, |
| "learning_rate": 0.0, |
| "loss": 1.4702, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0023581638157448705, |
| "grad_norm": 16.5, |
| "learning_rate": 4.9000000000000005e-06, |
| "loss": 0.8687, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.004716327631489741, |
| "grad_norm": 1.9765625, |
| "learning_rate": 9.9e-06, |
| "loss": 0.762, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.007074491447234611, |
| "grad_norm": 1.984375, |
| "learning_rate": 9.99228455809413e-06, |
| "loss": 0.7395, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.009432655262979482, |
| "grad_norm": 2.0625, |
| "learning_rate": 9.984411658190178e-06, |
| "loss": 0.7109, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.011790819078724351, |
| "grad_norm": 1.7578125, |
| "learning_rate": 9.976538758286228e-06, |
| "loss": 0.6712, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.014148982894469222, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.968665858382277e-06, |
| "loss": 0.6872, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.016507146710214093, |
| "grad_norm": 1.546875, |
| "learning_rate": 9.960792958478327e-06, |
| "loss": 0.6699, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.018865310525958964, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.952920058574375e-06, |
| "loss": 0.6767, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.02122347434170383, |
| "grad_norm": 1.8359375, |
| "learning_rate": 9.945047158670426e-06, |
| "loss": 0.6632, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.023581638157448702, |
| "grad_norm": 1.9765625, |
| "learning_rate": 9.937174258766474e-06, |
| "loss": 0.6635, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.023581638157448702, |
| "eval_loss": 0.6704570055007935, |
| "eval_runtime": 471.8556, |
| "eval_samples_per_second": 75.68, |
| "eval_steps_per_second": 37.84, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.025939801973193573, |
| "grad_norm": 1.7421875, |
| "learning_rate": 9.929301358862524e-06, |
| "loss": 0.6684, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.028297965788938444, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.921428458958575e-06, |
| "loss": 0.6743, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.030656129604683315, |
| "grad_norm": 1.7421875, |
| "learning_rate": 9.913555559054623e-06, |
| "loss": 0.6726, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.033014293420428185, |
| "grad_norm": 1.953125, |
| "learning_rate": 9.905682659150673e-06, |
| "loss": 0.6692, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.035372457236173056, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.897809759246722e-06, |
| "loss": 0.6622, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.03773062105191793, |
| "grad_norm": 1.765625, |
| "learning_rate": 9.889936859342772e-06, |
| "loss": 0.6613, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.04008878486766279, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.88206395943882e-06, |
| "loss": 0.6643, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.04244694868340766, |
| "grad_norm": 1.6171875, |
| "learning_rate": 9.87419105953487e-06, |
| "loss": 0.6457, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.04480511249915253, |
| "grad_norm": 1.734375, |
| "learning_rate": 9.86631815963092e-06, |
| "loss": 0.6598, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.047163276314897404, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.858445259726968e-06, |
| "loss": 0.6687, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.047163276314897404, |
| "eval_loss": 0.6613409519195557, |
| "eval_runtime": 471.2575, |
| "eval_samples_per_second": 75.776, |
| "eval_steps_per_second": 37.888, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.049521440130642275, |
| "grad_norm": 1.609375, |
| "learning_rate": 9.850572359823018e-06, |
| "loss": 0.6618, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.051879603946387146, |
| "grad_norm": 1.578125, |
| "learning_rate": 9.842699459919067e-06, |
| "loss": 0.654, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.054237767762132016, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.834826560015117e-06, |
| "loss": 0.6651, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.05659593157787689, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.826953660111166e-06, |
| "loss": 0.6597, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.05895409539362176, |
| "grad_norm": 1.703125, |
| "learning_rate": 9.819080760207216e-06, |
| "loss": 0.6654, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.06131225920936663, |
| "grad_norm": 1.78125, |
| "learning_rate": 9.811207860303264e-06, |
| "loss": 0.6743, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.0636704230251115, |
| "grad_norm": 1.75, |
| "learning_rate": 9.803334960399315e-06, |
| "loss": 0.6578, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.06602858684085637, |
| "grad_norm": 1.765625, |
| "learning_rate": 9.795462060495363e-06, |
| "loss": 0.6495, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.06838675065660124, |
| "grad_norm": 1.59375, |
| "learning_rate": 9.787589160591413e-06, |
| "loss": 0.668, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.07074491447234611, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.779716260687464e-06, |
| "loss": 0.6636, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.07074491447234611, |
| "eval_loss": 0.6566535830497742, |
| "eval_runtime": 469.1444, |
| "eval_samples_per_second": 76.117, |
| "eval_steps_per_second": 38.059, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.07310307828809098, |
| "grad_norm": 1.734375, |
| "learning_rate": 9.771843360783512e-06, |
| "loss": 0.6704, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.07546124210383585, |
| "grad_norm": 1.6875, |
| "learning_rate": 9.76397046087956e-06, |
| "loss": 0.6636, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.07781940591958073, |
| "grad_norm": 1.7109375, |
| "learning_rate": 9.756097560975611e-06, |
| "loss": 0.6451, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.08017756973532558, |
| "grad_norm": 1.796875, |
| "learning_rate": 9.74822466107166e-06, |
| "loss": 0.6523, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.08253573355107045, |
| "grad_norm": 1.7578125, |
| "learning_rate": 9.74035176116771e-06, |
| "loss": 0.6636, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.08489389736681532, |
| "grad_norm": 1.953125, |
| "learning_rate": 9.732478861263758e-06, |
| "loss": 0.645, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.0872520611825602, |
| "grad_norm": 1.5703125, |
| "learning_rate": 9.724605961359807e-06, |
| "loss": 0.6565, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.08961022499830507, |
| "grad_norm": 1.5625, |
| "learning_rate": 9.716733061455857e-06, |
| "loss": 0.6424, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.09196838881404994, |
| "grad_norm": 1.7734375, |
| "learning_rate": 9.708860161551907e-06, |
| "loss": 0.6612, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.09432655262979481, |
| "grad_norm": 1.90625, |
| "learning_rate": 9.700987261647956e-06, |
| "loss": 0.6457, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.09432655262979481, |
| "eval_loss": 0.6536301374435425, |
| "eval_runtime": 469.9529, |
| "eval_samples_per_second": 75.986, |
| "eval_steps_per_second": 37.993, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.09668471644553968, |
| "grad_norm": 1.578125, |
| "learning_rate": 9.693114361744006e-06, |
| "loss": 0.6538, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.09904288026128455, |
| "grad_norm": 1.765625, |
| "learning_rate": 9.685241461840055e-06, |
| "loss": 0.6505, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.10140104407702942, |
| "grad_norm": 1.9765625, |
| "learning_rate": 9.677368561936105e-06, |
| "loss": 0.6554, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.10375920789277429, |
| "grad_norm": 1.78125, |
| "learning_rate": 9.669495662032153e-06, |
| "loss": 0.6541, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.10611737170851916, |
| "grad_norm": 1.9453125, |
| "learning_rate": 9.661622762128204e-06, |
| "loss": 0.6525, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.10847553552426403, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.653749862224252e-06, |
| "loss": 0.6487, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.1108336993400089, |
| "grad_norm": 1.78125, |
| "learning_rate": 9.645876962320302e-06, |
| "loss": 0.6497, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.11319186315575377, |
| "grad_norm": 1.515625, |
| "learning_rate": 9.638004062416351e-06, |
| "loss": 0.6603, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.11555002697149865, |
| "grad_norm": 1.796875, |
| "learning_rate": 9.630131162512401e-06, |
| "loss": 0.6615, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.11790819078724352, |
| "grad_norm": 1.671875, |
| "learning_rate": 9.62225826260845e-06, |
| "loss": 0.6497, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.11790819078724352, |
| "eval_loss": 0.6514254212379456, |
| "eval_runtime": 468.4051, |
| "eval_samples_per_second": 76.237, |
| "eval_steps_per_second": 38.119, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.12026635460298839, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.6143853627045e-06, |
| "loss": 0.6438, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.12262451841873326, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.606512462800549e-06, |
| "loss": 0.6503, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.12498268223447813, |
| "grad_norm": 1.7578125, |
| "learning_rate": 9.598639562896597e-06, |
| "loss": 0.6413, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.127340846050223, |
| "grad_norm": 1.6796875, |
| "learning_rate": 9.590766662992647e-06, |
| "loss": 0.6598, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.12969900986596786, |
| "grad_norm": 1.5546875, |
| "learning_rate": 9.582893763088696e-06, |
| "loss": 0.6363, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.13205717368171274, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.575020863184746e-06, |
| "loss": 0.6558, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.1344153374974576, |
| "grad_norm": 2.078125, |
| "learning_rate": 9.567147963280795e-06, |
| "loss": 0.6429, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.13677350131320248, |
| "grad_norm": 1.6875, |
| "learning_rate": 9.559275063376845e-06, |
| "loss": 0.6619, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.13913166512894734, |
| "grad_norm": 1.9921875, |
| "learning_rate": 9.551402163472895e-06, |
| "loss": 0.6607, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.14148982894469223, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.543529263568944e-06, |
| "loss": 0.6558, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.14148982894469223, |
| "eval_loss": 0.649859607219696, |
| "eval_runtime": 471.3671, |
| "eval_samples_per_second": 75.758, |
| "eval_steps_per_second": 37.879, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.14384799276043708, |
| "grad_norm": 1.7109375, |
| "learning_rate": 9.535656363664994e-06, |
| "loss": 0.6347, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.14620615657618197, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.527783463761042e-06, |
| "loss": 0.6467, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.14856432039192682, |
| "grad_norm": 1.75, |
| "learning_rate": 9.519910563857093e-06, |
| "loss": 0.6584, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.1509224842076717, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.512037663953141e-06, |
| "loss": 0.6499, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.15328064802341657, |
| "grad_norm": 1.84375, |
| "learning_rate": 9.504164764049192e-06, |
| "loss": 0.6528, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.15563881183916145, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.49629186414524e-06, |
| "loss": 0.6434, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.1579969756549063, |
| "grad_norm": 1.7265625, |
| "learning_rate": 9.488418964241289e-06, |
| "loss": 0.6542, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.16035513947065116, |
| "grad_norm": 1.78125, |
| "learning_rate": 9.480546064337339e-06, |
| "loss": 0.6469, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.16271330328639605, |
| "grad_norm": 1.75, |
| "learning_rate": 9.472673164433387e-06, |
| "loss": 0.6385, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.1650714671021409, |
| "grad_norm": 1.9609375, |
| "learning_rate": 9.464800264529438e-06, |
| "loss": 0.6535, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.1650714671021409, |
| "eval_loss": 0.6484876871109009, |
| "eval_runtime": 476.3019, |
| "eval_samples_per_second": 74.973, |
| "eval_steps_per_second": 37.487, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.1674296309178858, |
| "grad_norm": 1.875, |
| "learning_rate": 9.456927364625486e-06, |
| "loss": 0.6531, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.16978779473363065, |
| "grad_norm": 1.6796875, |
| "learning_rate": 9.449054464721536e-06, |
| "loss": 0.6472, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.17214595854937553, |
| "grad_norm": 1.625, |
| "learning_rate": 9.441181564817585e-06, |
| "loss": 0.6474, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.1745041223651204, |
| "grad_norm": 1.6953125, |
| "learning_rate": 9.433308664913635e-06, |
| "loss": 0.6498, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.17686228618086527, |
| "grad_norm": 1.734375, |
| "learning_rate": 9.425435765009684e-06, |
| "loss": 0.6615, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.17922044999661013, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.417562865105734e-06, |
| "loss": 0.6431, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.18157861381235502, |
| "grad_norm": 1.7421875, |
| "learning_rate": 9.409689965201784e-06, |
| "loss": 0.6369, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.18393677762809987, |
| "grad_norm": 1.671875, |
| "learning_rate": 9.401817065297833e-06, |
| "loss": 0.6329, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.18629494144384476, |
| "grad_norm": 1.7421875, |
| "learning_rate": 9.393944165393883e-06, |
| "loss": 0.6452, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.18865310525958962, |
| "grad_norm": 2.109375, |
| "learning_rate": 9.386071265489932e-06, |
| "loss": 0.6562, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.18865310525958962, |
| "eval_loss": 0.6474871635437012, |
| "eval_runtime": 471.3025, |
| "eval_samples_per_second": 75.769, |
| "eval_steps_per_second": 37.884, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.1910112690753345, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.378198365585982e-06, |
| "loss": 0.6507, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.19336943289107936, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.37032546568203e-06, |
| "loss": 0.6486, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.19572759670682424, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.362452565778079e-06, |
| "loss": 0.6593, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.1980857605225691, |
| "grad_norm": 1.546875, |
| "learning_rate": 9.354579665874129e-06, |
| "loss": 0.6526, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.20044392433831398, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.346706765970178e-06, |
| "loss": 0.6393, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.20280208815405884, |
| "grad_norm": 1.9609375, |
| "learning_rate": 9.338833866066228e-06, |
| "loss": 0.6465, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.20516025196980373, |
| "grad_norm": 1.8515625, |
| "learning_rate": 9.330960966162276e-06, |
| "loss": 0.6501, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.20751841578554858, |
| "grad_norm": 1.9453125, |
| "learning_rate": 9.323088066258327e-06, |
| "loss": 0.6485, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.20987657960129344, |
| "grad_norm": 1.875, |
| "learning_rate": 9.315215166354375e-06, |
| "loss": 0.6468, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.21223474341703832, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.307342266450425e-06, |
| "loss": 0.6492, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.21223474341703832, |
| "eval_loss": 0.6467618346214294, |
| "eval_runtime": 472.4529, |
| "eval_samples_per_second": 75.584, |
| "eval_steps_per_second": 37.792, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.21459290723278318, |
| "grad_norm": 1.5390625, |
| "learning_rate": 9.299469366546474e-06, |
| "loss": 0.6532, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.21695107104852807, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.291596466642524e-06, |
| "loss": 0.6518, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.21930923486427292, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.283723566738573e-06, |
| "loss": 0.6533, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.2216673986800178, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.275850666834623e-06, |
| "loss": 0.6467, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.22402556249576266, |
| "grad_norm": 1.78125, |
| "learning_rate": 9.267977766930673e-06, |
| "loss": 0.6557, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.22638372631150755, |
| "grad_norm": 1.875, |
| "learning_rate": 9.260104867026722e-06, |
| "loss": 0.6405, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.2287418901272524, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.25223196712277e-06, |
| "loss": 0.6461, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.2311000539429973, |
| "grad_norm": 1.546875, |
| "learning_rate": 9.24435906721882e-06, |
| "loss": 0.6404, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.23345821775874215, |
| "grad_norm": 1.796875, |
| "learning_rate": 9.236486167314869e-06, |
| "loss": 0.6345, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.23581638157448703, |
| "grad_norm": 1.6640625, |
| "learning_rate": 9.22861326741092e-06, |
| "loss": 0.6569, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.23581638157448703, |
| "eval_loss": 0.6459140777587891, |
| "eval_runtime": 471.5037, |
| "eval_samples_per_second": 75.736, |
| "eval_steps_per_second": 37.868, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.2381745453902319, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.220740367506968e-06, |
| "loss": 0.627, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.24053270920597677, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.212867467603016e-06, |
| "loss": 0.6353, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.24289087302172163, |
| "grad_norm": 1.734375, |
| "learning_rate": 9.204994567699067e-06, |
| "loss": 0.6251, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.24524903683746652, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.197121667795117e-06, |
| "loss": 0.652, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.24760720065321137, |
| "grad_norm": 1.71875, |
| "learning_rate": 9.189248767891166e-06, |
| "loss": 0.6512, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.24996536446895626, |
| "grad_norm": 1.671875, |
| "learning_rate": 9.181375867987216e-06, |
| "loss": 0.6522, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.2523235282847011, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.173502968083264e-06, |
| "loss": 0.6397, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.254681692100446, |
| "grad_norm": 2.03125, |
| "learning_rate": 9.165630068179315e-06, |
| "loss": 0.6427, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.2570398559161909, |
| "grad_norm": 1.6328125, |
| "learning_rate": 9.157757168275363e-06, |
| "loss": 0.6367, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.2593980197319357, |
| "grad_norm": 1.78125, |
| "learning_rate": 9.149884268371413e-06, |
| "loss": 0.6441, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.2593980197319357, |
| "eval_loss": 0.6453782320022583, |
| "eval_runtime": 474.9186, |
| "eval_samples_per_second": 75.192, |
| "eval_steps_per_second": 37.596, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.2617561835476806, |
| "grad_norm": 1.7265625, |
| "learning_rate": 9.142011368467462e-06, |
| "loss": 0.6361, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.2641143473634255, |
| "grad_norm": 1.84375, |
| "learning_rate": 9.134138468563512e-06, |
| "loss": 0.6535, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.2664725111791703, |
| "grad_norm": 1.6953125, |
| "learning_rate": 9.12626556865956e-06, |
| "loss": 0.6484, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.2688306749949152, |
| "grad_norm": 1.9375, |
| "learning_rate": 9.118392668755611e-06, |
| "loss": 0.6505, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.2711888388106601, |
| "grad_norm": 1.65625, |
| "learning_rate": 9.11051976885166e-06, |
| "loss": 0.6319, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.27354700262640497, |
| "grad_norm": 1.90625, |
| "learning_rate": 9.102646868947708e-06, |
| "loss": 0.6495, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.2759051664421498, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.094773969043758e-06, |
| "loss": 0.65, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.2782633302578947, |
| "grad_norm": 1.78125, |
| "learning_rate": 9.086901069139807e-06, |
| "loss": 0.6459, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.28062149407363957, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.079028169235857e-06, |
| "loss": 0.6462, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.28297965788938445, |
| "grad_norm": 1.765625, |
| "learning_rate": 9.071155269331906e-06, |
| "loss": 0.6378, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.28297965788938445, |
| "eval_loss": 0.6447737812995911, |
| "eval_runtime": 470.2799, |
| "eval_samples_per_second": 75.933, |
| "eval_steps_per_second": 37.967, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.2853378217051293, |
| "grad_norm": 1.765625, |
| "learning_rate": 9.063282369427956e-06, |
| "loss": 0.6452, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.28769598552087416, |
| "grad_norm": 1.703125, |
| "learning_rate": 9.055409469524004e-06, |
| "loss": 0.6443, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.29005414933661905, |
| "grad_norm": 2.046875, |
| "learning_rate": 9.047536569620055e-06, |
| "loss": 0.6429, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.29241231315236393, |
| "grad_norm": 1.828125, |
| "learning_rate": 9.039663669716105e-06, |
| "loss": 0.6313, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.29477047696810876, |
| "grad_norm": 1.984375, |
| "learning_rate": 9.031790769812153e-06, |
| "loss": 0.6576, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.29712864078385365, |
| "grad_norm": 1.6328125, |
| "learning_rate": 9.023917869908204e-06, |
| "loss": 0.6467, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.29948680459959853, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.016044970004252e-06, |
| "loss": 0.6372, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.3018449684153434, |
| "grad_norm": 1.6484375, |
| "learning_rate": 9.008172070100302e-06, |
| "loss": 0.6413, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.30420313223108825, |
| "grad_norm": 1.8125, |
| "learning_rate": 9.000299170196351e-06, |
| "loss": 0.6488, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.30656129604683313, |
| "grad_norm": 2.0, |
| "learning_rate": 8.992426270292401e-06, |
| "loss": 0.6407, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.30656129604683313, |
| "eval_loss": 0.6444392800331116, |
| "eval_runtime": 471.0868, |
| "eval_samples_per_second": 75.803, |
| "eval_steps_per_second": 37.902, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.308919459862578, |
| "grad_norm": 1.9140625, |
| "learning_rate": 8.98455337038845e-06, |
| "loss": 0.6493, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.3112776236783229, |
| "grad_norm": 2.0, |
| "learning_rate": 8.976680470484498e-06, |
| "loss": 0.6578, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.31363578749406773, |
| "grad_norm": 1.7890625, |
| "learning_rate": 8.968807570580548e-06, |
| "loss": 0.6534, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.3159939513098126, |
| "grad_norm": 1.8125, |
| "learning_rate": 8.960934670676597e-06, |
| "loss": 0.6371, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.3183521151255575, |
| "grad_norm": 1.828125, |
| "learning_rate": 8.953061770772647e-06, |
| "loss": 0.6489, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.32071027894130233, |
| "grad_norm": 1.8828125, |
| "learning_rate": 8.945188870868696e-06, |
| "loss": 0.6379, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.3230684427570472, |
| "grad_norm": 2.09375, |
| "learning_rate": 8.937315970964746e-06, |
| "loss": 0.6462, |
| "step": 6850 |
| }, |
| { |
| "epoch": 0.3254266065727921, |
| "grad_norm": 1.6484375, |
| "learning_rate": 8.929443071060795e-06, |
| "loss": 0.6363, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.327784770388537, |
| "grad_norm": 2.3125, |
| "learning_rate": 8.921570171156845e-06, |
| "loss": 0.6599, |
| "step": 6950 |
| }, |
| { |
| "epoch": 0.3301429342042818, |
| "grad_norm": 1.9453125, |
| "learning_rate": 8.913697271252893e-06, |
| "loss": 0.6359, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.3301429342042818, |
| "eval_loss": 0.6439831852912903, |
| "eval_runtime": 470.7382, |
| "eval_samples_per_second": 75.86, |
| "eval_steps_per_second": 37.93, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.3325010980200267, |
| "grad_norm": 2.09375, |
| "learning_rate": 8.905824371348944e-06, |
| "loss": 0.6382, |
| "step": 7050 |
| }, |
| { |
| "epoch": 0.3348592618357716, |
| "grad_norm": 2.03125, |
| "learning_rate": 8.897951471444994e-06, |
| "loss": 0.6516, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.33721742565151647, |
| "grad_norm": 1.703125, |
| "learning_rate": 8.890078571541042e-06, |
| "loss": 0.6342, |
| "step": 7150 |
| }, |
| { |
| "epoch": 0.3395755894672613, |
| "grad_norm": 1.9140625, |
| "learning_rate": 8.882205671637093e-06, |
| "loss": 0.6407, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.3419337532830062, |
| "grad_norm": 2.109375, |
| "learning_rate": 8.874332771733141e-06, |
| "loss": 0.6477, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.34429191709875107, |
| "grad_norm": 1.796875, |
| "learning_rate": 8.86645987182919e-06, |
| "loss": 0.6366, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.34665008091449595, |
| "grad_norm": 2.109375, |
| "learning_rate": 8.85858697192524e-06, |
| "loss": 0.6438, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.3490082447302408, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.850714072021289e-06, |
| "loss": 0.641, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.35136640854598566, |
| "grad_norm": 2.0625, |
| "learning_rate": 8.842841172117339e-06, |
| "loss": 0.649, |
| "step": 7450 |
| }, |
| { |
| "epoch": 0.35372457236173055, |
| "grad_norm": 1.890625, |
| "learning_rate": 8.834968272213387e-06, |
| "loss": 0.6515, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.35372457236173055, |
| "eval_loss": 0.6437468528747559, |
| "eval_runtime": 473.1767, |
| "eval_samples_per_second": 75.469, |
| "eval_steps_per_second": 37.734, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.35608273617747543, |
| "grad_norm": 2.21875, |
| "learning_rate": 8.827095372309438e-06, |
| "loss": 0.6373, |
| "step": 7550 |
| }, |
| { |
| "epoch": 0.35844089999322026, |
| "grad_norm": 2.109375, |
| "learning_rate": 8.819222472405486e-06, |
| "loss": 0.6434, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.36079906380896515, |
| "grad_norm": 2.03125, |
| "learning_rate": 8.811349572501536e-06, |
| "loss": 0.6448, |
| "step": 7650 |
| }, |
| { |
| "epoch": 0.36315722762471003, |
| "grad_norm": 1.8515625, |
| "learning_rate": 8.803476672597585e-06, |
| "loss": 0.6461, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.36551539144045486, |
| "grad_norm": 1.796875, |
| "learning_rate": 8.795603772693635e-06, |
| "loss": 0.6427, |
| "step": 7750 |
| }, |
| { |
| "epoch": 0.36787355525619975, |
| "grad_norm": 1.9375, |
| "learning_rate": 8.787730872789684e-06, |
| "loss": 0.6454, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.37023171907194463, |
| "grad_norm": 1.9765625, |
| "learning_rate": 8.779857972885734e-06, |
| "loss": 0.6451, |
| "step": 7850 |
| }, |
| { |
| "epoch": 0.3725898828876895, |
| "grad_norm": 1.9453125, |
| "learning_rate": 8.771985072981782e-06, |
| "loss": 0.6312, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.37494804670343435, |
| "grad_norm": 1.90625, |
| "learning_rate": 8.764112173077833e-06, |
| "loss": 0.6457, |
| "step": 7950 |
| }, |
| { |
| "epoch": 0.37730621051917923, |
| "grad_norm": 2.125, |
| "learning_rate": 8.756239273173883e-06, |
| "loss": 0.6505, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.37730621051917923, |
| "eval_loss": 0.6432761549949646, |
| "eval_runtime": 471.5659, |
| "eval_samples_per_second": 75.726, |
| "eval_steps_per_second": 37.863, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.3796643743349241, |
| "grad_norm": 1.71875, |
| "learning_rate": 8.748366373269931e-06, |
| "loss": 0.6487, |
| "step": 8050 |
| }, |
| { |
| "epoch": 0.382022538150669, |
| "grad_norm": 1.6171875, |
| "learning_rate": 8.74049347336598e-06, |
| "loss": 0.6217, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.38438070196641383, |
| "grad_norm": 2.078125, |
| "learning_rate": 8.73262057346203e-06, |
| "loss": 0.6399, |
| "step": 8150 |
| }, |
| { |
| "epoch": 0.3867388657821587, |
| "grad_norm": 1.734375, |
| "learning_rate": 8.724747673558079e-06, |
| "loss": 0.6562, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.3890970295979036, |
| "grad_norm": 1.8125, |
| "learning_rate": 8.716874773654127e-06, |
| "loss": 0.64, |
| "step": 8250 |
| }, |
| { |
| "epoch": 0.3914551934136485, |
| "grad_norm": 1.828125, |
| "learning_rate": 8.709001873750178e-06, |
| "loss": 0.6441, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.3938133572293933, |
| "grad_norm": 1.9765625, |
| "learning_rate": 8.701128973846226e-06, |
| "loss": 0.6292, |
| "step": 8350 |
| }, |
| { |
| "epoch": 0.3961715210451382, |
| "grad_norm": 1.671875, |
| "learning_rate": 8.693256073942276e-06, |
| "loss": 0.6465, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.3985296848608831, |
| "grad_norm": 2.203125, |
| "learning_rate": 8.685383174038327e-06, |
| "loss": 0.6421, |
| "step": 8450 |
| }, |
| { |
| "epoch": 0.40088784867662797, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.677510274134375e-06, |
| "loss": 0.6361, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.40088784867662797, |
| "eval_loss": 0.6430058479309082, |
| "eval_runtime": 473.3534, |
| "eval_samples_per_second": 75.44, |
| "eval_steps_per_second": 37.72, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.4032460124923728, |
| "grad_norm": 1.734375, |
| "learning_rate": 8.669637374230425e-06, |
| "loss": 0.6427, |
| "step": 8550 |
| }, |
| { |
| "epoch": 0.4056041763081177, |
| "grad_norm": 1.8828125, |
| "learning_rate": 8.661764474326474e-06, |
| "loss": 0.6402, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.40796234012386257, |
| "grad_norm": 1.8828125, |
| "learning_rate": 8.653891574422524e-06, |
| "loss": 0.629, |
| "step": 8650 |
| }, |
| { |
| "epoch": 0.41032050393960745, |
| "grad_norm": 1.6796875, |
| "learning_rate": 8.646018674518573e-06, |
| "loss": 0.6307, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.4126786677553523, |
| "grad_norm": 1.8671875, |
| "learning_rate": 8.638145774614623e-06, |
| "loss": 0.6407, |
| "step": 8750 |
| }, |
| { |
| "epoch": 0.41503683157109716, |
| "grad_norm": 1.84375, |
| "learning_rate": 8.630272874710671e-06, |
| "loss": 0.6553, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.41739499538684205, |
| "grad_norm": 2.265625, |
| "learning_rate": 8.622399974806722e-06, |
| "loss": 0.6571, |
| "step": 8850 |
| }, |
| { |
| "epoch": 0.4197531592025869, |
| "grad_norm": 1.828125, |
| "learning_rate": 8.61452707490277e-06, |
| "loss": 0.6312, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.42211132301833176, |
| "grad_norm": 1.9609375, |
| "learning_rate": 8.60665417499882e-06, |
| "loss": 0.6407, |
| "step": 8950 |
| }, |
| { |
| "epoch": 0.42446948683407665, |
| "grad_norm": 1.8515625, |
| "learning_rate": 8.598781275094869e-06, |
| "loss": 0.6328, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.42446948683407665, |
| "eval_loss": 0.642894446849823, |
| "eval_runtime": 471.7695, |
| "eval_samples_per_second": 75.694, |
| "eval_steps_per_second": 37.847, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.42682765064982153, |
| "grad_norm": 2.03125, |
| "learning_rate": 8.590908375190918e-06, |
| "loss": 0.6457, |
| "step": 9050 |
| }, |
| { |
| "epoch": 0.42918581446556636, |
| "grad_norm": 2.09375, |
| "learning_rate": 8.583035475286968e-06, |
| "loss": 0.6342, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.43154397828131125, |
| "grad_norm": 1.78125, |
| "learning_rate": 8.575162575383016e-06, |
| "loss": 0.634, |
| "step": 9150 |
| }, |
| { |
| "epoch": 0.43390214209705613, |
| "grad_norm": 1.890625, |
| "learning_rate": 8.567289675479067e-06, |
| "loss": 0.638, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.436260305912801, |
| "grad_norm": 1.9296875, |
| "learning_rate": 8.559416775575115e-06, |
| "loss": 0.6377, |
| "step": 9250 |
| }, |
| { |
| "epoch": 0.43861846972854585, |
| "grad_norm": 2.15625, |
| "learning_rate": 8.551543875671165e-06, |
| "loss": 0.6573, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.44097663354429073, |
| "grad_norm": 1.6015625, |
| "learning_rate": 8.543670975767214e-06, |
| "loss": 0.6486, |
| "step": 9350 |
| }, |
| { |
| "epoch": 0.4433347973600356, |
| "grad_norm": 1.7734375, |
| "learning_rate": 8.535798075863264e-06, |
| "loss": 0.6453, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.4456929611757805, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.527925175959314e-06, |
| "loss": 0.6308, |
| "step": 9450 |
| }, |
| { |
| "epoch": 0.44805112499152533, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.520052276055363e-06, |
| "loss": 0.6301, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.44805112499152533, |
| "eval_loss": 0.6426186561584473, |
| "eval_runtime": 475.5113, |
| "eval_samples_per_second": 75.098, |
| "eval_steps_per_second": 37.549, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.4504092888072702, |
| "grad_norm": 1.90625, |
| "learning_rate": 8.512179376151413e-06, |
| "loss": 0.6382, |
| "step": 9550 |
| }, |
| { |
| "epoch": 0.4527674526230151, |
| "grad_norm": 2.109375, |
| "learning_rate": 8.504306476247462e-06, |
| "loss": 0.6333, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.45512561643876, |
| "grad_norm": 2.203125, |
| "learning_rate": 8.496433576343512e-06, |
| "loss": 0.6462, |
| "step": 9650 |
| }, |
| { |
| "epoch": 0.4574837802545048, |
| "grad_norm": 2.0625, |
| "learning_rate": 8.48856067643956e-06, |
| "loss": 0.6441, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.4598419440702497, |
| "grad_norm": 1.9296875, |
| "learning_rate": 8.480687776535609e-06, |
| "loss": 0.6426, |
| "step": 9750 |
| }, |
| { |
| "epoch": 0.4622001078859946, |
| "grad_norm": 1.859375, |
| "learning_rate": 8.47281487663166e-06, |
| "loss": 0.6278, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.46455827170173947, |
| "grad_norm": 1.65625, |
| "learning_rate": 8.464941976727708e-06, |
| "loss": 0.6422, |
| "step": 9850 |
| }, |
| { |
| "epoch": 0.4669164355174843, |
| "grad_norm": 1.8125, |
| "learning_rate": 8.457069076823758e-06, |
| "loss": 0.6276, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.4692745993332292, |
| "grad_norm": 1.953125, |
| "learning_rate": 8.449196176919807e-06, |
| "loss": 0.6345, |
| "step": 9950 |
| }, |
| { |
| "epoch": 0.47163276314897407, |
| "grad_norm": 1.8203125, |
| "learning_rate": 8.441323277015857e-06, |
| "loss": 0.6492, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.47163276314897407, |
| "eval_loss": 0.642467200756073, |
| "eval_runtime": 475.5523, |
| "eval_samples_per_second": 75.092, |
| "eval_steps_per_second": 37.546, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.4739909269647189, |
| "grad_norm": 2.015625, |
| "learning_rate": 8.433450377111905e-06, |
| "loss": 0.6459, |
| "step": 10050 |
| }, |
| { |
| "epoch": 0.4763490907804638, |
| "grad_norm": 1.859375, |
| "learning_rate": 8.425577477207956e-06, |
| "loss": 0.6475, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.47870725459620866, |
| "grad_norm": 1.8984375, |
| "learning_rate": 8.417704577304004e-06, |
| "loss": 0.614, |
| "step": 10150 |
| }, |
| { |
| "epoch": 0.48106541841195355, |
| "grad_norm": 1.7734375, |
| "learning_rate": 8.409831677400054e-06, |
| "loss": 0.6374, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.4834235822276984, |
| "grad_norm": 2.171875, |
| "learning_rate": 8.401958777496103e-06, |
| "loss": 0.6389, |
| "step": 10250 |
| }, |
| { |
| "epoch": 0.48578174604344326, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.394085877592153e-06, |
| "loss": 0.6308, |
| "step": 10300 |
| }, |
| { |
| "epoch": 0.48813990985918815, |
| "grad_norm": 1.8671875, |
| "learning_rate": 8.386212977688203e-06, |
| "loss": 0.6357, |
| "step": 10350 |
| }, |
| { |
| "epoch": 0.49049807367493303, |
| "grad_norm": 2.015625, |
| "learning_rate": 8.378340077784252e-06, |
| "loss": 0.6295, |
| "step": 10400 |
| }, |
| { |
| "epoch": 0.49285623749067786, |
| "grad_norm": 1.8671875, |
| "learning_rate": 8.370467177880302e-06, |
| "loss": 0.6455, |
| "step": 10450 |
| }, |
| { |
| "epoch": 0.49521440130642275, |
| "grad_norm": 2.203125, |
| "learning_rate": 8.36259427797635e-06, |
| "loss": 0.6293, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.49521440130642275, |
| "eval_loss": 0.6422578692436218, |
| "eval_runtime": 476.5916, |
| "eval_samples_per_second": 74.928, |
| "eval_steps_per_second": 37.464, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.49757256512216763, |
| "grad_norm": 1.96875, |
| "learning_rate": 8.3547213780724e-06, |
| "loss": 0.6423, |
| "step": 10550 |
| }, |
| { |
| "epoch": 0.4999307289379125, |
| "grad_norm": 1.7109375, |
| "learning_rate": 8.34684847816845e-06, |
| "loss": 0.6451, |
| "step": 10600 |
| }, |
| { |
| "epoch": 0.5022888927536574, |
| "grad_norm": 2.078125, |
| "learning_rate": 8.338975578264498e-06, |
| "loss": 0.6396, |
| "step": 10650 |
| }, |
| { |
| "epoch": 0.5046470565694022, |
| "grad_norm": 1.890625, |
| "learning_rate": 8.331102678360547e-06, |
| "loss": 0.6358, |
| "step": 10700 |
| }, |
| { |
| "epoch": 0.5070052203851471, |
| "grad_norm": 1.921875, |
| "learning_rate": 8.323229778456597e-06, |
| "loss": 0.6354, |
| "step": 10750 |
| }, |
| { |
| "epoch": 0.509363384200892, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.315356878552647e-06, |
| "loss": 0.6307, |
| "step": 10800 |
| }, |
| { |
| "epoch": 0.5117215480166368, |
| "grad_norm": 1.734375, |
| "learning_rate": 8.307483978648696e-06, |
| "loss": 0.6379, |
| "step": 10850 |
| }, |
| { |
| "epoch": 0.5140797118323818, |
| "grad_norm": 2.140625, |
| "learning_rate": 8.299611078744746e-06, |
| "loss": 0.6415, |
| "step": 10900 |
| }, |
| { |
| "epoch": 0.5164378756481266, |
| "grad_norm": 1.7109375, |
| "learning_rate": 8.291738178840795e-06, |
| "loss": 0.6415, |
| "step": 10950 |
| }, |
| { |
| "epoch": 0.5187960394638714, |
| "grad_norm": 2.015625, |
| "learning_rate": 8.283865278936845e-06, |
| "loss": 0.6357, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.5187960394638714, |
| "eval_loss": 0.6421868205070496, |
| "eval_runtime": 471.328, |
| "eval_samples_per_second": 75.765, |
| "eval_steps_per_second": 37.882, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.5211542032796164, |
| "grad_norm": 1.9296875, |
| "learning_rate": 8.275992379032893e-06, |
| "loss": 0.643, |
| "step": 11050 |
| }, |
| { |
| "epoch": 0.5235123670953612, |
| "grad_norm": 1.859375, |
| "learning_rate": 8.268119479128944e-06, |
| "loss": 0.6446, |
| "step": 11100 |
| }, |
| { |
| "epoch": 0.525870530911106, |
| "grad_norm": 2.125, |
| "learning_rate": 8.260246579224992e-06, |
| "loss": 0.6344, |
| "step": 11150 |
| }, |
| { |
| "epoch": 0.528228694726851, |
| "grad_norm": 2.03125, |
| "learning_rate": 8.252373679321042e-06, |
| "loss": 0.6479, |
| "step": 11200 |
| }, |
| { |
| "epoch": 0.5305868585425958, |
| "grad_norm": 2.171875, |
| "learning_rate": 8.244500779417093e-06, |
| "loss": 0.6367, |
| "step": 11250 |
| }, |
| { |
| "epoch": 0.5329450223583406, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.236627879513141e-06, |
| "loss": 0.6417, |
| "step": 11300 |
| }, |
| { |
| "epoch": 0.5353031861740856, |
| "grad_norm": 1.6875, |
| "learning_rate": 8.22875497960919e-06, |
| "loss": 0.6372, |
| "step": 11350 |
| }, |
| { |
| "epoch": 0.5376613499898304, |
| "grad_norm": 1.859375, |
| "learning_rate": 8.22088207970524e-06, |
| "loss": 0.6315, |
| "step": 11400 |
| }, |
| { |
| "epoch": 0.5400195138055753, |
| "grad_norm": 1.84375, |
| "learning_rate": 8.213009179801288e-06, |
| "loss": 0.6412, |
| "step": 11450 |
| }, |
| { |
| "epoch": 0.5423776776213202, |
| "grad_norm": 1.71875, |
| "learning_rate": 8.205136279897337e-06, |
| "loss": 0.6573, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.5423776776213202, |
| "eval_loss": 0.6420803070068359, |
| "eval_runtime": 468.9426, |
| "eval_samples_per_second": 76.15, |
| "eval_steps_per_second": 38.075, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.544735841437065, |
| "grad_norm": 1.6328125, |
| "learning_rate": 8.197263379993387e-06, |
| "loss": 0.6475, |
| "step": 11550 |
| }, |
| { |
| "epoch": 0.5470940052528099, |
| "grad_norm": 1.828125, |
| "learning_rate": 8.189390480089436e-06, |
| "loss": 0.6408, |
| "step": 11600 |
| }, |
| { |
| "epoch": 0.5494521690685548, |
| "grad_norm": 2.1875, |
| "learning_rate": 8.181517580185486e-06, |
| "loss": 0.6427, |
| "step": 11650 |
| }, |
| { |
| "epoch": 0.5518103328842996, |
| "grad_norm": 2.171875, |
| "learning_rate": 8.173644680281535e-06, |
| "loss": 0.644, |
| "step": 11700 |
| }, |
| { |
| "epoch": 0.5541684967000445, |
| "grad_norm": 2.09375, |
| "learning_rate": 8.165771780377585e-06, |
| "loss": 0.6398, |
| "step": 11750 |
| }, |
| { |
| "epoch": 0.5565266605157894, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.157898880473635e-06, |
| "loss": 0.6373, |
| "step": 11800 |
| }, |
| { |
| "epoch": 0.5588848243315343, |
| "grad_norm": 1.609375, |
| "learning_rate": 8.150025980569684e-06, |
| "loss": 0.6484, |
| "step": 11850 |
| }, |
| { |
| "epoch": 0.5612429881472791, |
| "grad_norm": 2.0, |
| "learning_rate": 8.142153080665734e-06, |
| "loss": 0.6412, |
| "step": 11900 |
| }, |
| { |
| "epoch": 0.563601151963024, |
| "grad_norm": 1.8203125, |
| "learning_rate": 8.134280180761782e-06, |
| "loss": 0.6412, |
| "step": 11950 |
| }, |
| { |
| "epoch": 0.5659593157787689, |
| "grad_norm": 1.9140625, |
| "learning_rate": 8.126407280857833e-06, |
| "loss": 0.637, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.5659593157787689, |
| "eval_loss": 0.6419476866722107, |
| "eval_runtime": 471.7113, |
| "eval_samples_per_second": 75.703, |
| "eval_steps_per_second": 37.852, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.5683174795945137, |
| "grad_norm": 1.84375, |
| "learning_rate": 8.118534380953881e-06, |
| "loss": 0.6438, |
| "step": 12050 |
| }, |
| { |
| "epoch": 0.5706756434102586, |
| "grad_norm": 2.15625, |
| "learning_rate": 8.110661481049931e-06, |
| "loss": 0.6358, |
| "step": 12100 |
| }, |
| { |
| "epoch": 0.5730338072260035, |
| "grad_norm": 1.9453125, |
| "learning_rate": 8.10278858114598e-06, |
| "loss": 0.6365, |
| "step": 12150 |
| }, |
| { |
| "epoch": 0.5753919710417483, |
| "grad_norm": 1.9453125, |
| "learning_rate": 8.09491568124203e-06, |
| "loss": 0.6452, |
| "step": 12200 |
| }, |
| { |
| "epoch": 0.5777501348574932, |
| "grad_norm": 2.015625, |
| "learning_rate": 8.087042781338079e-06, |
| "loss": 0.6365, |
| "step": 12250 |
| }, |
| { |
| "epoch": 0.5801082986732381, |
| "grad_norm": 1.7734375, |
| "learning_rate": 8.079169881434127e-06, |
| "loss": 0.6426, |
| "step": 12300 |
| }, |
| { |
| "epoch": 0.5824664624889829, |
| "grad_norm": 1.8984375, |
| "learning_rate": 8.071296981530177e-06, |
| "loss": 0.628, |
| "step": 12350 |
| }, |
| { |
| "epoch": 0.5848246263047279, |
| "grad_norm": 1.890625, |
| "learning_rate": 8.063424081626226e-06, |
| "loss": 0.6315, |
| "step": 12400 |
| }, |
| { |
| "epoch": 0.5871827901204727, |
| "grad_norm": 2.125, |
| "learning_rate": 8.055551181722276e-06, |
| "loss": 0.6461, |
| "step": 12450 |
| }, |
| { |
| "epoch": 0.5895409539362175, |
| "grad_norm": 2.078125, |
| "learning_rate": 8.047678281818325e-06, |
| "loss": 0.646, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.5895409539362175, |
| "eval_loss": 0.6418334245681763, |
| "eval_runtime": 469.7223, |
| "eval_samples_per_second": 76.024, |
| "eval_steps_per_second": 38.012, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.5918991177519625, |
| "grad_norm": 2.171875, |
| "learning_rate": 8.039805381914375e-06, |
| "loss": 0.6535, |
| "step": 12550 |
| }, |
| { |
| "epoch": 0.5942572815677073, |
| "grad_norm": 1.9296875, |
| "learning_rate": 8.031932482010424e-06, |
| "loss": 0.6525, |
| "step": 12600 |
| }, |
| { |
| "epoch": 0.5966154453834521, |
| "grad_norm": 1.765625, |
| "learning_rate": 8.024059582106474e-06, |
| "loss": 0.6432, |
| "step": 12650 |
| }, |
| { |
| "epoch": 0.5989736091991971, |
| "grad_norm": 1.890625, |
| "learning_rate": 8.016186682202524e-06, |
| "loss": 0.633, |
| "step": 12700 |
| }, |
| { |
| "epoch": 0.6013317730149419, |
| "grad_norm": 1.9453125, |
| "learning_rate": 8.008313782298573e-06, |
| "loss": 0.6234, |
| "step": 12750 |
| }, |
| { |
| "epoch": 0.6036899368306868, |
| "grad_norm": 2.1875, |
| "learning_rate": 8.000440882394623e-06, |
| "loss": 0.6388, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.6060481006464317, |
| "grad_norm": 1.9296875, |
| "learning_rate": 7.992567982490671e-06, |
| "loss": 0.6481, |
| "step": 12850 |
| }, |
| { |
| "epoch": 0.6084062644621765, |
| "grad_norm": 1.9609375, |
| "learning_rate": 7.984695082586722e-06, |
| "loss": 0.6321, |
| "step": 12900 |
| }, |
| { |
| "epoch": 0.6107644282779214, |
| "grad_norm": 1.96875, |
| "learning_rate": 7.97682218268277e-06, |
| "loss": 0.6405, |
| "step": 12950 |
| }, |
| { |
| "epoch": 0.6131225920936663, |
| "grad_norm": 1.8671875, |
| "learning_rate": 7.968949282778819e-06, |
| "loss": 0.6287, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.6131225920936663, |
| "eval_loss": 0.6417333483695984, |
| "eval_runtime": 469.8773, |
| "eval_samples_per_second": 75.999, |
| "eval_steps_per_second": 37.999, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.6154807559094111, |
| "grad_norm": 1.8515625, |
| "learning_rate": 7.961076382874869e-06, |
| "loss": 0.6433, |
| "step": 13050 |
| }, |
| { |
| "epoch": 0.617838919725156, |
| "grad_norm": 1.8203125, |
| "learning_rate": 7.953203482970918e-06, |
| "loss": 0.6285, |
| "step": 13100 |
| }, |
| { |
| "epoch": 0.6201970835409009, |
| "grad_norm": 2.125, |
| "learning_rate": 7.945330583066968e-06, |
| "loss": 0.6368, |
| "step": 13150 |
| }, |
| { |
| "epoch": 0.6225552473566458, |
| "grad_norm": 1.8828125, |
| "learning_rate": 7.937457683163016e-06, |
| "loss": 0.6379, |
| "step": 13200 |
| }, |
| { |
| "epoch": 0.6249134111723906, |
| "grad_norm": 1.8359375, |
| "learning_rate": 7.929584783259067e-06, |
| "loss": 0.6415, |
| "step": 13250 |
| }, |
| { |
| "epoch": 0.6272715749881355, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.921711883355115e-06, |
| "loss": 0.6394, |
| "step": 13300 |
| }, |
| { |
| "epoch": 0.6296297388038804, |
| "grad_norm": 1.890625, |
| "learning_rate": 7.913838983451165e-06, |
| "loss": 0.6415, |
| "step": 13350 |
| }, |
| { |
| "epoch": 0.6319879026196252, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.905966083547214e-06, |
| "loss": 0.6376, |
| "step": 13400 |
| }, |
| { |
| "epoch": 0.6343460664353701, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.898093183643264e-06, |
| "loss": 0.6344, |
| "step": 13450 |
| }, |
| { |
| "epoch": 0.636704230251115, |
| "grad_norm": 1.9140625, |
| "learning_rate": 7.890220283739313e-06, |
| "loss": 0.635, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.636704230251115, |
| "eval_loss": 0.6417108774185181, |
| "eval_runtime": 473.8052, |
| "eval_samples_per_second": 75.369, |
| "eval_steps_per_second": 37.684, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.6390623940668598, |
| "grad_norm": 2.015625, |
| "learning_rate": 7.882347383835363e-06, |
| "loss": 0.637, |
| "step": 13550 |
| }, |
| { |
| "epoch": 0.6414205578826047, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.874474483931413e-06, |
| "loss": 0.6376, |
| "step": 13600 |
| }, |
| { |
| "epoch": 0.6437787216983496, |
| "grad_norm": 1.921875, |
| "learning_rate": 7.866601584027462e-06, |
| "loss": 0.6297, |
| "step": 13650 |
| }, |
| { |
| "epoch": 0.6461368855140944, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.858728684123512e-06, |
| "loss": 0.6184, |
| "step": 13700 |
| }, |
| { |
| "epoch": 0.6484950493298394, |
| "grad_norm": 1.78125, |
| "learning_rate": 7.85085578421956e-06, |
| "loss": 0.6494, |
| "step": 13750 |
| }, |
| { |
| "epoch": 0.6508532131455842, |
| "grad_norm": 1.9453125, |
| "learning_rate": 7.842982884315609e-06, |
| "loss": 0.6466, |
| "step": 13800 |
| }, |
| { |
| "epoch": 0.653211376961329, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.83510998441166e-06, |
| "loss": 0.647, |
| "step": 13850 |
| }, |
| { |
| "epoch": 0.655569540777074, |
| "grad_norm": 1.7578125, |
| "learning_rate": 7.827237084507708e-06, |
| "loss": 0.6332, |
| "step": 13900 |
| }, |
| { |
| "epoch": 0.6579277045928188, |
| "grad_norm": 1.6328125, |
| "learning_rate": 7.819364184603756e-06, |
| "loss": 0.6253, |
| "step": 13950 |
| }, |
| { |
| "epoch": 0.6602858684085636, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.811491284699807e-06, |
| "loss": 0.6401, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.6602858684085636, |
| "eval_loss": 0.641613781452179, |
| "eval_runtime": 470.3187, |
| "eval_samples_per_second": 75.927, |
| "eval_steps_per_second": 37.964, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.6626440322243086, |
| "grad_norm": 1.796875, |
| "learning_rate": 7.803618384795857e-06, |
| "loss": 0.6275, |
| "step": 14050 |
| }, |
| { |
| "epoch": 0.6650021960400534, |
| "grad_norm": 1.8125, |
| "learning_rate": 7.795745484891905e-06, |
| "loss": 0.6356, |
| "step": 14100 |
| }, |
| { |
| "epoch": 0.6673603598557983, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.787872584987956e-06, |
| "loss": 0.6431, |
| "step": 14150 |
| }, |
| { |
| "epoch": 0.6697185236715432, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.779999685084004e-06, |
| "loss": 0.6406, |
| "step": 14200 |
| }, |
| { |
| "epoch": 0.672076687487288, |
| "grad_norm": 1.7734375, |
| "learning_rate": 7.772126785180054e-06, |
| "loss": 0.6532, |
| "step": 14250 |
| }, |
| { |
| "epoch": 0.6744348513030329, |
| "grad_norm": 2.015625, |
| "learning_rate": 7.764253885276103e-06, |
| "loss": 0.6331, |
| "step": 14300 |
| }, |
| { |
| "epoch": 0.6767930151187778, |
| "grad_norm": 1.859375, |
| "learning_rate": 7.756380985372153e-06, |
| "loss": 0.6451, |
| "step": 14350 |
| }, |
| { |
| "epoch": 0.6791511789345226, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.748508085468202e-06, |
| "loss": 0.6376, |
| "step": 14400 |
| }, |
| { |
| "epoch": 0.6815093427502675, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.740635185564252e-06, |
| "loss": 0.6391, |
| "step": 14450 |
| }, |
| { |
| "epoch": 0.6838675065660124, |
| "grad_norm": 1.734375, |
| "learning_rate": 7.7327622856603e-06, |
| "loss": 0.6455, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.6838675065660124, |
| "eval_loss": 0.6413908004760742, |
| "eval_runtime": 472.9873, |
| "eval_samples_per_second": 75.499, |
| "eval_steps_per_second": 37.749, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.6862256703817572, |
| "grad_norm": 1.8125, |
| "learning_rate": 7.72488938575635e-06, |
| "loss": 0.657, |
| "step": 14550 |
| }, |
| { |
| "epoch": 0.6885838341975021, |
| "grad_norm": 2.25, |
| "learning_rate": 7.7170164858524e-06, |
| "loss": 0.6424, |
| "step": 14600 |
| }, |
| { |
| "epoch": 0.690941998013247, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.70914358594845e-06, |
| "loss": 0.6374, |
| "step": 14650 |
| }, |
| { |
| "epoch": 0.6933001618289919, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.701270686044498e-06, |
| "loss": 0.636, |
| "step": 14700 |
| }, |
| { |
| "epoch": 0.6956583256447367, |
| "grad_norm": 1.890625, |
| "learning_rate": 7.693397786140547e-06, |
| "loss": 0.6358, |
| "step": 14750 |
| }, |
| { |
| "epoch": 0.6980164894604816, |
| "grad_norm": 1.921875, |
| "learning_rate": 7.685524886236597e-06, |
| "loss": 0.6299, |
| "step": 14800 |
| }, |
| { |
| "epoch": 0.7003746532762265, |
| "grad_norm": 1.9375, |
| "learning_rate": 7.677651986332645e-06, |
| "loss": 0.6294, |
| "step": 14850 |
| }, |
| { |
| "epoch": 0.7027328170919713, |
| "grad_norm": 1.8359375, |
| "learning_rate": 7.669779086428696e-06, |
| "loss": 0.6389, |
| "step": 14900 |
| }, |
| { |
| "epoch": 0.7050909809077162, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.661906186524744e-06, |
| "loss": 0.6359, |
| "step": 14950 |
| }, |
| { |
| "epoch": 0.7074491447234611, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.654033286620794e-06, |
| "loss": 0.6476, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.7074491447234611, |
| "eval_loss": 0.641543447971344, |
| "eval_runtime": 470.9999, |
| "eval_samples_per_second": 75.817, |
| "eval_steps_per_second": 37.909, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.7098073085392059, |
| "grad_norm": 1.9609375, |
| "learning_rate": 7.646160386716845e-06, |
| "loss": 0.6437, |
| "step": 15050 |
| }, |
| { |
| "epoch": 0.7121654723549509, |
| "grad_norm": 1.8515625, |
| "learning_rate": 7.638287486812893e-06, |
| "loss": 0.6388, |
| "step": 15100 |
| }, |
| { |
| "epoch": 0.7145236361706957, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.630414586908943e-06, |
| "loss": 0.6578, |
| "step": 15150 |
| }, |
| { |
| "epoch": 0.7168817999864405, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.622541687004992e-06, |
| "loss": 0.6549, |
| "step": 15200 |
| }, |
| { |
| "epoch": 0.7192399638021855, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.614668787101041e-06, |
| "loss": 0.6396, |
| "step": 15250 |
| }, |
| { |
| "epoch": 0.7215981276179303, |
| "grad_norm": 2.125, |
| "learning_rate": 7.606795887197091e-06, |
| "loss": 0.6534, |
| "step": 15300 |
| }, |
| { |
| "epoch": 0.7239562914336751, |
| "grad_norm": 1.84375, |
| "learning_rate": 7.59892298729314e-06, |
| "loss": 0.6353, |
| "step": 15350 |
| }, |
| { |
| "epoch": 0.7263144552494201, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.591050087389189e-06, |
| "loss": 0.641, |
| "step": 15400 |
| }, |
| { |
| "epoch": 0.7286726190651649, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.583177187485239e-06, |
| "loss": 0.6392, |
| "step": 15450 |
| }, |
| { |
| "epoch": 0.7310307828809097, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.575304287581289e-06, |
| "loss": 0.6425, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.7310307828809097, |
| "eval_loss": 0.6413320899009705, |
| "eval_runtime": 472.198, |
| "eval_samples_per_second": 75.625, |
| "eval_steps_per_second": 37.813, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.7333889466966547, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.567431387677338e-06, |
| "loss": 0.6468, |
| "step": 15550 |
| }, |
| { |
| "epoch": 0.7357471105123995, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.559558487773387e-06, |
| "loss": 0.6364, |
| "step": 15600 |
| }, |
| { |
| "epoch": 0.7381052743281444, |
| "grad_norm": 1.9140625, |
| "learning_rate": 7.5516855878694365e-06, |
| "loss": 0.6297, |
| "step": 15650 |
| }, |
| { |
| "epoch": 0.7404634381438893, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.543812687965486e-06, |
| "loss": 0.6364, |
| "step": 15700 |
| }, |
| { |
| "epoch": 0.7428216019596341, |
| "grad_norm": 1.78125, |
| "learning_rate": 7.5359397880615344e-06, |
| "loss": 0.6347, |
| "step": 15750 |
| }, |
| { |
| "epoch": 0.745179765775379, |
| "grad_norm": 1.8359375, |
| "learning_rate": 7.528066888157585e-06, |
| "loss": 0.6454, |
| "step": 15800 |
| }, |
| { |
| "epoch": 0.7475379295911239, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.520193988253633e-06, |
| "loss": 0.6375, |
| "step": 15850 |
| }, |
| { |
| "epoch": 0.7498960934068687, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.5123210883496835e-06, |
| "loss": 0.6398, |
| "step": 15900 |
| }, |
| { |
| "epoch": 0.7522542572226136, |
| "grad_norm": 1.7109375, |
| "learning_rate": 7.504448188445733e-06, |
| "loss": 0.6281, |
| "step": 15950 |
| }, |
| { |
| "epoch": 0.7546124210383585, |
| "grad_norm": 1.9296875, |
| "learning_rate": 7.496575288541782e-06, |
| "loss": 0.6447, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.7546124210383585, |
| "eval_loss": 0.6413915157318115, |
| "eval_runtime": 472.6251, |
| "eval_samples_per_second": 75.557, |
| "eval_steps_per_second": 37.778, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.7569705848541034, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.488702388637832e-06, |
| "loss": 0.6496, |
| "step": 16050 |
| }, |
| { |
| "epoch": 0.7593287486698482, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.48082948873388e-06, |
| "loss": 0.6417, |
| "step": 16100 |
| }, |
| { |
| "epoch": 0.7616869124855931, |
| "grad_norm": 1.890625, |
| "learning_rate": 7.4729565888299304e-06, |
| "loss": 0.6509, |
| "step": 16150 |
| }, |
| { |
| "epoch": 0.764045076301338, |
| "grad_norm": 1.7265625, |
| "learning_rate": 7.465083688925979e-06, |
| "loss": 0.6329, |
| "step": 16200 |
| }, |
| { |
| "epoch": 0.7664032401170828, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.457210789022029e-06, |
| "loss": 0.6424, |
| "step": 16250 |
| }, |
| { |
| "epoch": 0.7687614039328277, |
| "grad_norm": 1.9296875, |
| "learning_rate": 7.449337889118078e-06, |
| "loss": 0.6315, |
| "step": 16300 |
| }, |
| { |
| "epoch": 0.7711195677485726, |
| "grad_norm": 1.8828125, |
| "learning_rate": 7.441464989214128e-06, |
| "loss": 0.6321, |
| "step": 16350 |
| }, |
| { |
| "epoch": 0.7734777315643174, |
| "grad_norm": 1.96875, |
| "learning_rate": 7.433592089310177e-06, |
| "loss": 0.6324, |
| "step": 16400 |
| }, |
| { |
| "epoch": 0.7758358953800624, |
| "grad_norm": 1.8125, |
| "learning_rate": 7.425719189406227e-06, |
| "loss": 0.6436, |
| "step": 16450 |
| }, |
| { |
| "epoch": 0.7781940591958072, |
| "grad_norm": 1.828125, |
| "learning_rate": 7.417846289502276e-06, |
| "loss": 0.6555, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.7781940591958072, |
| "eval_loss": 0.6413031220436096, |
| "eval_runtime": 472.5746, |
| "eval_samples_per_second": 75.565, |
| "eval_steps_per_second": 37.782, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.780552223011552, |
| "grad_norm": 1.734375, |
| "learning_rate": 7.409973389598325e-06, |
| "loss": 0.6379, |
| "step": 16550 |
| }, |
| { |
| "epoch": 0.782910386827297, |
| "grad_norm": 1.859375, |
| "learning_rate": 7.402100489694375e-06, |
| "loss": 0.6414, |
| "step": 16600 |
| }, |
| { |
| "epoch": 0.7852685506430418, |
| "grad_norm": 1.8203125, |
| "learning_rate": 7.3942275897904235e-06, |
| "loss": 0.6404, |
| "step": 16650 |
| }, |
| { |
| "epoch": 0.7876267144587866, |
| "grad_norm": 1.765625, |
| "learning_rate": 7.386354689886474e-06, |
| "loss": 0.6417, |
| "step": 16700 |
| }, |
| { |
| "epoch": 0.7899848782745316, |
| "grad_norm": 1.78125, |
| "learning_rate": 7.378481789982522e-06, |
| "loss": 0.6353, |
| "step": 16750 |
| }, |
| { |
| "epoch": 0.7923430420902764, |
| "grad_norm": 1.859375, |
| "learning_rate": 7.3706088900785725e-06, |
| "loss": 0.6238, |
| "step": 16800 |
| }, |
| { |
| "epoch": 0.7947012059060212, |
| "grad_norm": 1.8125, |
| "learning_rate": 7.362735990174622e-06, |
| "loss": 0.6422, |
| "step": 16850 |
| }, |
| { |
| "epoch": 0.7970593697217662, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.3548630902706705e-06, |
| "loss": 0.64, |
| "step": 16900 |
| }, |
| { |
| "epoch": 0.799417533537511, |
| "grad_norm": 2.0, |
| "learning_rate": 7.346990190366721e-06, |
| "loss": 0.6515, |
| "step": 16950 |
| }, |
| { |
| "epoch": 0.8017756973532559, |
| "grad_norm": 1.6171875, |
| "learning_rate": 7.339117290462769e-06, |
| "loss": 0.6322, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.8017756973532559, |
| "eval_loss": 0.6412806510925293, |
| "eval_runtime": 473.922, |
| "eval_samples_per_second": 75.35, |
| "eval_steps_per_second": 37.675, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.8041338611690008, |
| "grad_norm": 1.6796875, |
| "learning_rate": 7.3312443905588195e-06, |
| "loss": 0.6362, |
| "step": 17050 |
| }, |
| { |
| "epoch": 0.8064920249847456, |
| "grad_norm": 1.8515625, |
| "learning_rate": 7.323371490654868e-06, |
| "loss": 0.6358, |
| "step": 17100 |
| }, |
| { |
| "epoch": 0.8088501888004905, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.315498590750918e-06, |
| "loss": 0.6296, |
| "step": 17150 |
| }, |
| { |
| "epoch": 0.8112083526162354, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.307625690846967e-06, |
| "loss": 0.6368, |
| "step": 17200 |
| }, |
| { |
| "epoch": 0.8135665164319802, |
| "grad_norm": 1.890625, |
| "learning_rate": 7.299752790943016e-06, |
| "loss": 0.6366, |
| "step": 17250 |
| }, |
| { |
| "epoch": 0.8159246802477251, |
| "grad_norm": 1.734375, |
| "learning_rate": 7.2918798910390664e-06, |
| "loss": 0.6318, |
| "step": 17300 |
| }, |
| { |
| "epoch": 0.81828284406347, |
| "grad_norm": 2.4375, |
| "learning_rate": 7.284006991135115e-06, |
| "loss": 0.6331, |
| "step": 17350 |
| }, |
| { |
| "epoch": 0.8206410078792149, |
| "grad_norm": 1.7734375, |
| "learning_rate": 7.276134091231165e-06, |
| "loss": 0.641, |
| "step": 17400 |
| }, |
| { |
| "epoch": 0.8229991716949597, |
| "grad_norm": 2.125, |
| "learning_rate": 7.268261191327214e-06, |
| "loss": 0.6536, |
| "step": 17450 |
| }, |
| { |
| "epoch": 0.8253573355107046, |
| "grad_norm": 1.9296875, |
| "learning_rate": 7.260388291423264e-06, |
| "loss": 0.6396, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.8253573355107046, |
| "eval_loss": 0.6412404179573059, |
| "eval_runtime": 471.5495, |
| "eval_samples_per_second": 75.729, |
| "eval_steps_per_second": 37.865, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.8277154993264495, |
| "grad_norm": 1.890625, |
| "learning_rate": 7.2525153915193126e-06, |
| "loss": 0.6262, |
| "step": 17550 |
| }, |
| { |
| "epoch": 0.8300736631421943, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.244642491615362e-06, |
| "loss": 0.6445, |
| "step": 17600 |
| }, |
| { |
| "epoch": 0.8324318269579392, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.236769591711411e-06, |
| "loss": 0.6375, |
| "step": 17650 |
| }, |
| { |
| "epoch": 0.8347899907736841, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.228896691807461e-06, |
| "loss": 0.6377, |
| "step": 17700 |
| }, |
| { |
| "epoch": 0.8371481545894289, |
| "grad_norm": 1.890625, |
| "learning_rate": 7.221023791903511e-06, |
| "loss": 0.6472, |
| "step": 17750 |
| }, |
| { |
| "epoch": 0.8395063184051738, |
| "grad_norm": 2.015625, |
| "learning_rate": 7.2131508919995595e-06, |
| "loss": 0.6404, |
| "step": 17800 |
| }, |
| { |
| "epoch": 0.8418644822209187, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.20527799209561e-06, |
| "loss": 0.6257, |
| "step": 17850 |
| }, |
| { |
| "epoch": 0.8442226460366635, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.197405092191658e-06, |
| "loss": 0.636, |
| "step": 17900 |
| }, |
| { |
| "epoch": 0.8465808098524085, |
| "grad_norm": 1.984375, |
| "learning_rate": 7.1895321922877085e-06, |
| "loss": 0.6441, |
| "step": 17950 |
| }, |
| { |
| "epoch": 0.8489389736681533, |
| "grad_norm": 1.796875, |
| "learning_rate": 7.181659292383757e-06, |
| "loss": 0.6458, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.8489389736681533, |
| "eval_loss": 0.6412122249603271, |
| "eval_runtime": 478.107, |
| "eval_samples_per_second": 74.69, |
| "eval_steps_per_second": 37.345, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.8512971374838981, |
| "grad_norm": 1.8671875, |
| "learning_rate": 7.1737863924798065e-06, |
| "loss": 0.6376, |
| "step": 18050 |
| }, |
| { |
| "epoch": 0.8536553012996431, |
| "grad_norm": 1.9140625, |
| "learning_rate": 7.165913492575856e-06, |
| "loss": 0.6456, |
| "step": 18100 |
| }, |
| { |
| "epoch": 0.8560134651153879, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.158040592671905e-06, |
| "loss": 0.6409, |
| "step": 18150 |
| }, |
| { |
| "epoch": 0.8583716289311327, |
| "grad_norm": 1.71875, |
| "learning_rate": 7.150167692767954e-06, |
| "loss": 0.6282, |
| "step": 18200 |
| }, |
| { |
| "epoch": 0.8607297927468777, |
| "grad_norm": 1.640625, |
| "learning_rate": 7.142294792864004e-06, |
| "loss": 0.6502, |
| "step": 18250 |
| }, |
| { |
| "epoch": 0.8630879565626225, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.134421892960054e-06, |
| "loss": 0.6497, |
| "step": 18300 |
| }, |
| { |
| "epoch": 0.8654461203783674, |
| "grad_norm": 1.78125, |
| "learning_rate": 7.126548993056103e-06, |
| "loss": 0.6329, |
| "step": 18350 |
| }, |
| { |
| "epoch": 0.8678042841941123, |
| "grad_norm": 1.8359375, |
| "learning_rate": 7.118676093152152e-06, |
| "loss": 0.6363, |
| "step": 18400 |
| }, |
| { |
| "epoch": 0.8701624480098571, |
| "grad_norm": 1.921875, |
| "learning_rate": 7.110803193248202e-06, |
| "loss": 0.6356, |
| "step": 18450 |
| }, |
| { |
| "epoch": 0.872520611825602, |
| "grad_norm": 2.265625, |
| "learning_rate": 7.102930293344251e-06, |
| "loss": 0.6454, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.872520611825602, |
| "eval_loss": 0.6411958336830139, |
| "eval_runtime": 472.9637, |
| "eval_samples_per_second": 75.503, |
| "eval_steps_per_second": 37.751, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.8748787756413469, |
| "grad_norm": 1.765625, |
| "learning_rate": 7.0950573934402996e-06, |
| "loss": 0.6335, |
| "step": 18550 |
| }, |
| { |
| "epoch": 0.8772369394570917, |
| "grad_norm": 1.984375, |
| "learning_rate": 7.08718449353635e-06, |
| "loss": 0.6538, |
| "step": 18600 |
| }, |
| { |
| "epoch": 0.8795951032728366, |
| "grad_norm": 1.578125, |
| "learning_rate": 7.079311593632398e-06, |
| "loss": 0.6379, |
| "step": 18650 |
| }, |
| { |
| "epoch": 0.8819532670885815, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.071438693728449e-06, |
| "loss": 0.6519, |
| "step": 18700 |
| }, |
| { |
| "epoch": 0.8843114309043264, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.063565793824498e-06, |
| "loss": 0.6393, |
| "step": 18750 |
| }, |
| { |
| "epoch": 0.8866695947200712, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.055692893920547e-06, |
| "loss": 0.6461, |
| "step": 18800 |
| }, |
| { |
| "epoch": 0.8890277585358161, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.047819994016597e-06, |
| "loss": 0.6414, |
| "step": 18850 |
| }, |
| { |
| "epoch": 0.891385922351561, |
| "grad_norm": 1.9296875, |
| "learning_rate": 7.039947094112646e-06, |
| "loss": 0.6471, |
| "step": 18900 |
| }, |
| { |
| "epoch": 0.8937440861673058, |
| "grad_norm": 2.46875, |
| "learning_rate": 7.0320741942086955e-06, |
| "loss": 0.6467, |
| "step": 18950 |
| }, |
| { |
| "epoch": 0.8961022499830507, |
| "grad_norm": 1.96875, |
| "learning_rate": 7.024201294304744e-06, |
| "loss": 0.6279, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.8961022499830507, |
| "eval_loss": 0.6412160992622375, |
| "eval_runtime": 473.5786, |
| "eval_samples_per_second": 75.405, |
| "eval_steps_per_second": 37.702, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.8984604137987956, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.016328394400794e-06, |
| "loss": 0.6585, |
| "step": 19050 |
| }, |
| { |
| "epoch": 0.9008185776145404, |
| "grad_norm": 1.84375, |
| "learning_rate": 7.008455494496843e-06, |
| "loss": 0.6204, |
| "step": 19100 |
| }, |
| { |
| "epoch": 0.9031767414302853, |
| "grad_norm": 1.8671875, |
| "learning_rate": 7.000582594592893e-06, |
| "loss": 0.6424, |
| "step": 19150 |
| }, |
| { |
| "epoch": 0.9055349052460302, |
| "grad_norm": 2.15625, |
| "learning_rate": 6.9927096946889425e-06, |
| "loss": 0.6374, |
| "step": 19200 |
| }, |
| { |
| "epoch": 0.907893069061775, |
| "grad_norm": 2.28125, |
| "learning_rate": 6.984836794784992e-06, |
| "loss": 0.636, |
| "step": 19250 |
| }, |
| { |
| "epoch": 0.91025123287752, |
| "grad_norm": 2.078125, |
| "learning_rate": 6.976963894881041e-06, |
| "loss": 0.632, |
| "step": 19300 |
| }, |
| { |
| "epoch": 0.9126093966932648, |
| "grad_norm": 2.109375, |
| "learning_rate": 6.96909099497709e-06, |
| "loss": 0.6378, |
| "step": 19350 |
| }, |
| { |
| "epoch": 0.9149675605090096, |
| "grad_norm": 2.109375, |
| "learning_rate": 6.96121809507314e-06, |
| "loss": 0.6415, |
| "step": 19400 |
| }, |
| { |
| "epoch": 0.9173257243247546, |
| "grad_norm": 1.765625, |
| "learning_rate": 6.953345195169189e-06, |
| "loss": 0.6396, |
| "step": 19450 |
| }, |
| { |
| "epoch": 0.9196838881404994, |
| "grad_norm": 2.265625, |
| "learning_rate": 6.945472295265239e-06, |
| "loss": 0.6405, |
| "step": 19500 |
| }, |
| { |
| "epoch": 0.9196838881404994, |
| "eval_loss": 0.6411221623420715, |
| "eval_runtime": 471.8835, |
| "eval_samples_per_second": 75.675, |
| "eval_steps_per_second": 37.838, |
| "step": 19500 |
| }, |
| { |
| "epoch": 0.9220420519562442, |
| "grad_norm": 1.90625, |
| "learning_rate": 6.937599395361287e-06, |
| "loss": 0.6424, |
| "step": 19550 |
| }, |
| { |
| "epoch": 0.9244002157719892, |
| "grad_norm": 2.34375, |
| "learning_rate": 6.929726495457338e-06, |
| "loss": 0.6437, |
| "step": 19600 |
| }, |
| { |
| "epoch": 0.926758379587734, |
| "grad_norm": 1.9453125, |
| "learning_rate": 6.921853595553387e-06, |
| "loss": 0.6394, |
| "step": 19650 |
| }, |
| { |
| "epoch": 0.9291165434034789, |
| "grad_norm": 2.171875, |
| "learning_rate": 6.9139806956494356e-06, |
| "loss": 0.6403, |
| "step": 19700 |
| }, |
| { |
| "epoch": 0.9314747072192238, |
| "grad_norm": 1.828125, |
| "learning_rate": 6.906107795745486e-06, |
| "loss": 0.6372, |
| "step": 19750 |
| }, |
| { |
| "epoch": 0.9338328710349686, |
| "grad_norm": 1.984375, |
| "learning_rate": 6.898234895841534e-06, |
| "loss": 0.6433, |
| "step": 19800 |
| }, |
| { |
| "epoch": 0.9361910348507135, |
| "grad_norm": 2.171875, |
| "learning_rate": 6.890361995937585e-06, |
| "loss": 0.6428, |
| "step": 19850 |
| }, |
| { |
| "epoch": 0.9385491986664584, |
| "grad_norm": 2.015625, |
| "learning_rate": 6.882489096033633e-06, |
| "loss": 0.6396, |
| "step": 19900 |
| }, |
| { |
| "epoch": 0.9409073624822032, |
| "grad_norm": 2.03125, |
| "learning_rate": 6.874616196129683e-06, |
| "loss": 0.638, |
| "step": 19950 |
| }, |
| { |
| "epoch": 0.9432655262979481, |
| "grad_norm": 1.71875, |
| "learning_rate": 6.866743296225732e-06, |
| "loss": 0.6332, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.9432655262979481, |
| "eval_loss": 0.6411899328231812, |
| "eval_runtime": 472.9551, |
| "eval_samples_per_second": 75.504, |
| "eval_steps_per_second": 37.752, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.945623690113693, |
| "grad_norm": 2.109375, |
| "learning_rate": 6.858870396321782e-06, |
| "loss": 0.6313, |
| "step": 20050 |
| }, |
| { |
| "epoch": 0.9479818539294378, |
| "grad_norm": 2.390625, |
| "learning_rate": 6.8509974964178316e-06, |
| "loss": 0.641, |
| "step": 20100 |
| }, |
| { |
| "epoch": 0.9503400177451827, |
| "grad_norm": 2.375, |
| "learning_rate": 6.84312459651388e-06, |
| "loss": 0.6418, |
| "step": 20150 |
| }, |
| { |
| "epoch": 0.9526981815609276, |
| "grad_norm": 1.7421875, |
| "learning_rate": 6.83525169660993e-06, |
| "loss": 0.6373, |
| "step": 20200 |
| }, |
| { |
| "epoch": 0.9550563453766725, |
| "grad_norm": 1.796875, |
| "learning_rate": 6.827378796705979e-06, |
| "loss": 0.6461, |
| "step": 20250 |
| }, |
| { |
| "epoch": 0.9574145091924173, |
| "grad_norm": 2.0625, |
| "learning_rate": 6.819505896802029e-06, |
| "loss": 0.6497, |
| "step": 20300 |
| }, |
| { |
| "epoch": 0.9597726730081622, |
| "grad_norm": 1.8046875, |
| "learning_rate": 6.811632996898078e-06, |
| "loss": 0.6433, |
| "step": 20350 |
| }, |
| { |
| "epoch": 0.9621308368239071, |
| "grad_norm": 2.0625, |
| "learning_rate": 6.803760096994128e-06, |
| "loss": 0.6368, |
| "step": 20400 |
| }, |
| { |
| "epoch": 0.9644890006396519, |
| "grad_norm": 2.15625, |
| "learning_rate": 6.7958871970901765e-06, |
| "loss": 0.6492, |
| "step": 20450 |
| }, |
| { |
| "epoch": 0.9668471644553968, |
| "grad_norm": 2.0, |
| "learning_rate": 6.788014297186226e-06, |
| "loss": 0.6398, |
| "step": 20500 |
| }, |
| { |
| "epoch": 0.9668471644553968, |
| "eval_loss": 0.6411147713661194, |
| "eval_runtime": 471.0929, |
| "eval_samples_per_second": 75.802, |
| "eval_steps_per_second": 37.901, |
| "step": 20500 |
| }, |
| { |
| "epoch": 0.9692053282711417, |
| "grad_norm": 1.9140625, |
| "learning_rate": 6.780141397282276e-06, |
| "loss": 0.6608, |
| "step": 20550 |
| }, |
| { |
| "epoch": 0.9715634920868865, |
| "grad_norm": 1.8359375, |
| "learning_rate": 6.772268497378325e-06, |
| "loss": 0.6324, |
| "step": 20600 |
| }, |
| { |
| "epoch": 0.9739216559026315, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.764395597474375e-06, |
| "loss": 0.6338, |
| "step": 20650 |
| }, |
| { |
| "epoch": 0.9762798197183763, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.756522697570423e-06, |
| "loss": 0.6476, |
| "step": 20700 |
| }, |
| { |
| "epoch": 0.9786379835341211, |
| "grad_norm": 1.984375, |
| "learning_rate": 6.748649797666474e-06, |
| "loss": 0.6364, |
| "step": 20750 |
| }, |
| { |
| "epoch": 0.9809961473498661, |
| "grad_norm": 2.078125, |
| "learning_rate": 6.740776897762522e-06, |
| "loss": 0.6292, |
| "step": 20800 |
| }, |
| { |
| "epoch": 0.9833543111656109, |
| "grad_norm": 2.234375, |
| "learning_rate": 6.732903997858572e-06, |
| "loss": 0.6336, |
| "step": 20850 |
| }, |
| { |
| "epoch": 0.9857124749813557, |
| "grad_norm": 2.359375, |
| "learning_rate": 6.725031097954621e-06, |
| "loss": 0.6448, |
| "step": 20900 |
| }, |
| { |
| "epoch": 0.9880706387971007, |
| "grad_norm": 1.9375, |
| "learning_rate": 6.71715819805067e-06, |
| "loss": 0.6408, |
| "step": 20950 |
| }, |
| { |
| "epoch": 0.9904288026128455, |
| "grad_norm": 1.9296875, |
| "learning_rate": 6.70928529814672e-06, |
| "loss": 0.6433, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.9904288026128455, |
| "eval_loss": 0.6410422325134277, |
| "eval_runtime": 474.7996, |
| "eval_samples_per_second": 75.211, |
| "eval_steps_per_second": 37.605, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.9927869664285903, |
| "grad_norm": 2.15625, |
| "learning_rate": 6.701412398242769e-06, |
| "loss": 0.6331, |
| "step": 21050 |
| }, |
| { |
| "epoch": 0.9951451302443353, |
| "grad_norm": 1.765625, |
| "learning_rate": 6.693539498338819e-06, |
| "loss": 0.6354, |
| "step": 21100 |
| }, |
| { |
| "epoch": 0.9975032940600801, |
| "grad_norm": 1.734375, |
| "learning_rate": 6.685666598434868e-06, |
| "loss": 0.6303, |
| "step": 21150 |
| }, |
| { |
| "epoch": 0.999861457875825, |
| "grad_norm": 2.28125, |
| "learning_rate": 6.677793698530917e-06, |
| "loss": 0.659, |
| "step": 21200 |
| }, |
| { |
| "epoch": 1.0022166739868001, |
| "grad_norm": 1.421875, |
| "learning_rate": 6.669920798626967e-06, |
| "loss": 0.6318, |
| "step": 21250 |
| }, |
| { |
| "epoch": 1.004574837802545, |
| "grad_norm": 1.1640625, |
| "learning_rate": 6.662047898723016e-06, |
| "loss": 0.6472, |
| "step": 21300 |
| }, |
| { |
| "epoch": 1.00693300161829, |
| "grad_norm": 1.0703125, |
| "learning_rate": 6.6541749988190655e-06, |
| "loss": 0.6326, |
| "step": 21350 |
| }, |
| { |
| "epoch": 1.0092911654340349, |
| "grad_norm": 1.21875, |
| "learning_rate": 6.646302098915115e-06, |
| "loss": 0.6432, |
| "step": 21400 |
| }, |
| { |
| "epoch": 1.0116493292497797, |
| "grad_norm": 1.2265625, |
| "learning_rate": 6.6384291990111635e-06, |
| "loss": 0.6514, |
| "step": 21450 |
| }, |
| { |
| "epoch": 1.0140074930655245, |
| "grad_norm": 1.3125, |
| "learning_rate": 6.630556299107214e-06, |
| "loss": 0.6438, |
| "step": 21500 |
| }, |
| { |
| "epoch": 1.0140074930655245, |
| "eval_loss": 0.6410008668899536, |
| "eval_runtime": 473.6601, |
| "eval_samples_per_second": 75.392, |
| "eval_steps_per_second": 37.696, |
| "step": 21500 |
| }, |
| { |
| "epoch": 1.0163656568812693, |
| "grad_norm": 1.3046875, |
| "learning_rate": 6.622683399203264e-06, |
| "loss": 0.6499, |
| "step": 21550 |
| }, |
| { |
| "epoch": 1.0187238206970142, |
| "grad_norm": 1.265625, |
| "learning_rate": 6.6148104992993125e-06, |
| "loss": 0.634, |
| "step": 21600 |
| }, |
| { |
| "epoch": 1.0210819845127592, |
| "grad_norm": 1.3125, |
| "learning_rate": 6.606937599395362e-06, |
| "loss": 0.6428, |
| "step": 21650 |
| }, |
| { |
| "epoch": 1.023440148328504, |
| "grad_norm": 1.125, |
| "learning_rate": 6.599064699491411e-06, |
| "loss": 0.6375, |
| "step": 21700 |
| }, |
| { |
| "epoch": 1.0257983121442489, |
| "grad_norm": 1.203125, |
| "learning_rate": 6.591191799587461e-06, |
| "loss": 0.6451, |
| "step": 21750 |
| }, |
| { |
| "epoch": 1.0281564759599937, |
| "grad_norm": 1.1640625, |
| "learning_rate": 6.583318899683509e-06, |
| "loss": 0.631, |
| "step": 21800 |
| }, |
| { |
| "epoch": 1.0305146397757385, |
| "grad_norm": 1.203125, |
| "learning_rate": 6.5754459997795594e-06, |
| "loss": 0.6653, |
| "step": 21850 |
| }, |
| { |
| "epoch": 1.0328728035914836, |
| "grad_norm": 1.15625, |
| "learning_rate": 6.567573099875608e-06, |
| "loss": 0.6344, |
| "step": 21900 |
| }, |
| { |
| "epoch": 1.0352309674072284, |
| "grad_norm": 1.1953125, |
| "learning_rate": 6.559700199971658e-06, |
| "loss": 0.6349, |
| "step": 21950 |
| }, |
| { |
| "epoch": 1.0375891312229732, |
| "grad_norm": 1.2890625, |
| "learning_rate": 6.551827300067708e-06, |
| "loss": 0.6457, |
| "step": 22000 |
| }, |
| { |
| "epoch": 1.0375891312229732, |
| "eval_loss": 0.6410118937492371, |
| "eval_runtime": 473.0113, |
| "eval_samples_per_second": 75.495, |
| "eval_steps_per_second": 37.748, |
| "step": 22000 |
| }, |
| { |
| "epoch": 1.039947295038718, |
| "grad_norm": 1.2421875, |
| "learning_rate": 6.543954400163757e-06, |
| "loss": 0.637, |
| "step": 22050 |
| }, |
| { |
| "epoch": 1.042305458854463, |
| "grad_norm": 1.1484375, |
| "learning_rate": 6.536081500259806e-06, |
| "loss": 0.6337, |
| "step": 22100 |
| }, |
| { |
| "epoch": 1.0446636226702077, |
| "grad_norm": 1.2109375, |
| "learning_rate": 6.528208600355855e-06, |
| "loss": 0.643, |
| "step": 22150 |
| }, |
| { |
| "epoch": 1.0470217864859528, |
| "grad_norm": 1.2109375, |
| "learning_rate": 6.520335700451905e-06, |
| "loss": 0.6347, |
| "step": 22200 |
| }, |
| { |
| "epoch": 1.0493799503016976, |
| "grad_norm": 1.15625, |
| "learning_rate": 6.512462800547954e-06, |
| "loss": 0.6531, |
| "step": 22250 |
| }, |
| { |
| "epoch": 1.0517381141174424, |
| "grad_norm": 1.1484375, |
| "learning_rate": 6.504589900644004e-06, |
| "loss": 0.6465, |
| "step": 22300 |
| }, |
| { |
| "epoch": 1.0540962779331873, |
| "grad_norm": 1.078125, |
| "learning_rate": 6.4967170007400525e-06, |
| "loss": 0.6229, |
| "step": 22350 |
| }, |
| { |
| "epoch": 1.056454441748932, |
| "grad_norm": 1.109375, |
| "learning_rate": 6.488844100836103e-06, |
| "loss": 0.6405, |
| "step": 22400 |
| }, |
| { |
| "epoch": 1.0588126055646772, |
| "grad_norm": 1.25, |
| "learning_rate": 6.480971200932152e-06, |
| "loss": 0.6283, |
| "step": 22450 |
| }, |
| { |
| "epoch": 1.061170769380422, |
| "grad_norm": 1.234375, |
| "learning_rate": 6.4730983010282015e-06, |
| "loss": 0.6441, |
| "step": 22500 |
| }, |
| { |
| "epoch": 1.061170769380422, |
| "eval_loss": 0.6409561634063721, |
| "eval_runtime": 473.1309, |
| "eval_samples_per_second": 75.476, |
| "eval_steps_per_second": 37.738, |
| "step": 22500 |
| }, |
| { |
| "epoch": 1.0635289331961668, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.465225401124251e-06, |
| "loss": 0.6365, |
| "step": 22550 |
| }, |
| { |
| "epoch": 1.0658870970119116, |
| "grad_norm": 1.375, |
| "learning_rate": 6.4573525012202995e-06, |
| "loss": 0.6447, |
| "step": 22600 |
| }, |
| { |
| "epoch": 1.0682452608276565, |
| "grad_norm": 1.2265625, |
| "learning_rate": 6.44947960131635e-06, |
| "loss": 0.6376, |
| "step": 22650 |
| }, |
| { |
| "epoch": 1.0706034246434015, |
| "grad_norm": 1.265625, |
| "learning_rate": 6.441606701412398e-06, |
| "loss": 0.6293, |
| "step": 22700 |
| }, |
| { |
| "epoch": 1.0729615884591464, |
| "grad_norm": 1.0625, |
| "learning_rate": 6.4337338015084485e-06, |
| "loss": 0.62, |
| "step": 22750 |
| }, |
| { |
| "epoch": 1.0753197522748912, |
| "grad_norm": 1.4609375, |
| "learning_rate": 6.425860901604497e-06, |
| "loss": 0.6379, |
| "step": 22800 |
| }, |
| { |
| "epoch": 1.077677916090636, |
| "grad_norm": 1.15625, |
| "learning_rate": 6.417988001700547e-06, |
| "loss": 0.6271, |
| "step": 22850 |
| }, |
| { |
| "epoch": 1.0800360799063808, |
| "grad_norm": 1.25, |
| "learning_rate": 6.410115101796597e-06, |
| "loss": 0.6341, |
| "step": 22900 |
| }, |
| { |
| "epoch": 1.0823942437221257, |
| "grad_norm": 1.1484375, |
| "learning_rate": 6.402242201892645e-06, |
| "loss": 0.6425, |
| "step": 22950 |
| }, |
| { |
| "epoch": 1.0847524075378707, |
| "grad_norm": 1.359375, |
| "learning_rate": 6.3943693019886955e-06, |
| "loss": 0.6442, |
| "step": 23000 |
| }, |
| { |
| "epoch": 1.0847524075378707, |
| "eval_loss": 0.6410566568374634, |
| "eval_runtime": 470.874, |
| "eval_samples_per_second": 75.838, |
| "eval_steps_per_second": 37.919, |
| "step": 23000 |
| }, |
| { |
| "epoch": 1.0871105713536156, |
| "grad_norm": 1.3984375, |
| "learning_rate": 6.386496402084744e-06, |
| "loss": 0.637, |
| "step": 23050 |
| }, |
| { |
| "epoch": 1.0894687351693604, |
| "grad_norm": 1.25, |
| "learning_rate": 6.378623502180794e-06, |
| "loss": 0.6278, |
| "step": 23100 |
| }, |
| { |
| "epoch": 1.0918268989851052, |
| "grad_norm": 1.25, |
| "learning_rate": 6.370750602276843e-06, |
| "loss": 0.6419, |
| "step": 23150 |
| }, |
| { |
| "epoch": 1.09418506280085, |
| "grad_norm": 1.4453125, |
| "learning_rate": 6.362877702372893e-06, |
| "loss": 0.639, |
| "step": 23200 |
| }, |
| { |
| "epoch": 1.096543226616595, |
| "grad_norm": 1.1875, |
| "learning_rate": 6.3550048024689416e-06, |
| "loss": 0.6232, |
| "step": 23250 |
| }, |
| { |
| "epoch": 1.09890139043234, |
| "grad_norm": 1.2578125, |
| "learning_rate": 6.347131902564991e-06, |
| "loss": 0.6294, |
| "step": 23300 |
| }, |
| { |
| "epoch": 1.1012595542480847, |
| "grad_norm": 1.265625, |
| "learning_rate": 6.339259002661041e-06, |
| "loss": 0.6486, |
| "step": 23350 |
| }, |
| { |
| "epoch": 1.1036177180638296, |
| "grad_norm": 1.1875, |
| "learning_rate": 6.33138610275709e-06, |
| "loss": 0.6347, |
| "step": 23400 |
| }, |
| { |
| "epoch": 1.1059758818795744, |
| "grad_norm": 1.1328125, |
| "learning_rate": 6.32351320285314e-06, |
| "loss": 0.633, |
| "step": 23450 |
| }, |
| { |
| "epoch": 1.1083340456953192, |
| "grad_norm": 1.109375, |
| "learning_rate": 6.3156403029491885e-06, |
| "loss": 0.6377, |
| "step": 23500 |
| }, |
| { |
| "epoch": 1.1083340456953192, |
| "eval_loss": 0.6410402655601501, |
| "eval_runtime": 473.9522, |
| "eval_samples_per_second": 75.345, |
| "eval_steps_per_second": 37.673, |
| "step": 23500 |
| }, |
| { |
| "epoch": 1.1106922095110643, |
| "grad_norm": 1.21875, |
| "learning_rate": 6.307767403045239e-06, |
| "loss": 0.6332, |
| "step": 23550 |
| }, |
| { |
| "epoch": 1.1130503733268091, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.299894503141287e-06, |
| "loss": 0.64, |
| "step": 23600 |
| }, |
| { |
| "epoch": 1.115408537142554, |
| "grad_norm": 1.234375, |
| "learning_rate": 6.2920216032373375e-06, |
| "loss": 0.6496, |
| "step": 23650 |
| }, |
| { |
| "epoch": 1.1177667009582988, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.284148703333386e-06, |
| "loss": 0.6391, |
| "step": 23700 |
| }, |
| { |
| "epoch": 1.1201248647740436, |
| "grad_norm": 1.1640625, |
| "learning_rate": 6.2762758034294355e-06, |
| "loss": 0.6297, |
| "step": 23750 |
| }, |
| { |
| "epoch": 1.1224830285897887, |
| "grad_norm": 1.2265625, |
| "learning_rate": 6.268402903525486e-06, |
| "loss": 0.63, |
| "step": 23800 |
| }, |
| { |
| "epoch": 1.1248411924055335, |
| "grad_norm": 1.1875, |
| "learning_rate": 6.260530003621534e-06, |
| "loss": 0.6436, |
| "step": 23850 |
| }, |
| { |
| "epoch": 1.1271993562212783, |
| "grad_norm": 1.171875, |
| "learning_rate": 6.2526571037175845e-06, |
| "loss": 0.6371, |
| "step": 23900 |
| }, |
| { |
| "epoch": 1.1295575200370231, |
| "grad_norm": 1.1484375, |
| "learning_rate": 6.244784203813633e-06, |
| "loss": 0.6367, |
| "step": 23950 |
| }, |
| { |
| "epoch": 1.131915683852768, |
| "grad_norm": 1.2109375, |
| "learning_rate": 6.236911303909683e-06, |
| "loss": 0.6365, |
| "step": 24000 |
| }, |
| { |
| "epoch": 1.131915683852768, |
| "eval_loss": 0.6410369277000427, |
| "eval_runtime": 475.7391, |
| "eval_samples_per_second": 75.062, |
| "eval_steps_per_second": 37.531, |
| "step": 24000 |
| }, |
| { |
| "epoch": 1.134273847668513, |
| "grad_norm": 1.2265625, |
| "learning_rate": 6.219245704224246e-06, |
| "loss": 0.6305, |
| "step": 24050 |
| }, |
| { |
| "epoch": 1.1366320114842579, |
| "grad_norm": 1.171875, |
| "learning_rate": 6.211385181342263e-06, |
| "loss": 0.6499, |
| "step": 24100 |
| }, |
| { |
| "epoch": 1.1389901753000027, |
| "grad_norm": 1.265625, |
| "learning_rate": 6.203524658460281e-06, |
| "loss": 0.6463, |
| "step": 24150 |
| }, |
| { |
| "epoch": 1.1413483391157475, |
| "grad_norm": 1.1640625, |
| "learning_rate": 6.195664135578299e-06, |
| "loss": 0.6432, |
| "step": 24200 |
| }, |
| { |
| "epoch": 1.1437065029314923, |
| "grad_norm": 1.34375, |
| "learning_rate": 6.187803612696317e-06, |
| "loss": 0.6267, |
| "step": 24250 |
| }, |
| { |
| "epoch": 1.1460646667472372, |
| "grad_norm": 1.1484375, |
| "learning_rate": 6.179943089814336e-06, |
| "loss": 0.6306, |
| "step": 24300 |
| }, |
| { |
| "epoch": 1.1484228305629822, |
| "grad_norm": 1.1796875, |
| "learning_rate": 6.172082566932352e-06, |
| "loss": 0.6505, |
| "step": 24350 |
| }, |
| { |
| "epoch": 1.150780994378727, |
| "grad_norm": 1.3046875, |
| "learning_rate": 6.164222044050371e-06, |
| "loss": 0.6408, |
| "step": 24400 |
| }, |
| { |
| "epoch": 1.1531391581944719, |
| "grad_norm": 1.234375, |
| "learning_rate": 6.156361521168389e-06, |
| "loss": 0.6428, |
| "step": 24450 |
| }, |
| { |
| "epoch": 1.1554973220102167, |
| "grad_norm": 1.328125, |
| "learning_rate": 6.1485009982864065e-06, |
| "loss": 0.6304, |
| "step": 24500 |
| }, |
| { |
| "epoch": 1.1554973220102167, |
| "eval_loss": 0.6410101652145386, |
| "eval_runtime": 472.4338, |
| "eval_samples_per_second": 75.587, |
| "eval_steps_per_second": 37.794, |
| "step": 24500 |
| }, |
| { |
| "epoch": 1.1578554858259615, |
| "grad_norm": 1.265625, |
| "learning_rate": 6.140640475404425e-06, |
| "loss": 0.6399, |
| "step": 24550 |
| }, |
| { |
| "epoch": 1.1602136496417064, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.1327799525224415e-06, |
| "loss": 0.6397, |
| "step": 24600 |
| }, |
| { |
| "epoch": 1.1625718134574514, |
| "grad_norm": 1.1015625, |
| "learning_rate": 6.12491942964046e-06, |
| "loss": 0.6299, |
| "step": 24650 |
| }, |
| { |
| "epoch": 1.1649299772731962, |
| "grad_norm": 1.21875, |
| "learning_rate": 6.117058906758478e-06, |
| "loss": 0.6401, |
| "step": 24700 |
| }, |
| { |
| "epoch": 1.167288141088941, |
| "grad_norm": 1.265625, |
| "learning_rate": 6.109198383876496e-06, |
| "loss": 0.6473, |
| "step": 24750 |
| }, |
| { |
| "epoch": 1.169646304904686, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.101337860994514e-06, |
| "loss": 0.6371, |
| "step": 24800 |
| }, |
| { |
| "epoch": 1.1720044687204307, |
| "grad_norm": 1.125, |
| "learning_rate": 6.0934773381125325e-06, |
| "loss": 0.6357, |
| "step": 24850 |
| }, |
| { |
| "epoch": 1.1743626325361758, |
| "grad_norm": 1.3046875, |
| "learning_rate": 6.085616815230549e-06, |
| "loss": 0.6408, |
| "step": 24900 |
| }, |
| { |
| "epoch": 1.1767207963519206, |
| "grad_norm": 1.0859375, |
| "learning_rate": 6.0777562923485675e-06, |
| "loss": 0.6531, |
| "step": 24950 |
| }, |
| { |
| "epoch": 1.1790789601676654, |
| "grad_norm": 1.1796875, |
| "learning_rate": 6.069895769466585e-06, |
| "loss": 0.6298, |
| "step": 25000 |
| }, |
| { |
| "epoch": 1.1790789601676654, |
| "eval_loss": 0.6410490274429321, |
| "eval_runtime": 472.9546, |
| "eval_samples_per_second": 75.504, |
| "eval_steps_per_second": 37.752, |
| "step": 25000 |
| }, |
| { |
| "epoch": 1.1814371239834103, |
| "grad_norm": 1.15625, |
| "learning_rate": 6.062035246584603e-06, |
| "loss": 0.633, |
| "step": 25050 |
| }, |
| { |
| "epoch": 1.183795287799155, |
| "grad_norm": 1.21875, |
| "learning_rate": 6.054174723702622e-06, |
| "loss": 0.6221, |
| "step": 25100 |
| }, |
| { |
| "epoch": 1.1861534516149002, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.0463142008206384e-06, |
| "loss": 0.6363, |
| "step": 25150 |
| }, |
| { |
| "epoch": 1.188511615430645, |
| "grad_norm": 1.1640625, |
| "learning_rate": 6.038453677938657e-06, |
| "loss": 0.6423, |
| "step": 25200 |
| }, |
| { |
| "epoch": 1.1908697792463898, |
| "grad_norm": 1.1796875, |
| "learning_rate": 6.030593155056674e-06, |
| "loss": 0.6444, |
| "step": 25250 |
| }, |
| { |
| "epoch": 1.1932279430621346, |
| "grad_norm": 1.2109375, |
| "learning_rate": 6.022732632174693e-06, |
| "loss": 0.634, |
| "step": 25300 |
| }, |
| { |
| "epoch": 1.1955861068778795, |
| "grad_norm": 1.1640625, |
| "learning_rate": 6.014872109292711e-06, |
| "loss": 0.6559, |
| "step": 25350 |
| }, |
| { |
| "epoch": 1.1979442706936245, |
| "grad_norm": 1.1875, |
| "learning_rate": 6.0070115864107285e-06, |
| "loss": 0.6475, |
| "step": 25400 |
| }, |
| { |
| "epoch": 1.2003024345093694, |
| "grad_norm": 1.265625, |
| "learning_rate": 5.999151063528746e-06, |
| "loss": 0.6306, |
| "step": 25450 |
| }, |
| { |
| "epoch": 1.2026605983251142, |
| "grad_norm": 1.125, |
| "learning_rate": 5.991290540646764e-06, |
| "loss": 0.6347, |
| "step": 25500 |
| }, |
| { |
| "epoch": 1.2026605983251142, |
| "eval_loss": 0.6409837007522583, |
| "eval_runtime": 469.7269, |
| "eval_samples_per_second": 76.023, |
| "eval_steps_per_second": 38.011, |
| "step": 25500 |
| }, |
| { |
| "epoch": 1.205018762140859, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.983430017764782e-06, |
| "loss": 0.6417, |
| "step": 25550 |
| }, |
| { |
| "epoch": 1.2073769259566038, |
| "grad_norm": 1.1875, |
| "learning_rate": 5.9755694948828e-06, |
| "loss": 0.6452, |
| "step": 25600 |
| }, |
| { |
| "epoch": 1.2097350897723487, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.967708972000818e-06, |
| "loss": 0.6397, |
| "step": 25650 |
| }, |
| { |
| "epoch": 1.2120932535880937, |
| "grad_norm": 1.2265625, |
| "learning_rate": 5.959848449118836e-06, |
| "loss": 0.6419, |
| "step": 25700 |
| }, |
| { |
| "epoch": 1.2144514174038386, |
| "grad_norm": 1.2734375, |
| "learning_rate": 5.951987926236854e-06, |
| "loss": 0.6456, |
| "step": 25750 |
| }, |
| { |
| "epoch": 1.2168095812195834, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.944127403354871e-06, |
| "loss": 0.6403, |
| "step": 25800 |
| }, |
| { |
| "epoch": 1.2191677450353282, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.9362668804728895e-06, |
| "loss": 0.6463, |
| "step": 25850 |
| }, |
| { |
| "epoch": 1.221525908851073, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.928406357590908e-06, |
| "loss": 0.6397, |
| "step": 25900 |
| }, |
| { |
| "epoch": 1.2238840726668179, |
| "grad_norm": 1.2578125, |
| "learning_rate": 5.9205458347089254e-06, |
| "loss": 0.6444, |
| "step": 25950 |
| }, |
| { |
| "epoch": 1.226242236482563, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.912685311826944e-06, |
| "loss": 0.6331, |
| "step": 26000 |
| }, |
| { |
| "epoch": 1.226242236482563, |
| "eval_loss": 0.6410670280456543, |
| "eval_runtime": 470.7207, |
| "eval_samples_per_second": 75.862, |
| "eval_steps_per_second": 37.931, |
| "step": 26000 |
| }, |
| { |
| "epoch": 1.2286004002983077, |
| "grad_norm": 1.2890625, |
| "learning_rate": 5.9048247889449605e-06, |
| "loss": 0.6377, |
| "step": 26050 |
| }, |
| { |
| "epoch": 1.2309585641140526, |
| "grad_norm": 1.25, |
| "learning_rate": 5.896964266062979e-06, |
| "loss": 0.6402, |
| "step": 26100 |
| }, |
| { |
| "epoch": 1.2333167279297974, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.889103743180997e-06, |
| "loss": 0.6233, |
| "step": 26150 |
| }, |
| { |
| "epoch": 1.2356748917455422, |
| "grad_norm": 1.15625, |
| "learning_rate": 5.881243220299015e-06, |
| "loss": 0.6502, |
| "step": 26200 |
| }, |
| { |
| "epoch": 1.2380330555612873, |
| "grad_norm": 1.328125, |
| "learning_rate": 5.873382697417033e-06, |
| "loss": 0.6228, |
| "step": 26250 |
| }, |
| { |
| "epoch": 1.2403912193770321, |
| "grad_norm": 1.265625, |
| "learning_rate": 5.865522174535051e-06, |
| "loss": 0.629, |
| "step": 26300 |
| }, |
| { |
| "epoch": 1.242749383192777, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.857661651653068e-06, |
| "loss": 0.6172, |
| "step": 26350 |
| }, |
| { |
| "epoch": 1.2451075470085218, |
| "grad_norm": 1.21875, |
| "learning_rate": 5.8498011287710864e-06, |
| "loss": 0.6424, |
| "step": 26400 |
| }, |
| { |
| "epoch": 1.2474657108242666, |
| "grad_norm": 1.125, |
| "learning_rate": 5.841940605889104e-06, |
| "loss": 0.647, |
| "step": 26450 |
| }, |
| { |
| "epoch": 1.2498238746400117, |
| "grad_norm": 1.328125, |
| "learning_rate": 5.834080083007122e-06, |
| "loss": 0.6509, |
| "step": 26500 |
| }, |
| { |
| "epoch": 1.2498238746400117, |
| "eval_loss": 0.6410943865776062, |
| "eval_runtime": 471.4169, |
| "eval_samples_per_second": 75.75, |
| "eval_steps_per_second": 37.875, |
| "step": 26500 |
| }, |
| { |
| "epoch": 1.2521820384557565, |
| "grad_norm": 1.1171875, |
| "learning_rate": 5.826219560125141e-06, |
| "loss": 0.6297, |
| "step": 26550 |
| }, |
| { |
| "epoch": 1.2545402022715013, |
| "grad_norm": 1.1171875, |
| "learning_rate": 5.818359037243157e-06, |
| "loss": 0.6353, |
| "step": 26600 |
| }, |
| { |
| "epoch": 1.2568983660872461, |
| "grad_norm": 1.078125, |
| "learning_rate": 5.810498514361176e-06, |
| "loss": 0.6326, |
| "step": 26650 |
| }, |
| { |
| "epoch": 1.259256529902991, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.802637991479194e-06, |
| "loss": 0.6367, |
| "step": 26700 |
| }, |
| { |
| "epoch": 1.261614693718736, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.794777468597212e-06, |
| "loss": 0.6308, |
| "step": 26750 |
| }, |
| { |
| "epoch": 1.2639728575344809, |
| "grad_norm": 1.484375, |
| "learning_rate": 5.78691694571523e-06, |
| "loss": 0.6441, |
| "step": 26800 |
| }, |
| { |
| "epoch": 1.2663310213502257, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.779056422833247e-06, |
| "loss": 0.6448, |
| "step": 26850 |
| }, |
| { |
| "epoch": 1.2686891851659705, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.771195899951265e-06, |
| "loss": 0.6447, |
| "step": 26900 |
| }, |
| { |
| "epoch": 1.2710473489817153, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.763335377069283e-06, |
| "loss": 0.628, |
| "step": 26950 |
| }, |
| { |
| "epoch": 1.2734055127974604, |
| "grad_norm": 1.2109375, |
| "learning_rate": 5.755474854187301e-06, |
| "loss": 0.642, |
| "step": 27000 |
| }, |
| { |
| "epoch": 1.2734055127974604, |
| "eval_loss": 0.641043484210968, |
| "eval_runtime": 470.7463, |
| "eval_samples_per_second": 75.858, |
| "eval_steps_per_second": 37.929, |
| "step": 27000 |
| }, |
| { |
| "epoch": 1.275763676613205, |
| "grad_norm": 1.1171875, |
| "learning_rate": 5.747614331305319e-06, |
| "loss": 0.6418, |
| "step": 27050 |
| }, |
| { |
| "epoch": 1.27812184042895, |
| "grad_norm": 1.296875, |
| "learning_rate": 5.7397538084233376e-06, |
| "loss": 0.6401, |
| "step": 27100 |
| }, |
| { |
| "epoch": 1.2804800042446949, |
| "grad_norm": 1.3671875, |
| "learning_rate": 5.731893285541354e-06, |
| "loss": 0.6416, |
| "step": 27150 |
| }, |
| { |
| "epoch": 1.2828381680604397, |
| "grad_norm": 1.09375, |
| "learning_rate": 5.724032762659373e-06, |
| "loss": 0.6381, |
| "step": 27200 |
| }, |
| { |
| "epoch": 1.2851963318761845, |
| "grad_norm": 1.328125, |
| "learning_rate": 5.71617223977739e-06, |
| "loss": 0.6388, |
| "step": 27250 |
| }, |
| { |
| "epoch": 1.2875544956919294, |
| "grad_norm": 1.1875, |
| "learning_rate": 5.7083117168954085e-06, |
| "loss": 0.6339, |
| "step": 27300 |
| }, |
| { |
| "epoch": 1.2899126595076744, |
| "grad_norm": 1.2578125, |
| "learning_rate": 5.700451194013427e-06, |
| "loss": 0.6406, |
| "step": 27350 |
| }, |
| { |
| "epoch": 1.2922708233234192, |
| "grad_norm": 1.2109375, |
| "learning_rate": 5.6925906711314435e-06, |
| "loss": 0.6264, |
| "step": 27400 |
| }, |
| { |
| "epoch": 1.294628987139164, |
| "grad_norm": 1.25, |
| "learning_rate": 5.684730148249462e-06, |
| "loss": 0.6534, |
| "step": 27450 |
| }, |
| { |
| "epoch": 1.296987150954909, |
| "grad_norm": 1.2421875, |
| "learning_rate": 5.67686962536748e-06, |
| "loss": 0.6418, |
| "step": 27500 |
| }, |
| { |
| "epoch": 1.296987150954909, |
| "eval_loss": 0.6410250067710876, |
| "eval_runtime": 471.8783, |
| "eval_samples_per_second": 75.676, |
| "eval_steps_per_second": 37.838, |
| "step": 27500 |
| }, |
| { |
| "epoch": 1.2993453147706537, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.669009102485498e-06, |
| "loss": 0.6301, |
| "step": 27550 |
| }, |
| { |
| "epoch": 1.3017034785863988, |
| "grad_norm": 1.1015625, |
| "learning_rate": 5.661148579603516e-06, |
| "loss": 0.6365, |
| "step": 27600 |
| }, |
| { |
| "epoch": 1.3040616424021436, |
| "grad_norm": 1.3671875, |
| "learning_rate": 5.653288056721533e-06, |
| "loss": 0.6429, |
| "step": 27650 |
| }, |
| { |
| "epoch": 1.3064198062178884, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.645427533839551e-06, |
| "loss": 0.6358, |
| "step": 27700 |
| }, |
| { |
| "epoch": 1.3087779700336333, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.6375670109575695e-06, |
| "loss": 0.6435, |
| "step": 27750 |
| }, |
| { |
| "epoch": 1.311136133849378, |
| "grad_norm": 1.578125, |
| "learning_rate": 5.629706488075587e-06, |
| "loss": 0.651, |
| "step": 27800 |
| }, |
| { |
| "epoch": 1.3134942976651232, |
| "grad_norm": 1.3671875, |
| "learning_rate": 5.621845965193605e-06, |
| "loss": 0.6522, |
| "step": 27850 |
| }, |
| { |
| "epoch": 1.315852461480868, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.613985442311624e-06, |
| "loss": 0.6345, |
| "step": 27900 |
| }, |
| { |
| "epoch": 1.3182106252966128, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.60612491942964e-06, |
| "loss": 0.6419, |
| "step": 27950 |
| }, |
| { |
| "epoch": 1.3205687891123576, |
| "grad_norm": 1.46875, |
| "learning_rate": 5.598264396547659e-06, |
| "loss": 0.6324, |
| "step": 28000 |
| }, |
| { |
| "epoch": 1.3205687891123576, |
| "eval_loss": 0.6410648822784424, |
| "eval_runtime": 471.9597, |
| "eval_samples_per_second": 75.663, |
| "eval_steps_per_second": 37.832, |
| "step": 28000 |
| }, |
| { |
| "epoch": 1.3229269529281025, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.590403873665676e-06, |
| "loss": 0.6405, |
| "step": 28050 |
| }, |
| { |
| "epoch": 1.3252851167438475, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.582543350783695e-06, |
| "loss": 0.6354, |
| "step": 28100 |
| }, |
| { |
| "epoch": 1.3276432805595924, |
| "grad_norm": 1.171875, |
| "learning_rate": 5.574682827901713e-06, |
| "loss": 0.6559, |
| "step": 28150 |
| }, |
| { |
| "epoch": 1.3300014443753372, |
| "grad_norm": 1.1015625, |
| "learning_rate": 5.5668223050197305e-06, |
| "loss": 0.6304, |
| "step": 28200 |
| }, |
| { |
| "epoch": 1.332359608191082, |
| "grad_norm": 1.2421875, |
| "learning_rate": 5.558961782137748e-06, |
| "loss": 0.6345, |
| "step": 28250 |
| }, |
| { |
| "epoch": 1.3347177720068268, |
| "grad_norm": 1.28125, |
| "learning_rate": 5.5511012592557655e-06, |
| "loss": 0.6435, |
| "step": 28300 |
| }, |
| { |
| "epoch": 1.337075935822572, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.543240736373784e-06, |
| "loss": 0.6345, |
| "step": 28350 |
| }, |
| { |
| "epoch": 1.3394340996383165, |
| "grad_norm": 1.21875, |
| "learning_rate": 5.535380213491802e-06, |
| "loss": 0.6344, |
| "step": 28400 |
| }, |
| { |
| "epoch": 1.3417922634540616, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.52751969060982e-06, |
| "loss": 0.6452, |
| "step": 28450 |
| }, |
| { |
| "epoch": 1.3441504272698064, |
| "grad_norm": 1.2890625, |
| "learning_rate": 5.519659167727838e-06, |
| "loss": 0.6333, |
| "step": 28500 |
| }, |
| { |
| "epoch": 1.3441504272698064, |
| "eval_loss": 0.6410021185874939, |
| "eval_runtime": 473.4866, |
| "eval_samples_per_second": 75.419, |
| "eval_steps_per_second": 37.71, |
| "step": 28500 |
| }, |
| { |
| "epoch": 1.3465085910855512, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.511798644845856e-06, |
| "loss": 0.6361, |
| "step": 28550 |
| }, |
| { |
| "epoch": 1.348866754901296, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.503938121963873e-06, |
| "loss": 0.6394, |
| "step": 28600 |
| }, |
| { |
| "epoch": 1.3512249187170409, |
| "grad_norm": 1.2734375, |
| "learning_rate": 5.4960775990818915e-06, |
| "loss": 0.6444, |
| "step": 28650 |
| }, |
| { |
| "epoch": 1.353583082532786, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.488217076199909e-06, |
| "loss": 0.6462, |
| "step": 28700 |
| }, |
| { |
| "epoch": 1.3559412463485307, |
| "grad_norm": 1.265625, |
| "learning_rate": 5.480356553317927e-06, |
| "loss": 0.6325, |
| "step": 28750 |
| }, |
| { |
| "epoch": 1.3582994101642756, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.472496030435946e-06, |
| "loss": 0.6401, |
| "step": 28800 |
| }, |
| { |
| "epoch": 1.3606575739800204, |
| "grad_norm": 1.21875, |
| "learning_rate": 5.464635507553962e-06, |
| "loss": 0.6395, |
| "step": 28850 |
| }, |
| { |
| "epoch": 1.3630157377957652, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.456774984671981e-06, |
| "loss": 0.6476, |
| "step": 28900 |
| }, |
| { |
| "epoch": 1.3653739016115103, |
| "grad_norm": 1.2109375, |
| "learning_rate": 5.448914461789999e-06, |
| "loss": 0.6337, |
| "step": 28950 |
| }, |
| { |
| "epoch": 1.3677320654272551, |
| "grad_norm": 1.3125, |
| "learning_rate": 5.441053938908017e-06, |
| "loss": 0.6444, |
| "step": 29000 |
| }, |
| { |
| "epoch": 1.3677320654272551, |
| "eval_loss": 0.6410331726074219, |
| "eval_runtime": 470.8654, |
| "eval_samples_per_second": 75.839, |
| "eval_steps_per_second": 37.92, |
| "step": 29000 |
| }, |
| { |
| "epoch": 1.370090229243, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.433193416026035e-06, |
| "loss": 0.6422, |
| "step": 29050 |
| }, |
| { |
| "epoch": 1.3724483930587448, |
| "grad_norm": 1.15625, |
| "learning_rate": 5.425332893144052e-06, |
| "loss": 0.6292, |
| "step": 29100 |
| }, |
| { |
| "epoch": 1.3748065568744896, |
| "grad_norm": 1.1171875, |
| "learning_rate": 5.41747237026207e-06, |
| "loss": 0.6409, |
| "step": 29150 |
| }, |
| { |
| "epoch": 1.3771647206902347, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.409611847380088e-06, |
| "loss": 0.643, |
| "step": 29200 |
| }, |
| { |
| "epoch": 1.3795228845059795, |
| "grad_norm": 1.265625, |
| "learning_rate": 5.401751324498106e-06, |
| "loss": 0.6528, |
| "step": 29250 |
| }, |
| { |
| "epoch": 1.3818810483217243, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.393890801616124e-06, |
| "loss": 0.6191, |
| "step": 29300 |
| }, |
| { |
| "epoch": 1.3842392121374691, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.386030278734143e-06, |
| "loss": 0.6266, |
| "step": 29350 |
| }, |
| { |
| "epoch": 1.386597375953214, |
| "grad_norm": 1.296875, |
| "learning_rate": 5.378169755852159e-06, |
| "loss": 0.6535, |
| "step": 29400 |
| }, |
| { |
| "epoch": 1.388955539768959, |
| "grad_norm": 1.1171875, |
| "learning_rate": 5.370309232970178e-06, |
| "loss": 0.6388, |
| "step": 29450 |
| }, |
| { |
| "epoch": 1.3913137035847039, |
| "grad_norm": 1.3125, |
| "learning_rate": 5.362448710088195e-06, |
| "loss": 0.6424, |
| "step": 29500 |
| }, |
| { |
| "epoch": 1.3913137035847039, |
| "eval_loss": 0.6410022974014282, |
| "eval_runtime": 470.5101, |
| "eval_samples_per_second": 75.896, |
| "eval_steps_per_second": 37.948, |
| "step": 29500 |
| }, |
| { |
| "epoch": 1.3936718674004487, |
| "grad_norm": 1.1875, |
| "learning_rate": 5.3545881872062135e-06, |
| "loss": 0.6241, |
| "step": 29550 |
| }, |
| { |
| "epoch": 1.3960300312161935, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.346727664324232e-06, |
| "loss": 0.6472, |
| "step": 29600 |
| }, |
| { |
| "epoch": 1.3983881950319383, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.3388671414422486e-06, |
| "loss": 0.6318, |
| "step": 29650 |
| }, |
| { |
| "epoch": 1.4007463588476832, |
| "grad_norm": 1.265625, |
| "learning_rate": 5.331006618560267e-06, |
| "loss": 0.6387, |
| "step": 29700 |
| }, |
| { |
| "epoch": 1.403104522663428, |
| "grad_norm": 1.1875, |
| "learning_rate": 5.323146095678285e-06, |
| "loss": 0.642, |
| "step": 29750 |
| }, |
| { |
| "epoch": 1.405462686479173, |
| "grad_norm": 1.21875, |
| "learning_rate": 5.315285572796303e-06, |
| "loss": 0.6334, |
| "step": 29800 |
| }, |
| { |
| "epoch": 1.4078208502949179, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.307425049914321e-06, |
| "loss": 0.6287, |
| "step": 29850 |
| }, |
| { |
| "epoch": 1.4101790141106627, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.299564527032338e-06, |
| "loss": 0.6306, |
| "step": 29900 |
| }, |
| { |
| "epoch": 1.4125371779264075, |
| "grad_norm": 1.171875, |
| "learning_rate": 5.291704004150356e-06, |
| "loss": 0.6339, |
| "step": 29950 |
| }, |
| { |
| "epoch": 1.4148953417421524, |
| "grad_norm": 1.34375, |
| "learning_rate": 5.2838434812683745e-06, |
| "loss": 0.6481, |
| "step": 30000 |
| }, |
| { |
| "epoch": 1.4148953417421524, |
| "eval_loss": 0.6409640312194824, |
| "eval_runtime": 470.4098, |
| "eval_samples_per_second": 75.913, |
| "eval_steps_per_second": 37.956, |
| "step": 30000 |
| }, |
| { |
| "epoch": 1.4172535055578974, |
| "grad_norm": 1.265625, |
| "learning_rate": 5.275982958386392e-06, |
| "loss": 0.657, |
| "step": 30050 |
| }, |
| { |
| "epoch": 1.4196116693736422, |
| "grad_norm": 1.3125, |
| "learning_rate": 5.26812243550441e-06, |
| "loss": 0.6331, |
| "step": 30100 |
| }, |
| { |
| "epoch": 1.421969833189387, |
| "grad_norm": 1.2578125, |
| "learning_rate": 5.260261912622429e-06, |
| "loss": 0.6356, |
| "step": 30150 |
| }, |
| { |
| "epoch": 1.424327997005132, |
| "grad_norm": 1.2265625, |
| "learning_rate": 5.2524013897404454e-06, |
| "loss": 0.6263, |
| "step": 30200 |
| }, |
| { |
| "epoch": 1.4266861608208767, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.244540866858464e-06, |
| "loss": 0.6462, |
| "step": 30250 |
| }, |
| { |
| "epoch": 1.4290443246366218, |
| "grad_norm": 1.4375, |
| "learning_rate": 5.236680343976481e-06, |
| "loss": 0.6289, |
| "step": 30300 |
| }, |
| { |
| "epoch": 1.4314024884523666, |
| "grad_norm": 1.3984375, |
| "learning_rate": 5.2288198210945e-06, |
| "loss": 0.6341, |
| "step": 30350 |
| }, |
| { |
| "epoch": 1.4337606522681114, |
| "grad_norm": 1.3046875, |
| "learning_rate": 5.220959298212518e-06, |
| "loss": 0.6305, |
| "step": 30400 |
| }, |
| { |
| "epoch": 1.4361188160838563, |
| "grad_norm": 1.125, |
| "learning_rate": 5.213098775330535e-06, |
| "loss": 0.6356, |
| "step": 30450 |
| }, |
| { |
| "epoch": 1.438476979899601, |
| "grad_norm": 1.25, |
| "learning_rate": 5.205238252448553e-06, |
| "loss": 0.6516, |
| "step": 30500 |
| }, |
| { |
| "epoch": 1.438476979899601, |
| "eval_loss": 0.6409685611724854, |
| "eval_runtime": 470.4364, |
| "eval_samples_per_second": 75.908, |
| "eval_steps_per_second": 37.954, |
| "step": 30500 |
| }, |
| { |
| "epoch": 1.4408351437153462, |
| "grad_norm": 1.296875, |
| "learning_rate": 5.1973777295665714e-06, |
| "loss": 0.6507, |
| "step": 30550 |
| }, |
| { |
| "epoch": 1.443193307531091, |
| "grad_norm": 1.2578125, |
| "learning_rate": 5.189517206684589e-06, |
| "loss": 0.6417, |
| "step": 30600 |
| }, |
| { |
| "epoch": 1.4455514713468358, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.181656683802607e-06, |
| "loss": 0.6325, |
| "step": 30650 |
| }, |
| { |
| "epoch": 1.4479096351625806, |
| "grad_norm": 1.2734375, |
| "learning_rate": 5.173796160920625e-06, |
| "loss": 0.6243, |
| "step": 30700 |
| }, |
| { |
| "epoch": 1.4502677989783255, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.165935638038642e-06, |
| "loss": 0.6327, |
| "step": 30750 |
| }, |
| { |
| "epoch": 1.4526259627940705, |
| "grad_norm": 1.1171875, |
| "learning_rate": 5.158075115156661e-06, |
| "loss": 0.6323, |
| "step": 30800 |
| }, |
| { |
| "epoch": 1.4549841266098151, |
| "grad_norm": 1.2890625, |
| "learning_rate": 5.150214592274678e-06, |
| "loss": 0.64, |
| "step": 30850 |
| }, |
| { |
| "epoch": 1.4573422904255602, |
| "grad_norm": 1.2578125, |
| "learning_rate": 5.1423540693926966e-06, |
| "loss": 0.6423, |
| "step": 30900 |
| }, |
| { |
| "epoch": 1.459700454241305, |
| "grad_norm": 1.2578125, |
| "learning_rate": 5.134493546510715e-06, |
| "loss": 0.6436, |
| "step": 30950 |
| }, |
| { |
| "epoch": 1.4620586180570498, |
| "grad_norm": 1.125, |
| "learning_rate": 5.1266330236287324e-06, |
| "loss": 0.6249, |
| "step": 31000 |
| }, |
| { |
| "epoch": 1.4620586180570498, |
| "eval_loss": 0.6409750580787659, |
| "eval_runtime": 467.22, |
| "eval_samples_per_second": 76.431, |
| "eval_steps_per_second": 38.215, |
| "step": 31000 |
| }, |
| { |
| "epoch": 1.4644167818727947, |
| "grad_norm": 1.2265625, |
| "learning_rate": 5.11877250074675e-06, |
| "loss": 0.6405, |
| "step": 31050 |
| }, |
| { |
| "epoch": 1.4667749456885395, |
| "grad_norm": 1.2578125, |
| "learning_rate": 5.1109119778647675e-06, |
| "loss": 0.6238, |
| "step": 31100 |
| }, |
| { |
| "epoch": 1.4691331095042846, |
| "grad_norm": 1.1875, |
| "learning_rate": 5.103051454982786e-06, |
| "loss": 0.6318, |
| "step": 31150 |
| }, |
| { |
| "epoch": 1.4714912733200294, |
| "grad_norm": 1.3046875, |
| "learning_rate": 5.095190932100804e-06, |
| "loss": 0.6452, |
| "step": 31200 |
| }, |
| { |
| "epoch": 1.4738494371357742, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.087330409218822e-06, |
| "loss": 0.6424, |
| "step": 31250 |
| }, |
| { |
| "epoch": 1.476207600951519, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.07946988633684e-06, |
| "loss": 0.6448, |
| "step": 31300 |
| }, |
| { |
| "epoch": 1.4785657647672639, |
| "grad_norm": 1.3046875, |
| "learning_rate": 5.071609363454857e-06, |
| "loss": 0.6141, |
| "step": 31350 |
| }, |
| { |
| "epoch": 1.480923928583009, |
| "grad_norm": 1.09375, |
| "learning_rate": 5.063748840572875e-06, |
| "loss": 0.6339, |
| "step": 31400 |
| }, |
| { |
| "epoch": 1.4832820923987537, |
| "grad_norm": 1.21875, |
| "learning_rate": 5.0558883176908935e-06, |
| "loss": 0.6389, |
| "step": 31450 |
| }, |
| { |
| "epoch": 1.4856402562144986, |
| "grad_norm": 1.09375, |
| "learning_rate": 5.048027794808911e-06, |
| "loss": 0.6262, |
| "step": 31500 |
| }, |
| { |
| "epoch": 1.4856402562144986, |
| "eval_loss": 0.6409618258476257, |
| "eval_runtime": 470.7912, |
| "eval_samples_per_second": 75.851, |
| "eval_steps_per_second": 37.926, |
| "step": 31500 |
| }, |
| { |
| "epoch": 1.4879984200302434, |
| "grad_norm": 1.3359375, |
| "learning_rate": 5.040167271926929e-06, |
| "loss": 0.6329, |
| "step": 31550 |
| }, |
| { |
| "epoch": 1.4903565838459882, |
| "grad_norm": 1.1015625, |
| "learning_rate": 5.032306749044948e-06, |
| "loss": 0.6297, |
| "step": 31600 |
| }, |
| { |
| "epoch": 1.4927147476617333, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.024446226162964e-06, |
| "loss": 0.6418, |
| "step": 31650 |
| }, |
| { |
| "epoch": 1.4950729114774781, |
| "grad_norm": 1.2421875, |
| "learning_rate": 5.016585703280983e-06, |
| "loss": 0.6281, |
| "step": 31700 |
| }, |
| { |
| "epoch": 1.497431075293223, |
| "grad_norm": 1.3203125, |
| "learning_rate": 5.008725180399e-06, |
| "loss": 0.6398, |
| "step": 31750 |
| }, |
| { |
| "epoch": 1.4997892391089678, |
| "grad_norm": 1.28125, |
| "learning_rate": 5.000864657517019e-06, |
| "loss": 0.645, |
| "step": 31800 |
| }, |
| { |
| "epoch": 1.5021474029247126, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.993004134635036e-06, |
| "loss": 0.6319, |
| "step": 31850 |
| }, |
| { |
| "epoch": 1.5045055667404577, |
| "grad_norm": 1.2890625, |
| "learning_rate": 4.9851436117530545e-06, |
| "loss": 0.6381, |
| "step": 31900 |
| }, |
| { |
| "epoch": 1.5068637305562023, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.977283088871072e-06, |
| "loss": 0.6308, |
| "step": 31950 |
| }, |
| { |
| "epoch": 1.5092218943719473, |
| "grad_norm": 1.0078125, |
| "learning_rate": 4.9694225659890895e-06, |
| "loss": 0.632, |
| "step": 32000 |
| }, |
| { |
| "epoch": 1.5092218943719473, |
| "eval_loss": 0.6409916877746582, |
| "eval_runtime": 470.5977, |
| "eval_samples_per_second": 75.882, |
| "eval_steps_per_second": 37.941, |
| "step": 32000 |
| }, |
| { |
| "epoch": 1.5115800581876921, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.961562043107108e-06, |
| "loss": 0.6354, |
| "step": 32050 |
| }, |
| { |
| "epoch": 1.513938222003437, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.953701520225126e-06, |
| "loss": 0.6386, |
| "step": 32100 |
| }, |
| { |
| "epoch": 1.516296385819182, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.945840997343144e-06, |
| "loss": 0.6411, |
| "step": 32150 |
| }, |
| { |
| "epoch": 1.5186545496349266, |
| "grad_norm": 1.09375, |
| "learning_rate": 4.937980474461161e-06, |
| "loss": 0.6347, |
| "step": 32200 |
| }, |
| { |
| "epoch": 1.5210127134506717, |
| "grad_norm": 1.5546875, |
| "learning_rate": 4.93011995157918e-06, |
| "loss": 0.6375, |
| "step": 32250 |
| }, |
| { |
| "epoch": 1.5233708772664165, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.922259428697197e-06, |
| "loss": 0.6435, |
| "step": 32300 |
| }, |
| { |
| "epoch": 1.5257290410821613, |
| "grad_norm": 1.46875, |
| "learning_rate": 4.9143989058152155e-06, |
| "loss": 0.6291, |
| "step": 32350 |
| }, |
| { |
| "epoch": 1.5280872048979064, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.906538382933233e-06, |
| "loss": 0.6455, |
| "step": 32400 |
| }, |
| { |
| "epoch": 1.530445368713651, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.8986778600512505e-06, |
| "loss": 0.6324, |
| "step": 32450 |
| }, |
| { |
| "epoch": 1.532803532529396, |
| "grad_norm": 1.2890625, |
| "learning_rate": 4.890817337169269e-06, |
| "loss": 0.6474, |
| "step": 32500 |
| }, |
| { |
| "epoch": 1.532803532529396, |
| "eval_loss": 0.6410307884216309, |
| "eval_runtime": 471.1719, |
| "eval_samples_per_second": 75.79, |
| "eval_steps_per_second": 37.895, |
| "step": 32500 |
| }, |
| { |
| "epoch": 1.5351616963451409, |
| "grad_norm": 1.265625, |
| "learning_rate": 4.882956814287287e-06, |
| "loss": 0.634, |
| "step": 32550 |
| }, |
| { |
| "epoch": 1.5375198601608857, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.875096291405305e-06, |
| "loss": 0.6319, |
| "step": 32600 |
| }, |
| { |
| "epoch": 1.5398780239766308, |
| "grad_norm": 1.2890625, |
| "learning_rate": 4.867235768523322e-06, |
| "loss": 0.6397, |
| "step": 32650 |
| }, |
| { |
| "epoch": 1.5422361877923754, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.859375245641341e-06, |
| "loss": 0.6525, |
| "step": 32700 |
| }, |
| { |
| "epoch": 1.5445943516081204, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.851514722759358e-06, |
| "loss": 0.6479, |
| "step": 32750 |
| }, |
| { |
| "epoch": 1.5469525154238652, |
| "grad_norm": 1.265625, |
| "learning_rate": 4.8436541998773765e-06, |
| "loss": 0.6387, |
| "step": 32800 |
| }, |
| { |
| "epoch": 1.54931067923961, |
| "grad_norm": 1.265625, |
| "learning_rate": 4.835793676995394e-06, |
| "loss": 0.634, |
| "step": 32850 |
| }, |
| { |
| "epoch": 1.551668843055355, |
| "grad_norm": 1.53125, |
| "learning_rate": 4.827933154113412e-06, |
| "loss": 0.6468, |
| "step": 32900 |
| }, |
| { |
| "epoch": 1.5540270068710997, |
| "grad_norm": 1.09375, |
| "learning_rate": 4.82007263123143e-06, |
| "loss": 0.6375, |
| "step": 32950 |
| }, |
| { |
| "epoch": 1.5563851706868448, |
| "grad_norm": 1.3046875, |
| "learning_rate": 4.812212108349447e-06, |
| "loss": 0.6377, |
| "step": 33000 |
| }, |
| { |
| "epoch": 1.5563851706868448, |
| "eval_loss": 0.6409004926681519, |
| "eval_runtime": 473.1432, |
| "eval_samples_per_second": 75.474, |
| "eval_steps_per_second": 37.737, |
| "step": 33000 |
| }, |
| { |
| "epoch": 1.5587433345025894, |
| "grad_norm": 1.484375, |
| "learning_rate": 4.804351585467466e-06, |
| "loss": 0.6472, |
| "step": 33050 |
| }, |
| { |
| "epoch": 1.5611014983183344, |
| "grad_norm": 1.25, |
| "learning_rate": 4.796491062585484e-06, |
| "loss": 0.6357, |
| "step": 33100 |
| }, |
| { |
| "epoch": 1.5634596621340793, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.788630539703502e-06, |
| "loss": 0.6432, |
| "step": 33150 |
| }, |
| { |
| "epoch": 1.565817825949824, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.780770016821519e-06, |
| "loss": 0.6315, |
| "step": 33200 |
| }, |
| { |
| "epoch": 1.5681759897655692, |
| "grad_norm": 1.265625, |
| "learning_rate": 4.772909493939537e-06, |
| "loss": 0.643, |
| "step": 33250 |
| }, |
| { |
| "epoch": 1.5705341535813138, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.765048971057555e-06, |
| "loss": 0.635, |
| "step": 33300 |
| }, |
| { |
| "epoch": 1.5728923173970588, |
| "grad_norm": 1.09375, |
| "learning_rate": 4.757188448175573e-06, |
| "loss": 0.6339, |
| "step": 33350 |
| }, |
| { |
| "epoch": 1.5752504812128036, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.749327925293591e-06, |
| "loss": 0.643, |
| "step": 33400 |
| }, |
| { |
| "epoch": 1.5776086450285485, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.741467402411608e-06, |
| "loss": 0.634, |
| "step": 33450 |
| }, |
| { |
| "epoch": 1.5799668088442935, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.733606879529627e-06, |
| "loss": 0.6472, |
| "step": 33500 |
| }, |
| { |
| "epoch": 1.5799668088442935, |
| "eval_loss": 0.640994668006897, |
| "eval_runtime": 472.6883, |
| "eval_samples_per_second": 75.547, |
| "eval_steps_per_second": 37.773, |
| "step": 33500 |
| }, |
| { |
| "epoch": 1.5823249726600381, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.725746356647644e-06, |
| "loss": 0.6235, |
| "step": 33550 |
| }, |
| { |
| "epoch": 1.5846831364757832, |
| "grad_norm": 1.484375, |
| "learning_rate": 4.717885833765663e-06, |
| "loss": 0.631, |
| "step": 33600 |
| }, |
| { |
| "epoch": 1.587041300291528, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.71002531088368e-06, |
| "loss": 0.6394, |
| "step": 33650 |
| }, |
| { |
| "epoch": 1.5893994641072728, |
| "grad_norm": 1.28125, |
| "learning_rate": 4.7021647880016985e-06, |
| "loss": 0.6488, |
| "step": 33700 |
| }, |
| { |
| "epoch": 1.591757627923018, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.694304265119716e-06, |
| "loss": 0.6458, |
| "step": 33750 |
| }, |
| { |
| "epoch": 1.5941157917387625, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.686443742237734e-06, |
| "loss": 0.6506, |
| "step": 33800 |
| }, |
| { |
| "epoch": 1.5964739555545076, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.678583219355752e-06, |
| "loss": 0.6466, |
| "step": 33850 |
| }, |
| { |
| "epoch": 1.5988321193702524, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.67072269647377e-06, |
| "loss": 0.6315, |
| "step": 33900 |
| }, |
| { |
| "epoch": 1.6011902831859972, |
| "grad_norm": 1.125, |
| "learning_rate": 4.662862173591788e-06, |
| "loss": 0.6233, |
| "step": 33950 |
| }, |
| { |
| "epoch": 1.603548447001742, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.655001650709805e-06, |
| "loss": 0.6336, |
| "step": 34000 |
| }, |
| { |
| "epoch": 1.603548447001742, |
| "eval_loss": 0.6410415768623352, |
| "eval_runtime": 476.8599, |
| "eval_samples_per_second": 74.886, |
| "eval_steps_per_second": 37.443, |
| "step": 34000 |
| }, |
| { |
| "epoch": 1.6059066108174869, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.647141127827824e-06, |
| "loss": 0.6474, |
| "step": 34050 |
| }, |
| { |
| "epoch": 1.608264774633232, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.639280604945842e-06, |
| "loss": 0.6289, |
| "step": 34100 |
| }, |
| { |
| "epoch": 1.6106229384489767, |
| "grad_norm": 1.265625, |
| "learning_rate": 4.6314200820638595e-06, |
| "loss": 0.6416, |
| "step": 34150 |
| }, |
| { |
| "epoch": 1.6129811022647216, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.623559559181877e-06, |
| "loss": 0.6313, |
| "step": 34200 |
| }, |
| { |
| "epoch": 1.6153392660804664, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.6156990362998946e-06, |
| "loss": 0.6351, |
| "step": 34250 |
| }, |
| { |
| "epoch": 1.6176974298962112, |
| "grad_norm": 1.265625, |
| "learning_rate": 4.607838513417913e-06, |
| "loss": 0.6333, |
| "step": 34300 |
| }, |
| { |
| "epoch": 1.6200555937119563, |
| "grad_norm": 1.0859375, |
| "learning_rate": 4.599977990535931e-06, |
| "loss": 0.631, |
| "step": 34350 |
| }, |
| { |
| "epoch": 1.622413757527701, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.592117467653949e-06, |
| "loss": 0.6391, |
| "step": 34400 |
| }, |
| { |
| "epoch": 1.624771921343446, |
| "grad_norm": 1.3046875, |
| "learning_rate": 4.584256944771966e-06, |
| "loss": 0.6356, |
| "step": 34450 |
| }, |
| { |
| "epoch": 1.6271300851591908, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.576396421889984e-06, |
| "loss": 0.6414, |
| "step": 34500 |
| }, |
| { |
| "epoch": 1.6271300851591908, |
| "eval_loss": 0.6410160064697266, |
| "eval_runtime": 470.0441, |
| "eval_samples_per_second": 75.972, |
| "eval_steps_per_second": 37.986, |
| "step": 34500 |
| }, |
| { |
| "epoch": 1.6294882489749356, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.568535899008002e-06, |
| "loss": 0.6401, |
| "step": 34550 |
| }, |
| { |
| "epoch": 1.6318464127906807, |
| "grad_norm": 1.359375, |
| "learning_rate": 4.5606753761260206e-06, |
| "loss": 0.6384, |
| "step": 34600 |
| }, |
| { |
| "epoch": 1.6342045766064253, |
| "grad_norm": 1.5078125, |
| "learning_rate": 4.552814853244038e-06, |
| "loss": 0.6287, |
| "step": 34650 |
| }, |
| { |
| "epoch": 1.6365627404221703, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.544954330362056e-06, |
| "loss": 0.6348, |
| "step": 34700 |
| }, |
| { |
| "epoch": 1.6389209042379151, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.537093807480074e-06, |
| "loss": 0.6362, |
| "step": 34750 |
| }, |
| { |
| "epoch": 1.64127906805366, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.5292332845980915e-06, |
| "loss": 0.639, |
| "step": 34800 |
| }, |
| { |
| "epoch": 1.643637231869405, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.52137276171611e-06, |
| "loss": 0.6291, |
| "step": 34850 |
| }, |
| { |
| "epoch": 1.6459953956851496, |
| "grad_norm": 1.4765625, |
| "learning_rate": 4.513512238834127e-06, |
| "loss": 0.6169, |
| "step": 34900 |
| }, |
| { |
| "epoch": 1.6483535595008947, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.505651715952146e-06, |
| "loss": 0.6425, |
| "step": 34950 |
| }, |
| { |
| "epoch": 1.6507117233166395, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.497791193070163e-06, |
| "loss": 0.6439, |
| "step": 35000 |
| }, |
| { |
| "epoch": 1.6507117233166395, |
| "eval_loss": 0.6410257816314697, |
| "eval_runtime": 472.2674, |
| "eval_samples_per_second": 75.614, |
| "eval_steps_per_second": 37.807, |
| "step": 35000 |
| }, |
| { |
| "epoch": 1.6530698871323843, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.4899306701881816e-06, |
| "loss": 0.6505, |
| "step": 35050 |
| }, |
| { |
| "epoch": 1.6554280509481294, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.482070147306199e-06, |
| "loss": 0.6317, |
| "step": 35100 |
| }, |
| { |
| "epoch": 1.657786214763874, |
| "grad_norm": 1.28125, |
| "learning_rate": 4.4742096244242174e-06, |
| "loss": 0.6279, |
| "step": 35150 |
| }, |
| { |
| "epoch": 1.660144378579619, |
| "grad_norm": 1.640625, |
| "learning_rate": 4.466349101542235e-06, |
| "loss": 0.635, |
| "step": 35200 |
| }, |
| { |
| "epoch": 1.6625025423953639, |
| "grad_norm": 1.125, |
| "learning_rate": 4.4584885786602525e-06, |
| "loss": 0.6263, |
| "step": 35250 |
| }, |
| { |
| "epoch": 1.6648607062111087, |
| "grad_norm": 1.3515625, |
| "learning_rate": 4.450628055778271e-06, |
| "loss": 0.6381, |
| "step": 35300 |
| }, |
| { |
| "epoch": 1.6672188700268535, |
| "grad_norm": 1.2734375, |
| "learning_rate": 4.442767532896289e-06, |
| "loss": 0.643, |
| "step": 35350 |
| }, |
| { |
| "epoch": 1.6695770338425984, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.434907010014307e-06, |
| "loss": 0.6314, |
| "step": 35400 |
| }, |
| { |
| "epoch": 1.6719351976583434, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.427046487132324e-06, |
| "loss": 0.6557, |
| "step": 35450 |
| }, |
| { |
| "epoch": 1.6742933614740882, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.419185964250342e-06, |
| "loss": 0.6341, |
| "step": 35500 |
| }, |
| { |
| "epoch": 1.6742933614740882, |
| "eval_loss": 0.6410233378410339, |
| "eval_runtime": 469.7461, |
| "eval_samples_per_second": 76.02, |
| "eval_steps_per_second": 38.01, |
| "step": 35500 |
| }, |
| { |
| "epoch": 1.676651525289833, |
| "grad_norm": 1.25, |
| "learning_rate": 4.41132544136836e-06, |
| "loss": 0.6392, |
| "step": 35550 |
| }, |
| { |
| "epoch": 1.679009689105578, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.4034649184863785e-06, |
| "loss": 0.6355, |
| "step": 35600 |
| }, |
| { |
| "epoch": 1.6813678529213227, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.395604395604396e-06, |
| "loss": 0.6393, |
| "step": 35650 |
| }, |
| { |
| "epoch": 1.6837260167370678, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.3877438727224135e-06, |
| "loss": 0.6475, |
| "step": 35700 |
| }, |
| { |
| "epoch": 1.6860841805528124, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.379883349840432e-06, |
| "loss": 0.6545, |
| "step": 35750 |
| }, |
| { |
| "epoch": 1.6884423443685574, |
| "grad_norm": 1.3203125, |
| "learning_rate": 4.372022826958449e-06, |
| "loss": 0.641, |
| "step": 35800 |
| }, |
| { |
| "epoch": 1.6908005081843023, |
| "grad_norm": 1.3828125, |
| "learning_rate": 4.364162304076468e-06, |
| "loss": 0.6346, |
| "step": 35850 |
| }, |
| { |
| "epoch": 1.693158672000047, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.356301781194485e-06, |
| "loss": 0.637, |
| "step": 35900 |
| }, |
| { |
| "epoch": 1.6955168358157922, |
| "grad_norm": 1.515625, |
| "learning_rate": 4.348441258312504e-06, |
| "loss": 0.6352, |
| "step": 35950 |
| }, |
| { |
| "epoch": 1.6978749996315368, |
| "grad_norm": 1.296875, |
| "learning_rate": 4.340580735430521e-06, |
| "loss": 0.6302, |
| "step": 36000 |
| }, |
| { |
| "epoch": 1.6978749996315368, |
| "eval_loss": 0.6410489082336426, |
| "eval_runtime": 473.009, |
| "eval_samples_per_second": 75.495, |
| "eval_steps_per_second": 37.748, |
| "step": 36000 |
| }, |
| { |
| "epoch": 1.7002331634472818, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.332720212548539e-06, |
| "loss": 0.624, |
| "step": 36050 |
| }, |
| { |
| "epoch": 1.7025913272630266, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.324859689666557e-06, |
| "loss": 0.6359, |
| "step": 36100 |
| }, |
| { |
| "epoch": 1.7049494910787715, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.316999166784575e-06, |
| "loss": 0.6363, |
| "step": 36150 |
| }, |
| { |
| "epoch": 1.7073076548945165, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.309138643902593e-06, |
| "loss": 0.6505, |
| "step": 36200 |
| }, |
| { |
| "epoch": 1.7096658187102611, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.30127812102061e-06, |
| "loss": 0.6392, |
| "step": 36250 |
| }, |
| { |
| "epoch": 1.7120239825260062, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.293417598138629e-06, |
| "loss": 0.6396, |
| "step": 36300 |
| }, |
| { |
| "epoch": 1.714382146341751, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.285557075256646e-06, |
| "loss": 0.6521, |
| "step": 36350 |
| }, |
| { |
| "epoch": 1.7167403101574958, |
| "grad_norm": 1.3203125, |
| "learning_rate": 4.277696552374665e-06, |
| "loss": 0.6569, |
| "step": 36400 |
| }, |
| { |
| "epoch": 1.719098473973241, |
| "grad_norm": 1.2890625, |
| "learning_rate": 4.269836029492682e-06, |
| "loss": 0.6409, |
| "step": 36450 |
| }, |
| { |
| "epoch": 1.7214566377889855, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.2619755066107e-06, |
| "loss": 0.6459, |
| "step": 36500 |
| }, |
| { |
| "epoch": 1.7214566377889855, |
| "eval_loss": 0.6409702301025391, |
| "eval_runtime": 471.027, |
| "eval_samples_per_second": 75.813, |
| "eval_steps_per_second": 37.907, |
| "step": 36500 |
| }, |
| { |
| "epoch": 1.7238148016047306, |
| "grad_norm": 1.2890625, |
| "learning_rate": 4.254114983728718e-06, |
| "loss": 0.6402, |
| "step": 36550 |
| }, |
| { |
| "epoch": 1.7261729654204754, |
| "grad_norm": 1.3046875, |
| "learning_rate": 4.246254460846736e-06, |
| "loss": 0.6424, |
| "step": 36600 |
| }, |
| { |
| "epoch": 1.7285311292362202, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.238393937964754e-06, |
| "loss": 0.6386, |
| "step": 36650 |
| }, |
| { |
| "epoch": 1.730889293051965, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.230533415082771e-06, |
| "loss": 0.6396, |
| "step": 36700 |
| }, |
| { |
| "epoch": 1.7332474568677099, |
| "grad_norm": 1.5078125, |
| "learning_rate": 4.22267289220079e-06, |
| "loss": 0.6439, |
| "step": 36750 |
| }, |
| { |
| "epoch": 1.735605620683455, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.214812369318807e-06, |
| "loss": 0.6368, |
| "step": 36800 |
| }, |
| { |
| "epoch": 1.7379637844991997, |
| "grad_norm": 1.296875, |
| "learning_rate": 4.206951846436826e-06, |
| "loss": 0.6288, |
| "step": 36850 |
| }, |
| { |
| "epoch": 1.7403219483149446, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.199091323554843e-06, |
| "loss": 0.6338, |
| "step": 36900 |
| }, |
| { |
| "epoch": 1.7426801121306894, |
| "grad_norm": 1.28125, |
| "learning_rate": 4.1912308006728615e-06, |
| "loss": 0.6385, |
| "step": 36950 |
| }, |
| { |
| "epoch": 1.7450382759464342, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.183370277790879e-06, |
| "loss": 0.6429, |
| "step": 37000 |
| }, |
| { |
| "epoch": 1.7450382759464342, |
| "eval_loss": 0.6410494446754456, |
| "eval_runtime": 473.4591, |
| "eval_samples_per_second": 75.424, |
| "eval_steps_per_second": 37.712, |
| "step": 37000 |
| }, |
| { |
| "epoch": 1.7473964397621793, |
| "grad_norm": 1.3203125, |
| "learning_rate": 4.1755097549088965e-06, |
| "loss": 0.6357, |
| "step": 37050 |
| }, |
| { |
| "epoch": 1.749754603577924, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.167649232026915e-06, |
| "loss": 0.6364, |
| "step": 37100 |
| }, |
| { |
| "epoch": 1.752112767393669, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.159788709144933e-06, |
| "loss": 0.6325, |
| "step": 37150 |
| }, |
| { |
| "epoch": 1.7544709312094138, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.151928186262951e-06, |
| "loss": 0.6421, |
| "step": 37200 |
| }, |
| { |
| "epoch": 1.7568290950251586, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.144067663380968e-06, |
| "loss": 0.649, |
| "step": 37250 |
| }, |
| { |
| "epoch": 1.7591872588409037, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.136207140498986e-06, |
| "loss": 0.6391, |
| "step": 37300 |
| }, |
| { |
| "epoch": 1.7615454226566483, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.128346617617004e-06, |
| "loss": 0.6531, |
| "step": 37350 |
| }, |
| { |
| "epoch": 1.7639035864723933, |
| "grad_norm": 1.34375, |
| "learning_rate": 4.1204860947350225e-06, |
| "loss": 0.6343, |
| "step": 37400 |
| }, |
| { |
| "epoch": 1.7662617502881381, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.11262557185304e-06, |
| "loss": 0.6334, |
| "step": 37450 |
| }, |
| { |
| "epoch": 1.768619914103883, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.1047650489710575e-06, |
| "loss": 0.636, |
| "step": 37500 |
| }, |
| { |
| "epoch": 1.768619914103883, |
| "eval_loss": 0.6409830451011658, |
| "eval_runtime": 474.1329, |
| "eval_samples_per_second": 75.316, |
| "eval_steps_per_second": 37.658, |
| "step": 37500 |
| }, |
| { |
| "epoch": 1.770978077919628, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.096904526089076e-06, |
| "loss": 0.6304, |
| "step": 37550 |
| }, |
| { |
| "epoch": 1.7733362417353726, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.089044003207093e-06, |
| "loss": 0.6303, |
| "step": 37600 |
| }, |
| { |
| "epoch": 1.7756944055511177, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.081183480325112e-06, |
| "loss": 0.644, |
| "step": 37650 |
| }, |
| { |
| "epoch": 1.7780525693668625, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.073322957443129e-06, |
| "loss": 0.6532, |
| "step": 37700 |
| }, |
| { |
| "epoch": 1.7804107331826073, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.065462434561147e-06, |
| "loss": 0.6406, |
| "step": 37750 |
| }, |
| { |
| "epoch": 1.7827688969983524, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.057601911679165e-06, |
| "loss": 0.6399, |
| "step": 37800 |
| }, |
| { |
| "epoch": 1.785127060814097, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.0497413887971835e-06, |
| "loss": 0.6383, |
| "step": 37850 |
| }, |
| { |
| "epoch": 1.787485224629842, |
| "grad_norm": 1.265625, |
| "learning_rate": 4.041880865915201e-06, |
| "loss": 0.6403, |
| "step": 37900 |
| }, |
| { |
| "epoch": 1.7898433884455869, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.0340203430332186e-06, |
| "loss": 0.6376, |
| "step": 37950 |
| }, |
| { |
| "epoch": 1.7922015522613317, |
| "grad_norm": 1.125, |
| "learning_rate": 4.026159820151237e-06, |
| "loss": 0.624, |
| "step": 38000 |
| }, |
| { |
| "epoch": 1.7922015522613317, |
| "eval_loss": 0.6410136818885803, |
| "eval_runtime": 472.5539, |
| "eval_samples_per_second": 75.568, |
| "eval_steps_per_second": 37.784, |
| "step": 38000 |
| }, |
| { |
| "epoch": 1.7945597160770765, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.0182992972692544e-06, |
| "loss": 0.6402, |
| "step": 38050 |
| }, |
| { |
| "epoch": 1.7969178798928214, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.010438774387273e-06, |
| "loss": 0.6356, |
| "step": 38100 |
| }, |
| { |
| "epoch": 1.7992760437085664, |
| "grad_norm": 1.125, |
| "learning_rate": 4.00257825150529e-06, |
| "loss": 0.6471, |
| "step": 38150 |
| }, |
| { |
| "epoch": 1.8016342075243112, |
| "grad_norm": 1.171875, |
| "learning_rate": 3.994717728623309e-06, |
| "loss": 0.6409, |
| "step": 38200 |
| }, |
| { |
| "epoch": 1.803992371340056, |
| "grad_norm": 1.2421875, |
| "learning_rate": 3.986857205741326e-06, |
| "loss": 0.6336, |
| "step": 38250 |
| }, |
| { |
| "epoch": 1.806350535155801, |
| "grad_norm": 1.0625, |
| "learning_rate": 3.978996682859344e-06, |
| "loss": 0.6292, |
| "step": 38300 |
| }, |
| { |
| "epoch": 1.8087086989715457, |
| "grad_norm": 1.09375, |
| "learning_rate": 3.971136159977362e-06, |
| "loss": 0.6324, |
| "step": 38350 |
| }, |
| { |
| "epoch": 1.8110668627872908, |
| "grad_norm": 1.2265625, |
| "learning_rate": 3.96327563709538e-06, |
| "loss": 0.6348, |
| "step": 38400 |
| }, |
| { |
| "epoch": 1.8134250266030354, |
| "grad_norm": 1.1796875, |
| "learning_rate": 3.955415114213398e-06, |
| "loss": 0.6332, |
| "step": 38450 |
| }, |
| { |
| "epoch": 1.8157831904187804, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.9475545913314154e-06, |
| "loss": 0.6369, |
| "step": 38500 |
| }, |
| { |
| "epoch": 1.8157831904187804, |
| "eval_loss": 0.6410770416259766, |
| "eval_runtime": 473.3108, |
| "eval_samples_per_second": 75.447, |
| "eval_steps_per_second": 37.724, |
| "step": 38500 |
| }, |
| { |
| "epoch": 1.8181413542345253, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.939694068449433e-06, |
| "loss": 0.6299, |
| "step": 38550 |
| }, |
| { |
| "epoch": 1.82049951805027, |
| "grad_norm": 1.09375, |
| "learning_rate": 3.931833545567451e-06, |
| "loss": 0.6387, |
| "step": 38600 |
| }, |
| { |
| "epoch": 1.8228576818660152, |
| "grad_norm": 1.359375, |
| "learning_rate": 3.92397302268547e-06, |
| "loss": 0.6561, |
| "step": 38650 |
| }, |
| { |
| "epoch": 1.8252158456817598, |
| "grad_norm": 1.1640625, |
| "learning_rate": 3.916112499803487e-06, |
| "loss": 0.6377, |
| "step": 38700 |
| }, |
| { |
| "epoch": 1.8275740094975048, |
| "grad_norm": 1.0625, |
| "learning_rate": 3.908251976921505e-06, |
| "loss": 0.6281, |
| "step": 38750 |
| }, |
| { |
| "epoch": 1.8299321733132496, |
| "grad_norm": 1.265625, |
| "learning_rate": 3.900391454039523e-06, |
| "loss": 0.6379, |
| "step": 38800 |
| }, |
| { |
| "epoch": 1.8322903371289945, |
| "grad_norm": 1.265625, |
| "learning_rate": 3.892530931157541e-06, |
| "loss": 0.6401, |
| "step": 38850 |
| }, |
| { |
| "epoch": 1.8346485009447395, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.884670408275559e-06, |
| "loss": 0.6382, |
| "step": 38900 |
| }, |
| { |
| "epoch": 1.8370066647604841, |
| "grad_norm": 1.1796875, |
| "learning_rate": 3.8768098853935765e-06, |
| "loss": 0.6479, |
| "step": 38950 |
| }, |
| { |
| "epoch": 1.8393648285762292, |
| "grad_norm": 1.40625, |
| "learning_rate": 3.868949362511595e-06, |
| "loss": 0.6354, |
| "step": 39000 |
| }, |
| { |
| "epoch": 1.8393648285762292, |
| "eval_loss": 0.6409469246864319, |
| "eval_runtime": 471.1776, |
| "eval_samples_per_second": 75.789, |
| "eval_steps_per_second": 37.894, |
| "step": 39000 |
| }, |
| { |
| "epoch": 1.841722992391974, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.861088839629612e-06, |
| "loss": 0.6231, |
| "step": 39050 |
| }, |
| { |
| "epoch": 1.8440811562077188, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.853228316747631e-06, |
| "loss": 0.6389, |
| "step": 39100 |
| }, |
| { |
| "epoch": 1.846439320023464, |
| "grad_norm": 1.46875, |
| "learning_rate": 3.845367793865648e-06, |
| "loss": 0.6477, |
| "step": 39150 |
| }, |
| { |
| "epoch": 1.8487974838392085, |
| "grad_norm": 1.2578125, |
| "learning_rate": 3.8375072709836666e-06, |
| "loss": 0.6389, |
| "step": 39200 |
| }, |
| { |
| "epoch": 1.8511556476549536, |
| "grad_norm": 1.375, |
| "learning_rate": 3.829646748101684e-06, |
| "loss": 0.6398, |
| "step": 39250 |
| }, |
| { |
| "epoch": 1.8535138114706984, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.821786225219702e-06, |
| "loss": 0.6461, |
| "step": 39300 |
| }, |
| { |
| "epoch": 1.8558719752864432, |
| "grad_norm": 1.28125, |
| "learning_rate": 3.8139257023377195e-06, |
| "loss": 0.6365, |
| "step": 39350 |
| }, |
| { |
| "epoch": 1.858230139102188, |
| "grad_norm": 1.1796875, |
| "learning_rate": 3.806065179455738e-06, |
| "loss": 0.6285, |
| "step": 39400 |
| }, |
| { |
| "epoch": 1.8605883029179329, |
| "grad_norm": 1.25, |
| "learning_rate": 3.798204656573756e-06, |
| "loss": 0.6483, |
| "step": 39450 |
| }, |
| { |
| "epoch": 1.862946466733678, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.7903441336917733e-06, |
| "loss": 0.6471, |
| "step": 39500 |
| }, |
| { |
| "epoch": 1.862946466733678, |
| "eval_loss": 0.6410489082336426, |
| "eval_runtime": 470.662, |
| "eval_samples_per_second": 75.872, |
| "eval_steps_per_second": 37.936, |
| "step": 39500 |
| }, |
| { |
| "epoch": 1.8653046305494225, |
| "grad_norm": 1.1640625, |
| "learning_rate": 3.7824836108097913e-06, |
| "loss": 0.6416, |
| "step": 39550 |
| }, |
| { |
| "epoch": 1.8676627943651676, |
| "grad_norm": 1.1796875, |
| "learning_rate": 3.7746230879278096e-06, |
| "loss": 0.6312, |
| "step": 39600 |
| }, |
| { |
| "epoch": 1.8700209581809124, |
| "grad_norm": 1.34375, |
| "learning_rate": 3.766762565045827e-06, |
| "loss": 0.6358, |
| "step": 39650 |
| }, |
| { |
| "epoch": 1.8723791219966572, |
| "grad_norm": 1.3671875, |
| "learning_rate": 3.758902042163845e-06, |
| "loss": 0.6439, |
| "step": 39700 |
| }, |
| { |
| "epoch": 1.8747372858124023, |
| "grad_norm": 1.125, |
| "learning_rate": 3.7510415192818626e-06, |
| "loss": 0.632, |
| "step": 39750 |
| }, |
| { |
| "epoch": 1.877095449628147, |
| "grad_norm": 1.046875, |
| "learning_rate": 3.743180996399881e-06, |
| "loss": 0.6491, |
| "step": 39800 |
| }, |
| { |
| "epoch": 1.879453613443892, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.735320473517899e-06, |
| "loss": 0.6456, |
| "step": 39850 |
| }, |
| { |
| "epoch": 1.8818117772596368, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.7274599506359164e-06, |
| "loss": 0.6477, |
| "step": 39900 |
| }, |
| { |
| "epoch": 1.8841699410753816, |
| "grad_norm": 1.0859375, |
| "learning_rate": 3.7195994277539344e-06, |
| "loss": 0.6369, |
| "step": 39950 |
| }, |
| { |
| "epoch": 1.8865281048911267, |
| "grad_norm": 1.453125, |
| "learning_rate": 3.7117389048719527e-06, |
| "loss": 0.6444, |
| "step": 40000 |
| }, |
| { |
| "epoch": 1.8865281048911267, |
| "eval_loss": 0.6410700082778931, |
| "eval_runtime": 469.0067, |
| "eval_samples_per_second": 76.14, |
| "eval_steps_per_second": 38.07, |
| "step": 40000 |
| }, |
| { |
| "epoch": 1.8888862687068713, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.7038783819899702e-06, |
| "loss": 0.645, |
| "step": 40050 |
| }, |
| { |
| "epoch": 1.8912444325226163, |
| "grad_norm": 1.171875, |
| "learning_rate": 3.696017859107988e-06, |
| "loss": 0.6442, |
| "step": 40100 |
| }, |
| { |
| "epoch": 1.8936025963383611, |
| "grad_norm": 1.2421875, |
| "learning_rate": 3.688157336226006e-06, |
| "loss": 0.6434, |
| "step": 40150 |
| }, |
| { |
| "epoch": 1.895960760154106, |
| "grad_norm": 1.4375, |
| "learning_rate": 3.680296813344024e-06, |
| "loss": 0.6304, |
| "step": 40200 |
| }, |
| { |
| "epoch": 1.898318923969851, |
| "grad_norm": 1.1640625, |
| "learning_rate": 3.672436290462042e-06, |
| "loss": 0.658, |
| "step": 40250 |
| }, |
| { |
| "epoch": 1.9006770877855956, |
| "grad_norm": 1.1796875, |
| "learning_rate": 3.66457576758006e-06, |
| "loss": 0.6265, |
| "step": 40300 |
| }, |
| { |
| "epoch": 1.9030352516013407, |
| "grad_norm": 1.4296875, |
| "learning_rate": 3.6567152446980774e-06, |
| "loss": 0.6332, |
| "step": 40350 |
| }, |
| { |
| "epoch": 1.9053934154170855, |
| "grad_norm": 1.125, |
| "learning_rate": 3.6488547218160954e-06, |
| "loss": 0.638, |
| "step": 40400 |
| }, |
| { |
| "epoch": 1.9077515792328303, |
| "grad_norm": 1.171875, |
| "learning_rate": 3.6409941989341137e-06, |
| "loss": 0.6379, |
| "step": 40450 |
| }, |
| { |
| "epoch": 1.9101097430485754, |
| "grad_norm": 1.265625, |
| "learning_rate": 3.6331336760521312e-06, |
| "loss": 0.6283, |
| "step": 40500 |
| }, |
| { |
| "epoch": 1.9101097430485754, |
| "eval_loss": 0.641033411026001, |
| "eval_runtime": 468.7258, |
| "eval_samples_per_second": 76.185, |
| "eval_steps_per_second": 38.093, |
| "step": 40500 |
| }, |
| { |
| "epoch": 1.91246790686432, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.625273153170149e-06, |
| "loss": 0.6358, |
| "step": 40550 |
| }, |
| { |
| "epoch": 1.914826070680065, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.6174126302881667e-06, |
| "loss": 0.6464, |
| "step": 40600 |
| }, |
| { |
| "epoch": 1.9171842344958099, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.609552107406185e-06, |
| "loss": 0.6384, |
| "step": 40650 |
| }, |
| { |
| "epoch": 1.9195423983115547, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.601691584524203e-06, |
| "loss": 0.6381, |
| "step": 40700 |
| }, |
| { |
| "epoch": 1.9219005621272995, |
| "grad_norm": 1.5, |
| "learning_rate": 3.5938310616422205e-06, |
| "loss": 0.6437, |
| "step": 40750 |
| }, |
| { |
| "epoch": 1.9242587259430444, |
| "grad_norm": 1.1796875, |
| "learning_rate": 3.5859705387602384e-06, |
| "loss": 0.643, |
| "step": 40800 |
| }, |
| { |
| "epoch": 1.9266168897587894, |
| "grad_norm": 1.40625, |
| "learning_rate": 3.578110015878257e-06, |
| "loss": 0.6365, |
| "step": 40850 |
| }, |
| { |
| "epoch": 1.928975053574534, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.5702494929962743e-06, |
| "loss": 0.6364, |
| "step": 40900 |
| }, |
| { |
| "epoch": 1.931333217390279, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.5623889701142923e-06, |
| "loss": 0.6417, |
| "step": 40950 |
| }, |
| { |
| "epoch": 1.933691381206024, |
| "grad_norm": 1.2421875, |
| "learning_rate": 3.5545284472323098e-06, |
| "loss": 0.6433, |
| "step": 41000 |
| }, |
| { |
| "epoch": 1.933691381206024, |
| "eval_loss": 0.6409995555877686, |
| "eval_runtime": 469.2679, |
| "eval_samples_per_second": 76.097, |
| "eval_steps_per_second": 38.049, |
| "step": 41000 |
| }, |
| { |
| "epoch": 1.9360495450217687, |
| "grad_norm": 1.390625, |
| "learning_rate": 3.546667924350328e-06, |
| "loss": 0.6409, |
| "step": 41050 |
| }, |
| { |
| "epoch": 1.9384077088375138, |
| "grad_norm": 1.3125, |
| "learning_rate": 3.538807401468346e-06, |
| "loss": 0.6393, |
| "step": 41100 |
| }, |
| { |
| "epoch": 1.9407658726532584, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.5309468785863636e-06, |
| "loss": 0.6403, |
| "step": 41150 |
| }, |
| { |
| "epoch": 1.9431240364690034, |
| "grad_norm": 1.34375, |
| "learning_rate": 3.5230863557043815e-06, |
| "loss": 0.6353, |
| "step": 41200 |
| }, |
| { |
| "epoch": 1.9454822002847483, |
| "grad_norm": 1.046875, |
| "learning_rate": 3.5152258328224e-06, |
| "loss": 0.6287, |
| "step": 41250 |
| }, |
| { |
| "epoch": 1.947840364100493, |
| "grad_norm": 1.2734375, |
| "learning_rate": 3.5073653099404174e-06, |
| "loss": 0.6329, |
| "step": 41300 |
| }, |
| { |
| "epoch": 1.9501985279162382, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.4995047870584353e-06, |
| "loss": 0.6449, |
| "step": 41350 |
| }, |
| { |
| "epoch": 1.9525566917319828, |
| "grad_norm": 1.1796875, |
| "learning_rate": 3.4916442641764533e-06, |
| "loss": 0.6399, |
| "step": 41400 |
| }, |
| { |
| "epoch": 1.9549148555477278, |
| "grad_norm": 1.1484375, |
| "learning_rate": 3.4837837412944712e-06, |
| "loss": 0.6449, |
| "step": 41450 |
| }, |
| { |
| "epoch": 1.9572730193634726, |
| "grad_norm": 1.4453125, |
| "learning_rate": 3.475923218412489e-06, |
| "loss": 0.6499, |
| "step": 41500 |
| }, |
| { |
| "epoch": 1.9572730193634726, |
| "eval_loss": 0.6410717964172363, |
| "eval_runtime": 469.3759, |
| "eval_samples_per_second": 76.08, |
| "eval_steps_per_second": 38.04, |
| "step": 41500 |
| }, |
| { |
| "epoch": 1.9596311831792175, |
| "grad_norm": 1.203125, |
| "learning_rate": 3.468062695530507e-06, |
| "loss": 0.6426, |
| "step": 41550 |
| }, |
| { |
| "epoch": 1.9619893469949625, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.4602021726485246e-06, |
| "loss": 0.6327, |
| "step": 41600 |
| }, |
| { |
| "epoch": 1.9643475108107071, |
| "grad_norm": 1.1953125, |
| "learning_rate": 3.452341649766543e-06, |
| "loss": 0.6491, |
| "step": 41650 |
| }, |
| { |
| "epoch": 1.9667056746264522, |
| "grad_norm": 1.1875, |
| "learning_rate": 3.444481126884561e-06, |
| "loss": 0.6416, |
| "step": 41700 |
| }, |
| { |
| "epoch": 1.969063838442197, |
| "grad_norm": 1.25, |
| "learning_rate": 3.4366206040025784e-06, |
| "loss": 0.6597, |
| "step": 41750 |
| }, |
| { |
| "epoch": 1.9714220022579418, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.4287600811205964e-06, |
| "loss": 0.6347, |
| "step": 41800 |
| }, |
| { |
| "epoch": 1.9737801660736867, |
| "grad_norm": 1.1328125, |
| "learning_rate": 3.4208995582386147e-06, |
| "loss": 0.6337, |
| "step": 41850 |
| }, |
| { |
| "epoch": 1.9761383298894315, |
| "grad_norm": 1.140625, |
| "learning_rate": 3.4130390353566322e-06, |
| "loss": 0.6446, |
| "step": 41900 |
| }, |
| { |
| "epoch": 1.9784964937051766, |
| "grad_norm": 1.265625, |
| "learning_rate": 3.40517851247465e-06, |
| "loss": 0.6355, |
| "step": 41950 |
| }, |
| { |
| "epoch": 1.9808546575209214, |
| "grad_norm": 1.25, |
| "learning_rate": 3.3973179895926677e-06, |
| "loss": 0.6317, |
| "step": 42000 |
| }, |
| { |
| "epoch": 1.9808546575209214, |
| "eval_loss": 0.6410444378852844, |
| "eval_runtime": 470.2222, |
| "eval_samples_per_second": 75.943, |
| "eval_steps_per_second": 37.971, |
| "step": 42000 |
| }, |
| { |
| "epoch": 1.9832128213366662, |
| "grad_norm": 1.3046875, |
| "learning_rate": 3.389457466710686e-06, |
| "loss": 0.6303, |
| "step": 42050 |
| }, |
| { |
| "epoch": 1.985570985152411, |
| "grad_norm": 1.34375, |
| "learning_rate": 3.381596943828704e-06, |
| "loss": 0.6429, |
| "step": 42100 |
| }, |
| { |
| "epoch": 1.9879291489681559, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.3737364209467215e-06, |
| "loss": 0.6477, |
| "step": 42150 |
| }, |
| { |
| "epoch": 1.990287312783901, |
| "grad_norm": 1.28125, |
| "learning_rate": 3.3658758980647394e-06, |
| "loss": 0.6388, |
| "step": 42200 |
| }, |
| { |
| "epoch": 1.9926454765996455, |
| "grad_norm": 1.203125, |
| "learning_rate": 3.3580153751827578e-06, |
| "loss": 0.6322, |
| "step": 42250 |
| }, |
| { |
| "epoch": 1.9950036404153906, |
| "grad_norm": 1.1328125, |
| "learning_rate": 3.3501548523007753e-06, |
| "loss": 0.6404, |
| "step": 42300 |
| }, |
| { |
| "epoch": 1.9973618042311354, |
| "grad_norm": 1.234375, |
| "learning_rate": 3.3422943294187932e-06, |
| "loss": 0.6281, |
| "step": 42350 |
| }, |
| { |
| "epoch": 1.9997199680468802, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.3344338065368108e-06, |
| "loss": 0.6563, |
| "step": 42400 |
| }, |
| { |
| "epoch": 2.0020751841578557, |
| "grad_norm": 1.125, |
| "learning_rate": 3.326573283654829e-06, |
| "loss": 0.633, |
| "step": 42450 |
| }, |
| { |
| "epoch": 2.0044333479736003, |
| "grad_norm": 0.984375, |
| "learning_rate": 3.318712760772847e-06, |
| "loss": 0.6426, |
| "step": 42500 |
| }, |
| { |
| "epoch": 2.0044333479736003, |
| "eval_loss": 0.6410335898399353, |
| "eval_runtime": 469.6391, |
| "eval_samples_per_second": 76.037, |
| "eval_steps_per_second": 38.019, |
| "step": 42500 |
| }, |
| { |
| "epoch": 2.0067915117893453, |
| "grad_norm": 1.0, |
| "learning_rate": 3.3108522378908646e-06, |
| "loss": 0.6385, |
| "step": 42550 |
| }, |
| { |
| "epoch": 2.00914967560509, |
| "grad_norm": 1.0546875, |
| "learning_rate": 3.3029917150088825e-06, |
| "loss": 0.6285, |
| "step": 42600 |
| }, |
| { |
| "epoch": 2.011507839420835, |
| "grad_norm": 1.03125, |
| "learning_rate": 3.295131192126901e-06, |
| "loss": 0.636, |
| "step": 42650 |
| }, |
| { |
| "epoch": 2.01386600323658, |
| "grad_norm": 1.0625, |
| "learning_rate": 3.2872706692449184e-06, |
| "loss": 0.6265, |
| "step": 42700 |
| }, |
| { |
| "epoch": 2.0162241670523247, |
| "grad_norm": 1.0, |
| "learning_rate": 3.2794101463629363e-06, |
| "loss": 0.6338, |
| "step": 42750 |
| }, |
| { |
| "epoch": 2.0185823308680697, |
| "grad_norm": 1.078125, |
| "learning_rate": 3.2715496234809543e-06, |
| "loss": 0.6408, |
| "step": 42800 |
| }, |
| { |
| "epoch": 2.0209404946838143, |
| "grad_norm": 1.109375, |
| "learning_rate": 3.263689100598972e-06, |
| "loss": 0.6398, |
| "step": 42850 |
| }, |
| { |
| "epoch": 2.0232986584995594, |
| "grad_norm": 1.0546875, |
| "learning_rate": 3.25582857771699e-06, |
| "loss": 0.6332, |
| "step": 42900 |
| }, |
| { |
| "epoch": 2.025656822315304, |
| "grad_norm": 1.1328125, |
| "learning_rate": 3.247968054835008e-06, |
| "loss": 0.6408, |
| "step": 42950 |
| }, |
| { |
| "epoch": 2.028014986131049, |
| "grad_norm": 1.1875, |
| "learning_rate": 3.2401075319530256e-06, |
| "loss": 0.6297, |
| "step": 43000 |
| }, |
| { |
| "epoch": 2.028014986131049, |
| "eval_loss": 0.6411005854606628, |
| "eval_runtime": 473.6412, |
| "eval_samples_per_second": 75.395, |
| "eval_steps_per_second": 37.697, |
| "step": 43000 |
| }, |
| { |
| "epoch": 2.030373149946794, |
| "grad_norm": 0.9765625, |
| "learning_rate": 3.232247009071044e-06, |
| "loss": 0.6486, |
| "step": 43050 |
| }, |
| { |
| "epoch": 2.0327313137625387, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.224386486189062e-06, |
| "loss": 0.6387, |
| "step": 43100 |
| }, |
| { |
| "epoch": 2.0350894775782837, |
| "grad_norm": 1.1015625, |
| "learning_rate": 3.2165259633070794e-06, |
| "loss": 0.642, |
| "step": 43150 |
| }, |
| { |
| "epoch": 2.0374476413940283, |
| "grad_norm": 1.0234375, |
| "learning_rate": 3.2086654404250973e-06, |
| "loss": 0.6331, |
| "step": 43200 |
| }, |
| { |
| "epoch": 2.0398058052097734, |
| "grad_norm": 1.0390625, |
| "learning_rate": 3.200804917543115e-06, |
| "loss": 0.6429, |
| "step": 43250 |
| }, |
| { |
| "epoch": 2.0421639690255184, |
| "grad_norm": 0.97265625, |
| "learning_rate": 3.192944394661133e-06, |
| "loss": 0.6319, |
| "step": 43300 |
| }, |
| { |
| "epoch": 2.044522132841263, |
| "grad_norm": 0.98046875, |
| "learning_rate": 3.185083871779151e-06, |
| "loss": 0.629, |
| "step": 43350 |
| }, |
| { |
| "epoch": 2.046880296657008, |
| "grad_norm": 1.2421875, |
| "learning_rate": 3.1772233488971687e-06, |
| "loss": 0.6327, |
| "step": 43400 |
| }, |
| { |
| "epoch": 2.0492384604727527, |
| "grad_norm": 1.078125, |
| "learning_rate": 3.1693628260151866e-06, |
| "loss": 0.6462, |
| "step": 43450 |
| }, |
| { |
| "epoch": 2.0515966242884978, |
| "grad_norm": 0.98046875, |
| "learning_rate": 3.161502303133205e-06, |
| "loss": 0.6433, |
| "step": 43500 |
| }, |
| { |
| "epoch": 2.0515966242884978, |
| "eval_loss": 0.6410638689994812, |
| "eval_runtime": 473.3603, |
| "eval_samples_per_second": 75.439, |
| "eval_steps_per_second": 37.72, |
| "step": 43500 |
| }, |
| { |
| "epoch": 2.053954788104243, |
| "grad_norm": 1.0859375, |
| "learning_rate": 3.1536417802512225e-06, |
| "loss": 0.6416, |
| "step": 43550 |
| }, |
| { |
| "epoch": 2.0563129519199874, |
| "grad_norm": 1.0390625, |
| "learning_rate": 3.1457812573692404e-06, |
| "loss": 0.645, |
| "step": 43600 |
| }, |
| { |
| "epoch": 2.0586711157357325, |
| "grad_norm": 0.9921875, |
| "learning_rate": 3.137920734487258e-06, |
| "loss": 0.634, |
| "step": 43650 |
| }, |
| { |
| "epoch": 2.061029279551477, |
| "grad_norm": 0.95703125, |
| "learning_rate": 3.1300602116052763e-06, |
| "loss": 0.642, |
| "step": 43700 |
| }, |
| { |
| "epoch": 2.063387443367222, |
| "grad_norm": 1.203125, |
| "learning_rate": 3.1221996887232942e-06, |
| "loss": 0.6184, |
| "step": 43750 |
| }, |
| { |
| "epoch": 2.065745607182967, |
| "grad_norm": 1.1171875, |
| "learning_rate": 3.1143391658413117e-06, |
| "loss": 0.6444, |
| "step": 43800 |
| }, |
| { |
| "epoch": 2.068103770998712, |
| "grad_norm": 1.125, |
| "learning_rate": 3.1064786429593297e-06, |
| "loss": 0.6347, |
| "step": 43850 |
| }, |
| { |
| "epoch": 2.070461934814457, |
| "grad_norm": 1.046875, |
| "learning_rate": 3.098618120077348e-06, |
| "loss": 0.6416, |
| "step": 43900 |
| }, |
| { |
| "epoch": 2.0728200986302014, |
| "grad_norm": 1.125, |
| "learning_rate": 3.0907575971953655e-06, |
| "loss": 0.6464, |
| "step": 43950 |
| }, |
| { |
| "epoch": 2.0751782624459465, |
| "grad_norm": 1.2265625, |
| "learning_rate": 3.0828970743133835e-06, |
| "loss": 0.6362, |
| "step": 44000 |
| }, |
| { |
| "epoch": 2.0751782624459465, |
| "eval_loss": 0.6410983204841614, |
| "eval_runtime": 470.4002, |
| "eval_samples_per_second": 75.914, |
| "eval_steps_per_second": 37.957, |
| "step": 44000 |
| }, |
| { |
| "epoch": 2.0775364262616915, |
| "grad_norm": 1.2265625, |
| "learning_rate": 3.0750365514314014e-06, |
| "loss": 0.639, |
| "step": 44050 |
| }, |
| { |
| "epoch": 2.079894590077436, |
| "grad_norm": 1.1875, |
| "learning_rate": 3.0671760285494194e-06, |
| "loss": 0.6321, |
| "step": 44100 |
| }, |
| { |
| "epoch": 2.082252753893181, |
| "grad_norm": 1.1171875, |
| "learning_rate": 3.0593155056674373e-06, |
| "loss": 0.6321, |
| "step": 44150 |
| }, |
| { |
| "epoch": 2.084610917708926, |
| "grad_norm": 1.203125, |
| "learning_rate": 3.0514549827854552e-06, |
| "loss": 0.6378, |
| "step": 44200 |
| }, |
| { |
| "epoch": 2.086969081524671, |
| "grad_norm": 0.96484375, |
| "learning_rate": 3.0435944599034727e-06, |
| "loss": 0.6315, |
| "step": 44250 |
| }, |
| { |
| "epoch": 2.0893272453404155, |
| "grad_norm": 1.03125, |
| "learning_rate": 3.035733937021491e-06, |
| "loss": 0.6456, |
| "step": 44300 |
| }, |
| { |
| "epoch": 2.0916854091561605, |
| "grad_norm": 1.140625, |
| "learning_rate": 3.027873414139509e-06, |
| "loss": 0.6438, |
| "step": 44350 |
| }, |
| { |
| "epoch": 2.0940435729719056, |
| "grad_norm": 0.984375, |
| "learning_rate": 3.0200128912575266e-06, |
| "loss": 0.6345, |
| "step": 44400 |
| }, |
| { |
| "epoch": 2.09640173678765, |
| "grad_norm": 0.921875, |
| "learning_rate": 3.0121523683755445e-06, |
| "loss": 0.658, |
| "step": 44450 |
| }, |
| { |
| "epoch": 2.0987599006033952, |
| "grad_norm": 0.98828125, |
| "learning_rate": 3.004291845493563e-06, |
| "loss": 0.6351, |
| "step": 44500 |
| }, |
| { |
| "epoch": 2.0987599006033952, |
| "eval_loss": 0.6411724090576172, |
| "eval_runtime": 471.2456, |
| "eval_samples_per_second": 75.778, |
| "eval_steps_per_second": 37.889, |
| "step": 44500 |
| }, |
| { |
| "epoch": 2.10111806441914, |
| "grad_norm": 1.109375, |
| "learning_rate": 2.9964313226115804e-06, |
| "loss": 0.6249, |
| "step": 44550 |
| }, |
| { |
| "epoch": 2.103476228234885, |
| "grad_norm": 1.1171875, |
| "learning_rate": 2.9885707997295983e-06, |
| "loss": 0.6377, |
| "step": 44600 |
| }, |
| { |
| "epoch": 2.10583439205063, |
| "grad_norm": 1.1328125, |
| "learning_rate": 2.980710276847616e-06, |
| "loss": 0.6466, |
| "step": 44650 |
| }, |
| { |
| "epoch": 2.1081925558663746, |
| "grad_norm": 1.1328125, |
| "learning_rate": 2.972849753965634e-06, |
| "loss": 0.6393, |
| "step": 44700 |
| }, |
| { |
| "epoch": 2.1105507196821196, |
| "grad_norm": 1.1015625, |
| "learning_rate": 2.964989231083652e-06, |
| "loss": 0.6548, |
| "step": 44750 |
| }, |
| { |
| "epoch": 2.112908883497864, |
| "grad_norm": 1.0625, |
| "learning_rate": 2.9571287082016696e-06, |
| "loss": 0.6414, |
| "step": 44800 |
| }, |
| { |
| "epoch": 2.1152670473136093, |
| "grad_norm": 1.0, |
| "learning_rate": 2.9492681853196876e-06, |
| "loss": 0.6384, |
| "step": 44850 |
| }, |
| { |
| "epoch": 2.1176252111293543, |
| "grad_norm": 1.0625, |
| "learning_rate": 2.941407662437706e-06, |
| "loss": 0.6432, |
| "step": 44900 |
| }, |
| { |
| "epoch": 2.119983374945099, |
| "grad_norm": 1.0078125, |
| "learning_rate": 2.9335471395557234e-06, |
| "loss": 0.6463, |
| "step": 44950 |
| }, |
| { |
| "epoch": 2.122341538760844, |
| "grad_norm": 0.98046875, |
| "learning_rate": 2.9256866166737414e-06, |
| "loss": 0.6403, |
| "step": 45000 |
| }, |
| { |
| "epoch": 2.122341538760844, |
| "eval_loss": 0.6410689353942871, |
| "eval_runtime": 470.4062, |
| "eval_samples_per_second": 75.913, |
| "eval_steps_per_second": 37.957, |
| "step": 45000 |
| }, |
| { |
| "epoch": 2.1246997025765886, |
| "grad_norm": 1.0, |
| "learning_rate": 2.917826093791759e-06, |
| "loss": 0.6389, |
| "step": 45050 |
| }, |
| { |
| "epoch": 2.1270578663923336, |
| "grad_norm": 0.96484375, |
| "learning_rate": 2.9099655709097773e-06, |
| "loss": 0.637, |
| "step": 45100 |
| }, |
| { |
| "epoch": 2.1294160302080787, |
| "grad_norm": 0.91796875, |
| "learning_rate": 2.902105048027795e-06, |
| "loss": 0.6278, |
| "step": 45150 |
| }, |
| { |
| "epoch": 2.1317741940238233, |
| "grad_norm": 1.234375, |
| "learning_rate": 2.8942445251458127e-06, |
| "loss": 0.643, |
| "step": 45200 |
| }, |
| { |
| "epoch": 2.1341323578395683, |
| "grad_norm": 0.96875, |
| "learning_rate": 2.8863840022638306e-06, |
| "loss": 0.6384, |
| "step": 45250 |
| }, |
| { |
| "epoch": 2.136490521655313, |
| "grad_norm": 1.1796875, |
| "learning_rate": 2.878523479381849e-06, |
| "loss": 0.6393, |
| "step": 45300 |
| }, |
| { |
| "epoch": 2.138848685471058, |
| "grad_norm": 0.97265625, |
| "learning_rate": 2.8706629564998665e-06, |
| "loss": 0.6413, |
| "step": 45350 |
| }, |
| { |
| "epoch": 2.141206849286803, |
| "grad_norm": 1.03125, |
| "learning_rate": 2.8628024336178845e-06, |
| "loss": 0.6533, |
| "step": 45400 |
| }, |
| { |
| "epoch": 2.1435650131025477, |
| "grad_norm": 1.2734375, |
| "learning_rate": 2.8549419107359024e-06, |
| "loss": 0.6509, |
| "step": 45450 |
| }, |
| { |
| "epoch": 2.1459231769182927, |
| "grad_norm": 0.91015625, |
| "learning_rate": 2.8470813878539203e-06, |
| "loss": 0.6382, |
| "step": 45500 |
| }, |
| { |
| "epoch": 2.1459231769182927, |
| "eval_loss": 0.6410402059555054, |
| "eval_runtime": 473.4485, |
| "eval_samples_per_second": 75.425, |
| "eval_steps_per_second": 37.713, |
| "step": 45500 |
| }, |
| { |
| "epoch": 2.1482813407340373, |
| "grad_norm": 1.0859375, |
| "learning_rate": 2.8392208649719383e-06, |
| "loss": 0.6198, |
| "step": 45550 |
| }, |
| { |
| "epoch": 2.1506395045497824, |
| "grad_norm": 0.99609375, |
| "learning_rate": 2.831360342089956e-06, |
| "loss": 0.6359, |
| "step": 45600 |
| }, |
| { |
| "epoch": 2.1529976683655274, |
| "grad_norm": 1.046875, |
| "learning_rate": 2.8234998192079737e-06, |
| "loss": 0.6353, |
| "step": 45650 |
| }, |
| { |
| "epoch": 2.155355832181272, |
| "grad_norm": 1.1171875, |
| "learning_rate": 2.815639296325992e-06, |
| "loss": 0.6479, |
| "step": 45700 |
| }, |
| { |
| "epoch": 2.157713995997017, |
| "grad_norm": 1.0859375, |
| "learning_rate": 2.80777877344401e-06, |
| "loss": 0.6416, |
| "step": 45750 |
| }, |
| { |
| "epoch": 2.1600721598127617, |
| "grad_norm": 1.0390625, |
| "learning_rate": 2.7999182505620275e-06, |
| "loss": 0.6517, |
| "step": 45800 |
| }, |
| { |
| "epoch": 2.1624303236285067, |
| "grad_norm": 1.2109375, |
| "learning_rate": 2.7920577276800455e-06, |
| "loss": 0.6443, |
| "step": 45850 |
| }, |
| { |
| "epoch": 2.1647884874442513, |
| "grad_norm": 0.91796875, |
| "learning_rate": 2.784197204798064e-06, |
| "loss": 0.6422, |
| "step": 45900 |
| }, |
| { |
| "epoch": 2.1671466512599964, |
| "grad_norm": 0.92578125, |
| "learning_rate": 2.7763366819160813e-06, |
| "loss": 0.6313, |
| "step": 45950 |
| }, |
| { |
| "epoch": 2.1695048150757414, |
| "grad_norm": 1.1875, |
| "learning_rate": 2.7684761590340993e-06, |
| "loss": 0.6271, |
| "step": 46000 |
| }, |
| { |
| "epoch": 2.1695048150757414, |
| "eval_loss": 0.6410887837409973, |
| "eval_runtime": 473.9298, |
| "eval_samples_per_second": 75.349, |
| "eval_steps_per_second": 37.674, |
| "step": 46000 |
| }, |
| { |
| "epoch": 2.171862978891486, |
| "grad_norm": 1.0859375, |
| "learning_rate": 2.760615636152117e-06, |
| "loss": 0.6376, |
| "step": 46050 |
| }, |
| { |
| "epoch": 2.174221142707231, |
| "grad_norm": 1.0390625, |
| "learning_rate": 2.752755113270135e-06, |
| "loss": 0.6308, |
| "step": 46100 |
| }, |
| { |
| "epoch": 2.1765793065229757, |
| "grad_norm": 1.171875, |
| "learning_rate": 2.744894590388153e-06, |
| "loss": 0.646, |
| "step": 46150 |
| }, |
| { |
| "epoch": 2.1789374703387208, |
| "grad_norm": 1.0078125, |
| "learning_rate": 2.7370340675061706e-06, |
| "loss": 0.6353, |
| "step": 46200 |
| }, |
| { |
| "epoch": 2.181295634154466, |
| "grad_norm": 0.9375, |
| "learning_rate": 2.7291735446241885e-06, |
| "loss": 0.6365, |
| "step": 46250 |
| }, |
| { |
| "epoch": 2.1836537979702104, |
| "grad_norm": 1.1875, |
| "learning_rate": 2.721313021742206e-06, |
| "loss": 0.6357, |
| "step": 46300 |
| }, |
| { |
| "epoch": 2.1860119617859555, |
| "grad_norm": 1.109375, |
| "learning_rate": 2.7134524988602244e-06, |
| "loss": 0.6526, |
| "step": 46350 |
| }, |
| { |
| "epoch": 2.1883701256017, |
| "grad_norm": 1.03125, |
| "learning_rate": 2.7055919759782424e-06, |
| "loss": 0.6355, |
| "step": 46400 |
| }, |
| { |
| "epoch": 2.190728289417445, |
| "grad_norm": 0.97265625, |
| "learning_rate": 2.69773145309626e-06, |
| "loss": 0.6359, |
| "step": 46450 |
| }, |
| { |
| "epoch": 2.19308645323319, |
| "grad_norm": 0.890625, |
| "learning_rate": 2.689870930214278e-06, |
| "loss": 0.6347, |
| "step": 46500 |
| }, |
| { |
| "epoch": 2.19308645323319, |
| "eval_loss": 0.6410621404647827, |
| "eval_runtime": 476.2372, |
| "eval_samples_per_second": 74.984, |
| "eval_steps_per_second": 37.492, |
| "step": 46500 |
| }, |
| { |
| "epoch": 2.195444617048935, |
| "grad_norm": 0.89453125, |
| "learning_rate": 2.682010407332296e-06, |
| "loss": 0.6262, |
| "step": 46550 |
| }, |
| { |
| "epoch": 2.19780278086468, |
| "grad_norm": 1.0703125, |
| "learning_rate": 2.6741498844503137e-06, |
| "loss": 0.6399, |
| "step": 46600 |
| }, |
| { |
| "epoch": 2.2001609446804244, |
| "grad_norm": 1.1875, |
| "learning_rate": 2.6662893615683316e-06, |
| "loss": 0.6413, |
| "step": 46650 |
| }, |
| { |
| "epoch": 2.2025191084961695, |
| "grad_norm": 0.91015625, |
| "learning_rate": 2.6584288386863496e-06, |
| "loss": 0.6292, |
| "step": 46700 |
| }, |
| { |
| "epoch": 2.204877272311914, |
| "grad_norm": 1.0625, |
| "learning_rate": 2.6505683158043675e-06, |
| "loss": 0.6372, |
| "step": 46750 |
| }, |
| { |
| "epoch": 2.207235436127659, |
| "grad_norm": 1.1171875, |
| "learning_rate": 2.6427077929223854e-06, |
| "loss": 0.6505, |
| "step": 46800 |
| }, |
| { |
| "epoch": 2.209593599943404, |
| "grad_norm": 1.140625, |
| "learning_rate": 2.6348472700404034e-06, |
| "loss": 0.6359, |
| "step": 46850 |
| }, |
| { |
| "epoch": 2.211951763759149, |
| "grad_norm": 1.1328125, |
| "learning_rate": 2.626986747158421e-06, |
| "loss": 0.6359, |
| "step": 46900 |
| }, |
| { |
| "epoch": 2.214309927574894, |
| "grad_norm": 1.0703125, |
| "learning_rate": 2.6191262242764392e-06, |
| "loss": 0.6248, |
| "step": 46950 |
| }, |
| { |
| "epoch": 2.2166680913906385, |
| "grad_norm": 1.015625, |
| "learning_rate": 2.611265701394457e-06, |
| "loss": 0.6413, |
| "step": 47000 |
| }, |
| { |
| "epoch": 2.2166680913906385, |
| "eval_loss": 0.6410422921180725, |
| "eval_runtime": 470.9362, |
| "eval_samples_per_second": 75.828, |
| "eval_steps_per_second": 37.914, |
| "step": 47000 |
| }, |
| { |
| "epoch": 2.2190262552063835, |
| "grad_norm": 0.99609375, |
| "learning_rate": 2.6034051785124747e-06, |
| "loss": 0.6364, |
| "step": 47050 |
| }, |
| { |
| "epoch": 2.2213844190221286, |
| "grad_norm": 1.15625, |
| "learning_rate": 2.5955446556304926e-06, |
| "loss": 0.6298, |
| "step": 47100 |
| }, |
| { |
| "epoch": 2.223742582837873, |
| "grad_norm": 1.03125, |
| "learning_rate": 2.587684132748511e-06, |
| "loss": 0.6295, |
| "step": 47150 |
| }, |
| { |
| "epoch": 2.2261007466536182, |
| "grad_norm": 1.0625, |
| "learning_rate": 2.5798236098665285e-06, |
| "loss": 0.6258, |
| "step": 47200 |
| }, |
| { |
| "epoch": 2.228458910469363, |
| "grad_norm": 1.0234375, |
| "learning_rate": 2.5719630869845465e-06, |
| "loss": 0.6404, |
| "step": 47250 |
| }, |
| { |
| "epoch": 2.230817074285108, |
| "grad_norm": 1.0703125, |
| "learning_rate": 2.564102564102564e-06, |
| "loss": 0.6373, |
| "step": 47300 |
| }, |
| { |
| "epoch": 2.233175238100853, |
| "grad_norm": 1.203125, |
| "learning_rate": 2.5562420412205823e-06, |
| "loss": 0.6458, |
| "step": 47350 |
| }, |
| { |
| "epoch": 2.2355334019165976, |
| "grad_norm": 0.99609375, |
| "learning_rate": 2.5483815183386003e-06, |
| "loss": 0.6477, |
| "step": 47400 |
| }, |
| { |
| "epoch": 2.2378915657323426, |
| "grad_norm": 1.1484375, |
| "learning_rate": 2.5405209954566178e-06, |
| "loss": 0.6422, |
| "step": 47450 |
| }, |
| { |
| "epoch": 2.240249729548087, |
| "grad_norm": 1.015625, |
| "learning_rate": 2.5326604725746357e-06, |
| "loss": 0.6331, |
| "step": 47500 |
| }, |
| { |
| "epoch": 2.240249729548087, |
| "eval_loss": 0.6410676836967468, |
| "eval_runtime": 471.372, |
| "eval_samples_per_second": 75.758, |
| "eval_steps_per_second": 37.879, |
| "step": 47500 |
| }, |
| { |
| "epoch": 2.2426078933638323, |
| "grad_norm": 0.94140625, |
| "learning_rate": 2.524799949692654e-06, |
| "loss": 0.6346, |
| "step": 47550 |
| }, |
| { |
| "epoch": 2.2449660571795773, |
| "grad_norm": 0.9609375, |
| "learning_rate": 2.5169394268106716e-06, |
| "loss": 0.64, |
| "step": 47600 |
| }, |
| { |
| "epoch": 2.247324220995322, |
| "grad_norm": 1.2109375, |
| "learning_rate": 2.5090789039286895e-06, |
| "loss": 0.6336, |
| "step": 47650 |
| }, |
| { |
| "epoch": 2.249682384811067, |
| "grad_norm": 1.1171875, |
| "learning_rate": 2.501218381046707e-06, |
| "loss": 0.6598, |
| "step": 47700 |
| }, |
| { |
| "epoch": 2.2520405486268116, |
| "grad_norm": 0.94921875, |
| "learning_rate": 2.4933578581647254e-06, |
| "loss": 0.6315, |
| "step": 47750 |
| }, |
| { |
| "epoch": 2.2543987124425566, |
| "grad_norm": 1.0546875, |
| "learning_rate": 2.4854973352827433e-06, |
| "loss": 0.6378, |
| "step": 47800 |
| }, |
| { |
| "epoch": 2.2567568762583017, |
| "grad_norm": 0.890625, |
| "learning_rate": 2.477636812400761e-06, |
| "loss": 0.6317, |
| "step": 47850 |
| }, |
| { |
| "epoch": 2.2591150400740463, |
| "grad_norm": 1.1015625, |
| "learning_rate": 2.4697762895187792e-06, |
| "loss": 0.6468, |
| "step": 47900 |
| }, |
| { |
| "epoch": 2.2614732038897913, |
| "grad_norm": 0.9296875, |
| "learning_rate": 2.4619157666367967e-06, |
| "loss": 0.6323, |
| "step": 47950 |
| }, |
| { |
| "epoch": 2.263831367705536, |
| "grad_norm": 1.0, |
| "learning_rate": 2.4540552437548147e-06, |
| "loss": 0.6431, |
| "step": 48000 |
| }, |
| { |
| "epoch": 2.263831367705536, |
| "eval_loss": 0.6410719156265259, |
| "eval_runtime": 471.4352, |
| "eval_samples_per_second": 75.747, |
| "eval_steps_per_second": 37.874, |
| "step": 48000 |
| }, |
| { |
| "epoch": 2.266189531521281, |
| "grad_norm": 1.09375, |
| "learning_rate": 2.4461947208728326e-06, |
| "loss": 0.6269, |
| "step": 48050 |
| }, |
| { |
| "epoch": 2.268547695337026, |
| "grad_norm": 1.03125, |
| "learning_rate": 2.4383341979908505e-06, |
| "loss": 0.6366, |
| "step": 48100 |
| }, |
| { |
| "epoch": 2.2709058591527707, |
| "grad_norm": 1.078125, |
| "learning_rate": 2.4304736751088685e-06, |
| "loss": 0.6364, |
| "step": 48150 |
| }, |
| { |
| "epoch": 2.2732640229685157, |
| "grad_norm": 0.96875, |
| "learning_rate": 2.4226131522268864e-06, |
| "loss": 0.6399, |
| "step": 48200 |
| }, |
| { |
| "epoch": 2.2756221867842603, |
| "grad_norm": 0.9453125, |
| "learning_rate": 2.4147526293449044e-06, |
| "loss": 0.6375, |
| "step": 48250 |
| }, |
| { |
| "epoch": 2.2779803506000054, |
| "grad_norm": 1.0390625, |
| "learning_rate": 2.4068921064629223e-06, |
| "loss": 0.6249, |
| "step": 48300 |
| }, |
| { |
| "epoch": 2.2803385144157504, |
| "grad_norm": 1.0390625, |
| "learning_rate": 2.39903158358094e-06, |
| "loss": 0.6211, |
| "step": 48350 |
| }, |
| { |
| "epoch": 2.282696678231495, |
| "grad_norm": 0.9609375, |
| "learning_rate": 2.391171060698958e-06, |
| "loss": 0.6343, |
| "step": 48400 |
| }, |
| { |
| "epoch": 2.28505484204724, |
| "grad_norm": 1.1796875, |
| "learning_rate": 2.3833105378169757e-06, |
| "loss": 0.6483, |
| "step": 48450 |
| }, |
| { |
| "epoch": 2.2874130058629847, |
| "grad_norm": 1.0546875, |
| "learning_rate": 2.3754500149349936e-06, |
| "loss": 0.6274, |
| "step": 48500 |
| }, |
| { |
| "epoch": 2.2874130058629847, |
| "eval_loss": 0.6411145329475403, |
| "eval_runtime": 472.0639, |
| "eval_samples_per_second": 75.647, |
| "eval_steps_per_second": 37.823, |
| "step": 48500 |
| }, |
| { |
| "epoch": 2.2897711696787297, |
| "grad_norm": 0.98046875, |
| "learning_rate": 2.3675894920530116e-06, |
| "loss": 0.6402, |
| "step": 48550 |
| }, |
| { |
| "epoch": 2.2921293334944743, |
| "grad_norm": 1.015625, |
| "learning_rate": 2.3597289691710295e-06, |
| "loss": 0.6439, |
| "step": 48600 |
| }, |
| { |
| "epoch": 2.2944874973102194, |
| "grad_norm": 1.0859375, |
| "learning_rate": 2.3518684462890474e-06, |
| "loss": 0.6451, |
| "step": 48650 |
| }, |
| { |
| "epoch": 2.2968456611259644, |
| "grad_norm": 0.92578125, |
| "learning_rate": 2.3440079234070654e-06, |
| "loss": 0.654, |
| "step": 48700 |
| }, |
| { |
| "epoch": 2.299203824941709, |
| "grad_norm": 1.1015625, |
| "learning_rate": 2.3361474005250833e-06, |
| "loss": 0.6352, |
| "step": 48750 |
| }, |
| { |
| "epoch": 2.301561988757454, |
| "grad_norm": 1.0625, |
| "learning_rate": 2.3282868776431012e-06, |
| "loss": 0.6397, |
| "step": 48800 |
| }, |
| { |
| "epoch": 2.3039201525731987, |
| "grad_norm": 1.1015625, |
| "learning_rate": 2.3204263547611188e-06, |
| "loss": 0.6356, |
| "step": 48850 |
| }, |
| { |
| "epoch": 2.3062783163889438, |
| "grad_norm": 1.0234375, |
| "learning_rate": 2.312565831879137e-06, |
| "loss": 0.6359, |
| "step": 48900 |
| }, |
| { |
| "epoch": 2.308636480204689, |
| "grad_norm": 1.0859375, |
| "learning_rate": 2.3047053089971546e-06, |
| "loss": 0.651, |
| "step": 48950 |
| }, |
| { |
| "epoch": 2.3109946440204334, |
| "grad_norm": 1.078125, |
| "learning_rate": 2.2968447861151726e-06, |
| "loss": 0.6382, |
| "step": 49000 |
| }, |
| { |
| "epoch": 2.3109946440204334, |
| "eval_loss": 0.6411119103431702, |
| "eval_runtime": 475.9105, |
| "eval_samples_per_second": 75.035, |
| "eval_steps_per_second": 37.518, |
| "step": 49000 |
| }, |
| { |
| "epoch": 2.3133528078361785, |
| "grad_norm": 0.984375, |
| "learning_rate": 2.2889842632331905e-06, |
| "loss": 0.6447, |
| "step": 49050 |
| }, |
| { |
| "epoch": 2.315710971651923, |
| "grad_norm": 0.99609375, |
| "learning_rate": 2.2811237403512084e-06, |
| "loss": 0.6475, |
| "step": 49100 |
| }, |
| { |
| "epoch": 2.318069135467668, |
| "grad_norm": 1.2265625, |
| "learning_rate": 2.2732632174692264e-06, |
| "loss": 0.641, |
| "step": 49150 |
| }, |
| { |
| "epoch": 2.3204272992834127, |
| "grad_norm": 1.1171875, |
| "learning_rate": 2.2654026945872443e-06, |
| "loss": 0.6286, |
| "step": 49200 |
| }, |
| { |
| "epoch": 2.322785463099158, |
| "grad_norm": 1.109375, |
| "learning_rate": 2.257542171705262e-06, |
| "loss": 0.6393, |
| "step": 49250 |
| }, |
| { |
| "epoch": 2.325143626914903, |
| "grad_norm": 0.9609375, |
| "learning_rate": 2.2496816488232798e-06, |
| "loss": 0.6358, |
| "step": 49300 |
| }, |
| { |
| "epoch": 2.3275017907306474, |
| "grad_norm": 1.2265625, |
| "learning_rate": 2.2418211259412977e-06, |
| "loss": 0.6508, |
| "step": 49350 |
| }, |
| { |
| "epoch": 2.3298599545463925, |
| "grad_norm": 1.0703125, |
| "learning_rate": 2.2339606030593156e-06, |
| "loss": 0.6309, |
| "step": 49400 |
| }, |
| { |
| "epoch": 2.332218118362137, |
| "grad_norm": 0.96484375, |
| "learning_rate": 2.2261000801773336e-06, |
| "loss": 0.6426, |
| "step": 49450 |
| }, |
| { |
| "epoch": 2.334576282177882, |
| "grad_norm": 1.0546875, |
| "learning_rate": 2.2182395572953515e-06, |
| "loss": 0.6387, |
| "step": 49500 |
| }, |
| { |
| "epoch": 2.334576282177882, |
| "eval_loss": 0.6411243081092834, |
| "eval_runtime": 471.3649, |
| "eval_samples_per_second": 75.759, |
| "eval_steps_per_second": 37.879, |
| "step": 49500 |
| }, |
| { |
| "epoch": 2.336934445993627, |
| "grad_norm": 1.3359375, |
| "learning_rate": 2.2103790344133695e-06, |
| "loss": 0.6445, |
| "step": 49550 |
| }, |
| { |
| "epoch": 2.339292609809372, |
| "grad_norm": 1.140625, |
| "learning_rate": 2.202518511531387e-06, |
| "loss": 0.6366, |
| "step": 49600 |
| }, |
| { |
| "epoch": 2.341650773625117, |
| "grad_norm": 1.0859375, |
| "learning_rate": 2.1946579886494053e-06, |
| "loss": 0.6341, |
| "step": 49650 |
| }, |
| { |
| "epoch": 2.3440089374408615, |
| "grad_norm": 1.0390625, |
| "learning_rate": 2.186797465767423e-06, |
| "loss": 0.6306, |
| "step": 49700 |
| }, |
| { |
| "epoch": 2.3463671012566065, |
| "grad_norm": 1.0703125, |
| "learning_rate": 2.1789369428854408e-06, |
| "loss": 0.646, |
| "step": 49750 |
| }, |
| { |
| "epoch": 2.3487252650723516, |
| "grad_norm": 1.015625, |
| "learning_rate": 2.1710764200034587e-06, |
| "loss": 0.6357, |
| "step": 49800 |
| }, |
| { |
| "epoch": 2.351083428888096, |
| "grad_norm": 1.0390625, |
| "learning_rate": 2.1632158971214767e-06, |
| "loss": 0.6339, |
| "step": 49850 |
| }, |
| { |
| "epoch": 2.3534415927038412, |
| "grad_norm": 1.2578125, |
| "learning_rate": 2.1553553742394946e-06, |
| "loss": 0.6479, |
| "step": 49900 |
| }, |
| { |
| "epoch": 2.355799756519586, |
| "grad_norm": 1.0546875, |
| "learning_rate": 2.1474948513575125e-06, |
| "loss": 0.6441, |
| "step": 49950 |
| }, |
| { |
| "epoch": 2.358157920335331, |
| "grad_norm": 1.0390625, |
| "learning_rate": 2.1396343284755305e-06, |
| "loss": 0.6278, |
| "step": 50000 |
| }, |
| { |
| "epoch": 2.358157920335331, |
| "eval_loss": 0.6410099267959595, |
| "eval_runtime": 474.9955, |
| "eval_samples_per_second": 75.18, |
| "eval_steps_per_second": 37.59, |
| "step": 50000 |
| }, |
| { |
| "epoch": 2.360516084151076, |
| "grad_norm": 1.0546875, |
| "learning_rate": 2.1317738055935484e-06, |
| "loss": 0.6313, |
| "step": 50050 |
| }, |
| { |
| "epoch": 2.3628742479668206, |
| "grad_norm": 1.1328125, |
| "learning_rate": 2.123913282711566e-06, |
| "loss": 0.6445, |
| "step": 50100 |
| }, |
| { |
| "epoch": 2.3652324117825656, |
| "grad_norm": 1.0703125, |
| "learning_rate": 2.1160527598295843e-06, |
| "loss": 0.6401, |
| "step": 50150 |
| }, |
| { |
| "epoch": 2.36759057559831, |
| "grad_norm": 1.09375, |
| "learning_rate": 2.108192236947602e-06, |
| "loss": 0.6343, |
| "step": 50200 |
| }, |
| { |
| "epoch": 2.3699487394140553, |
| "grad_norm": 1.015625, |
| "learning_rate": 2.1003317140656197e-06, |
| "loss": 0.6334, |
| "step": 50250 |
| }, |
| { |
| "epoch": 2.3723069032298003, |
| "grad_norm": 1.0859375, |
| "learning_rate": 2.0924711911836377e-06, |
| "loss": 0.6402, |
| "step": 50300 |
| }, |
| { |
| "epoch": 2.374665067045545, |
| "grad_norm": 0.98046875, |
| "learning_rate": 2.0846106683016556e-06, |
| "loss": 0.6575, |
| "step": 50350 |
| }, |
| { |
| "epoch": 2.37702323086129, |
| "grad_norm": 1.0625, |
| "learning_rate": 2.0767501454196735e-06, |
| "loss": 0.6326, |
| "step": 50400 |
| }, |
| { |
| "epoch": 2.3793813946770346, |
| "grad_norm": 0.94921875, |
| "learning_rate": 2.0688896225376915e-06, |
| "loss": 0.6403, |
| "step": 50450 |
| }, |
| { |
| "epoch": 2.3817395584927796, |
| "grad_norm": 1.0703125, |
| "learning_rate": 2.061029099655709e-06, |
| "loss": 0.6279, |
| "step": 50500 |
| }, |
| { |
| "epoch": 2.3817395584927796, |
| "eval_loss": 0.6410384774208069, |
| "eval_runtime": 474.5984, |
| "eval_samples_per_second": 75.243, |
| "eval_steps_per_second": 37.621, |
| "step": 50500 |
| }, |
| { |
| "epoch": 2.3840977223085247, |
| "grad_norm": 1.0234375, |
| "learning_rate": 2.0531685767737274e-06, |
| "loss": 0.6402, |
| "step": 50550 |
| }, |
| { |
| "epoch": 2.3864558861242693, |
| "grad_norm": 0.94140625, |
| "learning_rate": 2.045308053891745e-06, |
| "loss": 0.6405, |
| "step": 50600 |
| }, |
| { |
| "epoch": 2.3888140499400143, |
| "grad_norm": 0.98046875, |
| "learning_rate": 2.037447531009763e-06, |
| "loss": 0.6292, |
| "step": 50650 |
| }, |
| { |
| "epoch": 2.391172213755759, |
| "grad_norm": 1.0078125, |
| "learning_rate": 2.0295870081277807e-06, |
| "loss": 0.6528, |
| "step": 50700 |
| }, |
| { |
| "epoch": 2.393530377571504, |
| "grad_norm": 1.140625, |
| "learning_rate": 2.0217264852457987e-06, |
| "loss": 0.6471, |
| "step": 50750 |
| }, |
| { |
| "epoch": 2.395888541387249, |
| "grad_norm": 1.0625, |
| "learning_rate": 2.0138659623638166e-06, |
| "loss": 0.6442, |
| "step": 50800 |
| }, |
| { |
| "epoch": 2.3982467052029937, |
| "grad_norm": 1.0625, |
| "learning_rate": 2.0060054394818346e-06, |
| "loss": 0.6385, |
| "step": 50850 |
| }, |
| { |
| "epoch": 2.4006048690187387, |
| "grad_norm": 1.09375, |
| "learning_rate": 1.9981449165998525e-06, |
| "loss": 0.6197, |
| "step": 50900 |
| }, |
| { |
| "epoch": 2.4029630328344833, |
| "grad_norm": 0.9609375, |
| "learning_rate": 1.9902843937178704e-06, |
| "loss": 0.6268, |
| "step": 50950 |
| }, |
| { |
| "epoch": 2.4053211966502284, |
| "grad_norm": 1.0390625, |
| "learning_rate": 1.982423870835888e-06, |
| "loss": 0.6453, |
| "step": 51000 |
| }, |
| { |
| "epoch": 2.4053211966502284, |
| "eval_loss": 0.6410667896270752, |
| "eval_runtime": 478.4093, |
| "eval_samples_per_second": 74.643, |
| "eval_steps_per_second": 37.322, |
| "step": 51000 |
| }, |
| { |
| "epoch": 2.4076793604659734, |
| "grad_norm": 1.1484375, |
| "learning_rate": 1.9745633479539063e-06, |
| "loss": 0.6329, |
| "step": 51050 |
| }, |
| { |
| "epoch": 2.410037524281718, |
| "grad_norm": 0.9921875, |
| "learning_rate": 1.966702825071924e-06, |
| "loss": 0.6452, |
| "step": 51100 |
| }, |
| { |
| "epoch": 2.412395688097463, |
| "grad_norm": 1.109375, |
| "learning_rate": 1.9588423021899418e-06, |
| "loss": 0.6491, |
| "step": 51150 |
| }, |
| { |
| "epoch": 2.4147538519132077, |
| "grad_norm": 1.109375, |
| "learning_rate": 1.9509817793079597e-06, |
| "loss": 0.6388, |
| "step": 51200 |
| }, |
| { |
| "epoch": 2.4171120157289527, |
| "grad_norm": 1.0859375, |
| "learning_rate": 1.9431212564259776e-06, |
| "loss": 0.638, |
| "step": 51250 |
| }, |
| { |
| "epoch": 2.4194701795446973, |
| "grad_norm": 1.1640625, |
| "learning_rate": 1.9352607335439956e-06, |
| "loss": 0.6462, |
| "step": 51300 |
| }, |
| { |
| "epoch": 2.4218283433604424, |
| "grad_norm": 1.046875, |
| "learning_rate": 1.9274002106620135e-06, |
| "loss": 0.6339, |
| "step": 51350 |
| }, |
| { |
| "epoch": 2.4241865071761874, |
| "grad_norm": 1.015625, |
| "learning_rate": 1.9195396877800314e-06, |
| "loss": 0.6367, |
| "step": 51400 |
| }, |
| { |
| "epoch": 2.426544670991932, |
| "grad_norm": 1.1875, |
| "learning_rate": 1.9116791648980494e-06, |
| "loss": 0.6362, |
| "step": 51450 |
| }, |
| { |
| "epoch": 2.428902834807677, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.9038186420160671e-06, |
| "loss": 0.6493, |
| "step": 51500 |
| }, |
| { |
| "epoch": 2.428902834807677, |
| "eval_loss": 0.6410060524940491, |
| "eval_runtime": 471.4392, |
| "eval_samples_per_second": 75.747, |
| "eval_steps_per_second": 37.873, |
| "step": 51500 |
| }, |
| { |
| "epoch": 2.4312609986234217, |
| "grad_norm": 1.03125, |
| "learning_rate": 1.895958119134085e-06, |
| "loss": 0.6373, |
| "step": 51550 |
| }, |
| { |
| "epoch": 2.4336191624391668, |
| "grad_norm": 1.0703125, |
| "learning_rate": 1.8880975962521028e-06, |
| "loss": 0.6315, |
| "step": 51600 |
| }, |
| { |
| "epoch": 2.4359773262549114, |
| "grad_norm": 0.93359375, |
| "learning_rate": 1.880237073370121e-06, |
| "loss": 0.6253, |
| "step": 51650 |
| }, |
| { |
| "epoch": 2.4383354900706564, |
| "grad_norm": 1.0390625, |
| "learning_rate": 1.8723765504881386e-06, |
| "loss": 0.6361, |
| "step": 51700 |
| }, |
| { |
| "epoch": 2.4406936538864015, |
| "grad_norm": 1.2265625, |
| "learning_rate": 1.8645160276061566e-06, |
| "loss": 0.6379, |
| "step": 51750 |
| }, |
| { |
| "epoch": 2.443051817702146, |
| "grad_norm": 0.9765625, |
| "learning_rate": 1.8566555047241743e-06, |
| "loss": 0.6426, |
| "step": 51800 |
| }, |
| { |
| "epoch": 2.445409981517891, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.8487949818421925e-06, |
| "loss": 0.6332, |
| "step": 51850 |
| }, |
| { |
| "epoch": 2.4477681453336357, |
| "grad_norm": 1.015625, |
| "learning_rate": 1.8409344589602102e-06, |
| "loss": 0.6342, |
| "step": 51900 |
| }, |
| { |
| "epoch": 2.450126309149381, |
| "grad_norm": 1.203125, |
| "learning_rate": 1.8330739360782281e-06, |
| "loss": 0.6358, |
| "step": 51950 |
| }, |
| { |
| "epoch": 2.452484472965126, |
| "grad_norm": 1.09375, |
| "learning_rate": 1.8252134131962459e-06, |
| "loss": 0.6477, |
| "step": 52000 |
| }, |
| { |
| "epoch": 2.452484472965126, |
| "eval_loss": 0.6410579681396484, |
| "eval_runtime": 474.8277, |
| "eval_samples_per_second": 75.206, |
| "eval_steps_per_second": 37.603, |
| "step": 52000 |
| }, |
| { |
| "epoch": 2.4548426367808704, |
| "grad_norm": 1.2421875, |
| "learning_rate": 1.817352890314264e-06, |
| "loss": 0.6435, |
| "step": 52050 |
| }, |
| { |
| "epoch": 2.4572008005966155, |
| "grad_norm": 1.015625, |
| "learning_rate": 1.8094923674322817e-06, |
| "loss": 0.6395, |
| "step": 52100 |
| }, |
| { |
| "epoch": 2.45955896441236, |
| "grad_norm": 0.94140625, |
| "learning_rate": 1.8016318445502999e-06, |
| "loss": 0.6372, |
| "step": 52150 |
| }, |
| { |
| "epoch": 2.461917128228105, |
| "grad_norm": 1.0, |
| "learning_rate": 1.7937713216683176e-06, |
| "loss": 0.6291, |
| "step": 52200 |
| }, |
| { |
| "epoch": 2.46427529204385, |
| "grad_norm": 1.078125, |
| "learning_rate": 1.7859107987863353e-06, |
| "loss": 0.6459, |
| "step": 52250 |
| }, |
| { |
| "epoch": 2.466633455859595, |
| "grad_norm": 1.03125, |
| "learning_rate": 1.7780502759043533e-06, |
| "loss": 0.6448, |
| "step": 52300 |
| }, |
| { |
| "epoch": 2.46899161967534, |
| "grad_norm": 0.95703125, |
| "learning_rate": 1.770189753022371e-06, |
| "loss": 0.6313, |
| "step": 52350 |
| }, |
| { |
| "epoch": 2.4713497834910845, |
| "grad_norm": 1.0546875, |
| "learning_rate": 1.7623292301403891e-06, |
| "loss": 0.646, |
| "step": 52400 |
| }, |
| { |
| "epoch": 2.4737079473068295, |
| "grad_norm": 0.96875, |
| "learning_rate": 1.7544687072584069e-06, |
| "loss": 0.6389, |
| "step": 52450 |
| }, |
| { |
| "epoch": 2.4760661111225746, |
| "grad_norm": 0.9453125, |
| "learning_rate": 1.7466081843764248e-06, |
| "loss": 0.6378, |
| "step": 52500 |
| }, |
| { |
| "epoch": 2.4760661111225746, |
| "eval_loss": 0.6410920023918152, |
| "eval_runtime": 469.8066, |
| "eval_samples_per_second": 76.01, |
| "eval_steps_per_second": 38.005, |
| "step": 52500 |
| }, |
| { |
| "epoch": 2.478424274938319, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.7387476614944425e-06, |
| "loss": 0.6373, |
| "step": 52550 |
| }, |
| { |
| "epoch": 2.4807824387540642, |
| "grad_norm": 0.97265625, |
| "learning_rate": 1.7308871386124607e-06, |
| "loss": 0.6392, |
| "step": 52600 |
| }, |
| { |
| "epoch": 2.483140602569809, |
| "grad_norm": 1.0859375, |
| "learning_rate": 1.7230266157304784e-06, |
| "loss": 0.6453, |
| "step": 52650 |
| }, |
| { |
| "epoch": 2.485498766385554, |
| "grad_norm": 1.09375, |
| "learning_rate": 1.7151660928484963e-06, |
| "loss": 0.6282, |
| "step": 52700 |
| }, |
| { |
| "epoch": 2.487856930201299, |
| "grad_norm": 0.953125, |
| "learning_rate": 1.7073055699665143e-06, |
| "loss": 0.6431, |
| "step": 52750 |
| }, |
| { |
| "epoch": 2.4902150940170436, |
| "grad_norm": 1.0703125, |
| "learning_rate": 1.6994450470845322e-06, |
| "loss": 0.6384, |
| "step": 52800 |
| }, |
| { |
| "epoch": 2.4925732578327886, |
| "grad_norm": 1.109375, |
| "learning_rate": 1.69158452420255e-06, |
| "loss": 0.6305, |
| "step": 52850 |
| }, |
| { |
| "epoch": 2.494931421648533, |
| "grad_norm": 1.0, |
| "learning_rate": 1.683724001320568e-06, |
| "loss": 0.6403, |
| "step": 52900 |
| }, |
| { |
| "epoch": 2.4972895854642783, |
| "grad_norm": 1.1171875, |
| "learning_rate": 1.6758634784385858e-06, |
| "loss": 0.6364, |
| "step": 52950 |
| }, |
| { |
| "epoch": 2.4996477492800233, |
| "grad_norm": 1.078125, |
| "learning_rate": 1.6680029555566038e-06, |
| "loss": 0.6319, |
| "step": 53000 |
| }, |
| { |
| "epoch": 2.4996477492800233, |
| "eval_loss": 0.6410515904426575, |
| "eval_runtime": 470.4813, |
| "eval_samples_per_second": 75.901, |
| "eval_steps_per_second": 37.951, |
| "step": 53000 |
| }, |
| { |
| "epoch": 2.502005913095768, |
| "grad_norm": 1.171875, |
| "learning_rate": 1.6601424326746215e-06, |
| "loss": 0.6371, |
| "step": 53050 |
| }, |
| { |
| "epoch": 2.504364076911513, |
| "grad_norm": 0.9609375, |
| "learning_rate": 1.6522819097926396e-06, |
| "loss": 0.6368, |
| "step": 53100 |
| }, |
| { |
| "epoch": 2.5067222407272576, |
| "grad_norm": 1.1171875, |
| "learning_rate": 1.6444213869106574e-06, |
| "loss": 0.6327, |
| "step": 53150 |
| }, |
| { |
| "epoch": 2.5090804045430026, |
| "grad_norm": 0.95703125, |
| "learning_rate": 1.6365608640286753e-06, |
| "loss": 0.6395, |
| "step": 53200 |
| }, |
| { |
| "epoch": 2.5114385683587477, |
| "grad_norm": 1.1171875, |
| "learning_rate": 1.628700341146693e-06, |
| "loss": 0.6396, |
| "step": 53250 |
| }, |
| { |
| "epoch": 2.5137967321744923, |
| "grad_norm": 0.9296875, |
| "learning_rate": 1.6208398182647112e-06, |
| "loss": 0.6325, |
| "step": 53300 |
| }, |
| { |
| "epoch": 2.5161548959902373, |
| "grad_norm": 1.1328125, |
| "learning_rate": 1.6129792953827289e-06, |
| "loss": 0.632, |
| "step": 53350 |
| }, |
| { |
| "epoch": 2.518513059805982, |
| "grad_norm": 0.95703125, |
| "learning_rate": 1.605118772500747e-06, |
| "loss": 0.633, |
| "step": 53400 |
| }, |
| { |
| "epoch": 2.520871223621727, |
| "grad_norm": 0.9765625, |
| "learning_rate": 1.5972582496187648e-06, |
| "loss": 0.6432, |
| "step": 53450 |
| }, |
| { |
| "epoch": 2.523229387437472, |
| "grad_norm": 1.0390625, |
| "learning_rate": 1.5893977267367827e-06, |
| "loss": 0.6322, |
| "step": 53500 |
| }, |
| { |
| "epoch": 2.523229387437472, |
| "eval_loss": 0.641071081161499, |
| "eval_runtime": 470.808, |
| "eval_samples_per_second": 75.848, |
| "eval_steps_per_second": 37.924, |
| "step": 53500 |
| }, |
| { |
| "epoch": 2.5255875512532167, |
| "grad_norm": 0.98046875, |
| "learning_rate": 1.5815372038548004e-06, |
| "loss": 0.6297, |
| "step": 53550 |
| }, |
| { |
| "epoch": 2.5279457150689617, |
| "grad_norm": 0.97265625, |
| "learning_rate": 1.5736766809728186e-06, |
| "loss": 0.6453, |
| "step": 53600 |
| }, |
| { |
| "epoch": 2.5303038788847063, |
| "grad_norm": 1.015625, |
| "learning_rate": 1.5658161580908363e-06, |
| "loss": 0.6414, |
| "step": 53650 |
| }, |
| { |
| "epoch": 2.5326620427004514, |
| "grad_norm": 1.09375, |
| "learning_rate": 1.5579556352088542e-06, |
| "loss": 0.634, |
| "step": 53700 |
| }, |
| { |
| "epoch": 2.5350202065161964, |
| "grad_norm": 1.2265625, |
| "learning_rate": 1.550095112326872e-06, |
| "loss": 0.6368, |
| "step": 53750 |
| }, |
| { |
| "epoch": 2.537378370331941, |
| "grad_norm": 0.97265625, |
| "learning_rate": 1.5422345894448901e-06, |
| "loss": 0.6415, |
| "step": 53800 |
| }, |
| { |
| "epoch": 2.5397365341476856, |
| "grad_norm": 1.140625, |
| "learning_rate": 1.5343740665629078e-06, |
| "loss": 0.6283, |
| "step": 53850 |
| }, |
| { |
| "epoch": 2.5420946979634307, |
| "grad_norm": 1.1484375, |
| "learning_rate": 1.5265135436809258e-06, |
| "loss": 0.6401, |
| "step": 53900 |
| }, |
| { |
| "epoch": 2.5444528617791757, |
| "grad_norm": 0.95703125, |
| "learning_rate": 1.5186530207989437e-06, |
| "loss": 0.6371, |
| "step": 53950 |
| }, |
| { |
| "epoch": 2.546811025594921, |
| "grad_norm": 0.98046875, |
| "learning_rate": 1.5107924979169617e-06, |
| "loss": 0.6381, |
| "step": 54000 |
| }, |
| { |
| "epoch": 2.546811025594921, |
| "eval_loss": 0.6411101222038269, |
| "eval_runtime": 469.3138, |
| "eval_samples_per_second": 76.09, |
| "eval_steps_per_second": 38.045, |
| "step": 54000 |
| }, |
| { |
| "epoch": 2.5491691894106654, |
| "grad_norm": 1.1171875, |
| "learning_rate": 1.5029319750349794e-06, |
| "loss": 0.6299, |
| "step": 54050 |
| }, |
| { |
| "epoch": 2.55152735322641, |
| "grad_norm": 1.0625, |
| "learning_rate": 1.4950714521529975e-06, |
| "loss": 0.6359, |
| "step": 54100 |
| }, |
| { |
| "epoch": 2.553885517042155, |
| "grad_norm": 1.1484375, |
| "learning_rate": 1.4872109292710153e-06, |
| "loss": 0.6527, |
| "step": 54150 |
| }, |
| { |
| "epoch": 2.5562436808579, |
| "grad_norm": 1.0234375, |
| "learning_rate": 1.4793504063890332e-06, |
| "loss": 0.6355, |
| "step": 54200 |
| }, |
| { |
| "epoch": 2.5586018446736447, |
| "grad_norm": 1.140625, |
| "learning_rate": 1.471489883507051e-06, |
| "loss": 0.6521, |
| "step": 54250 |
| }, |
| { |
| "epoch": 2.5609600084893898, |
| "grad_norm": 1.015625, |
| "learning_rate": 1.463629360625069e-06, |
| "loss": 0.6477, |
| "step": 54300 |
| }, |
| { |
| "epoch": 2.5633181723051344, |
| "grad_norm": 0.99609375, |
| "learning_rate": 1.4557688377430868e-06, |
| "loss": 0.6466, |
| "step": 54350 |
| }, |
| { |
| "epoch": 2.5656763361208794, |
| "grad_norm": 1.421875, |
| "learning_rate": 1.4479083148611047e-06, |
| "loss": 0.6266, |
| "step": 54400 |
| }, |
| { |
| "epoch": 2.5680344999366245, |
| "grad_norm": 1.125, |
| "learning_rate": 1.4400477919791225e-06, |
| "loss": 0.6427, |
| "step": 54450 |
| }, |
| { |
| "epoch": 2.570392663752369, |
| "grad_norm": 1.0703125, |
| "learning_rate": 1.4321872690971406e-06, |
| "loss": 0.6375, |
| "step": 54500 |
| }, |
| { |
| "epoch": 2.570392663752369, |
| "eval_loss": 0.6410405039787292, |
| "eval_runtime": 470.8842, |
| "eval_samples_per_second": 75.836, |
| "eval_steps_per_second": 37.918, |
| "step": 54500 |
| }, |
| { |
| "epoch": 2.572750827568114, |
| "grad_norm": 1.0390625, |
| "learning_rate": 1.4243267462151583e-06, |
| "loss": 0.6553, |
| "step": 54550 |
| }, |
| { |
| "epoch": 2.5751089913838587, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.4164662233331763e-06, |
| "loss": 0.6436, |
| "step": 54600 |
| }, |
| { |
| "epoch": 2.577467155199604, |
| "grad_norm": 1.0625, |
| "learning_rate": 1.4086057004511942e-06, |
| "loss": 0.6224, |
| "step": 54650 |
| }, |
| { |
| "epoch": 2.579825319015349, |
| "grad_norm": 1.03125, |
| "learning_rate": 1.4007451775692121e-06, |
| "loss": 0.6298, |
| "step": 54700 |
| }, |
| { |
| "epoch": 2.5821834828310934, |
| "grad_norm": 1.0703125, |
| "learning_rate": 1.3928846546872299e-06, |
| "loss": 0.6294, |
| "step": 54750 |
| }, |
| { |
| "epoch": 2.5845416466468385, |
| "grad_norm": 1.1953125, |
| "learning_rate": 1.385024131805248e-06, |
| "loss": 0.6264, |
| "step": 54800 |
| }, |
| { |
| "epoch": 2.586899810462583, |
| "grad_norm": 1.09375, |
| "learning_rate": 1.3771636089232657e-06, |
| "loss": 0.6413, |
| "step": 54850 |
| }, |
| { |
| "epoch": 2.589257974278328, |
| "grad_norm": 1.234375, |
| "learning_rate": 1.3693030860412837e-06, |
| "loss": 0.6407, |
| "step": 54900 |
| }, |
| { |
| "epoch": 2.591616138094073, |
| "grad_norm": 1.015625, |
| "learning_rate": 1.3614425631593014e-06, |
| "loss": 0.6537, |
| "step": 54950 |
| }, |
| { |
| "epoch": 2.593974301909818, |
| "grad_norm": 1.3359375, |
| "learning_rate": 1.3535820402773196e-06, |
| "loss": 0.6404, |
| "step": 55000 |
| }, |
| { |
| "epoch": 2.593974301909818, |
| "eval_loss": 0.6411945223808289, |
| "eval_runtime": 472.1323, |
| "eval_samples_per_second": 75.636, |
| "eval_steps_per_second": 37.818, |
| "step": 55000 |
| }, |
| { |
| "epoch": 2.596332465725563, |
| "grad_norm": 1.03125, |
| "learning_rate": 1.3457215173953373e-06, |
| "loss": 0.6433, |
| "step": 55050 |
| }, |
| { |
| "epoch": 2.5986906295413075, |
| "grad_norm": 1.09375, |
| "learning_rate": 1.3378609945133552e-06, |
| "loss": 0.6418, |
| "step": 55100 |
| }, |
| { |
| "epoch": 2.6010487933570525, |
| "grad_norm": 1.0703125, |
| "learning_rate": 1.330000471631373e-06, |
| "loss": 0.6216, |
| "step": 55150 |
| }, |
| { |
| "epoch": 2.6034069571727976, |
| "grad_norm": 0.98046875, |
| "learning_rate": 1.322139948749391e-06, |
| "loss": 0.6438, |
| "step": 55200 |
| }, |
| { |
| "epoch": 2.605765120988542, |
| "grad_norm": 1.125, |
| "learning_rate": 1.3142794258674088e-06, |
| "loss": 0.633, |
| "step": 55250 |
| }, |
| { |
| "epoch": 2.6081232848042872, |
| "grad_norm": 1.015625, |
| "learning_rate": 1.3064189029854265e-06, |
| "loss": 0.6477, |
| "step": 55300 |
| }, |
| { |
| "epoch": 2.610481448620032, |
| "grad_norm": 1.03125, |
| "learning_rate": 1.2985583801034447e-06, |
| "loss": 0.6428, |
| "step": 55350 |
| }, |
| { |
| "epoch": 2.612839612435777, |
| "grad_norm": 1.25, |
| "learning_rate": 1.2906978572214624e-06, |
| "loss": 0.6387, |
| "step": 55400 |
| }, |
| { |
| "epoch": 2.615197776251522, |
| "grad_norm": 1.0234375, |
| "learning_rate": 1.2828373343394804e-06, |
| "loss": 0.6317, |
| "step": 55450 |
| }, |
| { |
| "epoch": 2.6175559400672666, |
| "grad_norm": 0.94140625, |
| "learning_rate": 1.274976811457498e-06, |
| "loss": 0.6466, |
| "step": 55500 |
| }, |
| { |
| "epoch": 2.6175559400672666, |
| "eval_loss": 0.6410502195358276, |
| "eval_runtime": 471.6129, |
| "eval_samples_per_second": 75.719, |
| "eval_steps_per_second": 37.859, |
| "step": 55500 |
| }, |
| { |
| "epoch": 2.6199141038830116, |
| "grad_norm": 1.0390625, |
| "learning_rate": 1.2671162885755162e-06, |
| "loss": 0.6439, |
| "step": 55550 |
| }, |
| { |
| "epoch": 2.622272267698756, |
| "grad_norm": 1.1015625, |
| "learning_rate": 1.259255765693534e-06, |
| "loss": 0.6274, |
| "step": 55600 |
| }, |
| { |
| "epoch": 2.6246304315145013, |
| "grad_norm": 0.97265625, |
| "learning_rate": 1.251395242811552e-06, |
| "loss": 0.6479, |
| "step": 55650 |
| }, |
| { |
| "epoch": 2.6269885953302463, |
| "grad_norm": 0.984375, |
| "learning_rate": 1.2435347199295698e-06, |
| "loss": 0.6389, |
| "step": 55700 |
| }, |
| { |
| "epoch": 2.629346759145991, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.2356741970475878e-06, |
| "loss": 0.6367, |
| "step": 55750 |
| }, |
| { |
| "epoch": 2.631704922961736, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.2278136741656057e-06, |
| "loss": 0.6448, |
| "step": 55800 |
| }, |
| { |
| "epoch": 2.6340630867774806, |
| "grad_norm": 1.171875, |
| "learning_rate": 1.2199531512836234e-06, |
| "loss": 0.634, |
| "step": 55850 |
| }, |
| { |
| "epoch": 2.6364212505932256, |
| "grad_norm": 1.046875, |
| "learning_rate": 1.2120926284016414e-06, |
| "loss": 0.6318, |
| "step": 55900 |
| }, |
| { |
| "epoch": 2.6387794144089707, |
| "grad_norm": 1.1171875, |
| "learning_rate": 1.2042321055196593e-06, |
| "loss": 0.6379, |
| "step": 55950 |
| }, |
| { |
| "epoch": 2.6411375782247153, |
| "grad_norm": 1.078125, |
| "learning_rate": 1.1963715826376772e-06, |
| "loss": 0.6417, |
| "step": 56000 |
| }, |
| { |
| "epoch": 2.6411375782247153, |
| "eval_loss": 0.6411082744598389, |
| "eval_runtime": 471.2106, |
| "eval_samples_per_second": 75.784, |
| "eval_steps_per_second": 37.892, |
| "step": 56000 |
| }, |
| { |
| "epoch": 2.6434957420404603, |
| "grad_norm": 1.0234375, |
| "learning_rate": 1.1885110597556952e-06, |
| "loss": 0.6384, |
| "step": 56050 |
| }, |
| { |
| "epoch": 2.645853905856205, |
| "grad_norm": 1.0, |
| "learning_rate": 1.180650536873713e-06, |
| "loss": 0.6415, |
| "step": 56100 |
| }, |
| { |
| "epoch": 2.64821206967195, |
| "grad_norm": 1.0859375, |
| "learning_rate": 1.1727900139917308e-06, |
| "loss": 0.6443, |
| "step": 56150 |
| }, |
| { |
| "epoch": 2.650570233487695, |
| "grad_norm": 1.1015625, |
| "learning_rate": 1.1649294911097488e-06, |
| "loss": 0.6414, |
| "step": 56200 |
| }, |
| { |
| "epoch": 2.6529283973034397, |
| "grad_norm": 1.0390625, |
| "learning_rate": 1.1570689682277667e-06, |
| "loss": 0.6386, |
| "step": 56250 |
| }, |
| { |
| "epoch": 2.6552865611191847, |
| "grad_norm": 1.0390625, |
| "learning_rate": 1.1492084453457844e-06, |
| "loss": 0.6233, |
| "step": 56300 |
| }, |
| { |
| "epoch": 2.6576447249349293, |
| "grad_norm": 1.2265625, |
| "learning_rate": 1.1413479224638024e-06, |
| "loss": 0.6364, |
| "step": 56350 |
| }, |
| { |
| "epoch": 2.6600028887506744, |
| "grad_norm": 0.9921875, |
| "learning_rate": 1.1334873995818203e-06, |
| "loss": 0.636, |
| "step": 56400 |
| }, |
| { |
| "epoch": 2.6623610525664194, |
| "grad_norm": 1.0703125, |
| "learning_rate": 1.1256268766998383e-06, |
| "loss": 0.6402, |
| "step": 56450 |
| }, |
| { |
| "epoch": 2.664719216382164, |
| "grad_norm": 1.0546875, |
| "learning_rate": 1.1177663538178562e-06, |
| "loss": 0.6517, |
| "step": 56500 |
| }, |
| { |
| "epoch": 2.664719216382164, |
| "eval_loss": 0.6410189867019653, |
| "eval_runtime": 471.5412, |
| "eval_samples_per_second": 75.73, |
| "eval_steps_per_second": 37.865, |
| "step": 56500 |
| }, |
| { |
| "epoch": 2.6670773801979086, |
| "grad_norm": 0.93359375, |
| "learning_rate": 1.109905830935874e-06, |
| "loss": 0.6403, |
| "step": 56550 |
| }, |
| { |
| "epoch": 2.6694355440136537, |
| "grad_norm": 1.0703125, |
| "learning_rate": 1.1020453080538919e-06, |
| "loss": 0.6357, |
| "step": 56600 |
| }, |
| { |
| "epoch": 2.6717937078293987, |
| "grad_norm": 1.140625, |
| "learning_rate": 1.0941847851719098e-06, |
| "loss": 0.6385, |
| "step": 56650 |
| }, |
| { |
| "epoch": 2.674151871645144, |
| "grad_norm": 1.0390625, |
| "learning_rate": 1.0863242622899277e-06, |
| "loss": 0.621, |
| "step": 56700 |
| }, |
| { |
| "epoch": 2.6765100354608884, |
| "grad_norm": 1.0625, |
| "learning_rate": 1.0784637394079455e-06, |
| "loss": 0.6335, |
| "step": 56750 |
| }, |
| { |
| "epoch": 2.678868199276633, |
| "grad_norm": 1.0, |
| "learning_rate": 1.0706032165259634e-06, |
| "loss": 0.6392, |
| "step": 56800 |
| }, |
| { |
| "epoch": 2.681226363092378, |
| "grad_norm": 1.1484375, |
| "learning_rate": 1.0627426936439811e-06, |
| "loss": 0.6418, |
| "step": 56850 |
| }, |
| { |
| "epoch": 2.683584526908123, |
| "grad_norm": 1.1171875, |
| "learning_rate": 1.054882170761999e-06, |
| "loss": 0.6334, |
| "step": 56900 |
| }, |
| { |
| "epoch": 2.6859426907238677, |
| "grad_norm": 0.9296875, |
| "learning_rate": 1.047021647880017e-06, |
| "loss": 0.6373, |
| "step": 56950 |
| }, |
| { |
| "epoch": 2.6883008545396128, |
| "grad_norm": 1.1171875, |
| "learning_rate": 1.039161124998035e-06, |
| "loss": 0.6361, |
| "step": 57000 |
| }, |
| { |
| "epoch": 2.6883008545396128, |
| "eval_loss": 0.6411524415016174, |
| "eval_runtime": 472.1931, |
| "eval_samples_per_second": 75.626, |
| "eval_steps_per_second": 37.813, |
| "step": 57000 |
| }, |
| { |
| "epoch": 2.6906590183553574, |
| "grad_norm": 0.98828125, |
| "learning_rate": 1.0313006021160529e-06, |
| "loss": 0.6436, |
| "step": 57050 |
| }, |
| { |
| "epoch": 2.6930171821711024, |
| "grad_norm": 0.9765625, |
| "learning_rate": 1.0234400792340706e-06, |
| "loss": 0.6409, |
| "step": 57100 |
| }, |
| { |
| "epoch": 2.6953753459868475, |
| "grad_norm": 1.5703125, |
| "learning_rate": 1.0155795563520885e-06, |
| "loss": 0.632, |
| "step": 57150 |
| }, |
| { |
| "epoch": 2.697733509802592, |
| "grad_norm": 1.015625, |
| "learning_rate": 1.0077190334701065e-06, |
| "loss": 0.6359, |
| "step": 57200 |
| }, |
| { |
| "epoch": 2.700091673618337, |
| "grad_norm": 1.1640625, |
| "learning_rate": 9.998585105881244e-07, |
| "loss": 0.6363, |
| "step": 57250 |
| }, |
| { |
| "epoch": 2.7024498374340817, |
| "grad_norm": 1.140625, |
| "learning_rate": 9.919979877061423e-07, |
| "loss": 0.6353, |
| "step": 57300 |
| }, |
| { |
| "epoch": 2.704808001249827, |
| "grad_norm": 1.1875, |
| "learning_rate": 9.8413746482416e-07, |
| "loss": 0.6131, |
| "step": 57350 |
| }, |
| { |
| "epoch": 2.707166165065572, |
| "grad_norm": 1.0546875, |
| "learning_rate": 9.76276941942178e-07, |
| "loss": 0.6279, |
| "step": 57400 |
| }, |
| { |
| "epoch": 2.7095243288813164, |
| "grad_norm": 1.125, |
| "learning_rate": 9.68416419060196e-07, |
| "loss": 0.6299, |
| "step": 57450 |
| }, |
| { |
| "epoch": 2.7118824926970615, |
| "grad_norm": 1.1640625, |
| "learning_rate": 9.605558961782139e-07, |
| "loss": 0.6513, |
| "step": 57500 |
| }, |
| { |
| "epoch": 2.7118824926970615, |
| "eval_loss": 0.6411252021789551, |
| "eval_runtime": 470.5871, |
| "eval_samples_per_second": 75.884, |
| "eval_steps_per_second": 37.942, |
| "step": 57500 |
| }, |
| { |
| "epoch": 2.714240656512806, |
| "grad_norm": 1.0546875, |
| "learning_rate": 9.526953732962317e-07, |
| "loss": 0.6413, |
| "step": 57550 |
| }, |
| { |
| "epoch": 2.716598820328551, |
| "grad_norm": 1.0625, |
| "learning_rate": 9.448348504142497e-07, |
| "loss": 0.6312, |
| "step": 57600 |
| }, |
| { |
| "epoch": 2.718956984144296, |
| "grad_norm": 1.0234375, |
| "learning_rate": 9.369743275322675e-07, |
| "loss": 0.6465, |
| "step": 57650 |
| }, |
| { |
| "epoch": 2.721315147960041, |
| "grad_norm": 1.21875, |
| "learning_rate": 9.291138046502854e-07, |
| "loss": 0.6352, |
| "step": 57700 |
| }, |
| { |
| "epoch": 2.723673311775786, |
| "grad_norm": 0.9765625, |
| "learning_rate": 9.212532817683033e-07, |
| "loss": 0.6452, |
| "step": 57750 |
| }, |
| { |
| "epoch": 2.7260314755915305, |
| "grad_norm": 1.0078125, |
| "learning_rate": 9.133927588863212e-07, |
| "loss": 0.6383, |
| "step": 57800 |
| }, |
| { |
| "epoch": 2.7283896394072755, |
| "grad_norm": 1.015625, |
| "learning_rate": 9.05532236004339e-07, |
| "loss": 0.6449, |
| "step": 57850 |
| }, |
| { |
| "epoch": 2.7307478032230206, |
| "grad_norm": 1.0078125, |
| "learning_rate": 8.97671713122357e-07, |
| "loss": 0.6313, |
| "step": 57900 |
| }, |
| { |
| "epoch": 2.733105967038765, |
| "grad_norm": 1.2265625, |
| "learning_rate": 8.898111902403749e-07, |
| "loss": 0.6402, |
| "step": 57950 |
| }, |
| { |
| "epoch": 2.7354641308545102, |
| "grad_norm": 0.98828125, |
| "learning_rate": 8.819506673583927e-07, |
| "loss": 0.6406, |
| "step": 58000 |
| }, |
| { |
| "epoch": 2.7354641308545102, |
| "eval_loss": 0.6410654783248901, |
| "eval_runtime": 469.5278, |
| "eval_samples_per_second": 76.055, |
| "eval_steps_per_second": 38.028, |
| "step": 58000 |
| }, |
| { |
| "epoch": 2.737822294670255, |
| "grad_norm": 1.1015625, |
| "learning_rate": 8.740901444764107e-07, |
| "loss": 0.6313, |
| "step": 58050 |
| }, |
| { |
| "epoch": 2.740180458486, |
| "grad_norm": 0.93359375, |
| "learning_rate": 8.662296215944285e-07, |
| "loss": 0.624, |
| "step": 58100 |
| }, |
| { |
| "epoch": 2.742538622301745, |
| "grad_norm": 1.0, |
| "learning_rate": 8.583690987124464e-07, |
| "loss": 0.6479, |
| "step": 58150 |
| }, |
| { |
| "epoch": 2.7448967861174896, |
| "grad_norm": 1.1171875, |
| "learning_rate": 8.505085758304643e-07, |
| "loss": 0.6311, |
| "step": 58200 |
| }, |
| { |
| "epoch": 2.7472549499332346, |
| "grad_norm": 1.0078125, |
| "learning_rate": 8.426480529484822e-07, |
| "loss": 0.6461, |
| "step": 58250 |
| }, |
| { |
| "epoch": 2.749613113748979, |
| "grad_norm": 1.1015625, |
| "learning_rate": 8.347875300665001e-07, |
| "loss": 0.6463, |
| "step": 58300 |
| }, |
| { |
| "epoch": 2.7519712775647243, |
| "grad_norm": 0.984375, |
| "learning_rate": 8.26927007184518e-07, |
| "loss": 0.6301, |
| "step": 58350 |
| }, |
| { |
| "epoch": 2.7543294413804693, |
| "grad_norm": 1.1640625, |
| "learning_rate": 8.190664843025359e-07, |
| "loss": 0.6478, |
| "step": 58400 |
| }, |
| { |
| "epoch": 2.756687605196214, |
| "grad_norm": 0.98828125, |
| "learning_rate": 8.112059614205537e-07, |
| "loss": 0.632, |
| "step": 58450 |
| }, |
| { |
| "epoch": 2.759045769011959, |
| "grad_norm": 1.0, |
| "learning_rate": 8.033454385385717e-07, |
| "loss": 0.6381, |
| "step": 58500 |
| }, |
| { |
| "epoch": 2.759045769011959, |
| "eval_loss": 0.6410369277000427, |
| "eval_runtime": 470.0761, |
| "eval_samples_per_second": 75.966, |
| "eval_steps_per_second": 37.983, |
| "step": 58500 |
| }, |
| { |
| "epoch": 2.7614039328277036, |
| "grad_norm": 1.0859375, |
| "learning_rate": 7.954849156565895e-07, |
| "loss": 0.6414, |
| "step": 58550 |
| }, |
| { |
| "epoch": 2.7637620966434486, |
| "grad_norm": 1.28125, |
| "learning_rate": 7.876243927746075e-07, |
| "loss": 0.6424, |
| "step": 58600 |
| }, |
| { |
| "epoch": 2.7661202604591937, |
| "grad_norm": 1.0625, |
| "learning_rate": 7.797638698926254e-07, |
| "loss": 0.6327, |
| "step": 58650 |
| }, |
| { |
| "epoch": 2.7684784242749383, |
| "grad_norm": 1.0234375, |
| "learning_rate": 7.719033470106432e-07, |
| "loss": 0.6454, |
| "step": 58700 |
| }, |
| { |
| "epoch": 2.7708365880906833, |
| "grad_norm": 1.0078125, |
| "learning_rate": 7.640428241286612e-07, |
| "loss": 0.6395, |
| "step": 58750 |
| }, |
| { |
| "epoch": 2.773194751906428, |
| "grad_norm": 0.98046875, |
| "learning_rate": 7.56182301246679e-07, |
| "loss": 0.6353, |
| "step": 58800 |
| }, |
| { |
| "epoch": 2.775552915722173, |
| "grad_norm": 0.9921875, |
| "learning_rate": 7.483217783646969e-07, |
| "loss": 0.6427, |
| "step": 58850 |
| }, |
| { |
| "epoch": 2.777911079537918, |
| "grad_norm": 1.1015625, |
| "learning_rate": 7.404612554827148e-07, |
| "loss": 0.6473, |
| "step": 58900 |
| }, |
| { |
| "epoch": 2.7802692433536627, |
| "grad_norm": 0.98828125, |
| "learning_rate": 7.326007326007327e-07, |
| "loss": 0.62, |
| "step": 58950 |
| }, |
| { |
| "epoch": 2.7826274071694077, |
| "grad_norm": 1.140625, |
| "learning_rate": 7.247402097187506e-07, |
| "loss": 0.6349, |
| "step": 59000 |
| }, |
| { |
| "epoch": 2.7826274071694077, |
| "eval_loss": 0.6410152912139893, |
| "eval_runtime": 473.057, |
| "eval_samples_per_second": 75.488, |
| "eval_steps_per_second": 37.744, |
| "step": 59000 |
| }, |
| { |
| "epoch": 2.7849855709851523, |
| "grad_norm": 1.125, |
| "learning_rate": 7.168796868367685e-07, |
| "loss": 0.64, |
| "step": 59050 |
| }, |
| { |
| "epoch": 2.7873437348008974, |
| "grad_norm": 1.296875, |
| "learning_rate": 7.090191639547864e-07, |
| "loss": 0.6439, |
| "step": 59100 |
| }, |
| { |
| "epoch": 2.7897018986166424, |
| "grad_norm": 0.98046875, |
| "learning_rate": 7.011586410728042e-07, |
| "loss": 0.6366, |
| "step": 59150 |
| }, |
| { |
| "epoch": 2.792060062432387, |
| "grad_norm": 0.93359375, |
| "learning_rate": 6.932981181908222e-07, |
| "loss": 0.6253, |
| "step": 59200 |
| }, |
| { |
| "epoch": 2.7944182262481316, |
| "grad_norm": 1.140625, |
| "learning_rate": 6.8543759530884e-07, |
| "loss": 0.6412, |
| "step": 59250 |
| }, |
| { |
| "epoch": 2.7967763900638767, |
| "grad_norm": 0.9921875, |
| "learning_rate": 6.775770724268579e-07, |
| "loss": 0.6454, |
| "step": 59300 |
| }, |
| { |
| "epoch": 2.7991345538796217, |
| "grad_norm": 1.0078125, |
| "learning_rate": 6.697165495448759e-07, |
| "loss": 0.6422, |
| "step": 59350 |
| }, |
| { |
| "epoch": 2.8014927176953663, |
| "grad_norm": 0.95703125, |
| "learning_rate": 6.618560266628937e-07, |
| "loss": 0.6319, |
| "step": 59400 |
| }, |
| { |
| "epoch": 2.8038508815111114, |
| "grad_norm": 1.0234375, |
| "learning_rate": 6.539955037809116e-07, |
| "loss": 0.6447, |
| "step": 59450 |
| }, |
| { |
| "epoch": 2.806209045326856, |
| "grad_norm": 1.1484375, |
| "learning_rate": 6.461349808989295e-07, |
| "loss": 0.6316, |
| "step": 59500 |
| }, |
| { |
| "epoch": 2.806209045326856, |
| "eval_loss": 0.6411675214767456, |
| "eval_runtime": 469.7714, |
| "eval_samples_per_second": 76.016, |
| "eval_steps_per_second": 38.008, |
| "step": 59500 |
| }, |
| { |
| "epoch": 2.808567209142601, |
| "grad_norm": 1.015625, |
| "learning_rate": 6.382744580169474e-07, |
| "loss": 0.6454, |
| "step": 59550 |
| }, |
| { |
| "epoch": 2.810925372958346, |
| "grad_norm": 1.109375, |
| "learning_rate": 6.304139351349652e-07, |
| "loss": 0.641, |
| "step": 59600 |
| }, |
| { |
| "epoch": 2.8132835367740907, |
| "grad_norm": 0.96875, |
| "learning_rate": 6.225534122529831e-07, |
| "loss": 0.6424, |
| "step": 59650 |
| }, |
| { |
| "epoch": 2.8156417005898358, |
| "grad_norm": 0.94921875, |
| "learning_rate": 6.14692889371001e-07, |
| "loss": 0.6486, |
| "step": 59700 |
| }, |
| { |
| "epoch": 2.8179998644055804, |
| "grad_norm": 0.9921875, |
| "learning_rate": 6.068323664890188e-07, |
| "loss": 0.6416, |
| "step": 59750 |
| }, |
| { |
| "epoch": 2.8203580282213254, |
| "grad_norm": 1.0703125, |
| "learning_rate": 5.989718436070368e-07, |
| "loss": 0.6306, |
| "step": 59800 |
| }, |
| { |
| "epoch": 2.8227161920370705, |
| "grad_norm": 0.94921875, |
| "learning_rate": 5.911113207250547e-07, |
| "loss": 0.6408, |
| "step": 59850 |
| }, |
| { |
| "epoch": 2.825074355852815, |
| "grad_norm": 0.93359375, |
| "learning_rate": 5.832507978430726e-07, |
| "loss": 0.6327, |
| "step": 59900 |
| }, |
| { |
| "epoch": 2.82743251966856, |
| "grad_norm": 1.078125, |
| "learning_rate": 5.753902749610905e-07, |
| "loss": 0.6402, |
| "step": 59950 |
| }, |
| { |
| "epoch": 2.8297906834843047, |
| "grad_norm": 1.2890625, |
| "learning_rate": 5.675297520791083e-07, |
| "loss": 0.6373, |
| "step": 60000 |
| }, |
| { |
| "epoch": 2.8297906834843047, |
| "eval_loss": 0.6411817669868469, |
| "eval_runtime": 470.2946, |
| "eval_samples_per_second": 75.931, |
| "eval_steps_per_second": 37.966, |
| "step": 60000 |
| }, |
| { |
| "epoch": 2.83214884730005, |
| "grad_norm": 1.0390625, |
| "learning_rate": 5.596692291971263e-07, |
| "loss": 0.642, |
| "step": 60050 |
| }, |
| { |
| "epoch": 2.834507011115795, |
| "grad_norm": 1.046875, |
| "learning_rate": 5.518087063151441e-07, |
| "loss": 0.635, |
| "step": 60100 |
| }, |
| { |
| "epoch": 2.8368651749315394, |
| "grad_norm": 0.94140625, |
| "learning_rate": 5.43948183433162e-07, |
| "loss": 0.6441, |
| "step": 60150 |
| }, |
| { |
| "epoch": 2.8392233387472845, |
| "grad_norm": 1.1015625, |
| "learning_rate": 5.3608766055118e-07, |
| "loss": 0.6256, |
| "step": 60200 |
| }, |
| { |
| "epoch": 2.841581502563029, |
| "grad_norm": 1.0234375, |
| "learning_rate": 5.282271376691978e-07, |
| "loss": 0.6383, |
| "step": 60250 |
| }, |
| { |
| "epoch": 2.843939666378774, |
| "grad_norm": 1.015625, |
| "learning_rate": 5.203666147872157e-07, |
| "loss": 0.653, |
| "step": 60300 |
| }, |
| { |
| "epoch": 2.846297830194519, |
| "grad_norm": 1.015625, |
| "learning_rate": 5.125060919052336e-07, |
| "loss": 0.6333, |
| "step": 60350 |
| }, |
| { |
| "epoch": 2.848655994010264, |
| "grad_norm": 0.91796875, |
| "learning_rate": 5.046455690232515e-07, |
| "loss": 0.6362, |
| "step": 60400 |
| }, |
| { |
| "epoch": 2.851014157826009, |
| "grad_norm": 1.0625, |
| "learning_rate": 4.967850461412694e-07, |
| "loss": 0.6434, |
| "step": 60450 |
| }, |
| { |
| "epoch": 2.8533723216417535, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.889245232592872e-07, |
| "loss": 0.6395, |
| "step": 60500 |
| }, |
| { |
| "epoch": 2.8533723216417535, |
| "eval_loss": 0.6410468220710754, |
| "eval_runtime": 473.7881, |
| "eval_samples_per_second": 75.371, |
| "eval_steps_per_second": 37.686, |
| "step": 60500 |
| }, |
| { |
| "epoch": 2.8557304854574985, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.810640003773051e-07, |
| "loss": 0.6297, |
| "step": 60550 |
| }, |
| { |
| "epoch": 2.8580886492732436, |
| "grad_norm": 0.9375, |
| "learning_rate": 4.73203477495323e-07, |
| "loss": 0.6338, |
| "step": 60600 |
| }, |
| { |
| "epoch": 2.860446813088988, |
| "grad_norm": 0.94921875, |
| "learning_rate": 4.653429546133409e-07, |
| "loss": 0.6337, |
| "step": 60650 |
| }, |
| { |
| "epoch": 2.8628049769047332, |
| "grad_norm": 1.0234375, |
| "learning_rate": 4.5748243173135876e-07, |
| "loss": 0.6432, |
| "step": 60700 |
| }, |
| { |
| "epoch": 2.865163140720478, |
| "grad_norm": 1.09375, |
| "learning_rate": 4.496219088493767e-07, |
| "loss": 0.6343, |
| "step": 60750 |
| }, |
| { |
| "epoch": 2.867521304536223, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.417613859673946e-07, |
| "loss": 0.6397, |
| "step": 60800 |
| }, |
| { |
| "epoch": 2.869879468351968, |
| "grad_norm": 1.09375, |
| "learning_rate": 4.3390086308541247e-07, |
| "loss": 0.6363, |
| "step": 60850 |
| }, |
| { |
| "epoch": 2.8722376321677126, |
| "grad_norm": 1.0546875, |
| "learning_rate": 4.2604034020343035e-07, |
| "loss": 0.6337, |
| "step": 60900 |
| }, |
| { |
| "epoch": 2.8745957959834576, |
| "grad_norm": 1.046875, |
| "learning_rate": 4.1817981732144824e-07, |
| "loss": 0.64, |
| "step": 60950 |
| }, |
| { |
| "epoch": 2.876953959799202, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.103192944394661e-07, |
| "loss": 0.6338, |
| "step": 61000 |
| }, |
| { |
| "epoch": 2.876953959799202, |
| "eval_loss": 0.6411173343658447, |
| "eval_runtime": 469.9749, |
| "eval_samples_per_second": 75.983, |
| "eval_steps_per_second": 37.991, |
| "step": 61000 |
| }, |
| { |
| "epoch": 2.8793121236149473, |
| "grad_norm": 0.984375, |
| "learning_rate": 4.02458771557484e-07, |
| "loss": 0.6353, |
| "step": 61050 |
| }, |
| { |
| "epoch": 2.8816702874306923, |
| "grad_norm": 1.34375, |
| "learning_rate": 3.9459824867550194e-07, |
| "loss": 0.6449, |
| "step": 61100 |
| }, |
| { |
| "epoch": 2.884028451246437, |
| "grad_norm": 1.3125, |
| "learning_rate": 3.8673772579351983e-07, |
| "loss": 0.6456, |
| "step": 61150 |
| }, |
| { |
| "epoch": 2.886386615062182, |
| "grad_norm": 1.03125, |
| "learning_rate": 3.788772029115377e-07, |
| "loss": 0.6267, |
| "step": 61200 |
| }, |
| { |
| "epoch": 2.8887447788779266, |
| "grad_norm": 1.046875, |
| "learning_rate": 3.710166800295556e-07, |
| "loss": 0.6437, |
| "step": 61250 |
| }, |
| { |
| "epoch": 2.8911029426936716, |
| "grad_norm": 1.0625, |
| "learning_rate": 3.631561571475735e-07, |
| "loss": 0.6396, |
| "step": 61300 |
| }, |
| { |
| "epoch": 2.8934611065094167, |
| "grad_norm": 1.0546875, |
| "learning_rate": 3.5529563426559137e-07, |
| "loss": 0.6386, |
| "step": 61350 |
| }, |
| { |
| "epoch": 2.8958192703251613, |
| "grad_norm": 1.0390625, |
| "learning_rate": 3.4743511138360925e-07, |
| "loss": 0.637, |
| "step": 61400 |
| }, |
| { |
| "epoch": 2.8981774341409063, |
| "grad_norm": 1.0859375, |
| "learning_rate": 3.395745885016272e-07, |
| "loss": 0.6291, |
| "step": 61450 |
| }, |
| { |
| "epoch": 2.900535597956651, |
| "grad_norm": 1.078125, |
| "learning_rate": 3.3171406561964507e-07, |
| "loss": 0.6411, |
| "step": 61500 |
| }, |
| { |
| "epoch": 2.900535597956651, |
| "eval_loss": 0.6411393284797668, |
| "eval_runtime": 474.1568, |
| "eval_samples_per_second": 75.313, |
| "eval_steps_per_second": 37.656, |
| "step": 61500 |
| }, |
| { |
| "epoch": 2.902893761772396, |
| "grad_norm": 1.0859375, |
| "learning_rate": 3.2385354273766296e-07, |
| "loss": 0.6399, |
| "step": 61550 |
| }, |
| { |
| "epoch": 2.905251925588141, |
| "grad_norm": 0.99609375, |
| "learning_rate": 3.1599301985568084e-07, |
| "loss": 0.6441, |
| "step": 61600 |
| }, |
| { |
| "epoch": 2.9076100894038857, |
| "grad_norm": 1.046875, |
| "learning_rate": 3.081324969736987e-07, |
| "loss": 0.6298, |
| "step": 61650 |
| }, |
| { |
| "epoch": 2.9099682532196303, |
| "grad_norm": 1.0546875, |
| "learning_rate": 3.002719740917166e-07, |
| "loss": 0.6394, |
| "step": 61700 |
| }, |
| { |
| "epoch": 2.9123264170353753, |
| "grad_norm": 1.046875, |
| "learning_rate": 2.924114512097345e-07, |
| "loss": 0.6328, |
| "step": 61750 |
| }, |
| { |
| "epoch": 2.9146845808511204, |
| "grad_norm": 0.9765625, |
| "learning_rate": 2.845509283277524e-07, |
| "loss": 0.6292, |
| "step": 61800 |
| }, |
| { |
| "epoch": 2.9170427446668654, |
| "grad_norm": 1.03125, |
| "learning_rate": 2.7669040544577026e-07, |
| "loss": 0.6324, |
| "step": 61850 |
| }, |
| { |
| "epoch": 2.91940090848261, |
| "grad_norm": 1.171875, |
| "learning_rate": 2.6882988256378815e-07, |
| "loss": 0.6342, |
| "step": 61900 |
| }, |
| { |
| "epoch": 2.9217590722983546, |
| "grad_norm": 1.078125, |
| "learning_rate": 2.6096935968180603e-07, |
| "loss": 0.6352, |
| "step": 61950 |
| }, |
| { |
| "epoch": 2.9241172361140997, |
| "grad_norm": 1.0546875, |
| "learning_rate": 2.5310883679982397e-07, |
| "loss": 0.6279, |
| "step": 62000 |
| }, |
| { |
| "epoch": 2.9241172361140997, |
| "eval_loss": 0.6410698294639587, |
| "eval_runtime": 471.6688, |
| "eval_samples_per_second": 75.71, |
| "eval_steps_per_second": 37.855, |
| "step": 62000 |
| }, |
| { |
| "epoch": 2.9264753999298447, |
| "grad_norm": 0.96875, |
| "learning_rate": 2.4524831391784185e-07, |
| "loss": 0.6303, |
| "step": 62050 |
| }, |
| { |
| "epoch": 2.9288335637455893, |
| "grad_norm": 1.1953125, |
| "learning_rate": 2.3738779103585974e-07, |
| "loss": 0.6424, |
| "step": 62100 |
| }, |
| { |
| "epoch": 2.9311917275613344, |
| "grad_norm": 1.1953125, |
| "learning_rate": 2.2952726815387762e-07, |
| "loss": 0.6429, |
| "step": 62150 |
| }, |
| { |
| "epoch": 2.933549891377079, |
| "grad_norm": 1.0859375, |
| "learning_rate": 2.216667452718955e-07, |
| "loss": 0.6431, |
| "step": 62200 |
| }, |
| { |
| "epoch": 2.935908055192824, |
| "grad_norm": 1.0390625, |
| "learning_rate": 2.1380622238991342e-07, |
| "loss": 0.6369, |
| "step": 62250 |
| }, |
| { |
| "epoch": 2.938266219008569, |
| "grad_norm": 1.0234375, |
| "learning_rate": 2.059456995079313e-07, |
| "loss": 0.6399, |
| "step": 62300 |
| }, |
| { |
| "epoch": 2.9406243828243137, |
| "grad_norm": 1.1875, |
| "learning_rate": 1.9808517662594916e-07, |
| "loss": 0.6258, |
| "step": 62350 |
| }, |
| { |
| "epoch": 2.9429825466400588, |
| "grad_norm": 1.0, |
| "learning_rate": 1.9022465374396705e-07, |
| "loss": 0.6411, |
| "step": 62400 |
| }, |
| { |
| "epoch": 2.9453407104558034, |
| "grad_norm": 0.9296875, |
| "learning_rate": 1.8236413086198493e-07, |
| "loss": 0.6371, |
| "step": 62450 |
| }, |
| { |
| "epoch": 2.9476988742715484, |
| "grad_norm": 1.078125, |
| "learning_rate": 1.7450360798000284e-07, |
| "loss": 0.6454, |
| "step": 62500 |
| }, |
| { |
| "epoch": 2.9476988742715484, |
| "eval_loss": 0.6410679221153259, |
| "eval_runtime": 471.0373, |
| "eval_samples_per_second": 75.811, |
| "eval_steps_per_second": 37.906, |
| "step": 62500 |
| }, |
| { |
| "epoch": 2.9500570380872935, |
| "grad_norm": 0.99609375, |
| "learning_rate": 1.6664308509802073e-07, |
| "loss": 0.6419, |
| "step": 62550 |
| }, |
| { |
| "epoch": 2.952415201903038, |
| "grad_norm": 1.078125, |
| "learning_rate": 1.587825622160386e-07, |
| "loss": 0.6397, |
| "step": 62600 |
| }, |
| { |
| "epoch": 2.954773365718783, |
| "grad_norm": 1.0, |
| "learning_rate": 1.5092203933405652e-07, |
| "loss": 0.6429, |
| "step": 62650 |
| }, |
| { |
| "epoch": 2.9571315295345277, |
| "grad_norm": 1.09375, |
| "learning_rate": 1.430615164520744e-07, |
| "loss": 0.6389, |
| "step": 62700 |
| }, |
| { |
| "epoch": 2.959489693350273, |
| "grad_norm": 0.921875, |
| "learning_rate": 1.352009935700923e-07, |
| "loss": 0.6471, |
| "step": 62750 |
| }, |
| { |
| "epoch": 2.961847857166018, |
| "grad_norm": 1.0234375, |
| "learning_rate": 1.2734047068811018e-07, |
| "loss": 0.6362, |
| "step": 62800 |
| }, |
| { |
| "epoch": 2.9642060209817624, |
| "grad_norm": 0.8984375, |
| "learning_rate": 1.1947994780612809e-07, |
| "loss": 0.6305, |
| "step": 62850 |
| }, |
| { |
| "epoch": 2.9665641847975075, |
| "grad_norm": 1.015625, |
| "learning_rate": 1.1161942492414596e-07, |
| "loss": 0.642, |
| "step": 62900 |
| }, |
| { |
| "epoch": 2.968922348613252, |
| "grad_norm": 1.0234375, |
| "learning_rate": 1.0375890204216384e-07, |
| "loss": 0.6424, |
| "step": 62950 |
| }, |
| { |
| "epoch": 2.971280512428997, |
| "grad_norm": 0.9765625, |
| "learning_rate": 9.589837916018174e-08, |
| "loss": 0.6486, |
| "step": 63000 |
| }, |
| { |
| "epoch": 2.971280512428997, |
| "eval_loss": 0.6411649584770203, |
| "eval_runtime": 470.5283, |
| "eval_samples_per_second": 75.893, |
| "eval_steps_per_second": 37.947, |
| "step": 63000 |
| }, |
| { |
| "epoch": 2.973638676244742, |
| "grad_norm": 1.1015625, |
| "learning_rate": 8.803785627819964e-08, |
| "loss": 0.6288, |
| "step": 63050 |
| }, |
| { |
| "epoch": 2.975996840060487, |
| "grad_norm": 1.0078125, |
| "learning_rate": 8.017733339621752e-08, |
| "loss": 0.643, |
| "step": 63100 |
| }, |
| { |
| "epoch": 2.978355003876232, |
| "grad_norm": 1.0703125, |
| "learning_rate": 7.231681051423542e-08, |
| "loss": 0.6484, |
| "step": 63150 |
| }, |
| { |
| "epoch": 2.9807131676919765, |
| "grad_norm": 1.375, |
| "learning_rate": 6.44562876322533e-08, |
| "loss": 0.6447, |
| "step": 63200 |
| }, |
| { |
| "epoch": 2.9830713315077215, |
| "grad_norm": 1.078125, |
| "learning_rate": 5.659576475027119e-08, |
| "loss": 0.6435, |
| "step": 63250 |
| }, |
| { |
| "epoch": 2.9854294953234666, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.873524186828909e-08, |
| "loss": 0.6504, |
| "step": 63300 |
| }, |
| { |
| "epoch": 2.987787659139211, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.087471898630698e-08, |
| "loss": 0.6366, |
| "step": 63350 |
| }, |
| { |
| "epoch": 2.9901458229549562, |
| "grad_norm": 1.046875, |
| "learning_rate": 3.301419610432486e-08, |
| "loss": 0.6427, |
| "step": 63400 |
| }, |
| { |
| "epoch": 2.992503986770701, |
| "grad_norm": 1.1015625, |
| "learning_rate": 2.515367322234275e-08, |
| "loss": 0.6422, |
| "step": 63450 |
| }, |
| { |
| "epoch": 2.994862150586446, |
| "grad_norm": 1.0390625, |
| "learning_rate": 1.729315034036064e-08, |
| "loss": 0.6265, |
| "step": 63500 |
| }, |
| { |
| "epoch": 2.994862150586446, |
| "eval_loss": 0.6410553455352783, |
| "eval_runtime": 471.7509, |
| "eval_samples_per_second": 75.697, |
| "eval_steps_per_second": 37.848, |
| "step": 63500 |
| }, |
| { |
| "epoch": 2.997220314402191, |
| "grad_norm": 1.0390625, |
| "learning_rate": 9.432627458378533e-09, |
| "loss": 0.6428, |
| "step": 63550 |
| }, |
| { |
| "epoch": 2.9995784782179356, |
| "grad_norm": 1.265625, |
| "learning_rate": 1.572104576396422e-09, |
| "loss": 0.6394, |
| "step": 63600 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 63609, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 3, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 3 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.845666644465661e+19, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
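
A minimal Python sketch for summarizing the state file above programmatically, assuming the JSON is saved locally as `trainer_state.json` (the file name and path are assumptions for illustration, not taken from the file itself). It splits `log_history` into training and evaluation entries and reports the lowest logged `eval_loss` — the quantity the `EarlyStoppingCallback` recorded here (patience 3, counter at 3) tracks before setting `should_training_stop`:

```python
# Sketch: summarize a transformers Trainer state file.
# Assumes the JSON above is saved as "trainer_state.json" (illustrative path).
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

# Lowest eval loss across all logged evaluations and where it occurred.
best = min(eval_logs, key=lambda e: e["eval_loss"])

print(f"train log entries: {len(train_logs)}, eval entries: {len(eval_logs)}")
print(f"lowest eval_loss: {best['eval_loss']:.6f} at step {best['step']}")
print(f"last train loss: {train_logs[-1]['loss']} at step {train_logs[-1]['step']}")
```

Only the standard library is used; swapping the final prints for a matplotlib plot of `step` vs. `loss`/`eval_loss` is a natural extension if visual inspection of the plateau is wanted.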