{
"best_global_step": 33000,
"best_metric": 0.6409004926681519,
"best_model_checkpoint": "/home/ubuntu/deepseek-math-b200-resumed/checkpoint-33000",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 63609,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 4.7163276314897404e-05,
"grad_norm": 2.6875,
"learning_rate": 0.0,
"loss": 1.4702,
"step": 1
},
{
"epoch": 0.0023581638157448705,
"grad_norm": 16.5,
"learning_rate": 4.9000000000000005e-06,
"loss": 0.8687,
"step": 50
},
{
"epoch": 0.004716327631489741,
"grad_norm": 1.9765625,
"learning_rate": 9.9e-06,
"loss": 0.762,
"step": 100
},
{
"epoch": 0.007074491447234611,
"grad_norm": 1.984375,
"learning_rate": 9.99228455809413e-06,
"loss": 0.7395,
"step": 150
},
{
"epoch": 0.009432655262979482,
"grad_norm": 2.0625,
"learning_rate": 9.984411658190178e-06,
"loss": 0.7109,
"step": 200
},
{
"epoch": 0.011790819078724351,
"grad_norm": 1.7578125,
"learning_rate": 9.976538758286228e-06,
"loss": 0.6712,
"step": 250
},
{
"epoch": 0.014148982894469222,
"grad_norm": 1.9296875,
"learning_rate": 9.968665858382277e-06,
"loss": 0.6872,
"step": 300
},
{
"epoch": 0.016507146710214093,
"grad_norm": 1.546875,
"learning_rate": 9.960792958478327e-06,
"loss": 0.6699,
"step": 350
},
{
"epoch": 0.018865310525958964,
"grad_norm": 1.8828125,
"learning_rate": 9.952920058574375e-06,
"loss": 0.6767,
"step": 400
},
{
"epoch": 0.02122347434170383,
"grad_norm": 1.8359375,
"learning_rate": 9.945047158670426e-06,
"loss": 0.6632,
"step": 450
},
{
"epoch": 0.023581638157448702,
"grad_norm": 1.9765625,
"learning_rate": 9.937174258766474e-06,
"loss": 0.6635,
"step": 500
},
{
"epoch": 0.023581638157448702,
"eval_loss": 0.6704570055007935,
"eval_runtime": 471.8556,
"eval_samples_per_second": 75.68,
"eval_steps_per_second": 37.84,
"step": 500
},
{
"epoch": 0.025939801973193573,
"grad_norm": 1.7421875,
"learning_rate": 9.929301358862524e-06,
"loss": 0.6684,
"step": 550
},
{
"epoch": 0.028297965788938444,
"grad_norm": 1.859375,
"learning_rate": 9.921428458958575e-06,
"loss": 0.6743,
"step": 600
},
{
"epoch": 0.030656129604683315,
"grad_norm": 1.7421875,
"learning_rate": 9.913555559054623e-06,
"loss": 0.6726,
"step": 650
},
{
"epoch": 0.033014293420428185,
"grad_norm": 1.953125,
"learning_rate": 9.905682659150673e-06,
"loss": 0.6692,
"step": 700
},
{
"epoch": 0.035372457236173056,
"grad_norm": 2.015625,
"learning_rate": 9.897809759246722e-06,
"loss": 0.6622,
"step": 750
},
{
"epoch": 0.03773062105191793,
"grad_norm": 1.765625,
"learning_rate": 9.889936859342772e-06,
"loss": 0.6613,
"step": 800
},
{
"epoch": 0.04008878486766279,
"grad_norm": 1.859375,
"learning_rate": 9.88206395943882e-06,
"loss": 0.6643,
"step": 850
},
{
"epoch": 0.04244694868340766,
"grad_norm": 1.6171875,
"learning_rate": 9.87419105953487e-06,
"loss": 0.6457,
"step": 900
},
{
"epoch": 0.04480511249915253,
"grad_norm": 1.734375,
"learning_rate": 9.86631815963092e-06,
"loss": 0.6598,
"step": 950
},
{
"epoch": 0.047163276314897404,
"grad_norm": 1.9375,
"learning_rate": 9.858445259726968e-06,
"loss": 0.6687,
"step": 1000
},
{
"epoch": 0.047163276314897404,
"eval_loss": 0.6613409519195557,
"eval_runtime": 471.2575,
"eval_samples_per_second": 75.776,
"eval_steps_per_second": 37.888,
"step": 1000
},
{
"epoch": 0.049521440130642275,
"grad_norm": 1.609375,
"learning_rate": 9.850572359823018e-06,
"loss": 0.6618,
"step": 1050
},
{
"epoch": 0.051879603946387146,
"grad_norm": 1.578125,
"learning_rate": 9.842699459919067e-06,
"loss": 0.654,
"step": 1100
},
{
"epoch": 0.054237767762132016,
"grad_norm": 1.8046875,
"learning_rate": 9.834826560015117e-06,
"loss": 0.6651,
"step": 1150
},
{
"epoch": 0.05659593157787689,
"grad_norm": 1.8671875,
"learning_rate": 9.826953660111166e-06,
"loss": 0.6597,
"step": 1200
},
{
"epoch": 0.05895409539362176,
"grad_norm": 1.703125,
"learning_rate": 9.819080760207216e-06,
"loss": 0.6654,
"step": 1250
},
{
"epoch": 0.06131225920936663,
"grad_norm": 1.78125,
"learning_rate": 9.811207860303264e-06,
"loss": 0.6743,
"step": 1300
},
{
"epoch": 0.0636704230251115,
"grad_norm": 1.75,
"learning_rate": 9.803334960399315e-06,
"loss": 0.6578,
"step": 1350
},
{
"epoch": 0.06602858684085637,
"grad_norm": 1.765625,
"learning_rate": 9.795462060495363e-06,
"loss": 0.6495,
"step": 1400
},
{
"epoch": 0.06838675065660124,
"grad_norm": 1.59375,
"learning_rate": 9.787589160591413e-06,
"loss": 0.668,
"step": 1450
},
{
"epoch": 0.07074491447234611,
"grad_norm": 1.8203125,
"learning_rate": 9.779716260687464e-06,
"loss": 0.6636,
"step": 1500
},
{
"epoch": 0.07074491447234611,
"eval_loss": 0.6566535830497742,
"eval_runtime": 469.1444,
"eval_samples_per_second": 76.117,
"eval_steps_per_second": 38.059,
"step": 1500
},
{
"epoch": 0.07310307828809098,
"grad_norm": 1.734375,
"learning_rate": 9.771843360783512e-06,
"loss": 0.6704,
"step": 1550
},
{
"epoch": 0.07546124210383585,
"grad_norm": 1.6875,
"learning_rate": 9.76397046087956e-06,
"loss": 0.6636,
"step": 1600
},
{
"epoch": 0.07781940591958073,
"grad_norm": 1.7109375,
"learning_rate": 9.756097560975611e-06,
"loss": 0.6451,
"step": 1650
},
{
"epoch": 0.08017756973532558,
"grad_norm": 1.796875,
"learning_rate": 9.74822466107166e-06,
"loss": 0.6523,
"step": 1700
},
{
"epoch": 0.08253573355107045,
"grad_norm": 1.7578125,
"learning_rate": 9.74035176116771e-06,
"loss": 0.6636,
"step": 1750
},
{
"epoch": 0.08489389736681532,
"grad_norm": 1.953125,
"learning_rate": 9.732478861263758e-06,
"loss": 0.645,
"step": 1800
},
{
"epoch": 0.0872520611825602,
"grad_norm": 1.5703125,
"learning_rate": 9.724605961359807e-06,
"loss": 0.6565,
"step": 1850
},
{
"epoch": 0.08961022499830507,
"grad_norm": 1.5625,
"learning_rate": 9.716733061455857e-06,
"loss": 0.6424,
"step": 1900
},
{
"epoch": 0.09196838881404994,
"grad_norm": 1.7734375,
"learning_rate": 9.708860161551907e-06,
"loss": 0.6612,
"step": 1950
},
{
"epoch": 0.09432655262979481,
"grad_norm": 1.90625,
"learning_rate": 9.700987261647956e-06,
"loss": 0.6457,
"step": 2000
},
{
"epoch": 0.09432655262979481,
"eval_loss": 0.6536301374435425,
"eval_runtime": 469.9529,
"eval_samples_per_second": 75.986,
"eval_steps_per_second": 37.993,
"step": 2000
},
{
"epoch": 0.09668471644553968,
"grad_norm": 1.578125,
"learning_rate": 9.693114361744006e-06,
"loss": 0.6538,
"step": 2050
},
{
"epoch": 0.09904288026128455,
"grad_norm": 1.765625,
"learning_rate": 9.685241461840055e-06,
"loss": 0.6505,
"step": 2100
},
{
"epoch": 0.10140104407702942,
"grad_norm": 1.9765625,
"learning_rate": 9.677368561936105e-06,
"loss": 0.6554,
"step": 2150
},
{
"epoch": 0.10375920789277429,
"grad_norm": 1.78125,
"learning_rate": 9.669495662032153e-06,
"loss": 0.6541,
"step": 2200
},
{
"epoch": 0.10611737170851916,
"grad_norm": 1.9453125,
"learning_rate": 9.661622762128204e-06,
"loss": 0.6525,
"step": 2250
},
{
"epoch": 0.10847553552426403,
"grad_norm": 1.8203125,
"learning_rate": 9.653749862224252e-06,
"loss": 0.6487,
"step": 2300
},
{
"epoch": 0.1108336993400089,
"grad_norm": 1.78125,
"learning_rate": 9.645876962320302e-06,
"loss": 0.6497,
"step": 2350
},
{
"epoch": 0.11319186315575377,
"grad_norm": 1.515625,
"learning_rate": 9.638004062416351e-06,
"loss": 0.6603,
"step": 2400
},
{
"epoch": 0.11555002697149865,
"grad_norm": 1.796875,
"learning_rate": 9.630131162512401e-06,
"loss": 0.6615,
"step": 2450
},
{
"epoch": 0.11790819078724352,
"grad_norm": 1.671875,
"learning_rate": 9.62225826260845e-06,
"loss": 0.6497,
"step": 2500
},
{
"epoch": 0.11790819078724352,
"eval_loss": 0.6514254212379456,
"eval_runtime": 468.4051,
"eval_samples_per_second": 76.237,
"eval_steps_per_second": 38.119,
"step": 2500
},
{
"epoch": 0.12026635460298839,
"grad_norm": 1.9375,
"learning_rate": 9.6143853627045e-06,
"loss": 0.6438,
"step": 2550
},
{
"epoch": 0.12262451841873326,
"grad_norm": 1.8671875,
"learning_rate": 9.606512462800549e-06,
"loss": 0.6503,
"step": 2600
},
{
"epoch": 0.12498268223447813,
"grad_norm": 1.7578125,
"learning_rate": 9.598639562896597e-06,
"loss": 0.6413,
"step": 2650
},
{
"epoch": 0.127340846050223,
"grad_norm": 1.6796875,
"learning_rate": 9.590766662992647e-06,
"loss": 0.6598,
"step": 2700
},
{
"epoch": 0.12969900986596786,
"grad_norm": 1.5546875,
"learning_rate": 9.582893763088696e-06,
"loss": 0.6363,
"step": 2750
},
{
"epoch": 0.13205717368171274,
"grad_norm": 1.8125,
"learning_rate": 9.575020863184746e-06,
"loss": 0.6558,
"step": 2800
},
{
"epoch": 0.1344153374974576,
"grad_norm": 2.078125,
"learning_rate": 9.567147963280795e-06,
"loss": 0.6429,
"step": 2850
},
{
"epoch": 0.13677350131320248,
"grad_norm": 1.6875,
"learning_rate": 9.559275063376845e-06,
"loss": 0.6619,
"step": 2900
},
{
"epoch": 0.13913166512894734,
"grad_norm": 1.9921875,
"learning_rate": 9.551402163472895e-06,
"loss": 0.6607,
"step": 2950
},
{
"epoch": 0.14148982894469223,
"grad_norm": 1.8125,
"learning_rate": 9.543529263568944e-06,
"loss": 0.6558,
"step": 3000
},
{
"epoch": 0.14148982894469223,
"eval_loss": 0.649859607219696,
"eval_runtime": 471.3671,
"eval_samples_per_second": 75.758,
"eval_steps_per_second": 37.879,
"step": 3000
},
{
"epoch": 0.14384799276043708,
"grad_norm": 1.7109375,
"learning_rate": 9.535656363664994e-06,
"loss": 0.6347,
"step": 3050
},
{
"epoch": 0.14620615657618197,
"grad_norm": 1.8203125,
"learning_rate": 9.527783463761042e-06,
"loss": 0.6467,
"step": 3100
},
{
"epoch": 0.14856432039192682,
"grad_norm": 1.75,
"learning_rate": 9.519910563857093e-06,
"loss": 0.6584,
"step": 3150
},
{
"epoch": 0.1509224842076717,
"grad_norm": 2.015625,
"learning_rate": 9.512037663953141e-06,
"loss": 0.6499,
"step": 3200
},
{
"epoch": 0.15328064802341657,
"grad_norm": 1.84375,
"learning_rate": 9.504164764049192e-06,
"loss": 0.6528,
"step": 3250
},
{
"epoch": 0.15563881183916145,
"grad_norm": 1.8828125,
"learning_rate": 9.49629186414524e-06,
"loss": 0.6434,
"step": 3300
},
{
"epoch": 0.1579969756549063,
"grad_norm": 1.7265625,
"learning_rate": 9.488418964241289e-06,
"loss": 0.6542,
"step": 3350
},
{
"epoch": 0.16035513947065116,
"grad_norm": 1.78125,
"learning_rate": 9.480546064337339e-06,
"loss": 0.6469,
"step": 3400
},
{
"epoch": 0.16271330328639605,
"grad_norm": 1.75,
"learning_rate": 9.472673164433387e-06,
"loss": 0.6385,
"step": 3450
},
{
"epoch": 0.1650714671021409,
"grad_norm": 1.9609375,
"learning_rate": 9.464800264529438e-06,
"loss": 0.6535,
"step": 3500
},
{
"epoch": 0.1650714671021409,
"eval_loss": 0.6484876871109009,
"eval_runtime": 476.3019,
"eval_samples_per_second": 74.973,
"eval_steps_per_second": 37.487,
"step": 3500
},
{
"epoch": 0.1674296309178858,
"grad_norm": 1.875,
"learning_rate": 9.456927364625486e-06,
"loss": 0.6531,
"step": 3550
},
{
"epoch": 0.16978779473363065,
"grad_norm": 1.6796875,
"learning_rate": 9.449054464721536e-06,
"loss": 0.6472,
"step": 3600
},
{
"epoch": 0.17214595854937553,
"grad_norm": 1.625,
"learning_rate": 9.441181564817585e-06,
"loss": 0.6474,
"step": 3650
},
{
"epoch": 0.1745041223651204,
"grad_norm": 1.6953125,
"learning_rate": 9.433308664913635e-06,
"loss": 0.6498,
"step": 3700
},
{
"epoch": 0.17686228618086527,
"grad_norm": 1.734375,
"learning_rate": 9.425435765009684e-06,
"loss": 0.6615,
"step": 3750
},
{
"epoch": 0.17922044999661013,
"grad_norm": 2.015625,
"learning_rate": 9.417562865105734e-06,
"loss": 0.6431,
"step": 3800
},
{
"epoch": 0.18157861381235502,
"grad_norm": 1.7421875,
"learning_rate": 9.409689965201784e-06,
"loss": 0.6369,
"step": 3850
},
{
"epoch": 0.18393677762809987,
"grad_norm": 1.671875,
"learning_rate": 9.401817065297833e-06,
"loss": 0.6329,
"step": 3900
},
{
"epoch": 0.18629494144384476,
"grad_norm": 1.7421875,
"learning_rate": 9.393944165393883e-06,
"loss": 0.6452,
"step": 3950
},
{
"epoch": 0.18865310525958962,
"grad_norm": 2.109375,
"learning_rate": 9.386071265489932e-06,
"loss": 0.6562,
"step": 4000
},
{
"epoch": 0.18865310525958962,
"eval_loss": 0.6474871635437012,
"eval_runtime": 471.3025,
"eval_samples_per_second": 75.769,
"eval_steps_per_second": 37.884,
"step": 4000
},
{
"epoch": 0.1910112690753345,
"grad_norm": 1.9375,
"learning_rate": 9.378198365585982e-06,
"loss": 0.6507,
"step": 4050
},
{
"epoch": 0.19336943289107936,
"grad_norm": 1.8828125,
"learning_rate": 9.37032546568203e-06,
"loss": 0.6486,
"step": 4100
},
{
"epoch": 0.19572759670682424,
"grad_norm": 1.8046875,
"learning_rate": 9.362452565778079e-06,
"loss": 0.6593,
"step": 4150
},
{
"epoch": 0.1980857605225691,
"grad_norm": 1.546875,
"learning_rate": 9.354579665874129e-06,
"loss": 0.6526,
"step": 4200
},
{
"epoch": 0.20044392433831398,
"grad_norm": 1.8046875,
"learning_rate": 9.346706765970178e-06,
"loss": 0.6393,
"step": 4250
},
{
"epoch": 0.20280208815405884,
"grad_norm": 1.9609375,
"learning_rate": 9.338833866066228e-06,
"loss": 0.6465,
"step": 4300
},
{
"epoch": 0.20516025196980373,
"grad_norm": 1.8515625,
"learning_rate": 9.330960966162276e-06,
"loss": 0.6501,
"step": 4350
},
{
"epoch": 0.20751841578554858,
"grad_norm": 1.9453125,
"learning_rate": 9.323088066258327e-06,
"loss": 0.6485,
"step": 4400
},
{
"epoch": 0.20987657960129344,
"grad_norm": 1.875,
"learning_rate": 9.315215166354375e-06,
"loss": 0.6468,
"step": 4450
},
{
"epoch": 0.21223474341703832,
"grad_norm": 1.7890625,
"learning_rate": 9.307342266450425e-06,
"loss": 0.6492,
"step": 4500
},
{
"epoch": 0.21223474341703832,
"eval_loss": 0.6467618346214294,
"eval_runtime": 472.4529,
"eval_samples_per_second": 75.584,
"eval_steps_per_second": 37.792,
"step": 4500
},
{
"epoch": 0.21459290723278318,
"grad_norm": 1.5390625,
"learning_rate": 9.299469366546474e-06,
"loss": 0.6532,
"step": 4550
},
{
"epoch": 0.21695107104852807,
"grad_norm": 1.9375,
"learning_rate": 9.291596466642524e-06,
"loss": 0.6518,
"step": 4600
},
{
"epoch": 0.21930923486427292,
"grad_norm": 1.8203125,
"learning_rate": 9.283723566738573e-06,
"loss": 0.6533,
"step": 4650
},
{
"epoch": 0.2216673986800178,
"grad_norm": 1.8203125,
"learning_rate": 9.275850666834623e-06,
"loss": 0.6467,
"step": 4700
},
{
"epoch": 0.22402556249576266,
"grad_norm": 1.78125,
"learning_rate": 9.267977766930673e-06,
"loss": 0.6557,
"step": 4750
},
{
"epoch": 0.22638372631150755,
"grad_norm": 1.875,
"learning_rate": 9.260104867026722e-06,
"loss": 0.6405,
"step": 4800
},
{
"epoch": 0.2287418901272524,
"grad_norm": 1.8828125,
"learning_rate": 9.25223196712277e-06,
"loss": 0.6461,
"step": 4850
},
{
"epoch": 0.2311000539429973,
"grad_norm": 1.546875,
"learning_rate": 9.24435906721882e-06,
"loss": 0.6404,
"step": 4900
},
{
"epoch": 0.23345821775874215,
"grad_norm": 1.796875,
"learning_rate": 9.236486167314869e-06,
"loss": 0.6345,
"step": 4950
},
{
"epoch": 0.23581638157448703,
"grad_norm": 1.6640625,
"learning_rate": 9.22861326741092e-06,
"loss": 0.6569,
"step": 5000
},
{
"epoch": 0.23581638157448703,
"eval_loss": 0.6459140777587891,
"eval_runtime": 471.5037,
"eval_samples_per_second": 75.736,
"eval_steps_per_second": 37.868,
"step": 5000
},
{
"epoch": 0.2381745453902319,
"grad_norm": 1.9140625,
"learning_rate": 9.220740367506968e-06,
"loss": 0.627,
"step": 5050
},
{
"epoch": 0.24053270920597677,
"grad_norm": 1.7890625,
"learning_rate": 9.212867467603016e-06,
"loss": 0.6353,
"step": 5100
},
{
"epoch": 0.24289087302172163,
"grad_norm": 1.734375,
"learning_rate": 9.204994567699067e-06,
"loss": 0.6251,
"step": 5150
},
{
"epoch": 0.24524903683746652,
"grad_norm": 1.9375,
"learning_rate": 9.197121667795117e-06,
"loss": 0.652,
"step": 5200
},
{
"epoch": 0.24760720065321137,
"grad_norm": 1.71875,
"learning_rate": 9.189248767891166e-06,
"loss": 0.6512,
"step": 5250
},
{
"epoch": 0.24996536446895626,
"grad_norm": 1.671875,
"learning_rate": 9.181375867987216e-06,
"loss": 0.6522,
"step": 5300
},
{
"epoch": 0.2523235282847011,
"grad_norm": 1.8046875,
"learning_rate": 9.173502968083264e-06,
"loss": 0.6397,
"step": 5350
},
{
"epoch": 0.254681692100446,
"grad_norm": 2.03125,
"learning_rate": 9.165630068179315e-06,
"loss": 0.6427,
"step": 5400
},
{
"epoch": 0.2570398559161909,
"grad_norm": 1.6328125,
"learning_rate": 9.157757168275363e-06,
"loss": 0.6367,
"step": 5450
},
{
"epoch": 0.2593980197319357,
"grad_norm": 1.78125,
"learning_rate": 9.149884268371413e-06,
"loss": 0.6441,
"step": 5500
},
{
"epoch": 0.2593980197319357,
"eval_loss": 0.6453782320022583,
"eval_runtime": 474.9186,
"eval_samples_per_second": 75.192,
"eval_steps_per_second": 37.596,
"step": 5500
},
{
"epoch": 0.2617561835476806,
"grad_norm": 1.7265625,
"learning_rate": 9.142011368467462e-06,
"loss": 0.6361,
"step": 5550
},
{
"epoch": 0.2641143473634255,
"grad_norm": 1.84375,
"learning_rate": 9.134138468563512e-06,
"loss": 0.6535,
"step": 5600
},
{
"epoch": 0.2664725111791703,
"grad_norm": 1.6953125,
"learning_rate": 9.12626556865956e-06,
"loss": 0.6484,
"step": 5650
},
{
"epoch": 0.2688306749949152,
"grad_norm": 1.9375,
"learning_rate": 9.118392668755611e-06,
"loss": 0.6505,
"step": 5700
},
{
"epoch": 0.2711888388106601,
"grad_norm": 1.65625,
"learning_rate": 9.11051976885166e-06,
"loss": 0.6319,
"step": 5750
},
{
"epoch": 0.27354700262640497,
"grad_norm": 1.90625,
"learning_rate": 9.102646868947708e-06,
"loss": 0.6495,
"step": 5800
},
{
"epoch": 0.2759051664421498,
"grad_norm": 1.8671875,
"learning_rate": 9.094773969043758e-06,
"loss": 0.65,
"step": 5850
},
{
"epoch": 0.2782633302578947,
"grad_norm": 1.78125,
"learning_rate": 9.086901069139807e-06,
"loss": 0.6459,
"step": 5900
},
{
"epoch": 0.28062149407363957,
"grad_norm": 1.8828125,
"learning_rate": 9.079028169235857e-06,
"loss": 0.6462,
"step": 5950
},
{
"epoch": 0.28297965788938445,
"grad_norm": 1.765625,
"learning_rate": 9.071155269331906e-06,
"loss": 0.6378,
"step": 6000
},
{
"epoch": 0.28297965788938445,
"eval_loss": 0.6447737812995911,
"eval_runtime": 470.2799,
"eval_samples_per_second": 75.933,
"eval_steps_per_second": 37.967,
"step": 6000
},
{
"epoch": 0.2853378217051293,
"grad_norm": 1.765625,
"learning_rate": 9.063282369427956e-06,
"loss": 0.6452,
"step": 6050
},
{
"epoch": 0.28769598552087416,
"grad_norm": 1.703125,
"learning_rate": 9.055409469524004e-06,
"loss": 0.6443,
"step": 6100
},
{
"epoch": 0.29005414933661905,
"grad_norm": 2.046875,
"learning_rate": 9.047536569620055e-06,
"loss": 0.6429,
"step": 6150
},
{
"epoch": 0.29241231315236393,
"grad_norm": 1.828125,
"learning_rate": 9.039663669716105e-06,
"loss": 0.6313,
"step": 6200
},
{
"epoch": 0.29477047696810876,
"grad_norm": 1.984375,
"learning_rate": 9.031790769812153e-06,
"loss": 0.6576,
"step": 6250
},
{
"epoch": 0.29712864078385365,
"grad_norm": 1.6328125,
"learning_rate": 9.023917869908204e-06,
"loss": 0.6467,
"step": 6300
},
{
"epoch": 0.29948680459959853,
"grad_norm": 1.8203125,
"learning_rate": 9.016044970004252e-06,
"loss": 0.6372,
"step": 6350
},
{
"epoch": 0.3018449684153434,
"grad_norm": 1.6484375,
"learning_rate": 9.008172070100302e-06,
"loss": 0.6413,
"step": 6400
},
{
"epoch": 0.30420313223108825,
"grad_norm": 1.8125,
"learning_rate": 9.000299170196351e-06,
"loss": 0.6488,
"step": 6450
},
{
"epoch": 0.30656129604683313,
"grad_norm": 2.0,
"learning_rate": 8.992426270292401e-06,
"loss": 0.6407,
"step": 6500
},
{
"epoch": 0.30656129604683313,
"eval_loss": 0.6444392800331116,
"eval_runtime": 471.0868,
"eval_samples_per_second": 75.803,
"eval_steps_per_second": 37.902,
"step": 6500
},
{
"epoch": 0.308919459862578,
"grad_norm": 1.9140625,
"learning_rate": 8.98455337038845e-06,
"loss": 0.6493,
"step": 6550
},
{
"epoch": 0.3112776236783229,
"grad_norm": 2.0,
"learning_rate": 8.976680470484498e-06,
"loss": 0.6578,
"step": 6600
},
{
"epoch": 0.31363578749406773,
"grad_norm": 1.7890625,
"learning_rate": 8.968807570580548e-06,
"loss": 0.6534,
"step": 6650
},
{
"epoch": 0.3159939513098126,
"grad_norm": 1.8125,
"learning_rate": 8.960934670676597e-06,
"loss": 0.6371,
"step": 6700
},
{
"epoch": 0.3183521151255575,
"grad_norm": 1.828125,
"learning_rate": 8.953061770772647e-06,
"loss": 0.6489,
"step": 6750
},
{
"epoch": 0.32071027894130233,
"grad_norm": 1.8828125,
"learning_rate": 8.945188870868696e-06,
"loss": 0.6379,
"step": 6800
},
{
"epoch": 0.3230684427570472,
"grad_norm": 2.09375,
"learning_rate": 8.937315970964746e-06,
"loss": 0.6462,
"step": 6850
},
{
"epoch": 0.3254266065727921,
"grad_norm": 1.6484375,
"learning_rate": 8.929443071060795e-06,
"loss": 0.6363,
"step": 6900
},
{
"epoch": 0.327784770388537,
"grad_norm": 2.3125,
"learning_rate": 8.921570171156845e-06,
"loss": 0.6599,
"step": 6950
},
{
"epoch": 0.3301429342042818,
"grad_norm": 1.9453125,
"learning_rate": 8.913697271252893e-06,
"loss": 0.6359,
"step": 7000
},
{
"epoch": 0.3301429342042818,
"eval_loss": 0.6439831852912903,
"eval_runtime": 470.7382,
"eval_samples_per_second": 75.86,
"eval_steps_per_second": 37.93,
"step": 7000
},
{
"epoch": 0.3325010980200267,
"grad_norm": 2.09375,
"learning_rate": 8.905824371348944e-06,
"loss": 0.6382,
"step": 7050
},
{
"epoch": 0.3348592618357716,
"grad_norm": 2.03125,
"learning_rate": 8.897951471444994e-06,
"loss": 0.6516,
"step": 7100
},
{
"epoch": 0.33721742565151647,
"grad_norm": 1.703125,
"learning_rate": 8.890078571541042e-06,
"loss": 0.6342,
"step": 7150
},
{
"epoch": 0.3395755894672613,
"grad_norm": 1.9140625,
"learning_rate": 8.882205671637093e-06,
"loss": 0.6407,
"step": 7200
},
{
"epoch": 0.3419337532830062,
"grad_norm": 2.109375,
"learning_rate": 8.874332771733141e-06,
"loss": 0.6477,
"step": 7250
},
{
"epoch": 0.34429191709875107,
"grad_norm": 1.796875,
"learning_rate": 8.86645987182919e-06,
"loss": 0.6366,
"step": 7300
},
{
"epoch": 0.34665008091449595,
"grad_norm": 2.109375,
"learning_rate": 8.85858697192524e-06,
"loss": 0.6438,
"step": 7350
},
{
"epoch": 0.3490082447302408,
"grad_norm": 1.7421875,
"learning_rate": 8.850714072021289e-06,
"loss": 0.641,
"step": 7400
},
{
"epoch": 0.35136640854598566,
"grad_norm": 2.0625,
"learning_rate": 8.842841172117339e-06,
"loss": 0.649,
"step": 7450
},
{
"epoch": 0.35372457236173055,
"grad_norm": 1.890625,
"learning_rate": 8.834968272213387e-06,
"loss": 0.6515,
"step": 7500
},
{
"epoch": 0.35372457236173055,
"eval_loss": 0.6437468528747559,
"eval_runtime": 473.1767,
"eval_samples_per_second": 75.469,
"eval_steps_per_second": 37.734,
"step": 7500
},
{
"epoch": 0.35608273617747543,
"grad_norm": 2.21875,
"learning_rate": 8.827095372309438e-06,
"loss": 0.6373,
"step": 7550
},
{
"epoch": 0.35844089999322026,
"grad_norm": 2.109375,
"learning_rate": 8.819222472405486e-06,
"loss": 0.6434,
"step": 7600
},
{
"epoch": 0.36079906380896515,
"grad_norm": 2.03125,
"learning_rate": 8.811349572501536e-06,
"loss": 0.6448,
"step": 7650
},
{
"epoch": 0.36315722762471003,
"grad_norm": 1.8515625,
"learning_rate": 8.803476672597585e-06,
"loss": 0.6461,
"step": 7700
},
{
"epoch": 0.36551539144045486,
"grad_norm": 1.796875,
"learning_rate": 8.795603772693635e-06,
"loss": 0.6427,
"step": 7750
},
{
"epoch": 0.36787355525619975,
"grad_norm": 1.9375,
"learning_rate": 8.787730872789684e-06,
"loss": 0.6454,
"step": 7800
},
{
"epoch": 0.37023171907194463,
"grad_norm": 1.9765625,
"learning_rate": 8.779857972885734e-06,
"loss": 0.6451,
"step": 7850
},
{
"epoch": 0.3725898828876895,
"grad_norm": 1.9453125,
"learning_rate": 8.771985072981782e-06,
"loss": 0.6312,
"step": 7900
},
{
"epoch": 0.37494804670343435,
"grad_norm": 1.90625,
"learning_rate": 8.764112173077833e-06,
"loss": 0.6457,
"step": 7950
},
{
"epoch": 0.37730621051917923,
"grad_norm": 2.125,
"learning_rate": 8.756239273173883e-06,
"loss": 0.6505,
"step": 8000
},
{
"epoch": 0.37730621051917923,
"eval_loss": 0.6432761549949646,
"eval_runtime": 471.5659,
"eval_samples_per_second": 75.726,
"eval_steps_per_second": 37.863,
"step": 8000
},
{
"epoch": 0.3796643743349241,
"grad_norm": 1.71875,
"learning_rate": 8.748366373269931e-06,
"loss": 0.6487,
"step": 8050
},
{
"epoch": 0.382022538150669,
"grad_norm": 1.6171875,
"learning_rate": 8.74049347336598e-06,
"loss": 0.6217,
"step": 8100
},
{
"epoch": 0.38438070196641383,
"grad_norm": 2.078125,
"learning_rate": 8.73262057346203e-06,
"loss": 0.6399,
"step": 8150
},
{
"epoch": 0.3867388657821587,
"grad_norm": 1.734375,
"learning_rate": 8.724747673558079e-06,
"loss": 0.6562,
"step": 8200
},
{
"epoch": 0.3890970295979036,
"grad_norm": 1.8125,
"learning_rate": 8.716874773654127e-06,
"loss": 0.64,
"step": 8250
},
{
"epoch": 0.3914551934136485,
"grad_norm": 1.828125,
"learning_rate": 8.709001873750178e-06,
"loss": 0.6441,
"step": 8300
},
{
"epoch": 0.3938133572293933,
"grad_norm": 1.9765625,
"learning_rate": 8.701128973846226e-06,
"loss": 0.6292,
"step": 8350
},
{
"epoch": 0.3961715210451382,
"grad_norm": 1.671875,
"learning_rate": 8.693256073942276e-06,
"loss": 0.6465,
"step": 8400
},
{
"epoch": 0.3985296848608831,
"grad_norm": 2.203125,
"learning_rate": 8.685383174038327e-06,
"loss": 0.6421,
"step": 8450
},
{
"epoch": 0.40088784867662797,
"grad_norm": 1.7265625,
"learning_rate": 8.677510274134375e-06,
"loss": 0.6361,
"step": 8500
},
{
"epoch": 0.40088784867662797,
"eval_loss": 0.6430058479309082,
"eval_runtime": 473.3534,
"eval_samples_per_second": 75.44,
"eval_steps_per_second": 37.72,
"step": 8500
},
{
"epoch": 0.4032460124923728,
"grad_norm": 1.734375,
"learning_rate": 8.669637374230425e-06,
"loss": 0.6427,
"step": 8550
},
{
"epoch": 0.4056041763081177,
"grad_norm": 1.8828125,
"learning_rate": 8.661764474326474e-06,
"loss": 0.6402,
"step": 8600
},
{
"epoch": 0.40796234012386257,
"grad_norm": 1.8828125,
"learning_rate": 8.653891574422524e-06,
"loss": 0.629,
"step": 8650
},
{
"epoch": 0.41032050393960745,
"grad_norm": 1.6796875,
"learning_rate": 8.646018674518573e-06,
"loss": 0.6307,
"step": 8700
},
{
"epoch": 0.4126786677553523,
"grad_norm": 1.8671875,
"learning_rate": 8.638145774614623e-06,
"loss": 0.6407,
"step": 8750
},
{
"epoch": 0.41503683157109716,
"grad_norm": 1.84375,
"learning_rate": 8.630272874710671e-06,
"loss": 0.6553,
"step": 8800
},
{
"epoch": 0.41739499538684205,
"grad_norm": 2.265625,
"learning_rate": 8.622399974806722e-06,
"loss": 0.6571,
"step": 8850
},
{
"epoch": 0.4197531592025869,
"grad_norm": 1.828125,
"learning_rate": 8.61452707490277e-06,
"loss": 0.6312,
"step": 8900
},
{
"epoch": 0.42211132301833176,
"grad_norm": 1.9609375,
"learning_rate": 8.60665417499882e-06,
"loss": 0.6407,
"step": 8950
},
{
"epoch": 0.42446948683407665,
"grad_norm": 1.8515625,
"learning_rate": 8.598781275094869e-06,
"loss": 0.6328,
"step": 9000
},
{
"epoch": 0.42446948683407665,
"eval_loss": 0.642894446849823,
"eval_runtime": 471.7695,
"eval_samples_per_second": 75.694,
"eval_steps_per_second": 37.847,
"step": 9000
},
{
"epoch": 0.42682765064982153,
"grad_norm": 2.03125,
"learning_rate": 8.590908375190918e-06,
"loss": 0.6457,
"step": 9050
},
{
"epoch": 0.42918581446556636,
"grad_norm": 2.09375,
"learning_rate": 8.583035475286968e-06,
"loss": 0.6342,
"step": 9100
},
{
"epoch": 0.43154397828131125,
"grad_norm": 1.78125,
"learning_rate": 8.575162575383016e-06,
"loss": 0.634,
"step": 9150
},
{
"epoch": 0.43390214209705613,
"grad_norm": 1.890625,
"learning_rate": 8.567289675479067e-06,
"loss": 0.638,
"step": 9200
},
{
"epoch": 0.436260305912801,
"grad_norm": 1.9296875,
"learning_rate": 8.559416775575115e-06,
"loss": 0.6377,
"step": 9250
},
{
"epoch": 0.43861846972854585,
"grad_norm": 2.15625,
"learning_rate": 8.551543875671165e-06,
"loss": 0.6573,
"step": 9300
},
{
"epoch": 0.44097663354429073,
"grad_norm": 1.6015625,
"learning_rate": 8.543670975767214e-06,
"loss": 0.6486,
"step": 9350
},
{
"epoch": 0.4433347973600356,
"grad_norm": 1.7734375,
"learning_rate": 8.535798075863264e-06,
"loss": 0.6453,
"step": 9400
},
{
"epoch": 0.4456929611757805,
"grad_norm": 1.7421875,
"learning_rate": 8.527925175959314e-06,
"loss": 0.6308,
"step": 9450
},
{
"epoch": 0.44805112499152533,
"grad_norm": 1.7421875,
"learning_rate": 8.520052276055363e-06,
"loss": 0.6301,
"step": 9500
},
{
"epoch": 0.44805112499152533,
"eval_loss": 0.6426186561584473,
"eval_runtime": 475.5113,
"eval_samples_per_second": 75.098,
"eval_steps_per_second": 37.549,
"step": 9500
},
{
"epoch": 0.4504092888072702,
"grad_norm": 1.90625,
"learning_rate": 8.512179376151413e-06,
"loss": 0.6382,
"step": 9550
},
{
"epoch": 0.4527674526230151,
"grad_norm": 2.109375,
"learning_rate": 8.504306476247462e-06,
"loss": 0.6333,
"step": 9600
},
{
"epoch": 0.45512561643876,
"grad_norm": 2.203125,
"learning_rate": 8.496433576343512e-06,
"loss": 0.6462,
"step": 9650
},
{
"epoch": 0.4574837802545048,
"grad_norm": 2.0625,
"learning_rate": 8.48856067643956e-06,
"loss": 0.6441,
"step": 9700
},
{
"epoch": 0.4598419440702497,
"grad_norm": 1.9296875,
"learning_rate": 8.480687776535609e-06,
"loss": 0.6426,
"step": 9750
},
{
"epoch": 0.4622001078859946,
"grad_norm": 1.859375,
"learning_rate": 8.47281487663166e-06,
"loss": 0.6278,
"step": 9800
},
{
"epoch": 0.46455827170173947,
"grad_norm": 1.65625,
"learning_rate": 8.464941976727708e-06,
"loss": 0.6422,
"step": 9850
},
{
"epoch": 0.4669164355174843,
"grad_norm": 1.8125,
"learning_rate": 8.457069076823758e-06,
"loss": 0.6276,
"step": 9900
},
{
"epoch": 0.4692745993332292,
"grad_norm": 1.953125,
"learning_rate": 8.449196176919807e-06,
"loss": 0.6345,
"step": 9950
},
{
"epoch": 0.47163276314897407,
"grad_norm": 1.8203125,
"learning_rate": 8.441323277015857e-06,
"loss": 0.6492,
"step": 10000
},
{
"epoch": 0.47163276314897407,
"eval_loss": 0.642467200756073,
"eval_runtime": 475.5523,
"eval_samples_per_second": 75.092,
"eval_steps_per_second": 37.546,
"step": 10000
},
{
"epoch": 0.4739909269647189,
"grad_norm": 2.015625,
"learning_rate": 8.433450377111905e-06,
"loss": 0.6459,
"step": 10050
},
{
"epoch": 0.4763490907804638,
"grad_norm": 1.859375,
"learning_rate": 8.425577477207956e-06,
"loss": 0.6475,
"step": 10100
},
{
"epoch": 0.47870725459620866,
"grad_norm": 1.8984375,
"learning_rate": 8.417704577304004e-06,
"loss": 0.614,
"step": 10150
},
{
"epoch": 0.48106541841195355,
"grad_norm": 1.7734375,
"learning_rate": 8.409831677400054e-06,
"loss": 0.6374,
"step": 10200
},
{
"epoch": 0.4834235822276984,
"grad_norm": 2.171875,
"learning_rate": 8.401958777496103e-06,
"loss": 0.6389,
"step": 10250
},
{
"epoch": 0.48578174604344326,
"grad_norm": 1.7265625,
"learning_rate": 8.394085877592153e-06,
"loss": 0.6308,
"step": 10300
},
{
"epoch": 0.48813990985918815,
"grad_norm": 1.8671875,
"learning_rate": 8.386212977688203e-06,
"loss": 0.6357,
"step": 10350
},
{
"epoch": 0.49049807367493303,
"grad_norm": 2.015625,
"learning_rate": 8.378340077784252e-06,
"loss": 0.6295,
"step": 10400
},
{
"epoch": 0.49285623749067786,
"grad_norm": 1.8671875,
"learning_rate": 8.370467177880302e-06,
"loss": 0.6455,
"step": 10450
},
{
"epoch": 0.49521440130642275,
"grad_norm": 2.203125,
"learning_rate": 8.36259427797635e-06,
"loss": 0.6293,
"step": 10500
},
{
"epoch": 0.49521440130642275,
"eval_loss": 0.6422578692436218,
"eval_runtime": 476.5916,
"eval_samples_per_second": 74.928,
"eval_steps_per_second": 37.464,
"step": 10500
},
{
"epoch": 0.49757256512216763,
"grad_norm": 1.96875,
"learning_rate": 8.3547213780724e-06,
"loss": 0.6423,
"step": 10550
},
{
"epoch": 0.4999307289379125,
"grad_norm": 1.7109375,
"learning_rate": 8.34684847816845e-06,
"loss": 0.6451,
"step": 10600
},
{
"epoch": 0.5022888927536574,
"grad_norm": 2.078125,
"learning_rate": 8.338975578264498e-06,
"loss": 0.6396,
"step": 10650
},
{
"epoch": 0.5046470565694022,
"grad_norm": 1.890625,
"learning_rate": 8.331102678360547e-06,
"loss": 0.6358,
"step": 10700
},
{
"epoch": 0.5070052203851471,
"grad_norm": 1.921875,
"learning_rate": 8.323229778456597e-06,
"loss": 0.6354,
"step": 10750
},
{
"epoch": 0.509363384200892,
"grad_norm": 1.7421875,
"learning_rate": 8.315356878552647e-06,
"loss": 0.6307,
"step": 10800
},
{
"epoch": 0.5117215480166368,
"grad_norm": 1.734375,
"learning_rate": 8.307483978648696e-06,
"loss": 0.6379,
"step": 10850
},
{
"epoch": 0.5140797118323818,
"grad_norm": 2.140625,
"learning_rate": 8.299611078744746e-06,
"loss": 0.6415,
"step": 10900
},
{
"epoch": 0.5164378756481266,
"grad_norm": 1.7109375,
"learning_rate": 8.291738178840795e-06,
"loss": 0.6415,
"step": 10950
},
{
"epoch": 0.5187960394638714,
"grad_norm": 2.015625,
"learning_rate": 8.283865278936845e-06,
"loss": 0.6357,
"step": 11000
},
{
"epoch": 0.5187960394638714,
"eval_loss": 0.6421868205070496,
"eval_runtime": 471.328,
"eval_samples_per_second": 75.765,
"eval_steps_per_second": 37.882,
"step": 11000
},
{
"epoch": 0.5211542032796164,
"grad_norm": 1.9296875,
"learning_rate": 8.275992379032893e-06,
"loss": 0.643,
"step": 11050
},
{
"epoch": 0.5235123670953612,
"grad_norm": 1.859375,
"learning_rate": 8.268119479128944e-06,
"loss": 0.6446,
"step": 11100
},
{
"epoch": 0.525870530911106,
"grad_norm": 2.125,
"learning_rate": 8.260246579224992e-06,
"loss": 0.6344,
"step": 11150
},
{
"epoch": 0.528228694726851,
"grad_norm": 2.03125,
"learning_rate": 8.252373679321042e-06,
"loss": 0.6479,
"step": 11200
},
{
"epoch": 0.5305868585425958,
"grad_norm": 2.171875,
"learning_rate": 8.244500779417093e-06,
"loss": 0.6367,
"step": 11250
},
{
"epoch": 0.5329450223583406,
"grad_norm": 1.7265625,
"learning_rate": 8.236627879513141e-06,
"loss": 0.6417,
"step": 11300
},
{
"epoch": 0.5353031861740856,
"grad_norm": 1.6875,
"learning_rate": 8.22875497960919e-06,
"loss": 0.6372,
"step": 11350
},
{
"epoch": 0.5376613499898304,
"grad_norm": 1.859375,
"learning_rate": 8.22088207970524e-06,
"loss": 0.6315,
"step": 11400
},
{
"epoch": 0.5400195138055753,
"grad_norm": 1.84375,
"learning_rate": 8.213009179801288e-06,
"loss": 0.6412,
"step": 11450
},
{
"epoch": 0.5423776776213202,
"grad_norm": 1.71875,
"learning_rate": 8.205136279897337e-06,
"loss": 0.6573,
"step": 11500
},
{
"epoch": 0.5423776776213202,
"eval_loss": 0.6420803070068359,
"eval_runtime": 468.9426,
"eval_samples_per_second": 76.15,
"eval_steps_per_second": 38.075,
"step": 11500
},
{
"epoch": 0.544735841437065,
"grad_norm": 1.6328125,
"learning_rate": 8.197263379993387e-06,
"loss": 0.6475,
"step": 11550
},
{
"epoch": 0.5470940052528099,
"grad_norm": 1.828125,
"learning_rate": 8.189390480089436e-06,
"loss": 0.6408,
"step": 11600
},
{
"epoch": 0.5494521690685548,
"grad_norm": 2.1875,
"learning_rate": 8.181517580185486e-06,
"loss": 0.6427,
"step": 11650
},
{
"epoch": 0.5518103328842996,
"grad_norm": 2.171875,
"learning_rate": 8.173644680281535e-06,
"loss": 0.644,
"step": 11700
},
{
"epoch": 0.5541684967000445,
"grad_norm": 2.09375,
"learning_rate": 8.165771780377585e-06,
"loss": 0.6398,
"step": 11750
},
{
"epoch": 0.5565266605157894,
"grad_norm": 1.7421875,
"learning_rate": 8.157898880473635e-06,
"loss": 0.6373,
"step": 11800
},
{
"epoch": 0.5588848243315343,
"grad_norm": 1.609375,
"learning_rate": 8.150025980569684e-06,
"loss": 0.6484,
"step": 11850
},
{
"epoch": 0.5612429881472791,
"grad_norm": 2.0,
"learning_rate": 8.142153080665734e-06,
"loss": 0.6412,
"step": 11900
},
{
"epoch": 0.563601151963024,
"grad_norm": 1.8203125,
"learning_rate": 8.134280180761782e-06,
"loss": 0.6412,
"step": 11950
},
{
"epoch": 0.5659593157787689,
"grad_norm": 1.9140625,
"learning_rate": 8.126407280857833e-06,
"loss": 0.637,
"step": 12000
},
{
"epoch": 0.5659593157787689,
"eval_loss": 0.6419476866722107,
"eval_runtime": 471.7113,
"eval_samples_per_second": 75.703,
"eval_steps_per_second": 37.852,
"step": 12000
},
{
"epoch": 0.5683174795945137,
"grad_norm": 1.84375,
"learning_rate": 8.118534380953881e-06,
"loss": 0.6438,
"step": 12050
},
{
"epoch": 0.5706756434102586,
"grad_norm": 2.15625,
"learning_rate": 8.110661481049931e-06,
"loss": 0.6358,
"step": 12100
},
{
"epoch": 0.5730338072260035,
"grad_norm": 1.9453125,
"learning_rate": 8.10278858114598e-06,
"loss": 0.6365,
"step": 12150
},
{
"epoch": 0.5753919710417483,
"grad_norm": 1.9453125,
"learning_rate": 8.09491568124203e-06,
"loss": 0.6452,
"step": 12200
},
{
"epoch": 0.5777501348574932,
"grad_norm": 2.015625,
"learning_rate": 8.087042781338079e-06,
"loss": 0.6365,
"step": 12250
},
{
"epoch": 0.5801082986732381,
"grad_norm": 1.7734375,
"learning_rate": 8.079169881434127e-06,
"loss": 0.6426,
"step": 12300
},
{
"epoch": 0.5824664624889829,
"grad_norm": 1.8984375,
"learning_rate": 8.071296981530177e-06,
"loss": 0.628,
"step": 12350
},
{
"epoch": 0.5848246263047279,
"grad_norm": 1.890625,
"learning_rate": 8.063424081626226e-06,
"loss": 0.6315,
"step": 12400
},
{
"epoch": 0.5871827901204727,
"grad_norm": 2.125,
"learning_rate": 8.055551181722276e-06,
"loss": 0.6461,
"step": 12450
},
{
"epoch": 0.5895409539362175,
"grad_norm": 2.078125,
"learning_rate": 8.047678281818325e-06,
"loss": 0.646,
"step": 12500
},
{
"epoch": 0.5895409539362175,
"eval_loss": 0.6418334245681763,
"eval_runtime": 469.7223,
"eval_samples_per_second": 76.024,
"eval_steps_per_second": 38.012,
"step": 12500
},
{
"epoch": 0.5918991177519625,
"grad_norm": 2.171875,
"learning_rate": 8.039805381914375e-06,
"loss": 0.6535,
"step": 12550
},
{
"epoch": 0.5942572815677073,
"grad_norm": 1.9296875,
"learning_rate": 8.031932482010424e-06,
"loss": 0.6525,
"step": 12600
},
{
"epoch": 0.5966154453834521,
"grad_norm": 1.765625,
"learning_rate": 8.024059582106474e-06,
"loss": 0.6432,
"step": 12650
},
{
"epoch": 0.5989736091991971,
"grad_norm": 1.890625,
"learning_rate": 8.016186682202524e-06,
"loss": 0.633,
"step": 12700
},
{
"epoch": 0.6013317730149419,
"grad_norm": 1.9453125,
"learning_rate": 8.008313782298573e-06,
"loss": 0.6234,
"step": 12750
},
{
"epoch": 0.6036899368306868,
"grad_norm": 2.1875,
"learning_rate": 8.000440882394623e-06,
"loss": 0.6388,
"step": 12800
},
{
"epoch": 0.6060481006464317,
"grad_norm": 1.9296875,
"learning_rate": 7.992567982490671e-06,
"loss": 0.6481,
"step": 12850
},
{
"epoch": 0.6084062644621765,
"grad_norm": 1.9609375,
"learning_rate": 7.984695082586722e-06,
"loss": 0.6321,
"step": 12900
},
{
"epoch": 0.6107644282779214,
"grad_norm": 1.96875,
"learning_rate": 7.97682218268277e-06,
"loss": 0.6405,
"step": 12950
},
{
"epoch": 0.6131225920936663,
"grad_norm": 1.8671875,
"learning_rate": 7.968949282778819e-06,
"loss": 0.6287,
"step": 13000
},
{
"epoch": 0.6131225920936663,
"eval_loss": 0.6417333483695984,
"eval_runtime": 469.8773,
"eval_samples_per_second": 75.999,
"eval_steps_per_second": 37.999,
"step": 13000
},
{
"epoch": 0.6154807559094111,
"grad_norm": 1.8515625,
"learning_rate": 7.961076382874869e-06,
"loss": 0.6433,
"step": 13050
},
{
"epoch": 0.617838919725156,
"grad_norm": 1.8203125,
"learning_rate": 7.953203482970918e-06,
"loss": 0.6285,
"step": 13100
},
{
"epoch": 0.6201970835409009,
"grad_norm": 2.125,
"learning_rate": 7.945330583066968e-06,
"loss": 0.6368,
"step": 13150
},
{
"epoch": 0.6225552473566458,
"grad_norm": 1.8828125,
"learning_rate": 7.937457683163016e-06,
"loss": 0.6379,
"step": 13200
},
{
"epoch": 0.6249134111723906,
"grad_norm": 1.8359375,
"learning_rate": 7.929584783259067e-06,
"loss": 0.6415,
"step": 13250
},
{
"epoch": 0.6272715749881355,
"grad_norm": 2.046875,
"learning_rate": 7.921711883355115e-06,
"loss": 0.6394,
"step": 13300
},
{
"epoch": 0.6296297388038804,
"grad_norm": 1.890625,
"learning_rate": 7.913838983451165e-06,
"loss": 0.6415,
"step": 13350
},
{
"epoch": 0.6319879026196252,
"grad_norm": 1.90625,
"learning_rate": 7.905966083547214e-06,
"loss": 0.6376,
"step": 13400
},
{
"epoch": 0.6343460664353701,
"grad_norm": 1.90625,
"learning_rate": 7.898093183643264e-06,
"loss": 0.6344,
"step": 13450
},
{
"epoch": 0.636704230251115,
"grad_norm": 1.9140625,
"learning_rate": 7.890220283739313e-06,
"loss": 0.635,
"step": 13500
},
{
"epoch": 0.636704230251115,
"eval_loss": 0.6417108774185181,
"eval_runtime": 473.8052,
"eval_samples_per_second": 75.369,
"eval_steps_per_second": 37.684,
"step": 13500
},
{
"epoch": 0.6390623940668598,
"grad_norm": 2.015625,
"learning_rate": 7.882347383835363e-06,
"loss": 0.637,
"step": 13550
},
{
"epoch": 0.6414205578826047,
"grad_norm": 2.03125,
"learning_rate": 7.874474483931413e-06,
"loss": 0.6376,
"step": 13600
},
{
"epoch": 0.6437787216983496,
"grad_norm": 1.921875,
"learning_rate": 7.866601584027462e-06,
"loss": 0.6297,
"step": 13650
},
{
"epoch": 0.6461368855140944,
"grad_norm": 2.09375,
"learning_rate": 7.858728684123512e-06,
"loss": 0.6184,
"step": 13700
},
{
"epoch": 0.6484950493298394,
"grad_norm": 1.78125,
"learning_rate": 7.85085578421956e-06,
"loss": 0.6494,
"step": 13750
},
{
"epoch": 0.6508532131455842,
"grad_norm": 1.9453125,
"learning_rate": 7.842982884315609e-06,
"loss": 0.6466,
"step": 13800
},
{
"epoch": 0.653211376961329,
"grad_norm": 2.09375,
"learning_rate": 7.83510998441166e-06,
"loss": 0.647,
"step": 13850
},
{
"epoch": 0.655569540777074,
"grad_norm": 1.7578125,
"learning_rate": 7.827237084507708e-06,
"loss": 0.6332,
"step": 13900
},
{
"epoch": 0.6579277045928188,
"grad_norm": 1.6328125,
"learning_rate": 7.819364184603756e-06,
"loss": 0.6253,
"step": 13950
},
{
"epoch": 0.6602858684085636,
"grad_norm": 1.90625,
"learning_rate": 7.811491284699807e-06,
"loss": 0.6401,
"step": 14000
},
{
"epoch": 0.6602858684085636,
"eval_loss": 0.641613781452179,
"eval_runtime": 470.3187,
"eval_samples_per_second": 75.927,
"eval_steps_per_second": 37.964,
"step": 14000
},
{
"epoch": 0.6626440322243086,
"grad_norm": 1.796875,
"learning_rate": 7.803618384795857e-06,
"loss": 0.6275,
"step": 14050
},
{
"epoch": 0.6650021960400534,
"grad_norm": 1.8125,
"learning_rate": 7.795745484891905e-06,
"loss": 0.6356,
"step": 14100
},
{
"epoch": 0.6673603598557983,
"grad_norm": 1.90625,
"learning_rate": 7.787872584987956e-06,
"loss": 0.6431,
"step": 14150
},
{
"epoch": 0.6697185236715432,
"grad_norm": 2.046875,
"learning_rate": 7.779999685084004e-06,
"loss": 0.6406,
"step": 14200
},
{
"epoch": 0.672076687487288,
"grad_norm": 1.7734375,
"learning_rate": 7.772126785180054e-06,
"loss": 0.6532,
"step": 14250
},
{
"epoch": 0.6744348513030329,
"grad_norm": 2.015625,
"learning_rate": 7.764253885276103e-06,
"loss": 0.6331,
"step": 14300
},
{
"epoch": 0.6767930151187778,
"grad_norm": 1.859375,
"learning_rate": 7.756380985372153e-06,
"loss": 0.6451,
"step": 14350
},
{
"epoch": 0.6791511789345226,
"grad_norm": 2.203125,
"learning_rate": 7.748508085468202e-06,
"loss": 0.6376,
"step": 14400
},
{
"epoch": 0.6815093427502675,
"grad_norm": 2.078125,
"learning_rate": 7.740635185564252e-06,
"loss": 0.6391,
"step": 14450
},
{
"epoch": 0.6838675065660124,
"grad_norm": 1.734375,
"learning_rate": 7.7327622856603e-06,
"loss": 0.6455,
"step": 14500
},
{
"epoch": 0.6838675065660124,
"eval_loss": 0.6413908004760742,
"eval_runtime": 472.9873,
"eval_samples_per_second": 75.499,
"eval_steps_per_second": 37.749,
"step": 14500
},
{
"epoch": 0.6862256703817572,
"grad_norm": 1.8125,
"learning_rate": 7.72488938575635e-06,
"loss": 0.657,
"step": 14550
},
{
"epoch": 0.6885838341975021,
"grad_norm": 2.25,
"learning_rate": 7.7170164858524e-06,
"loss": 0.6424,
"step": 14600
},
{
"epoch": 0.690941998013247,
"grad_norm": 2.03125,
"learning_rate": 7.70914358594845e-06,
"loss": 0.6374,
"step": 14650
},
{
"epoch": 0.6933001618289919,
"grad_norm": 2.109375,
"learning_rate": 7.701270686044498e-06,
"loss": 0.636,
"step": 14700
},
{
"epoch": 0.6956583256447367,
"grad_norm": 1.890625,
"learning_rate": 7.693397786140547e-06,
"loss": 0.6358,
"step": 14750
},
{
"epoch": 0.6980164894604816,
"grad_norm": 1.921875,
"learning_rate": 7.685524886236597e-06,
"loss": 0.6299,
"step": 14800
},
{
"epoch": 0.7003746532762265,
"grad_norm": 1.9375,
"learning_rate": 7.677651986332645e-06,
"loss": 0.6294,
"step": 14850
},
{
"epoch": 0.7027328170919713,
"grad_norm": 1.8359375,
"learning_rate": 7.669779086428696e-06,
"loss": 0.6389,
"step": 14900
},
{
"epoch": 0.7050909809077162,
"grad_norm": 2.03125,
"learning_rate": 7.661906186524744e-06,
"loss": 0.6359,
"step": 14950
},
{
"epoch": 0.7074491447234611,
"grad_norm": 1.90625,
"learning_rate": 7.654033286620794e-06,
"loss": 0.6476,
"step": 15000
},
{
"epoch": 0.7074491447234611,
"eval_loss": 0.641543447971344,
"eval_runtime": 470.9999,
"eval_samples_per_second": 75.817,
"eval_steps_per_second": 37.909,
"step": 15000
},
{
"epoch": 0.7098073085392059,
"grad_norm": 1.9609375,
"learning_rate": 7.646160386716845e-06,
"loss": 0.6437,
"step": 15050
},
{
"epoch": 0.7121654723549509,
"grad_norm": 1.8515625,
"learning_rate": 7.638287486812893e-06,
"loss": 0.6388,
"step": 15100
},
{
"epoch": 0.7145236361706957,
"grad_norm": 2.03125,
"learning_rate": 7.630414586908943e-06,
"loss": 0.6578,
"step": 15150
},
{
"epoch": 0.7168817999864405,
"grad_norm": 2.0625,
"learning_rate": 7.622541687004992e-06,
"loss": 0.6549,
"step": 15200
},
{
"epoch": 0.7192399638021855,
"grad_norm": 2.078125,
"learning_rate": 7.614668787101041e-06,
"loss": 0.6396,
"step": 15250
},
{
"epoch": 0.7215981276179303,
"grad_norm": 2.125,
"learning_rate": 7.606795887197091e-06,
"loss": 0.6534,
"step": 15300
},
{
"epoch": 0.7239562914336751,
"grad_norm": 1.84375,
"learning_rate": 7.59892298729314e-06,
"loss": 0.6353,
"step": 15350
},
{
"epoch": 0.7263144552494201,
"grad_norm": 2.03125,
"learning_rate": 7.591050087389189e-06,
"loss": 0.641,
"step": 15400
},
{
"epoch": 0.7286726190651649,
"grad_norm": 2.234375,
"learning_rate": 7.583177187485239e-06,
"loss": 0.6392,
"step": 15450
},
{
"epoch": 0.7310307828809097,
"grad_norm": 2.046875,
"learning_rate": 7.575304287581289e-06,
"loss": 0.6425,
"step": 15500
},
{
"epoch": 0.7310307828809097,
"eval_loss": 0.6413320899009705,
"eval_runtime": 472.198,
"eval_samples_per_second": 75.625,
"eval_steps_per_second": 37.813,
"step": 15500
},
{
"epoch": 0.7333889466966547,
"grad_norm": 2.078125,
"learning_rate": 7.567431387677338e-06,
"loss": 0.6468,
"step": 15550
},
{
"epoch": 0.7357471105123995,
"grad_norm": 2.234375,
"learning_rate": 7.559558487773387e-06,
"loss": 0.6364,
"step": 15600
},
{
"epoch": 0.7381052743281444,
"grad_norm": 1.9140625,
"learning_rate": 7.5516855878694365e-06,
"loss": 0.6297,
"step": 15650
},
{
"epoch": 0.7404634381438893,
"grad_norm": 2.203125,
"learning_rate": 7.543812687965486e-06,
"loss": 0.6364,
"step": 15700
},
{
"epoch": 0.7428216019596341,
"grad_norm": 1.78125,
"learning_rate": 7.5359397880615344e-06,
"loss": 0.6347,
"step": 15750
},
{
"epoch": 0.745179765775379,
"grad_norm": 1.8359375,
"learning_rate": 7.528066888157585e-06,
"loss": 0.6454,
"step": 15800
},
{
"epoch": 0.7475379295911239,
"grad_norm": 2.046875,
"learning_rate": 7.520193988253633e-06,
"loss": 0.6375,
"step": 15850
},
{
"epoch": 0.7498960934068687,
"grad_norm": 2.078125,
"learning_rate": 7.5123210883496835e-06,
"loss": 0.6398,
"step": 15900
},
{
"epoch": 0.7522542572226136,
"grad_norm": 1.7109375,
"learning_rate": 7.504448188445733e-06,
"loss": 0.6281,
"step": 15950
},
{
"epoch": 0.7546124210383585,
"grad_norm": 1.9296875,
"learning_rate": 7.496575288541782e-06,
"loss": 0.6447,
"step": 16000
},
{
"epoch": 0.7546124210383585,
"eval_loss": 0.6413915157318115,
"eval_runtime": 472.6251,
"eval_samples_per_second": 75.557,
"eval_steps_per_second": 37.778,
"step": 16000
},
{
"epoch": 0.7569705848541034,
"grad_norm": 2.046875,
"learning_rate": 7.488702388637832e-06,
"loss": 0.6496,
"step": 16050
},
{
"epoch": 0.7593287486698482,
"grad_norm": 2.09375,
"learning_rate": 7.48082948873388e-06,
"loss": 0.6417,
"step": 16100
},
{
"epoch": 0.7616869124855931,
"grad_norm": 1.890625,
"learning_rate": 7.4729565888299304e-06,
"loss": 0.6509,
"step": 16150
},
{
"epoch": 0.764045076301338,
"grad_norm": 1.7265625,
"learning_rate": 7.465083688925979e-06,
"loss": 0.6329,
"step": 16200
},
{
"epoch": 0.7664032401170828,
"grad_norm": 2.03125,
"learning_rate": 7.457210789022029e-06,
"loss": 0.6424,
"step": 16250
},
{
"epoch": 0.7687614039328277,
"grad_norm": 1.9296875,
"learning_rate": 7.449337889118078e-06,
"loss": 0.6315,
"step": 16300
},
{
"epoch": 0.7711195677485726,
"grad_norm": 1.8828125,
"learning_rate": 7.441464989214128e-06,
"loss": 0.6321,
"step": 16350
},
{
"epoch": 0.7734777315643174,
"grad_norm": 1.96875,
"learning_rate": 7.433592089310177e-06,
"loss": 0.6324,
"step": 16400
},
{
"epoch": 0.7758358953800624,
"grad_norm": 1.8125,
"learning_rate": 7.425719189406227e-06,
"loss": 0.6436,
"step": 16450
},
{
"epoch": 0.7781940591958072,
"grad_norm": 1.828125,
"learning_rate": 7.417846289502276e-06,
"loss": 0.6555,
"step": 16500
},
{
"epoch": 0.7781940591958072,
"eval_loss": 0.6413031220436096,
"eval_runtime": 472.5746,
"eval_samples_per_second": 75.565,
"eval_steps_per_second": 37.782,
"step": 16500
},
{
"epoch": 0.780552223011552,
"grad_norm": 1.734375,
"learning_rate": 7.409973389598325e-06,
"loss": 0.6379,
"step": 16550
},
{
"epoch": 0.782910386827297,
"grad_norm": 1.859375,
"learning_rate": 7.402100489694375e-06,
"loss": 0.6414,
"step": 16600
},
{
"epoch": 0.7852685506430418,
"grad_norm": 1.8203125,
"learning_rate": 7.3942275897904235e-06,
"loss": 0.6404,
"step": 16650
},
{
"epoch": 0.7876267144587866,
"grad_norm": 1.765625,
"learning_rate": 7.386354689886474e-06,
"loss": 0.6417,
"step": 16700
},
{
"epoch": 0.7899848782745316,
"grad_norm": 1.78125,
"learning_rate": 7.378481789982522e-06,
"loss": 0.6353,
"step": 16750
},
{
"epoch": 0.7923430420902764,
"grad_norm": 1.859375,
"learning_rate": 7.3706088900785725e-06,
"loss": 0.6238,
"step": 16800
},
{
"epoch": 0.7947012059060212,
"grad_norm": 1.8125,
"learning_rate": 7.362735990174622e-06,
"loss": 0.6422,
"step": 16850
},
{
"epoch": 0.7970593697217662,
"grad_norm": 2.234375,
"learning_rate": 7.3548630902706705e-06,
"loss": 0.64,
"step": 16900
},
{
"epoch": 0.799417533537511,
"grad_norm": 2.0,
"learning_rate": 7.346990190366721e-06,
"loss": 0.6515,
"step": 16950
},
{
"epoch": 0.8017756973532559,
"grad_norm": 1.6171875,
"learning_rate": 7.339117290462769e-06,
"loss": 0.6322,
"step": 17000
},
{
"epoch": 0.8017756973532559,
"eval_loss": 0.6412806510925293,
"eval_runtime": 473.922,
"eval_samples_per_second": 75.35,
"eval_steps_per_second": 37.675,
"step": 17000
},
{
"epoch": 0.8041338611690008,
"grad_norm": 1.6796875,
"learning_rate": 7.3312443905588195e-06,
"loss": 0.6362,
"step": 17050
},
{
"epoch": 0.8064920249847456,
"grad_norm": 1.8515625,
"learning_rate": 7.323371490654868e-06,
"loss": 0.6358,
"step": 17100
},
{
"epoch": 0.8088501888004905,
"grad_norm": 1.90625,
"learning_rate": 7.315498590750918e-06,
"loss": 0.6296,
"step": 17150
},
{
"epoch": 0.8112083526162354,
"grad_norm": 1.90625,
"learning_rate": 7.307625690846967e-06,
"loss": 0.6368,
"step": 17200
},
{
"epoch": 0.8135665164319802,
"grad_norm": 1.890625,
"learning_rate": 7.299752790943016e-06,
"loss": 0.6366,
"step": 17250
},
{
"epoch": 0.8159246802477251,
"grad_norm": 1.734375,
"learning_rate": 7.2918798910390664e-06,
"loss": 0.6318,
"step": 17300
},
{
"epoch": 0.81828284406347,
"grad_norm": 2.4375,
"learning_rate": 7.284006991135115e-06,
"loss": 0.6331,
"step": 17350
},
{
"epoch": 0.8206410078792149,
"grad_norm": 1.7734375,
"learning_rate": 7.276134091231165e-06,
"loss": 0.641,
"step": 17400
},
{
"epoch": 0.8229991716949597,
"grad_norm": 2.125,
"learning_rate": 7.268261191327214e-06,
"loss": 0.6536,
"step": 17450
},
{
"epoch": 0.8253573355107046,
"grad_norm": 1.9296875,
"learning_rate": 7.260388291423264e-06,
"loss": 0.6396,
"step": 17500
},
{
"epoch": 0.8253573355107046,
"eval_loss": 0.6412404179573059,
"eval_runtime": 471.5495,
"eval_samples_per_second": 75.729,
"eval_steps_per_second": 37.865,
"step": 17500
},
{
"epoch": 0.8277154993264495,
"grad_norm": 1.890625,
"learning_rate": 7.2525153915193126e-06,
"loss": 0.6262,
"step": 17550
},
{
"epoch": 0.8300736631421943,
"grad_norm": 2.09375,
"learning_rate": 7.244642491615362e-06,
"loss": 0.6445,
"step": 17600
},
{
"epoch": 0.8324318269579392,
"grad_norm": 2.1875,
"learning_rate": 7.236769591711411e-06,
"loss": 0.6375,
"step": 17650
},
{
"epoch": 0.8347899907736841,
"grad_norm": 2.09375,
"learning_rate": 7.228896691807461e-06,
"loss": 0.6377,
"step": 17700
},
{
"epoch": 0.8371481545894289,
"grad_norm": 1.890625,
"learning_rate": 7.221023791903511e-06,
"loss": 0.6472,
"step": 17750
},
{
"epoch": 0.8395063184051738,
"grad_norm": 2.015625,
"learning_rate": 7.2131508919995595e-06,
"loss": 0.6404,
"step": 17800
},
{
"epoch": 0.8418644822209187,
"grad_norm": 2.109375,
"learning_rate": 7.20527799209561e-06,
"loss": 0.6257,
"step": 17850
},
{
"epoch": 0.8442226460366635,
"grad_norm": 2.09375,
"learning_rate": 7.197405092191658e-06,
"loss": 0.636,
"step": 17900
},
{
"epoch": 0.8465808098524085,
"grad_norm": 1.984375,
"learning_rate": 7.1895321922877085e-06,
"loss": 0.6441,
"step": 17950
},
{
"epoch": 0.8489389736681533,
"grad_norm": 1.796875,
"learning_rate": 7.181659292383757e-06,
"loss": 0.6458,
"step": 18000
},
{
"epoch": 0.8489389736681533,
"eval_loss": 0.6412122249603271,
"eval_runtime": 478.107,
"eval_samples_per_second": 74.69,
"eval_steps_per_second": 37.345,
"step": 18000
},
{
"epoch": 0.8512971374838981,
"grad_norm": 1.8671875,
"learning_rate": 7.1737863924798065e-06,
"loss": 0.6376,
"step": 18050
},
{
"epoch": 0.8536553012996431,
"grad_norm": 1.9140625,
"learning_rate": 7.165913492575856e-06,
"loss": 0.6456,
"step": 18100
},
{
"epoch": 0.8560134651153879,
"grad_norm": 2.1875,
"learning_rate": 7.158040592671905e-06,
"loss": 0.6409,
"step": 18150
},
{
"epoch": 0.8583716289311327,
"grad_norm": 1.71875,
"learning_rate": 7.150167692767954e-06,
"loss": 0.6282,
"step": 18200
},
{
"epoch": 0.8607297927468777,
"grad_norm": 1.640625,
"learning_rate": 7.142294792864004e-06,
"loss": 0.6502,
"step": 18250
},
{
"epoch": 0.8630879565626225,
"grad_norm": 2.21875,
"learning_rate": 7.134421892960054e-06,
"loss": 0.6497,
"step": 18300
},
{
"epoch": 0.8654461203783674,
"grad_norm": 1.78125,
"learning_rate": 7.126548993056103e-06,
"loss": 0.6329,
"step": 18350
},
{
"epoch": 0.8678042841941123,
"grad_norm": 1.8359375,
"learning_rate": 7.118676093152152e-06,
"loss": 0.6363,
"step": 18400
},
{
"epoch": 0.8701624480098571,
"grad_norm": 1.921875,
"learning_rate": 7.110803193248202e-06,
"loss": 0.6356,
"step": 18450
},
{
"epoch": 0.872520611825602,
"grad_norm": 2.265625,
"learning_rate": 7.102930293344251e-06,
"loss": 0.6454,
"step": 18500
},
{
"epoch": 0.872520611825602,
"eval_loss": 0.6411958336830139,
"eval_runtime": 472.9637,
"eval_samples_per_second": 75.503,
"eval_steps_per_second": 37.751,
"step": 18500
},
{
"epoch": 0.8748787756413469,
"grad_norm": 1.765625,
"learning_rate": 7.0950573934402996e-06,
"loss": 0.6335,
"step": 18550
},
{
"epoch": 0.8772369394570917,
"grad_norm": 1.984375,
"learning_rate": 7.08718449353635e-06,
"loss": 0.6538,
"step": 18600
},
{
"epoch": 0.8795951032728366,
"grad_norm": 1.578125,
"learning_rate": 7.079311593632398e-06,
"loss": 0.6379,
"step": 18650
},
{
"epoch": 0.8819532670885815,
"grad_norm": 2.171875,
"learning_rate": 7.071438693728449e-06,
"loss": 0.6519,
"step": 18700
},
{
"epoch": 0.8843114309043264,
"grad_norm": 2.09375,
"learning_rate": 7.063565793824498e-06,
"loss": 0.6393,
"step": 18750
},
{
"epoch": 0.8866695947200712,
"grad_norm": 2.21875,
"learning_rate": 7.055692893920547e-06,
"loss": 0.6461,
"step": 18800
},
{
"epoch": 0.8890277585358161,
"grad_norm": 2.03125,
"learning_rate": 7.047819994016597e-06,
"loss": 0.6414,
"step": 18850
},
{
"epoch": 0.891385922351561,
"grad_norm": 1.9296875,
"learning_rate": 7.039947094112646e-06,
"loss": 0.6471,
"step": 18900
},
{
"epoch": 0.8937440861673058,
"grad_norm": 2.46875,
"learning_rate": 7.0320741942086955e-06,
"loss": 0.6467,
"step": 18950
},
{
"epoch": 0.8961022499830507,
"grad_norm": 1.96875,
"learning_rate": 7.024201294304744e-06,
"loss": 0.6279,
"step": 19000
},
{
"epoch": 0.8961022499830507,
"eval_loss": 0.6412160992622375,
"eval_runtime": 473.5786,
"eval_samples_per_second": 75.405,
"eval_steps_per_second": 37.702,
"step": 19000
},
{
"epoch": 0.8984604137987956,
"grad_norm": 2.09375,
"learning_rate": 7.016328394400794e-06,
"loss": 0.6585,
"step": 19050
},
{
"epoch": 0.9008185776145404,
"grad_norm": 1.84375,
"learning_rate": 7.008455494496843e-06,
"loss": 0.6204,
"step": 19100
},
{
"epoch": 0.9031767414302853,
"grad_norm": 1.8671875,
"learning_rate": 7.000582594592893e-06,
"loss": 0.6424,
"step": 19150
},
{
"epoch": 0.9055349052460302,
"grad_norm": 2.15625,
"learning_rate": 6.9927096946889425e-06,
"loss": 0.6374,
"step": 19200
},
{
"epoch": 0.907893069061775,
"grad_norm": 2.28125,
"learning_rate": 6.984836794784992e-06,
"loss": 0.636,
"step": 19250
},
{
"epoch": 0.91025123287752,
"grad_norm": 2.078125,
"learning_rate": 6.976963894881041e-06,
"loss": 0.632,
"step": 19300
},
{
"epoch": 0.9126093966932648,
"grad_norm": 2.109375,
"learning_rate": 6.96909099497709e-06,
"loss": 0.6378,
"step": 19350
},
{
"epoch": 0.9149675605090096,
"grad_norm": 2.109375,
"learning_rate": 6.96121809507314e-06,
"loss": 0.6415,
"step": 19400
},
{
"epoch": 0.9173257243247546,
"grad_norm": 1.765625,
"learning_rate": 6.953345195169189e-06,
"loss": 0.6396,
"step": 19450
},
{
"epoch": 0.9196838881404994,
"grad_norm": 2.265625,
"learning_rate": 6.945472295265239e-06,
"loss": 0.6405,
"step": 19500
},
{
"epoch": 0.9196838881404994,
"eval_loss": 0.6411221623420715,
"eval_runtime": 471.8835,
"eval_samples_per_second": 75.675,
"eval_steps_per_second": 37.838,
"step": 19500
},
{
"epoch": 0.9220420519562442,
"grad_norm": 1.90625,
"learning_rate": 6.937599395361287e-06,
"loss": 0.6424,
"step": 19550
},
{
"epoch": 0.9244002157719892,
"grad_norm": 2.34375,
"learning_rate": 6.929726495457338e-06,
"loss": 0.6437,
"step": 19600
},
{
"epoch": 0.926758379587734,
"grad_norm": 1.9453125,
"learning_rate": 6.921853595553387e-06,
"loss": 0.6394,
"step": 19650
},
{
"epoch": 0.9291165434034789,
"grad_norm": 2.171875,
"learning_rate": 6.9139806956494356e-06,
"loss": 0.6403,
"step": 19700
},
{
"epoch": 0.9314747072192238,
"grad_norm": 1.828125,
"learning_rate": 6.906107795745486e-06,
"loss": 0.6372,
"step": 19750
},
{
"epoch": 0.9338328710349686,
"grad_norm": 1.984375,
"learning_rate": 6.898234895841534e-06,
"loss": 0.6433,
"step": 19800
},
{
"epoch": 0.9361910348507135,
"grad_norm": 2.171875,
"learning_rate": 6.890361995937585e-06,
"loss": 0.6428,
"step": 19850
},
{
"epoch": 0.9385491986664584,
"grad_norm": 2.015625,
"learning_rate": 6.882489096033633e-06,
"loss": 0.6396,
"step": 19900
},
{
"epoch": 0.9409073624822032,
"grad_norm": 2.03125,
"learning_rate": 6.874616196129683e-06,
"loss": 0.638,
"step": 19950
},
{
"epoch": 0.9432655262979481,
"grad_norm": 1.71875,
"learning_rate": 6.866743296225732e-06,
"loss": 0.6332,
"step": 20000
},
{
"epoch": 0.9432655262979481,
"eval_loss": 0.6411899328231812,
"eval_runtime": 472.9551,
"eval_samples_per_second": 75.504,
"eval_steps_per_second": 37.752,
"step": 20000
},
{
"epoch": 0.945623690113693,
"grad_norm": 2.109375,
"learning_rate": 6.858870396321782e-06,
"loss": 0.6313,
"step": 20050
},
{
"epoch": 0.9479818539294378,
"grad_norm": 2.390625,
"learning_rate": 6.8509974964178316e-06,
"loss": 0.641,
"step": 20100
},
{
"epoch": 0.9503400177451827,
"grad_norm": 2.375,
"learning_rate": 6.84312459651388e-06,
"loss": 0.6418,
"step": 20150
},
{
"epoch": 0.9526981815609276,
"grad_norm": 1.7421875,
"learning_rate": 6.83525169660993e-06,
"loss": 0.6373,
"step": 20200
},
{
"epoch": 0.9550563453766725,
"grad_norm": 1.796875,
"learning_rate": 6.827378796705979e-06,
"loss": 0.6461,
"step": 20250
},
{
"epoch": 0.9574145091924173,
"grad_norm": 2.0625,
"learning_rate": 6.819505896802029e-06,
"loss": 0.6497,
"step": 20300
},
{
"epoch": 0.9597726730081622,
"grad_norm": 1.8046875,
"learning_rate": 6.811632996898078e-06,
"loss": 0.6433,
"step": 20350
},
{
"epoch": 0.9621308368239071,
"grad_norm": 2.0625,
"learning_rate": 6.803760096994128e-06,
"loss": 0.6368,
"step": 20400
},
{
"epoch": 0.9644890006396519,
"grad_norm": 2.15625,
"learning_rate": 6.7958871970901765e-06,
"loss": 0.6492,
"step": 20450
},
{
"epoch": 0.9668471644553968,
"grad_norm": 2.0,
"learning_rate": 6.788014297186226e-06,
"loss": 0.6398,
"step": 20500
},
{
"epoch": 0.9668471644553968,
"eval_loss": 0.6411147713661194,
"eval_runtime": 471.0929,
"eval_samples_per_second": 75.802,
"eval_steps_per_second": 37.901,
"step": 20500
},
{
"epoch": 0.9692053282711417,
"grad_norm": 1.9140625,
"learning_rate": 6.780141397282276e-06,
"loss": 0.6608,
"step": 20550
},
{
"epoch": 0.9715634920868865,
"grad_norm": 1.8359375,
"learning_rate": 6.772268497378325e-06,
"loss": 0.6324,
"step": 20600
},
{
"epoch": 0.9739216559026315,
"grad_norm": 1.9921875,
"learning_rate": 6.764395597474375e-06,
"loss": 0.6338,
"step": 20650
},
{
"epoch": 0.9762798197183763,
"grad_norm": 1.9921875,
"learning_rate": 6.756522697570423e-06,
"loss": 0.6476,
"step": 20700
},
{
"epoch": 0.9786379835341211,
"grad_norm": 1.984375,
"learning_rate": 6.748649797666474e-06,
"loss": 0.6364,
"step": 20750
},
{
"epoch": 0.9809961473498661,
"grad_norm": 2.078125,
"learning_rate": 6.740776897762522e-06,
"loss": 0.6292,
"step": 20800
},
{
"epoch": 0.9833543111656109,
"grad_norm": 2.234375,
"learning_rate": 6.732903997858572e-06,
"loss": 0.6336,
"step": 20850
},
{
"epoch": 0.9857124749813557,
"grad_norm": 2.359375,
"learning_rate": 6.725031097954621e-06,
"loss": 0.6448,
"step": 20900
},
{
"epoch": 0.9880706387971007,
"grad_norm": 1.9375,
"learning_rate": 6.71715819805067e-06,
"loss": 0.6408,
"step": 20950
},
{
"epoch": 0.9904288026128455,
"grad_norm": 1.9296875,
"learning_rate": 6.70928529814672e-06,
"loss": 0.6433,
"step": 21000
},
{
"epoch": 0.9904288026128455,
"eval_loss": 0.6410422325134277,
"eval_runtime": 474.7996,
"eval_samples_per_second": 75.211,
"eval_steps_per_second": 37.605,
"step": 21000
},
{
"epoch": 0.9927869664285903,
"grad_norm": 2.15625,
"learning_rate": 6.701412398242769e-06,
"loss": 0.6331,
"step": 21050
},
{
"epoch": 0.9951451302443353,
"grad_norm": 1.765625,
"learning_rate": 6.693539498338819e-06,
"loss": 0.6354,
"step": 21100
},
{
"epoch": 0.9975032940600801,
"grad_norm": 1.734375,
"learning_rate": 6.685666598434868e-06,
"loss": 0.6303,
"step": 21150
},
{
"epoch": 0.999861457875825,
"grad_norm": 2.28125,
"learning_rate": 6.677793698530917e-06,
"loss": 0.659,
"step": 21200
},
{
"epoch": 1.0022166739868001,
"grad_norm": 1.421875,
"learning_rate": 6.669920798626967e-06,
"loss": 0.6318,
"step": 21250
},
{
"epoch": 1.004574837802545,
"grad_norm": 1.1640625,
"learning_rate": 6.662047898723016e-06,
"loss": 0.6472,
"step": 21300
},
{
"epoch": 1.00693300161829,
"grad_norm": 1.0703125,
"learning_rate": 6.6541749988190655e-06,
"loss": 0.6326,
"step": 21350
},
{
"epoch": 1.0092911654340349,
"grad_norm": 1.21875,
"learning_rate": 6.646302098915115e-06,
"loss": 0.6432,
"step": 21400
},
{
"epoch": 1.0116493292497797,
"grad_norm": 1.2265625,
"learning_rate": 6.6384291990111635e-06,
"loss": 0.6514,
"step": 21450
},
{
"epoch": 1.0140074930655245,
"grad_norm": 1.3125,
"learning_rate": 6.630556299107214e-06,
"loss": 0.6438,
"step": 21500
},
{
"epoch": 1.0140074930655245,
"eval_loss": 0.6410008668899536,
"eval_runtime": 473.6601,
"eval_samples_per_second": 75.392,
"eval_steps_per_second": 37.696,
"step": 21500
},
{
"epoch": 1.0163656568812693,
"grad_norm": 1.3046875,
"learning_rate": 6.622683399203264e-06,
"loss": 0.6499,
"step": 21550
},
{
"epoch": 1.0187238206970142,
"grad_norm": 1.265625,
"learning_rate": 6.6148104992993125e-06,
"loss": 0.634,
"step": 21600
},
{
"epoch": 1.0210819845127592,
"grad_norm": 1.3125,
"learning_rate": 6.606937599395362e-06,
"loss": 0.6428,
"step": 21650
},
{
"epoch": 1.023440148328504,
"grad_norm": 1.125,
"learning_rate": 6.599064699491411e-06,
"loss": 0.6375,
"step": 21700
},
{
"epoch": 1.0257983121442489,
"grad_norm": 1.203125,
"learning_rate": 6.591191799587461e-06,
"loss": 0.6451,
"step": 21750
},
{
"epoch": 1.0281564759599937,
"grad_norm": 1.1640625,
"learning_rate": 6.583318899683509e-06,
"loss": 0.631,
"step": 21800
},
{
"epoch": 1.0305146397757385,
"grad_norm": 1.203125,
"learning_rate": 6.5754459997795594e-06,
"loss": 0.6653,
"step": 21850
},
{
"epoch": 1.0328728035914836,
"grad_norm": 1.15625,
"learning_rate": 6.567573099875608e-06,
"loss": 0.6344,
"step": 21900
},
{
"epoch": 1.0352309674072284,
"grad_norm": 1.1953125,
"learning_rate": 6.559700199971658e-06,
"loss": 0.6349,
"step": 21950
},
{
"epoch": 1.0375891312229732,
"grad_norm": 1.2890625,
"learning_rate": 6.551827300067708e-06,
"loss": 0.6457,
"step": 22000
},
{
"epoch": 1.0375891312229732,
"eval_loss": 0.6410118937492371,
"eval_runtime": 473.0113,
"eval_samples_per_second": 75.495,
"eval_steps_per_second": 37.748,
"step": 22000
},
{
"epoch": 1.039947295038718,
"grad_norm": 1.2421875,
"learning_rate": 6.543954400163757e-06,
"loss": 0.637,
"step": 22050
},
{
"epoch": 1.042305458854463,
"grad_norm": 1.1484375,
"learning_rate": 6.536081500259806e-06,
"loss": 0.6337,
"step": 22100
},
{
"epoch": 1.0446636226702077,
"grad_norm": 1.2109375,
"learning_rate": 6.528208600355855e-06,
"loss": 0.643,
"step": 22150
},
{
"epoch": 1.0470217864859528,
"grad_norm": 1.2109375,
"learning_rate": 6.520335700451905e-06,
"loss": 0.6347,
"step": 22200
},
{
"epoch": 1.0493799503016976,
"grad_norm": 1.15625,
"learning_rate": 6.512462800547954e-06,
"loss": 0.6531,
"step": 22250
},
{
"epoch": 1.0517381141174424,
"grad_norm": 1.1484375,
"learning_rate": 6.504589900644004e-06,
"loss": 0.6465,
"step": 22300
},
{
"epoch": 1.0540962779331873,
"grad_norm": 1.078125,
"learning_rate": 6.4967170007400525e-06,
"loss": 0.6229,
"step": 22350
},
{
"epoch": 1.056454441748932,
"grad_norm": 1.109375,
"learning_rate": 6.488844100836103e-06,
"loss": 0.6405,
"step": 22400
},
{
"epoch": 1.0588126055646772,
"grad_norm": 1.25,
"learning_rate": 6.480971200932152e-06,
"loss": 0.6283,
"step": 22450
},
{
"epoch": 1.061170769380422,
"grad_norm": 1.234375,
"learning_rate": 6.4730983010282015e-06,
"loss": 0.6441,
"step": 22500
},
{
"epoch": 1.061170769380422,
"eval_loss": 0.6409561634063721,
"eval_runtime": 473.1309,
"eval_samples_per_second": 75.476,
"eval_steps_per_second": 37.738,
"step": 22500
},
{
"epoch": 1.0635289331961668,
"grad_norm": 1.1171875,
"learning_rate": 6.465225401124251e-06,
"loss": 0.6365,
"step": 22550
},
{
"epoch": 1.0658870970119116,
"grad_norm": 1.375,
"learning_rate": 6.4573525012202995e-06,
"loss": 0.6447,
"step": 22600
},
{
"epoch": 1.0682452608276565,
"grad_norm": 1.2265625,
"learning_rate": 6.44947960131635e-06,
"loss": 0.6376,
"step": 22650
},
{
"epoch": 1.0706034246434015,
"grad_norm": 1.265625,
"learning_rate": 6.441606701412398e-06,
"loss": 0.6293,
"step": 22700
},
{
"epoch": 1.0729615884591464,
"grad_norm": 1.0625,
"learning_rate": 6.4337338015084485e-06,
"loss": 0.62,
"step": 22750
},
{
"epoch": 1.0753197522748912,
"grad_norm": 1.4609375,
"learning_rate": 6.425860901604497e-06,
"loss": 0.6379,
"step": 22800
},
{
"epoch": 1.077677916090636,
"grad_norm": 1.15625,
"learning_rate": 6.417988001700547e-06,
"loss": 0.6271,
"step": 22850
},
{
"epoch": 1.0800360799063808,
"grad_norm": 1.25,
"learning_rate": 6.410115101796597e-06,
"loss": 0.6341,
"step": 22900
},
{
"epoch": 1.0823942437221257,
"grad_norm": 1.1484375,
"learning_rate": 6.402242201892645e-06,
"loss": 0.6425,
"step": 22950
},
{
"epoch": 1.0847524075378707,
"grad_norm": 1.359375,
"learning_rate": 6.3943693019886955e-06,
"loss": 0.6442,
"step": 23000
},
{
"epoch": 1.0847524075378707,
"eval_loss": 0.6410566568374634,
"eval_runtime": 470.874,
"eval_samples_per_second": 75.838,
"eval_steps_per_second": 37.919,
"step": 23000
},
{
"epoch": 1.0871105713536156,
"grad_norm": 1.3984375,
"learning_rate": 6.386496402084744e-06,
"loss": 0.637,
"step": 23050
},
{
"epoch": 1.0894687351693604,
"grad_norm": 1.25,
"learning_rate": 6.378623502180794e-06,
"loss": 0.6278,
"step": 23100
},
{
"epoch": 1.0918268989851052,
"grad_norm": 1.25,
"learning_rate": 6.370750602276843e-06,
"loss": 0.6419,
"step": 23150
},
{
"epoch": 1.09418506280085,
"grad_norm": 1.4453125,
"learning_rate": 6.362877702372893e-06,
"loss": 0.639,
"step": 23200
},
{
"epoch": 1.096543226616595,
"grad_norm": 1.1875,
"learning_rate": 6.3550048024689416e-06,
"loss": 0.6232,
"step": 23250
},
{
"epoch": 1.09890139043234,
"grad_norm": 1.2578125,
"learning_rate": 6.347131902564991e-06,
"loss": 0.6294,
"step": 23300
},
{
"epoch": 1.1012595542480847,
"grad_norm": 1.265625,
"learning_rate": 6.339259002661041e-06,
"loss": 0.6486,
"step": 23350
},
{
"epoch": 1.1036177180638296,
"grad_norm": 1.1875,
"learning_rate": 6.33138610275709e-06,
"loss": 0.6347,
"step": 23400
},
{
"epoch": 1.1059758818795744,
"grad_norm": 1.1328125,
"learning_rate": 6.32351320285314e-06,
"loss": 0.633,
"step": 23450
},
{
"epoch": 1.1083340456953192,
"grad_norm": 1.109375,
"learning_rate": 6.3156403029491885e-06,
"loss": 0.6377,
"step": 23500
},
{
"epoch": 1.1083340456953192,
"eval_loss": 0.6410402655601501,
"eval_runtime": 473.9522,
"eval_samples_per_second": 75.345,
"eval_steps_per_second": 37.673,
"step": 23500
},
{
"epoch": 1.1106922095110643,
"grad_norm": 1.21875,
"learning_rate": 6.307767403045239e-06,
"loss": 0.6332,
"step": 23550
},
{
"epoch": 1.1130503733268091,
"grad_norm": 1.1171875,
"learning_rate": 6.299894503141287e-06,
"loss": 0.64,
"step": 23600
},
{
"epoch": 1.115408537142554,
"grad_norm": 1.234375,
"learning_rate": 6.2920216032373375e-06,
"loss": 0.6496,
"step": 23650
},
{
"epoch": 1.1177667009582988,
"grad_norm": 1.1171875,
"learning_rate": 6.284148703333386e-06,
"loss": 0.6391,
"step": 23700
},
{
"epoch": 1.1201248647740436,
"grad_norm": 1.1640625,
"learning_rate": 6.2762758034294355e-06,
"loss": 0.6297,
"step": 23750
},
{
"epoch": 1.1224830285897887,
"grad_norm": 1.2265625,
"learning_rate": 6.268402903525486e-06,
"loss": 0.63,
"step": 23800
},
{
"epoch": 1.1248411924055335,
"grad_norm": 1.1875,
"learning_rate": 6.260530003621534e-06,
"loss": 0.6436,
"step": 23850
},
{
"epoch": 1.1271993562212783,
"grad_norm": 1.171875,
"learning_rate": 6.2526571037175845e-06,
"loss": 0.6371,
"step": 23900
},
{
"epoch": 1.1295575200370231,
"grad_norm": 1.1484375,
"learning_rate": 6.244784203813633e-06,
"loss": 0.6367,
"step": 23950
},
{
"epoch": 1.131915683852768,
"grad_norm": 1.2109375,
"learning_rate": 6.236911303909683e-06,
"loss": 0.6365,
"step": 24000
},
{
"epoch": 1.131915683852768,
"eval_loss": 0.6410369277000427,
"eval_runtime": 475.7391,
"eval_samples_per_second": 75.062,
"eval_steps_per_second": 37.531,
"step": 24000
},
{
"epoch": 1.134273847668513,
"grad_norm": 1.2265625,
"learning_rate": 6.219245704224246e-06,
"loss": 0.6305,
"step": 24050
},
{
"epoch": 1.1366320114842579,
"grad_norm": 1.171875,
"learning_rate": 6.211385181342263e-06,
"loss": 0.6499,
"step": 24100
},
{
"epoch": 1.1389901753000027,
"grad_norm": 1.265625,
"learning_rate": 6.203524658460281e-06,
"loss": 0.6463,
"step": 24150
},
{
"epoch": 1.1413483391157475,
"grad_norm": 1.1640625,
"learning_rate": 6.195664135578299e-06,
"loss": 0.6432,
"step": 24200
},
{
"epoch": 1.1437065029314923,
"grad_norm": 1.34375,
"learning_rate": 6.187803612696317e-06,
"loss": 0.6267,
"step": 24250
},
{
"epoch": 1.1460646667472372,
"grad_norm": 1.1484375,
"learning_rate": 6.179943089814336e-06,
"loss": 0.6306,
"step": 24300
},
{
"epoch": 1.1484228305629822,
"grad_norm": 1.1796875,
"learning_rate": 6.172082566932352e-06,
"loss": 0.6505,
"step": 24350
},
{
"epoch": 1.150780994378727,
"grad_norm": 1.3046875,
"learning_rate": 6.164222044050371e-06,
"loss": 0.6408,
"step": 24400
},
{
"epoch": 1.1531391581944719,
"grad_norm": 1.234375,
"learning_rate": 6.156361521168389e-06,
"loss": 0.6428,
"step": 24450
},
{
"epoch": 1.1554973220102167,
"grad_norm": 1.328125,
"learning_rate": 6.1485009982864065e-06,
"loss": 0.6304,
"step": 24500
},
{
"epoch": 1.1554973220102167,
"eval_loss": 0.6410101652145386,
"eval_runtime": 472.4338,
"eval_samples_per_second": 75.587,
"eval_steps_per_second": 37.794,
"step": 24500
},
{
"epoch": 1.1578554858259615,
"grad_norm": 1.265625,
"learning_rate": 6.140640475404425e-06,
"loss": 0.6399,
"step": 24550
},
{
"epoch": 1.1602136496417064,
"grad_norm": 1.1171875,
"learning_rate": 6.1327799525224415e-06,
"loss": 0.6397,
"step": 24600
},
{
"epoch": 1.1625718134574514,
"grad_norm": 1.1015625,
"learning_rate": 6.12491942964046e-06,
"loss": 0.6299,
"step": 24650
},
{
"epoch": 1.1649299772731962,
"grad_norm": 1.21875,
"learning_rate": 6.117058906758478e-06,
"loss": 0.6401,
"step": 24700
},
{
"epoch": 1.167288141088941,
"grad_norm": 1.265625,
"learning_rate": 6.109198383876496e-06,
"loss": 0.6473,
"step": 24750
},
{
"epoch": 1.169646304904686,
"grad_norm": 1.1171875,
"learning_rate": 6.101337860994514e-06,
"loss": 0.6371,
"step": 24800
},
{
"epoch": 1.1720044687204307,
"grad_norm": 1.125,
"learning_rate": 6.0934773381125325e-06,
"loss": 0.6357,
"step": 24850
},
{
"epoch": 1.1743626325361758,
"grad_norm": 1.3046875,
"learning_rate": 6.085616815230549e-06,
"loss": 0.6408,
"step": 24900
},
{
"epoch": 1.1767207963519206,
"grad_norm": 1.0859375,
"learning_rate": 6.0777562923485675e-06,
"loss": 0.6531,
"step": 24950
},
{
"epoch": 1.1790789601676654,
"grad_norm": 1.1796875,
"learning_rate": 6.069895769466585e-06,
"loss": 0.6298,
"step": 25000
},
{
"epoch": 1.1790789601676654,
"eval_loss": 0.6410490274429321,
"eval_runtime": 472.9546,
"eval_samples_per_second": 75.504,
"eval_steps_per_second": 37.752,
"step": 25000
},
{
"epoch": 1.1814371239834103,
"grad_norm": 1.15625,
"learning_rate": 6.062035246584603e-06,
"loss": 0.633,
"step": 25050
},
{
"epoch": 1.183795287799155,
"grad_norm": 1.21875,
"learning_rate": 6.054174723702622e-06,
"loss": 0.6221,
"step": 25100
},
{
"epoch": 1.1861534516149002,
"grad_norm": 1.1171875,
"learning_rate": 6.0463142008206384e-06,
"loss": 0.6363,
"step": 25150
},
{
"epoch": 1.188511615430645,
"grad_norm": 1.1640625,
"learning_rate": 6.038453677938657e-06,
"loss": 0.6423,
"step": 25200
},
{
"epoch": 1.1908697792463898,
"grad_norm": 1.1796875,
"learning_rate": 6.030593155056674e-06,
"loss": 0.6444,
"step": 25250
},
{
"epoch": 1.1932279430621346,
"grad_norm": 1.2109375,
"learning_rate": 6.022732632174693e-06,
"loss": 0.634,
"step": 25300
},
{
"epoch": 1.1955861068778795,
"grad_norm": 1.1640625,
"learning_rate": 6.014872109292711e-06,
"loss": 0.6559,
"step": 25350
},
{
"epoch": 1.1979442706936245,
"grad_norm": 1.1875,
"learning_rate": 6.0070115864107285e-06,
"loss": 0.6475,
"step": 25400
},
{
"epoch": 1.2003024345093694,
"grad_norm": 1.265625,
"learning_rate": 5.999151063528746e-06,
"loss": 0.6306,
"step": 25450
},
{
"epoch": 1.2026605983251142,
"grad_norm": 1.125,
"learning_rate": 5.991290540646764e-06,
"loss": 0.6347,
"step": 25500
},
{
"epoch": 1.2026605983251142,
"eval_loss": 0.6409837007522583,
"eval_runtime": 469.7269,
"eval_samples_per_second": 76.023,
"eval_steps_per_second": 38.011,
"step": 25500
},
{
"epoch": 1.205018762140859,
"grad_norm": 1.1328125,
"learning_rate": 5.983430017764782e-06,
"loss": 0.6417,
"step": 25550
},
{
"epoch": 1.2073769259566038,
"grad_norm": 1.1875,
"learning_rate": 5.9755694948828e-06,
"loss": 0.6452,
"step": 25600
},
{
"epoch": 1.2097350897723487,
"grad_norm": 1.1796875,
"learning_rate": 5.967708972000818e-06,
"loss": 0.6397,
"step": 25650
},
{
"epoch": 1.2120932535880937,
"grad_norm": 1.2265625,
"learning_rate": 5.959848449118836e-06,
"loss": 0.6419,
"step": 25700
},
{
"epoch": 1.2144514174038386,
"grad_norm": 1.2734375,
"learning_rate": 5.951987926236854e-06,
"loss": 0.6456,
"step": 25750
},
{
"epoch": 1.2168095812195834,
"grad_norm": 1.140625,
"learning_rate": 5.944127403354871e-06,
"loss": 0.6403,
"step": 25800
},
{
"epoch": 1.2191677450353282,
"grad_norm": 1.1796875,
"learning_rate": 5.9362668804728895e-06,
"loss": 0.6463,
"step": 25850
},
{
"epoch": 1.221525908851073,
"grad_norm": 1.109375,
"learning_rate": 5.928406357590908e-06,
"loss": 0.6397,
"step": 25900
},
{
"epoch": 1.2238840726668179,
"grad_norm": 1.2578125,
"learning_rate": 5.9205458347089254e-06,
"loss": 0.6444,
"step": 25950
},
{
"epoch": 1.226242236482563,
"grad_norm": 1.1484375,
"learning_rate": 5.912685311826944e-06,
"loss": 0.6331,
"step": 26000
},
{
"epoch": 1.226242236482563,
"eval_loss": 0.6410670280456543,
"eval_runtime": 470.7207,
"eval_samples_per_second": 75.862,
"eval_steps_per_second": 37.931,
"step": 26000
},
{
"epoch": 1.2286004002983077,
"grad_norm": 1.2890625,
"learning_rate": 5.9048247889449605e-06,
"loss": 0.6377,
"step": 26050
},
{
"epoch": 1.2309585641140526,
"grad_norm": 1.25,
"learning_rate": 5.896964266062979e-06,
"loss": 0.6402,
"step": 26100
},
{
"epoch": 1.2333167279297974,
"grad_norm": 1.1484375,
"learning_rate": 5.889103743180997e-06,
"loss": 0.6233,
"step": 26150
},
{
"epoch": 1.2356748917455422,
"grad_norm": 1.15625,
"learning_rate": 5.881243220299015e-06,
"loss": 0.6502,
"step": 26200
},
{
"epoch": 1.2380330555612873,
"grad_norm": 1.328125,
"learning_rate": 5.873382697417033e-06,
"loss": 0.6228,
"step": 26250
},
{
"epoch": 1.2403912193770321,
"grad_norm": 1.265625,
"learning_rate": 5.865522174535051e-06,
"loss": 0.629,
"step": 26300
},
{
"epoch": 1.242749383192777,
"grad_norm": 1.1953125,
"learning_rate": 5.857661651653068e-06,
"loss": 0.6172,
"step": 26350
},
{
"epoch": 1.2451075470085218,
"grad_norm": 1.21875,
"learning_rate": 5.8498011287710864e-06,
"loss": 0.6424,
"step": 26400
},
{
"epoch": 1.2474657108242666,
"grad_norm": 1.125,
"learning_rate": 5.841940605889104e-06,
"loss": 0.647,
"step": 26450
},
{
"epoch": 1.2498238746400117,
"grad_norm": 1.328125,
"learning_rate": 5.834080083007122e-06,
"loss": 0.6509,
"step": 26500
},
{
"epoch": 1.2498238746400117,
"eval_loss": 0.6410943865776062,
"eval_runtime": 471.4169,
"eval_samples_per_second": 75.75,
"eval_steps_per_second": 37.875,
"step": 26500
},
{
"epoch": 1.2521820384557565,
"grad_norm": 1.1171875,
"learning_rate": 5.826219560125141e-06,
"loss": 0.6297,
"step": 26550
},
{
"epoch": 1.2545402022715013,
"grad_norm": 1.1171875,
"learning_rate": 5.818359037243157e-06,
"loss": 0.6353,
"step": 26600
},
{
"epoch": 1.2568983660872461,
"grad_norm": 1.078125,
"learning_rate": 5.810498514361176e-06,
"loss": 0.6326,
"step": 26650
},
{
"epoch": 1.259256529902991,
"grad_norm": 1.203125,
"learning_rate": 5.802637991479194e-06,
"loss": 0.6367,
"step": 26700
},
{
"epoch": 1.261614693718736,
"grad_norm": 1.109375,
"learning_rate": 5.794777468597212e-06,
"loss": 0.6308,
"step": 26750
},
{
"epoch": 1.2639728575344809,
"grad_norm": 1.484375,
"learning_rate": 5.78691694571523e-06,
"loss": 0.6441,
"step": 26800
},
{
"epoch": 1.2663310213502257,
"grad_norm": 1.1953125,
"learning_rate": 5.779056422833247e-06,
"loss": 0.6448,
"step": 26850
},
{
"epoch": 1.2686891851659705,
"grad_norm": 1.1484375,
"learning_rate": 5.771195899951265e-06,
"loss": 0.6447,
"step": 26900
},
{
"epoch": 1.2710473489817153,
"grad_norm": 1.234375,
"learning_rate": 5.763335377069283e-06,
"loss": 0.628,
"step": 26950
},
{
"epoch": 1.2734055127974604,
"grad_norm": 1.2109375,
"learning_rate": 5.755474854187301e-06,
"loss": 0.642,
"step": 27000
},
{
"epoch": 1.2734055127974604,
"eval_loss": 0.641043484210968,
"eval_runtime": 470.7463,
"eval_samples_per_second": 75.858,
"eval_steps_per_second": 37.929,
"step": 27000
},
{
"epoch": 1.275763676613205,
"grad_norm": 1.1171875,
"learning_rate": 5.747614331305319e-06,
"loss": 0.6418,
"step": 27050
},
{
"epoch": 1.27812184042895,
"grad_norm": 1.296875,
"learning_rate": 5.7397538084233376e-06,
"loss": 0.6401,
"step": 27100
},
{
"epoch": 1.2804800042446949,
"grad_norm": 1.3671875,
"learning_rate": 5.731893285541354e-06,
"loss": 0.6416,
"step": 27150
},
{
"epoch": 1.2828381680604397,
"grad_norm": 1.09375,
"learning_rate": 5.724032762659373e-06,
"loss": 0.6381,
"step": 27200
},
{
"epoch": 1.2851963318761845,
"grad_norm": 1.328125,
"learning_rate": 5.71617223977739e-06,
"loss": 0.6388,
"step": 27250
},
{
"epoch": 1.2875544956919294,
"grad_norm": 1.1875,
"learning_rate": 5.7083117168954085e-06,
"loss": 0.6339,
"step": 27300
},
{
"epoch": 1.2899126595076744,
"grad_norm": 1.2578125,
"learning_rate": 5.700451194013427e-06,
"loss": 0.6406,
"step": 27350
},
{
"epoch": 1.2922708233234192,
"grad_norm": 1.2109375,
"learning_rate": 5.6925906711314435e-06,
"loss": 0.6264,
"step": 27400
},
{
"epoch": 1.294628987139164,
"grad_norm": 1.25,
"learning_rate": 5.684730148249462e-06,
"loss": 0.6534,
"step": 27450
},
{
"epoch": 1.296987150954909,
"grad_norm": 1.2421875,
"learning_rate": 5.67686962536748e-06,
"loss": 0.6418,
"step": 27500
},
{
"epoch": 1.296987150954909,
"eval_loss": 0.6410250067710876,
"eval_runtime": 471.8783,
"eval_samples_per_second": 75.676,
"eval_steps_per_second": 37.838,
"step": 27500
},
{
"epoch": 1.2993453147706537,
"grad_norm": 1.1640625,
"learning_rate": 5.669009102485498e-06,
"loss": 0.6301,
"step": 27550
},
{
"epoch": 1.3017034785863988,
"grad_norm": 1.1015625,
"learning_rate": 5.661148579603516e-06,
"loss": 0.6365,
"step": 27600
},
{
"epoch": 1.3040616424021436,
"grad_norm": 1.3671875,
"learning_rate": 5.653288056721533e-06,
"loss": 0.6429,
"step": 27650
},
{
"epoch": 1.3064198062178884,
"grad_norm": 1.109375,
"learning_rate": 5.645427533839551e-06,
"loss": 0.6358,
"step": 27700
},
{
"epoch": 1.3087779700336333,
"grad_norm": 1.1796875,
"learning_rate": 5.6375670109575695e-06,
"loss": 0.6435,
"step": 27750
},
{
"epoch": 1.311136133849378,
"grad_norm": 1.578125,
"learning_rate": 5.629706488075587e-06,
"loss": 0.651,
"step": 27800
},
{
"epoch": 1.3134942976651232,
"grad_norm": 1.3671875,
"learning_rate": 5.621845965193605e-06,
"loss": 0.6522,
"step": 27850
},
{
"epoch": 1.315852461480868,
"grad_norm": 1.1953125,
"learning_rate": 5.613985442311624e-06,
"loss": 0.6345,
"step": 27900
},
{
"epoch": 1.3182106252966128,
"grad_norm": 1.1484375,
"learning_rate": 5.60612491942964e-06,
"loss": 0.6419,
"step": 27950
},
{
"epoch": 1.3205687891123576,
"grad_norm": 1.46875,
"learning_rate": 5.598264396547659e-06,
"loss": 0.6324,
"step": 28000
},
{
"epoch": 1.3205687891123576,
"eval_loss": 0.6410648822784424,
"eval_runtime": 471.9597,
"eval_samples_per_second": 75.663,
"eval_steps_per_second": 37.832,
"step": 28000
},
{
"epoch": 1.3229269529281025,
"grad_norm": 1.1953125,
"learning_rate": 5.590403873665676e-06,
"loss": 0.6405,
"step": 28050
},
{
"epoch": 1.3252851167438475,
"grad_norm": 1.1953125,
"learning_rate": 5.582543350783695e-06,
"loss": 0.6354,
"step": 28100
},
{
"epoch": 1.3276432805595924,
"grad_norm": 1.171875,
"learning_rate": 5.574682827901713e-06,
"loss": 0.6559,
"step": 28150
},
{
"epoch": 1.3300014443753372,
"grad_norm": 1.1015625,
"learning_rate": 5.5668223050197305e-06,
"loss": 0.6304,
"step": 28200
},
{
"epoch": 1.332359608191082,
"grad_norm": 1.2421875,
"learning_rate": 5.558961782137748e-06,
"loss": 0.6345,
"step": 28250
},
{
"epoch": 1.3347177720068268,
"grad_norm": 1.28125,
"learning_rate": 5.5511012592557655e-06,
"loss": 0.6435,
"step": 28300
},
{
"epoch": 1.337075935822572,
"grad_norm": 1.1328125,
"learning_rate": 5.543240736373784e-06,
"loss": 0.6345,
"step": 28350
},
{
"epoch": 1.3394340996383165,
"grad_norm": 1.21875,
"learning_rate": 5.535380213491802e-06,
"loss": 0.6344,
"step": 28400
},
{
"epoch": 1.3417922634540616,
"grad_norm": 1.234375,
"learning_rate": 5.52751969060982e-06,
"loss": 0.6452,
"step": 28450
},
{
"epoch": 1.3441504272698064,
"grad_norm": 1.2890625,
"learning_rate": 5.519659167727838e-06,
"loss": 0.6333,
"step": 28500
},
{
"epoch": 1.3441504272698064,
"eval_loss": 0.6410021185874939,
"eval_runtime": 473.4866,
"eval_samples_per_second": 75.419,
"eval_steps_per_second": 37.71,
"step": 28500
},
{
"epoch": 1.3465085910855512,
"grad_norm": 1.1796875,
"learning_rate": 5.511798644845856e-06,
"loss": 0.6361,
"step": 28550
},
{
"epoch": 1.348866754901296,
"grad_norm": 1.1484375,
"learning_rate": 5.503938121963873e-06,
"loss": 0.6394,
"step": 28600
},
{
"epoch": 1.3512249187170409,
"grad_norm": 1.2734375,
"learning_rate": 5.4960775990818915e-06,
"loss": 0.6444,
"step": 28650
},
{
"epoch": 1.353583082532786,
"grad_norm": 1.203125,
"learning_rate": 5.488217076199909e-06,
"loss": 0.6462,
"step": 28700
},
{
"epoch": 1.3559412463485307,
"grad_norm": 1.265625,
"learning_rate": 5.480356553317927e-06,
"loss": 0.6325,
"step": 28750
},
{
"epoch": 1.3582994101642756,
"grad_norm": 1.234375,
"learning_rate": 5.472496030435946e-06,
"loss": 0.6401,
"step": 28800
},
{
"epoch": 1.3606575739800204,
"grad_norm": 1.21875,
"learning_rate": 5.464635507553962e-06,
"loss": 0.6395,
"step": 28850
},
{
"epoch": 1.3630157377957652,
"grad_norm": 1.1953125,
"learning_rate": 5.456774984671981e-06,
"loss": 0.6476,
"step": 28900
},
{
"epoch": 1.3653739016115103,
"grad_norm": 1.2109375,
"learning_rate": 5.448914461789999e-06,
"loss": 0.6337,
"step": 28950
},
{
"epoch": 1.3677320654272551,
"grad_norm": 1.3125,
"learning_rate": 5.441053938908017e-06,
"loss": 0.6444,
"step": 29000
},
{
"epoch": 1.3677320654272551,
"eval_loss": 0.6410331726074219,
"eval_runtime": 470.8654,
"eval_samples_per_second": 75.839,
"eval_steps_per_second": 37.92,
"step": 29000
},
{
"epoch": 1.370090229243,
"grad_norm": 1.203125,
"learning_rate": 5.433193416026035e-06,
"loss": 0.6422,
"step": 29050
},
{
"epoch": 1.3724483930587448,
"grad_norm": 1.15625,
"learning_rate": 5.425332893144052e-06,
"loss": 0.6292,
"step": 29100
},
{
"epoch": 1.3748065568744896,
"grad_norm": 1.1171875,
"learning_rate": 5.41747237026207e-06,
"loss": 0.6409,
"step": 29150
},
{
"epoch": 1.3771647206902347,
"grad_norm": 1.203125,
"learning_rate": 5.409611847380088e-06,
"loss": 0.643,
"step": 29200
},
{
"epoch": 1.3795228845059795,
"grad_norm": 1.265625,
"learning_rate": 5.401751324498106e-06,
"loss": 0.6528,
"step": 29250
},
{
"epoch": 1.3818810483217243,
"grad_norm": 1.1796875,
"learning_rate": 5.393890801616124e-06,
"loss": 0.6191,
"step": 29300
},
{
"epoch": 1.3842392121374691,
"grad_norm": 1.203125,
"learning_rate": 5.386030278734143e-06,
"loss": 0.6266,
"step": 29350
},
{
"epoch": 1.386597375953214,
"grad_norm": 1.296875,
"learning_rate": 5.378169755852159e-06,
"loss": 0.6535,
"step": 29400
},
{
"epoch": 1.388955539768959,
"grad_norm": 1.1171875,
"learning_rate": 5.370309232970178e-06,
"loss": 0.6388,
"step": 29450
},
{
"epoch": 1.3913137035847039,
"grad_norm": 1.3125,
"learning_rate": 5.362448710088195e-06,
"loss": 0.6424,
"step": 29500
},
{
"epoch": 1.3913137035847039,
"eval_loss": 0.6410022974014282,
"eval_runtime": 470.5101,
"eval_samples_per_second": 75.896,
"eval_steps_per_second": 37.948,
"step": 29500
},
{
"epoch": 1.3936718674004487,
"grad_norm": 1.1875,
"learning_rate": 5.3545881872062135e-06,
"loss": 0.6241,
"step": 29550
},
{
"epoch": 1.3960300312161935,
"grad_norm": 1.1953125,
"learning_rate": 5.346727664324232e-06,
"loss": 0.6472,
"step": 29600
},
{
"epoch": 1.3983881950319383,
"grad_norm": 1.203125,
"learning_rate": 5.3388671414422486e-06,
"loss": 0.6318,
"step": 29650
},
{
"epoch": 1.4007463588476832,
"grad_norm": 1.265625,
"learning_rate": 5.331006618560267e-06,
"loss": 0.6387,
"step": 29700
},
{
"epoch": 1.403104522663428,
"grad_norm": 1.1875,
"learning_rate": 5.323146095678285e-06,
"loss": 0.642,
"step": 29750
},
{
"epoch": 1.405462686479173,
"grad_norm": 1.21875,
"learning_rate": 5.315285572796303e-06,
"loss": 0.6334,
"step": 29800
},
{
"epoch": 1.4078208502949179,
"grad_norm": 1.140625,
"learning_rate": 5.307425049914321e-06,
"loss": 0.6287,
"step": 29850
},
{
"epoch": 1.4101790141106627,
"grad_norm": 1.234375,
"learning_rate": 5.299564527032338e-06,
"loss": 0.6306,
"step": 29900
},
{
"epoch": 1.4125371779264075,
"grad_norm": 1.171875,
"learning_rate": 5.291704004150356e-06,
"loss": 0.6339,
"step": 29950
},
{
"epoch": 1.4148953417421524,
"grad_norm": 1.34375,
"learning_rate": 5.2838434812683745e-06,
"loss": 0.6481,
"step": 30000
},
{
"epoch": 1.4148953417421524,
"eval_loss": 0.6409640312194824,
"eval_runtime": 470.4098,
"eval_samples_per_second": 75.913,
"eval_steps_per_second": 37.956,
"step": 30000
},
{
"epoch": 1.4172535055578974,
"grad_norm": 1.265625,
"learning_rate": 5.275982958386392e-06,
"loss": 0.657,
"step": 30050
},
{
"epoch": 1.4196116693736422,
"grad_norm": 1.3125,
"learning_rate": 5.26812243550441e-06,
"loss": 0.6331,
"step": 30100
},
{
"epoch": 1.421969833189387,
"grad_norm": 1.2578125,
"learning_rate": 5.260261912622429e-06,
"loss": 0.6356,
"step": 30150
},
{
"epoch": 1.424327997005132,
"grad_norm": 1.2265625,
"learning_rate": 5.2524013897404454e-06,
"loss": 0.6263,
"step": 30200
},
{
"epoch": 1.4266861608208767,
"grad_norm": 1.1640625,
"learning_rate": 5.244540866858464e-06,
"loss": 0.6462,
"step": 30250
},
{
"epoch": 1.4290443246366218,
"grad_norm": 1.4375,
"learning_rate": 5.236680343976481e-06,
"loss": 0.6289,
"step": 30300
},
{
"epoch": 1.4314024884523666,
"grad_norm": 1.3984375,
"learning_rate": 5.2288198210945e-06,
"loss": 0.6341,
"step": 30350
},
{
"epoch": 1.4337606522681114,
"grad_norm": 1.3046875,
"learning_rate": 5.220959298212518e-06,
"loss": 0.6305,
"step": 30400
},
{
"epoch": 1.4361188160838563,
"grad_norm": 1.125,
"learning_rate": 5.213098775330535e-06,
"loss": 0.6356,
"step": 30450
},
{
"epoch": 1.438476979899601,
"grad_norm": 1.25,
"learning_rate": 5.205238252448553e-06,
"loss": 0.6516,
"step": 30500
},
{
"epoch": 1.438476979899601,
"eval_loss": 0.6409685611724854,
"eval_runtime": 470.4364,
"eval_samples_per_second": 75.908,
"eval_steps_per_second": 37.954,
"step": 30500
},
{
"epoch": 1.4408351437153462,
"grad_norm": 1.296875,
"learning_rate": 5.1973777295665714e-06,
"loss": 0.6507,
"step": 30550
},
{
"epoch": 1.443193307531091,
"grad_norm": 1.2578125,
"learning_rate": 5.189517206684589e-06,
"loss": 0.6417,
"step": 30600
},
{
"epoch": 1.4455514713468358,
"grad_norm": 1.234375,
"learning_rate": 5.181656683802607e-06,
"loss": 0.6325,
"step": 30650
},
{
"epoch": 1.4479096351625806,
"grad_norm": 1.2734375,
"learning_rate": 5.173796160920625e-06,
"loss": 0.6243,
"step": 30700
},
{
"epoch": 1.4502677989783255,
"grad_norm": 1.1953125,
"learning_rate": 5.165935638038642e-06,
"loss": 0.6327,
"step": 30750
},
{
"epoch": 1.4526259627940705,
"grad_norm": 1.1171875,
"learning_rate": 5.158075115156661e-06,
"loss": 0.6323,
"step": 30800
},
{
"epoch": 1.4549841266098151,
"grad_norm": 1.2890625,
"learning_rate": 5.150214592274678e-06,
"loss": 0.64,
"step": 30850
},
{
"epoch": 1.4573422904255602,
"grad_norm": 1.2578125,
"learning_rate": 5.1423540693926966e-06,
"loss": 0.6423,
"step": 30900
},
{
"epoch": 1.459700454241305,
"grad_norm": 1.2578125,
"learning_rate": 5.134493546510715e-06,
"loss": 0.6436,
"step": 30950
},
{
"epoch": 1.4620586180570498,
"grad_norm": 1.125,
"learning_rate": 5.1266330236287324e-06,
"loss": 0.6249,
"step": 31000
},
{
"epoch": 1.4620586180570498,
"eval_loss": 0.6409750580787659,
"eval_runtime": 467.22,
"eval_samples_per_second": 76.431,
"eval_steps_per_second": 38.215,
"step": 31000
},
{
"epoch": 1.4644167818727947,
"grad_norm": 1.2265625,
"learning_rate": 5.11877250074675e-06,
"loss": 0.6405,
"step": 31050
},
{
"epoch": 1.4667749456885395,
"grad_norm": 1.2578125,
"learning_rate": 5.1109119778647675e-06,
"loss": 0.6238,
"step": 31100
},
{
"epoch": 1.4691331095042846,
"grad_norm": 1.1875,
"learning_rate": 5.103051454982786e-06,
"loss": 0.6318,
"step": 31150
},
{
"epoch": 1.4714912733200294,
"grad_norm": 1.3046875,
"learning_rate": 5.095190932100804e-06,
"loss": 0.6452,
"step": 31200
},
{
"epoch": 1.4738494371357742,
"grad_norm": 1.1796875,
"learning_rate": 5.087330409218822e-06,
"loss": 0.6424,
"step": 31250
},
{
"epoch": 1.476207600951519,
"grad_norm": 1.109375,
"learning_rate": 5.07946988633684e-06,
"loss": 0.6448,
"step": 31300
},
{
"epoch": 1.4785657647672639,
"grad_norm": 1.3046875,
"learning_rate": 5.071609363454857e-06,
"loss": 0.6141,
"step": 31350
},
{
"epoch": 1.480923928583009,
"grad_norm": 1.09375,
"learning_rate": 5.063748840572875e-06,
"loss": 0.6339,
"step": 31400
},
{
"epoch": 1.4832820923987537,
"grad_norm": 1.21875,
"learning_rate": 5.0558883176908935e-06,
"loss": 0.6389,
"step": 31450
},
{
"epoch": 1.4856402562144986,
"grad_norm": 1.09375,
"learning_rate": 5.048027794808911e-06,
"loss": 0.6262,
"step": 31500
},
{
"epoch": 1.4856402562144986,
"eval_loss": 0.6409618258476257,
"eval_runtime": 470.7912,
"eval_samples_per_second": 75.851,
"eval_steps_per_second": 37.926,
"step": 31500
},
{
"epoch": 1.4879984200302434,
"grad_norm": 1.3359375,
"learning_rate": 5.040167271926929e-06,
"loss": 0.6329,
"step": 31550
},
{
"epoch": 1.4903565838459882,
"grad_norm": 1.1015625,
"learning_rate": 5.032306749044948e-06,
"loss": 0.6297,
"step": 31600
},
{
"epoch": 1.4927147476617333,
"grad_norm": 1.1328125,
"learning_rate": 5.024446226162964e-06,
"loss": 0.6418,
"step": 31650
},
{
"epoch": 1.4950729114774781,
"grad_norm": 1.2421875,
"learning_rate": 5.016585703280983e-06,
"loss": 0.6281,
"step": 31700
},
{
"epoch": 1.497431075293223,
"grad_norm": 1.3203125,
"learning_rate": 5.008725180399e-06,
"loss": 0.6398,
"step": 31750
},
{
"epoch": 1.4997892391089678,
"grad_norm": 1.28125,
"learning_rate": 5.000864657517019e-06,
"loss": 0.645,
"step": 31800
},
{
"epoch": 1.5021474029247126,
"grad_norm": 1.2421875,
"learning_rate": 4.993004134635036e-06,
"loss": 0.6319,
"step": 31850
},
{
"epoch": 1.5045055667404577,
"grad_norm": 1.2890625,
"learning_rate": 4.9851436117530545e-06,
"loss": 0.6381,
"step": 31900
},
{
"epoch": 1.5068637305562023,
"grad_norm": 1.109375,
"learning_rate": 4.977283088871072e-06,
"loss": 0.6308,
"step": 31950
},
{
"epoch": 1.5092218943719473,
"grad_norm": 1.0078125,
"learning_rate": 4.9694225659890895e-06,
"loss": 0.632,
"step": 32000
},
{
"epoch": 1.5092218943719473,
"eval_loss": 0.6409916877746582,
"eval_runtime": 470.5977,
"eval_samples_per_second": 75.882,
"eval_steps_per_second": 37.941,
"step": 32000
},
{
"epoch": 1.5115800581876921,
"grad_norm": 1.109375,
"learning_rate": 4.961562043107108e-06,
"loss": 0.6354,
"step": 32050
},
{
"epoch": 1.513938222003437,
"grad_norm": 1.1015625,
"learning_rate": 4.953701520225126e-06,
"loss": 0.6386,
"step": 32100
},
{
"epoch": 1.516296385819182,
"grad_norm": 1.2109375,
"learning_rate": 4.945840997343144e-06,
"loss": 0.6411,
"step": 32150
},
{
"epoch": 1.5186545496349266,
"grad_norm": 1.09375,
"learning_rate": 4.937980474461161e-06,
"loss": 0.6347,
"step": 32200
},
{
"epoch": 1.5210127134506717,
"grad_norm": 1.5546875,
"learning_rate": 4.93011995157918e-06,
"loss": 0.6375,
"step": 32250
},
{
"epoch": 1.5233708772664165,
"grad_norm": 1.0703125,
"learning_rate": 4.922259428697197e-06,
"loss": 0.6435,
"step": 32300
},
{
"epoch": 1.5257290410821613,
"grad_norm": 1.46875,
"learning_rate": 4.9143989058152155e-06,
"loss": 0.6291,
"step": 32350
},
{
"epoch": 1.5280872048979064,
"grad_norm": 1.1015625,
"learning_rate": 4.906538382933233e-06,
"loss": 0.6455,
"step": 32400
},
{
"epoch": 1.530445368713651,
"grad_norm": 1.0703125,
"learning_rate": 4.8986778600512505e-06,
"loss": 0.6324,
"step": 32450
},
{
"epoch": 1.532803532529396,
"grad_norm": 1.2890625,
"learning_rate": 4.890817337169269e-06,
"loss": 0.6474,
"step": 32500
},
{
"epoch": 1.532803532529396,
"eval_loss": 0.6410307884216309,
"eval_runtime": 471.1719,
"eval_samples_per_second": 75.79,
"eval_steps_per_second": 37.895,
"step": 32500
},
{
"epoch": 1.5351616963451409,
"grad_norm": 1.265625,
"learning_rate": 4.882956814287287e-06,
"loss": 0.634,
"step": 32550
},
{
"epoch": 1.5375198601608857,
"grad_norm": 1.1484375,
"learning_rate": 4.875096291405305e-06,
"loss": 0.6319,
"step": 32600
},
{
"epoch": 1.5398780239766308,
"grad_norm": 1.2890625,
"learning_rate": 4.867235768523322e-06,
"loss": 0.6397,
"step": 32650
},
{
"epoch": 1.5422361877923754,
"grad_norm": 1.15625,
"learning_rate": 4.859375245641341e-06,
"loss": 0.6525,
"step": 32700
},
{
"epoch": 1.5445943516081204,
"grad_norm": 1.109375,
"learning_rate": 4.851514722759358e-06,
"loss": 0.6479,
"step": 32750
},
{
"epoch": 1.5469525154238652,
"grad_norm": 1.265625,
"learning_rate": 4.8436541998773765e-06,
"loss": 0.6387,
"step": 32800
},
{
"epoch": 1.54931067923961,
"grad_norm": 1.265625,
"learning_rate": 4.835793676995394e-06,
"loss": 0.634,
"step": 32850
},
{
"epoch": 1.551668843055355,
"grad_norm": 1.53125,
"learning_rate": 4.827933154113412e-06,
"loss": 0.6468,
"step": 32900
},
{
"epoch": 1.5540270068710997,
"grad_norm": 1.09375,
"learning_rate": 4.82007263123143e-06,
"loss": 0.6375,
"step": 32950
},
{
"epoch": 1.5563851706868448,
"grad_norm": 1.3046875,
"learning_rate": 4.812212108349447e-06,
"loss": 0.6377,
"step": 33000
},
{
"epoch": 1.5563851706868448,
"eval_loss": 0.6409004926681519,
"eval_runtime": 473.1432,
"eval_samples_per_second": 75.474,
"eval_steps_per_second": 37.737,
"step": 33000
},
{
"epoch": 1.5587433345025894,
"grad_norm": 1.484375,
"learning_rate": 4.804351585467466e-06,
"loss": 0.6472,
"step": 33050
},
{
"epoch": 1.5611014983183344,
"grad_norm": 1.25,
"learning_rate": 4.796491062585484e-06,
"loss": 0.6357,
"step": 33100
},
{
"epoch": 1.5634596621340793,
"grad_norm": 1.2578125,
"learning_rate": 4.788630539703502e-06,
"loss": 0.6432,
"step": 33150
},
{
"epoch": 1.565817825949824,
"grad_norm": 1.1484375,
"learning_rate": 4.780770016821519e-06,
"loss": 0.6315,
"step": 33200
},
{
"epoch": 1.5681759897655692,
"grad_norm": 1.265625,
"learning_rate": 4.772909493939537e-06,
"loss": 0.643,
"step": 33250
},
{
"epoch": 1.5705341535813138,
"grad_norm": 1.234375,
"learning_rate": 4.765048971057555e-06,
"loss": 0.635,
"step": 33300
},
{
"epoch": 1.5728923173970588,
"grad_norm": 1.09375,
"learning_rate": 4.757188448175573e-06,
"loss": 0.6339,
"step": 33350
},
{
"epoch": 1.5752504812128036,
"grad_norm": 1.1953125,
"learning_rate": 4.749327925293591e-06,
"loss": 0.643,
"step": 33400
},
{
"epoch": 1.5776086450285485,
"grad_norm": 1.1953125,
"learning_rate": 4.741467402411608e-06,
"loss": 0.634,
"step": 33450
},
{
"epoch": 1.5799668088442935,
"grad_norm": 1.234375,
"learning_rate": 4.733606879529627e-06,
"loss": 0.6472,
"step": 33500
},
{
"epoch": 1.5799668088442935,
"eval_loss": 0.640994668006897,
"eval_runtime": 472.6883,
"eval_samples_per_second": 75.547,
"eval_steps_per_second": 37.773,
"step": 33500
},
{
"epoch": 1.5823249726600381,
"grad_norm": 1.140625,
"learning_rate": 4.725746356647644e-06,
"loss": 0.6235,
"step": 33550
},
{
"epoch": 1.5846831364757832,
"grad_norm": 1.484375,
"learning_rate": 4.717885833765663e-06,
"loss": 0.631,
"step": 33600
},
{
"epoch": 1.587041300291528,
"grad_norm": 1.2265625,
"learning_rate": 4.71002531088368e-06,
"loss": 0.6394,
"step": 33650
},
{
"epoch": 1.5893994641072728,
"grad_norm": 1.28125,
"learning_rate": 4.7021647880016985e-06,
"loss": 0.6488,
"step": 33700
},
{
"epoch": 1.591757627923018,
"grad_norm": 1.2109375,
"learning_rate": 4.694304265119716e-06,
"loss": 0.6458,
"step": 33750
},
{
"epoch": 1.5941157917387625,
"grad_norm": 1.1484375,
"learning_rate": 4.686443742237734e-06,
"loss": 0.6506,
"step": 33800
},
{
"epoch": 1.5964739555545076,
"grad_norm": 1.1875,
"learning_rate": 4.678583219355752e-06,
"loss": 0.6466,
"step": 33850
},
{
"epoch": 1.5988321193702524,
"grad_norm": 1.234375,
"learning_rate": 4.67072269647377e-06,
"loss": 0.6315,
"step": 33900
},
{
"epoch": 1.6011902831859972,
"grad_norm": 1.125,
"learning_rate": 4.662862173591788e-06,
"loss": 0.6233,
"step": 33950
},
{
"epoch": 1.603548447001742,
"grad_norm": 1.1484375,
"learning_rate": 4.655001650709805e-06,
"loss": 0.6336,
"step": 34000
},
{
"epoch": 1.603548447001742,
"eval_loss": 0.6410415768623352,
"eval_runtime": 476.8599,
"eval_samples_per_second": 74.886,
"eval_steps_per_second": 37.443,
"step": 34000
},
{
"epoch": 1.6059066108174869,
"grad_norm": 1.234375,
"learning_rate": 4.647141127827824e-06,
"loss": 0.6474,
"step": 34050
},
{
"epoch": 1.608264774633232,
"grad_norm": 1.234375,
"learning_rate": 4.639280604945842e-06,
"loss": 0.6289,
"step": 34100
},
{
"epoch": 1.6106229384489767,
"grad_norm": 1.265625,
"learning_rate": 4.6314200820638595e-06,
"loss": 0.6416,
"step": 34150
},
{
"epoch": 1.6129811022647216,
"grad_norm": 1.2421875,
"learning_rate": 4.623559559181877e-06,
"loss": 0.6313,
"step": 34200
},
{
"epoch": 1.6153392660804664,
"grad_norm": 1.2578125,
"learning_rate": 4.6156990362998946e-06,
"loss": 0.6351,
"step": 34250
},
{
"epoch": 1.6176974298962112,
"grad_norm": 1.265625,
"learning_rate": 4.607838513417913e-06,
"loss": 0.6333,
"step": 34300
},
{
"epoch": 1.6200555937119563,
"grad_norm": 1.0859375,
"learning_rate": 4.599977990535931e-06,
"loss": 0.631,
"step": 34350
},
{
"epoch": 1.622413757527701,
"grad_norm": 1.1328125,
"learning_rate": 4.592117467653949e-06,
"loss": 0.6391,
"step": 34400
},
{
"epoch": 1.624771921343446,
"grad_norm": 1.3046875,
"learning_rate": 4.584256944771966e-06,
"loss": 0.6356,
"step": 34450
},
{
"epoch": 1.6271300851591908,
"grad_norm": 1.2265625,
"learning_rate": 4.576396421889984e-06,
"loss": 0.6414,
"step": 34500
},
{
"epoch": 1.6271300851591908,
"eval_loss": 0.6410160064697266,
"eval_runtime": 470.0441,
"eval_samples_per_second": 75.972,
"eval_steps_per_second": 37.986,
"step": 34500
},
{
"epoch": 1.6294882489749356,
"grad_norm": 1.1484375,
"learning_rate": 4.568535899008002e-06,
"loss": 0.6401,
"step": 34550
},
{
"epoch": 1.6318464127906807,
"grad_norm": 1.359375,
"learning_rate": 4.5606753761260206e-06,
"loss": 0.6384,
"step": 34600
},
{
"epoch": 1.6342045766064253,
"grad_norm": 1.5078125,
"learning_rate": 4.552814853244038e-06,
"loss": 0.6287,
"step": 34650
},
{
"epoch": 1.6365627404221703,
"grad_norm": 1.15625,
"learning_rate": 4.544954330362056e-06,
"loss": 0.6348,
"step": 34700
},
{
"epoch": 1.6389209042379151,
"grad_norm": 1.140625,
"learning_rate": 4.537093807480074e-06,
"loss": 0.6362,
"step": 34750
},
{
"epoch": 1.64127906805366,
"grad_norm": 1.234375,
"learning_rate": 4.5292332845980915e-06,
"loss": 0.639,
"step": 34800
},
{
"epoch": 1.643637231869405,
"grad_norm": 1.2109375,
"learning_rate": 4.52137276171611e-06,
"loss": 0.6291,
"step": 34850
},
{
"epoch": 1.6459953956851496,
"grad_norm": 1.4765625,
"learning_rate": 4.513512238834127e-06,
"loss": 0.6169,
"step": 34900
},
{
"epoch": 1.6483535595008947,
"grad_norm": 1.1953125,
"learning_rate": 4.505651715952146e-06,
"loss": 0.6425,
"step": 34950
},
{
"epoch": 1.6507117233166395,
"grad_norm": 1.140625,
"learning_rate": 4.497791193070163e-06,
"loss": 0.6439,
"step": 35000
},
{
"epoch": 1.6507117233166395,
"eval_loss": 0.6410257816314697,
"eval_runtime": 472.2674,
"eval_samples_per_second": 75.614,
"eval_steps_per_second": 37.807,
"step": 35000
},
{
"epoch": 1.6530698871323843,
"grad_norm": 1.1484375,
"learning_rate": 4.4899306701881816e-06,
"loss": 0.6505,
"step": 35050
},
{
"epoch": 1.6554280509481294,
"grad_norm": 1.2265625,
"learning_rate": 4.482070147306199e-06,
"loss": 0.6317,
"step": 35100
},
{
"epoch": 1.657786214763874,
"grad_norm": 1.28125,
"learning_rate": 4.4742096244242174e-06,
"loss": 0.6279,
"step": 35150
},
{
"epoch": 1.660144378579619,
"grad_norm": 1.640625,
"learning_rate": 4.466349101542235e-06,
"loss": 0.635,
"step": 35200
},
{
"epoch": 1.6625025423953639,
"grad_norm": 1.125,
"learning_rate": 4.4584885786602525e-06,
"loss": 0.6263,
"step": 35250
},
{
"epoch": 1.6648607062111087,
"grad_norm": 1.3515625,
"learning_rate": 4.450628055778271e-06,
"loss": 0.6381,
"step": 35300
},
{
"epoch": 1.6672188700268535,
"grad_norm": 1.2734375,
"learning_rate": 4.442767532896289e-06,
"loss": 0.643,
"step": 35350
},
{
"epoch": 1.6695770338425984,
"grad_norm": 1.1015625,
"learning_rate": 4.434907010014307e-06,
"loss": 0.6314,
"step": 35400
},
{
"epoch": 1.6719351976583434,
"grad_norm": 1.2109375,
"learning_rate": 4.427046487132324e-06,
"loss": 0.6557,
"step": 35450
},
{
"epoch": 1.6742933614740882,
"grad_norm": 1.203125,
"learning_rate": 4.419185964250342e-06,
"loss": 0.6341,
"step": 35500
},
{
"epoch": 1.6742933614740882,
"eval_loss": 0.6410233378410339,
"eval_runtime": 469.7461,
"eval_samples_per_second": 76.02,
"eval_steps_per_second": 38.01,
"step": 35500
},
{
"epoch": 1.676651525289833,
"grad_norm": 1.25,
"learning_rate": 4.41132544136836e-06,
"loss": 0.6392,
"step": 35550
},
{
"epoch": 1.679009689105578,
"grad_norm": 1.109375,
"learning_rate": 4.4034649184863785e-06,
"loss": 0.6355,
"step": 35600
},
{
"epoch": 1.6813678529213227,
"grad_norm": 1.2421875,
"learning_rate": 4.395604395604396e-06,
"loss": 0.6393,
"step": 35650
},
{
"epoch": 1.6837260167370678,
"grad_norm": 1.2109375,
"learning_rate": 4.3877438727224135e-06,
"loss": 0.6475,
"step": 35700
},
{
"epoch": 1.6860841805528124,
"grad_norm": 1.1796875,
"learning_rate": 4.379883349840432e-06,
"loss": 0.6545,
"step": 35750
},
{
"epoch": 1.6884423443685574,
"grad_norm": 1.3203125,
"learning_rate": 4.372022826958449e-06,
"loss": 0.641,
"step": 35800
},
{
"epoch": 1.6908005081843023,
"grad_norm": 1.3828125,
"learning_rate": 4.364162304076468e-06,
"loss": 0.6346,
"step": 35850
},
{
"epoch": 1.693158672000047,
"grad_norm": 1.2265625,
"learning_rate": 4.356301781194485e-06,
"loss": 0.637,
"step": 35900
},
{
"epoch": 1.6955168358157922,
"grad_norm": 1.515625,
"learning_rate": 4.348441258312504e-06,
"loss": 0.6352,
"step": 35950
},
{
"epoch": 1.6978749996315368,
"grad_norm": 1.296875,
"learning_rate": 4.340580735430521e-06,
"loss": 0.6302,
"step": 36000
},
{
"epoch": 1.6978749996315368,
"eval_loss": 0.6410489082336426,
"eval_runtime": 473.009,
"eval_samples_per_second": 75.495,
"eval_steps_per_second": 37.748,
"step": 36000
},
{
"epoch": 1.7002331634472818,
"grad_norm": 1.1796875,
"learning_rate": 4.332720212548539e-06,
"loss": 0.624,
"step": 36050
},
{
"epoch": 1.7025913272630266,
"grad_norm": 1.1328125,
"learning_rate": 4.324859689666557e-06,
"loss": 0.6359,
"step": 36100
},
{
"epoch": 1.7049494910787715,
"grad_norm": 1.1328125,
"learning_rate": 4.316999166784575e-06,
"loss": 0.6363,
"step": 36150
},
{
"epoch": 1.7073076548945165,
"grad_norm": 1.078125,
"learning_rate": 4.309138643902593e-06,
"loss": 0.6505,
"step": 36200
},
{
"epoch": 1.7096658187102611,
"grad_norm": 1.21875,
"learning_rate": 4.30127812102061e-06,
"loss": 0.6392,
"step": 36250
},
{
"epoch": 1.7120239825260062,
"grad_norm": 1.1640625,
"learning_rate": 4.293417598138629e-06,
"loss": 0.6396,
"step": 36300
},
{
"epoch": 1.714382146341751,
"grad_norm": 1.2421875,
"learning_rate": 4.285557075256646e-06,
"loss": 0.6521,
"step": 36350
},
{
"epoch": 1.7167403101574958,
"grad_norm": 1.3203125,
"learning_rate": 4.277696552374665e-06,
"loss": 0.6569,
"step": 36400
},
{
"epoch": 1.719098473973241,
"grad_norm": 1.2890625,
"learning_rate": 4.269836029492682e-06,
"loss": 0.6409,
"step": 36450
},
{
"epoch": 1.7214566377889855,
"grad_norm": 1.109375,
"learning_rate": 4.2619755066107e-06,
"loss": 0.6459,
"step": 36500
},
{
"epoch": 1.7214566377889855,
"eval_loss": 0.6409702301025391,
"eval_runtime": 471.027,
"eval_samples_per_second": 75.813,
"eval_steps_per_second": 37.907,
"step": 36500
},
{
"epoch": 1.7238148016047306,
"grad_norm": 1.2890625,
"learning_rate": 4.254114983728718e-06,
"loss": 0.6402,
"step": 36550
},
{
"epoch": 1.7261729654204754,
"grad_norm": 1.3046875,
"learning_rate": 4.246254460846736e-06,
"loss": 0.6424,
"step": 36600
},
{
"epoch": 1.7285311292362202,
"grad_norm": 1.078125,
"learning_rate": 4.238393937964754e-06,
"loss": 0.6386,
"step": 36650
},
{
"epoch": 1.730889293051965,
"grad_norm": 1.140625,
"learning_rate": 4.230533415082771e-06,
"loss": 0.6396,
"step": 36700
},
{
"epoch": 1.7332474568677099,
"grad_norm": 1.5078125,
"learning_rate": 4.22267289220079e-06,
"loss": 0.6439,
"step": 36750
},
{
"epoch": 1.735605620683455,
"grad_norm": 1.234375,
"learning_rate": 4.214812369318807e-06,
"loss": 0.6368,
"step": 36800
},
{
"epoch": 1.7379637844991997,
"grad_norm": 1.296875,
"learning_rate": 4.206951846436826e-06,
"loss": 0.6288,
"step": 36850
},
{
"epoch": 1.7403219483149446,
"grad_norm": 1.1484375,
"learning_rate": 4.199091323554843e-06,
"loss": 0.6338,
"step": 36900
},
{
"epoch": 1.7426801121306894,
"grad_norm": 1.28125,
"learning_rate": 4.1912308006728615e-06,
"loss": 0.6385,
"step": 36950
},
{
"epoch": 1.7450382759464342,
"grad_norm": 1.1015625,
"learning_rate": 4.183370277790879e-06,
"loss": 0.6429,
"step": 37000
},
{
"epoch": 1.7450382759464342,
"eval_loss": 0.6410494446754456,
"eval_runtime": 473.4591,
"eval_samples_per_second": 75.424,
"eval_steps_per_second": 37.712,
"step": 37000
},
{
"epoch": 1.7473964397621793,
"grad_norm": 1.3203125,
"learning_rate": 4.1755097549088965e-06,
"loss": 0.6357,
"step": 37050
},
{
"epoch": 1.749754603577924,
"grad_norm": 1.1484375,
"learning_rate": 4.167649232026915e-06,
"loss": 0.6364,
"step": 37100
},
{
"epoch": 1.752112767393669,
"grad_norm": 1.2265625,
"learning_rate": 4.159788709144933e-06,
"loss": 0.6325,
"step": 37150
},
{
"epoch": 1.7544709312094138,
"grad_norm": 1.15625,
"learning_rate": 4.151928186262951e-06,
"loss": 0.6421,
"step": 37200
},
{
"epoch": 1.7568290950251586,
"grad_norm": 1.21875,
"learning_rate": 4.144067663380968e-06,
"loss": 0.649,
"step": 37250
},
{
"epoch": 1.7591872588409037,
"grad_norm": 1.2578125,
"learning_rate": 4.136207140498986e-06,
"loss": 0.6391,
"step": 37300
},
{
"epoch": 1.7615454226566483,
"grad_norm": 1.140625,
"learning_rate": 4.128346617617004e-06,
"loss": 0.6531,
"step": 37350
},
{
"epoch": 1.7639035864723933,
"grad_norm": 1.34375,
"learning_rate": 4.1204860947350225e-06,
"loss": 0.6343,
"step": 37400
},
{
"epoch": 1.7662617502881381,
"grad_norm": 1.2421875,
"learning_rate": 4.11262557185304e-06,
"loss": 0.6334,
"step": 37450
},
{
"epoch": 1.768619914103883,
"grad_norm": 1.21875,
"learning_rate": 4.1047650489710575e-06,
"loss": 0.636,
"step": 37500
},
{
"epoch": 1.768619914103883,
"eval_loss": 0.6409830451011658,
"eval_runtime": 474.1329,
"eval_samples_per_second": 75.316,
"eval_steps_per_second": 37.658,
"step": 37500
},
{
"epoch": 1.770978077919628,
"grad_norm": 1.109375,
"learning_rate": 4.096904526089076e-06,
"loss": 0.6304,
"step": 37550
},
{
"epoch": 1.7733362417353726,
"grad_norm": 1.234375,
"learning_rate": 4.089044003207093e-06,
"loss": 0.6303,
"step": 37600
},
{
"epoch": 1.7756944055511177,
"grad_norm": 1.1484375,
"learning_rate": 4.081183480325112e-06,
"loss": 0.644,
"step": 37650
},
{
"epoch": 1.7780525693668625,
"grad_norm": 1.1640625,
"learning_rate": 4.073322957443129e-06,
"loss": 0.6532,
"step": 37700
},
{
"epoch": 1.7804107331826073,
"grad_norm": 1.0703125,
"learning_rate": 4.065462434561147e-06,
"loss": 0.6406,
"step": 37750
},
{
"epoch": 1.7827688969983524,
"grad_norm": 1.1328125,
"learning_rate": 4.057601911679165e-06,
"loss": 0.6399,
"step": 37800
},
{
"epoch": 1.785127060814097,
"grad_norm": 1.1484375,
"learning_rate": 4.0497413887971835e-06,
"loss": 0.6383,
"step": 37850
},
{
"epoch": 1.787485224629842,
"grad_norm": 1.265625,
"learning_rate": 4.041880865915201e-06,
"loss": 0.6403,
"step": 37900
},
{
"epoch": 1.7898433884455869,
"grad_norm": 1.1796875,
"learning_rate": 4.0340203430332186e-06,
"loss": 0.6376,
"step": 37950
},
{
"epoch": 1.7922015522613317,
"grad_norm": 1.125,
"learning_rate": 4.026159820151237e-06,
"loss": 0.624,
"step": 38000
},
{
"epoch": 1.7922015522613317,
"eval_loss": 0.6410136818885803,
"eval_runtime": 472.5539,
"eval_samples_per_second": 75.568,
"eval_steps_per_second": 37.784,
"step": 38000
},
{
"epoch": 1.7945597160770765,
"grad_norm": 1.1640625,
"learning_rate": 4.0182992972692544e-06,
"loss": 0.6402,
"step": 38050
},
{
"epoch": 1.7969178798928214,
"grad_norm": 1.2109375,
"learning_rate": 4.010438774387273e-06,
"loss": 0.6356,
"step": 38100
},
{
"epoch": 1.7992760437085664,
"grad_norm": 1.125,
"learning_rate": 4.00257825150529e-06,
"loss": 0.6471,
"step": 38150
},
{
"epoch": 1.8016342075243112,
"grad_norm": 1.171875,
"learning_rate": 3.994717728623309e-06,
"loss": 0.6409,
"step": 38200
},
{
"epoch": 1.803992371340056,
"grad_norm": 1.2421875,
"learning_rate": 3.986857205741326e-06,
"loss": 0.6336,
"step": 38250
},
{
"epoch": 1.806350535155801,
"grad_norm": 1.0625,
"learning_rate": 3.978996682859344e-06,
"loss": 0.6292,
"step": 38300
},
{
"epoch": 1.8087086989715457,
"grad_norm": 1.09375,
"learning_rate": 3.971136159977362e-06,
"loss": 0.6324,
"step": 38350
},
{
"epoch": 1.8110668627872908,
"grad_norm": 1.2265625,
"learning_rate": 3.96327563709538e-06,
"loss": 0.6348,
"step": 38400
},
{
"epoch": 1.8134250266030354,
"grad_norm": 1.1796875,
"learning_rate": 3.955415114213398e-06,
"loss": 0.6332,
"step": 38450
},
{
"epoch": 1.8157831904187804,
"grad_norm": 1.1953125,
"learning_rate": 3.9475545913314154e-06,
"loss": 0.6369,
"step": 38500
},
{
"epoch": 1.8157831904187804,
"eval_loss": 0.6410770416259766,
"eval_runtime": 473.3108,
"eval_samples_per_second": 75.447,
"eval_steps_per_second": 37.724,
"step": 38500
},
{
"epoch": 1.8181413542345253,
"grad_norm": 1.1015625,
"learning_rate": 3.939694068449433e-06,
"loss": 0.6299,
"step": 38550
},
{
"epoch": 1.82049951805027,
"grad_norm": 1.09375,
"learning_rate": 3.931833545567451e-06,
"loss": 0.6387,
"step": 38600
},
{
"epoch": 1.8228576818660152,
"grad_norm": 1.359375,
"learning_rate": 3.92397302268547e-06,
"loss": 0.6561,
"step": 38650
},
{
"epoch": 1.8252158456817598,
"grad_norm": 1.1640625,
"learning_rate": 3.916112499803487e-06,
"loss": 0.6377,
"step": 38700
},
{
"epoch": 1.8275740094975048,
"grad_norm": 1.0625,
"learning_rate": 3.908251976921505e-06,
"loss": 0.6281,
"step": 38750
},
{
"epoch": 1.8299321733132496,
"grad_norm": 1.265625,
"learning_rate": 3.900391454039523e-06,
"loss": 0.6379,
"step": 38800
},
{
"epoch": 1.8322903371289945,
"grad_norm": 1.265625,
"learning_rate": 3.892530931157541e-06,
"loss": 0.6401,
"step": 38850
},
{
"epoch": 1.8346485009447395,
"grad_norm": 1.234375,
"learning_rate": 3.884670408275559e-06,
"loss": 0.6382,
"step": 38900
},
{
"epoch": 1.8370066647604841,
"grad_norm": 1.1796875,
"learning_rate": 3.8768098853935765e-06,
"loss": 0.6479,
"step": 38950
},
{
"epoch": 1.8393648285762292,
"grad_norm": 1.40625,
"learning_rate": 3.868949362511595e-06,
"loss": 0.6354,
"step": 39000
},
{
"epoch": 1.8393648285762292,
"eval_loss": 0.6409469246864319,
"eval_runtime": 471.1776,
"eval_samples_per_second": 75.789,
"eval_steps_per_second": 37.894,
"step": 39000
},
{
"epoch": 1.841722992391974,
"grad_norm": 1.1953125,
"learning_rate": 3.861088839629612e-06,
"loss": 0.6231,
"step": 39050
},
{
"epoch": 1.8440811562077188,
"grad_norm": 1.1953125,
"learning_rate": 3.853228316747631e-06,
"loss": 0.6389,
"step": 39100
},
{
"epoch": 1.846439320023464,
"grad_norm": 1.46875,
"learning_rate": 3.845367793865648e-06,
"loss": 0.6477,
"step": 39150
},
{
"epoch": 1.8487974838392085,
"grad_norm": 1.2578125,
"learning_rate": 3.8375072709836666e-06,
"loss": 0.6389,
"step": 39200
},
{
"epoch": 1.8511556476549536,
"grad_norm": 1.375,
"learning_rate": 3.829646748101684e-06,
"loss": 0.6398,
"step": 39250
},
{
"epoch": 1.8535138114706984,
"grad_norm": 1.234375,
"learning_rate": 3.821786225219702e-06,
"loss": 0.6461,
"step": 39300
},
{
"epoch": 1.8558719752864432,
"grad_norm": 1.28125,
"learning_rate": 3.8139257023377195e-06,
"loss": 0.6365,
"step": 39350
},
{
"epoch": 1.858230139102188,
"grad_norm": 1.1796875,
"learning_rate": 3.806065179455738e-06,
"loss": 0.6285,
"step": 39400
},
{
"epoch": 1.8605883029179329,
"grad_norm": 1.25,
"learning_rate": 3.798204656573756e-06,
"loss": 0.6483,
"step": 39450
},
{
"epoch": 1.862946466733678,
"grad_norm": 1.2734375,
"learning_rate": 3.7903441336917733e-06,
"loss": 0.6471,
"step": 39500
},
{
"epoch": 1.862946466733678,
"eval_loss": 0.6410489082336426,
"eval_runtime": 470.662,
"eval_samples_per_second": 75.872,
"eval_steps_per_second": 37.936,
"step": 39500
},
{
"epoch": 1.8653046305494225,
"grad_norm": 1.1640625,
"learning_rate": 3.7824836108097913e-06,
"loss": 0.6416,
"step": 39550
},
{
"epoch": 1.8676627943651676,
"grad_norm": 1.1796875,
"learning_rate": 3.7746230879278096e-06,
"loss": 0.6312,
"step": 39600
},
{
"epoch": 1.8700209581809124,
"grad_norm": 1.34375,
"learning_rate": 3.766762565045827e-06,
"loss": 0.6358,
"step": 39650
},
{
"epoch": 1.8723791219966572,
"grad_norm": 1.3671875,
"learning_rate": 3.758902042163845e-06,
"loss": 0.6439,
"step": 39700
},
{
"epoch": 1.8747372858124023,
"grad_norm": 1.125,
"learning_rate": 3.7510415192818626e-06,
"loss": 0.632,
"step": 39750
},
{
"epoch": 1.877095449628147,
"grad_norm": 1.046875,
"learning_rate": 3.743180996399881e-06,
"loss": 0.6491,
"step": 39800
},
{
"epoch": 1.879453613443892,
"grad_norm": 1.2109375,
"learning_rate": 3.735320473517899e-06,
"loss": 0.6456,
"step": 39850
},
{
"epoch": 1.8818117772596368,
"grad_norm": 1.2109375,
"learning_rate": 3.7274599506359164e-06,
"loss": 0.6477,
"step": 39900
},
{
"epoch": 1.8841699410753816,
"grad_norm": 1.0859375,
"learning_rate": 3.7195994277539344e-06,
"loss": 0.6369,
"step": 39950
},
{
"epoch": 1.8865281048911267,
"grad_norm": 1.453125,
"learning_rate": 3.7117389048719527e-06,
"loss": 0.6444,
"step": 40000
},
{
"epoch": 1.8865281048911267,
"eval_loss": 0.6410700082778931,
"eval_runtime": 469.0067,
"eval_samples_per_second": 76.14,
"eval_steps_per_second": 38.07,
"step": 40000
},
{
"epoch": 1.8888862687068713,
"grad_norm": 1.21875,
"learning_rate": 3.7038783819899702e-06,
"loss": 0.645,
"step": 40050
},
{
"epoch": 1.8912444325226163,
"grad_norm": 1.171875,
"learning_rate": 3.696017859107988e-06,
"loss": 0.6442,
"step": 40100
},
{
"epoch": 1.8936025963383611,
"grad_norm": 1.2421875,
"learning_rate": 3.688157336226006e-06,
"loss": 0.6434,
"step": 40150
},
{
"epoch": 1.895960760154106,
"grad_norm": 1.4375,
"learning_rate": 3.680296813344024e-06,
"loss": 0.6304,
"step": 40200
},
{
"epoch": 1.898318923969851,
"grad_norm": 1.1640625,
"learning_rate": 3.672436290462042e-06,
"loss": 0.658,
"step": 40250
},
{
"epoch": 1.9006770877855956,
"grad_norm": 1.1796875,
"learning_rate": 3.66457576758006e-06,
"loss": 0.6265,
"step": 40300
},
{
"epoch": 1.9030352516013407,
"grad_norm": 1.4296875,
"learning_rate": 3.6567152446980774e-06,
"loss": 0.6332,
"step": 40350
},
{
"epoch": 1.9053934154170855,
"grad_norm": 1.125,
"learning_rate": 3.6488547218160954e-06,
"loss": 0.638,
"step": 40400
},
{
"epoch": 1.9077515792328303,
"grad_norm": 1.171875,
"learning_rate": 3.6409941989341137e-06,
"loss": 0.6379,
"step": 40450
},
{
"epoch": 1.9101097430485754,
"grad_norm": 1.265625,
"learning_rate": 3.6331336760521312e-06,
"loss": 0.6283,
"step": 40500
},
{
"epoch": 1.9101097430485754,
"eval_loss": 0.641033411026001,
"eval_runtime": 468.7258,
"eval_samples_per_second": 76.185,
"eval_steps_per_second": 38.093,
"step": 40500
},
{
"epoch": 1.91246790686432,
"grad_norm": 1.1953125,
"learning_rate": 3.625273153170149e-06,
"loss": 0.6358,
"step": 40550
},
{
"epoch": 1.914826070680065,
"grad_norm": 1.2109375,
"learning_rate": 3.6174126302881667e-06,
"loss": 0.6464,
"step": 40600
},
{
"epoch": 1.9171842344958099,
"grad_norm": 1.1015625,
"learning_rate": 3.609552107406185e-06,
"loss": 0.6384,
"step": 40650
},
{
"epoch": 1.9195423983115547,
"grad_norm": 1.2734375,
"learning_rate": 3.601691584524203e-06,
"loss": 0.6381,
"step": 40700
},
{
"epoch": 1.9219005621272995,
"grad_norm": 1.5,
"learning_rate": 3.5938310616422205e-06,
"loss": 0.6437,
"step": 40750
},
{
"epoch": 1.9242587259430444,
"grad_norm": 1.1796875,
"learning_rate": 3.5859705387602384e-06,
"loss": 0.643,
"step": 40800
},
{
"epoch": 1.9266168897587894,
"grad_norm": 1.40625,
"learning_rate": 3.578110015878257e-06,
"loss": 0.6365,
"step": 40850
},
{
"epoch": 1.928975053574534,
"grad_norm": 1.21875,
"learning_rate": 3.5702494929962743e-06,
"loss": 0.6364,
"step": 40900
},
{
"epoch": 1.931333217390279,
"grad_norm": 1.2734375,
"learning_rate": 3.5623889701142923e-06,
"loss": 0.6417,
"step": 40950
},
{
"epoch": 1.933691381206024,
"grad_norm": 1.2421875,
"learning_rate": 3.5545284472323098e-06,
"loss": 0.6433,
"step": 41000
},
{
"epoch": 1.933691381206024,
"eval_loss": 0.6409995555877686,
"eval_runtime": 469.2679,
"eval_samples_per_second": 76.097,
"eval_steps_per_second": 38.049,
"step": 41000
},
{
"epoch": 1.9360495450217687,
"grad_norm": 1.390625,
"learning_rate": 3.546667924350328e-06,
"loss": 0.6409,
"step": 41050
},
{
"epoch": 1.9384077088375138,
"grad_norm": 1.3125,
"learning_rate": 3.538807401468346e-06,
"loss": 0.6393,
"step": 41100
},
{
"epoch": 1.9407658726532584,
"grad_norm": 1.2734375,
"learning_rate": 3.5309468785863636e-06,
"loss": 0.6403,
"step": 41150
},
{
"epoch": 1.9431240364690034,
"grad_norm": 1.34375,
"learning_rate": 3.5230863557043815e-06,
"loss": 0.6353,
"step": 41200
},
{
"epoch": 1.9454822002847483,
"grad_norm": 1.046875,
"learning_rate": 3.5152258328224e-06,
"loss": 0.6287,
"step": 41250
},
{
"epoch": 1.947840364100493,
"grad_norm": 1.2734375,
"learning_rate": 3.5073653099404174e-06,
"loss": 0.6329,
"step": 41300
},
{
"epoch": 1.9501985279162382,
"grad_norm": 1.1015625,
"learning_rate": 3.4995047870584353e-06,
"loss": 0.6449,
"step": 41350
},
{
"epoch": 1.9525566917319828,
"grad_norm": 1.1796875,
"learning_rate": 3.4916442641764533e-06,
"loss": 0.6399,
"step": 41400
},
{
"epoch": 1.9549148555477278,
"grad_norm": 1.1484375,
"learning_rate": 3.4837837412944712e-06,
"loss": 0.6449,
"step": 41450
},
{
"epoch": 1.9572730193634726,
"grad_norm": 1.4453125,
"learning_rate": 3.475923218412489e-06,
"loss": 0.6499,
"step": 41500
},
{
"epoch": 1.9572730193634726,
"eval_loss": 0.6410717964172363,
"eval_runtime": 469.3759,
"eval_samples_per_second": 76.08,
"eval_steps_per_second": 38.04,
"step": 41500
},
{
"epoch": 1.9596311831792175,
"grad_norm": 1.203125,
"learning_rate": 3.468062695530507e-06,
"loss": 0.6426,
"step": 41550
},
{
"epoch": 1.9619893469949625,
"grad_norm": 1.234375,
"learning_rate": 3.4602021726485246e-06,
"loss": 0.6327,
"step": 41600
},
{
"epoch": 1.9643475108107071,
"grad_norm": 1.1953125,
"learning_rate": 3.452341649766543e-06,
"loss": 0.6491,
"step": 41650
},
{
"epoch": 1.9667056746264522,
"grad_norm": 1.1875,
"learning_rate": 3.444481126884561e-06,
"loss": 0.6416,
"step": 41700
},
{
"epoch": 1.969063838442197,
"grad_norm": 1.25,
"learning_rate": 3.4366206040025784e-06,
"loss": 0.6597,
"step": 41750
},
{
"epoch": 1.9714220022579418,
"grad_norm": 1.2109375,
"learning_rate": 3.4287600811205964e-06,
"loss": 0.6347,
"step": 41800
},
{
"epoch": 1.9737801660736867,
"grad_norm": 1.1328125,
"learning_rate": 3.4208995582386147e-06,
"loss": 0.6337,
"step": 41850
},
{
"epoch": 1.9761383298894315,
"grad_norm": 1.140625,
"learning_rate": 3.4130390353566322e-06,
"loss": 0.6446,
"step": 41900
},
{
"epoch": 1.9784964937051766,
"grad_norm": 1.265625,
"learning_rate": 3.40517851247465e-06,
"loss": 0.6355,
"step": 41950
},
{
"epoch": 1.9808546575209214,
"grad_norm": 1.25,
"learning_rate": 3.3973179895926677e-06,
"loss": 0.6317,
"step": 42000
},
{
"epoch": 1.9808546575209214,
"eval_loss": 0.6410444378852844,
"eval_runtime": 470.2222,
"eval_samples_per_second": 75.943,
"eval_steps_per_second": 37.971,
"step": 42000
},
{
"epoch": 1.9832128213366662,
"grad_norm": 1.3046875,
"learning_rate": 3.389457466710686e-06,
"loss": 0.6303,
"step": 42050
},
{
"epoch": 1.985570985152411,
"grad_norm": 1.34375,
"learning_rate": 3.381596943828704e-06,
"loss": 0.6429,
"step": 42100
},
{
"epoch": 1.9879291489681559,
"grad_norm": 1.2109375,
"learning_rate": 3.3737364209467215e-06,
"loss": 0.6477,
"step": 42150
},
{
"epoch": 1.990287312783901,
"grad_norm": 1.28125,
"learning_rate": 3.3658758980647394e-06,
"loss": 0.6388,
"step": 42200
},
{
"epoch": 1.9926454765996455,
"grad_norm": 1.203125,
"learning_rate": 3.3580153751827578e-06,
"loss": 0.6322,
"step": 42250
},
{
"epoch": 1.9950036404153906,
"grad_norm": 1.1328125,
"learning_rate": 3.3501548523007753e-06,
"loss": 0.6404,
"step": 42300
},
{
"epoch": 1.9973618042311354,
"grad_norm": 1.234375,
"learning_rate": 3.3422943294187932e-06,
"loss": 0.6281,
"step": 42350
},
{
"epoch": 1.9997199680468802,
"grad_norm": 1.2109375,
"learning_rate": 3.3344338065368108e-06,
"loss": 0.6563,
"step": 42400
},
{
"epoch": 2.0020751841578557,
"grad_norm": 1.125,
"learning_rate": 3.326573283654829e-06,
"loss": 0.633,
"step": 42450
},
{
"epoch": 2.0044333479736003,
"grad_norm": 0.984375,
"learning_rate": 3.318712760772847e-06,
"loss": 0.6426,
"step": 42500
},
{
"epoch": 2.0044333479736003,
"eval_loss": 0.6410335898399353,
"eval_runtime": 469.6391,
"eval_samples_per_second": 76.037,
"eval_steps_per_second": 38.019,
"step": 42500
},
{
"epoch": 2.0067915117893453,
"grad_norm": 1.0,
"learning_rate": 3.3108522378908646e-06,
"loss": 0.6385,
"step": 42550
},
{
"epoch": 2.00914967560509,
"grad_norm": 1.0546875,
"learning_rate": 3.3029917150088825e-06,
"loss": 0.6285,
"step": 42600
},
{
"epoch": 2.011507839420835,
"grad_norm": 1.03125,
"learning_rate": 3.295131192126901e-06,
"loss": 0.636,
"step": 42650
},
{
"epoch": 2.01386600323658,
"grad_norm": 1.0625,
"learning_rate": 3.2872706692449184e-06,
"loss": 0.6265,
"step": 42700
},
{
"epoch": 2.0162241670523247,
"grad_norm": 1.0,
"learning_rate": 3.2794101463629363e-06,
"loss": 0.6338,
"step": 42750
},
{
"epoch": 2.0185823308680697,
"grad_norm": 1.078125,
"learning_rate": 3.2715496234809543e-06,
"loss": 0.6408,
"step": 42800
},
{
"epoch": 2.0209404946838143,
"grad_norm": 1.109375,
"learning_rate": 3.263689100598972e-06,
"loss": 0.6398,
"step": 42850
},
{
"epoch": 2.0232986584995594,
"grad_norm": 1.0546875,
"learning_rate": 3.25582857771699e-06,
"loss": 0.6332,
"step": 42900
},
{
"epoch": 2.025656822315304,
"grad_norm": 1.1328125,
"learning_rate": 3.247968054835008e-06,
"loss": 0.6408,
"step": 42950
},
{
"epoch": 2.028014986131049,
"grad_norm": 1.1875,
"learning_rate": 3.2401075319530256e-06,
"loss": 0.6297,
"step": 43000
},
{
"epoch": 2.028014986131049,
"eval_loss": 0.6411005854606628,
"eval_runtime": 473.6412,
"eval_samples_per_second": 75.395,
"eval_steps_per_second": 37.697,
"step": 43000
},
{
"epoch": 2.030373149946794,
"grad_norm": 0.9765625,
"learning_rate": 3.232247009071044e-06,
"loss": 0.6486,
"step": 43050
},
{
"epoch": 2.0327313137625387,
"grad_norm": 1.1015625,
"learning_rate": 3.224386486189062e-06,
"loss": 0.6387,
"step": 43100
},
{
"epoch": 2.0350894775782837,
"grad_norm": 1.1015625,
"learning_rate": 3.2165259633070794e-06,
"loss": 0.642,
"step": 43150
},
{
"epoch": 2.0374476413940283,
"grad_norm": 1.0234375,
"learning_rate": 3.2086654404250973e-06,
"loss": 0.6331,
"step": 43200
},
{
"epoch": 2.0398058052097734,
"grad_norm": 1.0390625,
"learning_rate": 3.200804917543115e-06,
"loss": 0.6429,
"step": 43250
},
{
"epoch": 2.0421639690255184,
"grad_norm": 0.97265625,
"learning_rate": 3.192944394661133e-06,
"loss": 0.6319,
"step": 43300
},
{
"epoch": 2.044522132841263,
"grad_norm": 0.98046875,
"learning_rate": 3.185083871779151e-06,
"loss": 0.629,
"step": 43350
},
{
"epoch": 2.046880296657008,
"grad_norm": 1.2421875,
"learning_rate": 3.1772233488971687e-06,
"loss": 0.6327,
"step": 43400
},
{
"epoch": 2.0492384604727527,
"grad_norm": 1.078125,
"learning_rate": 3.1693628260151866e-06,
"loss": 0.6462,
"step": 43450
},
{
"epoch": 2.0515966242884978,
"grad_norm": 0.98046875,
"learning_rate": 3.161502303133205e-06,
"loss": 0.6433,
"step": 43500
},
{
"epoch": 2.0515966242884978,
"eval_loss": 0.6410638689994812,
"eval_runtime": 473.3603,
"eval_samples_per_second": 75.439,
"eval_steps_per_second": 37.72,
"step": 43500
},
{
"epoch": 2.053954788104243,
"grad_norm": 1.0859375,
"learning_rate": 3.1536417802512225e-06,
"loss": 0.6416,
"step": 43550
},
{
"epoch": 2.0563129519199874,
"grad_norm": 1.0390625,
"learning_rate": 3.1457812573692404e-06,
"loss": 0.645,
"step": 43600
},
{
"epoch": 2.0586711157357325,
"grad_norm": 0.9921875,
"learning_rate": 3.137920734487258e-06,
"loss": 0.634,
"step": 43650
},
{
"epoch": 2.061029279551477,
"grad_norm": 0.95703125,
"learning_rate": 3.1300602116052763e-06,
"loss": 0.642,
"step": 43700
},
{
"epoch": 2.063387443367222,
"grad_norm": 1.203125,
"learning_rate": 3.1221996887232942e-06,
"loss": 0.6184,
"step": 43750
},
{
"epoch": 2.065745607182967,
"grad_norm": 1.1171875,
"learning_rate": 3.1143391658413117e-06,
"loss": 0.6444,
"step": 43800
},
{
"epoch": 2.068103770998712,
"grad_norm": 1.125,
"learning_rate": 3.1064786429593297e-06,
"loss": 0.6347,
"step": 43850
},
{
"epoch": 2.070461934814457,
"grad_norm": 1.046875,
"learning_rate": 3.098618120077348e-06,
"loss": 0.6416,
"step": 43900
},
{
"epoch": 2.0728200986302014,
"grad_norm": 1.125,
"learning_rate": 3.0907575971953655e-06,
"loss": 0.6464,
"step": 43950
},
{
"epoch": 2.0751782624459465,
"grad_norm": 1.2265625,
"learning_rate": 3.0828970743133835e-06,
"loss": 0.6362,
"step": 44000
},
{
"epoch": 2.0751782624459465,
"eval_loss": 0.6410983204841614,
"eval_runtime": 470.4002,
"eval_samples_per_second": 75.914,
"eval_steps_per_second": 37.957,
"step": 44000
},
{
"epoch": 2.0775364262616915,
"grad_norm": 1.2265625,
"learning_rate": 3.0750365514314014e-06,
"loss": 0.639,
"step": 44050
},
{
"epoch": 2.079894590077436,
"grad_norm": 1.1875,
"learning_rate": 3.0671760285494194e-06,
"loss": 0.6321,
"step": 44100
},
{
"epoch": 2.082252753893181,
"grad_norm": 1.1171875,
"learning_rate": 3.0593155056674373e-06,
"loss": 0.6321,
"step": 44150
},
{
"epoch": 2.084610917708926,
"grad_norm": 1.203125,
"learning_rate": 3.0514549827854552e-06,
"loss": 0.6378,
"step": 44200
},
{
"epoch": 2.086969081524671,
"grad_norm": 0.96484375,
"learning_rate": 3.0435944599034727e-06,
"loss": 0.6315,
"step": 44250
},
{
"epoch": 2.0893272453404155,
"grad_norm": 1.03125,
"learning_rate": 3.035733937021491e-06,
"loss": 0.6456,
"step": 44300
},
{
"epoch": 2.0916854091561605,
"grad_norm": 1.140625,
"learning_rate": 3.027873414139509e-06,
"loss": 0.6438,
"step": 44350
},
{
"epoch": 2.0940435729719056,
"grad_norm": 0.984375,
"learning_rate": 3.0200128912575266e-06,
"loss": 0.6345,
"step": 44400
},
{
"epoch": 2.09640173678765,
"grad_norm": 0.921875,
"learning_rate": 3.0121523683755445e-06,
"loss": 0.658,
"step": 44450
},
{
"epoch": 2.0987599006033952,
"grad_norm": 0.98828125,
"learning_rate": 3.004291845493563e-06,
"loss": 0.6351,
"step": 44500
},
{
"epoch": 2.0987599006033952,
"eval_loss": 0.6411724090576172,
"eval_runtime": 471.2456,
"eval_samples_per_second": 75.778,
"eval_steps_per_second": 37.889,
"step": 44500
},
{
"epoch": 2.10111806441914,
"grad_norm": 1.109375,
"learning_rate": 2.9964313226115804e-06,
"loss": 0.6249,
"step": 44550
},
{
"epoch": 2.103476228234885,
"grad_norm": 1.1171875,
"learning_rate": 2.9885707997295983e-06,
"loss": 0.6377,
"step": 44600
},
{
"epoch": 2.10583439205063,
"grad_norm": 1.1328125,
"learning_rate": 2.980710276847616e-06,
"loss": 0.6466,
"step": 44650
},
{
"epoch": 2.1081925558663746,
"grad_norm": 1.1328125,
"learning_rate": 2.972849753965634e-06,
"loss": 0.6393,
"step": 44700
},
{
"epoch": 2.1105507196821196,
"grad_norm": 1.1015625,
"learning_rate": 2.964989231083652e-06,
"loss": 0.6548,
"step": 44750
},
{
"epoch": 2.112908883497864,
"grad_norm": 1.0625,
"learning_rate": 2.9571287082016696e-06,
"loss": 0.6414,
"step": 44800
},
{
"epoch": 2.1152670473136093,
"grad_norm": 1.0,
"learning_rate": 2.9492681853196876e-06,
"loss": 0.6384,
"step": 44850
},
{
"epoch": 2.1176252111293543,
"grad_norm": 1.0625,
"learning_rate": 2.941407662437706e-06,
"loss": 0.6432,
"step": 44900
},
{
"epoch": 2.119983374945099,
"grad_norm": 1.0078125,
"learning_rate": 2.9335471395557234e-06,
"loss": 0.6463,
"step": 44950
},
{
"epoch": 2.122341538760844,
"grad_norm": 0.98046875,
"learning_rate": 2.9256866166737414e-06,
"loss": 0.6403,
"step": 45000
},
{
"epoch": 2.122341538760844,
"eval_loss": 0.6410689353942871,
"eval_runtime": 470.4062,
"eval_samples_per_second": 75.913,
"eval_steps_per_second": 37.957,
"step": 45000
},
{
"epoch": 2.1246997025765886,
"grad_norm": 1.0,
"learning_rate": 2.917826093791759e-06,
"loss": 0.6389,
"step": 45050
},
{
"epoch": 2.1270578663923336,
"grad_norm": 0.96484375,
"learning_rate": 2.9099655709097773e-06,
"loss": 0.637,
"step": 45100
},
{
"epoch": 2.1294160302080787,
"grad_norm": 0.91796875,
"learning_rate": 2.902105048027795e-06,
"loss": 0.6278,
"step": 45150
},
{
"epoch": 2.1317741940238233,
"grad_norm": 1.234375,
"learning_rate": 2.8942445251458127e-06,
"loss": 0.643,
"step": 45200
},
{
"epoch": 2.1341323578395683,
"grad_norm": 0.96875,
"learning_rate": 2.8863840022638306e-06,
"loss": 0.6384,
"step": 45250
},
{
"epoch": 2.136490521655313,
"grad_norm": 1.1796875,
"learning_rate": 2.878523479381849e-06,
"loss": 0.6393,
"step": 45300
},
{
"epoch": 2.138848685471058,
"grad_norm": 0.97265625,
"learning_rate": 2.8706629564998665e-06,
"loss": 0.6413,
"step": 45350
},
{
"epoch": 2.141206849286803,
"grad_norm": 1.03125,
"learning_rate": 2.8628024336178845e-06,
"loss": 0.6533,
"step": 45400
},
{
"epoch": 2.1435650131025477,
"grad_norm": 1.2734375,
"learning_rate": 2.8549419107359024e-06,
"loss": 0.6509,
"step": 45450
},
{
"epoch": 2.1459231769182927,
"grad_norm": 0.91015625,
"learning_rate": 2.8470813878539203e-06,
"loss": 0.6382,
"step": 45500
},
{
"epoch": 2.1459231769182927,
"eval_loss": 0.6410402059555054,
"eval_runtime": 473.4485,
"eval_samples_per_second": 75.425,
"eval_steps_per_second": 37.713,
"step": 45500
},
{
"epoch": 2.1482813407340373,
"grad_norm": 1.0859375,
"learning_rate": 2.8392208649719383e-06,
"loss": 0.6198,
"step": 45550
},
{
"epoch": 2.1506395045497824,
"grad_norm": 0.99609375,
"learning_rate": 2.831360342089956e-06,
"loss": 0.6359,
"step": 45600
},
{
"epoch": 2.1529976683655274,
"grad_norm": 1.046875,
"learning_rate": 2.8234998192079737e-06,
"loss": 0.6353,
"step": 45650
},
{
"epoch": 2.155355832181272,
"grad_norm": 1.1171875,
"learning_rate": 2.815639296325992e-06,
"loss": 0.6479,
"step": 45700
},
{
"epoch": 2.157713995997017,
"grad_norm": 1.0859375,
"learning_rate": 2.80777877344401e-06,
"loss": 0.6416,
"step": 45750
},
{
"epoch": 2.1600721598127617,
"grad_norm": 1.0390625,
"learning_rate": 2.7999182505620275e-06,
"loss": 0.6517,
"step": 45800
},
{
"epoch": 2.1624303236285067,
"grad_norm": 1.2109375,
"learning_rate": 2.7920577276800455e-06,
"loss": 0.6443,
"step": 45850
},
{
"epoch": 2.1647884874442513,
"grad_norm": 0.91796875,
"learning_rate": 2.784197204798064e-06,
"loss": 0.6422,
"step": 45900
},
{
"epoch": 2.1671466512599964,
"grad_norm": 0.92578125,
"learning_rate": 2.7763366819160813e-06,
"loss": 0.6313,
"step": 45950
},
{
"epoch": 2.1695048150757414,
"grad_norm": 1.1875,
"learning_rate": 2.7684761590340993e-06,
"loss": 0.6271,
"step": 46000
},
{
"epoch": 2.1695048150757414,
"eval_loss": 0.6410887837409973,
"eval_runtime": 473.9298,
"eval_samples_per_second": 75.349,
"eval_steps_per_second": 37.674,
"step": 46000
},
{
"epoch": 2.171862978891486,
"grad_norm": 1.0859375,
"learning_rate": 2.760615636152117e-06,
"loss": 0.6376,
"step": 46050
},
{
"epoch": 2.174221142707231,
"grad_norm": 1.0390625,
"learning_rate": 2.752755113270135e-06,
"loss": 0.6308,
"step": 46100
},
{
"epoch": 2.1765793065229757,
"grad_norm": 1.171875,
"learning_rate": 2.744894590388153e-06,
"loss": 0.646,
"step": 46150
},
{
"epoch": 2.1789374703387208,
"grad_norm": 1.0078125,
"learning_rate": 2.7370340675061706e-06,
"loss": 0.6353,
"step": 46200
},
{
"epoch": 2.181295634154466,
"grad_norm": 0.9375,
"learning_rate": 2.7291735446241885e-06,
"loss": 0.6365,
"step": 46250
},
{
"epoch": 2.1836537979702104,
"grad_norm": 1.1875,
"learning_rate": 2.721313021742206e-06,
"loss": 0.6357,
"step": 46300
},
{
"epoch": 2.1860119617859555,
"grad_norm": 1.109375,
"learning_rate": 2.7134524988602244e-06,
"loss": 0.6526,
"step": 46350
},
{
"epoch": 2.1883701256017,
"grad_norm": 1.03125,
"learning_rate": 2.7055919759782424e-06,
"loss": 0.6355,
"step": 46400
},
{
"epoch": 2.190728289417445,
"grad_norm": 0.97265625,
"learning_rate": 2.69773145309626e-06,
"loss": 0.6359,
"step": 46450
},
{
"epoch": 2.19308645323319,
"grad_norm": 0.890625,
"learning_rate": 2.689870930214278e-06,
"loss": 0.6347,
"step": 46500
},
{
"epoch": 2.19308645323319,
"eval_loss": 0.6410621404647827,
"eval_runtime": 476.2372,
"eval_samples_per_second": 74.984,
"eval_steps_per_second": 37.492,
"step": 46500
},
{
"epoch": 2.195444617048935,
"grad_norm": 0.89453125,
"learning_rate": 2.682010407332296e-06,
"loss": 0.6262,
"step": 46550
},
{
"epoch": 2.19780278086468,
"grad_norm": 1.0703125,
"learning_rate": 2.6741498844503137e-06,
"loss": 0.6399,
"step": 46600
},
{
"epoch": 2.2001609446804244,
"grad_norm": 1.1875,
"learning_rate": 2.6662893615683316e-06,
"loss": 0.6413,
"step": 46650
},
{
"epoch": 2.2025191084961695,
"grad_norm": 0.91015625,
"learning_rate": 2.6584288386863496e-06,
"loss": 0.6292,
"step": 46700
},
{
"epoch": 2.204877272311914,
"grad_norm": 1.0625,
"learning_rate": 2.6505683158043675e-06,
"loss": 0.6372,
"step": 46750
},
{
"epoch": 2.207235436127659,
"grad_norm": 1.1171875,
"learning_rate": 2.6427077929223854e-06,
"loss": 0.6505,
"step": 46800
},
{
"epoch": 2.209593599943404,
"grad_norm": 1.140625,
"learning_rate": 2.6348472700404034e-06,
"loss": 0.6359,
"step": 46850
},
{
"epoch": 2.211951763759149,
"grad_norm": 1.1328125,
"learning_rate": 2.626986747158421e-06,
"loss": 0.6359,
"step": 46900
},
{
"epoch": 2.214309927574894,
"grad_norm": 1.0703125,
"learning_rate": 2.6191262242764392e-06,
"loss": 0.6248,
"step": 46950
},
{
"epoch": 2.2166680913906385,
"grad_norm": 1.015625,
"learning_rate": 2.611265701394457e-06,
"loss": 0.6413,
"step": 47000
},
{
"epoch": 2.2166680913906385,
"eval_loss": 0.6410422921180725,
"eval_runtime": 470.9362,
"eval_samples_per_second": 75.828,
"eval_steps_per_second": 37.914,
"step": 47000
},
{
"epoch": 2.2190262552063835,
"grad_norm": 0.99609375,
"learning_rate": 2.6034051785124747e-06,
"loss": 0.6364,
"step": 47050
},
{
"epoch": 2.2213844190221286,
"grad_norm": 1.15625,
"learning_rate": 2.5955446556304926e-06,
"loss": 0.6298,
"step": 47100
},
{
"epoch": 2.223742582837873,
"grad_norm": 1.03125,
"learning_rate": 2.587684132748511e-06,
"loss": 0.6295,
"step": 47150
},
{
"epoch": 2.2261007466536182,
"grad_norm": 1.0625,
"learning_rate": 2.5798236098665285e-06,
"loss": 0.6258,
"step": 47200
},
{
"epoch": 2.228458910469363,
"grad_norm": 1.0234375,
"learning_rate": 2.5719630869845465e-06,
"loss": 0.6404,
"step": 47250
},
{
"epoch": 2.230817074285108,
"grad_norm": 1.0703125,
"learning_rate": 2.564102564102564e-06,
"loss": 0.6373,
"step": 47300
},
{
"epoch": 2.233175238100853,
"grad_norm": 1.203125,
"learning_rate": 2.5562420412205823e-06,
"loss": 0.6458,
"step": 47350
},
{
"epoch": 2.2355334019165976,
"grad_norm": 0.99609375,
"learning_rate": 2.5483815183386003e-06,
"loss": 0.6477,
"step": 47400
},
{
"epoch": 2.2378915657323426,
"grad_norm": 1.1484375,
"learning_rate": 2.5405209954566178e-06,
"loss": 0.6422,
"step": 47450
},
{
"epoch": 2.240249729548087,
"grad_norm": 1.015625,
"learning_rate": 2.5326604725746357e-06,
"loss": 0.6331,
"step": 47500
},
{
"epoch": 2.240249729548087,
"eval_loss": 0.6410676836967468,
"eval_runtime": 471.372,
"eval_samples_per_second": 75.758,
"eval_steps_per_second": 37.879,
"step": 47500
},
{
"epoch": 2.2426078933638323,
"grad_norm": 0.94140625,
"learning_rate": 2.524799949692654e-06,
"loss": 0.6346,
"step": 47550
},
{
"epoch": 2.2449660571795773,
"grad_norm": 0.9609375,
"learning_rate": 2.5169394268106716e-06,
"loss": 0.64,
"step": 47600
},
{
"epoch": 2.247324220995322,
"grad_norm": 1.2109375,
"learning_rate": 2.5090789039286895e-06,
"loss": 0.6336,
"step": 47650
},
{
"epoch": 2.249682384811067,
"grad_norm": 1.1171875,
"learning_rate": 2.501218381046707e-06,
"loss": 0.6598,
"step": 47700
},
{
"epoch": 2.2520405486268116,
"grad_norm": 0.94921875,
"learning_rate": 2.4933578581647254e-06,
"loss": 0.6315,
"step": 47750
},
{
"epoch": 2.2543987124425566,
"grad_norm": 1.0546875,
"learning_rate": 2.4854973352827433e-06,
"loss": 0.6378,
"step": 47800
},
{
"epoch": 2.2567568762583017,
"grad_norm": 0.890625,
"learning_rate": 2.477636812400761e-06,
"loss": 0.6317,
"step": 47850
},
{
"epoch": 2.2591150400740463,
"grad_norm": 1.1015625,
"learning_rate": 2.4697762895187792e-06,
"loss": 0.6468,
"step": 47900
},
{
"epoch": 2.2614732038897913,
"grad_norm": 0.9296875,
"learning_rate": 2.4619157666367967e-06,
"loss": 0.6323,
"step": 47950
},
{
"epoch": 2.263831367705536,
"grad_norm": 1.0,
"learning_rate": 2.4540552437548147e-06,
"loss": 0.6431,
"step": 48000
},
{
"epoch": 2.263831367705536,
"eval_loss": 0.6410719156265259,
"eval_runtime": 471.4352,
"eval_samples_per_second": 75.747,
"eval_steps_per_second": 37.874,
"step": 48000
},
{
"epoch": 2.266189531521281,
"grad_norm": 1.09375,
"learning_rate": 2.4461947208728326e-06,
"loss": 0.6269,
"step": 48050
},
{
"epoch": 2.268547695337026,
"grad_norm": 1.03125,
"learning_rate": 2.4383341979908505e-06,
"loss": 0.6366,
"step": 48100
},
{
"epoch": 2.2709058591527707,
"grad_norm": 1.078125,
"learning_rate": 2.4304736751088685e-06,
"loss": 0.6364,
"step": 48150
},
{
"epoch": 2.2732640229685157,
"grad_norm": 0.96875,
"learning_rate": 2.4226131522268864e-06,
"loss": 0.6399,
"step": 48200
},
{
"epoch": 2.2756221867842603,
"grad_norm": 0.9453125,
"learning_rate": 2.4147526293449044e-06,
"loss": 0.6375,
"step": 48250
},
{
"epoch": 2.2779803506000054,
"grad_norm": 1.0390625,
"learning_rate": 2.4068921064629223e-06,
"loss": 0.6249,
"step": 48300
},
{
"epoch": 2.2803385144157504,
"grad_norm": 1.0390625,
"learning_rate": 2.39903158358094e-06,
"loss": 0.6211,
"step": 48350
},
{
"epoch": 2.282696678231495,
"grad_norm": 0.9609375,
"learning_rate": 2.391171060698958e-06,
"loss": 0.6343,
"step": 48400
},
{
"epoch": 2.28505484204724,
"grad_norm": 1.1796875,
"learning_rate": 2.3833105378169757e-06,
"loss": 0.6483,
"step": 48450
},
{
"epoch": 2.2874130058629847,
"grad_norm": 1.0546875,
"learning_rate": 2.3754500149349936e-06,
"loss": 0.6274,
"step": 48500
},
{
"epoch": 2.2874130058629847,
"eval_loss": 0.6411145329475403,
"eval_runtime": 472.0639,
"eval_samples_per_second": 75.647,
"eval_steps_per_second": 37.823,
"step": 48500
},
{
"epoch": 2.2897711696787297,
"grad_norm": 0.98046875,
"learning_rate": 2.3675894920530116e-06,
"loss": 0.6402,
"step": 48550
},
{
"epoch": 2.2921293334944743,
"grad_norm": 1.015625,
"learning_rate": 2.3597289691710295e-06,
"loss": 0.6439,
"step": 48600
},
{
"epoch": 2.2944874973102194,
"grad_norm": 1.0859375,
"learning_rate": 2.3518684462890474e-06,
"loss": 0.6451,
"step": 48650
},
{
"epoch": 2.2968456611259644,
"grad_norm": 0.92578125,
"learning_rate": 2.3440079234070654e-06,
"loss": 0.654,
"step": 48700
},
{
"epoch": 2.299203824941709,
"grad_norm": 1.1015625,
"learning_rate": 2.3361474005250833e-06,
"loss": 0.6352,
"step": 48750
},
{
"epoch": 2.301561988757454,
"grad_norm": 1.0625,
"learning_rate": 2.3282868776431012e-06,
"loss": 0.6397,
"step": 48800
},
{
"epoch": 2.3039201525731987,
"grad_norm": 1.1015625,
"learning_rate": 2.3204263547611188e-06,
"loss": 0.6356,
"step": 48850
},
{
"epoch": 2.3062783163889438,
"grad_norm": 1.0234375,
"learning_rate": 2.312565831879137e-06,
"loss": 0.6359,
"step": 48900
},
{
"epoch": 2.308636480204689,
"grad_norm": 1.0859375,
"learning_rate": 2.3047053089971546e-06,
"loss": 0.651,
"step": 48950
},
{
"epoch": 2.3109946440204334,
"grad_norm": 1.078125,
"learning_rate": 2.2968447861151726e-06,
"loss": 0.6382,
"step": 49000
},
{
"epoch": 2.3109946440204334,
"eval_loss": 0.6411119103431702,
"eval_runtime": 475.9105,
"eval_samples_per_second": 75.035,
"eval_steps_per_second": 37.518,
"step": 49000
},
{
"epoch": 2.3133528078361785,
"grad_norm": 0.984375,
"learning_rate": 2.2889842632331905e-06,
"loss": 0.6447,
"step": 49050
},
{
"epoch": 2.315710971651923,
"grad_norm": 0.99609375,
"learning_rate": 2.2811237403512084e-06,
"loss": 0.6475,
"step": 49100
},
{
"epoch": 2.318069135467668,
"grad_norm": 1.2265625,
"learning_rate": 2.2732632174692264e-06,
"loss": 0.641,
"step": 49150
},
{
"epoch": 2.3204272992834127,
"grad_norm": 1.1171875,
"learning_rate": 2.2654026945872443e-06,
"loss": 0.6286,
"step": 49200
},
{
"epoch": 2.322785463099158,
"grad_norm": 1.109375,
"learning_rate": 2.257542171705262e-06,
"loss": 0.6393,
"step": 49250
},
{
"epoch": 2.325143626914903,
"grad_norm": 0.9609375,
"learning_rate": 2.2496816488232798e-06,
"loss": 0.6358,
"step": 49300
},
{
"epoch": 2.3275017907306474,
"grad_norm": 1.2265625,
"learning_rate": 2.2418211259412977e-06,
"loss": 0.6508,
"step": 49350
},
{
"epoch": 2.3298599545463925,
"grad_norm": 1.0703125,
"learning_rate": 2.2339606030593156e-06,
"loss": 0.6309,
"step": 49400
},
{
"epoch": 2.332218118362137,
"grad_norm": 0.96484375,
"learning_rate": 2.2261000801773336e-06,
"loss": 0.6426,
"step": 49450
},
{
"epoch": 2.334576282177882,
"grad_norm": 1.0546875,
"learning_rate": 2.2182395572953515e-06,
"loss": 0.6387,
"step": 49500
},
{
"epoch": 2.334576282177882,
"eval_loss": 0.6411243081092834,
"eval_runtime": 471.3649,
"eval_samples_per_second": 75.759,
"eval_steps_per_second": 37.879,
"step": 49500
},
{
"epoch": 2.336934445993627,
"grad_norm": 1.3359375,
"learning_rate": 2.2103790344133695e-06,
"loss": 0.6445,
"step": 49550
},
{
"epoch": 2.339292609809372,
"grad_norm": 1.140625,
"learning_rate": 2.202518511531387e-06,
"loss": 0.6366,
"step": 49600
},
{
"epoch": 2.341650773625117,
"grad_norm": 1.0859375,
"learning_rate": 2.1946579886494053e-06,
"loss": 0.6341,
"step": 49650
},
{
"epoch": 2.3440089374408615,
"grad_norm": 1.0390625,
"learning_rate": 2.186797465767423e-06,
"loss": 0.6306,
"step": 49700
},
{
"epoch": 2.3463671012566065,
"grad_norm": 1.0703125,
"learning_rate": 2.1789369428854408e-06,
"loss": 0.646,
"step": 49750
},
{
"epoch": 2.3487252650723516,
"grad_norm": 1.015625,
"learning_rate": 2.1710764200034587e-06,
"loss": 0.6357,
"step": 49800
},
{
"epoch": 2.351083428888096,
"grad_norm": 1.0390625,
"learning_rate": 2.1632158971214767e-06,
"loss": 0.6339,
"step": 49850
},
{
"epoch": 2.3534415927038412,
"grad_norm": 1.2578125,
"learning_rate": 2.1553553742394946e-06,
"loss": 0.6479,
"step": 49900
},
{
"epoch": 2.355799756519586,
"grad_norm": 1.0546875,
"learning_rate": 2.1474948513575125e-06,
"loss": 0.6441,
"step": 49950
},
{
"epoch": 2.358157920335331,
"grad_norm": 1.0390625,
"learning_rate": 2.1396343284755305e-06,
"loss": 0.6278,
"step": 50000
},
{
"epoch": 2.358157920335331,
"eval_loss": 0.6410099267959595,
"eval_runtime": 474.9955,
"eval_samples_per_second": 75.18,
"eval_steps_per_second": 37.59,
"step": 50000
},
{
"epoch": 2.360516084151076,
"grad_norm": 1.0546875,
"learning_rate": 2.1317738055935484e-06,
"loss": 0.6313,
"step": 50050
},
{
"epoch": 2.3628742479668206,
"grad_norm": 1.1328125,
"learning_rate": 2.123913282711566e-06,
"loss": 0.6445,
"step": 50100
},
{
"epoch": 2.3652324117825656,
"grad_norm": 1.0703125,
"learning_rate": 2.1160527598295843e-06,
"loss": 0.6401,
"step": 50150
},
{
"epoch": 2.36759057559831,
"grad_norm": 1.09375,
"learning_rate": 2.108192236947602e-06,
"loss": 0.6343,
"step": 50200
},
{
"epoch": 2.3699487394140553,
"grad_norm": 1.015625,
"learning_rate": 2.1003317140656197e-06,
"loss": 0.6334,
"step": 50250
},
{
"epoch": 2.3723069032298003,
"grad_norm": 1.0859375,
"learning_rate": 2.0924711911836377e-06,
"loss": 0.6402,
"step": 50300
},
{
"epoch": 2.374665067045545,
"grad_norm": 0.98046875,
"learning_rate": 2.0846106683016556e-06,
"loss": 0.6575,
"step": 50350
},
{
"epoch": 2.37702323086129,
"grad_norm": 1.0625,
"learning_rate": 2.0767501454196735e-06,
"loss": 0.6326,
"step": 50400
},
{
"epoch": 2.3793813946770346,
"grad_norm": 0.94921875,
"learning_rate": 2.0688896225376915e-06,
"loss": 0.6403,
"step": 50450
},
{
"epoch": 2.3817395584927796,
"grad_norm": 1.0703125,
"learning_rate": 2.061029099655709e-06,
"loss": 0.6279,
"step": 50500
},
{
"epoch": 2.3817395584927796,
"eval_loss": 0.6410384774208069,
"eval_runtime": 474.5984,
"eval_samples_per_second": 75.243,
"eval_steps_per_second": 37.621,
"step": 50500
},
{
"epoch": 2.3840977223085247,
"grad_norm": 1.0234375,
"learning_rate": 2.0531685767737274e-06,
"loss": 0.6402,
"step": 50550
},
{
"epoch": 2.3864558861242693,
"grad_norm": 0.94140625,
"learning_rate": 2.045308053891745e-06,
"loss": 0.6405,
"step": 50600
},
{
"epoch": 2.3888140499400143,
"grad_norm": 0.98046875,
"learning_rate": 2.037447531009763e-06,
"loss": 0.6292,
"step": 50650
},
{
"epoch": 2.391172213755759,
"grad_norm": 1.0078125,
"learning_rate": 2.0295870081277807e-06,
"loss": 0.6528,
"step": 50700
},
{
"epoch": 2.393530377571504,
"grad_norm": 1.140625,
"learning_rate": 2.0217264852457987e-06,
"loss": 0.6471,
"step": 50750
},
{
"epoch": 2.395888541387249,
"grad_norm": 1.0625,
"learning_rate": 2.0138659623638166e-06,
"loss": 0.6442,
"step": 50800
},
{
"epoch": 2.3982467052029937,
"grad_norm": 1.0625,
"learning_rate": 2.0060054394818346e-06,
"loss": 0.6385,
"step": 50850
},
{
"epoch": 2.4006048690187387,
"grad_norm": 1.09375,
"learning_rate": 1.9981449165998525e-06,
"loss": 0.6197,
"step": 50900
},
{
"epoch": 2.4029630328344833,
"grad_norm": 0.9609375,
"learning_rate": 1.9902843937178704e-06,
"loss": 0.6268,
"step": 50950
},
{
"epoch": 2.4053211966502284,
"grad_norm": 1.0390625,
"learning_rate": 1.982423870835888e-06,
"loss": 0.6453,
"step": 51000
},
{
"epoch": 2.4053211966502284,
"eval_loss": 0.6410667896270752,
"eval_runtime": 478.4093,
"eval_samples_per_second": 74.643,
"eval_steps_per_second": 37.322,
"step": 51000
},
{
"epoch": 2.4076793604659734,
"grad_norm": 1.1484375,
"learning_rate": 1.9745633479539063e-06,
"loss": 0.6329,
"step": 51050
},
{
"epoch": 2.410037524281718,
"grad_norm": 0.9921875,
"learning_rate": 1.966702825071924e-06,
"loss": 0.6452,
"step": 51100
},
{
"epoch": 2.412395688097463,
"grad_norm": 1.109375,
"learning_rate": 1.9588423021899418e-06,
"loss": 0.6491,
"step": 51150
},
{
"epoch": 2.4147538519132077,
"grad_norm": 1.109375,
"learning_rate": 1.9509817793079597e-06,
"loss": 0.6388,
"step": 51200
},
{
"epoch": 2.4171120157289527,
"grad_norm": 1.0859375,
"learning_rate": 1.9431212564259776e-06,
"loss": 0.638,
"step": 51250
},
{
"epoch": 2.4194701795446973,
"grad_norm": 1.1640625,
"learning_rate": 1.9352607335439956e-06,
"loss": 0.6462,
"step": 51300
},
{
"epoch": 2.4218283433604424,
"grad_norm": 1.046875,
"learning_rate": 1.9274002106620135e-06,
"loss": 0.6339,
"step": 51350
},
{
"epoch": 2.4241865071761874,
"grad_norm": 1.015625,
"learning_rate": 1.9195396877800314e-06,
"loss": 0.6367,
"step": 51400
},
{
"epoch": 2.426544670991932,
"grad_norm": 1.1875,
"learning_rate": 1.9116791648980494e-06,
"loss": 0.6362,
"step": 51450
},
{
"epoch": 2.428902834807677,
"grad_norm": 1.0078125,
"learning_rate": 1.9038186420160671e-06,
"loss": 0.6493,
"step": 51500
},
{
"epoch": 2.428902834807677,
"eval_loss": 0.6410060524940491,
"eval_runtime": 471.4392,
"eval_samples_per_second": 75.747,
"eval_steps_per_second": 37.873,
"step": 51500
},
{
"epoch": 2.4312609986234217,
"grad_norm": 1.03125,
"learning_rate": 1.895958119134085e-06,
"loss": 0.6373,
"step": 51550
},
{
"epoch": 2.4336191624391668,
"grad_norm": 1.0703125,
"learning_rate": 1.8880975962521028e-06,
"loss": 0.6315,
"step": 51600
},
{
"epoch": 2.4359773262549114,
"grad_norm": 0.93359375,
"learning_rate": 1.880237073370121e-06,
"loss": 0.6253,
"step": 51650
},
{
"epoch": 2.4383354900706564,
"grad_norm": 1.0390625,
"learning_rate": 1.8723765504881386e-06,
"loss": 0.6361,
"step": 51700
},
{
"epoch": 2.4406936538864015,
"grad_norm": 1.2265625,
"learning_rate": 1.8645160276061566e-06,
"loss": 0.6379,
"step": 51750
},
{
"epoch": 2.443051817702146,
"grad_norm": 0.9765625,
"learning_rate": 1.8566555047241743e-06,
"loss": 0.6426,
"step": 51800
},
{
"epoch": 2.445409981517891,
"grad_norm": 1.0078125,
"learning_rate": 1.8487949818421925e-06,
"loss": 0.6332,
"step": 51850
},
{
"epoch": 2.4477681453336357,
"grad_norm": 1.015625,
"learning_rate": 1.8409344589602102e-06,
"loss": 0.6342,
"step": 51900
},
{
"epoch": 2.450126309149381,
"grad_norm": 1.203125,
"learning_rate": 1.8330739360782281e-06,
"loss": 0.6358,
"step": 51950
},
{
"epoch": 2.452484472965126,
"grad_norm": 1.09375,
"learning_rate": 1.8252134131962459e-06,
"loss": 0.6477,
"step": 52000
},
{
"epoch": 2.452484472965126,
"eval_loss": 0.6410579681396484,
"eval_runtime": 474.8277,
"eval_samples_per_second": 75.206,
"eval_steps_per_second": 37.603,
"step": 52000
},
{
"epoch": 2.4548426367808704,
"grad_norm": 1.2421875,
"learning_rate": 1.817352890314264e-06,
"loss": 0.6435,
"step": 52050
},
{
"epoch": 2.4572008005966155,
"grad_norm": 1.015625,
"learning_rate": 1.8094923674322817e-06,
"loss": 0.6395,
"step": 52100
},
{
"epoch": 2.45955896441236,
"grad_norm": 0.94140625,
"learning_rate": 1.8016318445502999e-06,
"loss": 0.6372,
"step": 52150
},
{
"epoch": 2.461917128228105,
"grad_norm": 1.0,
"learning_rate": 1.7937713216683176e-06,
"loss": 0.6291,
"step": 52200
},
{
"epoch": 2.46427529204385,
"grad_norm": 1.078125,
"learning_rate": 1.7859107987863353e-06,
"loss": 0.6459,
"step": 52250
},
{
"epoch": 2.466633455859595,
"grad_norm": 1.03125,
"learning_rate": 1.7780502759043533e-06,
"loss": 0.6448,
"step": 52300
},
{
"epoch": 2.46899161967534,
"grad_norm": 0.95703125,
"learning_rate": 1.770189753022371e-06,
"loss": 0.6313,
"step": 52350
},
{
"epoch": 2.4713497834910845,
"grad_norm": 1.0546875,
"learning_rate": 1.7623292301403891e-06,
"loss": 0.646,
"step": 52400
},
{
"epoch": 2.4737079473068295,
"grad_norm": 0.96875,
"learning_rate": 1.7544687072584069e-06,
"loss": 0.6389,
"step": 52450
},
{
"epoch": 2.4760661111225746,
"grad_norm": 0.9453125,
"learning_rate": 1.7466081843764248e-06,
"loss": 0.6378,
"step": 52500
},
{
"epoch": 2.4760661111225746,
"eval_loss": 0.6410920023918152,
"eval_runtime": 469.8066,
"eval_samples_per_second": 76.01,
"eval_steps_per_second": 38.005,
"step": 52500
},
{
"epoch": 2.478424274938319,
"grad_norm": 1.0078125,
"learning_rate": 1.7387476614944425e-06,
"loss": 0.6373,
"step": 52550
},
{
"epoch": 2.4807824387540642,
"grad_norm": 0.97265625,
"learning_rate": 1.7308871386124607e-06,
"loss": 0.6392,
"step": 52600
},
{
"epoch": 2.483140602569809,
"grad_norm": 1.0859375,
"learning_rate": 1.7230266157304784e-06,
"loss": 0.6453,
"step": 52650
},
{
"epoch": 2.485498766385554,
"grad_norm": 1.09375,
"learning_rate": 1.7151660928484963e-06,
"loss": 0.6282,
"step": 52700
},
{
"epoch": 2.487856930201299,
"grad_norm": 0.953125,
"learning_rate": 1.7073055699665143e-06,
"loss": 0.6431,
"step": 52750
},
{
"epoch": 2.4902150940170436,
"grad_norm": 1.0703125,
"learning_rate": 1.6994450470845322e-06,
"loss": 0.6384,
"step": 52800
},
{
"epoch": 2.4925732578327886,
"grad_norm": 1.109375,
"learning_rate": 1.69158452420255e-06,
"loss": 0.6305,
"step": 52850
},
{
"epoch": 2.494931421648533,
"grad_norm": 1.0,
"learning_rate": 1.683724001320568e-06,
"loss": 0.6403,
"step": 52900
},
{
"epoch": 2.4972895854642783,
"grad_norm": 1.1171875,
"learning_rate": 1.6758634784385858e-06,
"loss": 0.6364,
"step": 52950
},
{
"epoch": 2.4996477492800233,
"grad_norm": 1.078125,
"learning_rate": 1.6680029555566038e-06,
"loss": 0.6319,
"step": 53000
},
{
"epoch": 2.4996477492800233,
"eval_loss": 0.6410515904426575,
"eval_runtime": 470.4813,
"eval_samples_per_second": 75.901,
"eval_steps_per_second": 37.951,
"step": 53000
},
{
"epoch": 2.502005913095768,
"grad_norm": 1.171875,
"learning_rate": 1.6601424326746215e-06,
"loss": 0.6371,
"step": 53050
},
{
"epoch": 2.504364076911513,
"grad_norm": 0.9609375,
"learning_rate": 1.6522819097926396e-06,
"loss": 0.6368,
"step": 53100
},
{
"epoch": 2.5067222407272576,
"grad_norm": 1.1171875,
"learning_rate": 1.6444213869106574e-06,
"loss": 0.6327,
"step": 53150
},
{
"epoch": 2.5090804045430026,
"grad_norm": 0.95703125,
"learning_rate": 1.6365608640286753e-06,
"loss": 0.6395,
"step": 53200
},
{
"epoch": 2.5114385683587477,
"grad_norm": 1.1171875,
"learning_rate": 1.628700341146693e-06,
"loss": 0.6396,
"step": 53250
},
{
"epoch": 2.5137967321744923,
"grad_norm": 0.9296875,
"learning_rate": 1.6208398182647112e-06,
"loss": 0.6325,
"step": 53300
},
{
"epoch": 2.5161548959902373,
"grad_norm": 1.1328125,
"learning_rate": 1.6129792953827289e-06,
"loss": 0.632,
"step": 53350
},
{
"epoch": 2.518513059805982,
"grad_norm": 0.95703125,
"learning_rate": 1.605118772500747e-06,
"loss": 0.633,
"step": 53400
},
{
"epoch": 2.520871223621727,
"grad_norm": 0.9765625,
"learning_rate": 1.5972582496187648e-06,
"loss": 0.6432,
"step": 53450
},
{
"epoch": 2.523229387437472,
"grad_norm": 1.0390625,
"learning_rate": 1.5893977267367827e-06,
"loss": 0.6322,
"step": 53500
},
{
"epoch": 2.523229387437472,
"eval_loss": 0.641071081161499,
"eval_runtime": 470.808,
"eval_samples_per_second": 75.848,
"eval_steps_per_second": 37.924,
"step": 53500
},
{
"epoch": 2.5255875512532167,
"grad_norm": 0.98046875,
"learning_rate": 1.5815372038548004e-06,
"loss": 0.6297,
"step": 53550
},
{
"epoch": 2.5279457150689617,
"grad_norm": 0.97265625,
"learning_rate": 1.5736766809728186e-06,
"loss": 0.6453,
"step": 53600
},
{
"epoch": 2.5303038788847063,
"grad_norm": 1.015625,
"learning_rate": 1.5658161580908363e-06,
"loss": 0.6414,
"step": 53650
},
{
"epoch": 2.5326620427004514,
"grad_norm": 1.09375,
"learning_rate": 1.5579556352088542e-06,
"loss": 0.634,
"step": 53700
},
{
"epoch": 2.5350202065161964,
"grad_norm": 1.2265625,
"learning_rate": 1.550095112326872e-06,
"loss": 0.6368,
"step": 53750
},
{
"epoch": 2.537378370331941,
"grad_norm": 0.97265625,
"learning_rate": 1.5422345894448901e-06,
"loss": 0.6415,
"step": 53800
},
{
"epoch": 2.5397365341476856,
"grad_norm": 1.140625,
"learning_rate": 1.5343740665629078e-06,
"loss": 0.6283,
"step": 53850
},
{
"epoch": 2.5420946979634307,
"grad_norm": 1.1484375,
"learning_rate": 1.5265135436809258e-06,
"loss": 0.6401,
"step": 53900
},
{
"epoch": 2.5444528617791757,
"grad_norm": 0.95703125,
"learning_rate": 1.5186530207989437e-06,
"loss": 0.6371,
"step": 53950
},
{
"epoch": 2.546811025594921,
"grad_norm": 0.98046875,
"learning_rate": 1.5107924979169617e-06,
"loss": 0.6381,
"step": 54000
},
{
"epoch": 2.546811025594921,
"eval_loss": 0.6411101222038269,
"eval_runtime": 469.3138,
"eval_samples_per_second": 76.09,
"eval_steps_per_second": 38.045,
"step": 54000
},
{
"epoch": 2.5491691894106654,
"grad_norm": 1.1171875,
"learning_rate": 1.5029319750349794e-06,
"loss": 0.6299,
"step": 54050
},
{
"epoch": 2.55152735322641,
"grad_norm": 1.0625,
"learning_rate": 1.4950714521529975e-06,
"loss": 0.6359,
"step": 54100
},
{
"epoch": 2.553885517042155,
"grad_norm": 1.1484375,
"learning_rate": 1.4872109292710153e-06,
"loss": 0.6527,
"step": 54150
},
{
"epoch": 2.5562436808579,
"grad_norm": 1.0234375,
"learning_rate": 1.4793504063890332e-06,
"loss": 0.6355,
"step": 54200
},
{
"epoch": 2.5586018446736447,
"grad_norm": 1.140625,
"learning_rate": 1.471489883507051e-06,
"loss": 0.6521,
"step": 54250
},
{
"epoch": 2.5609600084893898,
"grad_norm": 1.015625,
"learning_rate": 1.463629360625069e-06,
"loss": 0.6477,
"step": 54300
},
{
"epoch": 2.5633181723051344,
"grad_norm": 0.99609375,
"learning_rate": 1.4557688377430868e-06,
"loss": 0.6466,
"step": 54350
},
{
"epoch": 2.5656763361208794,
"grad_norm": 1.421875,
"learning_rate": 1.4479083148611047e-06,
"loss": 0.6266,
"step": 54400
},
{
"epoch": 2.5680344999366245,
"grad_norm": 1.125,
"learning_rate": 1.4400477919791225e-06,
"loss": 0.6427,
"step": 54450
},
{
"epoch": 2.570392663752369,
"grad_norm": 1.0703125,
"learning_rate": 1.4321872690971406e-06,
"loss": 0.6375,
"step": 54500
},
{
"epoch": 2.570392663752369,
"eval_loss": 0.6410405039787292,
"eval_runtime": 470.8842,
"eval_samples_per_second": 75.836,
"eval_steps_per_second": 37.918,
"step": 54500
},
{
"epoch": 2.572750827568114,
"grad_norm": 1.0390625,
"learning_rate": 1.4243267462151583e-06,
"loss": 0.6553,
"step": 54550
},
{
"epoch": 2.5751089913838587,
"grad_norm": 1.0078125,
"learning_rate": 1.4164662233331763e-06,
"loss": 0.6436,
"step": 54600
},
{
"epoch": 2.577467155199604,
"grad_norm": 1.0625,
"learning_rate": 1.4086057004511942e-06,
"loss": 0.6224,
"step": 54650
},
{
"epoch": 2.579825319015349,
"grad_norm": 1.03125,
"learning_rate": 1.4007451775692121e-06,
"loss": 0.6298,
"step": 54700
},
{
"epoch": 2.5821834828310934,
"grad_norm": 1.0703125,
"learning_rate": 1.3928846546872299e-06,
"loss": 0.6294,
"step": 54750
},
{
"epoch": 2.5845416466468385,
"grad_norm": 1.1953125,
"learning_rate": 1.385024131805248e-06,
"loss": 0.6264,
"step": 54800
},
{
"epoch": 2.586899810462583,
"grad_norm": 1.09375,
"learning_rate": 1.3771636089232657e-06,
"loss": 0.6413,
"step": 54850
},
{
"epoch": 2.589257974278328,
"grad_norm": 1.234375,
"learning_rate": 1.3693030860412837e-06,
"loss": 0.6407,
"step": 54900
},
{
"epoch": 2.591616138094073,
"grad_norm": 1.015625,
"learning_rate": 1.3614425631593014e-06,
"loss": 0.6537,
"step": 54950
},
{
"epoch": 2.593974301909818,
"grad_norm": 1.3359375,
"learning_rate": 1.3535820402773196e-06,
"loss": 0.6404,
"step": 55000
},
{
"epoch": 2.593974301909818,
"eval_loss": 0.6411945223808289,
"eval_runtime": 472.1323,
"eval_samples_per_second": 75.636,
"eval_steps_per_second": 37.818,
"step": 55000
},
{
"epoch": 2.596332465725563,
"grad_norm": 1.03125,
"learning_rate": 1.3457215173953373e-06,
"loss": 0.6433,
"step": 55050
},
{
"epoch": 2.5986906295413075,
"grad_norm": 1.09375,
"learning_rate": 1.3378609945133552e-06,
"loss": 0.6418,
"step": 55100
},
{
"epoch": 2.6010487933570525,
"grad_norm": 1.0703125,
"learning_rate": 1.330000471631373e-06,
"loss": 0.6216,
"step": 55150
},
{
"epoch": 2.6034069571727976,
"grad_norm": 0.98046875,
"learning_rate": 1.322139948749391e-06,
"loss": 0.6438,
"step": 55200
},
{
"epoch": 2.605765120988542,
"grad_norm": 1.125,
"learning_rate": 1.3142794258674088e-06,
"loss": 0.633,
"step": 55250
},
{
"epoch": 2.6081232848042872,
"grad_norm": 1.015625,
"learning_rate": 1.3064189029854265e-06,
"loss": 0.6477,
"step": 55300
},
{
"epoch": 2.610481448620032,
"grad_norm": 1.03125,
"learning_rate": 1.2985583801034447e-06,
"loss": 0.6428,
"step": 55350
},
{
"epoch": 2.612839612435777,
"grad_norm": 1.25,
"learning_rate": 1.2906978572214624e-06,
"loss": 0.6387,
"step": 55400
},
{
"epoch": 2.615197776251522,
"grad_norm": 1.0234375,
"learning_rate": 1.2828373343394804e-06,
"loss": 0.6317,
"step": 55450
},
{
"epoch": 2.6175559400672666,
"grad_norm": 0.94140625,
"learning_rate": 1.274976811457498e-06,
"loss": 0.6466,
"step": 55500
},
{
"epoch": 2.6175559400672666,
"eval_loss": 0.6410502195358276,
"eval_runtime": 471.6129,
"eval_samples_per_second": 75.719,
"eval_steps_per_second": 37.859,
"step": 55500
},
{
"epoch": 2.6199141038830116,
"grad_norm": 1.0390625,
"learning_rate": 1.2671162885755162e-06,
"loss": 0.6439,
"step": 55550
},
{
"epoch": 2.622272267698756,
"grad_norm": 1.1015625,
"learning_rate": 1.259255765693534e-06,
"loss": 0.6274,
"step": 55600
},
{
"epoch": 2.6246304315145013,
"grad_norm": 0.97265625,
"learning_rate": 1.251395242811552e-06,
"loss": 0.6479,
"step": 55650
},
{
"epoch": 2.6269885953302463,
"grad_norm": 0.984375,
"learning_rate": 1.2435347199295698e-06,
"loss": 0.6389,
"step": 55700
},
{
"epoch": 2.629346759145991,
"grad_norm": 1.0078125,
"learning_rate": 1.2356741970475878e-06,
"loss": 0.6367,
"step": 55750
},
{
"epoch": 2.631704922961736,
"grad_norm": 1.0078125,
"learning_rate": 1.2278136741656057e-06,
"loss": 0.6448,
"step": 55800
},
{
"epoch": 2.6340630867774806,
"grad_norm": 1.171875,
"learning_rate": 1.2199531512836234e-06,
"loss": 0.634,
"step": 55850
},
{
"epoch": 2.6364212505932256,
"grad_norm": 1.046875,
"learning_rate": 1.2120926284016414e-06,
"loss": 0.6318,
"step": 55900
},
{
"epoch": 2.6387794144089707,
"grad_norm": 1.1171875,
"learning_rate": 1.2042321055196593e-06,
"loss": 0.6379,
"step": 55950
},
{
"epoch": 2.6411375782247153,
"grad_norm": 1.078125,
"learning_rate": 1.1963715826376772e-06,
"loss": 0.6417,
"step": 56000
},
{
"epoch": 2.6411375782247153,
"eval_loss": 0.6411082744598389,
"eval_runtime": 471.2106,
"eval_samples_per_second": 75.784,
"eval_steps_per_second": 37.892,
"step": 56000
},
{
"epoch": 2.6434957420404603,
"grad_norm": 1.0234375,
"learning_rate": 1.1885110597556952e-06,
"loss": 0.6384,
"step": 56050
},
{
"epoch": 2.645853905856205,
"grad_norm": 1.0,
"learning_rate": 1.180650536873713e-06,
"loss": 0.6415,
"step": 56100
},
{
"epoch": 2.64821206967195,
"grad_norm": 1.0859375,
"learning_rate": 1.1727900139917308e-06,
"loss": 0.6443,
"step": 56150
},
{
"epoch": 2.650570233487695,
"grad_norm": 1.1015625,
"learning_rate": 1.1649294911097488e-06,
"loss": 0.6414,
"step": 56200
},
{
"epoch": 2.6529283973034397,
"grad_norm": 1.0390625,
"learning_rate": 1.1570689682277667e-06,
"loss": 0.6386,
"step": 56250
},
{
"epoch": 2.6552865611191847,
"grad_norm": 1.0390625,
"learning_rate": 1.1492084453457844e-06,
"loss": 0.6233,
"step": 56300
},
{
"epoch": 2.6576447249349293,
"grad_norm": 1.2265625,
"learning_rate": 1.1413479224638024e-06,
"loss": 0.6364,
"step": 56350
},
{
"epoch": 2.6600028887506744,
"grad_norm": 0.9921875,
"learning_rate": 1.1334873995818203e-06,
"loss": 0.636,
"step": 56400
},
{
"epoch": 2.6623610525664194,
"grad_norm": 1.0703125,
"learning_rate": 1.1256268766998383e-06,
"loss": 0.6402,
"step": 56450
},
{
"epoch": 2.664719216382164,
"grad_norm": 1.0546875,
"learning_rate": 1.1177663538178562e-06,
"loss": 0.6517,
"step": 56500
},
{
"epoch": 2.664719216382164,
"eval_loss": 0.6410189867019653,
"eval_runtime": 471.5412,
"eval_samples_per_second": 75.73,
"eval_steps_per_second": 37.865,
"step": 56500
},
{
"epoch": 2.6670773801979086,
"grad_norm": 0.93359375,
"learning_rate": 1.109905830935874e-06,
"loss": 0.6403,
"step": 56550
},
{
"epoch": 2.6694355440136537,
"grad_norm": 1.0703125,
"learning_rate": 1.1020453080538919e-06,
"loss": 0.6357,
"step": 56600
},
{
"epoch": 2.6717937078293987,
"grad_norm": 1.140625,
"learning_rate": 1.0941847851719098e-06,
"loss": 0.6385,
"step": 56650
},
{
"epoch": 2.674151871645144,
"grad_norm": 1.0390625,
"learning_rate": 1.0863242622899277e-06,
"loss": 0.621,
"step": 56700
},
{
"epoch": 2.6765100354608884,
"grad_norm": 1.0625,
"learning_rate": 1.0784637394079455e-06,
"loss": 0.6335,
"step": 56750
},
{
"epoch": 2.678868199276633,
"grad_norm": 1.0,
"learning_rate": 1.0706032165259634e-06,
"loss": 0.6392,
"step": 56800
},
{
"epoch": 2.681226363092378,
"grad_norm": 1.1484375,
"learning_rate": 1.0627426936439811e-06,
"loss": 0.6418,
"step": 56850
},
{
"epoch": 2.683584526908123,
"grad_norm": 1.1171875,
"learning_rate": 1.054882170761999e-06,
"loss": 0.6334,
"step": 56900
},
{
"epoch": 2.6859426907238677,
"grad_norm": 0.9296875,
"learning_rate": 1.047021647880017e-06,
"loss": 0.6373,
"step": 56950
},
{
"epoch": 2.6883008545396128,
"grad_norm": 1.1171875,
"learning_rate": 1.039161124998035e-06,
"loss": 0.6361,
"step": 57000
},
{
"epoch": 2.6883008545396128,
"eval_loss": 0.6411524415016174,
"eval_runtime": 472.1931,
"eval_samples_per_second": 75.626,
"eval_steps_per_second": 37.813,
"step": 57000
},
{
"epoch": 2.6906590183553574,
"grad_norm": 0.98828125,
"learning_rate": 1.0313006021160529e-06,
"loss": 0.6436,
"step": 57050
},
{
"epoch": 2.6930171821711024,
"grad_norm": 0.9765625,
"learning_rate": 1.0234400792340706e-06,
"loss": 0.6409,
"step": 57100
},
{
"epoch": 2.6953753459868475,
"grad_norm": 1.5703125,
"learning_rate": 1.0155795563520885e-06,
"loss": 0.632,
"step": 57150
},
{
"epoch": 2.697733509802592,
"grad_norm": 1.015625,
"learning_rate": 1.0077190334701065e-06,
"loss": 0.6359,
"step": 57200
},
{
"epoch": 2.700091673618337,
"grad_norm": 1.1640625,
"learning_rate": 9.998585105881244e-07,
"loss": 0.6363,
"step": 57250
},
{
"epoch": 2.7024498374340817,
"grad_norm": 1.140625,
"learning_rate": 9.919979877061423e-07,
"loss": 0.6353,
"step": 57300
},
{
"epoch": 2.704808001249827,
"grad_norm": 1.1875,
"learning_rate": 9.8413746482416e-07,
"loss": 0.6131,
"step": 57350
},
{
"epoch": 2.707166165065572,
"grad_norm": 1.0546875,
"learning_rate": 9.76276941942178e-07,
"loss": 0.6279,
"step": 57400
},
{
"epoch": 2.7095243288813164,
"grad_norm": 1.125,
"learning_rate": 9.68416419060196e-07,
"loss": 0.6299,
"step": 57450
},
{
"epoch": 2.7118824926970615,
"grad_norm": 1.1640625,
"learning_rate": 9.605558961782139e-07,
"loss": 0.6513,
"step": 57500
},
{
"epoch": 2.7118824926970615,
"eval_loss": 0.6411252021789551,
"eval_runtime": 470.5871,
"eval_samples_per_second": 75.884,
"eval_steps_per_second": 37.942,
"step": 57500
},
{
"epoch": 2.714240656512806,
"grad_norm": 1.0546875,
"learning_rate": 9.526953732962317e-07,
"loss": 0.6413,
"step": 57550
},
{
"epoch": 2.716598820328551,
"grad_norm": 1.0625,
"learning_rate": 9.448348504142497e-07,
"loss": 0.6312,
"step": 57600
},
{
"epoch": 2.718956984144296,
"grad_norm": 1.0234375,
"learning_rate": 9.369743275322675e-07,
"loss": 0.6465,
"step": 57650
},
{
"epoch": 2.721315147960041,
"grad_norm": 1.21875,
"learning_rate": 9.291138046502854e-07,
"loss": 0.6352,
"step": 57700
},
{
"epoch": 2.723673311775786,
"grad_norm": 0.9765625,
"learning_rate": 9.212532817683033e-07,
"loss": 0.6452,
"step": 57750
},
{
"epoch": 2.7260314755915305,
"grad_norm": 1.0078125,
"learning_rate": 9.133927588863212e-07,
"loss": 0.6383,
"step": 57800
},
{
"epoch": 2.7283896394072755,
"grad_norm": 1.015625,
"learning_rate": 9.05532236004339e-07,
"loss": 0.6449,
"step": 57850
},
{
"epoch": 2.7307478032230206,
"grad_norm": 1.0078125,
"learning_rate": 8.97671713122357e-07,
"loss": 0.6313,
"step": 57900
},
{
"epoch": 2.733105967038765,
"grad_norm": 1.2265625,
"learning_rate": 8.898111902403749e-07,
"loss": 0.6402,
"step": 57950
},
{
"epoch": 2.7354641308545102,
"grad_norm": 0.98828125,
"learning_rate": 8.819506673583927e-07,
"loss": 0.6406,
"step": 58000
},
{
"epoch": 2.7354641308545102,
"eval_loss": 0.6410654783248901,
"eval_runtime": 469.5278,
"eval_samples_per_second": 76.055,
"eval_steps_per_second": 38.028,
"step": 58000
},
{
"epoch": 2.737822294670255,
"grad_norm": 1.1015625,
"learning_rate": 8.740901444764107e-07,
"loss": 0.6313,
"step": 58050
},
{
"epoch": 2.740180458486,
"grad_norm": 0.93359375,
"learning_rate": 8.662296215944285e-07,
"loss": 0.624,
"step": 58100
},
{
"epoch": 2.742538622301745,
"grad_norm": 1.0,
"learning_rate": 8.583690987124464e-07,
"loss": 0.6479,
"step": 58150
},
{
"epoch": 2.7448967861174896,
"grad_norm": 1.1171875,
"learning_rate": 8.505085758304643e-07,
"loss": 0.6311,
"step": 58200
},
{
"epoch": 2.7472549499332346,
"grad_norm": 1.0078125,
"learning_rate": 8.426480529484822e-07,
"loss": 0.6461,
"step": 58250
},
{
"epoch": 2.749613113748979,
"grad_norm": 1.1015625,
"learning_rate": 8.347875300665001e-07,
"loss": 0.6463,
"step": 58300
},
{
"epoch": 2.7519712775647243,
"grad_norm": 0.984375,
"learning_rate": 8.26927007184518e-07,
"loss": 0.6301,
"step": 58350
},
{
"epoch": 2.7543294413804693,
"grad_norm": 1.1640625,
"learning_rate": 8.190664843025359e-07,
"loss": 0.6478,
"step": 58400
},
{
"epoch": 2.756687605196214,
"grad_norm": 0.98828125,
"learning_rate": 8.112059614205537e-07,
"loss": 0.632,
"step": 58450
},
{
"epoch": 2.759045769011959,
"grad_norm": 1.0,
"learning_rate": 8.033454385385717e-07,
"loss": 0.6381,
"step": 58500
},
{
"epoch": 2.759045769011959,
"eval_loss": 0.6410369277000427,
"eval_runtime": 470.0761,
"eval_samples_per_second": 75.966,
"eval_steps_per_second": 37.983,
"step": 58500
},
{
"epoch": 2.7614039328277036,
"grad_norm": 1.0859375,
"learning_rate": 7.954849156565895e-07,
"loss": 0.6414,
"step": 58550
},
{
"epoch": 2.7637620966434486,
"grad_norm": 1.28125,
"learning_rate": 7.876243927746075e-07,
"loss": 0.6424,
"step": 58600
},
{
"epoch": 2.7661202604591937,
"grad_norm": 1.0625,
"learning_rate": 7.797638698926254e-07,
"loss": 0.6327,
"step": 58650
},
{
"epoch": 2.7684784242749383,
"grad_norm": 1.0234375,
"learning_rate": 7.719033470106432e-07,
"loss": 0.6454,
"step": 58700
},
{
"epoch": 2.7708365880906833,
"grad_norm": 1.0078125,
"learning_rate": 7.640428241286612e-07,
"loss": 0.6395,
"step": 58750
},
{
"epoch": 2.773194751906428,
"grad_norm": 0.98046875,
"learning_rate": 7.56182301246679e-07,
"loss": 0.6353,
"step": 58800
},
{
"epoch": 2.775552915722173,
"grad_norm": 0.9921875,
"learning_rate": 7.483217783646969e-07,
"loss": 0.6427,
"step": 58850
},
{
"epoch": 2.777911079537918,
"grad_norm": 1.1015625,
"learning_rate": 7.404612554827148e-07,
"loss": 0.6473,
"step": 58900
},
{
"epoch": 2.7802692433536627,
"grad_norm": 0.98828125,
"learning_rate": 7.326007326007327e-07,
"loss": 0.62,
"step": 58950
},
{
"epoch": 2.7826274071694077,
"grad_norm": 1.140625,
"learning_rate": 7.247402097187506e-07,
"loss": 0.6349,
"step": 59000
},
{
"epoch": 2.7826274071694077,
"eval_loss": 0.6410152912139893,
"eval_runtime": 473.057,
"eval_samples_per_second": 75.488,
"eval_steps_per_second": 37.744,
"step": 59000
},
{
"epoch": 2.7849855709851523,
"grad_norm": 1.125,
"learning_rate": 7.168796868367685e-07,
"loss": 0.64,
"step": 59050
},
{
"epoch": 2.7873437348008974,
"grad_norm": 1.296875,
"learning_rate": 7.090191639547864e-07,
"loss": 0.6439,
"step": 59100
},
{
"epoch": 2.7897018986166424,
"grad_norm": 0.98046875,
"learning_rate": 7.011586410728042e-07,
"loss": 0.6366,
"step": 59150
},
{
"epoch": 2.792060062432387,
"grad_norm": 0.93359375,
"learning_rate": 6.932981181908222e-07,
"loss": 0.6253,
"step": 59200
},
{
"epoch": 2.7944182262481316,
"grad_norm": 1.140625,
"learning_rate": 6.8543759530884e-07,
"loss": 0.6412,
"step": 59250
},
{
"epoch": 2.7967763900638767,
"grad_norm": 0.9921875,
"learning_rate": 6.775770724268579e-07,
"loss": 0.6454,
"step": 59300
},
{
"epoch": 2.7991345538796217,
"grad_norm": 1.0078125,
"learning_rate": 6.697165495448759e-07,
"loss": 0.6422,
"step": 59350
},
{
"epoch": 2.8014927176953663,
"grad_norm": 0.95703125,
"learning_rate": 6.618560266628937e-07,
"loss": 0.6319,
"step": 59400
},
{
"epoch": 2.8038508815111114,
"grad_norm": 1.0234375,
"learning_rate": 6.539955037809116e-07,
"loss": 0.6447,
"step": 59450
},
{
"epoch": 2.806209045326856,
"grad_norm": 1.1484375,
"learning_rate": 6.461349808989295e-07,
"loss": 0.6316,
"step": 59500
},
{
"epoch": 2.806209045326856,
"eval_loss": 0.6411675214767456,
"eval_runtime": 469.7714,
"eval_samples_per_second": 76.016,
"eval_steps_per_second": 38.008,
"step": 59500
},
{
"epoch": 2.808567209142601,
"grad_norm": 1.015625,
"learning_rate": 6.382744580169474e-07,
"loss": 0.6454,
"step": 59550
},
{
"epoch": 2.810925372958346,
"grad_norm": 1.109375,
"learning_rate": 6.304139351349652e-07,
"loss": 0.641,
"step": 59600
},
{
"epoch": 2.8132835367740907,
"grad_norm": 0.96875,
"learning_rate": 6.225534122529831e-07,
"loss": 0.6424,
"step": 59650
},
{
"epoch": 2.8156417005898358,
"grad_norm": 0.94921875,
"learning_rate": 6.14692889371001e-07,
"loss": 0.6486,
"step": 59700
},
{
"epoch": 2.8179998644055804,
"grad_norm": 0.9921875,
"learning_rate": 6.068323664890188e-07,
"loss": 0.6416,
"step": 59750
},
{
"epoch": 2.8203580282213254,
"grad_norm": 1.0703125,
"learning_rate": 5.989718436070368e-07,
"loss": 0.6306,
"step": 59800
},
{
"epoch": 2.8227161920370705,
"grad_norm": 0.94921875,
"learning_rate": 5.911113207250547e-07,
"loss": 0.6408,
"step": 59850
},
{
"epoch": 2.825074355852815,
"grad_norm": 0.93359375,
"learning_rate": 5.832507978430726e-07,
"loss": 0.6327,
"step": 59900
},
{
"epoch": 2.82743251966856,
"grad_norm": 1.078125,
"learning_rate": 5.753902749610905e-07,
"loss": 0.6402,
"step": 59950
},
{
"epoch": 2.8297906834843047,
"grad_norm": 1.2890625,
"learning_rate": 5.675297520791083e-07,
"loss": 0.6373,
"step": 60000
},
{
"epoch": 2.8297906834843047,
"eval_loss": 0.6411817669868469,
"eval_runtime": 470.2946,
"eval_samples_per_second": 75.931,
"eval_steps_per_second": 37.966,
"step": 60000
},
{
"epoch": 2.83214884730005,
"grad_norm": 1.0390625,
"learning_rate": 5.596692291971263e-07,
"loss": 0.642,
"step": 60050
},
{
"epoch": 2.834507011115795,
"grad_norm": 1.046875,
"learning_rate": 5.518087063151441e-07,
"loss": 0.635,
"step": 60100
},
{
"epoch": 2.8368651749315394,
"grad_norm": 0.94140625,
"learning_rate": 5.43948183433162e-07,
"loss": 0.6441,
"step": 60150
},
{
"epoch": 2.8392233387472845,
"grad_norm": 1.1015625,
"learning_rate": 5.3608766055118e-07,
"loss": 0.6256,
"step": 60200
},
{
"epoch": 2.841581502563029,
"grad_norm": 1.0234375,
"learning_rate": 5.282271376691978e-07,
"loss": 0.6383,
"step": 60250
},
{
"epoch": 2.843939666378774,
"grad_norm": 1.015625,
"learning_rate": 5.203666147872157e-07,
"loss": 0.653,
"step": 60300
},
{
"epoch": 2.846297830194519,
"grad_norm": 1.015625,
"learning_rate": 5.125060919052336e-07,
"loss": 0.6333,
"step": 60350
},
{
"epoch": 2.848655994010264,
"grad_norm": 0.91796875,
"learning_rate": 5.046455690232515e-07,
"loss": 0.6362,
"step": 60400
},
{
"epoch": 2.851014157826009,
"grad_norm": 1.0625,
"learning_rate": 4.967850461412694e-07,
"loss": 0.6434,
"step": 60450
},
{
"epoch": 2.8533723216417535,
"grad_norm": 1.078125,
"learning_rate": 4.889245232592872e-07,
"loss": 0.6395,
"step": 60500
},
{
"epoch": 2.8533723216417535,
"eval_loss": 0.6410468220710754,
"eval_runtime": 473.7881,
"eval_samples_per_second": 75.371,
"eval_steps_per_second": 37.686,
"step": 60500
},
{
"epoch": 2.8557304854574985,
"grad_norm": 1.0703125,
"learning_rate": 4.810640003773051e-07,
"loss": 0.6297,
"step": 60550
},
{
"epoch": 2.8580886492732436,
"grad_norm": 0.9375,
"learning_rate": 4.73203477495323e-07,
"loss": 0.6338,
"step": 60600
},
{
"epoch": 2.860446813088988,
"grad_norm": 0.94921875,
"learning_rate": 4.653429546133409e-07,
"loss": 0.6337,
"step": 60650
},
{
"epoch": 2.8628049769047332,
"grad_norm": 1.0234375,
"learning_rate": 4.5748243173135876e-07,
"loss": 0.6432,
"step": 60700
},
{
"epoch": 2.865163140720478,
"grad_norm": 1.09375,
"learning_rate": 4.496219088493767e-07,
"loss": 0.6343,
"step": 60750
},
{
"epoch": 2.867521304536223,
"grad_norm": 1.1328125,
"learning_rate": 4.417613859673946e-07,
"loss": 0.6397,
"step": 60800
},
{
"epoch": 2.869879468351968,
"grad_norm": 1.09375,
"learning_rate": 4.3390086308541247e-07,
"loss": 0.6363,
"step": 60850
},
{
"epoch": 2.8722376321677126,
"grad_norm": 1.0546875,
"learning_rate": 4.2604034020343035e-07,
"loss": 0.6337,
"step": 60900
},
{
"epoch": 2.8745957959834576,
"grad_norm": 1.046875,
"learning_rate": 4.1817981732144824e-07,
"loss": 0.64,
"step": 60950
},
{
"epoch": 2.876953959799202,
"grad_norm": 1.109375,
"learning_rate": 4.103192944394661e-07,
"loss": 0.6338,
"step": 61000
},
{
"epoch": 2.876953959799202,
"eval_loss": 0.6411173343658447,
"eval_runtime": 469.9749,
"eval_samples_per_second": 75.983,
"eval_steps_per_second": 37.991,
"step": 61000
},
{
"epoch": 2.8793121236149473,
"grad_norm": 0.984375,
"learning_rate": 4.02458771557484e-07,
"loss": 0.6353,
"step": 61050
},
{
"epoch": 2.8816702874306923,
"grad_norm": 1.34375,
"learning_rate": 3.9459824867550194e-07,
"loss": 0.6449,
"step": 61100
},
{
"epoch": 2.884028451246437,
"grad_norm": 1.3125,
"learning_rate": 3.8673772579351983e-07,
"loss": 0.6456,
"step": 61150
},
{
"epoch": 2.886386615062182,
"grad_norm": 1.03125,
"learning_rate": 3.788772029115377e-07,
"loss": 0.6267,
"step": 61200
},
{
"epoch": 2.8887447788779266,
"grad_norm": 1.046875,
"learning_rate": 3.710166800295556e-07,
"loss": 0.6437,
"step": 61250
},
{
"epoch": 2.8911029426936716,
"grad_norm": 1.0625,
"learning_rate": 3.631561571475735e-07,
"loss": 0.6396,
"step": 61300
},
{
"epoch": 2.8934611065094167,
"grad_norm": 1.0546875,
"learning_rate": 3.5529563426559137e-07,
"loss": 0.6386,
"step": 61350
},
{
"epoch": 2.8958192703251613,
"grad_norm": 1.0390625,
"learning_rate": 3.4743511138360925e-07,
"loss": 0.637,
"step": 61400
},
{
"epoch": 2.8981774341409063,
"grad_norm": 1.0859375,
"learning_rate": 3.395745885016272e-07,
"loss": 0.6291,
"step": 61450
},
{
"epoch": 2.900535597956651,
"grad_norm": 1.078125,
"learning_rate": 3.3171406561964507e-07,
"loss": 0.6411,
"step": 61500
},
{
"epoch": 2.900535597956651,
"eval_loss": 0.6411393284797668,
"eval_runtime": 474.1568,
"eval_samples_per_second": 75.313,
"eval_steps_per_second": 37.656,
"step": 61500
},
{
"epoch": 2.902893761772396,
"grad_norm": 1.0859375,
"learning_rate": 3.2385354273766296e-07,
"loss": 0.6399,
"step": 61550
},
{
"epoch": 2.905251925588141,
"grad_norm": 0.99609375,
"learning_rate": 3.1599301985568084e-07,
"loss": 0.6441,
"step": 61600
},
{
"epoch": 2.9076100894038857,
"grad_norm": 1.046875,
"learning_rate": 3.081324969736987e-07,
"loss": 0.6298,
"step": 61650
},
{
"epoch": 2.9099682532196303,
"grad_norm": 1.0546875,
"learning_rate": 3.002719740917166e-07,
"loss": 0.6394,
"step": 61700
},
{
"epoch": 2.9123264170353753,
"grad_norm": 1.046875,
"learning_rate": 2.924114512097345e-07,
"loss": 0.6328,
"step": 61750
},
{
"epoch": 2.9146845808511204,
"grad_norm": 0.9765625,
"learning_rate": 2.845509283277524e-07,
"loss": 0.6292,
"step": 61800
},
{
"epoch": 2.9170427446668654,
"grad_norm": 1.03125,
"learning_rate": 2.7669040544577026e-07,
"loss": 0.6324,
"step": 61850
},
{
"epoch": 2.91940090848261,
"grad_norm": 1.171875,
"learning_rate": 2.6882988256378815e-07,
"loss": 0.6342,
"step": 61900
},
{
"epoch": 2.9217590722983546,
"grad_norm": 1.078125,
"learning_rate": 2.6096935968180603e-07,
"loss": 0.6352,
"step": 61950
},
{
"epoch": 2.9241172361140997,
"grad_norm": 1.0546875,
"learning_rate": 2.5310883679982397e-07,
"loss": 0.6279,
"step": 62000
},
{
"epoch": 2.9241172361140997,
"eval_loss": 0.6410698294639587,
"eval_runtime": 471.6688,
"eval_samples_per_second": 75.71,
"eval_steps_per_second": 37.855,
"step": 62000
},
{
"epoch": 2.9264753999298447,
"grad_norm": 0.96875,
"learning_rate": 2.4524831391784185e-07,
"loss": 0.6303,
"step": 62050
},
{
"epoch": 2.9288335637455893,
"grad_norm": 1.1953125,
"learning_rate": 2.3738779103585974e-07,
"loss": 0.6424,
"step": 62100
},
{
"epoch": 2.9311917275613344,
"grad_norm": 1.1953125,
"learning_rate": 2.2952726815387762e-07,
"loss": 0.6429,
"step": 62150
},
{
"epoch": 2.933549891377079,
"grad_norm": 1.0859375,
"learning_rate": 2.216667452718955e-07,
"loss": 0.6431,
"step": 62200
},
{
"epoch": 2.935908055192824,
"grad_norm": 1.0390625,
"learning_rate": 2.1380622238991342e-07,
"loss": 0.6369,
"step": 62250
},
{
"epoch": 2.938266219008569,
"grad_norm": 1.0234375,
"learning_rate": 2.059456995079313e-07,
"loss": 0.6399,
"step": 62300
},
{
"epoch": 2.9406243828243137,
"grad_norm": 1.1875,
"learning_rate": 1.9808517662594916e-07,
"loss": 0.6258,
"step": 62350
},
{
"epoch": 2.9429825466400588,
"grad_norm": 1.0,
"learning_rate": 1.9022465374396705e-07,
"loss": 0.6411,
"step": 62400
},
{
"epoch": 2.9453407104558034,
"grad_norm": 0.9296875,
"learning_rate": 1.8236413086198493e-07,
"loss": 0.6371,
"step": 62450
},
{
"epoch": 2.9476988742715484,
"grad_norm": 1.078125,
"learning_rate": 1.7450360798000284e-07,
"loss": 0.6454,
"step": 62500
},
{
"epoch": 2.9476988742715484,
"eval_loss": 0.6410679221153259,
"eval_runtime": 471.0373,
"eval_samples_per_second": 75.811,
"eval_steps_per_second": 37.906,
"step": 62500
},
{
"epoch": 2.9500570380872935,
"grad_norm": 0.99609375,
"learning_rate": 1.6664308509802073e-07,
"loss": 0.6419,
"step": 62550
},
{
"epoch": 2.952415201903038,
"grad_norm": 1.078125,
"learning_rate": 1.587825622160386e-07,
"loss": 0.6397,
"step": 62600
},
{
"epoch": 2.954773365718783,
"grad_norm": 1.0,
"learning_rate": 1.5092203933405652e-07,
"loss": 0.6429,
"step": 62650
},
{
"epoch": 2.9571315295345277,
"grad_norm": 1.09375,
"learning_rate": 1.430615164520744e-07,
"loss": 0.6389,
"step": 62700
},
{
"epoch": 2.959489693350273,
"grad_norm": 0.921875,
"learning_rate": 1.352009935700923e-07,
"loss": 0.6471,
"step": 62750
},
{
"epoch": 2.961847857166018,
"grad_norm": 1.0234375,
"learning_rate": 1.2734047068811018e-07,
"loss": 0.6362,
"step": 62800
},
{
"epoch": 2.9642060209817624,
"grad_norm": 0.8984375,
"learning_rate": 1.1947994780612809e-07,
"loss": 0.6305,
"step": 62850
},
{
"epoch": 2.9665641847975075,
"grad_norm": 1.015625,
"learning_rate": 1.1161942492414596e-07,
"loss": 0.642,
"step": 62900
},
{
"epoch": 2.968922348613252,
"grad_norm": 1.0234375,
"learning_rate": 1.0375890204216384e-07,
"loss": 0.6424,
"step": 62950
},
{
"epoch": 2.971280512428997,
"grad_norm": 0.9765625,
"learning_rate": 9.589837916018174e-08,
"loss": 0.6486,
"step": 63000
},
{
"epoch": 2.971280512428997,
"eval_loss": 0.6411649584770203,
"eval_runtime": 470.5283,
"eval_samples_per_second": 75.893,
"eval_steps_per_second": 37.947,
"step": 63000
},
{
"epoch": 2.973638676244742,
"grad_norm": 1.1015625,
"learning_rate": 8.803785627819964e-08,
"loss": 0.6288,
"step": 63050
},
{
"epoch": 2.975996840060487,
"grad_norm": 1.0078125,
"learning_rate": 8.017733339621752e-08,
"loss": 0.643,
"step": 63100
},
{
"epoch": 2.978355003876232,
"grad_norm": 1.0703125,
"learning_rate": 7.231681051423542e-08,
"loss": 0.6484,
"step": 63150
},
{
"epoch": 2.9807131676919765,
"grad_norm": 1.375,
"learning_rate": 6.44562876322533e-08,
"loss": 0.6447,
"step": 63200
},
{
"epoch": 2.9830713315077215,
"grad_norm": 1.078125,
"learning_rate": 5.659576475027119e-08,
"loss": 0.6435,
"step": 63250
},
{
"epoch": 2.9854294953234666,
"grad_norm": 1.15625,
"learning_rate": 4.873524186828909e-08,
"loss": 0.6504,
"step": 63300
},
{
"epoch": 2.987787659139211,
"grad_norm": 1.078125,
"learning_rate": 4.087471898630698e-08,
"loss": 0.6366,
"step": 63350
},
{
"epoch": 2.9901458229549562,
"grad_norm": 1.046875,
"learning_rate": 3.301419610432486e-08,
"loss": 0.6427,
"step": 63400
},
{
"epoch": 2.992503986770701,
"grad_norm": 1.1015625,
"learning_rate": 2.515367322234275e-08,
"loss": 0.6422,
"step": 63450
},
{
"epoch": 2.994862150586446,
"grad_norm": 1.0390625,
"learning_rate": 1.729315034036064e-08,
"loss": 0.6265,
"step": 63500
},
{
"epoch": 2.994862150586446,
"eval_loss": 0.6410553455352783,
"eval_runtime": 471.7509,
"eval_samples_per_second": 75.697,
"eval_steps_per_second": 37.848,
"step": 63500
},
{
"epoch": 2.997220314402191,
"grad_norm": 1.0390625,
"learning_rate": 9.432627458378533e-09,
"loss": 0.6428,
"step": 63550
},
{
"epoch": 2.9995784782179356,
"grad_norm": 1.265625,
"learning_rate": 1.572104576396422e-09,
"loss": 0.6394,
"step": 63600
}
],
"logging_steps": 50,
"max_steps": 63609,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 3
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.845666644465661e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}