InnerLoopARMTForCausalLM_run_34 / trainer_state.json
irodkin's picture
Training checkpoint at step 14000
2cd98f4 verified
{
"best_global_step": 14000,
"best_metric": 2.527418851852417,
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/google/gemma-3-1b-it/linear_adamw_wd1e-03_8x1024_mem32_bs64_hf_armt_dmem64/run_34/checkpoint-14000",
"epoch": 0.28,
"eval_steps": 100,
"global_step": 14000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005,
"grad_norm": 96.04050869121504,
"learning_rate": 4.8e-08,
"loss": 3.4391,
"step": 25
},
{
"epoch": 0.001,
"grad_norm": 78.95958818615539,
"learning_rate": 9.8e-08,
"loss": 3.397,
"step": 50
},
{
"epoch": 0.0015,
"grad_norm": 61.45018428703237,
"learning_rate": 1.4800000000000003e-07,
"loss": 3.297,
"step": 75
},
{
"epoch": 0.002,
"grad_norm": 22.353651858428393,
"learning_rate": 1.9800000000000003e-07,
"loss": 3.1733,
"step": 100
},
{
"epoch": 0.002,
"eval_loss": 3.09375,
"eval_runtime": 42.6579,
"eval_samples_per_second": 2.438,
"eval_steps_per_second": 1.219,
"step": 100
},
{
"epoch": 0.0025,
"grad_norm": 9.835689068347888,
"learning_rate": 2.48e-07,
"loss": 3.0557,
"step": 125
},
{
"epoch": 0.003,
"grad_norm": 8.293191220823632,
"learning_rate": 2.9800000000000005e-07,
"loss": 2.9954,
"step": 150
},
{
"epoch": 0.0035,
"grad_norm": 6.660135091710579,
"learning_rate": 3.48e-07,
"loss": 2.9504,
"step": 175
},
{
"epoch": 0.004,
"grad_norm": 13.605532098937575,
"learning_rate": 3.9800000000000004e-07,
"loss": 2.9363,
"step": 200
},
{
"epoch": 0.004,
"eval_loss": 2.924128532409668,
"eval_runtime": 42.5415,
"eval_samples_per_second": 2.445,
"eval_steps_per_second": 1.222,
"step": 200
},
{
"epoch": 0.0045,
"grad_norm": 7.7985826788732435,
"learning_rate": 4.4800000000000004e-07,
"loss": 2.9223,
"step": 225
},
{
"epoch": 0.005,
"grad_norm": 7.257382344220691,
"learning_rate": 4.98e-07,
"loss": 2.9043,
"step": 250
},
{
"epoch": 0.0055,
"grad_norm": 9.049674458422025,
"learning_rate": 5.480000000000001e-07,
"loss": 2.8984,
"step": 275
},
{
"epoch": 0.006,
"grad_norm": 5.766079229639856,
"learning_rate": 5.98e-07,
"loss": 2.8898,
"step": 300
},
{
"epoch": 0.006,
"eval_loss": 2.877253532409668,
"eval_runtime": 42.642,
"eval_samples_per_second": 2.439,
"eval_steps_per_second": 1.219,
"step": 300
},
{
"epoch": 0.0065,
"grad_norm": 5.448754520618337,
"learning_rate": 6.48e-07,
"loss": 2.871,
"step": 325
},
{
"epoch": 0.007,
"grad_norm": 6.866471472157179,
"learning_rate": 6.98e-07,
"loss": 2.8693,
"step": 350
},
{
"epoch": 0.0075,
"grad_norm": 6.115788528016365,
"learning_rate": 7.480000000000001e-07,
"loss": 2.8601,
"step": 375
},
{
"epoch": 0.008,
"grad_norm": 5.871468919197367,
"learning_rate": 7.98e-07,
"loss": 2.8555,
"step": 400
},
{
"epoch": 0.008,
"eval_loss": 2.848106861114502,
"eval_runtime": 42.3632,
"eval_samples_per_second": 2.455,
"eval_steps_per_second": 1.227,
"step": 400
},
{
"epoch": 0.0085,
"grad_norm": 6.050804087803095,
"learning_rate": 8.480000000000001e-07,
"loss": 2.832,
"step": 425
},
{
"epoch": 0.009,
"grad_norm": 4.634127162302958,
"learning_rate": 8.980000000000001e-07,
"loss": 2.8418,
"step": 450
},
{
"epoch": 0.0095,
"grad_norm": 5.700549652048682,
"learning_rate": 9.480000000000001e-07,
"loss": 2.8351,
"step": 475
},
{
"epoch": 0.01,
"grad_norm": 5.462019159507559,
"learning_rate": 9.98e-07,
"loss": 2.8319,
"step": 500
},
{
"epoch": 0.01,
"eval_loss": 2.828125,
"eval_runtime": 42.4078,
"eval_samples_per_second": 2.452,
"eval_steps_per_second": 1.226,
"step": 500
},
{
"epoch": 0.0105,
"grad_norm": 5.100237356575638,
"learning_rate": 1.0480000000000002e-06,
"loss": 2.8368,
"step": 525
},
{
"epoch": 0.011,
"grad_norm": 5.8591675831655134,
"learning_rate": 1.0980000000000001e-06,
"loss": 2.8262,
"step": 550
},
{
"epoch": 0.0115,
"grad_norm": 4.582188259829454,
"learning_rate": 1.148e-06,
"loss": 2.8083,
"step": 575
},
{
"epoch": 0.012,
"grad_norm": 4.853482247652135,
"learning_rate": 1.1980000000000002e-06,
"loss": 2.8187,
"step": 600
},
{
"epoch": 0.012,
"eval_loss": 2.810246467590332,
"eval_runtime": 42.429,
"eval_samples_per_second": 2.451,
"eval_steps_per_second": 1.226,
"step": 600
},
{
"epoch": 0.0125,
"grad_norm": 4.813324366644894,
"learning_rate": 1.248e-06,
"loss": 2.8109,
"step": 625
},
{
"epoch": 0.013,
"grad_norm": 4.680021008982155,
"learning_rate": 1.2980000000000001e-06,
"loss": 2.8071,
"step": 650
},
{
"epoch": 0.0135,
"grad_norm": 4.232572917961915,
"learning_rate": 1.348e-06,
"loss": 2.7996,
"step": 675
},
{
"epoch": 0.014,
"grad_norm": 4.140300235345937,
"learning_rate": 1.3980000000000002e-06,
"loss": 2.7965,
"step": 700
},
{
"epoch": 0.014,
"eval_loss": 2.795973539352417,
"eval_runtime": 42.2781,
"eval_samples_per_second": 2.46,
"eval_steps_per_second": 1.23,
"step": 700
},
{
"epoch": 0.0145,
"grad_norm": 4.066322921244863,
"learning_rate": 1.4480000000000002e-06,
"loss": 2.7892,
"step": 725
},
{
"epoch": 0.015,
"grad_norm": 4.790524346969656,
"learning_rate": 1.498e-06,
"loss": 2.7776,
"step": 750
},
{
"epoch": 0.0155,
"grad_norm": 4.814208015592297,
"learning_rate": 1.548e-06,
"loss": 2.7904,
"step": 775
},
{
"epoch": 0.016,
"grad_norm": 3.495397019361677,
"learning_rate": 1.5980000000000002e-06,
"loss": 2.7771,
"step": 800
},
{
"epoch": 0.016,
"eval_loss": 2.783353328704834,
"eval_runtime": 45.2475,
"eval_samples_per_second": 2.298,
"eval_steps_per_second": 1.149,
"step": 800
},
{
"epoch": 0.0165,
"grad_norm": 4.509827964168959,
"learning_rate": 1.6480000000000001e-06,
"loss": 2.7864,
"step": 825
},
{
"epoch": 0.017,
"grad_norm": 3.396755590212729,
"learning_rate": 1.6980000000000003e-06,
"loss": 2.7665,
"step": 850
},
{
"epoch": 0.0175,
"grad_norm": 3.6908600934389364,
"learning_rate": 1.7480000000000002e-06,
"loss": 2.7784,
"step": 875
},
{
"epoch": 0.018,
"grad_norm": 4.517092572588064,
"learning_rate": 1.798e-06,
"loss": 2.7718,
"step": 900
},
{
"epoch": 0.018,
"eval_loss": 2.772385835647583,
"eval_runtime": 42.1503,
"eval_samples_per_second": 2.467,
"eval_steps_per_second": 1.234,
"step": 900
},
{
"epoch": 0.0185,
"grad_norm": 4.1527970820269635,
"learning_rate": 1.8480000000000001e-06,
"loss": 2.7592,
"step": 925
},
{
"epoch": 0.019,
"grad_norm": 4.093946260210414,
"learning_rate": 1.898e-06,
"loss": 2.7728,
"step": 950
},
{
"epoch": 0.0195,
"grad_norm": 3.794409923219389,
"learning_rate": 1.9480000000000002e-06,
"loss": 2.7757,
"step": 975
},
{
"epoch": 0.02,
"grad_norm": 3.128018180220031,
"learning_rate": 1.998e-06,
"loss": 2.7614,
"step": 1000
},
{
"epoch": 0.02,
"eval_loss": 2.764573335647583,
"eval_runtime": 42.2226,
"eval_samples_per_second": 2.463,
"eval_steps_per_second": 1.232,
"step": 1000
},
{
"epoch": 0.0205,
"grad_norm": 3.8078874128993667,
"learning_rate": 2.048e-06,
"loss": 2.7629,
"step": 1025
},
{
"epoch": 0.021,
"grad_norm": 3.50724949935112,
"learning_rate": 2.098e-06,
"loss": 2.776,
"step": 1050
},
{
"epoch": 0.0215,
"grad_norm": 3.600343997799952,
"learning_rate": 2.148e-06,
"loss": 2.7503,
"step": 1075
},
{
"epoch": 0.022,
"grad_norm": 3.4227590286591667,
"learning_rate": 2.198e-06,
"loss": 2.7522,
"step": 1100
},
{
"epoch": 0.022,
"eval_loss": 2.754957914352417,
"eval_runtime": 42.1456,
"eval_samples_per_second": 2.468,
"eval_steps_per_second": 1.234,
"step": 1100
},
{
"epoch": 0.0225,
"grad_norm": 3.6214573340756178,
"learning_rate": 2.2480000000000003e-06,
"loss": 2.7423,
"step": 1125
},
{
"epoch": 0.023,
"grad_norm": 4.963456774283441,
"learning_rate": 2.2980000000000003e-06,
"loss": 2.7473,
"step": 1150
},
{
"epoch": 0.0235,
"grad_norm": 4.417511515875024,
"learning_rate": 2.3480000000000002e-06,
"loss": 2.7458,
"step": 1175
},
{
"epoch": 0.024,
"grad_norm": 3.4640266757488054,
"learning_rate": 2.398e-06,
"loss": 2.755,
"step": 1200
},
{
"epoch": 0.024,
"eval_loss": 2.744741678237915,
"eval_runtime": 42.2958,
"eval_samples_per_second": 2.459,
"eval_steps_per_second": 1.229,
"step": 1200
},
{
"epoch": 0.0245,
"grad_norm": 3.8906187945336637,
"learning_rate": 2.448e-06,
"loss": 2.7413,
"step": 1225
},
{
"epoch": 0.025,
"grad_norm": 4.103531427287993,
"learning_rate": 2.498e-06,
"loss": 2.7464,
"step": 1250
},
{
"epoch": 0.0255,
"grad_norm": 3.7381187683762565,
"learning_rate": 2.5480000000000004e-06,
"loss": 2.7383,
"step": 1275
},
{
"epoch": 0.026,
"grad_norm": 4.019695597142381,
"learning_rate": 2.598e-06,
"loss": 2.7286,
"step": 1300
},
{
"epoch": 0.026,
"eval_loss": 2.735727071762085,
"eval_runtime": 42.1778,
"eval_samples_per_second": 2.466,
"eval_steps_per_second": 1.233,
"step": 1300
},
{
"epoch": 0.0265,
"grad_norm": 3.761754015207239,
"learning_rate": 2.648e-06,
"loss": 2.7508,
"step": 1325
},
{
"epoch": 0.027,
"grad_norm": 3.5172792845513023,
"learning_rate": 2.6980000000000003e-06,
"loss": 2.7396,
"step": 1350
},
{
"epoch": 0.0275,
"grad_norm": 3.6926838130981556,
"learning_rate": 2.748e-06,
"loss": 2.7286,
"step": 1375
},
{
"epoch": 0.028,
"grad_norm": 3.5018547073145,
"learning_rate": 2.798e-06,
"loss": 2.7247,
"step": 1400
},
{
"epoch": 0.028,
"eval_loss": 2.728515625,
"eval_runtime": 42.129,
"eval_samples_per_second": 2.469,
"eval_steps_per_second": 1.234,
"step": 1400
},
{
"epoch": 0.0285,
"grad_norm": 3.575054037567428,
"learning_rate": 2.848e-06,
"loss": 2.7229,
"step": 1425
},
{
"epoch": 0.029,
"grad_norm": 4.062924067051664,
"learning_rate": 2.8980000000000005e-06,
"loss": 2.7208,
"step": 1450
},
{
"epoch": 0.0295,
"grad_norm": 3.5741121733868573,
"learning_rate": 2.9480000000000004e-06,
"loss": 2.7071,
"step": 1475
},
{
"epoch": 0.03,
"grad_norm": 3.9813713940318864,
"learning_rate": 2.9980000000000003e-06,
"loss": 2.729,
"step": 1500
},
{
"epoch": 0.03,
"eval_loss": 2.721153736114502,
"eval_runtime": 42.058,
"eval_samples_per_second": 2.473,
"eval_steps_per_second": 1.236,
"step": 1500
},
{
"epoch": 0.0305,
"grad_norm": 4.465898046671721,
"learning_rate": 3.0480000000000003e-06,
"loss": 2.7239,
"step": 1525
},
{
"epoch": 0.031,
"grad_norm": 4.083780430751083,
"learning_rate": 3.0980000000000007e-06,
"loss": 2.7177,
"step": 1550
},
{
"epoch": 0.0315,
"grad_norm": 3.259296223054617,
"learning_rate": 3.1480000000000006e-06,
"loss": 2.7149,
"step": 1575
},
{
"epoch": 0.032,
"grad_norm": 4.118900376683919,
"learning_rate": 3.198e-06,
"loss": 2.7157,
"step": 1600
},
{
"epoch": 0.032,
"eval_loss": 2.714693546295166,
"eval_runtime": 42.155,
"eval_samples_per_second": 2.467,
"eval_steps_per_second": 1.234,
"step": 1600
},
{
"epoch": 0.0325,
"grad_norm": 3.7685203077928335,
"learning_rate": 3.248e-06,
"loss": 2.7185,
"step": 1625
},
{
"epoch": 0.033,
"grad_norm": 3.786239665874637,
"learning_rate": 3.298e-06,
"loss": 2.694,
"step": 1650
},
{
"epoch": 0.0335,
"grad_norm": 4.0202339796786095,
"learning_rate": 3.348e-06,
"loss": 2.7076,
"step": 1675
},
{
"epoch": 0.034,
"grad_norm": 3.220912468646897,
"learning_rate": 3.3980000000000003e-06,
"loss": 2.7086,
"step": 1700
},
{
"epoch": 0.034,
"eval_loss": 2.708683967590332,
"eval_runtime": 42.1812,
"eval_samples_per_second": 2.466,
"eval_steps_per_second": 1.233,
"step": 1700
},
{
"epoch": 0.0345,
"grad_norm": 3.4236457763643964,
"learning_rate": 3.4480000000000003e-06,
"loss": 2.7107,
"step": 1725
},
{
"epoch": 0.035,
"grad_norm": 3.428424878937346,
"learning_rate": 3.4980000000000002e-06,
"loss": 2.7033,
"step": 1750
},
{
"epoch": 0.0355,
"grad_norm": 3.7064590041354597,
"learning_rate": 3.548e-06,
"loss": 2.7135,
"step": 1775
},
{
"epoch": 0.036,
"grad_norm": 2.6935868617559127,
"learning_rate": 3.5980000000000005e-06,
"loss": 2.6977,
"step": 1800
},
{
"epoch": 0.036,
"eval_loss": 2.702373743057251,
"eval_runtime": 42.099,
"eval_samples_per_second": 2.47,
"eval_steps_per_second": 1.235,
"step": 1800
},
{
"epoch": 0.0365,
"grad_norm": 3.1724624305272577,
"learning_rate": 3.6480000000000005e-06,
"loss": 2.6941,
"step": 1825
},
{
"epoch": 0.037,
"grad_norm": 3.3947291376692967,
"learning_rate": 3.6980000000000004e-06,
"loss": 2.705,
"step": 1850
},
{
"epoch": 0.0375,
"grad_norm": 3.2739522130247454,
"learning_rate": 3.7480000000000004e-06,
"loss": 2.6971,
"step": 1875
},
{
"epoch": 0.038,
"grad_norm": 2.886346941239111,
"learning_rate": 3.7980000000000007e-06,
"loss": 2.6878,
"step": 1900
},
{
"epoch": 0.038,
"eval_loss": 2.698768138885498,
"eval_runtime": 42.2524,
"eval_samples_per_second": 2.461,
"eval_steps_per_second": 1.231,
"step": 1900
},
{
"epoch": 0.0385,
"grad_norm": 2.961130539695273,
"learning_rate": 3.848e-06,
"loss": 2.6936,
"step": 1925
},
{
"epoch": 0.039,
"grad_norm": 3.2300245788196884,
"learning_rate": 3.898e-06,
"loss": 2.6989,
"step": 1950
},
{
"epoch": 0.0395,
"grad_norm": 3.2952386418656823,
"learning_rate": 3.948e-06,
"loss": 2.6937,
"step": 1975
},
{
"epoch": 0.04,
"grad_norm": 2.556435159379079,
"learning_rate": 3.9980000000000005e-06,
"loss": 2.6991,
"step": 2000
},
{
"epoch": 0.04,
"eval_loss": 2.693058967590332,
"eval_runtime": 42.2004,
"eval_samples_per_second": 2.464,
"eval_steps_per_second": 1.232,
"step": 2000
},
{
"epoch": 0.0405,
"grad_norm": 2.975198340671437,
"learning_rate": 4.048e-06,
"loss": 2.6896,
"step": 2025
},
{
"epoch": 0.041,
"grad_norm": 2.366572300776235,
"learning_rate": 4.098e-06,
"loss": 2.6903,
"step": 2050
},
{
"epoch": 0.0415,
"grad_norm": 2.650575110326075,
"learning_rate": 4.148000000000001e-06,
"loss": 2.6974,
"step": 2075
},
{
"epoch": 0.042,
"grad_norm": 2.844363978567716,
"learning_rate": 4.198e-06,
"loss": 2.6833,
"step": 2100
},
{
"epoch": 0.042,
"eval_loss": 2.687650203704834,
"eval_runtime": 42.1236,
"eval_samples_per_second": 2.469,
"eval_steps_per_second": 1.234,
"step": 2100
},
{
"epoch": 0.0425,
"grad_norm": 2.5043519810203425,
"learning_rate": 4.248000000000001e-06,
"loss": 2.6848,
"step": 2125
},
{
"epoch": 0.043,
"grad_norm": 2.442865859341675,
"learning_rate": 4.298e-06,
"loss": 2.6834,
"step": 2150
},
{
"epoch": 0.0435,
"grad_norm": 2.396444505850839,
"learning_rate": 4.3480000000000006e-06,
"loss": 2.6842,
"step": 2175
},
{
"epoch": 0.044,
"grad_norm": 2.467830621762353,
"learning_rate": 4.398000000000001e-06,
"loss": 2.6849,
"step": 2200
},
{
"epoch": 0.044,
"eval_loss": 2.684495210647583,
"eval_runtime": 42.337,
"eval_samples_per_second": 2.456,
"eval_steps_per_second": 1.228,
"step": 2200
},
{
"epoch": 0.0445,
"grad_norm": 2.331183246577976,
"learning_rate": 4.4480000000000004e-06,
"loss": 2.6933,
"step": 2225
},
{
"epoch": 0.045,
"grad_norm": 2.7108879126095995,
"learning_rate": 4.498e-06,
"loss": 2.6756,
"step": 2250
},
{
"epoch": 0.0455,
"grad_norm": 2.297487473050839,
"learning_rate": 4.548e-06,
"loss": 2.6773,
"step": 2275
},
{
"epoch": 0.046,
"grad_norm": 2.260013609826266,
"learning_rate": 4.598e-06,
"loss": 2.6869,
"step": 2300
},
{
"epoch": 0.046,
"eval_loss": 2.680889368057251,
"eval_runtime": 42.2308,
"eval_samples_per_second": 2.463,
"eval_steps_per_second": 1.231,
"step": 2300
},
{
"epoch": 0.0465,
"grad_norm": 2.1362621908829964,
"learning_rate": 4.648e-06,
"loss": 2.674,
"step": 2325
},
{
"epoch": 0.047,
"grad_norm": 2.530250306266186,
"learning_rate": 4.698000000000001e-06,
"loss": 2.6682,
"step": 2350
},
{
"epoch": 0.0475,
"grad_norm": 2.284376818082532,
"learning_rate": 4.748e-06,
"loss": 2.6741,
"step": 2375
},
{
"epoch": 0.048,
"grad_norm": 2.9431781004579403,
"learning_rate": 4.7980000000000005e-06,
"loss": 2.6793,
"step": 2400
},
{
"epoch": 0.048,
"eval_loss": 2.676382303237915,
"eval_runtime": 42.1755,
"eval_samples_per_second": 2.466,
"eval_steps_per_second": 1.233,
"step": 2400
},
{
"epoch": 0.0485,
"grad_norm": 2.2501714313646,
"learning_rate": 4.848000000000001e-06,
"loss": 2.6836,
"step": 2425
},
{
"epoch": 0.049,
"grad_norm": 2.520507270374293,
"learning_rate": 4.898e-06,
"loss": 2.6793,
"step": 2450
},
{
"epoch": 0.0495,
"grad_norm": 2.3001609851463156,
"learning_rate": 4.948000000000001e-06,
"loss": 2.6825,
"step": 2475
},
{
"epoch": 0.05,
"grad_norm": 2.0060268631347973,
"learning_rate": 4.998e-06,
"loss": 2.6736,
"step": 2500
},
{
"epoch": 0.05,
"eval_loss": 2.671875,
"eval_runtime": 42.1697,
"eval_samples_per_second": 2.466,
"eval_steps_per_second": 1.233,
"step": 2500
},
{
"epoch": 0.0505,
"grad_norm": 2.1769919372211564,
"learning_rate": 5.048000000000001e-06,
"loss": 2.6741,
"step": 2525
},
{
"epoch": 0.051,
"grad_norm": 2.1133782069189366,
"learning_rate": 5.098000000000001e-06,
"loss": 2.67,
"step": 2550
},
{
"epoch": 0.0515,
"grad_norm": 2.242586565950932,
"learning_rate": 5.1480000000000005e-06,
"loss": 2.6835,
"step": 2575
},
{
"epoch": 0.052,
"grad_norm": 2.4130154185332615,
"learning_rate": 5.198000000000001e-06,
"loss": 2.6752,
"step": 2600
},
{
"epoch": 0.052,
"eval_loss": 2.669621467590332,
"eval_runtime": 42.1123,
"eval_samples_per_second": 2.47,
"eval_steps_per_second": 1.235,
"step": 2600
},
{
"epoch": 0.0525,
"grad_norm": 2.243339931731786,
"learning_rate": 5.248000000000001e-06,
"loss": 2.6631,
"step": 2625
},
{
"epoch": 0.053,
"grad_norm": 2.1652170787894964,
"learning_rate": 5.298000000000001e-06,
"loss": 2.6653,
"step": 2650
},
{
"epoch": 0.0535,
"grad_norm": 2.3514042691010077,
"learning_rate": 5.348000000000001e-06,
"loss": 2.6704,
"step": 2675
},
{
"epoch": 0.054,
"grad_norm": 2.0555358311645104,
"learning_rate": 5.398e-06,
"loss": 2.6744,
"step": 2700
},
{
"epoch": 0.054,
"eval_loss": 2.668419361114502,
"eval_runtime": 42.1636,
"eval_samples_per_second": 2.467,
"eval_steps_per_second": 1.233,
"step": 2700
},
{
"epoch": 0.0545,
"grad_norm": 2.504233096197935,
"learning_rate": 5.448e-06,
"loss": 2.6686,
"step": 2725
},
{
"epoch": 0.055,
"grad_norm": 2.1966446495255014,
"learning_rate": 5.498e-06,
"loss": 2.6575,
"step": 2750
},
{
"epoch": 0.0555,
"grad_norm": 3.4129666421130738,
"learning_rate": 5.548e-06,
"loss": 2.6624,
"step": 2775
},
{
"epoch": 0.056,
"grad_norm": 2.5402178685422028,
"learning_rate": 5.5980000000000004e-06,
"loss": 2.6615,
"step": 2800
},
{
"epoch": 0.056,
"eval_loss": 2.666015625,
"eval_runtime": 42.1094,
"eval_samples_per_second": 2.47,
"eval_steps_per_second": 1.235,
"step": 2800
},
{
"epoch": 0.0565,
"grad_norm": 2.5169534616209215,
"learning_rate": 5.648e-06,
"loss": 2.6745,
"step": 2825
},
{
"epoch": 0.057,
"grad_norm": 2.4269096679582347,
"learning_rate": 5.698e-06,
"loss": 2.658,
"step": 2850
},
{
"epoch": 0.0575,
"grad_norm": 2.2819396814928763,
"learning_rate": 5.748e-06,
"loss": 2.6694,
"step": 2875
},
{
"epoch": 0.058,
"grad_norm": 3.0448163445232512,
"learning_rate": 5.798e-06,
"loss": 2.6587,
"step": 2900
},
{
"epoch": 0.058,
"eval_loss": 2.662710428237915,
"eval_runtime": 42.173,
"eval_samples_per_second": 2.466,
"eval_steps_per_second": 1.233,
"step": 2900
},
{
"epoch": 0.0585,
"grad_norm": 3.2390472506289343,
"learning_rate": 5.848000000000001e-06,
"loss": 2.661,
"step": 2925
},
{
"epoch": 0.059,
"grad_norm": 2.5836929915418194,
"learning_rate": 5.898e-06,
"loss": 2.6514,
"step": 2950
},
{
"epoch": 0.0595,
"grad_norm": 2.5766876152500227,
"learning_rate": 5.9480000000000005e-06,
"loss": 2.6673,
"step": 2975
},
{
"epoch": 0.06,
"grad_norm": 2.507842811667469,
"learning_rate": 5.998000000000001e-06,
"loss": 2.6658,
"step": 3000
},
{
"epoch": 0.06,
"eval_loss": 2.659705638885498,
"eval_runtime": 42.0906,
"eval_samples_per_second": 2.471,
"eval_steps_per_second": 1.235,
"step": 3000
},
{
"epoch": 0.0605,
"grad_norm": 2.291724100817165,
"learning_rate": 6.048e-06,
"loss": 2.6588,
"step": 3025
},
{
"epoch": 0.061,
"grad_norm": 2.356775687250912,
"learning_rate": 6.098000000000001e-06,
"loss": 2.6519,
"step": 3050
},
{
"epoch": 0.0615,
"grad_norm": 3.6009374683805553,
"learning_rate": 6.148e-06,
"loss": 2.6581,
"step": 3075
},
{
"epoch": 0.062,
"grad_norm": 3.2760170273305724,
"learning_rate": 6.198000000000001e-06,
"loss": 2.6588,
"step": 3100
},
{
"epoch": 0.062,
"eval_loss": 2.656700611114502,
"eval_runtime": 42.0325,
"eval_samples_per_second": 2.474,
"eval_steps_per_second": 1.237,
"step": 3100
},
{
"epoch": 0.0625,
"grad_norm": 2.5849236998041825,
"learning_rate": 6.248000000000001e-06,
"loss": 2.6548,
"step": 3125
},
{
"epoch": 0.063,
"grad_norm": 2.3095505880624474,
"learning_rate": 6.2980000000000005e-06,
"loss": 2.6511,
"step": 3150
},
{
"epoch": 0.0635,
"grad_norm": 2.5258255422234996,
"learning_rate": 6.348000000000001e-06,
"loss": 2.6589,
"step": 3175
},
{
"epoch": 0.064,
"grad_norm": 2.3520030773681335,
"learning_rate": 6.398000000000001e-06,
"loss": 2.6462,
"step": 3200
},
{
"epoch": 0.064,
"eval_loss": 2.652644157409668,
"eval_runtime": 42.2271,
"eval_samples_per_second": 2.463,
"eval_steps_per_second": 1.231,
"step": 3200
},
{
"epoch": 0.0645,
"grad_norm": 2.457532178302885,
"learning_rate": 6.448000000000001e-06,
"loss": 2.6495,
"step": 3225
},
{
"epoch": 0.065,
"grad_norm": 2.3328730844475833,
"learning_rate": 6.498000000000001e-06,
"loss": 2.6384,
"step": 3250
},
{
"epoch": 0.0655,
"grad_norm": 2.382459769400574,
"learning_rate": 6.548000000000001e-06,
"loss": 2.652,
"step": 3275
},
{
"epoch": 0.066,
"grad_norm": 2.4287460984943707,
"learning_rate": 6.598000000000001e-06,
"loss": 2.655,
"step": 3300
},
{
"epoch": 0.066,
"eval_loss": 2.650841236114502,
"eval_runtime": 42.1822,
"eval_samples_per_second": 2.465,
"eval_steps_per_second": 1.233,
"step": 3300
},
{
"epoch": 0.0665,
"grad_norm": 3.0374923212376963,
"learning_rate": 6.648e-06,
"loss": 2.6623,
"step": 3325
},
{
"epoch": 0.067,
"grad_norm": 2.3072135476674127,
"learning_rate": 6.698e-06,
"loss": 2.6484,
"step": 3350
},
{
"epoch": 0.0675,
"grad_norm": 2.3676328206176778,
"learning_rate": 6.7480000000000004e-06,
"loss": 2.6569,
"step": 3375
},
{
"epoch": 0.068,
"grad_norm": 2.313390296186245,
"learning_rate": 6.798e-06,
"loss": 2.6393,
"step": 3400
},
{
"epoch": 0.068,
"eval_loss": 2.648888111114502,
"eval_runtime": 44.6877,
"eval_samples_per_second": 2.327,
"eval_steps_per_second": 1.164,
"step": 3400
},
{
"epoch": 0.0685,
"grad_norm": 2.9181668179248033,
"learning_rate": 6.848e-06,
"loss": 2.6521,
"step": 3425
},
{
"epoch": 0.069,
"grad_norm": 2.1972242976901457,
"learning_rate": 6.898e-06,
"loss": 2.6605,
"step": 3450
},
{
"epoch": 0.0695,
"grad_norm": 2.514104559780915,
"learning_rate": 6.948e-06,
"loss": 2.6444,
"step": 3475
},
{
"epoch": 0.07,
"grad_norm": 2.463879404265904,
"learning_rate": 6.998000000000001e-06,
"loss": 2.6586,
"step": 3500
},
{
"epoch": 0.07,
"eval_loss": 2.644831657409668,
"eval_runtime": 45.1164,
"eval_samples_per_second": 2.305,
"eval_steps_per_second": 1.153,
"step": 3500
},
{
"epoch": 0.0705,
"grad_norm": 2.4337078135824126,
"learning_rate": 7.048e-06,
"loss": 2.6463,
"step": 3525
},
{
"epoch": 0.071,
"grad_norm": 2.2908199130690257,
"learning_rate": 7.0980000000000005e-06,
"loss": 2.655,
"step": 3550
},
{
"epoch": 0.0715,
"grad_norm": 2.4093156448180713,
"learning_rate": 7.148000000000001e-06,
"loss": 2.6479,
"step": 3575
},
{
"epoch": 0.072,
"grad_norm": 2.3128290328516172,
"learning_rate": 7.198e-06,
"loss": 2.6342,
"step": 3600
},
{
"epoch": 0.072,
"eval_loss": 2.643179178237915,
"eval_runtime": 43.1012,
"eval_samples_per_second": 2.413,
"eval_steps_per_second": 1.206,
"step": 3600
},
{
"epoch": 0.0725,
"grad_norm": 2.7714344541916165,
"learning_rate": 7.248000000000001e-06,
"loss": 2.6337,
"step": 3625
},
{
"epoch": 0.073,
"grad_norm": 2.8399095157670486,
"learning_rate": 7.298e-06,
"loss": 2.6413,
"step": 3650
},
{
"epoch": 0.0735,
"grad_norm": 2.6867409675260747,
"learning_rate": 7.348000000000001e-06,
"loss": 2.6314,
"step": 3675
},
{
"epoch": 0.074,
"grad_norm": 2.853697365081861,
"learning_rate": 7.398000000000001e-06,
"loss": 2.6372,
"step": 3700
},
{
"epoch": 0.074,
"eval_loss": 2.639573335647583,
"eval_runtime": 45.0291,
"eval_samples_per_second": 2.31,
"eval_steps_per_second": 1.155,
"step": 3700
},
{
"epoch": 0.0745,
"grad_norm": 1.998706410316405,
"learning_rate": 7.4480000000000005e-06,
"loss": 2.637,
"step": 3725
},
{
"epoch": 0.075,
"grad_norm": 2.3172883792227417,
"learning_rate": 7.498000000000001e-06,
"loss": 2.6386,
"step": 3750
},
{
"epoch": 0.0755,
"grad_norm": 2.2578618031758793,
"learning_rate": 7.548000000000001e-06,
"loss": 2.637,
"step": 3775
},
{
"epoch": 0.076,
"grad_norm": 2.022866842989095,
"learning_rate": 7.598000000000001e-06,
"loss": 2.6303,
"step": 3800
},
{
"epoch": 0.076,
"eval_loss": 2.63671875,
"eval_runtime": 45.1006,
"eval_samples_per_second": 2.306,
"eval_steps_per_second": 1.153,
"step": 3800
},
{
"epoch": 0.0765,
"grad_norm": 2.6019929572001987,
"learning_rate": 7.648e-06,
"loss": 2.6359,
"step": 3825
},
{
"epoch": 0.077,
"grad_norm": 2.1777094054397343,
"learning_rate": 7.698000000000002e-06,
"loss": 2.6397,
"step": 3850
},
{
"epoch": 0.0775,
"grad_norm": 2.0323537115489474,
"learning_rate": 7.748000000000001e-06,
"loss": 2.6321,
"step": 3875
},
{
"epoch": 0.078,
"grad_norm": 2.1502944909614037,
"learning_rate": 7.798e-06,
"loss": 2.6373,
"step": 3900
},
{
"epoch": 0.078,
"eval_loss": 2.634765625,
"eval_runtime": 44.8775,
"eval_samples_per_second": 2.317,
"eval_steps_per_second": 1.159,
"step": 3900
},
{
"epoch": 0.0785,
"grad_norm": 2.2895713962575748,
"learning_rate": 7.848000000000002e-06,
"loss": 2.6325,
"step": 3925
},
{
"epoch": 0.079,
"grad_norm": 2.473180320397106,
"learning_rate": 7.898e-06,
"loss": 2.6306,
"step": 3950
},
{
"epoch": 0.0795,
"grad_norm": 2.5774486324856865,
"learning_rate": 7.948e-06,
"loss": 2.6345,
"step": 3975
},
{
"epoch": 0.08,
"grad_norm": 2.282553852536701,
"learning_rate": 7.998e-06,
"loss": 2.641,
"step": 4000
},
{
"epoch": 0.08,
"eval_loss": 2.630859375,
"eval_runtime": 44.8187,
"eval_samples_per_second": 2.32,
"eval_steps_per_second": 1.16,
"step": 4000
},
{
"epoch": 0.0805,
"grad_norm": 2.500864236641362,
"learning_rate": 8.048e-06,
"loss": 2.6309,
"step": 4025
},
{
"epoch": 0.081,
"grad_norm": 2.5639376009370674,
"learning_rate": 8.098000000000001e-06,
"loss": 2.6211,
"step": 4050
},
{
"epoch": 0.0815,
"grad_norm": 3.0035728334967926,
"learning_rate": 8.148e-06,
"loss": 2.6317,
"step": 4075
},
{
"epoch": 0.082,
"grad_norm": 2.804391077504498,
"learning_rate": 8.198e-06,
"loss": 2.6273,
"step": 4100
},
{
"epoch": 0.082,
"eval_loss": 2.627704381942749,
"eval_runtime": 45.0778,
"eval_samples_per_second": 2.307,
"eval_steps_per_second": 1.154,
"step": 4100
},
{
"epoch": 0.0825,
"grad_norm": 2.8025033751566975,
"learning_rate": 8.248e-06,
"loss": 2.6224,
"step": 4125
},
{
"epoch": 0.083,
"grad_norm": 4.307364832973918,
"learning_rate": 8.298000000000001e-06,
"loss": 2.6217,
"step": 4150
},
{
"epoch": 0.0835,
"grad_norm": 2.510945545421516,
"learning_rate": 8.348e-06,
"loss": 2.6158,
"step": 4175
},
{
"epoch": 0.084,
"grad_norm": 2.874475964746802,
"learning_rate": 8.398e-06,
"loss": 2.6284,
"step": 4200
},
{
"epoch": 0.084,
"eval_loss": 2.626352071762085,
"eval_runtime": 44.9685,
"eval_samples_per_second": 2.313,
"eval_steps_per_second": 1.156,
"step": 4200
},
{
"epoch": 0.0845,
"grad_norm": 2.687782456648974,
"learning_rate": 8.448000000000001e-06,
"loss": 2.613,
"step": 4225
},
{
"epoch": 0.085,
"grad_norm": 2.290237147776631,
"learning_rate": 8.498e-06,
"loss": 2.6295,
"step": 4250
},
{
"epoch": 0.0855,
"grad_norm": 2.5217231224578196,
"learning_rate": 8.548e-06,
"loss": 2.6194,
"step": 4275
},
{
"epoch": 0.086,
"grad_norm": 2.478088396853028,
"learning_rate": 8.598000000000001e-06,
"loss": 2.6269,
"step": 4300
},
{
"epoch": 0.086,
"eval_loss": 2.624098539352417,
"eval_runtime": 45.0092,
"eval_samples_per_second": 2.311,
"eval_steps_per_second": 1.155,
"step": 4300
},
{
"epoch": 0.0865,
"grad_norm": 3.160637138604565,
"learning_rate": 8.648000000000001e-06,
"loss": 2.6179,
"step": 4325
},
{
"epoch": 0.087,
"grad_norm": 3.2730443987396787,
"learning_rate": 8.698e-06,
"loss": 2.6128,
"step": 4350
},
{
"epoch": 0.0875,
"grad_norm": 2.1924980955006257,
"learning_rate": 8.748000000000002e-06,
"loss": 2.6237,
"step": 4375
},
{
"epoch": 0.088,
"grad_norm": 2.2909495673616735,
"learning_rate": 8.798000000000001e-06,
"loss": 2.6183,
"step": 4400
},
{
"epoch": 0.088,
"eval_loss": 2.622445821762085,
"eval_runtime": 44.9844,
"eval_samples_per_second": 2.312,
"eval_steps_per_second": 1.156,
"step": 4400
},
{
"epoch": 0.0885,
"grad_norm": 2.3275380340868543,
"learning_rate": 8.848e-06,
"loss": 2.6198,
"step": 4425
},
{
"epoch": 0.089,
"grad_norm": 2.5451157769858135,
"learning_rate": 8.898000000000002e-06,
"loss": 2.6122,
"step": 4450
},
{
"epoch": 0.0895,
"grad_norm": 2.626975380348867,
"learning_rate": 8.948000000000001e-06,
"loss": 2.6053,
"step": 4475
},
{
"epoch": 0.09,
"grad_norm": 3.163525010125433,
"learning_rate": 8.998000000000001e-06,
"loss": 2.616,
"step": 4500
},
{
"epoch": 0.09,
"eval_loss": 2.620342493057251,
"eval_runtime": 45.1428,
"eval_samples_per_second": 2.304,
"eval_steps_per_second": 1.152,
"step": 4500
},
{
"epoch": 0.0905,
"grad_norm": 3.0132623006335857,
"learning_rate": 9.048e-06,
"loss": 2.6168,
"step": 4525
},
{
"epoch": 0.091,
"grad_norm": 2.671468374859406,
"learning_rate": 9.098000000000002e-06,
"loss": 2.6206,
"step": 4550
},
{
"epoch": 0.0915,
"grad_norm": 3.043132564516197,
"learning_rate": 9.148e-06,
"loss": 2.6175,
"step": 4575
},
{
"epoch": 0.092,
"grad_norm": 2.677082280124469,
"learning_rate": 9.198e-06,
"loss": 2.6051,
"step": 4600
},
{
"epoch": 0.092,
"eval_loss": 2.617037296295166,
"eval_runtime": 44.9042,
"eval_samples_per_second": 2.316,
"eval_steps_per_second": 1.158,
"step": 4600
},
{
"epoch": 0.0925,
"grad_norm": 3.05152520766704,
"learning_rate": 9.248e-06,
"loss": 2.6086,
"step": 4625
},
{
"epoch": 0.093,
"grad_norm": 2.751017986849495,
"learning_rate": 9.298e-06,
"loss": 2.6123,
"step": 4650
},
{
"epoch": 0.0935,
"grad_norm": 2.6313524456080573,
"learning_rate": 9.348000000000001e-06,
"loss": 2.6168,
"step": 4675
},
{
"epoch": 0.094,
"grad_norm": 3.186704450209755,
"learning_rate": 9.398e-06,
"loss": 2.6242,
"step": 4700
},
{
"epoch": 0.094,
"eval_loss": 2.615835428237915,
"eval_runtime": 44.9862,
"eval_samples_per_second": 2.312,
"eval_steps_per_second": 1.156,
"step": 4700
},
{
"epoch": 0.0945,
"grad_norm": 2.9215133630797436,
"learning_rate": 9.448e-06,
"loss": 2.6154,
"step": 4725
},
{
"epoch": 0.095,
"grad_norm": 2.7154053486577348,
"learning_rate": 9.498000000000001e-06,
"loss": 2.6133,
"step": 4750
},
{
"epoch": 0.0955,
"grad_norm": 2.30215652369695,
"learning_rate": 9.548e-06,
"loss": 2.6166,
"step": 4775
},
{
"epoch": 0.096,
"grad_norm": 2.534460541656069,
"learning_rate": 9.598e-06,
"loss": 2.6134,
"step": 4800
},
{
"epoch": 0.096,
"eval_loss": 2.613731861114502,
"eval_runtime": 45.1374,
"eval_samples_per_second": 2.304,
"eval_steps_per_second": 1.152,
"step": 4800
},
{
"epoch": 0.0965,
"grad_norm": 3.363450369306592,
"learning_rate": 9.648000000000001e-06,
"loss": 2.6185,
"step": 4825
},
{
"epoch": 0.097,
"grad_norm": 3.368913774523613,
"learning_rate": 9.698000000000001e-06,
"loss": 2.6158,
"step": 4850
},
{
"epoch": 0.0975,
"grad_norm": 2.512742170578084,
"learning_rate": 9.748e-06,
"loss": 2.619,
"step": 4875
},
{
"epoch": 0.098,
"grad_norm": 2.29528993458392,
"learning_rate": 9.798e-06,
"loss": 2.6124,
"step": 4900
},
{
"epoch": 0.098,
"eval_loss": 2.611778736114502,
"eval_runtime": 45.0967,
"eval_samples_per_second": 2.306,
"eval_steps_per_second": 1.153,
"step": 4900
},
{
"epoch": 0.0985,
"grad_norm": 3.6741400808249542,
"learning_rate": 9.848000000000001e-06,
"loss": 2.6057,
"step": 4925
},
{
"epoch": 0.099,
"grad_norm": 2.8877735323213987,
"learning_rate": 9.898e-06,
"loss": 2.5987,
"step": 4950
},
{
"epoch": 0.0995,
"grad_norm": 3.5539935185996785,
"learning_rate": 9.948e-06,
"loss": 2.6116,
"step": 4975
},
{
"epoch": 0.1,
"grad_norm": 3.1496567211993156,
"learning_rate": 9.998000000000002e-06,
"loss": 2.6114,
"step": 5000
},
{
"epoch": 0.1,
"eval_loss": 2.611027717590332,
"eval_runtime": 44.9767,
"eval_samples_per_second": 2.312,
"eval_steps_per_second": 1.156,
"step": 5000
},
{
"epoch": 0.1005,
"grad_norm": 3.416161880895133,
"learning_rate": 9.994666666666668e-06,
"loss": 2.6158,
"step": 5025
},
{
"epoch": 0.101,
"grad_norm": 2.53372876835717,
"learning_rate": 9.989111111111111e-06,
"loss": 2.6012,
"step": 5050
},
{
"epoch": 0.1015,
"grad_norm": 2.318152281282991,
"learning_rate": 9.983555555555556e-06,
"loss": 2.6136,
"step": 5075
},
{
"epoch": 0.102,
"grad_norm": 2.5498343821152525,
"learning_rate": 9.978000000000002e-06,
"loss": 2.6052,
"step": 5100
},
{
"epoch": 0.102,
"eval_loss": 2.610952615737915,
"eval_runtime": 42.1019,
"eval_samples_per_second": 2.47,
"eval_steps_per_second": 1.235,
"step": 5100
},
{
"epoch": 0.1025,
"grad_norm": 2.100345439803683,
"learning_rate": 9.972444444444445e-06,
"loss": 2.6084,
"step": 5125
},
{
"epoch": 0.103,
"grad_norm": 2.4294233703328714,
"learning_rate": 9.966888888888889e-06,
"loss": 2.6041,
"step": 5150
},
{
"epoch": 0.1035,
"grad_norm": 3.4299922452762353,
"learning_rate": 9.961333333333334e-06,
"loss": 2.5993,
"step": 5175
},
{
"epoch": 0.104,
"grad_norm": 2.7096315724628273,
"learning_rate": 9.95577777777778e-06,
"loss": 2.6056,
"step": 5200
},
{
"epoch": 0.104,
"eval_loss": 2.605543851852417,
"eval_runtime": 42.1249,
"eval_samples_per_second": 2.469,
"eval_steps_per_second": 1.234,
"step": 5200
},
{
"epoch": 0.1045,
"grad_norm": 3.520679800243995,
"learning_rate": 9.950222222222223e-06,
"loss": 2.6198,
"step": 5225
},
{
"epoch": 0.105,
"grad_norm": 2.6207699649408145,
"learning_rate": 9.944666666666668e-06,
"loss": 2.5983,
"step": 5250
},
{
"epoch": 0.1055,
"grad_norm": 3.81435491451506,
"learning_rate": 9.939111111111112e-06,
"loss": 2.5977,
"step": 5275
},
{
"epoch": 0.106,
"grad_norm": 2.8442763110892058,
"learning_rate": 9.933555555555557e-06,
"loss": 2.5977,
"step": 5300
},
{
"epoch": 0.106,
"eval_loss": 2.603515625,
"eval_runtime": 42.1847,
"eval_samples_per_second": 2.465,
"eval_steps_per_second": 1.233,
"step": 5300
},
{
"epoch": 0.1065,
"grad_norm": 2.3330569818751288,
"learning_rate": 9.928e-06,
"loss": 2.603,
"step": 5325
},
{
"epoch": 0.107,
"grad_norm": 2.614504763128844,
"learning_rate": 9.922444444444446e-06,
"loss": 2.6075,
"step": 5350
},
{
"epoch": 0.1075,
"grad_norm": 2.3761581342305336,
"learning_rate": 9.91688888888889e-06,
"loss": 2.598,
"step": 5375
},
{
"epoch": 0.108,
"grad_norm": 2.9693890956012283,
"learning_rate": 9.911333333333335e-06,
"loss": 2.5939,
"step": 5400
},
{
"epoch": 0.108,
"eval_loss": 2.6025390625,
"eval_runtime": 42.1557,
"eval_samples_per_second": 2.467,
"eval_steps_per_second": 1.234,
"step": 5400
},
{
"epoch": 0.1085,
"grad_norm": 2.3502354313235325,
"learning_rate": 9.905777777777778e-06,
"loss": 2.5977,
"step": 5425
},
{
"epoch": 0.109,
"grad_norm": 4.244516912805596,
"learning_rate": 9.900222222222223e-06,
"loss": 2.6054,
"step": 5450
},
{
"epoch": 0.1095,
"grad_norm": 2.240617110709866,
"learning_rate": 9.894666666666669e-06,
"loss": 2.6033,
"step": 5475
},
{
"epoch": 0.11,
"grad_norm": 2.208047893771693,
"learning_rate": 9.889111111111112e-06,
"loss": 2.5977,
"step": 5500
},
{
"epoch": 0.11,
"eval_loss": 2.602914571762085,
"eval_runtime": 42.1365,
"eval_samples_per_second": 2.468,
"eval_steps_per_second": 1.234,
"step": 5500
},
{
"epoch": 0.1105,
"grad_norm": 2.7978498351768364,
"learning_rate": 9.883555555555556e-06,
"loss": 2.5993,
"step": 5525
},
{
"epoch": 0.111,
"grad_norm": 3.122377711007523,
"learning_rate": 9.878000000000001e-06,
"loss": 2.5935,
"step": 5550
},
{
"epoch": 0.1115,
"grad_norm": 2.450318383908477,
"learning_rate": 9.872444444444446e-06,
"loss": 2.6025,
"step": 5575
},
{
"epoch": 0.112,
"grad_norm": 3.0984127045589855,
"learning_rate": 9.86688888888889e-06,
"loss": 2.5952,
"step": 5600
},
{
"epoch": 0.112,
"eval_loss": 2.599684476852417,
"eval_runtime": 42.1446,
"eval_samples_per_second": 2.468,
"eval_steps_per_second": 1.234,
"step": 5600
},
{
"epoch": 0.1125,
"grad_norm": 3.1706979497083667,
"learning_rate": 9.861333333333333e-06,
"loss": 2.5938,
"step": 5625
},
{
"epoch": 0.113,
"grad_norm": 2.5819686451355977,
"learning_rate": 9.855777777777779e-06,
"loss": 2.6061,
"step": 5650
},
{
"epoch": 0.1135,
"grad_norm": 2.1160033983420257,
"learning_rate": 9.850222222222224e-06,
"loss": 2.591,
"step": 5675
},
{
"epoch": 0.114,
"grad_norm": 2.6757106700322053,
"learning_rate": 9.844666666666667e-06,
"loss": 2.5843,
"step": 5700
},
{
"epoch": 0.114,
"eval_loss": 2.600059986114502,
"eval_runtime": 42.1314,
"eval_samples_per_second": 2.468,
"eval_steps_per_second": 1.234,
"step": 5700
},
{
"epoch": 0.1145,
"grad_norm": 3.0428400730526866,
"learning_rate": 9.839111111111111e-06,
"loss": 2.5889,
"step": 5725
},
{
"epoch": 0.115,
"grad_norm": 3.0023332110537275,
"learning_rate": 9.833555555555556e-06,
"loss": 2.589,
"step": 5750
},
{
"epoch": 0.1155,
"grad_norm": 2.4458242352411212,
"learning_rate": 9.828000000000001e-06,
"loss": 2.5912,
"step": 5775
},
{
"epoch": 0.116,
"grad_norm": 2.9070566280503134,
"learning_rate": 9.822444444444445e-06,
"loss": 2.594,
"step": 5800
},
{
"epoch": 0.116,
"eval_loss": 2.597205638885498,
"eval_runtime": 42.2699,
"eval_samples_per_second": 2.46,
"eval_steps_per_second": 1.23,
"step": 5800
},
{
"epoch": 0.1165,
"grad_norm": 1.9104821809183674,
"learning_rate": 9.81688888888889e-06,
"loss": 2.5945,
"step": 5825
},
{
"epoch": 0.117,
"grad_norm": 2.9356670820687905,
"learning_rate": 9.811333333333334e-06,
"loss": 2.5964,
"step": 5850
},
{
"epoch": 0.1175,
"grad_norm": 3.0014062286025682,
"learning_rate": 9.805777777777779e-06,
"loss": 2.5936,
"step": 5875
},
{
"epoch": 0.118,
"grad_norm": 2.133789981650032,
"learning_rate": 9.800222222222223e-06,
"loss": 2.5931,
"step": 5900
},
{
"epoch": 0.118,
"eval_loss": 2.597581148147583,
"eval_runtime": 42.1405,
"eval_samples_per_second": 2.468,
"eval_steps_per_second": 1.234,
"step": 5900
},
{
"epoch": 0.1185,
"grad_norm": 2.2715886568619674,
"learning_rate": 9.794666666666668e-06,
"loss": 2.5892,
"step": 5925
},
{
"epoch": 0.119,
"grad_norm": 2.1629931013495747,
"learning_rate": 9.789111111111111e-06,
"loss": 2.6117,
"step": 5950
},
{
"epoch": 0.1195,
"grad_norm": 2.611955604210334,
"learning_rate": 9.783555555555557e-06,
"loss": 2.5867,
"step": 5975
},
{
"epoch": 0.12,
"grad_norm": 2.2367470112792294,
"learning_rate": 9.778e-06,
"loss": 2.5978,
"step": 6000
},
{
"epoch": 0.12,
"eval_loss": 2.594125509262085,
"eval_runtime": 43.0331,
"eval_samples_per_second": 2.417,
"eval_steps_per_second": 1.208,
"step": 6000
},
{
"epoch": 0.1205,
"grad_norm": 2.1932467610128166,
"learning_rate": 9.772444444444445e-06,
"loss": 2.5892,
"step": 6025
},
{
"epoch": 0.121,
"grad_norm": 2.387425729108963,
"learning_rate": 9.76688888888889e-06,
"loss": 2.5918,
"step": 6050
},
{
"epoch": 0.1215,
"grad_norm": 2.8624591702116313,
"learning_rate": 9.761333333333334e-06,
"loss": 2.5875,
"step": 6075
},
{
"epoch": 0.122,
"grad_norm": 2.930012610934339,
"learning_rate": 9.755777777777778e-06,
"loss": 2.5906,
"step": 6100
},
{
"epoch": 0.122,
"eval_loss": 2.592097282409668,
"eval_runtime": 42.1118,
"eval_samples_per_second": 2.47,
"eval_steps_per_second": 1.235,
"step": 6100
},
{
"epoch": 0.1225,
"grad_norm": 3.6585883804987596,
"learning_rate": 9.750222222222223e-06,
"loss": 2.5888,
"step": 6125
},
{
"epoch": 0.123,
"grad_norm": 2.9636602337569213,
"learning_rate": 9.744666666666668e-06,
"loss": 2.5848,
"step": 6150
},
{
"epoch": 0.1235,
"grad_norm": 2.6452546886265242,
"learning_rate": 9.739111111111112e-06,
"loss": 2.5875,
"step": 6175
},
{
"epoch": 0.124,
"grad_norm": 2.230890007256631,
"learning_rate": 9.733555555555555e-06,
"loss": 2.5928,
"step": 6200
},
{
"epoch": 0.124,
"eval_loss": 2.591871976852417,
"eval_runtime": 42.2393,
"eval_samples_per_second": 2.462,
"eval_steps_per_second": 1.231,
"step": 6200
},
{
"epoch": 0.1245,
"grad_norm": 2.2263966783946643,
"learning_rate": 9.728e-06,
"loss": 2.5913,
"step": 6225
},
{
"epoch": 0.125,
"grad_norm": 3.0917521864623168,
"learning_rate": 9.722444444444446e-06,
"loss": 2.5858,
"step": 6250
},
{
"epoch": 0.1255,
"grad_norm": 3.406162518240377,
"learning_rate": 9.71688888888889e-06,
"loss": 2.5824,
"step": 6275
},
{
"epoch": 0.126,
"grad_norm": 1.9288658675383707,
"learning_rate": 9.711333333333333e-06,
"loss": 2.5881,
"step": 6300
},
{
"epoch": 0.126,
"eval_loss": 2.588792085647583,
"eval_runtime": 42.1993,
"eval_samples_per_second": 2.464,
"eval_steps_per_second": 1.232,
"step": 6300
},
{
"epoch": 0.1265,
"grad_norm": 2.3054152552517557,
"learning_rate": 9.705777777777778e-06,
"loss": 2.5777,
"step": 6325
},
{
"epoch": 0.127,
"grad_norm": 2.4215099152732438,
"learning_rate": 9.700222222222224e-06,
"loss": 2.5905,
"step": 6350
},
{
"epoch": 0.1275,
"grad_norm": 2.1008082850001584,
"learning_rate": 9.694666666666667e-06,
"loss": 2.5891,
"step": 6375
},
{
"epoch": 0.128,
"grad_norm": 2.548161937775528,
"learning_rate": 9.68911111111111e-06,
"loss": 2.5828,
"step": 6400
},
{
"epoch": 0.128,
"eval_loss": 2.588566780090332,
"eval_runtime": 42.2757,
"eval_samples_per_second": 2.46,
"eval_steps_per_second": 1.23,
"step": 6400
},
{
"epoch": 0.1285,
"grad_norm": 2.1721864313913555,
"learning_rate": 9.683555555555556e-06,
"loss": 2.585,
"step": 6425
},
{
"epoch": 0.129,
"grad_norm": 2.6656100643358567,
"learning_rate": 9.678000000000001e-06,
"loss": 2.5859,
"step": 6450
},
{
"epoch": 0.1295,
"grad_norm": 2.14442087538069,
"learning_rate": 9.672444444444445e-06,
"loss": 2.5897,
"step": 6475
},
{
"epoch": 0.13,
"grad_norm": 2.544695719649347,
"learning_rate": 9.66688888888889e-06,
"loss": 2.5819,
"step": 6500
},
{
"epoch": 0.13,
"eval_loss": 2.585561990737915,
"eval_runtime": 42.2362,
"eval_samples_per_second": 2.462,
"eval_steps_per_second": 1.231,
"step": 6500
},
{
"epoch": 0.1305,
"grad_norm": 2.2451101114203724,
"learning_rate": 9.661333333333334e-06,
"loss": 2.5824,
"step": 6525
},
{
"epoch": 0.131,
"grad_norm": 2.7518738527602182,
"learning_rate": 9.655777777777779e-06,
"loss": 2.5869,
"step": 6550
},
{
"epoch": 0.1315,
"grad_norm": 2.2692401450967603,
"learning_rate": 9.650222222222222e-06,
"loss": 2.577,
"step": 6575
},
{
"epoch": 0.132,
"grad_norm": 2.0929236367500295,
"learning_rate": 9.644666666666668e-06,
"loss": 2.5732,
"step": 6600
},
{
"epoch": 0.132,
"eval_loss": 2.584359884262085,
"eval_runtime": 42.3019,
"eval_samples_per_second": 2.459,
"eval_steps_per_second": 1.229,
"step": 6600
},
{
"epoch": 0.1325,
"grad_norm": 2.5777548974093794,
"learning_rate": 9.639111111111113e-06,
"loss": 2.588,
"step": 6625
},
{
"epoch": 0.133,
"grad_norm": 3.2457146266333083,
"learning_rate": 9.633555555555556e-06,
"loss": 2.581,
"step": 6650
},
{
"epoch": 0.1335,
"grad_norm": 3.068905385924203,
"learning_rate": 9.628e-06,
"loss": 2.5819,
"step": 6675
},
{
"epoch": 0.134,
"grad_norm": 2.450321782983477,
"learning_rate": 9.622444444444445e-06,
"loss": 2.5771,
"step": 6700
},
{
"epoch": 0.134,
"eval_loss": 2.583233118057251,
"eval_runtime": 42.2202,
"eval_samples_per_second": 2.463,
"eval_steps_per_second": 1.232,
"step": 6700
},
{
"epoch": 0.1345,
"grad_norm": 3.3710381240286607,
"learning_rate": 9.61688888888889e-06,
"loss": 2.5708,
"step": 6725
},
{
"epoch": 0.135,
"grad_norm": 2.498962635333121,
"learning_rate": 9.611333333333334e-06,
"loss": 2.5777,
"step": 6750
},
{
"epoch": 0.1355,
"grad_norm": 2.9123144983870457,
"learning_rate": 9.605777777777778e-06,
"loss": 2.5754,
"step": 6775
},
{
"epoch": 0.136,
"grad_norm": 2.3715808650825347,
"learning_rate": 9.600222222222223e-06,
"loss": 2.5774,
"step": 6800
},
{
"epoch": 0.136,
"eval_loss": 2.583984375,
"eval_runtime": 42.1504,
"eval_samples_per_second": 2.467,
"eval_steps_per_second": 1.234,
"step": 6800
},
{
"epoch": 0.1365,
"grad_norm": 3.353836765177085,
"learning_rate": 9.594666666666668e-06,
"loss": 2.5802,
"step": 6825
},
{
"epoch": 0.137,
"grad_norm": 2.186970794143448,
"learning_rate": 9.589111111111112e-06,
"loss": 2.5716,
"step": 6850
},
{
"epoch": 0.1375,
"grad_norm": 2.561639852925048,
"learning_rate": 9.583555555555555e-06,
"loss": 2.5833,
"step": 6875
},
{
"epoch": 0.138,
"grad_norm": 2.657433324295019,
"learning_rate": 9.578e-06,
"loss": 2.5804,
"step": 6900
},
{
"epoch": 0.138,
"eval_loss": 2.581881046295166,
"eval_runtime": 42.115,
"eval_samples_per_second": 2.469,
"eval_steps_per_second": 1.235,
"step": 6900
},
{
"epoch": 0.1385,
"grad_norm": 3.0762488332335476,
"learning_rate": 9.572444444444446e-06,
"loss": 2.5849,
"step": 6925
},
{
"epoch": 0.139,
"grad_norm": 3.407171936606543,
"learning_rate": 9.56688888888889e-06,
"loss": 2.5745,
"step": 6950
},
{
"epoch": 0.1395,
"grad_norm": 2.343148272910383,
"learning_rate": 9.561333333333333e-06,
"loss": 2.5638,
"step": 6975
},
{
"epoch": 0.14,
"grad_norm": 2.183703414357588,
"learning_rate": 9.555777777777778e-06,
"loss": 2.5773,
"step": 7000
},
{
"epoch": 0.14,
"eval_loss": 2.580303430557251,
"eval_runtime": 42.2608,
"eval_samples_per_second": 2.461,
"eval_steps_per_second": 1.23,
"step": 7000
},
{
"epoch": 0.1405,
"grad_norm": 2.090683861216703,
"learning_rate": 9.550222222222223e-06,
"loss": 2.5752,
"step": 7025
},
{
"epoch": 0.141,
"grad_norm": 2.4462060701449575,
"learning_rate": 9.544666666666667e-06,
"loss": 2.5752,
"step": 7050
},
{
"epoch": 0.1415,
"grad_norm": 2.354572863574847,
"learning_rate": 9.539111111111112e-06,
"loss": 2.5793,
"step": 7075
},
{
"epoch": 0.142,
"grad_norm": 3.150188431313023,
"learning_rate": 9.533555555555556e-06,
"loss": 2.5829,
"step": 7100
},
{
"epoch": 0.142,
"eval_loss": 2.578876256942749,
"eval_runtime": 42.1571,
"eval_samples_per_second": 2.467,
"eval_steps_per_second": 1.233,
"step": 7100
},
{
"epoch": 0.1425,
"grad_norm": 2.2958267675435264,
"learning_rate": 9.528000000000001e-06,
"loss": 2.5819,
"step": 7125
},
{
"epoch": 0.143,
"grad_norm": 2.229576484389536,
"learning_rate": 9.522444444444444e-06,
"loss": 2.5699,
"step": 7150
},
{
"epoch": 0.1435,
"grad_norm": 2.5755824313301185,
"learning_rate": 9.51688888888889e-06,
"loss": 2.5618,
"step": 7175
},
{
"epoch": 0.144,
"grad_norm": 2.002723376168662,
"learning_rate": 9.511333333333335e-06,
"loss": 2.5765,
"step": 7200
},
{
"epoch": 0.144,
"eval_loss": 2.578125,
"eval_runtime": 42.1705,
"eval_samples_per_second": 2.466,
"eval_steps_per_second": 1.233,
"step": 7200
},
{
"epoch": 0.1445,
"grad_norm": 2.4322674164363693,
"learning_rate": 9.505777777777779e-06,
"loss": 2.5787,
"step": 7225
},
{
"epoch": 0.145,
"grad_norm": 2.3686555525010795,
"learning_rate": 9.500222222222222e-06,
"loss": 2.5675,
"step": 7250
},
{
"epoch": 0.1455,
"grad_norm": 3.104821188519679,
"learning_rate": 9.494666666666667e-06,
"loss": 2.5746,
"step": 7275
},
{
"epoch": 0.146,
"grad_norm": 2.8814875220913523,
"learning_rate": 9.489111111111113e-06,
"loss": 2.569,
"step": 7300
},
{
"epoch": 0.146,
"eval_loss": 2.575345516204834,
"eval_runtime": 42.4072,
"eval_samples_per_second": 2.452,
"eval_steps_per_second": 1.226,
"step": 7300
},
{
"epoch": 0.1465,
"grad_norm": 2.431219059778247,
"learning_rate": 9.483555555555556e-06,
"loss": 2.5671,
"step": 7325
},
{
"epoch": 0.147,
"grad_norm": 3.105679752764214,
"learning_rate": 9.478e-06,
"loss": 2.5735,
"step": 7350
},
{
"epoch": 0.1475,
"grad_norm": 2.3844745428357528,
"learning_rate": 9.472444444444445e-06,
"loss": 2.5704,
"step": 7375
},
{
"epoch": 0.148,
"grad_norm": 3.1780151194050537,
"learning_rate": 9.46688888888889e-06,
"loss": 2.5754,
"step": 7400
},
{
"epoch": 0.148,
"eval_loss": 2.574970006942749,
"eval_runtime": 42.2781,
"eval_samples_per_second": 2.46,
"eval_steps_per_second": 1.23,
"step": 7400
},
{
"epoch": 0.1485,
"grad_norm": 2.8536998258405872,
"learning_rate": 9.461333333333334e-06,
"loss": 2.5737,
"step": 7425
},
{
"epoch": 0.149,
"grad_norm": 1.929486707486442,
"learning_rate": 9.455777777777777e-06,
"loss": 2.5794,
"step": 7450
},
{
"epoch": 0.1495,
"grad_norm": 2.3851860938995557,
"learning_rate": 9.450222222222223e-06,
"loss": 2.57,
"step": 7475
},
{
"epoch": 0.15,
"grad_norm": 2.274555727546256,
"learning_rate": 9.444666666666668e-06,
"loss": 2.5846,
"step": 7500
},
{
"epoch": 0.15,
"eval_loss": 2.575045108795166,
"eval_runtime": 42.2565,
"eval_samples_per_second": 2.461,
"eval_steps_per_second": 1.231,
"step": 7500
},
{
"epoch": 0.1505,
"grad_norm": 2.990595981559867,
"learning_rate": 9.439111111111111e-06,
"loss": 2.5635,
"step": 7525
},
{
"epoch": 0.151,
"grad_norm": 2.342033024484832,
"learning_rate": 9.433555555555557e-06,
"loss": 2.5682,
"step": 7550
},
{
"epoch": 0.1515,
"grad_norm": 3.278259902418593,
"learning_rate": 9.428e-06,
"loss": 2.5684,
"step": 7575
},
{
"epoch": 0.152,
"grad_norm": 3.323218206618402,
"learning_rate": 9.422444444444445e-06,
"loss": 2.5657,
"step": 7600
},
{
"epoch": 0.152,
"eval_loss": 2.574444055557251,
"eval_runtime": 42.1106,
"eval_samples_per_second": 2.47,
"eval_steps_per_second": 1.235,
"step": 7600
},
{
"epoch": 0.1525,
"grad_norm": 2.613661230948087,
"learning_rate": 9.41688888888889e-06,
"loss": 2.5677,
"step": 7625
},
{
"epoch": 0.153,
"grad_norm": 2.1447049265831795,
"learning_rate": 9.411333333333334e-06,
"loss": 2.5772,
"step": 7650
},
{
"epoch": 0.1535,
"grad_norm": 2.074773482377195,
"learning_rate": 9.405777777777778e-06,
"loss": 2.5676,
"step": 7675
},
{
"epoch": 0.154,
"grad_norm": 2.2189972936163063,
"learning_rate": 9.400222222222223e-06,
"loss": 2.565,
"step": 7700
},
{
"epoch": 0.154,
"eval_loss": 2.572340726852417,
"eval_runtime": 42.163,
"eval_samples_per_second": 2.467,
"eval_steps_per_second": 1.233,
"step": 7700
},
{
"epoch": 0.1545,
"grad_norm": 2.020007297414947,
"learning_rate": 9.394666666666668e-06,
"loss": 2.5758,
"step": 7725
},
{
"epoch": 0.155,
"grad_norm": 2.6124546923876606,
"learning_rate": 9.389111111111112e-06,
"loss": 2.5723,
"step": 7750
},
{
"epoch": 0.1555,
"grad_norm": 3.144872673868399,
"learning_rate": 9.383555555555557e-06,
"loss": 2.5642,
"step": 7775
},
{
"epoch": 0.156,
"grad_norm": 2.3755756320446393,
"learning_rate": 9.378e-06,
"loss": 2.5684,
"step": 7800
},
{
"epoch": 0.156,
"eval_loss": 2.571063756942749,
"eval_runtime": 42.2055,
"eval_samples_per_second": 2.464,
"eval_steps_per_second": 1.232,
"step": 7800
},
{
"epoch": 0.1565,
"grad_norm": 2.640695576224425,
"learning_rate": 9.372444444444446e-06,
"loss": 2.5735,
"step": 7825
},
{
"epoch": 0.157,
"grad_norm": 2.063148667839031,
"learning_rate": 9.36688888888889e-06,
"loss": 2.5665,
"step": 7850
},
{
"epoch": 0.1575,
"grad_norm": 2.016530541107887,
"learning_rate": 9.361333333333335e-06,
"loss": 2.5595,
"step": 7875
},
{
"epoch": 0.158,
"grad_norm": 2.4121763950632578,
"learning_rate": 9.355777777777778e-06,
"loss": 2.5661,
"step": 7900
},
{
"epoch": 0.158,
"eval_loss": 2.571364164352417,
"eval_runtime": 42.2366,
"eval_samples_per_second": 2.462,
"eval_steps_per_second": 1.231,
"step": 7900
},
{
"epoch": 0.1585,
"grad_norm": 3.1944792712012062,
"learning_rate": 9.350222222222224e-06,
"loss": 2.571,
"step": 7925
},
{
"epoch": 0.159,
"grad_norm": 2.624931566803773,
"learning_rate": 9.344666666666667e-06,
"loss": 2.5659,
"step": 7950
},
{
"epoch": 0.1595,
"grad_norm": 2.5196026490718086,
"learning_rate": 9.339111111111112e-06,
"loss": 2.5533,
"step": 7975
},
{
"epoch": 0.16,
"grad_norm": 2.466395654185627,
"learning_rate": 9.333555555555558e-06,
"loss": 2.5648,
"step": 8000
},
{
"epoch": 0.16,
"eval_loss": 2.568809986114502,
"eval_runtime": 45.0346,
"eval_samples_per_second": 2.309,
"eval_steps_per_second": 1.155,
"step": 8000
},
{
"epoch": 0.1605,
"grad_norm": 2.6560367873629835,
"learning_rate": 9.328000000000001e-06,
"loss": 2.5588,
"step": 8025
},
{
"epoch": 0.161,
"grad_norm": 2.2401297319157614,
"learning_rate": 9.322444444444445e-06,
"loss": 2.564,
"step": 8050
},
{
"epoch": 0.1615,
"grad_norm": 2.2847898029930653,
"learning_rate": 9.31688888888889e-06,
"loss": 2.5643,
"step": 8075
},
{
"epoch": 0.162,
"grad_norm": 2.798251121826375,
"learning_rate": 9.311333333333335e-06,
"loss": 2.5577,
"step": 8100
},
{
"epoch": 0.162,
"eval_loss": 2.568058967590332,
"eval_runtime": 42.5915,
"eval_samples_per_second": 2.442,
"eval_steps_per_second": 1.221,
"step": 8100
},
{
"epoch": 0.1625,
"grad_norm": 2.0139748360698895,
"learning_rate": 9.305777777777779e-06,
"loss": 2.5716,
"step": 8125
},
{
"epoch": 0.163,
"grad_norm": 2.052859658987244,
"learning_rate": 9.300222222222222e-06,
"loss": 2.5555,
"step": 8150
},
{
"epoch": 0.1635,
"grad_norm": 2.6452792973388584,
"learning_rate": 9.294666666666668e-06,
"loss": 2.5545,
"step": 8175
},
{
"epoch": 0.164,
"grad_norm": 2.8085427073848543,
"learning_rate": 9.289111111111113e-06,
"loss": 2.5575,
"step": 8200
},
{
"epoch": 0.164,
"eval_loss": 2.56640625,
"eval_runtime": 42.2476,
"eval_samples_per_second": 2.462,
"eval_steps_per_second": 1.231,
"step": 8200
},
{
"epoch": 0.1645,
"grad_norm": 1.994417686652318,
"learning_rate": 9.283555555555556e-06,
"loss": 2.5634,
"step": 8225
},
{
"epoch": 0.165,
"grad_norm": 2.8569259303287917,
"learning_rate": 9.278e-06,
"loss": 2.5711,
"step": 8250
},
{
"epoch": 0.1655,
"grad_norm": 2.15031573602464,
"learning_rate": 9.272444444444445e-06,
"loss": 2.5515,
"step": 8275
},
{
"epoch": 0.166,
"grad_norm": 2.1903087160864234,
"learning_rate": 9.26688888888889e-06,
"loss": 2.5588,
"step": 8300
},
{
"epoch": 0.166,
"eval_loss": 2.565354585647583,
"eval_runtime": 42.2533,
"eval_samples_per_second": 2.461,
"eval_steps_per_second": 1.231,
"step": 8300
},
{
"epoch": 0.1665,
"grad_norm": 2.1661066402797697,
"learning_rate": 9.261333333333334e-06,
"loss": 2.5582,
"step": 8325
},
{
"epoch": 0.167,
"grad_norm": 2.3738673472152603,
"learning_rate": 9.25577777777778e-06,
"loss": 2.5598,
"step": 8350
},
{
"epoch": 0.1675,
"grad_norm": 1.893415788443222,
"learning_rate": 9.250222222222223e-06,
"loss": 2.5553,
"step": 8375
},
{
"epoch": 0.168,
"grad_norm": 3.245074933027149,
"learning_rate": 9.244666666666668e-06,
"loss": 2.5632,
"step": 8400
},
{
"epoch": 0.168,
"eval_loss": 2.565354585647583,
"eval_runtime": 42.2015,
"eval_samples_per_second": 2.464,
"eval_steps_per_second": 1.232,
"step": 8400
},
{
"epoch": 0.1685,
"grad_norm": 2.359910509969222,
"learning_rate": 9.239111111111112e-06,
"loss": 2.5564,
"step": 8425
},
{
"epoch": 0.169,
"grad_norm": 2.1851033577602355,
"learning_rate": 9.233555555555557e-06,
"loss": 2.5532,
"step": 8450
},
{
"epoch": 0.1695,
"grad_norm": 2.0954334474208443,
"learning_rate": 9.228e-06,
"loss": 2.5585,
"step": 8475
},
{
"epoch": 0.17,
"grad_norm": 2.326393982849659,
"learning_rate": 9.222444444444446e-06,
"loss": 2.5639,
"step": 8500
},
{
"epoch": 0.17,
"eval_loss": 2.564678430557251,
"eval_runtime": 42.3289,
"eval_samples_per_second": 2.457,
"eval_steps_per_second": 1.228,
"step": 8500
},
{
"epoch": 0.1705,
"grad_norm": 2.016190269867033,
"learning_rate": 9.21688888888889e-06,
"loss": 2.555,
"step": 8525
},
{
"epoch": 0.171,
"grad_norm": 2.1491011270580294,
"learning_rate": 9.211333333333334e-06,
"loss": 2.5525,
"step": 8550
},
{
"epoch": 0.1715,
"grad_norm": 2.401949244376787,
"learning_rate": 9.20577777777778e-06,
"loss": 2.5548,
"step": 8575
},
{
"epoch": 0.172,
"grad_norm": 2.6617222137871894,
"learning_rate": 9.200222222222223e-06,
"loss": 2.5567,
"step": 8600
},
{
"epoch": 0.172,
"eval_loss": 2.563025951385498,
"eval_runtime": 42.4626,
"eval_samples_per_second": 2.449,
"eval_steps_per_second": 1.225,
"step": 8600
},
{
"epoch": 0.1725,
"grad_norm": 1.939490462750623,
"learning_rate": 9.194666666666667e-06,
"loss": 2.5605,
"step": 8625
},
{
"epoch": 0.173,
"grad_norm": 2.7453592449199395,
"learning_rate": 9.189111111111112e-06,
"loss": 2.5522,
"step": 8650
},
{
"epoch": 0.1735,
"grad_norm": 2.669405830526754,
"learning_rate": 9.183555555555557e-06,
"loss": 2.5511,
"step": 8675
},
{
"epoch": 0.174,
"grad_norm": 2.483852860875828,
"learning_rate": 9.178000000000001e-06,
"loss": 2.5374,
"step": 8700
},
{
"epoch": 0.174,
"eval_loss": 2.562575101852417,
"eval_runtime": 42.6032,
"eval_samples_per_second": 2.441,
"eval_steps_per_second": 1.221,
"step": 8700
},
{
"epoch": 0.1745,
"grad_norm": 2.228436266030111,
"learning_rate": 9.172444444444444e-06,
"loss": 2.5469,
"step": 8725
},
{
"epoch": 0.175,
"grad_norm": 2.4160405582786306,
"learning_rate": 9.16688888888889e-06,
"loss": 2.5665,
"step": 8750
},
{
"epoch": 0.1755,
"grad_norm": 2.80965451621207,
"learning_rate": 9.161333333333335e-06,
"loss": 2.5542,
"step": 8775
},
{
"epoch": 0.176,
"grad_norm": 2.4851305844565386,
"learning_rate": 9.155777777777779e-06,
"loss": 2.5642,
"step": 8800
},
{
"epoch": 0.176,
"eval_loss": 2.561298131942749,
"eval_runtime": 42.4008,
"eval_samples_per_second": 2.453,
"eval_steps_per_second": 1.226,
"step": 8800
},
{
"epoch": 0.1765,
"grad_norm": 2.70253728592914,
"learning_rate": 9.150222222222222e-06,
"loss": 2.5653,
"step": 8825
},
{
"epoch": 0.177,
"grad_norm": 1.9507837259092773,
"learning_rate": 9.144666666666667e-06,
"loss": 2.5711,
"step": 8850
},
{
"epoch": 0.1775,
"grad_norm": 2.6311592623116926,
"learning_rate": 9.139111111111113e-06,
"loss": 2.5561,
"step": 8875
},
{
"epoch": 0.178,
"grad_norm": 2.5742422293958125,
"learning_rate": 9.133555555555556e-06,
"loss": 2.5551,
"step": 8900
},
{
"epoch": 0.178,
"eval_loss": 2.559945821762085,
"eval_runtime": 42.3142,
"eval_samples_per_second": 2.458,
"eval_steps_per_second": 1.229,
"step": 8900
},
{
"epoch": 0.1785,
"grad_norm": 2.017430018376759,
"learning_rate": 9.128e-06,
"loss": 2.5556,
"step": 8925
},
{
"epoch": 0.179,
"grad_norm": 2.2568307097241616,
"learning_rate": 9.122444444444445e-06,
"loss": 2.5643,
"step": 8950
},
{
"epoch": 0.1795,
"grad_norm": 2.7132996198893404,
"learning_rate": 9.11688888888889e-06,
"loss": 2.5469,
"step": 8975
},
{
"epoch": 0.18,
"grad_norm": 2.6678319001386117,
"learning_rate": 9.111333333333334e-06,
"loss": 2.5482,
"step": 9000
},
{
"epoch": 0.18,
"eval_loss": 2.560246467590332,
"eval_runtime": 42.2661,
"eval_samples_per_second": 2.461,
"eval_steps_per_second": 1.23,
"step": 9000
},
{
"epoch": 0.1805,
"grad_norm": 2.7335192428299697,
"learning_rate": 9.105777777777779e-06,
"loss": 2.56,
"step": 9025
},
{
"epoch": 0.181,
"grad_norm": 2.616833970329197,
"learning_rate": 9.100222222222223e-06,
"loss": 2.5659,
"step": 9050
},
{
"epoch": 0.1815,
"grad_norm": 2.636296249975529,
"learning_rate": 9.094666666666668e-06,
"loss": 2.5605,
"step": 9075
},
{
"epoch": 0.182,
"grad_norm": 2.1413102875849828,
"learning_rate": 9.089111111111111e-06,
"loss": 2.5454,
"step": 9100
},
{
"epoch": 0.182,
"eval_loss": 2.558293342590332,
"eval_runtime": 42.2294,
"eval_samples_per_second": 2.463,
"eval_steps_per_second": 1.231,
"step": 9100
},
{
"epoch": 0.1825,
"grad_norm": 2.195374313863304,
"learning_rate": 9.083555555555557e-06,
"loss": 2.5584,
"step": 9125
},
{
"epoch": 0.183,
"grad_norm": 2.9470418486379546,
"learning_rate": 9.078000000000002e-06,
"loss": 2.5604,
"step": 9150
},
{
"epoch": 0.1835,
"grad_norm": 1.9289932950554558,
"learning_rate": 9.072444444444445e-06,
"loss": 2.5529,
"step": 9175
},
{
"epoch": 0.184,
"grad_norm": 2.905671046574134,
"learning_rate": 9.066888888888889e-06,
"loss": 2.5551,
"step": 9200
},
{
"epoch": 0.184,
"eval_loss": 2.558293342590332,
"eval_runtime": 42.216,
"eval_samples_per_second": 2.464,
"eval_steps_per_second": 1.232,
"step": 9200
},
{
"epoch": 0.1845,
"grad_norm": 2.8062526156064522,
"learning_rate": 9.061333333333334e-06,
"loss": 2.5438,
"step": 9225
},
{
"epoch": 0.185,
"grad_norm": 2.543328123273362,
"learning_rate": 9.05577777777778e-06,
"loss": 2.5476,
"step": 9250
},
{
"epoch": 0.1855,
"grad_norm": 2.396296044779414,
"learning_rate": 9.050222222222223e-06,
"loss": 2.5437,
"step": 9275
},
{
"epoch": 0.186,
"grad_norm": 1.980055565462775,
"learning_rate": 9.044666666666667e-06,
"loss": 2.5552,
"step": 9300
},
{
"epoch": 0.186,
"eval_loss": 2.557692289352417,
"eval_runtime": 42.6636,
"eval_samples_per_second": 2.438,
"eval_steps_per_second": 1.219,
"step": 9300
},
{
"epoch": 0.1865,
"grad_norm": 2.028891972183573,
"learning_rate": 9.039111111111112e-06,
"loss": 2.5603,
"step": 9325
},
{
"epoch": 0.187,
"grad_norm": 2.244801606614392,
"learning_rate": 9.033555555555557e-06,
"loss": 2.5565,
"step": 9350
},
{
"epoch": 0.1875,
"grad_norm": 2.6445168963619348,
"learning_rate": 9.028e-06,
"loss": 2.5453,
"step": 9375
},
{
"epoch": 0.188,
"grad_norm": 2.2015819629656543,
"learning_rate": 9.022444444444444e-06,
"loss": 2.5463,
"step": 9400
},
{
"epoch": 0.188,
"eval_loss": 2.555739164352417,
"eval_runtime": 44.4913,
"eval_samples_per_second": 2.338,
"eval_steps_per_second": 1.169,
"step": 9400
},
{
"epoch": 0.1885,
"grad_norm": 2.0871782907981076,
"learning_rate": 9.01688888888889e-06,
"loss": 2.5494,
"step": 9425
},
{
"epoch": 0.189,
"grad_norm": 2.3339796044543006,
"learning_rate": 9.011333333333335e-06,
"loss": 2.562,
"step": 9450
},
{
"epoch": 0.1895,
"grad_norm": 2.5447600145368257,
"learning_rate": 9.005777777777778e-06,
"loss": 2.5613,
"step": 9475
},
{
"epoch": 0.19,
"grad_norm": 2.2530767222642805,
"learning_rate": 9.000222222222222e-06,
"loss": 2.5561,
"step": 9500
},
{
"epoch": 0.19,
"eval_loss": 2.555588960647583,
"eval_runtime": 42.3312,
"eval_samples_per_second": 2.457,
"eval_steps_per_second": 1.228,
"step": 9500
},
{
"epoch": 0.1905,
"grad_norm": 2.2878227597512146,
"learning_rate": 8.994666666666667e-06,
"loss": 2.549,
"step": 9525
},
{
"epoch": 0.191,
"grad_norm": 3.0478077786015088,
"learning_rate": 8.989111111111112e-06,
"loss": 2.5588,
"step": 9550
},
{
"epoch": 0.1915,
"grad_norm": 2.499301869546187,
"learning_rate": 8.983555555555556e-06,
"loss": 2.5529,
"step": 9575
},
{
"epoch": 0.192,
"grad_norm": 2.337747110130922,
"learning_rate": 8.978000000000001e-06,
"loss": 2.5485,
"step": 9600
},
{
"epoch": 0.192,
"eval_loss": 2.554462194442749,
"eval_runtime": 42.3,
"eval_samples_per_second": 2.459,
"eval_steps_per_second": 1.229,
"step": 9600
},
{
"epoch": 0.1925,
"grad_norm": 2.199520541356511,
"learning_rate": 8.972444444444445e-06,
"loss": 2.5484,
"step": 9625
},
{
"epoch": 0.193,
"grad_norm": 2.0965551340270663,
"learning_rate": 8.96688888888889e-06,
"loss": 2.5469,
"step": 9650
},
{
"epoch": 0.1935,
"grad_norm": 2.299106466929266,
"learning_rate": 8.961333333333333e-06,
"loss": 2.5418,
"step": 9675
},
{
"epoch": 0.194,
"grad_norm": 2.4569979839281446,
"learning_rate": 8.955777777777779e-06,
"loss": 2.5539,
"step": 9700
},
{
"epoch": 0.194,
"eval_loss": 2.553786039352417,
"eval_runtime": 42.2974,
"eval_samples_per_second": 2.459,
"eval_steps_per_second": 1.229,
"step": 9700
},
{
"epoch": 0.1945,
"grad_norm": 2.041615655285428,
"learning_rate": 8.950222222222224e-06,
"loss": 2.5423,
"step": 9725
},
{
"epoch": 0.195,
"grad_norm": 2.3161296628839434,
"learning_rate": 8.944666666666668e-06,
"loss": 2.5425,
"step": 9750
},
{
"epoch": 0.1955,
"grad_norm": 1.9404726428231058,
"learning_rate": 8.939111111111111e-06,
"loss": 2.5574,
"step": 9775
},
{
"epoch": 0.196,
"grad_norm": 2.1787807307174596,
"learning_rate": 8.933555555555556e-06,
"loss": 2.5601,
"step": 9800
},
{
"epoch": 0.196,
"eval_loss": 2.553335428237915,
"eval_runtime": 42.3156,
"eval_samples_per_second": 2.458,
"eval_steps_per_second": 1.229,
"step": 9800
},
{
"epoch": 0.1965,
"grad_norm": 2.159828577335103,
"learning_rate": 8.928000000000002e-06,
"loss": 2.5374,
"step": 9825
},
{
"epoch": 0.197,
"grad_norm": 2.212298780606798,
"learning_rate": 8.922444444444445e-06,
"loss": 2.5421,
"step": 9850
},
{
"epoch": 0.1975,
"grad_norm": 2.4629384962810685,
"learning_rate": 8.916888888888889e-06,
"loss": 2.544,
"step": 9875
},
{
"epoch": 0.198,
"grad_norm": 2.2323138923920145,
"learning_rate": 8.911333333333334e-06,
"loss": 2.5538,
"step": 9900
},
{
"epoch": 0.198,
"eval_loss": 2.552133321762085,
"eval_runtime": 42.0858,
"eval_samples_per_second": 2.471,
"eval_steps_per_second": 1.236,
"step": 9900
},
{
"epoch": 0.1985,
"grad_norm": 2.2719465467364057,
"learning_rate": 8.90577777777778e-06,
"loss": 2.5478,
"step": 9925
},
{
"epoch": 0.199,
"grad_norm": 2.705917304760513,
"learning_rate": 8.900222222222223e-06,
"loss": 2.5278,
"step": 9950
},
{
"epoch": 0.1995,
"grad_norm": 1.7785859357117906,
"learning_rate": 8.894666666666666e-06,
"loss": 2.5477,
"step": 9975
},
{
"epoch": 0.2,
"grad_norm": 2.480488966768482,
"learning_rate": 8.889111111111112e-06,
"loss": 2.5415,
"step": 10000
},
{
"epoch": 0.2,
"eval_loss": 2.552133321762085,
"eval_runtime": 42.2581,
"eval_samples_per_second": 2.461,
"eval_steps_per_second": 1.231,
"step": 10000
},
{
"epoch": 0.2005,
"grad_norm": 3.0378947508990453,
"learning_rate": 8.883555555555557e-06,
"loss": 2.5449,
"step": 10025
},
{
"epoch": 0.201,
"grad_norm": 2.995635037144703,
"learning_rate": 8.878e-06,
"loss": 2.5406,
"step": 10050
},
{
"epoch": 0.2015,
"grad_norm": 2.198045707343682,
"learning_rate": 8.872444444444444e-06,
"loss": 2.53,
"step": 10075
},
{
"epoch": 0.202,
"grad_norm": 2.4083638230263946,
"learning_rate": 8.86688888888889e-06,
"loss": 2.5468,
"step": 10100
},
{
"epoch": 0.202,
"eval_loss": 2.550405740737915,
"eval_runtime": 42.0945,
"eval_samples_per_second": 2.471,
"eval_steps_per_second": 1.235,
"step": 10100
},
{
"epoch": 0.2025,
"grad_norm": 2.0842503072786958,
"learning_rate": 8.861333333333334e-06,
"loss": 2.5342,
"step": 10125
},
{
"epoch": 0.203,
"grad_norm": 2.1409770634433665,
"learning_rate": 8.855777777777778e-06,
"loss": 2.5459,
"step": 10150
},
{
"epoch": 0.2035,
"grad_norm": 1.8019290797971257,
"learning_rate": 8.850222222222223e-06,
"loss": 2.5489,
"step": 10175
},
{
"epoch": 0.204,
"grad_norm": 2.159224946702751,
"learning_rate": 8.844666666666667e-06,
"loss": 2.5402,
"step": 10200
},
{
"epoch": 0.204,
"eval_loss": 2.550255298614502,
"eval_runtime": 42.1763,
"eval_samples_per_second": 2.466,
"eval_steps_per_second": 1.233,
"step": 10200
},
{
"epoch": 0.2045,
"grad_norm": 2.149785275250866,
"learning_rate": 8.839111111111112e-06,
"loss": 2.548,
"step": 10225
},
{
"epoch": 0.205,
"grad_norm": 2.6911078360763874,
"learning_rate": 8.833555555555556e-06,
"loss": 2.5447,
"step": 10250
},
{
"epoch": 0.2055,
"grad_norm": 2.254737041517942,
"learning_rate": 8.828000000000001e-06,
"loss": 2.548,
"step": 10275
},
{
"epoch": 0.206,
"grad_norm": 2.2852857848915,
"learning_rate": 8.822444444444446e-06,
"loss": 2.5371,
"step": 10300
},
{
"epoch": 0.206,
"eval_loss": 2.549729585647583,
"eval_runtime": 42.1465,
"eval_samples_per_second": 2.468,
"eval_steps_per_second": 1.234,
"step": 10300
},
{
"epoch": 0.2065,
"grad_norm": 2.5498279388836425,
"learning_rate": 8.81688888888889e-06,
"loss": 2.536,
"step": 10325
},
{
"epoch": 0.207,
"grad_norm": 2.2620660537006385,
"learning_rate": 8.811333333333333e-06,
"loss": 2.5478,
"step": 10350
},
{
"epoch": 0.2075,
"grad_norm": 2.06322927545459,
"learning_rate": 8.805777777777778e-06,
"loss": 2.5421,
"step": 10375
},
{
"epoch": 0.208,
"grad_norm": 1.9770079692771143,
"learning_rate": 8.800222222222224e-06,
"loss": 2.5519,
"step": 10400
},
{
"epoch": 0.208,
"eval_loss": 2.549504280090332,
"eval_runtime": 42.0501,
"eval_samples_per_second": 2.473,
"eval_steps_per_second": 1.237,
"step": 10400
},
{
"epoch": 0.2085,
"grad_norm": 2.837285948836536,
"learning_rate": 8.794666666666667e-06,
"loss": 2.5387,
"step": 10425
},
{
"epoch": 0.209,
"grad_norm": 2.0428174767585086,
"learning_rate": 8.78911111111111e-06,
"loss": 2.5398,
"step": 10450
},
{
"epoch": 0.2095,
"grad_norm": 1.826545976894172,
"learning_rate": 8.783555555555556e-06,
"loss": 2.5398,
"step": 10475
},
{
"epoch": 0.21,
"grad_norm": 2.335064875387599,
"learning_rate": 8.778000000000001e-06,
"loss": 2.5323,
"step": 10500
},
{
"epoch": 0.21,
"eval_loss": 2.548001766204834,
"eval_runtime": 44.9592,
"eval_samples_per_second": 2.313,
"eval_steps_per_second": 1.157,
"step": 10500
},
{
"epoch": 0.2105,
"grad_norm": 2.1349530306908746,
"learning_rate": 8.772444444444445e-06,
"loss": 2.5322,
"step": 10525
},
{
"epoch": 0.211,
"grad_norm": 2.2099539420109706,
"learning_rate": 8.766888888888888e-06,
"loss": 2.552,
"step": 10550
},
{
"epoch": 0.2115,
"grad_norm": 2.185692829530028,
"learning_rate": 8.761333333333334e-06,
"loss": 2.537,
"step": 10575
},
{
"epoch": 0.212,
"grad_norm": 2.2842207172577087,
"learning_rate": 8.755777777777779e-06,
"loss": 2.5373,
"step": 10600
},
{
"epoch": 0.212,
"eval_loss": 2.547701358795166,
"eval_runtime": 42.1838,
"eval_samples_per_second": 2.465,
"eval_steps_per_second": 1.233,
"step": 10600
},
{
"epoch": 0.2125,
"grad_norm": 1.9972991885719102,
"learning_rate": 8.750222222222223e-06,
"loss": 2.5319,
"step": 10625
},
{
"epoch": 0.213,
"grad_norm": 2.330105056727183,
"learning_rate": 8.744666666666666e-06,
"loss": 2.5388,
"step": 10650
},
{
"epoch": 0.2135,
"grad_norm": 2.70628718016926,
"learning_rate": 8.739111111111111e-06,
"loss": 2.5303,
"step": 10675
},
{
"epoch": 0.214,
"grad_norm": 2.4584947239335624,
"learning_rate": 8.733555555555557e-06,
"loss": 2.5342,
"step": 10700
},
{
"epoch": 0.214,
"eval_loss": 2.546649694442749,
"eval_runtime": 42.0732,
"eval_samples_per_second": 2.472,
"eval_steps_per_second": 1.236,
"step": 10700
},
{
"epoch": 0.2145,
"grad_norm": 2.214087371322184,
"learning_rate": 8.728e-06,
"loss": 2.5421,
"step": 10725
},
{
"epoch": 0.215,
"grad_norm": 2.6528158070317245,
"learning_rate": 8.722444444444445e-06,
"loss": 2.5444,
"step": 10750
},
{
"epoch": 0.2155,
"grad_norm": 2.346998333067942,
"learning_rate": 8.716888888888889e-06,
"loss": 2.5443,
"step": 10775
},
{
"epoch": 0.216,
"grad_norm": 2.3982005375452013,
"learning_rate": 8.711333333333334e-06,
"loss": 2.5355,
"step": 10800
},
{
"epoch": 0.216,
"eval_loss": 2.546048641204834,
"eval_runtime": 42.068,
"eval_samples_per_second": 2.472,
"eval_steps_per_second": 1.236,
"step": 10800
},
{
"epoch": 0.2165,
"grad_norm": 3.0048318722769762,
"learning_rate": 8.705777777777778e-06,
"loss": 2.5394,
"step": 10825
},
{
"epoch": 0.217,
"grad_norm": 2.0272377886620037,
"learning_rate": 8.700222222222223e-06,
"loss": 2.5464,
"step": 10850
},
{
"epoch": 0.2175,
"grad_norm": 2.652472330601305,
"learning_rate": 8.694666666666668e-06,
"loss": 2.529,
"step": 10875
},
{
"epoch": 0.218,
"grad_norm": 2.2445164925981307,
"learning_rate": 8.689111111111112e-06,
"loss": 2.5427,
"step": 10900
},
{
"epoch": 0.218,
"eval_loss": 2.545748233795166,
"eval_runtime": 42.3618,
"eval_samples_per_second": 2.455,
"eval_steps_per_second": 1.228,
"step": 10900
},
{
"epoch": 0.2185,
"grad_norm": 2.201461546405023,
"learning_rate": 8.683555555555555e-06,
"loss": 2.5393,
"step": 10925
},
{
"epoch": 0.219,
"grad_norm": 2.3583119593823674,
"learning_rate": 8.678e-06,
"loss": 2.5427,
"step": 10950
},
{
"epoch": 0.2195,
"grad_norm": 2.1379039245727403,
"learning_rate": 8.672444444444446e-06,
"loss": 2.5356,
"step": 10975
},
{
"epoch": 0.22,
"grad_norm": 2.5685849750637084,
"learning_rate": 8.66688888888889e-06,
"loss": 2.5457,
"step": 11000
},
{
"epoch": 0.22,
"eval_loss": 2.546349048614502,
"eval_runtime": 42.135,
"eval_samples_per_second": 2.468,
"eval_steps_per_second": 1.234,
"step": 11000
},
{
"epoch": 0.2205,
"grad_norm": 2.324853539087807,
"learning_rate": 8.661333333333335e-06,
"loss": 2.5285,
"step": 11025
},
{
"epoch": 0.221,
"grad_norm": 1.9658509127735029,
"learning_rate": 8.655777777777778e-06,
"loss": 2.5219,
"step": 11050
},
{
"epoch": 0.2215,
"grad_norm": 2.53943222758357,
"learning_rate": 8.650222222222223e-06,
"loss": 2.5272,
"step": 11075
},
{
"epoch": 0.222,
"grad_norm": 2.3198644963527775,
"learning_rate": 8.644666666666669e-06,
"loss": 2.5409,
"step": 11100
},
{
"epoch": 0.222,
"eval_loss": 2.544395923614502,
"eval_runtime": 42.0045,
"eval_samples_per_second": 2.476,
"eval_steps_per_second": 1.238,
"step": 11100
},
{
"epoch": 0.2225,
"grad_norm": 2.3120626804419375,
"learning_rate": 8.639111111111112e-06,
"loss": 2.547,
"step": 11125
},
{
"epoch": 0.223,
"grad_norm": 2.3238512646839773,
"learning_rate": 8.633555555555556e-06,
"loss": 2.5378,
"step": 11150
},
{
"epoch": 0.2235,
"grad_norm": 2.2746777774566107,
"learning_rate": 8.628000000000001e-06,
"loss": 2.5298,
"step": 11175
},
{
"epoch": 0.224,
"grad_norm": 2.2756339157469934,
"learning_rate": 8.622444444444446e-06,
"loss": 2.5293,
"step": 11200
},
{
"epoch": 0.224,
"eval_loss": 2.544095516204834,
"eval_runtime": 42.2435,
"eval_samples_per_second": 2.462,
"eval_steps_per_second": 1.231,
"step": 11200
},
{
"epoch": 0.2245,
"grad_norm": 2.069926826217822,
"learning_rate": 8.61688888888889e-06,
"loss": 2.5401,
"step": 11225
},
{
"epoch": 0.225,
"grad_norm": 2.322441839423337,
"learning_rate": 8.611333333333333e-06,
"loss": 2.5417,
"step": 11250
},
{
"epoch": 0.2255,
"grad_norm": 3.726100896647911,
"learning_rate": 8.605777777777779e-06,
"loss": 2.5311,
"step": 11275
},
{
"epoch": 0.226,
"grad_norm": 3.3085164517610632,
"learning_rate": 8.600222222222224e-06,
"loss": 2.5433,
"step": 11300
},
{
"epoch": 0.226,
"eval_loss": 2.541391134262085,
"eval_runtime": 42.0958,
"eval_samples_per_second": 2.471,
"eval_steps_per_second": 1.235,
"step": 11300
},
{
"epoch": 0.2265,
"grad_norm": 2.3467605461379324,
"learning_rate": 8.594666666666668e-06,
"loss": 2.5293,
"step": 11325
},
{
"epoch": 0.227,
"grad_norm": 2.2566795917134637,
"learning_rate": 8.589111111111111e-06,
"loss": 2.5383,
"step": 11350
},
{
"epoch": 0.2275,
"grad_norm": 1.9604293201194958,
"learning_rate": 8.583555555555556e-06,
"loss": 2.5466,
"step": 11375
},
{
"epoch": 0.228,
"grad_norm": 2.7705828556158907,
"learning_rate": 8.578000000000002e-06,
"loss": 2.54,
"step": 11400
},
{
"epoch": 0.228,
"eval_loss": 2.543344259262085,
"eval_runtime": 42.1958,
"eval_samples_per_second": 2.465,
"eval_steps_per_second": 1.232,
"step": 11400
},
{
"epoch": 0.2285,
"grad_norm": 2.1573913228005392,
"learning_rate": 8.572444444444445e-06,
"loss": 2.544,
"step": 11425
},
{
"epoch": 0.229,
"grad_norm": 2.4499651434376264,
"learning_rate": 8.56688888888889e-06,
"loss": 2.543,
"step": 11450
},
{
"epoch": 0.2295,
"grad_norm": 2.1343769951292204,
"learning_rate": 8.561333333333334e-06,
"loss": 2.5568,
"step": 11475
},
{
"epoch": 0.23,
"grad_norm": 1.930848949528708,
"learning_rate": 8.55577777777778e-06,
"loss": 2.5419,
"step": 11500
},
{
"epoch": 0.23,
"eval_loss": 2.541316032409668,
"eval_runtime": 42.2013,
"eval_samples_per_second": 2.464,
"eval_steps_per_second": 1.232,
"step": 11500
},
{
"epoch": 0.2305,
"grad_norm": 2.155444422697904,
"learning_rate": 8.550222222222223e-06,
"loss": 2.543,
"step": 11525
},
{
"epoch": 0.231,
"grad_norm": 2.5216609928964706,
"learning_rate": 8.544666666666668e-06,
"loss": 2.5339,
"step": 11550
},
{
"epoch": 0.2315,
"grad_norm": 3.2141643729123826,
"learning_rate": 8.539111111111112e-06,
"loss": 2.5311,
"step": 11575
},
{
"epoch": 0.232,
"grad_norm": 2.779033714093245,
"learning_rate": 8.533555555555557e-06,
"loss": 2.5367,
"step": 11600
},
{
"epoch": 0.232,
"eval_loss": 2.539663553237915,
"eval_runtime": 42.1104,
"eval_samples_per_second": 2.47,
"eval_steps_per_second": 1.235,
"step": 11600
},
{
"epoch": 0.2325,
"grad_norm": 2.0599049344871134,
"learning_rate": 8.528e-06,
"loss": 2.5406,
"step": 11625
},
{
"epoch": 0.233,
"grad_norm": 2.1617162796171536,
"learning_rate": 8.522444444444446e-06,
"loss": 2.5244,
"step": 11650
},
{
"epoch": 0.2335,
"grad_norm": 2.4286224889340926,
"learning_rate": 8.51688888888889e-06,
"loss": 2.5364,
"step": 11675
},
{
"epoch": 0.234,
"grad_norm": 2.0435359432545424,
"learning_rate": 8.511333333333334e-06,
"loss": 2.5332,
"step": 11700
},
{
"epoch": 0.234,
"eval_loss": 2.539963960647583,
"eval_runtime": 42.1502,
"eval_samples_per_second": 2.467,
"eval_steps_per_second": 1.234,
"step": 11700
},
{
"epoch": 0.2345,
"grad_norm": 2.6031764141012195,
"learning_rate": 8.505777777777778e-06,
"loss": 2.5292,
"step": 11725
},
{
"epoch": 0.235,
"grad_norm": 2.2484621657042427,
"learning_rate": 8.500222222222223e-06,
"loss": 2.523,
"step": 11750
},
{
"epoch": 0.2355,
"grad_norm": 2.854177673999505,
"learning_rate": 8.494666666666668e-06,
"loss": 2.5218,
"step": 11775
},
{
"epoch": 0.236,
"grad_norm": 2.0770100967771055,
"learning_rate": 8.489111111111112e-06,
"loss": 2.534,
"step": 11800
},
{
"epoch": 0.236,
"eval_loss": 2.538536548614502,
"eval_runtime": 42.3875,
"eval_samples_per_second": 2.454,
"eval_steps_per_second": 1.227,
"step": 11800
},
{
"epoch": 0.2365,
"grad_norm": 2.391823444522325,
"learning_rate": 8.483555555555556e-06,
"loss": 2.5211,
"step": 11825
},
{
"epoch": 0.237,
"grad_norm": 2.333238897849914,
"learning_rate": 8.478e-06,
"loss": 2.5238,
"step": 11850
},
{
"epoch": 0.2375,
"grad_norm": 2.1636671466235256,
"learning_rate": 8.472444444444446e-06,
"loss": 2.5378,
"step": 11875
},
{
"epoch": 0.238,
"grad_norm": 2.5877564973697607,
"learning_rate": 8.46688888888889e-06,
"loss": 2.5415,
"step": 11900
},
{
"epoch": 0.238,
"eval_loss": 2.538837194442749,
"eval_runtime": 42.2059,
"eval_samples_per_second": 2.464,
"eval_steps_per_second": 1.232,
"step": 11900
},
{
"epoch": 0.2385,
"grad_norm": 2.1416643296031785,
"learning_rate": 8.461333333333333e-06,
"loss": 2.525,
"step": 11925
},
{
"epoch": 0.239,
"grad_norm": 2.213813959028046,
"learning_rate": 8.455777777777778e-06,
"loss": 2.5416,
"step": 11950
},
{
"epoch": 0.2395,
"grad_norm": 2.759854381361929,
"learning_rate": 8.450222222222224e-06,
"loss": 2.5355,
"step": 11975
},
{
"epoch": 0.24,
"grad_norm": 2.050520488248713,
"learning_rate": 8.444666666666667e-06,
"loss": 2.5263,
"step": 12000
},
{
"epoch": 0.24,
"eval_loss": 2.538311243057251,
"eval_runtime": 42.2256,
"eval_samples_per_second": 2.463,
"eval_steps_per_second": 1.231,
"step": 12000
},
{
"epoch": 0.2405,
"grad_norm": 1.7936589101138234,
"learning_rate": 8.43911111111111e-06,
"loss": 2.5284,
"step": 12025
},
{
"epoch": 0.241,
"grad_norm": 1.9363979159698028,
"learning_rate": 8.433555555555556e-06,
"loss": 2.5217,
"step": 12050
},
{
"epoch": 0.2415,
"grad_norm": 2.0808153808443324,
"learning_rate": 8.428000000000001e-06,
"loss": 2.5151,
"step": 12075
},
{
"epoch": 0.242,
"grad_norm": 2.6428939921225303,
"learning_rate": 8.422444444444445e-06,
"loss": 2.529,
"step": 12100
},
{
"epoch": 0.242,
"eval_loss": 2.537259578704834,
"eval_runtime": 42.2398,
"eval_samples_per_second": 2.462,
"eval_steps_per_second": 1.231,
"step": 12100
},
{
"epoch": 0.2425,
"grad_norm": 2.317352818958468,
"learning_rate": 8.41688888888889e-06,
"loss": 2.5272,
"step": 12125
},
{
"epoch": 0.243,
"grad_norm": 2.3625174954143717,
"learning_rate": 8.411333333333334e-06,
"loss": 2.5341,
"step": 12150
},
{
"epoch": 0.2435,
"grad_norm": 2.385583283955561,
"learning_rate": 8.405777777777779e-06,
"loss": 2.5252,
"step": 12175
},
{
"epoch": 0.244,
"grad_norm": 2.216512817161135,
"learning_rate": 8.400222222222222e-06,
"loss": 2.53,
"step": 12200
},
{
"epoch": 0.244,
"eval_loss": 2.537409782409668,
"eval_runtime": 43.0155,
"eval_samples_per_second": 2.418,
"eval_steps_per_second": 1.209,
"step": 12200
},
{
"epoch": 0.2445,
"grad_norm": 2.851691032693815,
"learning_rate": 8.394666666666668e-06,
"loss": 2.5409,
"step": 12225
},
{
"epoch": 0.245,
"grad_norm": 2.3667554446376085,
"learning_rate": 8.389111111111113e-06,
"loss": 2.5268,
"step": 12250
},
{
"epoch": 0.2455,
"grad_norm": 2.1930831286302896,
"learning_rate": 8.383555555555557e-06,
"loss": 2.5308,
"step": 12275
},
{
"epoch": 0.246,
"grad_norm": 2.4644858889937824,
"learning_rate": 8.378e-06,
"loss": 2.5279,
"step": 12300
},
{
"epoch": 0.246,
"eval_loss": 2.537409782409668,
"eval_runtime": 42.0899,
"eval_samples_per_second": 2.471,
"eval_steps_per_second": 1.235,
"step": 12300
},
{
"epoch": 0.2465,
"grad_norm": 2.6684093247331555,
"learning_rate": 8.372444444444445e-06,
"loss": 2.5263,
"step": 12325
},
{
"epoch": 0.247,
"grad_norm": 1.9552978346665313,
"learning_rate": 8.36688888888889e-06,
"loss": 2.5228,
"step": 12350
},
{
"epoch": 0.2475,
"grad_norm": 2.8273236400537294,
"learning_rate": 8.361333333333334e-06,
"loss": 2.5305,
"step": 12375
},
{
"epoch": 0.248,
"grad_norm": 2.052357534814466,
"learning_rate": 8.355777777777778e-06,
"loss": 2.5193,
"step": 12400
},
{
"epoch": 0.248,
"eval_loss": 2.535456657409668,
"eval_runtime": 42.1653,
"eval_samples_per_second": 2.466,
"eval_steps_per_second": 1.233,
"step": 12400
},
{
"epoch": 0.2485,
"grad_norm": 2.410408330063049,
"learning_rate": 8.350222222222223e-06,
"loss": 2.5269,
"step": 12425
},
{
"epoch": 0.249,
"grad_norm": 1.784156472071755,
"learning_rate": 8.344666666666668e-06,
"loss": 2.5258,
"step": 12450
},
{
"epoch": 0.2495,
"grad_norm": 2.6880708020978368,
"learning_rate": 8.339111111111112e-06,
"loss": 2.5298,
"step": 12475
},
{
"epoch": 0.25,
"grad_norm": 2.2030168810534922,
"learning_rate": 8.333555555555555e-06,
"loss": 2.5201,
"step": 12500
},
{
"epoch": 0.25,
"eval_loss": 2.535832405090332,
"eval_runtime": 42.0482,
"eval_samples_per_second": 2.473,
"eval_steps_per_second": 1.237,
"step": 12500
},
{
"epoch": 0.2505,
"grad_norm": 2.019140906115923,
"learning_rate": 8.328e-06,
"loss": 2.5241,
"step": 12525
},
{
"epoch": 0.251,
"grad_norm": 1.9012303831260067,
"learning_rate": 8.322444444444446e-06,
"loss": 2.5354,
"step": 12550
},
{
"epoch": 0.2515,
"grad_norm": 1.7607101331370496,
"learning_rate": 8.31688888888889e-06,
"loss": 2.5254,
"step": 12575
},
{
"epoch": 0.252,
"grad_norm": 2.5505055208286933,
"learning_rate": 8.311333333333333e-06,
"loss": 2.5294,
"step": 12600
},
{
"epoch": 0.252,
"eval_loss": 2.535231351852417,
"eval_runtime": 41.9731,
"eval_samples_per_second": 2.478,
"eval_steps_per_second": 1.239,
"step": 12600
},
{
"epoch": 0.2525,
"grad_norm": 1.6218420390627293,
"learning_rate": 8.305777777777778e-06,
"loss": 2.5262,
"step": 12625
},
{
"epoch": 0.253,
"grad_norm": 2.0991897222525115,
"learning_rate": 8.300222222222223e-06,
"loss": 2.5206,
"step": 12650
},
{
"epoch": 0.2535,
"grad_norm": 2.478785246720621,
"learning_rate": 8.294666666666667e-06,
"loss": 2.5275,
"step": 12675
},
{
"epoch": 0.254,
"grad_norm": 2.141371973093057,
"learning_rate": 8.289111111111112e-06,
"loss": 2.5323,
"step": 12700
},
{
"epoch": 0.254,
"eval_loss": 2.5341796875,
"eval_runtime": 42.2622,
"eval_samples_per_second": 2.461,
"eval_steps_per_second": 1.23,
"step": 12700
},
{
"epoch": 0.2545,
"grad_norm": 2.269733740633448,
"learning_rate": 8.283555555555556e-06,
"loss": 2.5367,
"step": 12725
},
{
"epoch": 0.255,
"grad_norm": 1.893617133257015,
"learning_rate": 8.278000000000001e-06,
"loss": 2.5257,
"step": 12750
},
{
"epoch": 0.2555,
"grad_norm": 1.751381032940087,
"learning_rate": 8.272444444444445e-06,
"loss": 2.5276,
"step": 12775
},
{
"epoch": 0.256,
"grad_norm": 2.6264391487699545,
"learning_rate": 8.26688888888889e-06,
"loss": 2.5281,
"step": 12800
},
{
"epoch": 0.256,
"eval_loss": 2.534780740737915,
"eval_runtime": 42.0037,
"eval_samples_per_second": 2.476,
"eval_steps_per_second": 1.238,
"step": 12800
},
{
"epoch": 0.2565,
"grad_norm": 2.9544216590918766,
"learning_rate": 8.261333333333335e-06,
"loss": 2.5159,
"step": 12825
},
{
"epoch": 0.257,
"grad_norm": 1.703574826031134,
"learning_rate": 8.255777777777779e-06,
"loss": 2.5314,
"step": 12850
},
{
"epoch": 0.2575,
"grad_norm": 2.23456733038464,
"learning_rate": 8.250222222222222e-06,
"loss": 2.5301,
"step": 12875
},
{
"epoch": 0.258,
"grad_norm": 2.0236952351089132,
"learning_rate": 8.244666666666667e-06,
"loss": 2.5274,
"step": 12900
},
{
"epoch": 0.258,
"eval_loss": 2.532827615737915,
"eval_runtime": 42.2742,
"eval_samples_per_second": 2.46,
"eval_steps_per_second": 1.23,
"step": 12900
},
{
"epoch": 0.2585,
"grad_norm": 1.9175658573019432,
"learning_rate": 8.239111111111113e-06,
"loss": 2.5293,
"step": 12925
},
{
"epoch": 0.259,
"grad_norm": 2.227745372848629,
"learning_rate": 8.233555555555556e-06,
"loss": 2.5346,
"step": 12950
},
{
"epoch": 0.2595,
"grad_norm": 2.0320264112024375,
"learning_rate": 8.228e-06,
"loss": 2.5133,
"step": 12975
},
{
"epoch": 0.26,
"grad_norm": 2.3254627331546636,
"learning_rate": 8.222444444444445e-06,
"loss": 2.5257,
"step": 13000
},
{
"epoch": 0.26,
"eval_loss": 2.532376766204834,
"eval_runtime": 42.0555,
"eval_samples_per_second": 2.473,
"eval_steps_per_second": 1.236,
"step": 13000
},
{
"epoch": 0.2605,
"grad_norm": 1.9492007310542454,
"learning_rate": 8.21688888888889e-06,
"loss": 2.5246,
"step": 13025
},
{
"epoch": 0.261,
"grad_norm": 2.3076187120913105,
"learning_rate": 8.211333333333334e-06,
"loss": 2.5287,
"step": 13050
},
{
"epoch": 0.2615,
"grad_norm": 1.8076829520267466,
"learning_rate": 8.205777777777777e-06,
"loss": 2.5161,
"step": 13075
},
{
"epoch": 0.262,
"grad_norm": 2.272592798843781,
"learning_rate": 8.200222222222223e-06,
"loss": 2.5272,
"step": 13100
},
{
"epoch": 0.262,
"eval_loss": 2.532526969909668,
"eval_runtime": 42.2261,
"eval_samples_per_second": 2.463,
"eval_steps_per_second": 1.231,
"step": 13100
},
{
"epoch": 0.2625,
"grad_norm": 2.249412616356025,
"learning_rate": 8.194666666666668e-06,
"loss": 2.5268,
"step": 13125
},
{
"epoch": 0.263,
"grad_norm": 2.054677758627288,
"learning_rate": 8.189111111111111e-06,
"loss": 2.5232,
"step": 13150
},
{
"epoch": 0.2635,
"grad_norm": 2.3525307448487545,
"learning_rate": 8.183555555555555e-06,
"loss": 2.5186,
"step": 13175
},
{
"epoch": 0.264,
"grad_norm": 2.0816994586757294,
"learning_rate": 8.178e-06,
"loss": 2.5249,
"step": 13200
},
{
"epoch": 0.264,
"eval_loss": 2.531926155090332,
"eval_runtime": 42.1448,
"eval_samples_per_second": 2.468,
"eval_steps_per_second": 1.234,
"step": 13200
},
{
"epoch": 0.2645,
"grad_norm": 1.8771257650501383,
"learning_rate": 8.172444444444446e-06,
"loss": 2.5185,
"step": 13225
},
{
"epoch": 0.265,
"grad_norm": 3.015360724178772,
"learning_rate": 8.166888888888889e-06,
"loss": 2.5236,
"step": 13250
},
{
"epoch": 0.2655,
"grad_norm": 2.809533897542425,
"learning_rate": 8.161333333333334e-06,
"loss": 2.5207,
"step": 13275
},
{
"epoch": 0.266,
"grad_norm": 2.0578110863684307,
"learning_rate": 8.155777777777778e-06,
"loss": 2.5133,
"step": 13300
},
{
"epoch": 0.266,
"eval_loss": 2.531550407409668,
"eval_runtime": 42.4297,
"eval_samples_per_second": 2.451,
"eval_steps_per_second": 1.226,
"step": 13300
},
{
"epoch": 0.2665,
"grad_norm": 2.0416241200876533,
"learning_rate": 8.150222222222223e-06,
"loss": 2.5174,
"step": 13325
},
{
"epoch": 0.267,
"grad_norm": 2.208631750373162,
"learning_rate": 8.144666666666667e-06,
"loss": 2.5218,
"step": 13350
},
{
"epoch": 0.2675,
"grad_norm": 2.28882872372725,
"learning_rate": 8.139111111111112e-06,
"loss": 2.5249,
"step": 13375
},
{
"epoch": 0.268,
"grad_norm": 2.0364230232228233,
"learning_rate": 8.133555555555557e-06,
"loss": 2.5235,
"step": 13400
},
{
"epoch": 0.268,
"eval_loss": 2.530423641204834,
"eval_runtime": 42.078,
"eval_samples_per_second": 2.472,
"eval_steps_per_second": 1.236,
"step": 13400
},
{
"epoch": 0.2685,
"grad_norm": 2.6248839843381266,
"learning_rate": 8.128e-06,
"loss": 2.5163,
"step": 13425
},
{
"epoch": 0.269,
"grad_norm": 1.7874079388176,
"learning_rate": 8.122444444444444e-06,
"loss": 2.5321,
"step": 13450
},
{
"epoch": 0.2695,
"grad_norm": 2.464203238277693,
"learning_rate": 8.11688888888889e-06,
"loss": 2.5194,
"step": 13475
},
{
"epoch": 0.27,
"grad_norm": 2.120977146291401,
"learning_rate": 8.111333333333335e-06,
"loss": 2.5206,
"step": 13500
},
{
"epoch": 0.27,
"eval_loss": 2.530874490737915,
"eval_runtime": 42.0694,
"eval_samples_per_second": 2.472,
"eval_steps_per_second": 1.236,
"step": 13500
},
{
"epoch": 0.2705,
"grad_norm": 2.1455954231359193,
"learning_rate": 8.105777777777778e-06,
"loss": 2.5175,
"step": 13525
},
{
"epoch": 0.271,
"grad_norm": 2.139148836070426,
"learning_rate": 8.100222222222222e-06,
"loss": 2.5274,
"step": 13550
},
{
"epoch": 0.2715,
"grad_norm": 2.79028405854528,
"learning_rate": 8.094666666666667e-06,
"loss": 2.5231,
"step": 13575
},
{
"epoch": 0.272,
"grad_norm": 2.1128072564088227,
"learning_rate": 8.089111111111112e-06,
"loss": 2.5278,
"step": 13600
},
{
"epoch": 0.272,
"eval_loss": 2.530573844909668,
"eval_runtime": 42.0442,
"eval_samples_per_second": 2.474,
"eval_steps_per_second": 1.237,
"step": 13600
},
{
"epoch": 0.2725,
"grad_norm": 2.3629134301910044,
"learning_rate": 8.083555555555556e-06,
"loss": 2.5214,
"step": 13625
},
{
"epoch": 0.273,
"grad_norm": 1.8871218393029536,
"learning_rate": 8.078e-06,
"loss": 2.5068,
"step": 13650
},
{
"epoch": 0.2735,
"grad_norm": 1.9782214927434483,
"learning_rate": 8.072444444444445e-06,
"loss": 2.5243,
"step": 13675
},
{
"epoch": 0.274,
"grad_norm": 2.6328648420185927,
"learning_rate": 8.06688888888889e-06,
"loss": 2.5228,
"step": 13700
},
{
"epoch": 0.274,
"eval_loss": 2.530573844909668,
"eval_runtime": 42.1498,
"eval_samples_per_second": 2.467,
"eval_steps_per_second": 1.234,
"step": 13700
},
{
"epoch": 0.2745,
"grad_norm": 1.7018004256507808,
"learning_rate": 8.061333333333334e-06,
"loss": 2.5178,
"step": 13725
},
{
"epoch": 0.275,
"grad_norm": 1.9267466735782932,
"learning_rate": 8.055777777777777e-06,
"loss": 2.5079,
"step": 13750
},
{
"epoch": 0.2755,
"grad_norm": 2.2443788597292778,
"learning_rate": 8.050222222222222e-06,
"loss": 2.5213,
"step": 13775
},
{
"epoch": 0.276,
"grad_norm": 2.873399914185342,
"learning_rate": 8.044666666666668e-06,
"loss": 2.5212,
"step": 13800
},
{
"epoch": 0.276,
"eval_loss": 2.529296875,
"eval_runtime": 42.1115,
"eval_samples_per_second": 2.47,
"eval_steps_per_second": 1.235,
"step": 13800
},
{
"epoch": 0.2765,
"grad_norm": 2.794454715063574,
"learning_rate": 8.039111111111111e-06,
"loss": 2.523,
"step": 13825
},
{
"epoch": 0.277,
"grad_norm": 2.2530766498680244,
"learning_rate": 8.033555555555556e-06,
"loss": 2.5195,
"step": 13850
},
{
"epoch": 0.2775,
"grad_norm": 2.357725394975894,
"learning_rate": 8.028e-06,
"loss": 2.509,
"step": 13875
},
{
"epoch": 0.278,
"grad_norm": 1.896374018977524,
"learning_rate": 8.022444444444445e-06,
"loss": 2.5246,
"step": 13900
},
{
"epoch": 0.278,
"eval_loss": 2.527794361114502,
"eval_runtime": 42.2006,
"eval_samples_per_second": 2.464,
"eval_steps_per_second": 1.232,
"step": 13900
},
{
"epoch": 0.2785,
"grad_norm": 1.9716064978040044,
"learning_rate": 8.016888888888889e-06,
"loss": 2.5325,
"step": 13925
},
{
"epoch": 0.279,
"grad_norm": 2.113859123339066,
"learning_rate": 8.011333333333334e-06,
"loss": 2.5452,
"step": 13950
},
{
"epoch": 0.2795,
"grad_norm": 1.802172403266005,
"learning_rate": 8.00577777777778e-06,
"loss": 2.5261,
"step": 13975
},
{
"epoch": 0.28,
"grad_norm": 1.6938868697398992,
"learning_rate": 8.000222222222223e-06,
"loss": 2.5194,
"step": 14000
},
{
"epoch": 0.28,
"eval_loss": 2.527418851852417,
"eval_runtime": 42.0495,
"eval_samples_per_second": 2.473,
"eval_steps_per_second": 1.237,
"step": 14000
}
],
"logging_steps": 25,
"max_steps": 50000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.1419070965912437e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}