{
"best_global_step": 5900,
"best_metric": 2.4210917949676514,
"best_model_checkpoint": "../runs/karpathy/fineweb-edu-100b-shuffle/meta-llama/Llama-3.2-1B/linear_adamw_wd1e-03_7x1024_mem32_bs64_hf_armt_dmem64/run_30/checkpoint-5000",
"epoch": 0.12,
"eval_steps": 100,
"global_step": 6000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005,
"grad_norm": 2.624103276270124,
"learning_rate": 4.8e-08,
"loss": 4.0893,
"step": 25
},
{
"epoch": 0.001,
"grad_norm": 1.3629568986234561,
"learning_rate": 9.8e-08,
"loss": 3.9543,
"step": 50
},
{
"epoch": 0.0015,
"grad_norm": 0.8050128701430977,
"learning_rate": 1.4800000000000003e-07,
"loss": 3.6763,
"step": 75
},
{
"epoch": 0.002,
"grad_norm": 0.3690286383727022,
"learning_rate": 1.9800000000000003e-07,
"loss": 3.327,
"step": 100
},
{
"epoch": 0.002,
"eval_loss": 3.100055694580078,
"eval_runtime": 32.7706,
"eval_samples_per_second": 3.57,
"eval_steps_per_second": 1.8,
"step": 100
},
{
"epoch": 0.0025,
"grad_norm": 0.24011694167100578,
"learning_rate": 2.48e-07,
"loss": 3.1322,
"step": 125
},
{
"epoch": 0.003,
"grad_norm": 0.149511940963387,
"learning_rate": 2.9800000000000005e-07,
"loss": 2.9672,
"step": 150
},
{
"epoch": 0.0035,
"grad_norm": 0.10071711520195754,
"learning_rate": 3.48e-07,
"loss": 2.8684,
"step": 175
},
{
"epoch": 0.004,
"grad_norm": 0.09695377414070089,
"learning_rate": 3.9800000000000004e-07,
"loss": 2.8244,
"step": 200
},
{
"epoch": 0.004,
"eval_loss": 2.7518060207366943,
"eval_runtime": 32.9203,
"eval_samples_per_second": 3.554,
"eval_steps_per_second": 1.792,
"step": 200
},
{
"epoch": 0.0045,
"grad_norm": 0.06541174981920718,
"learning_rate": 4.4800000000000004e-07,
"loss": 2.7736,
"step": 225
},
{
"epoch": 0.005,
"grad_norm": 0.061297886999798934,
"learning_rate": 4.98e-07,
"loss": 2.7392,
"step": 250
},
{
"epoch": 0.0055,
"grad_norm": 0.07881073149840945,
"learning_rate": 5.480000000000001e-07,
"loss": 2.7194,
"step": 275
},
{
"epoch": 0.006,
"grad_norm": 0.05125386617161651,
"learning_rate": 5.98e-07,
"loss": 2.6982,
"step": 300
},
{
"epoch": 0.006,
"eval_loss": 2.6622018814086914,
"eval_runtime": 32.9076,
"eval_samples_per_second": 3.555,
"eval_steps_per_second": 1.793,
"step": 300
},
{
"epoch": 0.0065,
"grad_norm": 0.04659366450077996,
"learning_rate": 6.48e-07,
"loss": 2.6725,
"step": 325
},
{
"epoch": 0.007,
"grad_norm": 0.04588097652548341,
"learning_rate": 6.98e-07,
"loss": 2.6592,
"step": 350
},
{
"epoch": 0.0075,
"grad_norm": 0.058421958212028904,
"learning_rate": 7.480000000000001e-07,
"loss": 2.6481,
"step": 375
},
{
"epoch": 0.008,
"grad_norm": 0.04289575736155661,
"learning_rate": 7.98e-07,
"loss": 2.6257,
"step": 400
},
{
"epoch": 0.008,
"eval_loss": 2.6052613258361816,
"eval_runtime": 32.8227,
"eval_samples_per_second": 3.565,
"eval_steps_per_second": 1.798,
"step": 400
},
{
"epoch": 0.0085,
"grad_norm": 0.041602666338794385,
"learning_rate": 8.480000000000001e-07,
"loss": 2.6089,
"step": 425
},
{
"epoch": 0.009,
"grad_norm": 0.040090024026539266,
"learning_rate": 8.980000000000001e-07,
"loss": 2.5985,
"step": 450
},
{
"epoch": 0.0095,
"grad_norm": 0.05346463020318845,
"learning_rate": 9.480000000000001e-07,
"loss": 2.5858,
"step": 475
},
{
"epoch": 0.01,
"grad_norm": 0.03240197247016216,
"learning_rate": 9.98e-07,
"loss": 2.5773,
"step": 500
},
{
"epoch": 0.01,
"eval_loss": 2.5677218437194824,
"eval_runtime": 32.9146,
"eval_samples_per_second": 3.555,
"eval_steps_per_second": 1.793,
"step": 500
},
{
"epoch": 0.0105,
"grad_norm": 0.030627609315729644,
"learning_rate": 1.0480000000000002e-06,
"loss": 2.5695,
"step": 525
},
{
"epoch": 0.011,
"grad_norm": 0.03146801435404312,
"learning_rate": 1.0980000000000001e-06,
"loss": 2.558,
"step": 550
},
{
"epoch": 0.0115,
"grad_norm": 0.028453864143727626,
"learning_rate": 1.148e-06,
"loss": 2.5645,
"step": 575
},
{
"epoch": 0.012,
"grad_norm": 0.03026805511159676,
"learning_rate": 1.1980000000000002e-06,
"loss": 2.5645,
"step": 600
},
{
"epoch": 0.012,
"eval_loss": 2.546586275100708,
"eval_runtime": 32.8424,
"eval_samples_per_second": 3.562,
"eval_steps_per_second": 1.796,
"step": 600
},
{
"epoch": 0.0125,
"grad_norm": 0.032033771539522,
"learning_rate": 1.248e-06,
"loss": 2.5424,
"step": 625
},
{
"epoch": 0.013,
"grad_norm": 0.0281966122475446,
"learning_rate": 1.2980000000000001e-06,
"loss": 2.5409,
"step": 650
},
{
"epoch": 0.0135,
"grad_norm": 0.02887428243284281,
"learning_rate": 1.348e-06,
"loss": 2.543,
"step": 675
},
{
"epoch": 0.014,
"grad_norm": 0.027672621753278132,
"learning_rate": 1.3980000000000002e-06,
"loss": 2.5385,
"step": 700
},
{
"epoch": 0.014,
"eval_loss": 2.530237913131714,
"eval_runtime": 32.7994,
"eval_samples_per_second": 3.567,
"eval_steps_per_second": 1.799,
"step": 700
},
{
"epoch": 0.0145,
"grad_norm": 0.030815191380069624,
"learning_rate": 1.4480000000000002e-06,
"loss": 2.5302,
"step": 725
},
{
"epoch": 0.015,
"grad_norm": 0.0336387385604783,
"learning_rate": 1.498e-06,
"loss": 2.531,
"step": 750
},
{
"epoch": 0.0155,
"grad_norm": 0.02858543320323233,
"learning_rate": 1.548e-06,
"loss": 2.5184,
"step": 775
},
{
"epoch": 0.016,
"grad_norm": 0.028120393653995705,
"learning_rate": 1.5980000000000002e-06,
"loss": 2.5101,
"step": 800
},
{
"epoch": 0.016,
"eval_loss": 2.5182888507843018,
"eval_runtime": 33.2135,
"eval_samples_per_second": 3.523,
"eval_steps_per_second": 1.776,
"step": 800
},
{
"epoch": 0.0165,
"grad_norm": 0.03014167593156162,
"learning_rate": 1.6480000000000001e-06,
"loss": 2.5232,
"step": 825
},
{
"epoch": 0.017,
"grad_norm": 0.028528349033195077,
"learning_rate": 1.6980000000000003e-06,
"loss": 2.5162,
"step": 850
},
{
"epoch": 0.0175,
"grad_norm": 0.031230193601244804,
"learning_rate": 1.7480000000000002e-06,
"loss": 2.4995,
"step": 875
},
{
"epoch": 0.018,
"grad_norm": 0.03555060954716827,
"learning_rate": 1.798e-06,
"loss": 2.5064,
"step": 900
},
{
"epoch": 0.018,
"eval_loss": 2.5070879459381104,
"eval_runtime": 33.3807,
"eval_samples_per_second": 3.505,
"eval_steps_per_second": 1.767,
"step": 900
},
{
"epoch": 0.0185,
"grad_norm": 0.03561871969060444,
"learning_rate": 1.8480000000000001e-06,
"loss": 2.5004,
"step": 925
},
{
"epoch": 0.019,
"grad_norm": 0.03094584673111385,
"learning_rate": 1.898e-06,
"loss": 2.4959,
"step": 950
},
{
"epoch": 0.0195,
"grad_norm": 0.035545021685136444,
"learning_rate": 1.9480000000000002e-06,
"loss": 2.4982,
"step": 975
},
{
"epoch": 0.02,
"grad_norm": 0.0370422613473599,
"learning_rate": 1.998e-06,
"loss": 2.4927,
"step": 1000
},
{
"epoch": 0.02,
"eval_loss": 2.4966063499450684,
"eval_runtime": 33.3038,
"eval_samples_per_second": 3.513,
"eval_steps_per_second": 1.772,
"step": 1000
},
{
"epoch": 0.0205,
"grad_norm": 0.04199895036530391,
"learning_rate": 2.048e-06,
"loss": 2.4847,
"step": 1025
},
{
"epoch": 0.021,
"grad_norm": 0.0384304039845165,
"learning_rate": 2.098e-06,
"loss": 2.4916,
"step": 1050
},
{
"epoch": 0.0215,
"grad_norm": 0.03291684378446945,
"learning_rate": 2.148e-06,
"loss": 2.4891,
"step": 1075
},
{
"epoch": 0.022,
"grad_norm": 0.03376054787167217,
"learning_rate": 2.198e-06,
"loss": 2.4896,
"step": 1100
},
{
"epoch": 0.022,
"eval_loss": 2.488358974456787,
"eval_runtime": 33.2437,
"eval_samples_per_second": 3.519,
"eval_steps_per_second": 1.775,
"step": 1100
},
{
"epoch": 0.0225,
"grad_norm": 0.04001450258151374,
"learning_rate": 2.2480000000000003e-06,
"loss": 2.4855,
"step": 1125
},
{
"epoch": 0.023,
"grad_norm": 0.036190398257348835,
"learning_rate": 2.2980000000000003e-06,
"loss": 2.4834,
"step": 1150
},
{
"epoch": 0.0235,
"grad_norm": 0.03806535632489679,
"learning_rate": 2.3480000000000002e-06,
"loss": 2.48,
"step": 1175
},
{
"epoch": 0.024,
"grad_norm": 0.039255476981030824,
"learning_rate": 2.398e-06,
"loss": 2.4853,
"step": 1200
},
{
"epoch": 0.024,
"eval_loss": 2.481823205947876,
"eval_runtime": 33.3121,
"eval_samples_per_second": 3.512,
"eval_steps_per_second": 1.771,
"step": 1200
},
{
"epoch": 0.0245,
"grad_norm": 0.037361446323077335,
"learning_rate": 2.448e-06,
"loss": 2.4776,
"step": 1225
},
{
"epoch": 0.025,
"grad_norm": 0.03410866644624654,
"learning_rate": 2.498e-06,
"loss": 2.4672,
"step": 1250
},
{
"epoch": 0.0255,
"grad_norm": 0.03501276078614437,
"learning_rate": 2.5480000000000004e-06,
"loss": 2.4633,
"step": 1275
},
{
"epoch": 0.026,
"grad_norm": 0.035383520468643466,
"learning_rate": 2.598e-06,
"loss": 2.4647,
"step": 1300
},
{
"epoch": 0.026,
"eval_loss": 2.476562976837158,
"eval_runtime": 33.4013,
"eval_samples_per_second": 3.503,
"eval_steps_per_second": 1.766,
"step": 1300
},
{
"epoch": 0.0265,
"grad_norm": 0.03467179176189109,
"learning_rate": 2.648e-06,
"loss": 2.476,
"step": 1325
},
{
"epoch": 0.027,
"grad_norm": 0.03925271631713796,
"learning_rate": 2.6980000000000003e-06,
"loss": 2.4675,
"step": 1350
},
{
"epoch": 0.0275,
"grad_norm": 0.03419652940921129,
"learning_rate": 2.748e-06,
"loss": 2.4708,
"step": 1375
},
{
"epoch": 0.028,
"grad_norm": 0.03764216373530557,
"learning_rate": 2.798e-06,
"loss": 2.4709,
"step": 1400
},
{
"epoch": 0.028,
"eval_loss": 2.471618175506592,
"eval_runtime": 33.0936,
"eval_samples_per_second": 3.535,
"eval_steps_per_second": 1.783,
"step": 1400
},
{
"epoch": 0.0285,
"grad_norm": 0.03802047455035515,
"learning_rate": 2.848e-06,
"loss": 2.4608,
"step": 1425
},
{
"epoch": 0.029,
"grad_norm": 0.03323072329115027,
"learning_rate": 2.8980000000000005e-06,
"loss": 2.4695,
"step": 1450
},
{
"epoch": 0.0295,
"grad_norm": 0.03693054288365918,
"learning_rate": 2.9480000000000004e-06,
"loss": 2.4635,
"step": 1475
},
{
"epoch": 0.03,
"grad_norm": 0.06509796100945928,
"learning_rate": 2.9980000000000003e-06,
"loss": 2.467,
"step": 1500
},
{
"epoch": 0.03,
"eval_loss": 2.467376232147217,
"eval_runtime": 33.1827,
"eval_samples_per_second": 3.526,
"eval_steps_per_second": 1.778,
"step": 1500
},
{
"epoch": 0.0305,
"grad_norm": 0.030120041993102375,
"learning_rate": 3.0480000000000003e-06,
"loss": 2.463,
"step": 1525
},
{
"epoch": 0.031,
"grad_norm": 0.039881744916892024,
"learning_rate": 3.0980000000000007e-06,
"loss": 2.4533,
"step": 1550
},
{
"epoch": 0.0315,
"grad_norm": 0.029950518864288997,
"learning_rate": 3.1480000000000006e-06,
"loss": 2.4585,
"step": 1575
},
{
"epoch": 0.032,
"grad_norm": 0.07753499473514511,
"learning_rate": 3.198e-06,
"loss": 2.4502,
"step": 1600
},
{
"epoch": 0.032,
"eval_loss": 2.4625656604766846,
"eval_runtime": 33.2433,
"eval_samples_per_second": 3.52,
"eval_steps_per_second": 1.775,
"step": 1600
},
{
"epoch": 0.0325,
"grad_norm": 0.048526204949902306,
"learning_rate": 3.248e-06,
"loss": 2.45,
"step": 1625
},
{
"epoch": 0.033,
"grad_norm": 0.0378506235382453,
"learning_rate": 3.298e-06,
"loss": 2.4488,
"step": 1650
},
{
"epoch": 0.0335,
"grad_norm": 0.03228564469275673,
"learning_rate": 3.348e-06,
"loss": 2.4568,
"step": 1675
},
{
"epoch": 0.034,
"grad_norm": 0.03417826301349761,
"learning_rate": 3.3980000000000003e-06,
"loss": 2.4514,
"step": 1700
},
{
"epoch": 0.034,
"eval_loss": 2.459094762802124,
"eval_runtime": 33.1684,
"eval_samples_per_second": 3.527,
"eval_steps_per_second": 1.779,
"step": 1700
},
{
"epoch": 0.0345,
"grad_norm": 0.03119990821359214,
"learning_rate": 3.4480000000000003e-06,
"loss": 2.4447,
"step": 1725
},
{
"epoch": 0.035,
"grad_norm": 0.032737257559355144,
"learning_rate": 3.4980000000000002e-06,
"loss": 2.4531,
"step": 1750
},
{
"epoch": 0.0355,
"grad_norm": 0.03341768726028273,
"learning_rate": 3.548e-06,
"loss": 2.4476,
"step": 1775
},
{
"epoch": 0.036,
"grad_norm": 0.03225090122428514,
"learning_rate": 3.5980000000000005e-06,
"loss": 2.4403,
"step": 1800
},
{
"epoch": 0.036,
"eval_loss": 2.455217123031616,
"eval_runtime": 32.9783,
"eval_samples_per_second": 3.548,
"eval_steps_per_second": 1.789,
"step": 1800
},
{
"epoch": 0.0365,
"grad_norm": 0.030979620558740147,
"learning_rate": 3.6480000000000005e-06,
"loss": 2.4379,
"step": 1825
},
{
"epoch": 0.037,
"grad_norm": 0.04044689712503281,
"learning_rate": 3.6980000000000004e-06,
"loss": 2.455,
"step": 1850
},
{
"epoch": 0.0375,
"grad_norm": 0.034557037951751954,
"learning_rate": 3.7480000000000004e-06,
"loss": 2.4517,
"step": 1875
},
{
"epoch": 0.038,
"grad_norm": 0.02821125825480679,
"learning_rate": 3.7980000000000007e-06,
"loss": 2.4429,
"step": 1900
},
{
"epoch": 0.038,
"eval_loss": 2.4529292583465576,
"eval_runtime": 33.4058,
"eval_samples_per_second": 3.502,
"eval_steps_per_second": 1.766,
"step": 1900
},
{
"epoch": 0.0385,
"grad_norm": 0.029890640830031213,
"learning_rate": 3.848e-06,
"loss": 2.4437,
"step": 1925
},
{
"epoch": 0.039,
"grad_norm": 0.03265759623511245,
"learning_rate": 3.898e-06,
"loss": 2.438,
"step": 1950
},
{
"epoch": 0.0395,
"grad_norm": 0.10385356338699042,
"learning_rate": 3.948e-06,
"loss": 2.4442,
"step": 1975
},
{
"epoch": 0.04,
"grad_norm": 0.03233294644174686,
"learning_rate": 3.9980000000000005e-06,
"loss": 2.4451,
"step": 2000
},
{
"epoch": 0.04,
"eval_loss": 2.450512647628784,
"eval_runtime": 33.274,
"eval_samples_per_second": 3.516,
"eval_steps_per_second": 1.773,
"step": 2000
},
{
"epoch": 0.0405,
"grad_norm": 0.034945541932647324,
"learning_rate": 4.048e-06,
"loss": 2.4357,
"step": 2025
},
{
"epoch": 0.041,
"grad_norm": 0.029322959861707003,
"learning_rate": 4.098e-06,
"loss": 2.4373,
"step": 2050
},
{
"epoch": 0.0415,
"grad_norm": 0.027365033479394632,
"learning_rate": 4.148000000000001e-06,
"loss": 2.442,
"step": 2075
},
{
"epoch": 0.042,
"grad_norm": 0.042214130565513416,
"learning_rate": 4.198e-06,
"loss": 2.4362,
"step": 2100
},
{
"epoch": 0.042,
"eval_loss": 2.448322296142578,
"eval_runtime": 33.466,
"eval_samples_per_second": 3.496,
"eval_steps_per_second": 1.763,
"step": 2100
},
{
"epoch": 0.0425,
"grad_norm": 0.028874346576168566,
"learning_rate": 4.248000000000001e-06,
"loss": 2.4428,
"step": 2125
},
{
"epoch": 0.043,
"grad_norm": 0.029771861998040296,
"learning_rate": 4.298e-06,
"loss": 2.4298,
"step": 2150
},
{
"epoch": 0.0435,
"grad_norm": 0.029668415484575914,
"learning_rate": 4.3480000000000006e-06,
"loss": 2.4352,
"step": 2175
},
{
"epoch": 0.044,
"grad_norm": 0.02564927582570633,
"learning_rate": 4.398000000000001e-06,
"loss": 2.4349,
"step": 2200
},
{
"epoch": 0.044,
"eval_loss": 2.4465889930725098,
"eval_runtime": 33.3555,
"eval_samples_per_second": 3.508,
"eval_steps_per_second": 1.769,
"step": 2200
},
{
"epoch": 0.0445,
"grad_norm": 0.024797235968250814,
"learning_rate": 4.4480000000000004e-06,
"loss": 2.4409,
"step": 2225
},
{
"epoch": 0.045,
"grad_norm": 0.02813189377877088,
"learning_rate": 4.498e-06,
"loss": 2.4367,
"step": 2250
},
{
"epoch": 0.0455,
"grad_norm": 0.02750903211389184,
"learning_rate": 4.548e-06,
"loss": 2.4326,
"step": 2275
},
{
"epoch": 0.046,
"grad_norm": 0.027737559952553607,
"learning_rate": 4.598e-06,
"loss": 2.4375,
"step": 2300
},
{
"epoch": 0.046,
"eval_loss": 2.4448626041412354,
"eval_runtime": 33.2658,
"eval_samples_per_second": 3.517,
"eval_steps_per_second": 1.774,
"step": 2300
},
{
"epoch": 0.0465,
"grad_norm": 0.02630663299301831,
"learning_rate": 4.648e-06,
"loss": 2.4392,
"step": 2325
},
{
"epoch": 0.047,
"grad_norm": 0.027929449055597393,
"learning_rate": 4.698000000000001e-06,
"loss": 2.4256,
"step": 2350
},
{
"epoch": 0.0475,
"grad_norm": 0.0283193243102273,
"learning_rate": 4.748e-06,
"loss": 2.429,
"step": 2375
},
{
"epoch": 0.048,
"grad_norm": 0.029295313451333963,
"learning_rate": 4.7980000000000005e-06,
"loss": 2.4393,
"step": 2400
},
{
"epoch": 0.048,
"eval_loss": 2.4432175159454346,
"eval_runtime": 33.3067,
"eval_samples_per_second": 3.513,
"eval_steps_per_second": 1.771,
"step": 2400
},
{
"epoch": 0.0485,
"grad_norm": 0.025382897552394503,
"learning_rate": 4.848000000000001e-06,
"loss": 2.4322,
"step": 2425
},
{
"epoch": 0.049,
"grad_norm": 0.02450548193909556,
"learning_rate": 4.898e-06,
"loss": 2.4314,
"step": 2450
},
{
"epoch": 0.0495,
"grad_norm": 0.033065483070063684,
"learning_rate": 4.948000000000001e-06,
"loss": 2.4338,
"step": 2475
},
{
"epoch": 0.05,
"grad_norm": 0.027543894857825314,
"learning_rate": 4.998e-06,
"loss": 2.4333,
"step": 2500
},
{
"epoch": 0.05,
"eval_loss": 2.441807985305786,
"eval_runtime": 33.0379,
"eval_samples_per_second": 3.541,
"eval_steps_per_second": 1.786,
"step": 2500
},
{
"epoch": 0.0505,
"grad_norm": 0.027354239436717945,
"learning_rate": 5.048000000000001e-06,
"loss": 2.439,
"step": 2525
},
{
"epoch": 0.051,
"grad_norm": 0.022458884368301627,
"learning_rate": 5.098000000000001e-06,
"loss": 2.427,
"step": 2550
},
{
"epoch": 0.0515,
"grad_norm": 0.033350881745701555,
"learning_rate": 5.1480000000000005e-06,
"loss": 2.4275,
"step": 2575
},
{
"epoch": 0.052,
"grad_norm": 0.025032545530163004,
"learning_rate": 5.198000000000001e-06,
"loss": 2.4275,
"step": 2600
},
{
"epoch": 0.052,
"eval_loss": 2.440882444381714,
"eval_runtime": 33.1835,
"eval_samples_per_second": 3.526,
"eval_steps_per_second": 1.778,
"step": 2600
},
{
"epoch": 0.0525,
"grad_norm": 0.026294170044068685,
"learning_rate": 5.248000000000001e-06,
"loss": 2.4312,
"step": 2625
},
{
"epoch": 0.053,
"grad_norm": 0.03301155351988982,
"learning_rate": 5.298000000000001e-06,
"loss": 2.4203,
"step": 2650
},
{
"epoch": 0.0535,
"grad_norm": 0.02389586194961339,
"learning_rate": 5.348000000000001e-06,
"loss": 2.4332,
"step": 2675
},
{
"epoch": 0.054,
"grad_norm": 0.056862279743176244,
"learning_rate": 5.398e-06,
"loss": 2.4313,
"step": 2700
},
{
"epoch": 0.054,
"eval_loss": 2.4402644634246826,
"eval_runtime": 33.2071,
"eval_samples_per_second": 3.523,
"eval_steps_per_second": 1.777,
"step": 2700
},
{
"epoch": 0.0545,
"grad_norm": 0.025636671246445756,
"learning_rate": 5.448e-06,
"loss": 2.4311,
"step": 2725
},
{
"epoch": 0.055,
"grad_norm": 0.022083605910153424,
"learning_rate": 5.498e-06,
"loss": 2.4357,
"step": 2750
},
{
"epoch": 0.0555,
"grad_norm": 0.024223735712298522,
"learning_rate": 5.548e-06,
"loss": 2.4294,
"step": 2775
},
{
"epoch": 0.056,
"grad_norm": 0.029847698463432104,
"learning_rate": 5.5980000000000004e-06,
"loss": 2.4344,
"step": 2800
},
{
"epoch": 0.056,
"eval_loss": 2.4389007091522217,
"eval_runtime": 33.2705,
"eval_samples_per_second": 3.517,
"eval_steps_per_second": 1.773,
"step": 2800
},
{
"epoch": 0.0565,
"grad_norm": 0.032144633236930065,
"learning_rate": 5.648e-06,
"loss": 2.4282,
"step": 2825
},
{
"epoch": 0.057,
"grad_norm": 0.02355863809037046,
"learning_rate": 5.698e-06,
"loss": 2.4322,
"step": 2850
},
{
"epoch": 0.0575,
"grad_norm": 0.023728744427970416,
"learning_rate": 5.748e-06,
"loss": 2.4286,
"step": 2875
},
{
"epoch": 0.058,
"grad_norm": 0.025539915034515293,
"learning_rate": 5.798e-06,
"loss": 2.4287,
"step": 2900
},
{
"epoch": 0.058,
"eval_loss": 2.4376914501190186,
"eval_runtime": 33.3179,
"eval_samples_per_second": 3.512,
"eval_steps_per_second": 1.771,
"step": 2900
},
{
"epoch": 0.0585,
"grad_norm": 0.023457547558388747,
"learning_rate": 5.848000000000001e-06,
"loss": 2.4289,
"step": 2925
},
{
"epoch": 0.059,
"grad_norm": 0.025297710201421797,
"learning_rate": 5.898e-06,
"loss": 2.4274,
"step": 2950
},
{
"epoch": 0.0595,
"grad_norm": 0.024155176530161276,
"learning_rate": 5.9480000000000005e-06,
"loss": 2.4169,
"step": 2975
},
{
"epoch": 0.06,
"grad_norm": 0.023954841726960448,
"learning_rate": 5.998000000000001e-06,
"loss": 2.4244,
"step": 3000
},
{
"epoch": 0.06,
"eval_loss": 2.436969041824341,
"eval_runtime": 33.2713,
"eval_samples_per_second": 3.517,
"eval_steps_per_second": 1.773,
"step": 3000
},
{
"epoch": 0.0605,
"grad_norm": 0.025507916252978883,
"learning_rate": 6.048e-06,
"loss": 2.4192,
"step": 3025
},
{
"epoch": 0.061,
"grad_norm": 0.02126046028834697,
"learning_rate": 6.098000000000001e-06,
"loss": 2.4233,
"step": 3050
},
{
"epoch": 0.0615,
"grad_norm": 0.026235681014214807,
"learning_rate": 6.148e-06,
"loss": 2.4215,
"step": 3075
},
{
"epoch": 0.062,
"grad_norm": 0.026243691288249413,
"learning_rate": 6.198000000000001e-06,
"loss": 2.4134,
"step": 3100
},
{
"epoch": 0.062,
"eval_loss": 2.435988664627075,
"eval_runtime": 33.0276,
"eval_samples_per_second": 3.542,
"eval_steps_per_second": 1.786,
"step": 3100
},
{
"epoch": 0.0625,
"grad_norm": 0.02496599291141367,
"learning_rate": 6.248000000000001e-06,
"loss": 2.4241,
"step": 3125
},
{
"epoch": 0.063,
"grad_norm": 0.0236951365360608,
"learning_rate": 6.2980000000000005e-06,
"loss": 2.4252,
"step": 3150
},
{
"epoch": 0.0635,
"grad_norm": 0.022752035914773892,
"learning_rate": 6.348000000000001e-06,
"loss": 2.4244,
"step": 3175
},
{
"epoch": 0.064,
"grad_norm": 0.021656953860252137,
"learning_rate": 6.398000000000001e-06,
"loss": 2.4227,
"step": 3200
},
{
"epoch": 0.064,
"eval_loss": 2.43520450592041,
"eval_runtime": 33.136,
"eval_samples_per_second": 3.531,
"eval_steps_per_second": 1.781,
"step": 3200
},
{
"epoch": 0.0645,
"grad_norm": 0.021188520683488872,
"learning_rate": 6.448000000000001e-06,
"loss": 2.4248,
"step": 3225
},
{
"epoch": 0.065,
"grad_norm": 0.02274972468402099,
"learning_rate": 6.498000000000001e-06,
"loss": 2.4215,
"step": 3250
},
{
"epoch": 0.0655,
"grad_norm": 0.024046700552500286,
"learning_rate": 6.548000000000001e-06,
"loss": 2.4169,
"step": 3275
},
{
"epoch": 0.066,
"grad_norm": 0.022071385618052216,
"learning_rate": 6.598000000000001e-06,
"loss": 2.4199,
"step": 3300
},
{
"epoch": 0.066,
"eval_loss": 2.4344840049743652,
"eval_runtime": 33.1729,
"eval_samples_per_second": 3.527,
"eval_steps_per_second": 1.779,
"step": 3300
},
{
"epoch": 0.0665,
"grad_norm": 0.02931021842271797,
"learning_rate": 6.648e-06,
"loss": 2.4253,
"step": 3325
},
{
"epoch": 0.067,
"grad_norm": 0.021754527434557868,
"learning_rate": 6.698e-06,
"loss": 2.4281,
"step": 3350
},
{
"epoch": 0.0675,
"grad_norm": 0.022651522972508432,
"learning_rate": 6.7480000000000004e-06,
"loss": 2.4208,
"step": 3375
},
{
"epoch": 0.068,
"grad_norm": 0.022676405563792287,
"learning_rate": 6.798e-06,
"loss": 2.4222,
"step": 3400
},
{
"epoch": 0.068,
"eval_loss": 2.43371844291687,
"eval_runtime": 33.1293,
"eval_samples_per_second": 3.532,
"eval_steps_per_second": 1.781,
"step": 3400
},
{
"epoch": 0.0685,
"grad_norm": 0.021100680573628707,
"learning_rate": 6.848e-06,
"loss": 2.4243,
"step": 3425
},
{
"epoch": 0.069,
"grad_norm": 0.02101417038408147,
"learning_rate": 6.898e-06,
"loss": 2.4242,
"step": 3450
},
{
"epoch": 0.0695,
"grad_norm": 0.022432735157488455,
"learning_rate": 6.948e-06,
"loss": 2.4224,
"step": 3475
},
{
"epoch": 0.07,
"grad_norm": 0.02164716008760555,
"learning_rate": 6.998000000000001e-06,
"loss": 2.4202,
"step": 3500
},
{
"epoch": 0.07,
"eval_loss": 2.433281898498535,
"eval_runtime": 33.0783,
"eval_samples_per_second": 3.537,
"eval_steps_per_second": 1.784,
"step": 3500
},
{
"epoch": 0.0705,
"grad_norm": 0.022412840176404082,
"learning_rate": 7.048e-06,
"loss": 2.4184,
"step": 3525
},
{
"epoch": 0.071,
"grad_norm": 0.025300113537910857,
"learning_rate": 7.0980000000000005e-06,
"loss": 2.421,
"step": 3550
},
{
"epoch": 0.0715,
"grad_norm": 0.022085711512698558,
"learning_rate": 7.148000000000001e-06,
"loss": 2.415,
"step": 3575
},
{
"epoch": 0.072,
"grad_norm": 0.021041258769866313,
"learning_rate": 7.198e-06,
"loss": 2.4157,
"step": 3600
},
{
"epoch": 0.072,
"eval_loss": 2.4324123859405518,
"eval_runtime": 34.1633,
"eval_samples_per_second": 3.425,
"eval_steps_per_second": 1.727,
"step": 3600
},
{
"epoch": 0.0725,
"grad_norm": 0.021694681795354324,
"learning_rate": 7.248000000000001e-06,
"loss": 2.4152,
"step": 3625
},
{
"epoch": 0.073,
"grad_norm": 0.03056130171104773,
"learning_rate": 7.298e-06,
"loss": 2.4151,
"step": 3650
},
{
"epoch": 0.0735,
"grad_norm": 0.02112814663770162,
"learning_rate": 7.348000000000001e-06,
"loss": 2.4163,
"step": 3675
},
{
"epoch": 0.074,
"grad_norm": 0.024883267721069864,
"learning_rate": 7.398000000000001e-06,
"loss": 2.4258,
"step": 3700
},
{
"epoch": 0.074,
"eval_loss": 2.4319984912872314,
"eval_runtime": 33.2699,
"eval_samples_per_second": 3.517,
"eval_steps_per_second": 1.773,
"step": 3700
},
{
"epoch": 0.0745,
"grad_norm": 0.02062910451612879,
"learning_rate": 7.4480000000000005e-06,
"loss": 2.4263,
"step": 3725
},
{
"epoch": 0.075,
"grad_norm": 0.021068085012478772,
"learning_rate": 7.498000000000001e-06,
"loss": 2.4216,
"step": 3750
},
{
"epoch": 0.0755,
"grad_norm": 0.020665118516629687,
"learning_rate": 7.548000000000001e-06,
"loss": 2.4285,
"step": 3775
},
{
"epoch": 0.076,
"grad_norm": 0.02425992757924128,
"learning_rate": 7.598000000000001e-06,
"loss": 2.4174,
"step": 3800
},
{
"epoch": 0.076,
"eval_loss": 2.4310436248779297,
"eval_runtime": 35.0728,
"eval_samples_per_second": 3.336,
"eval_steps_per_second": 1.682,
"step": 3800
},
{
"epoch": 0.0765,
"grad_norm": 0.021337004595007786,
"learning_rate": 7.648e-06,
"loss": 2.4303,
"step": 3825
},
{
"epoch": 0.077,
"grad_norm": 0.020168500131750186,
"learning_rate": 7.698000000000002e-06,
"loss": 2.4298,
"step": 3850
},
{
"epoch": 0.0775,
"grad_norm": 0.020089032493824672,
"learning_rate": 7.748000000000001e-06,
"loss": 2.4151,
"step": 3875
},
{
"epoch": 0.078,
"grad_norm": 0.02462630071931115,
"learning_rate": 7.798e-06,
"loss": 2.4235,
"step": 3900
},
{
"epoch": 0.078,
"eval_loss": 2.431330442428589,
"eval_runtime": 33.093,
"eval_samples_per_second": 3.535,
"eval_steps_per_second": 1.783,
"step": 3900
},
{
"epoch": 0.0785,
"grad_norm": 0.0226705620922379,
"learning_rate": 7.848000000000002e-06,
"loss": 2.4185,
"step": 3925
},
{
"epoch": 0.079,
"grad_norm": 0.022075041269811142,
"learning_rate": 7.898e-06,
"loss": 2.4344,
"step": 3950
},
{
"epoch": 0.0795,
"grad_norm": 0.03932607113814955,
"learning_rate": 7.948e-06,
"loss": 2.4228,
"step": 3975
},
{
"epoch": 0.08,
"grad_norm": 0.020604342831921824,
"learning_rate": 7.998e-06,
"loss": 2.4289,
"step": 4000
},
{
"epoch": 0.08,
"eval_loss": 2.430954933166504,
"eval_runtime": 33.1216,
"eval_samples_per_second": 3.532,
"eval_steps_per_second": 1.781,
"step": 4000
},
{
"epoch": 0.0805,
"grad_norm": 0.021865944897834468,
"learning_rate": 8.048e-06,
"loss": 2.4283,
"step": 4025
},
{
"epoch": 0.081,
"grad_norm": 0.020393010409248808,
"learning_rate": 8.098000000000001e-06,
"loss": 2.4142,
"step": 4050
},
{
"epoch": 0.0815,
"grad_norm": 0.02279155824698799,
"learning_rate": 8.148e-06,
"loss": 2.4208,
"step": 4075
},
{
"epoch": 0.082,
"grad_norm": 0.021110562493101104,
"learning_rate": 8.198e-06,
"loss": 2.4093,
"step": 4100
},
{
"epoch": 0.082,
"eval_loss": 2.4299628734588623,
"eval_runtime": 33.2215,
"eval_samples_per_second": 3.522,
"eval_steps_per_second": 1.776,
"step": 4100
},
{
"epoch": 0.0825,
"grad_norm": 0.019752507861163327,
"learning_rate": 8.248e-06,
"loss": 2.4073,
"step": 4125
},
{
"epoch": 0.083,
"grad_norm": 0.019897433088879975,
"learning_rate": 8.298000000000001e-06,
"loss": 2.4129,
"step": 4150
},
{
"epoch": 0.0835,
"grad_norm": 0.02275241957806373,
"learning_rate": 8.348e-06,
"loss": 2.4243,
"step": 4175
},
{
"epoch": 0.084,
"grad_norm": 0.02009113389579191,
"learning_rate": 8.398e-06,
"loss": 2.4138,
"step": 4200
},
{
"epoch": 0.084,
"eval_loss": 2.4301230907440186,
"eval_runtime": 33.0641,
"eval_samples_per_second": 3.539,
"eval_steps_per_second": 1.784,
"step": 4200
},
{
"epoch": 0.0845,
"grad_norm": 0.021259070586902896,
"learning_rate": 8.448000000000001e-06,
"loss": 2.4212,
"step": 4225
},
{
"epoch": 0.085,
"grad_norm": 0.021461643865178466,
"learning_rate": 8.498e-06,
"loss": 2.4242,
"step": 4250
},
{
"epoch": 0.0855,
"grad_norm": 0.02129278617109427,
"learning_rate": 8.548e-06,
"loss": 2.4153,
"step": 4275
},
{
"epoch": 0.086,
"grad_norm": 0.019884381961586706,
"learning_rate": 8.598000000000001e-06,
"loss": 2.4107,
"step": 4300
},
{
"epoch": 0.086,
"eval_loss": 2.429638385772705,
"eval_runtime": 33.1452,
"eval_samples_per_second": 3.53,
"eval_steps_per_second": 1.78,
"step": 4300
},
{
"epoch": 0.0865,
"grad_norm": 0.02127578557225418,
"learning_rate": 8.648000000000001e-06,
"loss": 2.4202,
"step": 4325
},
{
"epoch": 0.087,
"grad_norm": 0.021749788475476855,
"learning_rate": 8.698e-06,
"loss": 2.4274,
"step": 4350
},
{
"epoch": 0.0875,
"grad_norm": 0.021521494708913836,
"learning_rate": 8.748000000000002e-06,
"loss": 2.4189,
"step": 4375
},
{
"epoch": 0.088,
"grad_norm": 0.021276426458537334,
"learning_rate": 8.798000000000001e-06,
"loss": 2.4152,
"step": 4400
},
{
"epoch": 0.088,
"eval_loss": 2.4292917251586914,
"eval_runtime": 33.1057,
"eval_samples_per_second": 3.534,
"eval_steps_per_second": 1.782,
"step": 4400
},
{
"epoch": 0.0885,
"grad_norm": 0.019843371943772815,
"learning_rate": 8.848e-06,
"loss": 2.421,
"step": 4425
},
{
"epoch": 0.089,
"grad_norm": 0.02031045171970109,
"learning_rate": 8.898000000000002e-06,
"loss": 2.4201,
"step": 4450
},
{
"epoch": 0.0895,
"grad_norm": 0.018642717079241176,
"learning_rate": 8.948000000000001e-06,
"loss": 2.4171,
"step": 4475
},
{
"epoch": 0.09,
"grad_norm": 0.021016901396559935,
"learning_rate": 8.998000000000001e-06,
"loss": 2.4257,
"step": 4500
},
{
"epoch": 0.09,
"eval_loss": 2.4288113117218018,
"eval_runtime": 33.1217,
"eval_samples_per_second": 3.532,
"eval_steps_per_second": 1.781,
"step": 4500
},
{
"epoch": 0.0905,
"grad_norm": 0.021595090834222327,
"learning_rate": 9.048e-06,
"loss": 2.4209,
"step": 4525
},
{
"epoch": 0.091,
"grad_norm": 0.020500341653961213,
"learning_rate": 9.098000000000002e-06,
"loss": 2.4093,
"step": 4550
},
{
"epoch": 0.0915,
"grad_norm": 0.021134665935359346,
"learning_rate": 9.148e-06,
"loss": 2.4238,
"step": 4575
},
{
"epoch": 0.092,
"grad_norm": 0.018064298488706988,
"learning_rate": 9.198e-06,
"loss": 2.4163,
"step": 4600
},
{
"epoch": 0.092,
"eval_loss": 2.428257465362549,
"eval_runtime": 33.451,
"eval_samples_per_second": 3.498,
"eval_steps_per_second": 1.764,
"step": 4600
},
{
"epoch": 0.0925,
"grad_norm": 0.019704962175624032,
"learning_rate": 9.248e-06,
"loss": 2.4082,
"step": 4625
},
{
"epoch": 0.093,
"grad_norm": 0.019712333508134283,
"learning_rate": 9.298e-06,
"loss": 2.4089,
"step": 4650
},
{
"epoch": 0.0935,
"grad_norm": 0.021269463834833153,
"learning_rate": 9.348000000000001e-06,
"loss": 2.408,
"step": 4675
},
{
"epoch": 0.094,
"grad_norm": 0.021278662940784676,
"learning_rate": 9.398e-06,
"loss": 2.4189,
"step": 4700
},
{
"epoch": 0.094,
"eval_loss": 2.4279165267944336,
"eval_runtime": 33.1606,
"eval_samples_per_second": 3.528,
"eval_steps_per_second": 1.779,
"step": 4700
},
{
"epoch": 0.0945,
"grad_norm": 0.018504564797986272,
"learning_rate": 9.448e-06,
"loss": 2.4254,
"step": 4725
},
{
"epoch": 0.095,
"grad_norm": 0.01917099113509997,
"learning_rate": 9.498000000000001e-06,
"loss": 2.411,
"step": 4750
},
{
"epoch": 0.0955,
"grad_norm": 0.019097394482211122,
"learning_rate": 9.548e-06,
"loss": 2.4209,
"step": 4775
},
{
"epoch": 0.096,
"grad_norm": 0.020220692469392707,
"learning_rate": 9.598e-06,
"loss": 2.4066,
"step": 4800
},
{
"epoch": 0.096,
"eval_loss": 2.4273650646209717,
"eval_runtime": 33.1079,
"eval_samples_per_second": 3.534,
"eval_steps_per_second": 1.782,
"step": 4800
},
{
"epoch": 0.0965,
"grad_norm": 0.019607148490934756,
"learning_rate": 9.648000000000001e-06,
"loss": 2.4132,
"step": 4825
},
{
"epoch": 0.097,
"grad_norm": 0.019388710503851023,
"learning_rate": 9.698000000000001e-06,
"loss": 2.4096,
"step": 4850
},
{
"epoch": 0.0975,
"grad_norm": 0.019593746411763164,
"learning_rate": 9.748e-06,
"loss": 2.4064,
"step": 4875
},
{
"epoch": 0.098,
"grad_norm": 0.018761734791343965,
"learning_rate": 9.798e-06,
"loss": 2.4033,
"step": 4900
},
{
"epoch": 0.098,
"eval_loss": 2.4270286560058594,
"eval_runtime": 33.0269,
"eval_samples_per_second": 3.543,
"eval_steps_per_second": 1.786,
"step": 4900
},
{
"epoch": 0.0985,
"grad_norm": 0.018964507342139367,
"learning_rate": 9.848000000000001e-06,
"loss": 2.4211,
"step": 4925
},
{
"epoch": 0.099,
"grad_norm": 0.01858861943184826,
"learning_rate": 9.898e-06,
"loss": 2.4032,
"step": 4950
},
{
"epoch": 0.0995,
"grad_norm": 0.01821023564956819,
"learning_rate": 9.948e-06,
"loss": 2.4031,
"step": 4975
},
{
"epoch": 0.1,
"grad_norm": 0.018839474555921314,
"learning_rate": 9.998000000000002e-06,
"loss": 2.4112,
"step": 5000
},
{
"epoch": 0.1,
"eval_loss": 2.426590919494629,
"eval_runtime": 33.0133,
"eval_samples_per_second": 3.544,
"eval_steps_per_second": 1.787,
"step": 5000
},
{
"epoch": 0.1005,
"grad_norm": 0.0187590945164155,
"learning_rate": 9.994666666666668e-06,
"loss": 2.4164,
"step": 5025
},
{
"epoch": 0.101,
"grad_norm": 0.018683158146542603,
"learning_rate": 9.989111111111111e-06,
"loss": 2.4082,
"step": 5050
},
{
"epoch": 0.1015,
"grad_norm": 0.017610949419625762,
"learning_rate": 9.983555555555556e-06,
"loss": 2.4124,
"step": 5075
},
{
"epoch": 0.102,
"grad_norm": 0.01862298073358942,
"learning_rate": 9.978000000000002e-06,
"loss": 2.409,
"step": 5100
},
{
"epoch": 0.102,
"eval_loss": 2.425841808319092,
"eval_runtime": 33.063,
"eval_samples_per_second": 3.539,
"eval_steps_per_second": 1.784,
"step": 5100
},
{
"epoch": 0.1025,
"grad_norm": 0.025407800531065724,
"learning_rate": 9.972444444444445e-06,
"loss": 2.4051,
"step": 5125
},
{
"epoch": 0.103,
"grad_norm": 0.01838713779514561,
"learning_rate": 9.966888888888889e-06,
"loss": 2.4105,
"step": 5150
},
{
"epoch": 0.1035,
"grad_norm": 0.018921321521659856,
"learning_rate": 9.961333333333334e-06,
"loss": 2.4191,
"step": 5175
},
{
"epoch": 0.104,
"grad_norm": 0.01824666535901335,
"learning_rate": 9.95577777777778e-06,
"loss": 2.4115,
"step": 5200
},
{
"epoch": 0.104,
"eval_loss": 2.4254310131073,
"eval_runtime": 33.141,
"eval_samples_per_second": 3.53,
"eval_steps_per_second": 1.78,
"step": 5200
},
{
"epoch": 0.1045,
"grad_norm": 0.018794067362196056,
"learning_rate": 9.950222222222223e-06,
"loss": 2.4062,
"step": 5225
},
{
"epoch": 0.105,
"grad_norm": 0.01825837669653065,
"learning_rate": 9.944666666666668e-06,
"loss": 2.4154,
"step": 5250
},
{
"epoch": 0.1055,
"grad_norm": 0.01843310767671649,
"learning_rate": 9.939111111111112e-06,
"loss": 2.4201,
"step": 5275
},
{
"epoch": 0.106,
"grad_norm": 0.018304681522005508,
"learning_rate": 9.933555555555557e-06,
"loss": 2.4089,
"step": 5300
},
{
"epoch": 0.106,
"eval_loss": 2.424731492996216,
"eval_runtime": 33.0325,
"eval_samples_per_second": 3.542,
"eval_steps_per_second": 1.786,
"step": 5300
},
{
"epoch": 0.1065,
"grad_norm": 0.01846362790517963,
"learning_rate": 9.928e-06,
"loss": 2.4118,
"step": 5325
},
{
"epoch": 0.107,
"grad_norm": 0.01872825463357926,
"learning_rate": 9.922444444444446e-06,
"loss": 2.4045,
"step": 5350
},
{
"epoch": 0.1075,
"grad_norm": 0.017781011104963246,
"learning_rate": 9.91688888888889e-06,
"loss": 2.4145,
"step": 5375
},
{
"epoch": 0.108,
"grad_norm": 0.018840752543683545,
"learning_rate": 9.911333333333335e-06,
"loss": 2.416,
"step": 5400
},
{
"epoch": 0.108,
"eval_loss": 2.423886775970459,
"eval_runtime": 33.1239,
"eval_samples_per_second": 3.532,
"eval_steps_per_second": 1.781,
"step": 5400
},
{
"epoch": 0.1085,
"grad_norm": 0.019278786947294697,
"learning_rate": 9.905777777777778e-06,
"loss": 2.4117,
"step": 5425
},
{
"epoch": 0.109,
"grad_norm": 0.018430470806705172,
"learning_rate": 9.900222222222223e-06,
"loss": 2.4114,
"step": 5450
},
{
"epoch": 0.1095,
"grad_norm": 0.018464088455141334,
"learning_rate": 9.894666666666669e-06,
"loss": 2.4185,
"step": 5475
},
{
"epoch": 0.11,
"grad_norm": 0.01866239126789079,
"learning_rate": 9.889111111111112e-06,
"loss": 2.4099,
"step": 5500
},
{
"epoch": 0.11,
"eval_loss": 2.423039197921753,
"eval_runtime": 35.4471,
"eval_samples_per_second": 3.301,
"eval_steps_per_second": 1.664,
"step": 5500
},
{
"epoch": 0.1105,
"grad_norm": 0.01827370320895024,
"learning_rate": 9.883555555555556e-06,
"loss": 2.4078,
"step": 5525
},
{
"epoch": 0.111,
"grad_norm": 0.01863057836209491,
"learning_rate": 9.878000000000001e-06,
"loss": 2.4044,
"step": 5550
},
{
"epoch": 0.1115,
"grad_norm": 0.018262835671926946,
"learning_rate": 9.872444444444446e-06,
"loss": 2.4123,
"step": 5575
},
{
"epoch": 0.112,
"grad_norm": 0.017655227692766756,
"learning_rate": 9.86688888888889e-06,
"loss": 2.4118,
"step": 5600
},
{
"epoch": 0.112,
"eval_loss": 2.4225943088531494,
"eval_runtime": 33.2709,
"eval_samples_per_second": 3.517,
"eval_steps_per_second": 1.773,
"step": 5600
},
{
"epoch": 0.1125,
"grad_norm": 0.01812962067528887,
"learning_rate": 9.861333333333333e-06,
"loss": 2.4017,
"step": 5625
},
{
"epoch": 0.113,
"grad_norm": 0.018265397582930686,
"learning_rate": 9.855777777777779e-06,
"loss": 2.4166,
"step": 5650
},
{
"epoch": 0.1135,
"grad_norm": 0.018207114017877214,
"learning_rate": 9.850222222222224e-06,
"loss": 2.413,
"step": 5675
},
{
"epoch": 0.114,
"grad_norm": 0.01952225079171619,
"learning_rate": 9.844666666666667e-06,
"loss": 2.4022,
"step": 5700
},
{
"epoch": 0.114,
"eval_loss": 2.42179274559021,
"eval_runtime": 33.0648,
"eval_samples_per_second": 3.539,
"eval_steps_per_second": 1.784,
"step": 5700
},
{
"epoch": 0.1145,
"grad_norm": 0.01780836124763766,
"learning_rate": 9.839111111111111e-06,
"loss": 2.4128,
"step": 5725
},
{
"epoch": 0.115,
"grad_norm": 0.018290904429709265,
"learning_rate": 9.833555555555556e-06,
"loss": 2.4119,
"step": 5750
},
{
"epoch": 0.1155,
"grad_norm": 0.019359740861514655,
"learning_rate": 9.828000000000001e-06,
"loss": 2.4019,
"step": 5775
},
{
"epoch": 0.116,
"grad_norm": 0.018278231474623628,
"learning_rate": 9.822444444444445e-06,
"loss": 2.4072,
"step": 5800
},
{
"epoch": 0.116,
"eval_loss": 2.4214675426483154,
"eval_runtime": 33.0642,
"eval_samples_per_second": 3.539,
"eval_steps_per_second": 1.784,
"step": 5800
},
{
"epoch": 0.1165,
"grad_norm": 0.017493007146383306,
"learning_rate": 9.81688888888889e-06,
"loss": 2.4134,
"step": 5825
},
{
"epoch": 0.117,
"grad_norm": 0.018399348008473985,
"learning_rate": 9.811333333333334e-06,
"loss": 2.4082,
"step": 5850
},
{
"epoch": 0.1175,
"grad_norm": 0.0186494867742927,
"learning_rate": 9.805777777777779e-06,
"loss": 2.4131,
"step": 5875
},
{
"epoch": 0.118,
"grad_norm": 0.017842605036949514,
"learning_rate": 9.800222222222223e-06,
"loss": 2.4134,
"step": 5900
},
{
"epoch": 0.118,
"eval_loss": 2.4210917949676514,
"eval_runtime": 33.1318,
"eval_samples_per_second": 3.531,
"eval_steps_per_second": 1.781,
"step": 5900
},
{
"epoch": 0.1185,
"grad_norm": 0.01835138877842204,
"learning_rate": 9.794666666666668e-06,
"loss": 2.4017,
"step": 5925
},
{
"epoch": 0.119,
"grad_norm": 0.018202303746487493,
"learning_rate": 9.789111111111111e-06,
"loss": 2.4103,
"step": 5950
},
{
"epoch": 0.1195,
"grad_norm": 0.0176777777086958,
"learning_rate": 9.783555555555557e-06,
"loss": 2.4023,
"step": 5975
},
{
"epoch": 0.12,
"grad_norm": 0.019351209333625233,
"learning_rate": 9.778e-06,
"loss": 2.4053,
"step": 6000
},
{
"epoch": 0.12,
"eval_loss": 2.421157121658325,
"eval_runtime": 33.0891,
"eval_samples_per_second": 3.536,
"eval_steps_per_second": 1.783,
"step": 6000
}
],
"logging_steps": 25,
"max_steps": 50000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.6711811550821745e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}