{
"best_global_step": 6156,
"best_metric": 1.9586824178695679,
"best_model_checkpoint": "saves/prompt-tuning/llama-3-8b-instruct/train_stsb_1752763924/checkpoint-6156",
"epoch": 10.0,
"eval_steps": 324,
"global_step": 6470,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0077279752704791345,
"grad_norm": 0.7005897760391235,
"learning_rate": 3.0911901081916536e-07,
"loss": 8.2653,
"num_input_tokens_seen": 3904,
"step": 5
},
{
"epoch": 0.015455950540958269,
"grad_norm": 0.6296595335006714,
"learning_rate": 6.955177743431221e-07,
"loss": 8.431,
"num_input_tokens_seen": 7296,
"step": 10
},
{
"epoch": 0.023183925811437404,
"grad_norm": 0.6692777276039124,
"learning_rate": 1.0819165378670788e-06,
"loss": 8.5446,
"num_input_tokens_seen": 11136,
"step": 15
},
{
"epoch": 0.030911901081916538,
"grad_norm": 0.6222425699234009,
"learning_rate": 1.4683153013910356e-06,
"loss": 8.4578,
"num_input_tokens_seen": 15040,
"step": 20
},
{
"epoch": 0.03863987635239567,
"grad_norm": 0.6727356910705566,
"learning_rate": 1.8547140649149923e-06,
"loss": 8.2819,
"num_input_tokens_seen": 18496,
"step": 25
},
{
"epoch": 0.04636785162287481,
"grad_norm": 0.5868141651153564,
"learning_rate": 2.241112828438949e-06,
"loss": 8.2987,
"num_input_tokens_seen": 21824,
"step": 30
},
{
"epoch": 0.05409582689335394,
"grad_norm": 1.1792558431625366,
"learning_rate": 2.627511591962906e-06,
"loss": 8.4134,
"num_input_tokens_seen": 25792,
"step": 35
},
{
"epoch": 0.061823802163833076,
"grad_norm": 0.7558006048202515,
"learning_rate": 3.0139103554868627e-06,
"loss": 8.4503,
"num_input_tokens_seen": 29632,
"step": 40
},
{
"epoch": 0.0695517774343122,
"grad_norm": 1.1656103134155273,
"learning_rate": 3.4003091190108196e-06,
"loss": 8.4214,
"num_input_tokens_seen": 33536,
"step": 45
},
{
"epoch": 0.07727975270479134,
"grad_norm": 0.6609169840812683,
"learning_rate": 3.7867078825347765e-06,
"loss": 8.5704,
"num_input_tokens_seen": 37056,
"step": 50
},
{
"epoch": 0.08500772797527048,
"grad_norm": 0.6289478540420532,
"learning_rate": 4.173106646058733e-06,
"loss": 8.4276,
"num_input_tokens_seen": 40384,
"step": 55
},
{
"epoch": 0.09273570324574962,
"grad_norm": 0.7771320939064026,
"learning_rate": 4.559505409582689e-06,
"loss": 8.0075,
"num_input_tokens_seen": 44032,
"step": 60
},
{
"epoch": 0.10046367851622875,
"grad_norm": 0.5663495659828186,
"learning_rate": 4.945904173106646e-06,
"loss": 8.4612,
"num_input_tokens_seen": 47552,
"step": 65
},
{
"epoch": 0.10819165378670788,
"grad_norm": 0.7151011228561401,
"learning_rate": 5.332302936630603e-06,
"loss": 8.4386,
"num_input_tokens_seen": 51520,
"step": 70
},
{
"epoch": 0.11591962905718702,
"grad_norm": 0.4895608425140381,
"learning_rate": 5.71870170015456e-06,
"loss": 8.5596,
"num_input_tokens_seen": 55360,
"step": 75
},
{
"epoch": 0.12364760432766615,
"grad_norm": 0.4930502772331238,
"learning_rate": 6.1051004636785165e-06,
"loss": 8.4408,
"num_input_tokens_seen": 58944,
"step": 80
},
{
"epoch": 0.13137557959814528,
"grad_norm": 0.619640052318573,
"learning_rate": 6.491499227202473e-06,
"loss": 8.4171,
"num_input_tokens_seen": 62336,
"step": 85
},
{
"epoch": 0.1391035548686244,
"grad_norm": 0.5968901515007019,
"learning_rate": 6.87789799072643e-06,
"loss": 8.4728,
"num_input_tokens_seen": 66176,
"step": 90
},
{
"epoch": 0.14683153013910355,
"grad_norm": 0.5749031901359558,
"learning_rate": 7.264296754250387e-06,
"loss": 8.4344,
"num_input_tokens_seen": 69888,
"step": 95
},
{
"epoch": 0.1545595054095827,
"grad_norm": 0.7264779806137085,
"learning_rate": 7.650695517774343e-06,
"loss": 8.2166,
"num_input_tokens_seen": 73600,
"step": 100
},
{
"epoch": 0.16228748068006182,
"grad_norm": 2.099539041519165,
"learning_rate": 8.0370942812983e-06,
"loss": 8.3009,
"num_input_tokens_seen": 77568,
"step": 105
},
{
"epoch": 0.17001545595054096,
"grad_norm": 0.7334800362586975,
"learning_rate": 8.423493044822257e-06,
"loss": 8.1876,
"num_input_tokens_seen": 80768,
"step": 110
},
{
"epoch": 0.1777434312210201,
"grad_norm": 0.6308206915855408,
"learning_rate": 8.809891808346214e-06,
"loss": 8.2654,
"num_input_tokens_seen": 84288,
"step": 115
},
{
"epoch": 0.18547140649149924,
"grad_norm": 0.5811961889266968,
"learning_rate": 9.19629057187017e-06,
"loss": 8.1847,
"num_input_tokens_seen": 87424,
"step": 120
},
{
"epoch": 0.19319938176197837,
"grad_norm": 0.5790714025497437,
"learning_rate": 9.582689335394126e-06,
"loss": 8.1837,
"num_input_tokens_seen": 91584,
"step": 125
},
{
"epoch": 0.2009273570324575,
"grad_norm": 0.6126049757003784,
"learning_rate": 9.969088098918083e-06,
"loss": 8.2103,
"num_input_tokens_seen": 95616,
"step": 130
},
{
"epoch": 0.20865533230293662,
"grad_norm": 0.5547794699668884,
"learning_rate": 1.035548686244204e-05,
"loss": 8.2972,
"num_input_tokens_seen": 99136,
"step": 135
},
{
"epoch": 0.21638330757341576,
"grad_norm": 0.6736480593681335,
"learning_rate": 1.0741885625965996e-05,
"loss": 8.2168,
"num_input_tokens_seen": 102336,
"step": 140
},
{
"epoch": 0.2241112828438949,
"grad_norm": 0.8961809873580933,
"learning_rate": 1.1128284389489953e-05,
"loss": 8.2389,
"num_input_tokens_seen": 106560,
"step": 145
},
{
"epoch": 0.23183925811437403,
"grad_norm": 0.5742205381393433,
"learning_rate": 1.151468315301391e-05,
"loss": 8.029,
"num_input_tokens_seen": 110528,
"step": 150
},
{
"epoch": 0.23956723338485317,
"grad_norm": 0.9602662324905396,
"learning_rate": 1.1901081916537867e-05,
"loss": 8.3119,
"num_input_tokens_seen": 114048,
"step": 155
},
{
"epoch": 0.2472952086553323,
"grad_norm": 0.6280139684677124,
"learning_rate": 1.2287480680061824e-05,
"loss": 8.254,
"num_input_tokens_seen": 118336,
"step": 160
},
{
"epoch": 0.2550231839258114,
"grad_norm": 0.7177520990371704,
"learning_rate": 1.2673879443585781e-05,
"loss": 8.5888,
"num_input_tokens_seen": 122304,
"step": 165
},
{
"epoch": 0.26275115919629055,
"grad_norm": 0.546525776386261,
"learning_rate": 1.3060278207109738e-05,
"loss": 8.2285,
"num_input_tokens_seen": 126080,
"step": 170
},
{
"epoch": 0.2704791344667697,
"grad_norm": 0.4941937029361725,
"learning_rate": 1.3446676970633695e-05,
"loss": 8.4309,
"num_input_tokens_seen": 129920,
"step": 175
},
{
"epoch": 0.2782071097372488,
"grad_norm": 0.5484105944633484,
"learning_rate": 1.3833075734157651e-05,
"loss": 8.194,
"num_input_tokens_seen": 133696,
"step": 180
},
{
"epoch": 0.28593508500772796,
"grad_norm": 0.7775905132293701,
"learning_rate": 1.4219474497681608e-05,
"loss": 8.4067,
"num_input_tokens_seen": 137664,
"step": 185
},
{
"epoch": 0.2936630602782071,
"grad_norm": 0.6176138520240784,
"learning_rate": 1.4605873261205565e-05,
"loss": 8.2687,
"num_input_tokens_seen": 141184,
"step": 190
},
{
"epoch": 0.30139103554868624,
"grad_norm": 0.6117033958435059,
"learning_rate": 1.4992272024729522e-05,
"loss": 8.2188,
"num_input_tokens_seen": 144768,
"step": 195
},
{
"epoch": 0.3091190108191654,
"grad_norm": 0.6097720265388489,
"learning_rate": 1.5378670788253476e-05,
"loss": 8.3016,
"num_input_tokens_seen": 148032,
"step": 200
},
{
"epoch": 0.3168469860896445,
"grad_norm": 0.5379249453544617,
"learning_rate": 1.5765069551777432e-05,
"loss": 8.0923,
"num_input_tokens_seen": 152000,
"step": 205
},
{
"epoch": 0.32457496136012365,
"grad_norm": 0.7796866297721863,
"learning_rate": 1.615146831530139e-05,
"loss": 8.2276,
"num_input_tokens_seen": 156096,
"step": 210
},
{
"epoch": 0.3323029366306028,
"grad_norm": 0.4686180651187897,
"learning_rate": 1.6537867078825346e-05,
"loss": 8.251,
"num_input_tokens_seen": 160128,
"step": 215
},
{
"epoch": 0.3400309119010819,
"grad_norm": 0.6390528678894043,
"learning_rate": 1.6924265842349303e-05,
"loss": 8.2704,
"num_input_tokens_seen": 163712,
"step": 220
},
{
"epoch": 0.34775888717156106,
"grad_norm": 0.7915067076683044,
"learning_rate": 1.731066460587326e-05,
"loss": 7.7831,
"num_input_tokens_seen": 167424,
"step": 225
},
{
"epoch": 0.3554868624420402,
"grad_norm": 0.7012531161308289,
"learning_rate": 1.7697063369397217e-05,
"loss": 8.1578,
"num_input_tokens_seen": 170816,
"step": 230
},
{
"epoch": 0.36321483771251933,
"grad_norm": 0.5768135190010071,
"learning_rate": 1.8083462132921174e-05,
"loss": 7.7408,
"num_input_tokens_seen": 174528,
"step": 235
},
{
"epoch": 0.37094281298299847,
"grad_norm": 0.5718309283256531,
"learning_rate": 1.846986089644513e-05,
"loss": 7.9793,
"num_input_tokens_seen": 178432,
"step": 240
},
{
"epoch": 0.3786707882534776,
"grad_norm": 0.5248563885688782,
"learning_rate": 1.8856259659969088e-05,
"loss": 7.256,
"num_input_tokens_seen": 181888,
"step": 245
},
{
"epoch": 0.38639876352395675,
"grad_norm": 0.565098226070404,
"learning_rate": 1.9242658423493044e-05,
"loss": 8.3203,
"num_input_tokens_seen": 185984,
"step": 250
},
{
"epoch": 0.3941267387944359,
"grad_norm": 0.46190837025642395,
"learning_rate": 1.9629057187017e-05,
"loss": 7.9005,
"num_input_tokens_seen": 189504,
"step": 255
},
{
"epoch": 0.401854714064915,
"grad_norm": 0.45425835251808167,
"learning_rate": 2.0015455950540958e-05,
"loss": 8.028,
"num_input_tokens_seen": 193408,
"step": 260
},
{
"epoch": 0.4095826893353941,
"grad_norm": 0.6547238230705261,
"learning_rate": 2.0401854714064915e-05,
"loss": 7.8846,
"num_input_tokens_seen": 196672,
"step": 265
},
{
"epoch": 0.41731066460587324,
"grad_norm": 0.4780774712562561,
"learning_rate": 2.0788253477588872e-05,
"loss": 8.1814,
"num_input_tokens_seen": 200960,
"step": 270
},
{
"epoch": 0.4250386398763524,
"grad_norm": 0.5580217242240906,
"learning_rate": 2.117465224111283e-05,
"loss": 7.7065,
"num_input_tokens_seen": 204352,
"step": 275
},
{
"epoch": 0.4327666151468315,
"grad_norm": 0.4923495650291443,
"learning_rate": 2.1561051004636786e-05,
"loss": 8.0245,
"num_input_tokens_seen": 208000,
"step": 280
},
{
"epoch": 0.44049459041731065,
"grad_norm": 0.6452226042747498,
"learning_rate": 2.1947449768160743e-05,
"loss": 7.8578,
"num_input_tokens_seen": 211584,
"step": 285
},
{
"epoch": 0.4482225656877898,
"grad_norm": 0.7374160289764404,
"learning_rate": 2.23338485316847e-05,
"loss": 7.7108,
"num_input_tokens_seen": 215680,
"step": 290
},
{
"epoch": 0.4559505409582689,
"grad_norm": 0.5521262288093567,
"learning_rate": 2.2720247295208656e-05,
"loss": 7.7653,
"num_input_tokens_seen": 219328,
"step": 295
},
{
"epoch": 0.46367851622874806,
"grad_norm": 0.5362392067909241,
"learning_rate": 2.3106646058732613e-05,
"loss": 7.6045,
"num_input_tokens_seen": 223232,
"step": 300
},
{
"epoch": 0.4714064914992272,
"grad_norm": 0.5953294634819031,
"learning_rate": 2.349304482225657e-05,
"loss": 7.6481,
"num_input_tokens_seen": 226816,
"step": 305
},
{
"epoch": 0.47913446676970634,
"grad_norm": 0.7014301419258118,
"learning_rate": 2.3879443585780527e-05,
"loss": 7.7099,
"num_input_tokens_seen": 230464,
"step": 310
},
{
"epoch": 0.4868624420401855,
"grad_norm": 0.510015070438385,
"learning_rate": 2.4265842349304484e-05,
"loss": 7.5586,
"num_input_tokens_seen": 234368,
"step": 315
},
{
"epoch": 0.4945904173106646,
"grad_norm": 0.5965865254402161,
"learning_rate": 2.465224111282844e-05,
"loss": 7.6904,
"num_input_tokens_seen": 238464,
"step": 320
},
{
"epoch": 0.500772797527048,
"eval_loss": 7.569452285766602,
"eval_runtime": 9.8312,
"eval_samples_per_second": 58.487,
"eval_steps_per_second": 7.324,
"num_input_tokens_seen": 241664,
"step": 324
},
{
"epoch": 0.5023183925811437,
"grad_norm": 0.49373796582221985,
"learning_rate": 2.5038639876352398e-05,
"loss": 7.495,
"num_input_tokens_seen": 242304,
"step": 325
},
{
"epoch": 0.5100463678516228,
"grad_norm": 0.4991404116153717,
"learning_rate": 2.5425038639876354e-05,
"loss": 7.3584,
"num_input_tokens_seen": 246080,
"step": 330
},
{
"epoch": 0.517774343122102,
"grad_norm": 0.5048945546150208,
"learning_rate": 2.581143740340031e-05,
"loss": 8.1246,
"num_input_tokens_seen": 250304,
"step": 335
},
{
"epoch": 0.5255023183925811,
"grad_norm": 0.5030365586280823,
"learning_rate": 2.6197836166924268e-05,
"loss": 7.56,
"num_input_tokens_seen": 253504,
"step": 340
},
{
"epoch": 0.5332302936630603,
"grad_norm": 0.47446444630622864,
"learning_rate": 2.6584234930448225e-05,
"loss": 7.0644,
"num_input_tokens_seen": 257088,
"step": 345
},
{
"epoch": 0.5409582689335394,
"grad_norm": 0.5291785001754761,
"learning_rate": 2.6970633693972182e-05,
"loss": 7.4655,
"num_input_tokens_seen": 261440,
"step": 350
},
{
"epoch": 0.5486862442040186,
"grad_norm": 0.46279534697532654,
"learning_rate": 2.735703245749614e-05,
"loss": 7.4421,
"num_input_tokens_seen": 265280,
"step": 355
},
{
"epoch": 0.5564142194744977,
"grad_norm": 0.6287479400634766,
"learning_rate": 2.7743431221020096e-05,
"loss": 7.6079,
"num_input_tokens_seen": 268928,
"step": 360
},
{
"epoch": 0.5641421947449768,
"grad_norm": 0.5496251583099365,
"learning_rate": 2.812982998454405e-05,
"loss": 7.3586,
"num_input_tokens_seen": 272704,
"step": 365
},
{
"epoch": 0.5718701700154559,
"grad_norm": 0.5213049650192261,
"learning_rate": 2.851622874806801e-05,
"loss": 7.2783,
"num_input_tokens_seen": 276992,
"step": 370
},
{
"epoch": 0.5795981452859351,
"grad_norm": 0.487604558467865,
"learning_rate": 2.8902627511591963e-05,
"loss": 7.483,
"num_input_tokens_seen": 280640,
"step": 375
},
{
"epoch": 0.5873261205564142,
"grad_norm": 0.5995193123817444,
"learning_rate": 2.9289026275115923e-05,
"loss": 7.3264,
"num_input_tokens_seen": 284608,
"step": 380
},
{
"epoch": 0.5950540958268934,
"grad_norm": 0.5379067063331604,
"learning_rate": 2.9675425038639877e-05,
"loss": 7.362,
"num_input_tokens_seen": 288640,
"step": 385
},
{
"epoch": 0.6027820710973725,
"grad_norm": 0.5467705130577087,
"learning_rate": 3.0061823802163837e-05,
"loss": 7.5526,
"num_input_tokens_seen": 292416,
"step": 390
},
{
"epoch": 0.6105100463678517,
"grad_norm": 0.4332791566848755,
"learning_rate": 3.044822256568779e-05,
"loss": 7.6352,
"num_input_tokens_seen": 296000,
"step": 395
},
{
"epoch": 0.6182380216383307,
"grad_norm": 0.5871270895004272,
"learning_rate": 3.083462132921175e-05,
"loss": 7.2932,
"num_input_tokens_seen": 299712,
"step": 400
},
{
"epoch": 0.6259659969088099,
"grad_norm": 0.4454021751880646,
"learning_rate": 3.12210200927357e-05,
"loss": 7.6205,
"num_input_tokens_seen": 303360,
"step": 405
},
{
"epoch": 0.633693972179289,
"grad_norm": 0.5419654250144958,
"learning_rate": 3.1607418856259664e-05,
"loss": 7.3044,
"num_input_tokens_seen": 306752,
"step": 410
},
{
"epoch": 0.6414219474497682,
"grad_norm": 0.4751136302947998,
"learning_rate": 3.1993817619783615e-05,
"loss": 7.3805,
"num_input_tokens_seen": 310592,
"step": 415
},
{
"epoch": 0.6491499227202473,
"grad_norm": 0.5459133386611938,
"learning_rate": 3.238021638330758e-05,
"loss": 7.5579,
"num_input_tokens_seen": 314688,
"step": 420
},
{
"epoch": 0.6568778979907264,
"grad_norm": 0.45621687173843384,
"learning_rate": 3.276661514683153e-05,
"loss": 7.2994,
"num_input_tokens_seen": 318592,
"step": 425
},
{
"epoch": 0.6646058732612056,
"grad_norm": 0.6815227270126343,
"learning_rate": 3.315301391035549e-05,
"loss": 6.8075,
"num_input_tokens_seen": 322368,
"step": 430
},
{
"epoch": 0.6723338485316847,
"grad_norm": 0.46193644404411316,
"learning_rate": 3.353941267387944e-05,
"loss": 6.5451,
"num_input_tokens_seen": 325824,
"step": 435
},
{
"epoch": 0.6800618238021638,
"grad_norm": 0.6927512884140015,
"learning_rate": 3.3925811437403406e-05,
"loss": 6.85,
"num_input_tokens_seen": 329216,
"step": 440
},
{
"epoch": 0.6877897990726429,
"grad_norm": 0.5209140181541443,
"learning_rate": 3.4312210200927356e-05,
"loss": 6.8214,
"num_input_tokens_seen": 332992,
"step": 445
},
{
"epoch": 0.6955177743431221,
"grad_norm": 0.4288826882839203,
"learning_rate": 3.469860896445132e-05,
"loss": 7.0542,
"num_input_tokens_seen": 336640,
"step": 450
},
{
"epoch": 0.7032457496136012,
"grad_norm": 0.5550535917282104,
"learning_rate": 3.508500772797527e-05,
"loss": 7.034,
"num_input_tokens_seen": 340352,
"step": 455
},
{
"epoch": 0.7109737248840804,
"grad_norm": 0.4508901536464691,
"learning_rate": 3.547140649149923e-05,
"loss": 7.4092,
"num_input_tokens_seen": 344128,
"step": 460
},
{
"epoch": 0.7187017001545595,
"grad_norm": 0.5066952109336853,
"learning_rate": 3.585780525502318e-05,
"loss": 7.0113,
"num_input_tokens_seen": 347584,
"step": 465
},
{
"epoch": 0.7264296754250387,
"grad_norm": 0.42304113507270813,
"learning_rate": 3.624420401854714e-05,
"loss": 7.1517,
"num_input_tokens_seen": 352000,
"step": 470
},
{
"epoch": 0.7341576506955177,
"grad_norm": 0.4524816572666168,
"learning_rate": 3.66306027820711e-05,
"loss": 7.1342,
"num_input_tokens_seen": 356032,
"step": 475
},
{
"epoch": 0.7418856259659969,
"grad_norm": 0.46454742550849915,
"learning_rate": 3.7017001545595054e-05,
"loss": 7.1964,
"num_input_tokens_seen": 359552,
"step": 480
},
{
"epoch": 0.749613601236476,
"grad_norm": 0.4011656939983368,
"learning_rate": 3.740340030911901e-05,
"loss": 6.6278,
"num_input_tokens_seen": 363776,
"step": 485
},
{
"epoch": 0.7573415765069552,
"grad_norm": 0.5052220225334167,
"learning_rate": 3.778979907264297e-05,
"loss": 6.7523,
"num_input_tokens_seen": 367104,
"step": 490
},
{
"epoch": 0.7650695517774343,
"grad_norm": 0.5182458162307739,
"learning_rate": 3.8176197836166925e-05,
"loss": 6.7254,
"num_input_tokens_seen": 370688,
"step": 495
},
{
"epoch": 0.7727975270479135,
"grad_norm": 0.4116693437099457,
"learning_rate": 3.856259659969088e-05,
"loss": 7.038,
"num_input_tokens_seen": 374272,
"step": 500
},
{
"epoch": 0.7805255023183926,
"grad_norm": 0.5274565815925598,
"learning_rate": 3.894899536321484e-05,
"loss": 6.8012,
"num_input_tokens_seen": 377792,
"step": 505
},
{
"epoch": 0.7882534775888718,
"grad_norm": 0.5249013900756836,
"learning_rate": 3.9335394126738795e-05,
"loss": 6.4372,
"num_input_tokens_seen": 381312,
"step": 510
},
{
"epoch": 0.7959814528593508,
"grad_norm": 0.4378226697444916,
"learning_rate": 3.972179289026275e-05,
"loss": 6.7899,
"num_input_tokens_seen": 384896,
"step": 515
},
{
"epoch": 0.80370942812983,
"grad_norm": 0.4792448878288269,
"learning_rate": 4.010819165378671e-05,
"loss": 6.7719,
"num_input_tokens_seen": 388672,
"step": 520
},
{
"epoch": 0.8114374034003091,
"grad_norm": 0.47215649485588074,
"learning_rate": 4.0494590417310666e-05,
"loss": 6.5768,
"num_input_tokens_seen": 392640,
"step": 525
},
{
"epoch": 0.8191653786707882,
"grad_norm": 0.4052186906337738,
"learning_rate": 4.088098918083462e-05,
"loss": 6.4603,
"num_input_tokens_seen": 396032,
"step": 530
},
{
"epoch": 0.8268933539412674,
"grad_norm": 0.40831634402275085,
"learning_rate": 4.126738794435858e-05,
"loss": 6.9047,
"num_input_tokens_seen": 400384,
"step": 535
},
{
"epoch": 0.8346213292117465,
"grad_norm": 0.41408637166023254,
"learning_rate": 4.1653786707882537e-05,
"loss": 6.929,
"num_input_tokens_seen": 404352,
"step": 540
},
{
"epoch": 0.8423493044822257,
"grad_norm": 0.3988915681838989,
"learning_rate": 4.2040185471406493e-05,
"loss": 6.2877,
"num_input_tokens_seen": 408000,
"step": 545
},
{
"epoch": 0.8500772797527048,
"grad_norm": 0.3835439682006836,
"learning_rate": 4.242658423493045e-05,
"loss": 6.6326,
"num_input_tokens_seen": 412032,
"step": 550
},
{
"epoch": 0.8578052550231839,
"grad_norm": 0.42893922328948975,
"learning_rate": 4.281298299845441e-05,
"loss": 6.3484,
"num_input_tokens_seen": 415360,
"step": 555
},
{
"epoch": 0.865533230293663,
"grad_norm": 0.41601845622062683,
"learning_rate": 4.3199381761978364e-05,
"loss": 6.6927,
"num_input_tokens_seen": 419264,
"step": 560
},
{
"epoch": 0.8732612055641422,
"grad_norm": 0.438260942697525,
"learning_rate": 4.358578052550232e-05,
"loss": 6.5952,
"num_input_tokens_seen": 422784,
"step": 565
},
{
"epoch": 0.8809891808346213,
"grad_norm": 0.3734167516231537,
"learning_rate": 4.397217928902628e-05,
"loss": 6.2921,
"num_input_tokens_seen": 427008,
"step": 570
},
{
"epoch": 0.8887171561051005,
"grad_norm": 0.4252164661884308,
"learning_rate": 4.4358578052550235e-05,
"loss": 7.1411,
"num_input_tokens_seen": 431104,
"step": 575
},
{
"epoch": 0.8964451313755796,
"grad_norm": 0.38181161880493164,
"learning_rate": 4.474497681607419e-05,
"loss": 6.791,
"num_input_tokens_seen": 435072,
"step": 580
},
{
"epoch": 0.9041731066460588,
"grad_norm": 0.4132893681526184,
"learning_rate": 4.513137557959815e-05,
"loss": 6.3356,
"num_input_tokens_seen": 439040,
"step": 585
},
{
"epoch": 0.9119010819165378,
"grad_norm": 0.3585706353187561,
"learning_rate": 4.5517774343122105e-05,
"loss": 6.1658,
"num_input_tokens_seen": 442560,
"step": 590
},
{
"epoch": 0.919629057187017,
"grad_norm": 0.3809860348701477,
"learning_rate": 4.590417310664606e-05,
"loss": 6.358,
"num_input_tokens_seen": 446528,
"step": 595
},
{
"epoch": 0.9273570324574961,
"grad_norm": 0.3630789518356323,
"learning_rate": 4.629057187017002e-05,
"loss": 6.3932,
"num_input_tokens_seen": 450176,
"step": 600
},
{
"epoch": 0.9350850077279753,
"grad_norm": 0.3790375590324402,
"learning_rate": 4.6676970633693976e-05,
"loss": 6.4598,
"num_input_tokens_seen": 453952,
"step": 605
},
{
"epoch": 0.9428129829984544,
"grad_norm": 0.36716851592063904,
"learning_rate": 4.706336939721793e-05,
"loss": 6.1113,
"num_input_tokens_seen": 457536,
"step": 610
},
{
"epoch": 0.9505409582689336,
"grad_norm": 0.362441748380661,
"learning_rate": 4.744976816074189e-05,
"loss": 6.3924,
"num_input_tokens_seen": 461120,
"step": 615
},
{
"epoch": 0.9582689335394127,
"grad_norm": 0.3889773190021515,
"learning_rate": 4.7836166924265847e-05,
"loss": 6.4084,
"num_input_tokens_seen": 464960,
"step": 620
},
{
"epoch": 0.9659969088098919,
"grad_norm": 0.45412859320640564,
"learning_rate": 4.8222565687789803e-05,
"loss": 6.3847,
"num_input_tokens_seen": 468736,
"step": 625
},
{
"epoch": 0.973724884080371,
"grad_norm": 0.378328800201416,
"learning_rate": 4.860896445131376e-05,
"loss": 6.0302,
"num_input_tokens_seen": 472768,
"step": 630
},
{
"epoch": 0.98145285935085,
"grad_norm": 0.37784719467163086,
"learning_rate": 4.899536321483772e-05,
"loss": 6.0604,
"num_input_tokens_seen": 476544,
"step": 635
},
{
"epoch": 0.9891808346213292,
"grad_norm": 0.3870941400527954,
"learning_rate": 4.9381761978361674e-05,
"loss": 5.923,
"num_input_tokens_seen": 480384,
"step": 640
},
{
"epoch": 0.9969088098918083,
"grad_norm": 0.3700666129589081,
"learning_rate": 4.976816074188563e-05,
"loss": 5.955,
"num_input_tokens_seen": 483520,
"step": 645
},
{
"epoch": 1.001545595054096,
"eval_loss": 6.029697418212891,
"eval_runtime": 9.8501,
"eval_samples_per_second": 58.375,
"eval_steps_per_second": 7.31,
"num_input_tokens_seen": 485616,
"step": 648
},
{
"epoch": 1.0046367851622875,
"grad_norm": 0.4179936647415161,
"learning_rate": 4.999998544620922e-05,
"loss": 6.3116,
"num_input_tokens_seen": 487472,
"step": 650
},
{
"epoch": 1.0123647604327666,
"grad_norm": 0.40085089206695557,
"learning_rate": 4.999982171625755e-05,
"loss": 5.9363,
"num_input_tokens_seen": 491504,
"step": 655
},
{
"epoch": 1.0200927357032457,
"grad_norm": 0.3790755271911621,
"learning_rate": 4.999947606531115e-05,
"loss": 6.368,
"num_input_tokens_seen": 495152,
"step": 660
},
{
"epoch": 1.027820710973725,
"grad_norm": 0.4374313950538635,
"learning_rate": 4.999894849588528e-05,
"loss": 5.6627,
"num_input_tokens_seen": 498672,
"step": 665
},
{
"epoch": 1.035548686244204,
"grad_norm": 0.3703961968421936,
"learning_rate": 4.9998239011819015e-05,
"loss": 6.3572,
"num_input_tokens_seen": 502640,
"step": 670
},
{
"epoch": 1.0432766615146831,
"grad_norm": 0.4290381669998169,
"learning_rate": 4.999734761827518e-05,
"loss": 5.8942,
"num_input_tokens_seen": 506224,
"step": 675
},
{
"epoch": 1.0510046367851622,
"grad_norm": 0.4251026511192322,
"learning_rate": 4.9996274321740366e-05,
"loss": 5.6945,
"num_input_tokens_seen": 509936,
"step": 680
},
{
"epoch": 1.0587326120556415,
"grad_norm": 0.44041091203689575,
"learning_rate": 4.999501913002482e-05,
"loss": 6.2215,
"num_input_tokens_seen": 513968,
"step": 685
},
{
"epoch": 1.0664605873261206,
"grad_norm": 0.3784466087818146,
"learning_rate": 4.999358205226245e-05,
"loss": 5.9134,
"num_input_tokens_seen": 517552,
"step": 690
},
{
"epoch": 1.0741885625965997,
"grad_norm": 0.4258522391319275,
"learning_rate": 4.999196309891071e-05,
"loss": 5.528,
"num_input_tokens_seen": 520752,
"step": 695
},
{
"epoch": 1.0819165378670788,
"grad_norm": 0.3941080868244171,
"learning_rate": 4.999016228175054e-05,
"loss": 5.5496,
"num_input_tokens_seen": 524208,
"step": 700
},
{
"epoch": 1.089644513137558,
"grad_norm": 0.3999853730201721,
"learning_rate": 4.99881796138863e-05,
"loss": 5.775,
"num_input_tokens_seen": 527856,
"step": 705
},
{
"epoch": 1.0973724884080371,
"grad_norm": 0.43056732416152954,
"learning_rate": 4.998601510974565e-05,
"loss": 5.7852,
"num_input_tokens_seen": 532080,
"step": 710
},
{
"epoch": 1.1051004636785162,
"grad_norm": 0.36167922616004944,
"learning_rate": 4.998366878507945e-05,
"loss": 5.8906,
"num_input_tokens_seen": 535856,
"step": 715
},
{
"epoch": 1.1128284389489953,
"grad_norm": 0.4512415826320648,
"learning_rate": 4.9981140656961645e-05,
"loss": 5.7887,
"num_input_tokens_seen": 539696,
"step": 720
},
{
"epoch": 1.1205564142194744,
"grad_norm": 0.33006787300109863,
"learning_rate": 4.997843074378916e-05,
"loss": 5.9003,
"num_input_tokens_seen": 543408,
"step": 725
},
{
"epoch": 1.1282843894899537,
"grad_norm": 0.3509158790111542,
"learning_rate": 4.9975539065281733e-05,
"loss": 5.78,
"num_input_tokens_seen": 547248,
"step": 730
},
{
"epoch": 1.1360123647604328,
"grad_norm": 0.3338649570941925,
"learning_rate": 4.9972465642481796e-05,
"loss": 5.7212,
"num_input_tokens_seen": 550384,
"step": 735
},
{
"epoch": 1.1437403400309119,
"grad_norm": 0.4186403751373291,
"learning_rate": 4.9969210497754314e-05,
"loss": 5.3607,
"num_input_tokens_seen": 554288,
"step": 740
},
{
"epoch": 1.1514683153013912,
"grad_norm": 0.4077812731266022,
"learning_rate": 4.996577365478663e-05,
"loss": 5.6737,
"num_input_tokens_seen": 558128,
"step": 745
},
{
"epoch": 1.1591962905718702,
"grad_norm": 0.39160779118537903,
"learning_rate": 4.996215513858826e-05,
"loss": 5.6733,
"num_input_tokens_seen": 561584,
"step": 750
},
{
"epoch": 1.1669242658423493,
"grad_norm": 0.42368146777153015,
"learning_rate": 4.995835497549077e-05,
"loss": 5.4552,
"num_input_tokens_seen": 565296,
"step": 755
},
{
"epoch": 1.1746522411128284,
"grad_norm": 0.35899245738983154,
"learning_rate": 4.995437319314753e-05,
"loss": 5.3818,
"num_input_tokens_seen": 569008,
"step": 760
},
{
"epoch": 1.1823802163833075,
"grad_norm": 0.40432843565940857,
"learning_rate": 4.995020982053354e-05,
"loss": 5.4115,
"num_input_tokens_seen": 572656,
"step": 765
},
{
"epoch": 1.1901081916537868,
"grad_norm": 0.34939906001091003,
"learning_rate": 4.9945864887945215e-05,
"loss": 5.6154,
"num_input_tokens_seen": 576496,
"step": 770
},
{
"epoch": 1.1978361669242659,
"grad_norm": 0.40631914138793945,
"learning_rate": 4.994133842700015e-05,
"loss": 5.5658,
"num_input_tokens_seen": 580208,
"step": 775
},
{
"epoch": 1.205564142194745,
"grad_norm": 0.38632699847221375,
"learning_rate": 4.993663047063692e-05,
"loss": 5.421,
"num_input_tokens_seen": 584240,
"step": 780
},
{
"epoch": 1.213292117465224,
"grad_norm": 0.3667367994785309,
"learning_rate": 4.993174105311481e-05,
"loss": 5.2719,
"num_input_tokens_seen": 588400,
"step": 785
},
{
"epoch": 1.2210200927357033,
"grad_norm": 0.3893936574459076,
"learning_rate": 4.992667021001357e-05,
"loss": 5.7396,
"num_input_tokens_seen": 592176,
"step": 790
},
{
"epoch": 1.2287480680061824,
"grad_norm": 0.44007450342178345,
"learning_rate": 4.99214179782332e-05,
"loss": 5.3169,
"num_input_tokens_seen": 595632,
"step": 795
},
{
"epoch": 1.2364760432766615,
"grad_norm": 0.32318371534347534,
"learning_rate": 4.9915984395993606e-05,
"loss": 5.2931,
"num_input_tokens_seen": 599152,
"step": 800
},
{
"epoch": 1.2442040185471406,
"grad_norm": 0.3229334354400635,
"learning_rate": 4.991036950283438e-05,
"loss": 5.4225,
"num_input_tokens_seen": 602800,
"step": 805
},
{
"epoch": 1.2519319938176197,
"grad_norm": 0.36364948749542236,
"learning_rate": 4.990457333961449e-05,
"loss": 5.3279,
"num_input_tokens_seen": 606128,
"step": 810
},
{
"epoch": 1.259659969088099,
"grad_norm": 0.3748265206813812,
"learning_rate": 4.9898595948511984e-05,
"loss": 5.3406,
"num_input_tokens_seen": 609968,
"step": 815
},
{
"epoch": 1.267387944358578,
"grad_norm": 0.35926467180252075,
"learning_rate": 4.9892437373023706e-05,
"loss": 5.5681,
"num_input_tokens_seen": 613808,
"step": 820
},
{
"epoch": 1.2751159196290571,
"grad_norm": 0.3639248311519623,
"learning_rate": 4.988609765796492e-05,
"loss": 5.482,
"num_input_tokens_seen": 617456,
"step": 825
},
{
"epoch": 1.2828438948995364,
"grad_norm": 0.4079735577106476,
"learning_rate": 4.9879576849469065e-05,
"loss": 5.103,
"num_input_tokens_seen": 621168,
"step": 830
},
{
"epoch": 1.2905718701700155,
"grad_norm": 0.327030748128891,
"learning_rate": 4.9872874994987354e-05,
"loss": 4.6912,
"num_input_tokens_seen": 624624,
"step": 835
},
{
"epoch": 1.2982998454404946,
"grad_norm": 0.336750328540802,
"learning_rate": 4.986599214328844e-05,
"loss": 5.288,
"num_input_tokens_seen": 628016,
"step": 840
},
{
"epoch": 1.3060278207109737,
"grad_norm": 0.5080354809761047,
"learning_rate": 4.985892834445811e-05,
"loss": 5.1205,
"num_input_tokens_seen": 631664,
"step": 845
},
{
"epoch": 1.3137557959814528,
"grad_norm": 0.3950064778327942,
"learning_rate": 4.985168364989886e-05,
"loss": 5.1247,
"num_input_tokens_seen": 635824,
"step": 850
},
{
"epoch": 1.321483771251932,
"grad_norm": 0.37459149956703186,
"learning_rate": 4.984425811232954e-05,
"loss": 5.5621,
"num_input_tokens_seen": 639536,
"step": 855
},
{
"epoch": 1.3292117465224111,
"grad_norm": 0.3513096570968628,
"learning_rate": 4.983665178578498e-05,
"loss": 5.4441,
"num_input_tokens_seen": 642992,
"step": 860
},
{
"epoch": 1.3369397217928902,
"grad_norm": 0.3584459125995636,
"learning_rate": 4.98288647256156e-05,
"loss": 4.4869,
"num_input_tokens_seen": 646576,
"step": 865
},
{
"epoch": 1.3446676970633695,
"grad_norm": 0.33611801266670227,
"learning_rate": 4.9820896988487e-05,
"loss": 5.1894,
"num_input_tokens_seen": 650160,
"step": 870
},
{
"epoch": 1.3523956723338486,
"grad_norm": 0.3570202589035034,
"learning_rate": 4.981274863237953e-05,
"loss": 4.8735,
"num_input_tokens_seen": 654256,
"step": 875
},
{
"epoch": 1.3601236476043277,
"grad_norm": 0.4741190969944,
"learning_rate": 4.9804419716587894e-05,
"loss": 4.5883,
"num_input_tokens_seen": 658288,
"step": 880
},
{
"epoch": 1.3678516228748068,
"grad_norm": 0.3228873312473297,
"learning_rate": 4.979591030172072e-05,
"loss": 5.5148,
"num_input_tokens_seen": 662192,
"step": 885
},
{
"epoch": 1.3755795981452859,
"grad_norm": 0.3376190960407257,
"learning_rate": 4.978722044970009e-05,
"loss": 4.937,
"num_input_tokens_seen": 666096,
"step": 890
},
{
"epoch": 1.383307573415765,
"grad_norm": 0.37344226241111755,
"learning_rate": 4.9778350223761115e-05,
"loss": 5.0469,
"num_input_tokens_seen": 670512,
"step": 895
},
{
"epoch": 1.3910355486862442,
"grad_norm": 0.2931409478187561,
"learning_rate": 4.9769299688451475e-05,
"loss": 4.993,
"num_input_tokens_seen": 674480,
"step": 900
},
{
"epoch": 1.3987635239567233,
"grad_norm": 0.43220609426498413,
"learning_rate": 4.976006890963093e-05,
"loss": 5.1053,
"num_input_tokens_seen": 677488,
"step": 905
},
{
"epoch": 1.4064914992272024,
"grad_norm": 0.3765735328197479,
"learning_rate": 4.9750657954470864e-05,
"loss": 5.0701,
"num_input_tokens_seen": 681392,
"step": 910
},
{
"epoch": 1.4142194744976817,
"grad_norm": 0.3108097314834595,
"learning_rate": 4.974106689145377e-05,
"loss": 5.1058,
"num_input_tokens_seen": 685232,
"step": 915
},
{
"epoch": 1.4219474497681608,
"grad_norm": 0.36728906631469727,
"learning_rate": 4.973129579037278e-05,
"loss": 5.318,
"num_input_tokens_seen": 688944,
"step": 920
},
{
"epoch": 1.4296754250386399,
"grad_norm": 0.35344597697257996,
"learning_rate": 4.972134472233113e-05,
"loss": 5.1818,
"num_input_tokens_seen": 692464,
"step": 925
},
{
"epoch": 1.437403400309119,
"grad_norm": 0.37421491742134094,
"learning_rate": 4.971121375974168e-05,
"loss": 4.6604,
"num_input_tokens_seen": 696240,
"step": 930
},
{
"epoch": 1.445131375579598,
"grad_norm": 0.3447347581386566,
"learning_rate": 4.970090297632633e-05,
"loss": 4.8528,
"num_input_tokens_seen": 699952,
"step": 935
},
{
"epoch": 1.4528593508500773,
"grad_norm": 0.3421487808227539,
"learning_rate": 4.969041244711555e-05,
"loss": 4.8747,
"num_input_tokens_seen": 703664,
"step": 940
},
{
"epoch": 1.4605873261205564,
"grad_norm": 0.354444295167923,
"learning_rate": 4.967974224844777e-05,
"loss": 5.0095,
"num_input_tokens_seen": 707696,
"step": 945
},
{
"epoch": 1.4683153013910355,
"grad_norm": 0.36146560311317444,
"learning_rate": 4.966889245796888e-05,
"loss": 4.7609,
"num_input_tokens_seen": 711216,
"step": 950
},
{
"epoch": 1.4760432766615148,
"grad_norm": 0.3156791031360626,
"learning_rate": 4.965786315463162e-05,
"loss": 4.7666,
"num_input_tokens_seen": 714992,
"step": 955
},
{
"epoch": 1.4837712519319939,
"grad_norm": 0.33873096108436584,
"learning_rate": 4.9646654418695055e-05,
"loss": 4.833,
"num_input_tokens_seen": 718704,
"step": 960
},
{
"epoch": 1.491499227202473,
"grad_norm": 0.4051862359046936,
"learning_rate": 4.963526633172392e-05,
"loss": 4.5196,
"num_input_tokens_seen": 722608,
"step": 965
},
{
"epoch": 1.499227202472952,
"grad_norm": 0.32755422592163086,
"learning_rate": 4.9623698976588105e-05,
"loss": 4.6224,
"num_input_tokens_seen": 725744,
"step": 970
},
{
"epoch": 1.5023183925811439,
"eval_loss": 4.67022705078125,
"eval_runtime": 9.8381,
"eval_samples_per_second": 58.446,
"eval_steps_per_second": 7.318,
"num_input_tokens_seen": 727280,
"step": 972
},
{
"epoch": 1.5069551777434311,
"grad_norm": 0.3122451901435852,
"learning_rate": 4.9611952437462e-05,
"loss": 4.6149,
"num_input_tokens_seen": 729456,
"step": 975
},
{
"epoch": 1.5146831530139102,
"grad_norm": 0.3334190845489502,
"learning_rate": 4.960002679982389e-05,
"loss": 4.8694,
"num_input_tokens_seen": 733680,
"step": 980
},
{
"epoch": 1.5224111282843895,
"grad_norm": 0.32536813616752625,
"learning_rate": 4.958792215045535e-05,
"loss": 4.904,
"num_input_tokens_seen": 737584,
"step": 985
},
{
"epoch": 1.5301391035548686,
"grad_norm": 0.4649519622325897,
"learning_rate": 4.9575638577440606e-05,
"loss": 4.9842,
"num_input_tokens_seen": 741680,
"step": 990
},
{
"epoch": 1.537867078825348,
"grad_norm": 0.31299111247062683,
"learning_rate": 4.956317617016589e-05,
"loss": 4.7217,
"num_input_tokens_seen": 745776,
"step": 995
},
{
"epoch": 1.545595054095827,
"grad_norm": 0.3985936939716339,
"learning_rate": 4.955053501931878e-05,
"loss": 4.9462,
"num_input_tokens_seen": 749232,
"step": 1000
},
{
"epoch": 1.553323029366306,
"grad_norm": 0.598997175693512,
"learning_rate": 4.953771521688757e-05,
"loss": 4.4938,
"num_input_tokens_seen": 753008,
"step": 1005
},
{
"epoch": 1.5610510046367851,
"grad_norm": 0.7777657508850098,
"learning_rate": 4.952471685616058e-05,
"loss": 4.3919,
"num_input_tokens_seen": 756912,
"step": 1010
},
{
"epoch": 1.5687789799072642,
"grad_norm": 0.32821112871170044,
"learning_rate": 4.9511540031725454e-05,
"loss": 4.797,
"num_input_tokens_seen": 760432,
"step": 1015
},
{
"epoch": 1.5765069551777433,
"grad_norm": 0.2888381779193878,
"learning_rate": 4.949818483946853e-05,
"loss": 4.3946,
"num_input_tokens_seen": 764592,
"step": 1020
},
{
"epoch": 1.5842349304482226,
"grad_norm": 0.30184459686279297,
"learning_rate": 4.9484651376574094e-05,
"loss": 4.2145,
"num_input_tokens_seen": 767792,
"step": 1025
},
{
"epoch": 1.5919629057187017,
"grad_norm": 0.3076271414756775,
"learning_rate": 4.9470939741523685e-05,
"loss": 5.2852,
"num_input_tokens_seen": 772080,
"step": 1030
},
{
"epoch": 1.599690880989181,
"grad_norm": 0.31468066573143005,
"learning_rate": 4.9457050034095395e-05,
"loss": 4.7483,
"num_input_tokens_seen": 775216,
"step": 1035
},
{
"epoch": 1.60741885625966,
"grad_norm": 0.5978931784629822,
"learning_rate": 4.944298235536311e-05,
"loss": 4.5684,
"num_input_tokens_seen": 778992,
"step": 1040
},
{
"epoch": 1.6151468315301392,
"grad_norm": 0.45990973711013794,
"learning_rate": 4.942873680769581e-05,
"loss": 4.4237,
"num_input_tokens_seen": 782832,
"step": 1045
},
{
"epoch": 1.6228748068006182,
"grad_norm": 0.3652671277523041,
"learning_rate": 4.9414313494756804e-05,
"loss": 4.5137,
"num_input_tokens_seen": 786288,
"step": 1050
},
{
"epoch": 1.6306027820710973,
"grad_norm": 0.3771812915802002,
"learning_rate": 4.9399712521502966e-05,
"loss": 4.4615,
"num_input_tokens_seen": 789552,
"step": 1055
},
{
"epoch": 1.6383307573415764,
"grad_norm": 0.40835604071617126,
"learning_rate": 4.9384933994184016e-05,
"loss": 4.4301,
"num_input_tokens_seen": 793712,
"step": 1060
},
{
"epoch": 1.6460587326120555,
"grad_norm": 0.6361907720565796,
"learning_rate": 4.9369978020341676e-05,
"loss": 4.5741,
"num_input_tokens_seen": 797040,
"step": 1065
},
{
"epoch": 1.6537867078825348,
"grad_norm": 0.2855110466480255,
"learning_rate": 4.9354844708808965e-05,
"loss": 4.3495,
"num_input_tokens_seen": 800688,
"step": 1070
},
{
"epoch": 1.6615146831530139,
"grad_norm": 0.4478287100791931,
"learning_rate": 4.933953416970935e-05,
"loss": 4.6778,
"num_input_tokens_seen": 804656,
"step": 1075
},
{
"epoch": 1.6692426584234932,
"grad_norm": 0.3282228410243988,
"learning_rate": 4.932404651445596e-05,
"loss": 4.492,
"num_input_tokens_seen": 808752,
"step": 1080
},
{
"epoch": 1.6769706336939723,
"grad_norm": 0.38058677315711975,
"learning_rate": 4.930838185575077e-05,
"loss": 4.4508,
"num_input_tokens_seen": 812592,
"step": 1085
},
{
"epoch": 1.6846986089644513,
"grad_norm": 0.3633776307106018,
"learning_rate": 4.929254030758383e-05,
"loss": 4.5136,
"num_input_tokens_seen": 816688,
"step": 1090
},
{
"epoch": 1.6924265842349304,
"grad_norm": 0.4143412411212921,
"learning_rate": 4.927652198523237e-05,
"loss": 4.5167,
"num_input_tokens_seen": 820592,
"step": 1095
},
{
"epoch": 1.7001545595054095,
"grad_norm": 0.2974889874458313,
"learning_rate": 4.926032700525997e-05,
"loss": 4.2801,
"num_input_tokens_seen": 823984,
"step": 1100
},
{
"epoch": 1.7078825347758886,
"grad_norm": 0.31788986921310425,
"learning_rate": 4.924395548551575e-05,
"loss": 4.7661,
"num_input_tokens_seen": 827760,
"step": 1105
},
{
"epoch": 1.7156105100463679,
"grad_norm": 0.3491007685661316,
"learning_rate": 4.9227407545133486e-05,
"loss": 3.9265,
"num_input_tokens_seen": 831408,
"step": 1110
},
{
"epoch": 1.723338485316847,
"grad_norm": 0.2992270886898041,
"learning_rate": 4.921068330453075e-05,
"loss": 4.0714,
"num_input_tokens_seen": 835120,
"step": 1115
},
{
"epoch": 1.7310664605873263,
"grad_norm": 0.31046733260154724,
"learning_rate": 4.9193782885408026e-05,
"loss": 4.1982,
"num_input_tokens_seen": 839216,
"step": 1120
},
{
"epoch": 1.7387944358578054,
"grad_norm": 0.3238506615161896,
"learning_rate": 4.917670641074784e-05,
"loss": 4.4634,
"num_input_tokens_seen": 842352,
"step": 1125
},
{
"epoch": 1.7465224111282844,
"grad_norm": 0.34013789892196655,
"learning_rate": 4.9159454004813854e-05,
"loss": 3.7904,
"num_input_tokens_seen": 845936,
"step": 1130
},
{
"epoch": 1.7542503863987635,
"grad_norm": 0.5322805643081665,
"learning_rate": 4.9142025793149935e-05,
"loss": 4.3029,
"num_input_tokens_seen": 849456,
"step": 1135
},
{
"epoch": 1.7619783616692426,
"grad_norm": 0.489900678396225,
"learning_rate": 4.912442190257931e-05,
"loss": 3.8957,
"num_input_tokens_seen": 852912,
"step": 1140
},
{
"epoch": 1.7697063369397217,
"grad_norm": 0.316989004611969,
"learning_rate": 4.9106642461203575e-05,
"loss": 4.4156,
"num_input_tokens_seen": 857648,
"step": 1145
},
{
"epoch": 1.7774343122102008,
"grad_norm": 0.31556737422943115,
"learning_rate": 4.908868759840181e-05,
"loss": 4.1275,
"num_input_tokens_seen": 861680,
"step": 1150
},
{
"epoch": 1.78516228748068,
"grad_norm": 0.28157737851142883,
"learning_rate": 4.907055744482959e-05,
"loss": 4.3739,
"num_input_tokens_seen": 865456,
"step": 1155
},
{
"epoch": 1.7928902627511591,
"grad_norm": 0.30491557717323303,
"learning_rate": 4.905225213241809e-05,
"loss": 4.083,
"num_input_tokens_seen": 868784,
"step": 1160
},
{
"epoch": 1.8006182380216385,
"grad_norm": 0.35626837611198425,
"learning_rate": 4.9033771794373084e-05,
"loss": 4.2187,
"num_input_tokens_seen": 872560,
"step": 1165
},
{
"epoch": 1.8083462132921175,
"grad_norm": 0.3904974162578583,
"learning_rate": 4.901511656517399e-05,
"loss": 3.9821,
"num_input_tokens_seen": 876528,
"step": 1170
},
{
"epoch": 1.8160741885625966,
"grad_norm": 0.6061235666275024,
"learning_rate": 4.8996286580572895e-05,
"loss": 4.3058,
"num_input_tokens_seen": 880624,
"step": 1175
},
{
"epoch": 1.8238021638330757,
"grad_norm": 0.30002129077911377,
"learning_rate": 4.8977281977593546e-05,
"loss": 4.2416,
"num_input_tokens_seen": 884528,
"step": 1180
},
{
"epoch": 1.8315301391035548,
"grad_norm": 0.29267552495002747,
"learning_rate": 4.8958102894530395e-05,
"loss": 4.0598,
"num_input_tokens_seen": 888176,
"step": 1185
},
{
"epoch": 1.8392581143740339,
"grad_norm": 0.33920446038246155,
"learning_rate": 4.8938749470947534e-05,
"loss": 4.432,
"num_input_tokens_seen": 892336,
"step": 1190
},
{
"epoch": 1.8469860896445132,
"grad_norm": 0.3370172679424286,
"learning_rate": 4.8919221847677744e-05,
"loss": 4.141,
"num_input_tokens_seen": 896496,
"step": 1195
},
{
"epoch": 1.8547140649149922,
"grad_norm": 0.2805825471878052,
"learning_rate": 4.889952016682142e-05,
"loss": 4.4681,
"num_input_tokens_seen": 900208,
"step": 1200
},
{
"epoch": 1.8624420401854715,
"grad_norm": 0.3391216993331909,
"learning_rate": 4.8879644571745565e-05,
"loss": 4.492,
"num_input_tokens_seen": 903536,
"step": 1205
},
{
"epoch": 1.8701700154559506,
"grad_norm": 0.7657363414764404,
"learning_rate": 4.885959520708272e-05,
"loss": 3.8387,
"num_input_tokens_seen": 907120,
"step": 1210
},
{
"epoch": 1.8778979907264297,
"grad_norm": 0.3277113139629364,
"learning_rate": 4.883937221872995e-05,
"loss": 4.0781,
"num_input_tokens_seen": 910576,
"step": 1215
},
{
"epoch": 1.8856259659969088,
"grad_norm": 0.3017078638076782,
"learning_rate": 4.881897575384774e-05,
"loss": 3.5843,
"num_input_tokens_seen": 914416,
"step": 1220
},
{
"epoch": 1.8933539412673879,
"grad_norm": 0.32566162943840027,
"learning_rate": 4.879840596085897e-05,
"loss": 3.9687,
"num_input_tokens_seen": 918064,
"step": 1225
},
{
"epoch": 1.901081916537867,
"grad_norm": 0.285196989774704,
"learning_rate": 4.877766298944779e-05,
"loss": 4.0323,
"num_input_tokens_seen": 922160,
"step": 1230
},
{
"epoch": 1.9088098918083463,
"grad_norm": 0.31941235065460205,
"learning_rate": 4.875674699055855e-05,
"loss": 4.0946,
"num_input_tokens_seen": 926128,
"step": 1235
},
{
"epoch": 1.9165378670788253,
"grad_norm": 0.36172521114349365,
"learning_rate": 4.8735658116394714e-05,
"loss": 4.3137,
"num_input_tokens_seen": 930224,
"step": 1240
},
{
"epoch": 1.9242658423493046,
"grad_norm": 0.2846207618713379,
"learning_rate": 4.871439652041773e-05,
"loss": 3.8571,
"num_input_tokens_seen": 934000,
"step": 1245
},
{
"epoch": 1.9319938176197837,
"grad_norm": 0.4269625246524811,
"learning_rate": 4.869296235734594e-05,
"loss": 4.116,
"num_input_tokens_seen": 938160,
"step": 1250
},
{
"epoch": 1.9397217928902628,
"grad_norm": 0.24182330071926117,
"learning_rate": 4.8671355783153415e-05,
"loss": 4.1353,
"num_input_tokens_seen": 941552,
"step": 1255
},
{
"epoch": 1.947449768160742,
"grad_norm": 0.2979944348335266,
"learning_rate": 4.864957695506885e-05,
"loss": 4.0559,
"num_input_tokens_seen": 945200,
"step": 1260
},
{
"epoch": 1.955177743431221,
"grad_norm": 0.2942993640899658,
"learning_rate": 4.862762603157445e-05,
"loss": 3.7291,
"num_input_tokens_seen": 949040,
"step": 1265
},
{
"epoch": 1.9629057187017,
"grad_norm": 0.2731165289878845,
"learning_rate": 4.860550317240467e-05,
"loss": 4.0914,
"num_input_tokens_seen": 952944,
"step": 1270
},
{
"epoch": 1.9706336939721791,
"grad_norm": 0.2240080088376999,
"learning_rate": 4.8583208538545175e-05,
"loss": 3.7287,
"num_input_tokens_seen": 956912,
"step": 1275
},
{
"epoch": 1.9783616692426584,
"grad_norm": 0.3749920427799225,
"learning_rate": 4.856074229223161e-05,
"loss": 3.8797,
"num_input_tokens_seen": 960752,
"step": 1280
},
{
"epoch": 1.9860896445131375,
"grad_norm": 0.24248334765434265,
"learning_rate": 4.85381045969484e-05,
"loss": 4.2639,
"num_input_tokens_seen": 964336,
"step": 1285
},
{
"epoch": 1.9938176197836168,
"grad_norm": 0.6645636558532715,
"learning_rate": 4.851529561742762e-05,
"loss": 3.9315,
"num_input_tokens_seen": 967600,
"step": 1290
},
{
"epoch": 2.001545595054096,
"grad_norm": 0.5366644263267517,
"learning_rate": 4.849231551964771e-05,
"loss": 3.407,
"num_input_tokens_seen": 970704,
"step": 1295
},
{
"epoch": 2.003091190108192,
"eval_loss": 3.904050588607788,
"eval_runtime": 9.8507,
"eval_samples_per_second": 58.372,
"eval_steps_per_second": 7.309,
"num_input_tokens_seen": 971536,
"step": 1296
},
{
"epoch": 2.009273570324575,
"grad_norm": 0.30658474564552307,
"learning_rate": 4.846916447083239e-05,
"loss": 4.2572,
"num_input_tokens_seen": 974416,
"step": 1300
},
{
"epoch": 2.017001545595054,
"grad_norm": 0.2476230412721634,
"learning_rate": 4.8445842639449313e-05,
"loss": 3.871,
"num_input_tokens_seen": 978704,
"step": 1305
},
{
"epoch": 2.024729520865533,
"grad_norm": 0.2552611529827118,
"learning_rate": 4.842235019520893e-05,
"loss": 4.0268,
"num_input_tokens_seen": 982480,
"step": 1310
},
{
"epoch": 2.0324574961360122,
"grad_norm": 0.243674173951149,
"learning_rate": 4.8398687309063206e-05,
"loss": 3.9918,
"num_input_tokens_seen": 986384,
"step": 1315
},
{
"epoch": 2.0401854714064913,
"grad_norm": 0.34793391823768616,
"learning_rate": 4.8374854153204405e-05,
"loss": 4.1587,
"num_input_tokens_seen": 990032,
"step": 1320
},
{
"epoch": 2.047913446676971,
"grad_norm": 0.24952645599842072,
"learning_rate": 4.835085090106382e-05,
"loss": 3.9453,
"num_input_tokens_seen": 993424,
"step": 1325
},
{
"epoch": 2.05564142194745,
"grad_norm": 0.4589490294456482,
"learning_rate": 4.832667772731051e-05,
"loss": 3.7476,
"num_input_tokens_seen": 997648,
"step": 1330
},
{
"epoch": 2.063369397217929,
"grad_norm": 0.2599397897720337,
"learning_rate": 4.830233480785005e-05,
"loss": 3.6993,
"num_input_tokens_seen": 1001360,
"step": 1335
},
{
"epoch": 2.071097372488408,
"grad_norm": 0.31336286664009094,
"learning_rate": 4.827782231982323e-05,
"loss": 4.0099,
"num_input_tokens_seen": 1005200,
"step": 1340
},
{
"epoch": 2.078825347758887,
"grad_norm": 0.2606953978538513,
"learning_rate": 4.8253140441604764e-05,
"loss": 4.0381,
"num_input_tokens_seen": 1008912,
"step": 1345
},
{
"epoch": 2.0865533230293662,
"grad_norm": 0.38538479804992676,
"learning_rate": 4.8228289352802006e-05,
"loss": 3.5811,
"num_input_tokens_seen": 1012688,
"step": 1350
},
{
"epoch": 2.0942812982998453,
"grad_norm": 0.29820919036865234,
"learning_rate": 4.820326923425364e-05,
"loss": 3.8514,
"num_input_tokens_seen": 1015952,
"step": 1355
},
{
"epoch": 2.1020092735703244,
"grad_norm": 0.338681697845459,
"learning_rate": 4.817808026802836e-05,
"loss": 3.8809,
"num_input_tokens_seen": 1019664,
"step": 1360
},
{
"epoch": 2.109737248840804,
"grad_norm": 0.38213613629341125,
"learning_rate": 4.815272263742354e-05,
"loss": 3.7324,
"num_input_tokens_seen": 1023632,
"step": 1365
},
{
"epoch": 2.117465224111283,
"grad_norm": 0.24088416993618011,
"learning_rate": 4.812719652696392e-05,
"loss": 3.6557,
"num_input_tokens_seen": 1027728,
"step": 1370
},
{
"epoch": 2.125193199381762,
"grad_norm": 0.32198968529701233,
"learning_rate": 4.810150212240023e-05,
"loss": 3.7389,
"num_input_tokens_seen": 1031696,
"step": 1375
},
{
"epoch": 2.132921174652241,
"grad_norm": 0.308168888092041,
"learning_rate": 4.807563961070788e-05,
"loss": 4.219,
"num_input_tokens_seen": 1035472,
"step": 1380
},
{
"epoch": 2.1406491499227203,
"grad_norm": 0.2740299105644226,
"learning_rate": 4.804960918008557e-05,
"loss": 2.9497,
"num_input_tokens_seen": 1038672,
"step": 1385
},
{
"epoch": 2.1483771251931993,
"grad_norm": 0.3373715281486511,
"learning_rate": 4.802341101995389e-05,
"loss": 4.0791,
"num_input_tokens_seen": 1042192,
"step": 1390
},
{
"epoch": 2.1561051004636784,
"grad_norm": 0.28904953598976135,
"learning_rate": 4.7997045320954056e-05,
"loss": 3.8542,
"num_input_tokens_seen": 1045712,
"step": 1395
},
{
"epoch": 2.1638330757341575,
"grad_norm": 0.27055951952934265,
"learning_rate": 4.797051227494638e-05,
"loss": 3.7141,
"num_input_tokens_seen": 1049552,
"step": 1400
},
{
"epoch": 2.1715610510046366,
"grad_norm": 0.3160632252693176,
"learning_rate": 4.7943812075008975e-05,
"loss": 3.9331,
"num_input_tokens_seen": 1052944,
"step": 1405
},
{
"epoch": 2.179289026275116,
"grad_norm": 0.4832019805908203,
"learning_rate": 4.791694491543629e-05,
"loss": 3.5992,
"num_input_tokens_seen": 1056656,
"step": 1410
},
{
"epoch": 2.187017001545595,
"grad_norm": 0.5526258945465088,
"learning_rate": 4.788991099173775e-05,
"loss": 3.9143,
"num_input_tokens_seen": 1060560,
"step": 1415
},
{
"epoch": 2.1947449768160743,
"grad_norm": 0.32115232944488525,
"learning_rate": 4.786271050063629e-05,
"loss": 3.9486,
"num_input_tokens_seen": 1064528,
"step": 1420
},
{
"epoch": 2.2024729520865534,
"grad_norm": 0.25502118468284607,
"learning_rate": 4.783534364006692e-05,
"loss": 3.5801,
"num_input_tokens_seen": 1067856,
"step": 1425
},
{
"epoch": 2.2102009273570324,
"grad_norm": 0.38059893250465393,
"learning_rate": 4.780781060917533e-05,
"loss": 3.5739,
"num_input_tokens_seen": 1071248,
"step": 1430
},
{
"epoch": 2.2179289026275115,
"grad_norm": 0.3651339113712311,
"learning_rate": 4.778011160831641e-05,
"loss": 3.5081,
"num_input_tokens_seen": 1074832,
"step": 1435
},
{
"epoch": 2.2256568778979906,
"grad_norm": 0.24716055393218994,
"learning_rate": 4.7752246839052785e-05,
"loss": 3.8176,
"num_input_tokens_seen": 1079184,
"step": 1440
},
{
"epoch": 2.2333848531684697,
"grad_norm": 0.2873936891555786,
"learning_rate": 4.7724216504153356e-05,
"loss": 4.0717,
"num_input_tokens_seen": 1082832,
"step": 1445
},
{
"epoch": 2.2411128284389488,
"grad_norm": 0.4039398431777954,
"learning_rate": 4.769602080759185e-05,
"loss": 3.8992,
"num_input_tokens_seen": 1086544,
"step": 1450
},
{
"epoch": 2.2488408037094283,
"grad_norm": 0.2683480679988861,
"learning_rate": 4.766765995454527e-05,
"loss": 3.2922,
"num_input_tokens_seen": 1090512,
"step": 1455
},
{
"epoch": 2.2565687789799074,
"grad_norm": 0.23535872995853424,
"learning_rate": 4.76391341513925e-05,
"loss": 3.7648,
"num_input_tokens_seen": 1094352,
"step": 1460
},
{
"epoch": 2.2642967542503865,
"grad_norm": 0.3309881389141083,
"learning_rate": 4.7610443605712696e-05,
"loss": 3.7942,
"num_input_tokens_seen": 1098128,
"step": 1465
},
{
"epoch": 2.2720247295208655,
"grad_norm": 0.33581194281578064,
"learning_rate": 4.758158852628387e-05,
"loss": 3.4687,
"num_input_tokens_seen": 1101392,
"step": 1470
},
{
"epoch": 2.2797527047913446,
"grad_norm": 0.30702441930770874,
"learning_rate": 4.7552569123081305e-05,
"loss": 3.9292,
"num_input_tokens_seen": 1104848,
"step": 1475
},
{
"epoch": 2.2874806800618237,
"grad_norm": 0.26855015754699707,
"learning_rate": 4.752338560727604e-05,
"loss": 3.51,
"num_input_tokens_seen": 1108560,
"step": 1480
},
{
"epoch": 2.295208655332303,
"grad_norm": 0.30905085802078247,
"learning_rate": 4.749403819123338e-05,
"loss": 3.7468,
"num_input_tokens_seen": 1112784,
"step": 1485
},
{
"epoch": 2.3029366306027823,
"grad_norm": 0.31154030561447144,
"learning_rate": 4.746452708851128e-05,
"loss": 4.1219,
"num_input_tokens_seen": 1116560,
"step": 1490
},
{
"epoch": 2.3106646058732614,
"grad_norm": 0.3405595123767853,
"learning_rate": 4.7434852513858844e-05,
"loss": 3.9791,
"num_input_tokens_seen": 1120272,
"step": 1495
},
{
"epoch": 2.3183925811437405,
"grad_norm": 0.3933602273464203,
"learning_rate": 4.740501468321473e-05,
"loss": 3.5535,
"num_input_tokens_seen": 1124048,
"step": 1500
},
{
"epoch": 2.3261205564142196,
"grad_norm": 0.26671847701072693,
"learning_rate": 4.737501381370561e-05,
"loss": 3.9563,
"num_input_tokens_seen": 1127952,
"step": 1505
},
{
"epoch": 2.3338485316846986,
"grad_norm": 0.2816687226295471,
"learning_rate": 4.7344850123644555e-05,
"loss": 3.5441,
"num_input_tokens_seen": 1132240,
"step": 1510
},
{
"epoch": 2.3415765069551777,
"grad_norm": 0.4195476472377777,
"learning_rate": 4.7314523832529465e-05,
"loss": 3.9082,
"num_input_tokens_seen": 1135952,
"step": 1515
},
{
"epoch": 2.349304482225657,
"grad_norm": 0.3049340844154358,
"learning_rate": 4.728403516104149e-05,
"loss": 3.6012,
"num_input_tokens_seen": 1139664,
"step": 1520
},
{
"epoch": 2.357032457496136,
"grad_norm": 0.3365153968334198,
"learning_rate": 4.725338433104337e-05,
"loss": 3.7627,
"num_input_tokens_seen": 1143312,
"step": 1525
},
{
"epoch": 2.364760432766615,
"grad_norm": 0.2289888709783554,
"learning_rate": 4.72225715655779e-05,
"loss": 3.848,
"num_input_tokens_seen": 1147216,
"step": 1530
},
{
"epoch": 2.3724884080370945,
"grad_norm": 0.26810458302497864,
"learning_rate": 4.719159708886621e-05,
"loss": 4.0156,
"num_input_tokens_seen": 1150864,
"step": 1535
},
{
"epoch": 2.3802163833075736,
"grad_norm": 0.670549750328064,
"learning_rate": 4.716046112630623e-05,
"loss": 3.466,
"num_input_tokens_seen": 1154384,
"step": 1540
},
{
"epoch": 2.3879443585780527,
"grad_norm": 0.4632706046104431,
"learning_rate": 4.712916390447099e-05,
"loss": 3.454,
"num_input_tokens_seen": 1158288,
"step": 1545
},
{
"epoch": 2.3956723338485317,
"grad_norm": 0.2911776602268219,
"learning_rate": 4.709770565110697e-05,
"loss": 3.6318,
"num_input_tokens_seen": 1161744,
"step": 1550
},
{
"epoch": 2.403400309119011,
"grad_norm": 0.2800629734992981,
"learning_rate": 4.7066086595132486e-05,
"loss": 3.9736,
"num_input_tokens_seen": 1165584,
"step": 1555
},
{
"epoch": 2.41112828438949,
"grad_norm": 0.2719181776046753,
"learning_rate": 4.7034306966635966e-05,
"loss": 3.7662,
"num_input_tokens_seen": 1169104,
"step": 1560
},
{
"epoch": 2.418856259659969,
"grad_norm": 0.34364286065101624,
"learning_rate": 4.700236699687434e-05,
"loss": 3.5639,
"num_input_tokens_seen": 1173200,
"step": 1565
},
{
"epoch": 2.426584234930448,
"grad_norm": 0.25128039717674255,
"learning_rate": 4.697026691827129e-05,
"loss": 3.6556,
"num_input_tokens_seen": 1176848,
"step": 1570
},
{
"epoch": 2.434312210200927,
"grad_norm": 0.38562244176864624,
"learning_rate": 4.693800696441564e-05,
"loss": 3.3021,
"num_input_tokens_seen": 1180432,
"step": 1575
},
{
"epoch": 2.4420401854714067,
"grad_norm": 0.2515873312950134,
"learning_rate": 4.690558737005955e-05,
"loss": 3.5264,
"num_input_tokens_seen": 1184528,
"step": 1580
},
{
"epoch": 2.4497681607418857,
"grad_norm": 0.3244396448135376,
"learning_rate": 4.687300837111691e-05,
"loss": 3.2549,
"num_input_tokens_seen": 1188560,
"step": 1585
},
{
"epoch": 2.457496136012365,
"grad_norm": 0.23380200564861298,
"learning_rate": 4.6840270204661575e-05,
"loss": 3.4865,
"num_input_tokens_seen": 1192656,
"step": 1590
},
{
"epoch": 2.465224111282844,
"grad_norm": 0.24694781005382538,
"learning_rate": 4.6807373108925626e-05,
"loss": 3.6555,
"num_input_tokens_seen": 1196176,
"step": 1595
},
{
"epoch": 2.472952086553323,
"grad_norm": 0.42272141575813293,
"learning_rate": 4.677431732329766e-05,
"loss": 3.3435,
"num_input_tokens_seen": 1199824,
"step": 1600
},
{
"epoch": 2.480680061823802,
"grad_norm": 0.31362879276275635,
"learning_rate": 4.674110308832106e-05,
"loss": 3.6951,
"num_input_tokens_seen": 1204048,
"step": 1605
},
{
"epoch": 2.488408037094281,
"grad_norm": 0.22943803668022156,
"learning_rate": 4.670773064569221e-05,
"loss": 3.8555,
"num_input_tokens_seen": 1207632,
"step": 1610
},
{
"epoch": 2.4961360123647607,
"grad_norm": 0.26428958773612976,
"learning_rate": 4.667420023825876e-05,
"loss": 3.4376,
"num_input_tokens_seen": 1211216,
"step": 1615
},
{
"epoch": 2.5038639876352393,
"grad_norm": 0.39434218406677246,
"learning_rate": 4.664051211001786e-05,
"loss": 3.3945,
"num_input_tokens_seen": 1214864,
"step": 1620
},
{
"epoch": 2.5038639876352393,
"eval_loss": 3.5485548973083496,
"eval_runtime": 9.8824,
"eval_samples_per_second": 58.185,
"eval_steps_per_second": 7.286,
"num_input_tokens_seen": 1214864,
"step": 1620
},
{
"epoch": 2.511591962905719,
"grad_norm": 0.49116188287734985,
"learning_rate": 4.660666650611436e-05,
"loss": 3.3948,
"num_input_tokens_seen": 1218960,
"step": 1625
},
{
"epoch": 2.519319938176198,
"grad_norm": 0.5065126419067383,
"learning_rate": 4.657266367283906e-05,
"loss": 3.1531,
"num_input_tokens_seen": 1223120,
"step": 1630
},
{
"epoch": 2.527047913446677,
"grad_norm": 0.395476758480072,
"learning_rate": 4.653850385762689e-05,
"loss": 3.2479,
"num_input_tokens_seen": 1226768,
"step": 1635
},
{
"epoch": 2.534775888717156,
"grad_norm": 0.36967065930366516,
"learning_rate": 4.6504187309055135e-05,
"loss": 3.4955,
"num_input_tokens_seen": 1230352,
"step": 1640
},
{
"epoch": 2.542503863987635,
"grad_norm": 0.25611770153045654,
"learning_rate": 4.646971427684159e-05,
"loss": 3.8143,
"num_input_tokens_seen": 1233936,
"step": 1645
},
{
"epoch": 2.5502318392581143,
"grad_norm": 0.4444847106933594,
"learning_rate": 4.6435085011842785e-05,
"loss": 3.7283,
"num_input_tokens_seen": 1237392,
"step": 1650
},
{
"epoch": 2.5579598145285933,
"grad_norm": 0.5292262434959412,
"learning_rate": 4.6400299766052126e-05,
"loss": 3.6426,
"num_input_tokens_seen": 1241552,
"step": 1655
},
{
"epoch": 2.565687789799073,
"grad_norm": 0.24306781589984894,
"learning_rate": 4.636535879259808e-05,
"loss": 3.837,
"num_input_tokens_seen": 1245008,
"step": 1660
},
{
"epoch": 2.573415765069552,
"grad_norm": 0.2738051414489746,
"learning_rate": 4.633026234574232e-05,
"loss": 3.4779,
"num_input_tokens_seen": 1248592,
"step": 1665
},
{
"epoch": 2.581143740340031,
"grad_norm": 0.26396363973617554,
"learning_rate": 4.62950106808779e-05,
"loss": 3.3651,
"num_input_tokens_seen": 1252496,
"step": 1670
},
{
"epoch": 2.58887171561051,
"grad_norm": 0.3096264898777008,
"learning_rate": 4.6259604054527364e-05,
"loss": 3.7189,
"num_input_tokens_seen": 1255824,
"step": 1675
},
{
"epoch": 2.596599690880989,
"grad_norm": 0.37881341576576233,
"learning_rate": 4.622404272434089e-05,
"loss": 3.8145,
"num_input_tokens_seen": 1259728,
"step": 1680
},
{
"epoch": 2.6043276661514683,
"grad_norm": 0.5067923665046692,
"learning_rate": 4.6188326949094425e-05,
"loss": 3.2801,
"num_input_tokens_seen": 1263376,
"step": 1685
},
{
"epoch": 2.6120556414219473,
"grad_norm": 0.30508747696876526,
"learning_rate": 4.615245698868781e-05,
"loss": 3.7173,
"num_input_tokens_seen": 1267472,
"step": 1690
},
{
"epoch": 2.6197836166924264,
"grad_norm": 0.2755107879638672,
"learning_rate": 4.6116433104142845e-05,
"loss": 3.4027,
"num_input_tokens_seen": 1271312,
"step": 1695
},
{
"epoch": 2.6275115919629055,
"grad_norm": 0.4445127248764038,
"learning_rate": 4.608025555760145e-05,
"loss": 3.373,
"num_input_tokens_seen": 1274960,
"step": 1700
},
{
"epoch": 2.635239567233385,
"grad_norm": 0.2652303874492645,
"learning_rate": 4.604392461232371e-05,
"loss": 3.491,
"num_input_tokens_seen": 1278160,
"step": 1705
},
{
"epoch": 2.642967542503864,
"grad_norm": 0.27256447076797485,
"learning_rate": 4.600744053268596e-05,
"loss": 3.7528,
"num_input_tokens_seen": 1282128,
"step": 1710
},
{
"epoch": 2.650695517774343,
"grad_norm": 0.30931556224823,
"learning_rate": 4.597080358417893e-05,
"loss": 3.4361,
"num_input_tokens_seen": 1285840,
"step": 1715
},
{
"epoch": 2.6584234930448223,
"grad_norm": 0.2930947244167328,
"learning_rate": 4.5934014033405695e-05,
"loss": 3.1586,
"num_input_tokens_seen": 1289744,
"step": 1720
},
{
"epoch": 2.6661514683153014,
"grad_norm": 0.22680403292179108,
"learning_rate": 4.5897072148079846e-05,
"loss": 3.3894,
"num_input_tokens_seen": 1293840,
"step": 1725
},
{
"epoch": 2.6738794435857804,
"grad_norm": 0.25522181391716003,
"learning_rate": 4.585997819702348e-05,
"loss": 3.6516,
"num_input_tokens_seen": 1297872,
"step": 1730
},
{
"epoch": 2.6816074188562595,
"grad_norm": 0.3003213405609131,
"learning_rate": 4.5822732450165253e-05,
"loss": 3.3977,
"num_input_tokens_seen": 1301712,
"step": 1735
},
{
"epoch": 2.689335394126739,
"grad_norm": 0.287356972694397,
"learning_rate": 4.5785335178538444e-05,
"loss": 3.229,
"num_input_tokens_seen": 1305040,
"step": 1740
},
{
"epoch": 2.6970633693972177,
"grad_norm": 0.23212386667728424,
"learning_rate": 4.5747786654278936e-05,
"loss": 3.4395,
"num_input_tokens_seen": 1308880,
"step": 1745
},
{
"epoch": 2.704791344667697,
"grad_norm": 0.4021557569503784,
"learning_rate": 4.5710087150623274e-05,
"loss": 3.5296,
"num_input_tokens_seen": 1312464,
"step": 1750
},
{
"epoch": 2.7125193199381763,
"grad_norm": 0.27974724769592285,
"learning_rate": 4.567223694190667e-05,
"loss": 3.5769,
"num_input_tokens_seen": 1316240,
"step": 1755
},
{
"epoch": 2.7202472952086554,
"grad_norm": 0.29455214738845825,
"learning_rate": 4.563423630356099e-05,
"loss": 3.1926,
"num_input_tokens_seen": 1320080,
"step": 1760
},
{
"epoch": 2.7279752704791345,
"grad_norm": 0.6454901695251465,
"learning_rate": 4.559608551211276e-05,
"loss": 3.4921,
"num_input_tokens_seen": 1323408,
"step": 1765
},
{
"epoch": 2.7357032457496135,
"grad_norm": 0.41331547498703003,
"learning_rate": 4.555778484518116e-05,
"loss": 3.5073,
"num_input_tokens_seen": 1327376,
"step": 1770
},
{
"epoch": 2.7434312210200926,
"grad_norm": 0.26140904426574707,
"learning_rate": 4.551933458147599e-05,
"loss": 3.6708,
"num_input_tokens_seen": 1331152,
"step": 1775
},
{
"epoch": 2.7511591962905717,
"grad_norm": 0.2431512176990509,
"learning_rate": 4.548073500079566e-05,
"loss": 3.2524,
"num_input_tokens_seen": 1335120,
"step": 1780
},
{
"epoch": 2.7588871715610512,
"grad_norm": 0.28115934133529663,
"learning_rate": 4.544198638402514e-05,
"loss": 3.2425,
"num_input_tokens_seen": 1338576,
"step": 1785
},
{
"epoch": 2.76661514683153,
"grad_norm": 0.31076323986053467,
"learning_rate": 4.5403089013133905e-05,
"loss": 3.5338,
"num_input_tokens_seen": 1342288,
"step": 1790
},
{
"epoch": 2.7743431221020094,
"grad_norm": 0.3837130069732666,
"learning_rate": 4.536404317117392e-05,
"loss": 3.5508,
"num_input_tokens_seen": 1346192,
"step": 1795
},
{
"epoch": 2.7820710973724885,
"grad_norm": 0.4085627794265747,
"learning_rate": 4.5324849142277545e-05,
"loss": 3.3496,
"num_input_tokens_seen": 1350160,
"step": 1800
},
{
"epoch": 2.7897990726429676,
"grad_norm": 0.393399715423584,
"learning_rate": 4.5285507211655486e-05,
"loss": 3.7701,
"num_input_tokens_seen": 1354320,
"step": 1805
},
{
"epoch": 2.7975270479134466,
"grad_norm": 0.32475441694259644,
"learning_rate": 4.52460176655947e-05,
"loss": 3.3192,
"num_input_tokens_seen": 1358096,
"step": 1810
},
{
"epoch": 2.8052550231839257,
"grad_norm": 0.38561108708381653,
"learning_rate": 4.520638079145635e-05,
"loss": 3.4763,
"num_input_tokens_seen": 1361552,
"step": 1815
},
{
"epoch": 2.812982998454405,
"grad_norm": 0.2731021046638489,
"learning_rate": 4.516659687767367e-05,
"loss": 3.6312,
"num_input_tokens_seen": 1365776,
"step": 1820
},
{
"epoch": 2.820710973724884,
"grad_norm": 0.2799111604690552,
"learning_rate": 4.512666621374989e-05,
"loss": 3.4312,
"num_input_tokens_seen": 1369296,
"step": 1825
},
{
"epoch": 2.8284389489953634,
"grad_norm": 0.40603402256965637,
"learning_rate": 4.5086589090256124e-05,
"loss": 3.6473,
"num_input_tokens_seen": 1372752,
"step": 1830
},
{
"epoch": 2.8361669242658425,
"grad_norm": 0.31322500109672546,
"learning_rate": 4.5046365798829265e-05,
"loss": 3.5422,
"num_input_tokens_seen": 1376336,
"step": 1835
},
{
"epoch": 2.8438948995363216,
"grad_norm": 0.3008683919906616,
"learning_rate": 4.5005996632169845e-05,
"loss": 3.3106,
"num_input_tokens_seen": 1379664,
"step": 1840
},
{
"epoch": 2.8516228748068007,
"grad_norm": 0.2807864844799042,
"learning_rate": 4.4965481884039915e-05,
"loss": 3.4939,
"num_input_tokens_seen": 1383632,
"step": 1845
},
{
"epoch": 2.8593508500772797,
"grad_norm": 0.31224992871284485,
"learning_rate": 4.492482184926091e-05,
"loss": 3.5265,
"num_input_tokens_seen": 1387856,
"step": 1850
},
{
"epoch": 2.867078825347759,
"grad_norm": 0.367418497800827,
"learning_rate": 4.48840168237115e-05,
"loss": 3.6019,
"num_input_tokens_seen": 1391888,
"step": 1855
},
{
"epoch": 2.874806800618238,
"grad_norm": 0.23605087399482727,
"learning_rate": 4.484306710432544e-05,
"loss": 3.2355,
"num_input_tokens_seen": 1395344,
"step": 1860
},
{
"epoch": 2.8825347758887174,
"grad_norm": 0.42899090051651,
"learning_rate": 4.480197298908939e-05,
"loss": 3.3917,
"num_input_tokens_seen": 1398928,
"step": 1865
},
{
"epoch": 2.890262751159196,
"grad_norm": 0.23555681109428406,
"learning_rate": 4.4760734777040785e-05,
"loss": 3.5563,
"num_input_tokens_seen": 1402512,
"step": 1870
},
{
"epoch": 2.8979907264296756,
"grad_norm": 0.3788621127605438,
"learning_rate": 4.471935276826563e-05,
"loss": 3.4171,
"num_input_tokens_seen": 1406544,
"step": 1875
},
{
"epoch": 2.9057187017001547,
"grad_norm": 0.2907779812812805,
"learning_rate": 4.4677827263896315e-05,
"loss": 3.5528,
"num_input_tokens_seen": 1410064,
"step": 1880
},
{
"epoch": 2.9134466769706338,
"grad_norm": 0.29947736859321594,
"learning_rate": 4.463615856610943e-05,
"loss": 3.3987,
"num_input_tokens_seen": 1413648,
"step": 1885
},
{
"epoch": 2.921174652241113,
"grad_norm": 0.35539594292640686,
"learning_rate": 4.4594346978123595e-05,
"loss": 3.3475,
"num_input_tokens_seen": 1417232,
"step": 1890
},
{
"epoch": 2.928902627511592,
"grad_norm": 0.24370482563972473,
"learning_rate": 4.45523928041972e-05,
"loss": 3.565,
"num_input_tokens_seen": 1420560,
"step": 1895
},
{
"epoch": 2.936630602782071,
"grad_norm": 0.3878605365753174,
"learning_rate": 4.45102963496262e-05,
"loss": 3.2705,
"num_input_tokens_seen": 1424656,
"step": 1900
},
{
"epoch": 2.94435857805255,
"grad_norm": 0.24530839920043945,
"learning_rate": 4.4468057920741976e-05,
"loss": 3.6164,
"num_input_tokens_seen": 1428688,
"step": 1905
},
{
"epoch": 2.9520865533230296,
"grad_norm": 0.2778976857662201,
"learning_rate": 4.442567782490897e-05,
"loss": 3.4781,
"num_input_tokens_seen": 1432144,
"step": 1910
},
{
"epoch": 2.9598145285935082,
"grad_norm": 0.39094430208206177,
"learning_rate": 4.4383156370522554e-05,
"loss": 3.5724,
"num_input_tokens_seen": 1435792,
"step": 1915
},
{
"epoch": 2.9675425038639878,
"grad_norm": 0.22923092544078827,
"learning_rate": 4.434049386700676e-05,
"loss": 3.2843,
"num_input_tokens_seen": 1439120,
"step": 1920
},
{
"epoch": 2.975270479134467,
"grad_norm": 0.2819403409957886,
"learning_rate": 4.4297690624811984e-05,
"loss": 3.4764,
"num_input_tokens_seen": 1442896,
"step": 1925
},
{
"epoch": 2.982998454404946,
"grad_norm": 0.466340571641922,
"learning_rate": 4.42547469554128e-05,
"loss": 3.0741,
"num_input_tokens_seen": 1446352,
"step": 1930
},
{
"epoch": 2.990726429675425,
"grad_norm": 0.2733190655708313,
"learning_rate": 4.421166317130563e-05,
"loss": 3.2974,
"num_input_tokens_seen": 1450256,
"step": 1935
},
{
"epoch": 2.998454404945904,
"grad_norm": 0.2440556436777115,
"learning_rate": 4.4168439586006506e-05,
"loss": 3.0494,
"num_input_tokens_seen": 1454096,
"step": 1940
},
{
"epoch": 3.0046367851622873,
"eval_loss": 3.3124587535858154,
"eval_runtime": 9.8622,
"eval_samples_per_second": 58.303,
"eval_steps_per_second": 7.301,
"num_input_tokens_seen": 1456656,
"step": 1944
},
{
"epoch": 3.006182380216383,
"grad_norm": 0.2453344613313675,
"learning_rate": 4.412507651404878e-05,
"loss": 3.2227,
"num_input_tokens_seen": 1457360,
"step": 1945
},
{
"epoch": 3.0139103554868623,
"grad_norm": 0.32557570934295654,
"learning_rate": 4.408157427098083e-05,
"loss": 3.336,
"num_input_tokens_seen": 1460752,
"step": 1950
},
{
"epoch": 3.021638330757342,
"grad_norm": 0.2702576518058777,
"learning_rate": 4.4037933173363756e-05,
"loss": 3.3529,
"num_input_tokens_seen": 1464208,
"step": 1955
},
{
"epoch": 3.029366306027821,
"grad_norm": 0.389478474855423,
"learning_rate": 4.3994153538769114e-05,
"loss": 3.1414,
"num_input_tokens_seen": 1467792,
"step": 1960
},
{
"epoch": 3.0370942812983,
"grad_norm": 0.2336999922990799,
"learning_rate": 4.395023568577655e-05,
"loss": 3.4423,
"num_input_tokens_seen": 1471504,
"step": 1965
},
{
"epoch": 3.044822256568779,
"grad_norm": 0.2960321605205536,
"learning_rate": 4.390617993397153e-05,
"loss": 3.3133,
"num_input_tokens_seen": 1475216,
"step": 1970
},
{
"epoch": 3.052550231839258,
"grad_norm": 0.39140525460243225,
"learning_rate": 4.3861986603942985e-05,
"loss": 3.4031,
"num_input_tokens_seen": 1478672,
"step": 1975
},
{
"epoch": 3.060278207109737,
"grad_norm": 0.40083011984825134,
"learning_rate": 4.3817656017280995e-05,
"loss": 3.3519,
"num_input_tokens_seen": 1482704,
"step": 1980
},
{
"epoch": 3.0680061823802163,
"grad_norm": 0.27878859639167786,
"learning_rate": 4.3773188496574424e-05,
"loss": 3.6379,
"num_input_tokens_seen": 1486160,
"step": 1985
},
{
"epoch": 3.0757341576506954,
"grad_norm": 0.24547715485095978,
"learning_rate": 4.372858436540863e-05,
"loss": 3.0755,
"num_input_tokens_seen": 1489360,
"step": 1990
},
{
"epoch": 3.0834621329211744,
"grad_norm": 0.3561779260635376,
"learning_rate": 4.368384394836301e-05,
"loss": 3.2519,
"num_input_tokens_seen": 1493136,
"step": 1995
},
{
"epoch": 3.091190108191654,
"grad_norm": 0.561246931552887,
"learning_rate": 4.363896757100876e-05,
"loss": 3.0584,
"num_input_tokens_seen": 1497552,
"step": 2000
},
{
"epoch": 3.098918083462133,
"grad_norm": 0.3138968348503113,
"learning_rate": 4.359395555990641e-05,
"loss": 3.6345,
"num_input_tokens_seen": 1501200,
"step": 2005
},
{
"epoch": 3.106646058732612,
"grad_norm": 0.2394760251045227,
"learning_rate": 4.3548808242603484e-05,
"loss": 3.3659,
"num_input_tokens_seen": 1505296,
"step": 2010
},
{
"epoch": 3.114374034003091,
"grad_norm": 0.25303345918655396,
"learning_rate": 4.3503525947632126e-05,
"loss": 3.5564,
"num_input_tokens_seen": 1509456,
"step": 2015
},
{
"epoch": 3.1221020092735703,
"grad_norm": 0.5716120004653931,
"learning_rate": 4.3458109004506684e-05,
"loss": 2.9278,
"num_input_tokens_seen": 1512976,
"step": 2020
},
{
"epoch": 3.1298299845440494,
"grad_norm": 0.3777564764022827,
"learning_rate": 4.3412557743721336e-05,
"loss": 3.3736,
"num_input_tokens_seen": 1516752,
"step": 2025
},
{
"epoch": 3.1375579598145285,
"grad_norm": 0.5030646920204163,
"learning_rate": 4.336687249674768e-05,
"loss": 3.0751,
"num_input_tokens_seen": 1520400,
"step": 2030
},
{
"epoch": 3.1452859350850075,
"grad_norm": 0.40548887848854065,
"learning_rate": 4.33210535960323e-05,
"loss": 3.464,
"num_input_tokens_seen": 1524048,
"step": 2035
},
{
"epoch": 3.153013910355487,
"grad_norm": 0.287009596824646,
"learning_rate": 4.3275101374994386e-05,
"loss": 2.9975,
"num_input_tokens_seen": 1527440,
"step": 2040
},
{
"epoch": 3.160741885625966,
"grad_norm": 0.2755314111709595,
"learning_rate": 4.322901616802326e-05,
"loss": 3.1306,
"num_input_tokens_seen": 1531088,
"step": 2045
},
{
"epoch": 3.1684698608964452,
"grad_norm": 0.2994464337825775,
"learning_rate": 4.3182798310475994e-05,
"loss": 3.3007,
"num_input_tokens_seen": 1535568,
"step": 2050
},
{
"epoch": 3.1761978361669243,
"grad_norm": 0.42739635705947876,
"learning_rate": 4.313644813867491e-05,
"loss": 3.4714,
"num_input_tokens_seen": 1539408,
"step": 2055
},
{
"epoch": 3.1839258114374034,
"grad_norm": 0.33637383580207825,
"learning_rate": 4.308996598990521e-05,
"loss": 3.2876,
"num_input_tokens_seen": 1543376,
"step": 2060
},
{
"epoch": 3.1916537867078825,
"grad_norm": 0.317121297121048,
"learning_rate": 4.3043352202412445e-05,
"loss": 3.3758,
"num_input_tokens_seen": 1547216,
"step": 2065
},
{
"epoch": 3.1993817619783615,
"grad_norm": 0.471192330121994,
"learning_rate": 4.29966071154001e-05,
"loss": 3.3303,
"num_input_tokens_seen": 1550800,
"step": 2070
},
{
"epoch": 3.2071097372488406,
"grad_norm": 0.37737250328063965,
"learning_rate": 4.294973106902711e-05,
"loss": 3.0919,
"num_input_tokens_seen": 1554384,
"step": 2075
},
{
"epoch": 3.21483771251932,
"grad_norm": 0.3162528872489929,
"learning_rate": 4.2902724404405395e-05,
"loss": 3.7993,
"num_input_tokens_seen": 1557968,
"step": 2080
},
{
"epoch": 3.2225656877897992,
"grad_norm": 0.3474006652832031,
"learning_rate": 4.285558746359735e-05,
"loss": 3.3829,
"num_input_tokens_seen": 1562128,
"step": 2085
},
{
"epoch": 3.2302936630602783,
"grad_norm": 0.29031333327293396,
"learning_rate": 4.280832058961338e-05,
"loss": 3.2496,
"num_input_tokens_seen": 1566096,
"step": 2090
},
{
"epoch": 3.2380216383307574,
"grad_norm": 0.2682136595249176,
"learning_rate": 4.2760924126409427e-05,
"loss": 3.2577,
"num_input_tokens_seen": 1570064,
"step": 2095
},
{
"epoch": 3.2457496136012365,
"grad_norm": 0.2770131230354309,
"learning_rate": 4.271339841888441e-05,
"loss": 2.9641,
"num_input_tokens_seen": 1574032,
"step": 2100
},
{
"epoch": 3.2534775888717156,
"grad_norm": 0.3436896502971649,
"learning_rate": 4.266574381287776e-05,
"loss": 3.1281,
"num_input_tokens_seen": 1577488,
"step": 2105
},
{
"epoch": 3.2612055641421946,
"grad_norm": 0.32517606019973755,
"learning_rate": 4.261796065516688e-05,
"loss": 3.448,
"num_input_tokens_seen": 1581200,
"step": 2110
},
{
"epoch": 3.2689335394126737,
"grad_norm": 0.3597550392150879,
"learning_rate": 4.257004929346462e-05,
"loss": 3.2547,
"num_input_tokens_seen": 1585040,
"step": 2115
},
{
"epoch": 3.276661514683153,
"grad_norm": 0.2619961202144623,
"learning_rate": 4.252201007641679e-05,
"loss": 3.3299,
"num_input_tokens_seen": 1588624,
"step": 2120
},
{
"epoch": 3.2843894899536323,
"grad_norm": 0.2485017478466034,
"learning_rate": 4.247384335359956e-05,
"loss": 3.4412,
"num_input_tokens_seen": 1592784,
"step": 2125
},
{
"epoch": 3.2921174652241114,
"grad_norm": 0.5881773233413696,
"learning_rate": 4.2425549475516954e-05,
"loss": 2.9302,
"num_input_tokens_seen": 1596432,
"step": 2130
},
{
"epoch": 3.2998454404945905,
"grad_norm": 0.32718685269355774,
"learning_rate": 4.2377128793598295e-05,
"loss": 3.3778,
"num_input_tokens_seen": 1600016,
"step": 2135
},
{
"epoch": 3.3075734157650696,
"grad_norm": 0.26880738139152527,
"learning_rate": 4.232858166019564e-05,
"loss": 3.155,
"num_input_tokens_seen": 1604048,
"step": 2140
},
{
"epoch": 3.3153013910355487,
"grad_norm": 0.40621763467788696,
"learning_rate": 4.227990842858122e-05,
"loss": 3.0137,
"num_input_tokens_seen": 1607952,
"step": 2145
},
{
"epoch": 3.3230293663060277,
"grad_norm": 0.27599066495895386,
"learning_rate": 4.223110945294486e-05,
"loss": 3.4187,
"num_input_tokens_seen": 1611600,
"step": 2150
},
{
"epoch": 3.330757341576507,
"grad_norm": 0.32147035002708435,
"learning_rate": 4.2182185088391435e-05,
"loss": 3.236,
"num_input_tokens_seen": 1615184,
"step": 2155
},
{
"epoch": 3.338485316846986,
"grad_norm": 0.2467658519744873,
"learning_rate": 4.213313569093824e-05,
"loss": 3.2334,
"num_input_tokens_seen": 1619152,
"step": 2160
},
{
"epoch": 3.346213292117465,
"grad_norm": 0.31439894437789917,
"learning_rate": 4.208396161751243e-05,
"loss": 3.0429,
"num_input_tokens_seen": 1622736,
"step": 2165
},
{
"epoch": 3.3539412673879445,
"grad_norm": 0.27845317125320435,
"learning_rate": 4.20346632259484e-05,
"loss": 3.3637,
"num_input_tokens_seen": 1626256,
"step": 2170
},
{
"epoch": 3.3616692426584236,
"grad_norm": 0.34065747261047363,
"learning_rate": 4.198524087498522e-05,
"loss": 3.3511,
"num_input_tokens_seen": 1629584,
"step": 2175
},
{
"epoch": 3.3693972179289027,
"grad_norm": 0.3487941324710846,
"learning_rate": 4.193569492426398e-05,
"loss": 3.021,
"num_input_tokens_seen": 1633552,
"step": 2180
},
{
"epoch": 3.3771251931993818,
"grad_norm": 0.33543628454208374,
"learning_rate": 4.188602573432519e-05,
"loss": 3.215,
"num_input_tokens_seen": 1637584,
"step": 2185
},
{
"epoch": 3.384853168469861,
"grad_norm": 0.5411263704299927,
"learning_rate": 4.1836233666606176e-05,
"loss": 3.2398,
"num_input_tokens_seen": 1641744,
"step": 2190
},
{
"epoch": 3.39258114374034,
"grad_norm": 0.26962918043136597,
"learning_rate": 4.1786319083438406e-05,
"loss": 3.1149,
"num_input_tokens_seen": 1645520,
"step": 2195
},
{
"epoch": 3.400309119010819,
"grad_norm": 0.24972794950008392,
"learning_rate": 4.1736282348044916e-05,
"loss": 3.1159,
"num_input_tokens_seen": 1648912,
"step": 2200
},
{
"epoch": 3.4080370942812985,
"grad_norm": 0.35576534271240234,
"learning_rate": 4.168612382453759e-05,
"loss": 3.1835,
"num_input_tokens_seen": 1653072,
"step": 2205
},
{
"epoch": 3.4157650695517776,
"grad_norm": 0.3997161090373993,
"learning_rate": 4.163584387791458e-05,
"loss": 3.3053,
"num_input_tokens_seen": 1656912,
"step": 2210
},
{
"epoch": 3.4234930448222567,
"grad_norm": 0.47984302043914795,
"learning_rate": 4.158544287405762e-05,
"loss": 3.1529,
"num_input_tokens_seen": 1660560,
"step": 2215
},
{
"epoch": 3.4312210200927358,
"grad_norm": 0.24802884459495544,
"learning_rate": 4.153492117972934e-05,
"loss": 3.3344,
"num_input_tokens_seen": 1664528,
"step": 2220
},
{
"epoch": 3.438948995363215,
"grad_norm": 0.29277801513671875,
"learning_rate": 4.148427916257064e-05,
"loss": 3.0215,
"num_input_tokens_seen": 1668688,
"step": 2225
},
{
"epoch": 3.446676970633694,
"grad_norm": 0.3837185502052307,
"learning_rate": 4.1433517191098e-05,
"loss": 3.3542,
"num_input_tokens_seen": 1672528,
"step": 2230
},
{
"epoch": 3.454404945904173,
"grad_norm": 0.2690471112728119,
"learning_rate": 4.138263563470078e-05,
"loss": 3.1406,
"num_input_tokens_seen": 1675984,
"step": 2235
},
{
"epoch": 3.462132921174652,
"grad_norm": 0.28649571537971497,
"learning_rate": 4.133163486363857e-05,
"loss": 2.9112,
"num_input_tokens_seen": 1680080,
"step": 2240
},
{
"epoch": 3.469860896445131,
"grad_norm": 0.2906050980091095,
"learning_rate": 4.128051524903844e-05,
"loss": 3.0799,
"num_input_tokens_seen": 1683856,
"step": 2245
},
{
"epoch": 3.4775888717156107,
"grad_norm": 0.2971077561378479,
"learning_rate": 4.12292771628923e-05,
"loss": 3.4529,
"num_input_tokens_seen": 1688016,
"step": 2250
},
{
"epoch": 3.48531684698609,
"grad_norm": 0.279525488615036,
"learning_rate": 4.1177920978054144e-05,
"loss": 3.2334,
"num_input_tokens_seen": 1691664,
"step": 2255
},
{
"epoch": 3.493044822256569,
"grad_norm": 0.35451480746269226,
"learning_rate": 4.1126447068237376e-05,
"loss": 3.2177,
"num_input_tokens_seen": 1695312,
"step": 2260
},
{
"epoch": 3.500772797527048,
"grad_norm": 0.42419734597206116,
"learning_rate": 4.107485580801205e-05,
"loss": 3.1003,
"num_input_tokens_seen": 1699344,
"step": 2265
},
{
"epoch": 3.5054095826893352,
"eval_loss": 3.1060409545898438,
"eval_runtime": 9.8552,
"eval_samples_per_second": 58.345,
"eval_steps_per_second": 7.306,
"num_input_tokens_seen": 1701712,
"step": 2268
},
{
"epoch": 3.508500772797527,
"grad_norm": 0.3014146685600281,
"learning_rate": 4.102314757280219e-05,
"loss": 3.3953,
"num_input_tokens_seen": 1703312,
"step": 2270
},
{
"epoch": 3.516228748068006,
"grad_norm": 0.3373369574546814,
"learning_rate": 4.0971322738883014e-05,
"loss": 3.0952,
"num_input_tokens_seen": 1707088,
"step": 2275
},
{
"epoch": 3.523956723338485,
"grad_norm": 0.29237452149391174,
"learning_rate": 4.091938168337822e-05,
"loss": 3.1188,
"num_input_tokens_seen": 1710544,
"step": 2280
},
{
"epoch": 3.5316846986089647,
"grad_norm": 0.44121846556663513,
"learning_rate": 4.086732478425726e-05,
"loss": 2.8938,
"num_input_tokens_seen": 1713936,
"step": 2285
},
{
"epoch": 3.5394126738794434,
"grad_norm": 0.3628321886062622,
"learning_rate": 4.081515242033254e-05,
"loss": 3.0559,
"num_input_tokens_seen": 1717776,
"step": 2290
},
{
"epoch": 3.547140649149923,
"grad_norm": 0.3319717049598694,
"learning_rate": 4.076286497125671e-05,
"loss": 3.1759,
"num_input_tokens_seen": 1721616,
"step": 2295
},
{
"epoch": 3.554868624420402,
"grad_norm": 0.328025758266449,
"learning_rate": 4.071046281751986e-05,
"loss": 3.0737,
"num_input_tokens_seen": 1725520,
"step": 2300
},
{
"epoch": 3.562596599690881,
"grad_norm": 0.3481411635875702,
"learning_rate": 4.065794634044679e-05,
"loss": 2.8744,
"num_input_tokens_seen": 1729232,
"step": 2305
},
{
"epoch": 3.57032457496136,
"grad_norm": 0.2612632215023041,
"learning_rate": 4.060531592219422e-05,
"loss": 3.0024,
"num_input_tokens_seen": 1732752,
"step": 2310
},
{
"epoch": 3.578052550231839,
"grad_norm": 0.2584324777126312,
"learning_rate": 4.0552571945748e-05,
"loss": 2.8444,
"num_input_tokens_seen": 1736528,
"step": 2315
},
{
"epoch": 3.5857805255023183,
"grad_norm": 0.3369653820991516,
"learning_rate": 4.049971479492034e-05,
"loss": 3.0028,
"num_input_tokens_seen": 1740048,
"step": 2320
},
{
"epoch": 3.5935085007727974,
"grad_norm": 0.35002318024635315,
"learning_rate": 4.044674485434699e-05,
"loss": 3.2903,
"num_input_tokens_seen": 1743824,
"step": 2325
},
{
"epoch": 3.601236476043277,
"grad_norm": 0.29385900497436523,
"learning_rate": 4.039366250948448e-05,
"loss": 3.095,
"num_input_tokens_seen": 1747920,
"step": 2330
},
{
"epoch": 3.6089644513137555,
"grad_norm": 0.5506426095962524,
"learning_rate": 4.034046814660728e-05,
"loss": 3.2893,
"num_input_tokens_seen": 1751632,
"step": 2335
},
{
"epoch": 3.616692426584235,
"grad_norm": 0.24844296276569366,
"learning_rate": 4.0287162152805e-05,
"loss": 3.3977,
"num_input_tokens_seen": 1755664,
"step": 2340
},
{
"epoch": 3.624420401854714,
"grad_norm": 0.38642531633377075,
"learning_rate": 4.0233744915979594e-05,
"loss": 3.0687,
"num_input_tokens_seen": 1758800,
"step": 2345
},
{
"epoch": 3.6321483771251932,
"grad_norm": 0.49976831674575806,
"learning_rate": 4.01802168248425e-05,
"loss": 3.1762,
"num_input_tokens_seen": 1762832,
"step": 2350
},
{
"epoch": 3.6398763523956723,
"grad_norm": 0.3384222984313965,
"learning_rate": 4.012657826891185e-05,
"loss": 2.937,
"num_input_tokens_seen": 1766672,
"step": 2355
},
{
"epoch": 3.6476043276661514,
"grad_norm": 0.36615273356437683,
"learning_rate": 4.00728296385096e-05,
"loss": 3.0035,
"num_input_tokens_seen": 1770256,
"step": 2360
},
{
"epoch": 3.6553323029366305,
"grad_norm": 0.48593974113464355,
"learning_rate": 4.0018971324758705e-05,
"loss": 2.7236,
"num_input_tokens_seen": 1774224,
"step": 2365
},
{
"epoch": 3.6630602782071096,
"grad_norm": 0.3047850728034973,
"learning_rate": 3.996500371958028e-05,
"loss": 3.2167,
"num_input_tokens_seen": 1777616,
"step": 2370
},
{
"epoch": 3.670788253477589,
"grad_norm": 0.43760672211647034,
"learning_rate": 3.991092721569075e-05,
"loss": 2.9141,
"num_input_tokens_seen": 1781648,
"step": 2375
},
{
"epoch": 3.678516228748068,
"grad_norm": 0.289705753326416,
"learning_rate": 3.985674220659898e-05,
"loss": 3.1258,
"num_input_tokens_seen": 1785488,
"step": 2380
},
{
"epoch": 3.6862442040185472,
"grad_norm": 0.2379242330789566,
"learning_rate": 3.980244908660341e-05,
"loss": 2.8883,
"num_input_tokens_seen": 1789456,
"step": 2385
},
{
"epoch": 3.6939721792890263,
"grad_norm": 0.34431400895118713,
"learning_rate": 3.974804825078918e-05,
"loss": 3.2766,
"num_input_tokens_seen": 1793040,
"step": 2390
},
{
"epoch": 3.7017001545595054,
"grad_norm": 0.8246620297431946,
"learning_rate": 3.96935400950253e-05,
"loss": 2.8954,
"num_input_tokens_seen": 1796560,
"step": 2395
},
{
"epoch": 3.7094281298299845,
"grad_norm": 0.32684797048568726,
"learning_rate": 3.963892501596169e-05,
"loss": 2.9391,
"num_input_tokens_seen": 1800400,
"step": 2400
},
{
"epoch": 3.7171561051004636,
"grad_norm": 0.4404638409614563,
"learning_rate": 3.958420341102639e-05,
"loss": 3.003,
"num_input_tokens_seen": 1804048,
"step": 2405
},
{
"epoch": 3.7248840803709427,
"grad_norm": 0.5552765130996704,
"learning_rate": 3.9529375678422575e-05,
"loss": 3.0192,
"num_input_tokens_seen": 1807568,
"step": 2410
},
{
"epoch": 3.7326120556414217,
"grad_norm": 0.39639389514923096,
"learning_rate": 3.9474442217125726e-05,
"loss": 2.9807,
"num_input_tokens_seen": 1811280,
"step": 2415
},
{
"epoch": 3.7403400309119013,
"grad_norm": 0.280390202999115,
"learning_rate": 3.9419403426880684e-05,
"loss": 3.2511,
"num_input_tokens_seen": 1814992,
"step": 2420
},
{
"epoch": 3.7480680061823803,
"grad_norm": 0.4016093313694,
"learning_rate": 3.936425970819877e-05,
"loss": 2.9766,
"num_input_tokens_seen": 1818832,
"step": 2425
},
{
"epoch": 3.7557959814528594,
"grad_norm": 0.520076334476471,
"learning_rate": 3.930901146235485e-05,
"loss": 2.9149,
"num_input_tokens_seen": 1822288,
"step": 2430
},
{
"epoch": 3.7635239567233385,
"grad_norm": 0.3622855246067047,
"learning_rate": 3.925365909138443e-05,
"loss": 2.9592,
"num_input_tokens_seen": 1826192,
"step": 2435
},
{
"epoch": 3.7712519319938176,
"grad_norm": 0.3980333209037781,
"learning_rate": 3.91982029980807e-05,
"loss": 2.8414,
"num_input_tokens_seen": 1829968,
"step": 2440
},
{
"epoch": 3.7789799072642967,
"grad_norm": 0.3101566433906555,
"learning_rate": 3.9142643585991655e-05,
"loss": 2.7509,
"num_input_tokens_seen": 1834000,
"step": 2445
},
{
"epoch": 3.7867078825347757,
"grad_norm": 0.6647924780845642,
"learning_rate": 3.908698125941713e-05,
"loss": 2.9496,
"num_input_tokens_seen": 1838160,
"step": 2450
},
{
"epoch": 3.7944358578052553,
"grad_norm": 0.2631145417690277,
"learning_rate": 3.903121642340583e-05,
"loss": 2.9745,
"num_input_tokens_seen": 1841872,
"step": 2455
},
{
"epoch": 3.802163833075734,
"grad_norm": 0.352568119764328,
"learning_rate": 3.8975349483752436e-05,
"loss": 2.9618,
"num_input_tokens_seen": 1845776,
"step": 2460
},
{
"epoch": 3.8098918083462134,
"grad_norm": 0.39804670214653015,
"learning_rate": 3.8919380846994605e-05,
"loss": 3.1174,
"num_input_tokens_seen": 1849552,
"step": 2465
},
{
"epoch": 3.8176197836166925,
"grad_norm": 0.4087231755256653,
"learning_rate": 3.8863310920410055e-05,
"loss": 2.437,
"num_input_tokens_seen": 1853008,
"step": 2470
},
{
"epoch": 3.8253477588871716,
"grad_norm": 0.506405234336853,
"learning_rate": 3.8807140112013574e-05,
"loss": 3.0171,
"num_input_tokens_seen": 1856272,
"step": 2475
},
{
"epoch": 3.8330757341576507,
"grad_norm": 0.41620808839797974,
"learning_rate": 3.875086883055403e-05,
"loss": 3.0946,
"num_input_tokens_seen": 1859920,
"step": 2480
},
{
"epoch": 3.8408037094281298,
"grad_norm": 0.3406372368335724,
"learning_rate": 3.869449748551146e-05,
"loss": 3.0194,
"num_input_tokens_seen": 1863632,
"step": 2485
},
{
"epoch": 3.848531684698609,
"grad_norm": 0.4365371763706207,
"learning_rate": 3.863802648709404e-05,
"loss": 2.8842,
"num_input_tokens_seen": 1867088,
"step": 2490
},
{
"epoch": 3.856259659969088,
"grad_norm": 0.5339605808258057,
"learning_rate": 3.858145624623509e-05,
"loss": 2.9492,
"num_input_tokens_seen": 1870672,
"step": 2495
},
{
"epoch": 3.8639876352395675,
"grad_norm": 0.2724621295928955,
"learning_rate": 3.852478717459014e-05,
"loss": 2.9462,
"num_input_tokens_seen": 1874384,
"step": 2500
},
{
"epoch": 3.871715610510046,
"grad_norm": 0.36727994680404663,
"learning_rate": 3.8468019684533875e-05,
"loss": 2.8771,
"num_input_tokens_seen": 1878096,
"step": 2505
},
{
"epoch": 3.8794435857805256,
"grad_norm": 0.2990911900997162,
"learning_rate": 3.8411154189157185e-05,
"loss": 3.0987,
"num_input_tokens_seen": 1881936,
"step": 2510
},
{
"epoch": 3.8871715610510047,
"grad_norm": 0.32814911007881165,
"learning_rate": 3.8354191102264105e-05,
"loss": 3.1755,
"num_input_tokens_seen": 1885520,
"step": 2515
},
{
"epoch": 3.894899536321484,
"grad_norm": 0.47634440660476685,
"learning_rate": 3.829713083836886e-05,
"loss": 3.073,
"num_input_tokens_seen": 1889488,
"step": 2520
},
{
"epoch": 3.902627511591963,
"grad_norm": 0.39127570390701294,
"learning_rate": 3.82399738126928e-05,
"loss": 3.1232,
"num_input_tokens_seen": 1893072,
"step": 2525
},
{
"epoch": 3.910355486862442,
"grad_norm": 0.366414338350296,
"learning_rate": 3.818272044116142e-05,
"loss": 3.0161,
"num_input_tokens_seen": 1896912,
"step": 2530
},
{
"epoch": 3.918083462132921,
"grad_norm": 0.5952576994895935,
"learning_rate": 3.812537114040131e-05,
"loss": 2.7545,
"num_input_tokens_seen": 1900432,
"step": 2535
},
{
"epoch": 3.9258114374034,
"grad_norm": 0.3024425208568573,
"learning_rate": 3.806792632773709e-05,
"loss": 2.9895,
"num_input_tokens_seen": 1904656,
"step": 2540
},
{
"epoch": 3.9335394126738796,
"grad_norm": 0.3352251648902893,
"learning_rate": 3.801038642118847e-05,
"loss": 2.9207,
"num_input_tokens_seen": 1908176,
"step": 2545
},
{
"epoch": 3.9412673879443587,
"grad_norm": 0.37967225909233093,
"learning_rate": 3.7952751839467106e-05,
"loss": 2.8346,
"num_input_tokens_seen": 1911568,
"step": 2550
},
{
"epoch": 3.948995363214838,
"grad_norm": 0.3634495735168457,
"learning_rate": 3.78950230019736e-05,
"loss": 2.6807,
"num_input_tokens_seen": 1915408,
"step": 2555
},
{
"epoch": 3.956723338485317,
"grad_norm": 0.5400171279907227,
"learning_rate": 3.783720032879445e-05,
"loss": 2.5507,
"num_input_tokens_seen": 1919376,
"step": 2560
},
{
"epoch": 3.964451313755796,
"grad_norm": 0.6465855836868286,
"learning_rate": 3.7779284240699003e-05,
"loss": 2.9413,
"num_input_tokens_seen": 1922960,
"step": 2565
},
{
"epoch": 3.972179289026275,
"grad_norm": 0.34273695945739746,
"learning_rate": 3.772127515913634e-05,
"loss": 2.7105,
"num_input_tokens_seen": 1926544,
"step": 2570
},
{
"epoch": 3.979907264296754,
"grad_norm": 0.37591809034347534,
"learning_rate": 3.766317350623227e-05,
"loss": 2.7055,
"num_input_tokens_seen": 1929744,
"step": 2575
},
{
"epoch": 3.9876352395672336,
"grad_norm": 0.39454731345176697,
"learning_rate": 3.760497970478624e-05,
"loss": 2.7457,
"num_input_tokens_seen": 1933520,
"step": 2580
},
{
"epoch": 3.9953632148377123,
"grad_norm": 0.453294038772583,
"learning_rate": 3.7546694178268215e-05,
"loss": 3.2118,
"num_input_tokens_seen": 1937360,
"step": 2585
},
{
"epoch": 4.003091190108192,
"grad_norm": 0.38035985827445984,
"learning_rate": 3.7488317350815674e-05,
"loss": 2.7815,
"num_input_tokens_seen": 1941552,
"step": 2590
},
{
"epoch": 4.006182380216384,
"eval_loss": 2.9035558700561523,
"eval_runtime": 9.8504,
"eval_samples_per_second": 58.373,
"eval_steps_per_second": 7.309,
"num_input_tokens_seen": 1942960,
"step": 2592
},
{
"epoch": 4.0108191653786704,
"grad_norm": 0.3568049371242523,
"learning_rate": 3.742984964723047e-05,
"loss": 3.0167,
"num_input_tokens_seen": 1945200,
"step": 2595
},
{
"epoch": 4.01854714064915,
"grad_norm": 0.3261895477771759,
"learning_rate": 3.737129149297574e-05,
"loss": 2.8763,
"num_input_tokens_seen": 1949168,
"step": 2600
},
{
"epoch": 4.0262751159196295,
"grad_norm": 0.6526939868927002,
"learning_rate": 3.731264331417284e-05,
"loss": 2.7011,
"num_input_tokens_seen": 1952624,
"step": 2605
},
{
"epoch": 4.034003091190108,
"grad_norm": 0.3340992331504822,
"learning_rate": 3.72539055375982e-05,
"loss": 2.7641,
"num_input_tokens_seen": 1956912,
"step": 2610
},
{
"epoch": 4.041731066460588,
"grad_norm": 0.2900647521018982,
"learning_rate": 3.7195078590680275e-05,
"loss": 2.9773,
"num_input_tokens_seen": 1960688,
"step": 2615
},
{
"epoch": 4.049459041731066,
"grad_norm": 0.45488232374191284,
"learning_rate": 3.713616290149636e-05,
"loss": 2.8954,
"num_input_tokens_seen": 1964272,
"step": 2620
},
{
"epoch": 4.057187017001546,
"grad_norm": 0.3596618175506592,
"learning_rate": 3.7077158898769574e-05,
"loss": 2.9347,
"num_input_tokens_seen": 1967792,
"step": 2625
},
{
"epoch": 4.0649149922720245,
"grad_norm": 0.3053226172924042,
"learning_rate": 3.701806701186563e-05,
"loss": 2.6642,
"num_input_tokens_seen": 1971312,
"step": 2630
},
{
"epoch": 4.072642967542504,
"grad_norm": 0.32128965854644775,
"learning_rate": 3.695888767078981e-05,
"loss": 3.0455,
"num_input_tokens_seen": 1975152,
"step": 2635
},
{
"epoch": 4.080370942812983,
"grad_norm": 0.3554864525794983,
"learning_rate": 3.6899621306183754e-05,
"loss": 3.1219,
"num_input_tokens_seen": 1978928,
"step": 2640
},
{
"epoch": 4.088098918083462,
"grad_norm": 0.2976425588130951,
"learning_rate": 3.684026834932238e-05,
"loss": 2.6695,
"num_input_tokens_seen": 1982256,
"step": 2645
},
{
"epoch": 4.095826893353942,
"grad_norm": 0.42317289113998413,
"learning_rate": 3.678082923211072e-05,
"loss": 2.8639,
"num_input_tokens_seen": 1986224,
"step": 2650
},
{
"epoch": 4.10355486862442,
"grad_norm": 0.440965861082077,
"learning_rate": 3.6721304387080804e-05,
"loss": 2.8947,
"num_input_tokens_seen": 1990064,
"step": 2655
},
{
"epoch": 4.1112828438949,
"grad_norm": 0.7069739699363708,
"learning_rate": 3.666169424738848e-05,
"loss": 2.8494,
"num_input_tokens_seen": 1993968,
"step": 2660
},
{
"epoch": 4.1190108191653785,
"grad_norm": 0.4947892725467682,
"learning_rate": 3.660199924681027e-05,
"loss": 3.0113,
"num_input_tokens_seen": 1997808,
"step": 2665
},
{
"epoch": 4.126738794435858,
"grad_norm": 0.4745485186576843,
"learning_rate": 3.6542219819740234e-05,
"loss": 3.0485,
"num_input_tokens_seen": 2001520,
"step": 2670
},
{
"epoch": 4.134466769706337,
"grad_norm": 0.4884566068649292,
"learning_rate": 3.648235640118678e-05,
"loss": 2.9497,
"num_input_tokens_seen": 2005232,
"step": 2675
},
{
"epoch": 4.142194744976816,
"grad_norm": 0.38576316833496094,
"learning_rate": 3.642240942676953e-05,
"loss": 2.7742,
"num_input_tokens_seen": 2009328,
"step": 2680
},
{
"epoch": 4.149922720247295,
"grad_norm": 0.39867785573005676,
"learning_rate": 3.6362379332716126e-05,
"loss": 2.7951,
"num_input_tokens_seen": 2012912,
"step": 2685
},
{
"epoch": 4.157650695517774,
"grad_norm": 0.36217308044433594,
"learning_rate": 3.630226655585904e-05,
"loss": 3.0995,
"num_input_tokens_seen": 2016944,
"step": 2690
},
{
"epoch": 4.165378670788254,
"grad_norm": 0.4282553791999817,
"learning_rate": 3.624207153363246e-05,
"loss": 2.8722,
"num_input_tokens_seen": 2020912,
"step": 2695
},
{
"epoch": 4.1731066460587325,
"grad_norm": 0.44885340332984924,
"learning_rate": 3.6181794704069036e-05,
"loss": 2.7546,
"num_input_tokens_seen": 2025136,
"step": 2700
},
{
"epoch": 4.180834621329212,
"grad_norm": 0.4299675226211548,
"learning_rate": 3.612143650579673e-05,
"loss": 2.968,
"num_input_tokens_seen": 2029424,
"step": 2705
},
{
"epoch": 4.188562596599691,
"grad_norm": 0.31228068470954895,
"learning_rate": 3.606099737803559e-05,
"loss": 2.9026,
"num_input_tokens_seen": 2033328,
"step": 2710
},
{
"epoch": 4.19629057187017,
"grad_norm": 0.630832850933075,
"learning_rate": 3.600047776059464e-05,
"loss": 3.1547,
"num_input_tokens_seen": 2037040,
"step": 2715
},
{
"epoch": 4.204018547140649,
"grad_norm": 0.4609726369380951,
"learning_rate": 3.593987809386855e-05,
"loss": 2.8728,
"num_input_tokens_seen": 2041008,
"step": 2720
},
{
"epoch": 4.211746522411128,
"grad_norm": 0.4508463442325592,
"learning_rate": 3.5879198818834544e-05,
"loss": 3.0045,
"num_input_tokens_seen": 2044912,
"step": 2725
},
{
"epoch": 4.219474497681608,
"grad_norm": 0.3503614366054535,
"learning_rate": 3.581844037704914e-05,
"loss": 3.0032,
"num_input_tokens_seen": 2048048,
"step": 2730
},
{
"epoch": 4.2272024729520865,
"grad_norm": 0.5175710916519165,
"learning_rate": 3.575760321064492e-05,
"loss": 3.0595,
"num_input_tokens_seen": 2051952,
"step": 2735
},
{
"epoch": 4.234930448222566,
"grad_norm": 0.5993461012840271,
"learning_rate": 3.569668776232737e-05,
"loss": 2.912,
"num_input_tokens_seen": 2055344,
"step": 2740
},
{
"epoch": 4.242658423493045,
"grad_norm": 0.5014939904212952,
"learning_rate": 3.563569447537161e-05,
"loss": 2.5098,
"num_input_tokens_seen": 2058864,
"step": 2745
},
{
"epoch": 4.250386398763524,
"grad_norm": 0.35828927159309387,
"learning_rate": 3.5574623793619164e-05,
"loss": 2.7504,
"num_input_tokens_seen": 2062512,
"step": 2750
},
{
"epoch": 4.258114374034003,
"grad_norm": 0.3931995928287506,
"learning_rate": 3.551347616147479e-05,
"loss": 2.7396,
"num_input_tokens_seen": 2065904,
"step": 2755
},
{
"epoch": 4.265842349304482,
"grad_norm": 0.5853766798973083,
"learning_rate": 3.5452252023903176e-05,
"loss": 2.6511,
"num_input_tokens_seen": 2069744,
"step": 2760
},
{
"epoch": 4.273570324574961,
"grad_norm": 0.47158560156822205,
"learning_rate": 3.539095182642573e-05,
"loss": 2.2498,
"num_input_tokens_seen": 2073584,
"step": 2765
},
{
"epoch": 4.2812982998454405,
"grad_norm": 0.6278407573699951,
"learning_rate": 3.532957601511736e-05,
"loss": 3.012,
"num_input_tokens_seen": 2077296,
"step": 2770
},
{
"epoch": 4.289026275115919,
"grad_norm": 0.4914918541908264,
"learning_rate": 3.52681250366032e-05,
"loss": 3.0169,
"num_input_tokens_seen": 2081264,
"step": 2775
},
{
"epoch": 4.296754250386399,
"grad_norm": 0.3581917881965637,
"learning_rate": 3.520659933805535e-05,
"loss": 2.8308,
"num_input_tokens_seen": 2085296,
"step": 2780
},
{
"epoch": 4.304482225656878,
"grad_norm": 0.4669550359249115,
"learning_rate": 3.514499936718966e-05,
"loss": 2.9323,
"num_input_tokens_seen": 2089456,
"step": 2785
},
{
"epoch": 4.312210200927357,
"grad_norm": 0.619138777256012,
"learning_rate": 3.508332557226246e-05,
"loss": 2.3913,
"num_input_tokens_seen": 2093104,
"step": 2790
},
{
"epoch": 4.319938176197836,
"grad_norm": 0.44549477100372314,
"learning_rate": 3.502157840206725e-05,
"loss": 2.9869,
"num_input_tokens_seen": 2096496,
"step": 2795
},
{
"epoch": 4.327666151468315,
"grad_norm": 0.5536242127418518,
"learning_rate": 3.4959758305931525e-05,
"loss": 2.8984,
"num_input_tokens_seen": 2100272,
"step": 2800
},
{
"epoch": 4.3353941267387945,
"grad_norm": 0.34818941354751587,
"learning_rate": 3.489786573371341e-05,
"loss": 2.9021,
"num_input_tokens_seen": 2104048,
"step": 2805
},
{
"epoch": 4.343122102009273,
"grad_norm": 0.4407016336917877,
"learning_rate": 3.4835901135798456e-05,
"loss": 2.8693,
"num_input_tokens_seen": 2107760,
"step": 2810
},
{
"epoch": 4.350850077279753,
"grad_norm": 0.38318338990211487,
"learning_rate": 3.4773864963096326e-05,
"loss": 2.4316,
"num_input_tokens_seen": 2111216,
"step": 2815
},
{
"epoch": 4.358578052550232,
"grad_norm": 0.39572352170944214,
"learning_rate": 3.4711757667037536e-05,
"loss": 2.7168,
"num_input_tokens_seen": 2115312,
"step": 2820
},
{
"epoch": 4.366306027820711,
"grad_norm": 0.5528718829154968,
"learning_rate": 3.464957969957015e-05,
"loss": 2.8171,
"num_input_tokens_seen": 2118640,
"step": 2825
},
{
"epoch": 4.37403400309119,
"grad_norm": 0.44296079874038696,
"learning_rate": 3.45873315131565e-05,
"loss": 2.8152,
"num_input_tokens_seen": 2122416,
"step": 2830
},
{
"epoch": 4.381761978361669,
"grad_norm": 0.3478895425796509,
"learning_rate": 3.45250135607699e-05,
"loss": 2.5164,
"num_input_tokens_seen": 2125936,
"step": 2835
},
{
"epoch": 4.3894899536321486,
"grad_norm": 0.4040631055831909,
"learning_rate": 3.4462626295891325e-05,
"loss": 2.7903,
"num_input_tokens_seen": 2129520,
"step": 2840
},
{
"epoch": 4.397217928902627,
"grad_norm": 0.33225977420806885,
"learning_rate": 3.440017017250616e-05,
"loss": 2.8935,
"num_input_tokens_seen": 2134256,
"step": 2845
},
{
"epoch": 4.404945904173107,
"grad_norm": 0.4490237236022949,
"learning_rate": 3.433764564510085e-05,
"loss": 2.819,
"num_input_tokens_seen": 2138096,
"step": 2850
},
{
"epoch": 4.412673879443586,
"grad_norm": 0.40184807777404785,
"learning_rate": 3.427505316865961e-05,
"loss": 2.8849,
"num_input_tokens_seen": 2142128,
"step": 2855
},
{
"epoch": 4.420401854714065,
"grad_norm": 0.38390636444091797,
"learning_rate": 3.4212393198661094e-05,
"loss": 2.6071,
"num_input_tokens_seen": 2145904,
"step": 2860
},
{
"epoch": 4.428129829984544,
"grad_norm": 0.42832306027412415,
"learning_rate": 3.414966619107514e-05,
"loss": 2.3911,
"num_input_tokens_seen": 2149680,
"step": 2865
},
{
"epoch": 4.435857805255023,
"grad_norm": 0.6270868182182312,
"learning_rate": 3.408687260235935e-05,
"loss": 2.644,
"num_input_tokens_seen": 2153392,
"step": 2870
},
{
"epoch": 4.443585780525503,
"grad_norm": 0.41972818970680237,
"learning_rate": 3.402401288945591e-05,
"loss": 2.5729,
"num_input_tokens_seen": 2157360,
"step": 2875
},
{
"epoch": 4.451313755795981,
"grad_norm": 0.5714272260665894,
"learning_rate": 3.396108750978813e-05,
"loss": 2.942,
"num_input_tokens_seen": 2161584,
"step": 2880
},
{
"epoch": 4.459041731066461,
"grad_norm": 0.36096087098121643,
"learning_rate": 3.389809692125717e-05,
"loss": 2.6037,
"num_input_tokens_seen": 2165808,
"step": 2885
},
{
"epoch": 4.466769706336939,
"grad_norm": 0.49124354124069214,
"learning_rate": 3.3835041582238734e-05,
"loss": 2.7337,
"num_input_tokens_seen": 2169840,
"step": 2890
},
{
"epoch": 4.474497681607419,
"grad_norm": 0.4930627942085266,
"learning_rate": 3.377192195157968e-05,
"loss": 2.5801,
"num_input_tokens_seen": 2173488,
"step": 2895
},
{
"epoch": 4.4822256568778975,
"grad_norm": 0.6838205456733704,
"learning_rate": 3.370873848859473e-05,
"loss": 2.5378,
"num_input_tokens_seen": 2177008,
"step": 2900
},
{
"epoch": 4.489953632148377,
"grad_norm": 0.48879119753837585,
"learning_rate": 3.36454916530631e-05,
"loss": 2.8039,
"num_input_tokens_seen": 2180976,
"step": 2905
},
{
"epoch": 4.497681607418857,
"grad_norm": 0.5275672674179077,
"learning_rate": 3.358218190522516e-05,
"loss": 2.6789,
"num_input_tokens_seen": 2184432,
"step": 2910
},
{
"epoch": 4.505409582689335,
"grad_norm": 0.5114855170249939,
"learning_rate": 3.35188097057791e-05,
"loss": 2.7445,
"num_input_tokens_seen": 2188656,
"step": 2915
},
{
"epoch": 4.506955177743431,
"eval_loss": 2.695441246032715,
"eval_runtime": 9.8594,
"eval_samples_per_second": 58.32,
"eval_steps_per_second": 7.303,
"num_input_tokens_seen": 2189232,
"step": 2916
},
{
"epoch": 4.513137557959815,
"grad_norm": 0.38441339135169983,
"learning_rate": 3.345537551587753e-05,
"loss": 2.413,
"num_input_tokens_seen": 2191984,
"step": 2920
},
{
"epoch": 4.520865533230293,
"grad_norm": 0.3344990909099579,
"learning_rate": 3.33918797971242e-05,
"loss": 2.7369,
"num_input_tokens_seen": 2195632,
"step": 2925
},
{
"epoch": 4.528593508500773,
"grad_norm": 0.35520127415657043,
"learning_rate": 3.332832301157056e-05,
"loss": 2.7584,
"num_input_tokens_seen": 2199344,
"step": 2930
},
{
"epoch": 4.5363214837712516,
"grad_norm": 0.6611173152923584,
"learning_rate": 3.326470562171246e-05,
"loss": 2.681,
"num_input_tokens_seen": 2203248,
"step": 2935
},
{
"epoch": 4.544049459041731,
"grad_norm": 0.7467407584190369,
"learning_rate": 3.320102809048676e-05,
"loss": 2.5338,
"num_input_tokens_seen": 2207024,
"step": 2940
},
{
"epoch": 4.551777434312211,
"grad_norm": 0.3933999240398407,
"learning_rate": 3.313729088126796e-05,
"loss": 2.5062,
"num_input_tokens_seen": 2211056,
"step": 2945
},
{
"epoch": 4.559505409582689,
"grad_norm": 0.5209619998931885,
"learning_rate": 3.307349445786481e-05,
"loss": 2.5035,
"num_input_tokens_seen": 2214512,
"step": 2950
},
{
"epoch": 4.567233384853169,
"grad_norm": 0.4052325189113617,
"learning_rate": 3.300963928451699e-05,
"loss": 2.6405,
"num_input_tokens_seen": 2217968,
"step": 2955
},
{
"epoch": 4.574961360123647,
"grad_norm": 0.37696200609207153,
"learning_rate": 3.2945725825891676e-05,
"loss": 2.5377,
"num_input_tokens_seen": 2222192,
"step": 2960
},
{
"epoch": 4.582689335394127,
"grad_norm": 0.5208265781402588,
"learning_rate": 3.288175454708017e-05,
"loss": 2.6624,
"num_input_tokens_seen": 2226096,
"step": 2965
},
{
"epoch": 4.590417310664606,
"grad_norm": 0.5913110375404358,
"learning_rate": 3.281772591359457e-05,
"loss": 2.7047,
"num_input_tokens_seen": 2230128,
"step": 2970
},
{
"epoch": 4.598145285935085,
"grad_norm": 0.5141699314117432,
"learning_rate": 3.2753640391364276e-05,
"loss": 2.8835,
"num_input_tokens_seen": 2234672,
"step": 2975
},
{
"epoch": 4.605873261205565,
"grad_norm": 0.5090105533599854,
"learning_rate": 3.2689498446732705e-05,
"loss": 2.6953,
"num_input_tokens_seen": 2238256,
"step": 2980
},
{
"epoch": 4.613601236476043,
"grad_norm": 0.4541560709476471,
"learning_rate": 3.262530054645384e-05,
"loss": 2.7004,
"num_input_tokens_seen": 2242032,
"step": 2985
},
{
"epoch": 4.621329211746523,
"grad_norm": 0.49343761801719666,
"learning_rate": 3.256104715768885e-05,
"loss": 2.6817,
"num_input_tokens_seen": 2245488,
"step": 2990
},
{
"epoch": 4.629057187017001,
"grad_norm": 0.7500348687171936,
"learning_rate": 3.249673874800267e-05,
"loss": 2.2831,
"num_input_tokens_seen": 2249520,
"step": 2995
},
{
"epoch": 4.636785162287481,
"grad_norm": 0.41946667432785034,
"learning_rate": 3.2432375785360644e-05,
"loss": 2.6983,
"num_input_tokens_seen": 2253168,
"step": 3000
},
{
"epoch": 4.64451313755796,
"grad_norm": 0.4098086655139923,
"learning_rate": 3.236795873812509e-05,
"loss": 2.4214,
"num_input_tokens_seen": 2257328,
"step": 3005
},
{
"epoch": 4.652241112828439,
"grad_norm": 0.3378872275352478,
"learning_rate": 3.230348807505186e-05,
"loss": 2.7132,
"num_input_tokens_seen": 2260528,
"step": 3010
},
{
"epoch": 4.659969088098918,
"grad_norm": 0.4678763449192047,
"learning_rate": 3.223896426528701e-05,
"loss": 2.5183,
"num_input_tokens_seen": 2263920,
"step": 3015
},
{
"epoch": 4.667697063369397,
"grad_norm": 0.3910799026489258,
"learning_rate": 3.217438777836329e-05,
"loss": 2.4753,
"num_input_tokens_seen": 2267824,
"step": 3020
},
{
"epoch": 4.675425038639876,
"grad_norm": 0.3688734471797943,
"learning_rate": 3.210975908419682e-05,
"loss": 2.2819,
"num_input_tokens_seen": 2271408,
"step": 3025
},
{
"epoch": 4.683153013910355,
"grad_norm": 0.4956994950771332,
"learning_rate": 3.2045078653083594e-05,
"loss": 2.5683,
"num_input_tokens_seen": 2275120,
"step": 3030
},
{
"epoch": 4.690880989180835,
"grad_norm": 0.4585415720939636,
"learning_rate": 3.1980346955696116e-05,
"loss": 2.5434,
"num_input_tokens_seen": 2278576,
"step": 3035
},
{
"epoch": 4.698608964451314,
"grad_norm": 0.4156535863876343,
"learning_rate": 3.191556446307993e-05,
"loss": 2.5429,
"num_input_tokens_seen": 2281712,
"step": 3040
},
{
"epoch": 4.706336939721793,
"grad_norm": 0.40493646264076233,
"learning_rate": 3.1850731646650215e-05,
"loss": 2.4746,
"num_input_tokens_seen": 2285296,
"step": 3045
},
{
"epoch": 4.714064914992272,
"grad_norm": 0.3802744448184967,
"learning_rate": 3.178584897818836e-05,
"loss": 2.8662,
"num_input_tokens_seen": 2288688,
"step": 3050
},
{
"epoch": 4.721792890262751,
"grad_norm": 0.40369701385498047,
"learning_rate": 3.172091692983851e-05,
"loss": 2.721,
"num_input_tokens_seen": 2292144,
"step": 3055
},
{
"epoch": 4.72952086553323,
"grad_norm": 0.5234618782997131,
"learning_rate": 3.165593597410414e-05,
"loss": 2.5668,
"num_input_tokens_seen": 2295792,
"step": 3060
},
{
"epoch": 4.7372488408037094,
"grad_norm": 0.418813556432724,
"learning_rate": 3.1590906583844644e-05,
"loss": 2.8402,
"num_input_tokens_seen": 2299440,
"step": 3065
},
{
"epoch": 4.744976816074189,
"grad_norm": 0.4814930558204651,
"learning_rate": 3.1525829232271845e-05,
"loss": 2.8945,
"num_input_tokens_seen": 2303280,
"step": 3070
},
{
"epoch": 4.752704791344668,
"grad_norm": 0.38368573784828186,
"learning_rate": 3.146070439294657e-05,
"loss": 2.4513,
"num_input_tokens_seen": 2307056,
"step": 3075
},
{
"epoch": 4.760432766615147,
"grad_norm": 0.49482211470603943,
"learning_rate": 3.1395532539775244e-05,
"loss": 2.5108,
"num_input_tokens_seen": 2309872,
"step": 3080
},
{
"epoch": 4.768160741885626,
"grad_norm": 0.629030704498291,
"learning_rate": 3.1330314147006355e-05,
"loss": 2.5163,
"num_input_tokens_seen": 2313392,
"step": 3085
},
{
"epoch": 4.775888717156105,
"grad_norm": 0.3420495092868805,
"learning_rate": 3.126504968922711e-05,
"loss": 2.6241,
"num_input_tokens_seen": 2316976,
"step": 3090
},
{
"epoch": 4.783616692426584,
"grad_norm": 0.39348718523979187,
"learning_rate": 3.119973964135987e-05,
"loss": 2.6212,
"num_input_tokens_seen": 2320816,
"step": 3095
},
{
"epoch": 4.7913446676970635,
"grad_norm": 0.3372386395931244,
"learning_rate": 3.113438447865881e-05,
"loss": 2.3769,
"num_input_tokens_seen": 2324400,
"step": 3100
},
{
"epoch": 4.799072642967543,
"grad_norm": 0.4806166887283325,
"learning_rate": 3.1068984676706344e-05,
"loss": 2.6456,
"num_input_tokens_seen": 2327856,
"step": 3105
},
{
"epoch": 4.806800618238022,
"grad_norm": 0.5349667072296143,
"learning_rate": 3.100354071140977e-05,
"loss": 2.6822,
"num_input_tokens_seen": 2331824,
"step": 3110
},
{
"epoch": 4.814528593508501,
"grad_norm": 0.5503065586090088,
"learning_rate": 3.093805305899772e-05,
"loss": 2.5127,
"num_input_tokens_seen": 2335408,
"step": 3115
},
{
"epoch": 4.82225656877898,
"grad_norm": 0.4024689197540283,
"learning_rate": 3.0872522196016746e-05,
"loss": 2.6049,
"num_input_tokens_seen": 2339312,
"step": 3120
},
{
"epoch": 4.829984544049459,
"grad_norm": 0.462722510099411,
"learning_rate": 3.080694859932785e-05,
"loss": 2.5243,
"num_input_tokens_seen": 2342768,
"step": 3125
},
{
"epoch": 4.837712519319938,
"grad_norm": 0.43024778366088867,
"learning_rate": 3.074133274610297e-05,
"loss": 2.6725,
"num_input_tokens_seen": 2346544,
"step": 3130
},
{
"epoch": 4.8454404945904175,
"grad_norm": 0.5202332139015198,
"learning_rate": 3.06756751138216e-05,
"loss": 2.3939,
"num_input_tokens_seen": 2350640,
"step": 3135
},
{
"epoch": 4.853168469860896,
"grad_norm": 0.3638491928577423,
"learning_rate": 3.0609976180267186e-05,
"loss": 2.4518,
"num_input_tokens_seen": 2354160,
"step": 3140
},
{
"epoch": 4.860896445131376,
"grad_norm": 0.4176388680934906,
"learning_rate": 3.054423642352376e-05,
"loss": 2.5428,
"num_input_tokens_seen": 2357872,
"step": 3145
},
{
"epoch": 4.868624420401854,
"grad_norm": 0.4720919728279114,
"learning_rate": 3.0478456321972422e-05,
"loss": 2.4582,
"num_input_tokens_seen": 2361840,
"step": 3150
},
{
"epoch": 4.876352395672334,
"grad_norm": 0.47086870670318604,
"learning_rate": 3.0412636354287826e-05,
"loss": 2.437,
"num_input_tokens_seen": 2365488,
"step": 3155
},
{
"epoch": 4.884080370942813,
"grad_norm": 0.42605626583099365,
"learning_rate": 3.0346776999434774e-05,
"loss": 2.4077,
"num_input_tokens_seen": 2369200,
"step": 3160
},
{
"epoch": 4.891808346213292,
"grad_norm": 0.5135416388511658,
"learning_rate": 3.0280878736664632e-05,
"loss": 2.6054,
"num_input_tokens_seen": 2372976,
"step": 3165
},
{
"epoch": 4.8995363214837715,
"grad_norm": 0.519250214099884,
"learning_rate": 3.0214942045511933e-05,
"loss": 2.564,
"num_input_tokens_seen": 2377008,
"step": 3170
},
{
"epoch": 4.90726429675425,
"grad_norm": 0.33954399824142456,
"learning_rate": 3.014896740579084e-05,
"loss": 2.4686,
"num_input_tokens_seen": 2380464,
"step": 3175
},
{
"epoch": 4.91499227202473,
"grad_norm": 0.5407111048698425,
"learning_rate": 3.0082955297591646e-05,
"loss": 2.7348,
"num_input_tokens_seen": 2384432,
"step": 3180
},
{
"epoch": 4.922720247295208,
"grad_norm": 0.4390753209590912,
"learning_rate": 3.001690620127733e-05,
"loss": 2.4345,
"num_input_tokens_seen": 2388592,
"step": 3185
},
{
"epoch": 4.930448222565688,
"grad_norm": 0.5103745460510254,
"learning_rate": 2.9950820597479988e-05,
"loss": 2.4188,
"num_input_tokens_seen": 2392624,
"step": 3190
},
{
"epoch": 4.938176197836167,
"grad_norm": 0.4389425218105316,
"learning_rate": 2.9884698967097425e-05,
"loss": 2.2987,
"num_input_tokens_seen": 2396208,
"step": 3195
},
{
"epoch": 4.945904173106646,
"grad_norm": 0.7027727961540222,
"learning_rate": 2.9818541791289568e-05,
"loss": 2.4588,
"num_input_tokens_seen": 2399792,
"step": 3200
},
{
"epoch": 4.9536321483771255,
"grad_norm": 0.43571075797080994,
"learning_rate": 2.9752349551475028e-05,
"loss": 2.5056,
"num_input_tokens_seen": 2403952,
"step": 3205
},
{
"epoch": 4.961360123647604,
"grad_norm": 0.39576250314712524,
"learning_rate": 2.9686122729327565e-05,
"loss": 2.489,
"num_input_tokens_seen": 2407472,
"step": 3210
},
{
"epoch": 4.969088098918084,
"grad_norm": 0.5138210654258728,
"learning_rate": 2.961986180677258e-05,
"loss": 2.3268,
"num_input_tokens_seen": 2411248,
"step": 3215
},
{
"epoch": 4.976816074188562,
"grad_norm": 0.6068966388702393,
"learning_rate": 2.9553567265983634e-05,
"loss": 2.4227,
"num_input_tokens_seen": 2414960,
"step": 3220
},
{
"epoch": 4.984544049459042,
"grad_norm": 0.5577759742736816,
"learning_rate": 2.9487239589378923e-05,
"loss": 2.4349,
"num_input_tokens_seen": 2418800,
"step": 3225
},
{
"epoch": 4.992272024729521,
"grad_norm": 0.5744683146476746,
"learning_rate": 2.942087925961776e-05,
"loss": 2.2231,
"num_input_tokens_seen": 2422512,
"step": 3230
},
{
"epoch": 5.0,
"grad_norm": 0.447038471698761,
"learning_rate": 2.9354486759597087e-05,
"loss": 2.5454,
"num_input_tokens_seen": 2426048,
"step": 3235
},
{
"epoch": 5.0077279752704795,
"grad_norm": 0.4673357903957367,
"learning_rate": 2.9288062572447926e-05,
"loss": 2.3198,
"num_input_tokens_seen": 2429824,
"step": 3240
},
{
"epoch": 5.0077279752704795,
"eval_loss": 2.511051893234253,
"eval_runtime": 9.8438,
"eval_samples_per_second": 58.412,
"eval_steps_per_second": 7.314,
"num_input_tokens_seen": 2429824,
"step": 3240
},
{
"epoch": 5.015455950540958,
"grad_norm": 0.4172385334968567,
"learning_rate": 2.9221607181531897e-05,
"loss": 2.5621,
"num_input_tokens_seen": 2433728,
"step": 3245
},
{
"epoch": 5.023183925811438,
"grad_norm": 0.40235933661460876,
"learning_rate": 2.915512107043767e-05,
"loss": 2.7036,
"num_input_tokens_seen": 2438208,
"step": 3250
},
{
"epoch": 5.030911901081916,
"grad_norm": 0.4346725046634674,
"learning_rate": 2.9088604722977487e-05,
"loss": 2.5252,
"num_input_tokens_seen": 2442048,
"step": 3255
},
{
"epoch": 5.038639876352396,
"grad_norm": 0.5125299096107483,
"learning_rate": 2.9022058623183603e-05,
"loss": 2.5612,
"num_input_tokens_seen": 2445760,
"step": 3260
},
{
"epoch": 5.0463678516228745,
"grad_norm": 0.4407518208026886,
"learning_rate": 2.895548325530477e-05,
"loss": 2.3247,
"num_input_tokens_seen": 2449280,
"step": 3265
},
{
"epoch": 5.054095826893354,
"grad_norm": 0.4180731475353241,
"learning_rate": 2.8888879103802735e-05,
"loss": 2.7282,
"num_input_tokens_seen": 2452736,
"step": 3270
},
{
"epoch": 5.061823802163833,
"grad_norm": 0.4604838788509369,
"learning_rate": 2.882224665334869e-05,
"loss": 2.4472,
"num_input_tokens_seen": 2456320,
"step": 3275
},
{
"epoch": 5.069551777434312,
"grad_norm": 0.4161360561847687,
"learning_rate": 2.8755586388819766e-05,
"loss": 2.4265,
"num_input_tokens_seen": 2460352,
"step": 3280
},
{
"epoch": 5.077279752704792,
"grad_norm": 0.3719094395637512,
"learning_rate": 2.8688898795295477e-05,
"loss": 2.3331,
"num_input_tokens_seen": 2463808,
"step": 3285
},
{
"epoch": 5.08500772797527,
"grad_norm": 0.5433674454689026,
"learning_rate": 2.8622184358054228e-05,
"loss": 2.4872,
"num_input_tokens_seen": 2467584,
"step": 3290
},
{
"epoch": 5.09273570324575,
"grad_norm": 0.4143747091293335,
"learning_rate": 2.855544356256975e-05,
"loss": 2.1259,
"num_input_tokens_seen": 2471552,
"step": 3295
},
{
"epoch": 5.1004636785162285,
"grad_norm": 0.5412753820419312,
"learning_rate": 2.8488676894507577e-05,
"loss": 2.3704,
"num_input_tokens_seen": 2475008,
"step": 3300
},
{
"epoch": 5.108191653786708,
"grad_norm": 0.4551786184310913,
"learning_rate": 2.842188483972153e-05,
"loss": 2.3857,
"num_input_tokens_seen": 2478272,
"step": 3305
},
{
"epoch": 5.115919629057187,
"grad_norm": 0.4686163365840912,
"learning_rate": 2.8355067884250147e-05,
"loss": 2.4618,
"num_input_tokens_seen": 2481984,
"step": 3310
},
{
"epoch": 5.123647604327666,
"grad_norm": 0.7603648900985718,
"learning_rate": 2.8288226514313177e-05,
"loss": 2.4902,
"num_input_tokens_seen": 2485696,
"step": 3315
},
{
"epoch": 5.131375579598146,
"grad_norm": 0.5762485265731812,
"learning_rate": 2.822136121630804e-05,
"loss": 2.2111,
"num_input_tokens_seen": 2489344,
"step": 3320
},
{
"epoch": 5.139103554868624,
"grad_norm": 0.5430300831794739,
"learning_rate": 2.815447247680626e-05,
"loss": 2.3911,
"num_input_tokens_seen": 2492864,
"step": 3325
},
{
"epoch": 5.146831530139104,
"grad_norm": 0.6499159336090088,
"learning_rate": 2.8087560782549944e-05,
"loss": 2.5143,
"num_input_tokens_seen": 2496448,
"step": 3330
},
{
"epoch": 5.1545595054095825,
"grad_norm": 0.5955728888511658,
"learning_rate": 2.8020626620448248e-05,
"loss": 2.6991,
"num_input_tokens_seen": 2500224,
"step": 3335
},
{
"epoch": 5.162287480680062,
"grad_norm": 0.3927296996116638,
"learning_rate": 2.7953670477573823e-05,
"loss": 2.5315,
"num_input_tokens_seen": 2503936,
"step": 3340
},
{
"epoch": 5.170015455950541,
"grad_norm": 0.4476173222064972,
"learning_rate": 2.788669284115926e-05,
"loss": 2.5084,
"num_input_tokens_seen": 2508096,
"step": 3345
},
{
"epoch": 5.17774343122102,
"grad_norm": 0.594795823097229,
"learning_rate": 2.7819694198593567e-05,
"loss": 2.523,
"num_input_tokens_seen": 2512000,
"step": 3350
},
{
"epoch": 5.185471406491499,
"grad_norm": 0.49172693490982056,
"learning_rate": 2.775267503741862e-05,
"loss": 2.702,
"num_input_tokens_seen": 2515968,
"step": 3355
},
{
"epoch": 5.193199381761978,
"grad_norm": 0.634707510471344,
"learning_rate": 2.768563584532558e-05,
"loss": 2.133,
"num_input_tokens_seen": 2519424,
"step": 3360
},
{
"epoch": 5.200927357032458,
"grad_norm": 0.37945079803466797,
"learning_rate": 2.7618577110151394e-05,
"loss": 2.2683,
"num_input_tokens_seen": 2523136,
"step": 3365
},
{
"epoch": 5.2086553323029365,
"grad_norm": 0.4269804060459137,
"learning_rate": 2.7551499319875212e-05,
"loss": 2.202,
"num_input_tokens_seen": 2526848,
"step": 3370
},
{
"epoch": 5.216383307573416,
"grad_norm": 0.4263918399810791,
"learning_rate": 2.748440296261485e-05,
"loss": 2.5293,
"num_input_tokens_seen": 2531072,
"step": 3375
},
{
"epoch": 5.224111282843895,
"grad_norm": 0.43470582365989685,
"learning_rate": 2.741728852662323e-05,
"loss": 2.3115,
"num_input_tokens_seen": 2534720,
"step": 3380
},
{
"epoch": 5.231839258114374,
"grad_norm": 0.39447730779647827,
"learning_rate": 2.735015650028484e-05,
"loss": 2.6058,
"num_input_tokens_seen": 2539264,
"step": 3385
},
{
"epoch": 5.239567233384853,
"grad_norm": 0.5679147839546204,
"learning_rate": 2.728300737211215e-05,
"loss": 2.3294,
"num_input_tokens_seen": 2542912,
"step": 3390
},
{
"epoch": 5.247295208655332,
"grad_norm": 0.39888083934783936,
"learning_rate": 2.7215841630742112e-05,
"loss": 2.2617,
"num_input_tokens_seen": 2546944,
"step": 3395
},
{
"epoch": 5.255023183925811,
"grad_norm": 0.41213181614875793,
"learning_rate": 2.714865976493253e-05,
"loss": 2.1931,
"num_input_tokens_seen": 2550528,
"step": 3400
},
{
"epoch": 5.2627511591962906,
"grad_norm": 0.4185974597930908,
"learning_rate": 2.708146226355858e-05,
"loss": 2.2597,
"num_input_tokens_seen": 2553920,
"step": 3405
},
{
"epoch": 5.27047913446677,
"grad_norm": 0.5793778300285339,
"learning_rate": 2.7014249615609194e-05,
"loss": 2.2198,
"num_input_tokens_seen": 2557696,
"step": 3410
},
{
"epoch": 5.278207109737249,
"grad_norm": 0.4878511130809784,
"learning_rate": 2.6947022310183528e-05,
"loss": 2.2857,
"num_input_tokens_seen": 2561792,
"step": 3415
},
{
"epoch": 5.285935085007728,
"grad_norm": 0.6548928022384644,
"learning_rate": 2.6879780836487412e-05,
"loss": 2.2111,
"num_input_tokens_seen": 2565440,
"step": 3420
},
{
"epoch": 5.293663060278207,
"grad_norm": 0.5079330801963806,
"learning_rate": 2.681252568382976e-05,
"loss": 2.5244,
"num_input_tokens_seen": 2569088,
"step": 3425
},
{
"epoch": 5.301391035548686,
"grad_norm": 0.4245012700557709,
"learning_rate": 2.6745257341619035e-05,
"loss": 2.1497,
"num_input_tokens_seen": 2572736,
"step": 3430
},
{
"epoch": 5.309119010819165,
"grad_norm": 0.570668637752533,
"learning_rate": 2.667797629935967e-05,
"loss": 2.5076,
"num_input_tokens_seen": 2576512,
"step": 3435
},
{
"epoch": 5.316846986089645,
"grad_norm": 0.4023292660713196,
"learning_rate": 2.6610683046648533e-05,
"loss": 2.2498,
"num_input_tokens_seen": 2579968,
"step": 3440
},
{
"epoch": 5.324574961360124,
"grad_norm": 0.5683576464653015,
"learning_rate": 2.654337807317132e-05,
"loss": 2.5712,
"num_input_tokens_seen": 2583936,
"step": 3445
},
{
"epoch": 5.332302936630603,
"grad_norm": 0.4126036465167999,
"learning_rate": 2.647606186869905e-05,
"loss": 2.4584,
"num_input_tokens_seen": 2587392,
"step": 3450
},
{
"epoch": 5.340030911901082,
"grad_norm": 0.42776116728782654,
"learning_rate": 2.6408734923084444e-05,
"loss": 2.3173,
"num_input_tokens_seen": 2591040,
"step": 3455
},
{
"epoch": 5.347758887171561,
"grad_norm": 0.4142238199710846,
"learning_rate": 2.6341397726258392e-05,
"loss": 2.2085,
"num_input_tokens_seen": 2594368,
"step": 3460
},
{
"epoch": 5.35548686244204,
"grad_norm": 0.4580320715904236,
"learning_rate": 2.6274050768226384e-05,
"loss": 2.3484,
"num_input_tokens_seen": 2598592,
"step": 3465
},
{
"epoch": 5.363214837712519,
"grad_norm": 0.37727484107017517,
"learning_rate": 2.620669453906493e-05,
"loss": 2.6041,
"num_input_tokens_seen": 2602496,
"step": 3470
},
{
"epoch": 5.370942812982999,
"grad_norm": 0.3904915452003479,
"learning_rate": 2.6139329528918016e-05,
"loss": 2.3611,
"num_input_tokens_seen": 2606016,
"step": 3475
},
{
"epoch": 5.378670788253477,
"grad_norm": 0.4388081729412079,
"learning_rate": 2.6071956227993538e-05,
"loss": 2.1859,
"num_input_tokens_seen": 2609600,
"step": 3480
},
{
"epoch": 5.386398763523957,
"grad_norm": 0.39128202199935913,
"learning_rate": 2.60045751265597e-05,
"loss": 2.2651,
"num_input_tokens_seen": 2613760,
"step": 3485
},
{
"epoch": 5.394126738794436,
"grad_norm": 0.4795575737953186,
"learning_rate": 2.5937186714941474e-05,
"loss": 2.4204,
"num_input_tokens_seen": 2617536,
"step": 3490
},
{
"epoch": 5.401854714064915,
"grad_norm": 0.4449489712715149,
"learning_rate": 2.586979148351704e-05,
"loss": 2.2625,
"num_input_tokens_seen": 2621440,
"step": 3495
},
{
"epoch": 5.409582689335394,
"grad_norm": 0.5936563611030579,
"learning_rate": 2.5802389922714195e-05,
"loss": 2.3463,
"num_input_tokens_seen": 2625280,
"step": 3500
},
{
"epoch": 5.417310664605873,
"grad_norm": 0.44013652205467224,
"learning_rate": 2.5734982523006786e-05,
"loss": 2.5644,
"num_input_tokens_seen": 2628736,
"step": 3505
},
{
"epoch": 5.425038639876353,
"grad_norm": 0.5395840406417847,
"learning_rate": 2.5667569774911175e-05,
"loss": 2.343,
"num_input_tokens_seen": 2632320,
"step": 3510
},
{
"epoch": 5.432766615146831,
"grad_norm": 0.3468901813030243,
"learning_rate": 2.560015216898262e-05,
"loss": 2.1964,
"num_input_tokens_seen": 2635968,
"step": 3515
},
{
"epoch": 5.440494590417311,
"grad_norm": 0.4727214276790619,
"learning_rate": 2.553273019581174e-05,
"loss": 2.2021,
"num_input_tokens_seen": 2639872,
"step": 3520
},
{
"epoch": 5.448222565687789,
"grad_norm": 0.5066657066345215,
"learning_rate": 2.5465304346020924e-05,
"loss": 2.111,
"num_input_tokens_seen": 2644224,
"step": 3525
},
{
"epoch": 5.455950540958269,
"grad_norm": 0.4220898449420929,
"learning_rate": 2.5397875110260784e-05,
"loss": 2.5323,
"num_input_tokens_seen": 2648128,
"step": 3530
},
{
"epoch": 5.4636785162287484,
"grad_norm": 0.3824640214443207,
"learning_rate": 2.5330442979206566e-05,
"loss": 2.428,
"num_input_tokens_seen": 2651712,
"step": 3535
},
{
"epoch": 5.471406491499227,
"grad_norm": 0.4880902171134949,
"learning_rate": 2.526300844355457e-05,
"loss": 2.4646,
"num_input_tokens_seen": 2655424,
"step": 3540
},
{
"epoch": 5.479134466769707,
"grad_norm": 0.4218856394290924,
"learning_rate": 2.519557199401863e-05,
"loss": 2.4592,
"num_input_tokens_seen": 2659200,
"step": 3545
},
{
"epoch": 5.486862442040185,
"grad_norm": 0.39387452602386475,
"learning_rate": 2.512813412132647e-05,
"loss": 2.4111,
"num_input_tokens_seen": 2662912,
"step": 3550
},
{
"epoch": 5.494590417310665,
"grad_norm": 0.5984361171722412,
"learning_rate": 2.5060695316216188e-05,
"loss": 2.3876,
"num_input_tokens_seen": 2666944,
"step": 3555
},
{
"epoch": 5.502318392581143,
"grad_norm": 0.46139827370643616,
"learning_rate": 2.4993256069432666e-05,
"loss": 2.4696,
"num_input_tokens_seen": 2670592,
"step": 3560
},
{
"epoch": 5.508500772797527,
"eval_loss": 2.352015256881714,
"eval_runtime": 9.8446,
"eval_samples_per_second": 58.408,
"eval_steps_per_second": 7.314,
"num_input_tokens_seen": 2673664,
"step": 3564
},
{
"epoch": 5.510046367851623,
"grad_norm": 0.42209917306900024,
"learning_rate": 2.4925816871723997e-05,
"loss": 2.4421,
"num_input_tokens_seen": 2674304,
"step": 3565
},
{
"epoch": 5.5177743431221025,
"grad_norm": 0.4543595612049103,
"learning_rate": 2.4858378213837908e-05,
"loss": 2.3185,
"num_input_tokens_seen": 2678528,
"step": 3570
},
{
"epoch": 5.525502318392581,
"grad_norm": 0.34188148379325867,
"learning_rate": 2.479094058651823e-05,
"loss": 2.2586,
"num_input_tokens_seen": 2682496,
"step": 3575
},
{
"epoch": 5.533230293663061,
"grad_norm": 0.36892926692962646,
"learning_rate": 2.4723504480501248e-05,
"loss": 2.5486,
"num_input_tokens_seen": 2686592,
"step": 3580
},
{
"epoch": 5.540958268933539,
"grad_norm": 0.8839777708053589,
"learning_rate": 2.4656070386512224e-05,
"loss": 2.1179,
"num_input_tokens_seen": 2690304,
"step": 3585
},
{
"epoch": 5.548686244204019,
"grad_norm": 0.39712879061698914,
"learning_rate": 2.4588638795261732e-05,
"loss": 2.3346,
"num_input_tokens_seen": 2694016,
"step": 3590
},
{
"epoch": 5.556414219474497,
"grad_norm": 0.5067217946052551,
"learning_rate": 2.4521210197442176e-05,
"loss": 1.9961,
"num_input_tokens_seen": 2697728,
"step": 3595
},
{
"epoch": 5.564142194744977,
"grad_norm": 0.6092778444290161,
"learning_rate": 2.4453785083724147e-05,
"loss": 2.0222,
"num_input_tokens_seen": 2700992,
"step": 3600
},
{
"epoch": 5.571870170015456,
"grad_norm": 0.5331489443778992,
"learning_rate": 2.438636394475291e-05,
"loss": 2.2948,
"num_input_tokens_seen": 2704704,
"step": 3605
},
{
"epoch": 5.579598145285935,
"grad_norm": 0.4057201147079468,
"learning_rate": 2.4318947271144768e-05,
"loss": 2.1596,
"num_input_tokens_seen": 2707904,
"step": 3610
},
{
"epoch": 5.587326120556414,
"grad_norm": 0.5431353449821472,
"learning_rate": 2.4251535553483575e-05,
"loss": 2.1688,
"num_input_tokens_seen": 2710784,
"step": 3615
},
{
"epoch": 5.595054095826893,
"grad_norm": 0.5729431509971619,
"learning_rate": 2.418412928231708e-05,
"loss": 2.3437,
"num_input_tokens_seen": 2714560,
"step": 3620
},
{
"epoch": 5.602782071097373,
"grad_norm": 0.39663970470428467,
"learning_rate": 2.4116728948153427e-05,
"loss": 2.4228,
"num_input_tokens_seen": 2718656,
"step": 3625
},
{
"epoch": 5.6105100463678514,
"grad_norm": 0.6071697473526001,
"learning_rate": 2.404933504145755e-05,
"loss": 2.1259,
"num_input_tokens_seen": 2721856,
"step": 3630
},
{
"epoch": 5.618238021638331,
"grad_norm": 0.4467135965824127,
"learning_rate": 2.39819480526476e-05,
"loss": 2.1758,
"num_input_tokens_seen": 2725440,
"step": 3635
},
{
"epoch": 5.62596599690881,
"grad_norm": 0.40764495730400085,
"learning_rate": 2.3914568472091393e-05,
"loss": 1.9979,
"num_input_tokens_seen": 2729088,
"step": 3640
},
{
"epoch": 5.633693972179289,
"grad_norm": 0.457857221364975,
"learning_rate": 2.3847196790102853e-05,
"loss": 2.2177,
"num_input_tokens_seen": 2732736,
"step": 3645
},
{
"epoch": 5.641421947449768,
"grad_norm": 0.42833542823791504,
"learning_rate": 2.37798334969384e-05,
"loss": 2.2948,
"num_input_tokens_seen": 2737024,
"step": 3650
},
{
"epoch": 5.649149922720247,
"grad_norm": 0.47789230942726135,
"learning_rate": 2.371247908279343e-05,
"loss": 2.4004,
"num_input_tokens_seen": 2740928,
"step": 3655
},
{
"epoch": 5.656877897990727,
"grad_norm": 0.5181143283843994,
"learning_rate": 2.3645134037798704e-05,
"loss": 2.3255,
"num_input_tokens_seen": 2744960,
"step": 3660
},
{
"epoch": 5.6646058732612055,
"grad_norm": 0.4948902428150177,
"learning_rate": 2.357779885201684e-05,
"loss": 2.0075,
"num_input_tokens_seen": 2748096,
"step": 3665
},
{
"epoch": 5.672333848531685,
"grad_norm": 0.4389133155345917,
"learning_rate": 2.3510474015438673e-05,
"loss": 1.9645,
"num_input_tokens_seen": 2751616,
"step": 3670
},
{
"epoch": 5.680061823802164,
"grad_norm": 0.4908589720726013,
"learning_rate": 2.344316001797977e-05,
"loss": 1.9947,
"num_input_tokens_seen": 2755328,
"step": 3675
},
{
"epoch": 5.687789799072643,
"grad_norm": 0.4054723083972931,
"learning_rate": 2.3375857349476768e-05,
"loss": 2.1622,
"num_input_tokens_seen": 2758656,
"step": 3680
},
{
"epoch": 5.695517774343122,
"grad_norm": 0.39807790517807007,
"learning_rate": 2.3308566499683922e-05,
"loss": 2.3103,
"num_input_tokens_seen": 2762432,
"step": 3685
},
{
"epoch": 5.703245749613601,
"grad_norm": 0.4937276244163513,
"learning_rate": 2.3241287958269442e-05,
"loss": 2.0892,
"num_input_tokens_seen": 2766656,
"step": 3690
},
{
"epoch": 5.710973724884081,
"grad_norm": 0.432624876499176,
"learning_rate": 2.3174022214811993e-05,
"loss": 2.2162,
"num_input_tokens_seen": 2770496,
"step": 3695
},
{
"epoch": 5.7187017001545595,
"grad_norm": 0.42396533489227295,
"learning_rate": 2.31067697587971e-05,
"loss": 2.1959,
"num_input_tokens_seen": 2774144,
"step": 3700
},
{
"epoch": 5.726429675425039,
"grad_norm": 0.5416736006736755,
"learning_rate": 2.3039531079613613e-05,
"loss": 1.7434,
"num_input_tokens_seen": 2777344,
"step": 3705
},
{
"epoch": 5.734157650695518,
"grad_norm": 0.6681102514266968,
"learning_rate": 2.2972306666550098e-05,
"loss": 2.2059,
"num_input_tokens_seen": 2780928,
"step": 3710
},
{
"epoch": 5.741885625965997,
"grad_norm": 0.5061545372009277,
"learning_rate": 2.290509700879135e-05,
"loss": 2.3656,
"num_input_tokens_seen": 2784448,
"step": 3715
},
{
"epoch": 5.749613601236476,
"grad_norm": 0.41529789566993713,
"learning_rate": 2.283790259541474e-05,
"loss": 2.1918,
"num_input_tokens_seen": 2787840,
"step": 3720
},
{
"epoch": 5.757341576506955,
"grad_norm": 0.4075319468975067,
"learning_rate": 2.277072391538676e-05,
"loss": 2.1543,
"num_input_tokens_seen": 2791680,
"step": 3725
},
{
"epoch": 5.765069551777434,
"grad_norm": 0.37512338161468506,
"learning_rate": 2.2703561457559376e-05,
"loss": 1.9263,
"num_input_tokens_seen": 2794944,
"step": 3730
},
{
"epoch": 5.7727975270479135,
"grad_norm": 0.7959144711494446,
"learning_rate": 2.263641571066653e-05,
"loss": 2.3819,
"num_input_tokens_seen": 2799296,
"step": 3735
},
{
"epoch": 5.780525502318392,
"grad_norm": 0.6679904460906982,
"learning_rate": 2.2569287163320534e-05,
"loss": 2.3377,
"num_input_tokens_seen": 2803264,
"step": 3740
},
{
"epoch": 5.788253477588872,
"grad_norm": 0.4480963945388794,
"learning_rate": 2.2502176304008575e-05,
"loss": 1.9277,
"num_input_tokens_seen": 2807616,
"step": 3745
},
{
"epoch": 5.795981452859351,
"grad_norm": 0.4576186239719391,
"learning_rate": 2.2435083621089085e-05,
"loss": 2.4183,
"num_input_tokens_seen": 2811456,
"step": 3750
},
{
"epoch": 5.80370942812983,
"grad_norm": 0.5255048871040344,
"learning_rate": 2.2368009602788264e-05,
"loss": 2.2337,
"num_input_tokens_seen": 2815360,
"step": 3755
},
{
"epoch": 5.811437403400309,
"grad_norm": 0.5405028462409973,
"learning_rate": 2.230095473719647e-05,
"loss": 2.6278,
"num_input_tokens_seen": 2819264,
"step": 3760
},
{
"epoch": 5.819165378670788,
"grad_norm": 0.43693581223487854,
"learning_rate": 2.2233919512264713e-05,
"loss": 1.9339,
"num_input_tokens_seen": 2823296,
"step": 3765
},
{
"epoch": 5.8268933539412675,
"grad_norm": 0.40162092447280884,
"learning_rate": 2.216690441580104e-05,
"loss": 2.4475,
"num_input_tokens_seen": 2827328,
"step": 3770
},
{
"epoch": 5.834621329211746,
"grad_norm": 0.4420531392097473,
"learning_rate": 2.2099909935467076e-05,
"loss": 2.0878,
"num_input_tokens_seen": 2831040,
"step": 3775
},
{
"epoch": 5.842349304482226,
"grad_norm": 0.5679764151573181,
"learning_rate": 2.203293655877437e-05,
"loss": 2.1213,
"num_input_tokens_seen": 2834752,
"step": 3780
},
{
"epoch": 5.850077279752705,
"grad_norm": 0.665807843208313,
"learning_rate": 2.196598477308095e-05,
"loss": 2.3224,
"num_input_tokens_seen": 2839104,
"step": 3785
},
{
"epoch": 5.857805255023184,
"grad_norm": 0.44736775755882263,
"learning_rate": 2.1899055065587698e-05,
"loss": 2.235,
"num_input_tokens_seen": 2842880,
"step": 3790
},
{
"epoch": 5.865533230293663,
"grad_norm": 0.5406590104103088,
"learning_rate": 2.1832147923334853e-05,
"loss": 2.0263,
"num_input_tokens_seen": 2846528,
"step": 3795
},
{
"epoch": 5.873261205564142,
"grad_norm": 0.4958436191082001,
"learning_rate": 2.1765263833198435e-05,
"loss": 2.2885,
"num_input_tokens_seen": 2850688,
"step": 3800
},
{
"epoch": 5.8809891808346215,
"grad_norm": 0.5033271908760071,
"learning_rate": 2.1698403281886734e-05,
"loss": 2.003,
"num_input_tokens_seen": 2854272,
"step": 3805
},
{
"epoch": 5.8887171561051,
"grad_norm": 0.4177800118923187,
"learning_rate": 2.163156675593672e-05,
"loss": 2.3459,
"num_input_tokens_seen": 2858240,
"step": 3810
},
{
"epoch": 5.89644513137558,
"grad_norm": 0.5443251729011536,
"learning_rate": 2.1564754741710578e-05,
"loss": 2.5596,
"num_input_tokens_seen": 2862208,
"step": 3815
},
{
"epoch": 5.904173106646059,
"grad_norm": 0.410028338432312,
"learning_rate": 2.149796772539208e-05,
"loss": 2.3332,
"num_input_tokens_seen": 2865472,
"step": 3820
},
{
"epoch": 5.911901081916538,
"grad_norm": 0.6708984971046448,
"learning_rate": 2.1431206192983117e-05,
"loss": 2.7066,
"num_input_tokens_seen": 2869568,
"step": 3825
},
{
"epoch": 5.919629057187017,
"grad_norm": 0.5683284401893616,
"learning_rate": 2.136447063030012e-05,
"loss": 2.1768,
"num_input_tokens_seen": 2873344,
"step": 3830
},
{
"epoch": 5.927357032457496,
"grad_norm": 0.49835681915283203,
"learning_rate": 2.129776152297057e-05,
"loss": 2.2855,
"num_input_tokens_seen": 2877376,
"step": 3835
},
{
"epoch": 5.9350850077279755,
"grad_norm": 0.39036598801612854,
"learning_rate": 2.1231079356429394e-05,
"loss": 2.3365,
"num_input_tokens_seen": 2881472,
"step": 3840
},
{
"epoch": 5.942812982998454,
"grad_norm": 0.4243427515029907,
"learning_rate": 2.1164424615915514e-05,
"loss": 1.8171,
"num_input_tokens_seen": 2885504,
"step": 3845
},
{
"epoch": 5.950540958268934,
"grad_norm": 0.42286694049835205,
"learning_rate": 2.1097797786468236e-05,
"loss": 2.2061,
"num_input_tokens_seen": 2889408,
"step": 3850
},
{
"epoch": 5.958268933539412,
"grad_norm": 0.6979780793190002,
"learning_rate": 2.10311993529238e-05,
"loss": 2.2993,
"num_input_tokens_seen": 2893312,
"step": 3855
},
{
"epoch": 5.965996908809892,
"grad_norm": 0.48626625537872314,
"learning_rate": 2.0964629799911778e-05,
"loss": 2.3468,
"num_input_tokens_seen": 2897088,
"step": 3860
},
{
"epoch": 5.9737248840803705,
"grad_norm": 0.5056301951408386,
"learning_rate": 2.0898089611851612e-05,
"loss": 2.1338,
"num_input_tokens_seen": 2901248,
"step": 3865
},
{
"epoch": 5.98145285935085,
"grad_norm": 0.4730468988418579,
"learning_rate": 2.0831579272949027e-05,
"loss": 2.1404,
"num_input_tokens_seen": 2905280,
"step": 3870
},
{
"epoch": 5.9891808346213296,
"grad_norm": 0.40042728185653687,
"learning_rate": 2.0765099267192575e-05,
"loss": 2.4586,
"num_input_tokens_seen": 2908736,
"step": 3875
},
{
"epoch": 5.996908809891808,
"grad_norm": 0.5426847338676453,
"learning_rate": 2.069865007835003e-05,
"loss": 2.4524,
"num_input_tokens_seen": 2912512,
"step": 3880
},
{
"epoch": 6.004636785162288,
"grad_norm": 0.4294570982456207,
"learning_rate": 2.0632232189964966e-05,
"loss": 2.2091,
"num_input_tokens_seen": 2915504,
"step": 3885
},
{
"epoch": 6.0092735703245745,
"eval_loss": 2.2262656688690186,
"eval_runtime": 9.8518,
"eval_samples_per_second": 58.365,
"eval_steps_per_second": 7.308,
"num_input_tokens_seen": 2917488,
"step": 3888
},
{
"epoch": 6.012364760432766,
"grad_norm": 0.4641660451889038,
"learning_rate": 2.0565846085353147e-05,
"loss": 2.0986,
"num_input_tokens_seen": 2918832,
"step": 3890
},
{
"epoch": 6.020092735703246,
"grad_norm": 0.39218223094940186,
"learning_rate": 2.0499492247599085e-05,
"loss": 2.167,
"num_input_tokens_seen": 2922544,
"step": 3895
},
{
"epoch": 6.0278207109737245,
"grad_norm": 0.5193983912467957,
"learning_rate": 2.0433171159552442e-05,
"loss": 2.2209,
"num_input_tokens_seen": 2926256,
"step": 3900
},
{
"epoch": 6.035548686244204,
"grad_norm": 0.5520421266555786,
"learning_rate": 2.036688330382462e-05,
"loss": 2.4272,
"num_input_tokens_seen": 2930096,
"step": 3905
},
{
"epoch": 6.043276661514684,
"grad_norm": 0.5342735648155212,
"learning_rate": 2.030062916278514e-05,
"loss": 2.1048,
"num_input_tokens_seen": 2933872,
"step": 3910
},
{
"epoch": 6.051004636785162,
"grad_norm": 0.6556943655014038,
"learning_rate": 2.0234409218558226e-05,
"loss": 1.9015,
"num_input_tokens_seen": 2937328,
"step": 3915
},
{
"epoch": 6.058732612055642,
"grad_norm": 0.4831690192222595,
"learning_rate": 2.0168223953019233e-05,
"loss": 2.0656,
"num_input_tokens_seen": 2940848,
"step": 3920
},
{
"epoch": 6.06646058732612,
"grad_norm": 0.561551034450531,
"learning_rate": 2.0102073847791182e-05,
"loss": 2.1184,
"num_input_tokens_seen": 2944432,
"step": 3925
},
{
"epoch": 6.0741885625966,
"grad_norm": 0.4626164138317108,
"learning_rate": 2.0035959384241203e-05,
"loss": 2.0853,
"num_input_tokens_seen": 2948080,
"step": 3930
},
{
"epoch": 6.0819165378670785,
"grad_norm": 0.5871320962905884,
"learning_rate": 1.9969881043477105e-05,
"loss": 2.0832,
"num_input_tokens_seen": 2951920,
"step": 3935
},
{
"epoch": 6.089644513137558,
"grad_norm": 0.4352818727493286,
"learning_rate": 1.9903839306343798e-05,
"loss": 2.2511,
"num_input_tokens_seen": 2955184,
"step": 3940
},
{
"epoch": 6.097372488408037,
"grad_norm": 0.44709163904190063,
"learning_rate": 1.9837834653419862e-05,
"loss": 2.2942,
"num_input_tokens_seen": 2959088,
"step": 3945
},
{
"epoch": 6.105100463678516,
"grad_norm": 0.518781304359436,
"learning_rate": 1.9771867565014008e-05,
"loss": 2.0329,
"num_input_tokens_seen": 2962800,
"step": 3950
},
{
"epoch": 6.112828438948996,
"grad_norm": 0.6104958057403564,
"learning_rate": 1.970593852116159e-05,
"loss": 1.9864,
"num_input_tokens_seen": 2966384,
"step": 3955
},
{
"epoch": 6.120556414219474,
"grad_norm": 0.39966410398483276,
"learning_rate": 1.964004800162111e-05,
"loss": 2.1951,
"num_input_tokens_seen": 2969712,
"step": 3960
},
{
"epoch": 6.128284389489954,
"grad_norm": 0.44955769181251526,
"learning_rate": 1.957419648587076e-05,
"loss": 2.1336,
"num_input_tokens_seen": 2973616,
"step": 3965
},
{
"epoch": 6.1360123647604325,
"grad_norm": 0.485278844833374,
"learning_rate": 1.9508384453104867e-05,
"loss": 2.1819,
"num_input_tokens_seen": 2977328,
"step": 3970
},
{
"epoch": 6.143740340030912,
"grad_norm": 0.5128074884414673,
"learning_rate": 1.9442612382230484e-05,
"loss": 2.1988,
"num_input_tokens_seen": 2981104,
"step": 3975
},
{
"epoch": 6.151468315301391,
"grad_norm": 0.489914208650589,
"learning_rate": 1.9376880751863828e-05,
"loss": 1.9642,
"num_input_tokens_seen": 2985008,
"step": 3980
},
{
"epoch": 6.15919629057187,
"grad_norm": 0.5936963558197021,
"learning_rate": 1.931119004032687e-05,
"loss": 2.2304,
"num_input_tokens_seen": 2989104,
"step": 3985
},
{
"epoch": 6.166924265842349,
"grad_norm": 0.48407799005508423,
"learning_rate": 1.9245540725643788e-05,
"loss": 2.3725,
"num_input_tokens_seen": 2992624,
"step": 3990
},
{
"epoch": 6.174652241112828,
"grad_norm": 0.46309277415275574,
"learning_rate": 1.9179933285537554e-05,
"loss": 2.2003,
"num_input_tokens_seen": 2996208,
"step": 3995
},
{
"epoch": 6.182380216383308,
"grad_norm": 0.5524800419807434,
"learning_rate": 1.911436819742638e-05,
"loss": 2.0188,
"num_input_tokens_seen": 3000816,
"step": 4000
},
{
"epoch": 6.190108191653787,
"grad_norm": 0.4268532395362854,
"learning_rate": 1.9048845938420327e-05,
"loss": 2.0816,
"num_input_tokens_seen": 3004528,
"step": 4005
},
{
"epoch": 6.197836166924266,
"grad_norm": 0.6886558532714844,
"learning_rate": 1.8983366985317763e-05,
"loss": 2.2171,
"num_input_tokens_seen": 3008112,
"step": 4010
},
{
"epoch": 6.205564142194745,
"grad_norm": 0.4119880497455597,
"learning_rate": 1.8917931814601952e-05,
"loss": 1.9501,
"num_input_tokens_seen": 3012208,
"step": 4015
},
{
"epoch": 6.213292117465224,
"grad_norm": 0.4245622456073761,
"learning_rate": 1.885254090243753e-05,
"loss": 2.1936,
"num_input_tokens_seen": 3015792,
"step": 4020
},
{
"epoch": 6.221020092735703,
"grad_norm": 0.46879667043685913,
"learning_rate": 1.8787194724667094e-05,
"loss": 1.9108,
"num_input_tokens_seen": 3019184,
"step": 4025
},
{
"epoch": 6.228748068006182,
"grad_norm": 0.4324242174625397,
"learning_rate": 1.8721893756807694e-05,
"loss": 2.1635,
"num_input_tokens_seen": 3022832,
"step": 4030
},
{
"epoch": 6.236476043276662,
"grad_norm": 0.3699527978897095,
"learning_rate": 1.8656638474047404e-05,
"loss": 1.8437,
"num_input_tokens_seen": 3026544,
"step": 4035
},
{
"epoch": 6.244204018547141,
"grad_norm": 0.4022957980632782,
"learning_rate": 1.859142935124184e-05,
"loss": 2.342,
"num_input_tokens_seen": 3030256,
"step": 4040
},
{
"epoch": 6.25193199381762,
"grad_norm": 0.3864610195159912,
"learning_rate": 1.8526266862910742e-05,
"loss": 2.2219,
"num_input_tokens_seen": 3034224,
"step": 4045
},
{
"epoch": 6.259659969088099,
"grad_norm": 0.49996402859687805,
"learning_rate": 1.8461151483234456e-05,
"loss": 2.3413,
"num_input_tokens_seen": 3037808,
"step": 4050
},
{
"epoch": 6.267387944358578,
"grad_norm": 0.5802761316299438,
"learning_rate": 1.8396083686050573e-05,
"loss": 2.06,
"num_input_tokens_seen": 3041520,
"step": 4055
},
{
"epoch": 6.275115919629057,
"grad_norm": 0.7262587547302246,
"learning_rate": 1.833106394485038e-05,
"loss": 2.1752,
"num_input_tokens_seen": 3045296,
"step": 4060
},
{
"epoch": 6.282843894899536,
"grad_norm": 0.7141749858856201,
"learning_rate": 1.8266092732775514e-05,
"loss": 2.0812,
"num_input_tokens_seen": 3049008,
"step": 4065
},
{
"epoch": 6.290571870170015,
"grad_norm": 0.3760020434856415,
"learning_rate": 1.8201170522614428e-05,
"loss": 1.9564,
"num_input_tokens_seen": 3052656,
"step": 4070
},
{
"epoch": 6.298299845440495,
"grad_norm": 0.5244548320770264,
"learning_rate": 1.8136297786799025e-05,
"loss": 1.9544,
"num_input_tokens_seen": 3056432,
"step": 4075
},
{
"epoch": 6.306027820710974,
"grad_norm": 0.3853306770324707,
"learning_rate": 1.807147499740117e-05,
"loss": 1.7615,
"num_input_tokens_seen": 3060464,
"step": 4080
},
{
"epoch": 6.313755795981453,
"grad_norm": 0.490071177482605,
"learning_rate": 1.8006702626129293e-05,
"loss": 2.2103,
"num_input_tokens_seen": 3064688,
"step": 4085
},
{
"epoch": 6.321483771251932,
"grad_norm": 0.4956612288951874,
"learning_rate": 1.7941981144324904e-05,
"loss": 2.0875,
"num_input_tokens_seen": 3068144,
"step": 4090
},
{
"epoch": 6.329211746522411,
"grad_norm": 0.4515208899974823,
"learning_rate": 1.787731102295924e-05,
"loss": 2.2417,
"num_input_tokens_seen": 3072048,
"step": 4095
},
{
"epoch": 6.3369397217928904,
"grad_norm": 0.4446680545806885,
"learning_rate": 1.7812692732629744e-05,
"loss": 1.949,
"num_input_tokens_seen": 3075824,
"step": 4100
},
{
"epoch": 6.344667697063369,
"grad_norm": 0.4429253041744232,
"learning_rate": 1.7748126743556727e-05,
"loss": 2.2903,
"num_input_tokens_seen": 3079280,
"step": 4105
},
{
"epoch": 6.352395672333849,
"grad_norm": 0.4691123068332672,
"learning_rate": 1.76836135255799e-05,
"loss": 2.2928,
"num_input_tokens_seen": 3083376,
"step": 4110
},
{
"epoch": 6.360123647604327,
"grad_norm": 0.49845725297927856,
"learning_rate": 1.7619153548154967e-05,
"loss": 2.0365,
"num_input_tokens_seen": 3086832,
"step": 4115
},
{
"epoch": 6.367851622874807,
"grad_norm": 0.4709586501121521,
"learning_rate": 1.7554747280350184e-05,
"loss": 1.8176,
"num_input_tokens_seen": 3090224,
"step": 4120
},
{
"epoch": 6.375579598145286,
"grad_norm": 0.4777339696884155,
"learning_rate": 1.7490395190843005e-05,
"loss": 2.4652,
"num_input_tokens_seen": 3094064,
"step": 4125
},
{
"epoch": 6.383307573415765,
"grad_norm": 0.4250446856021881,
"learning_rate": 1.7426097747916602e-05,
"loss": 1.8572,
"num_input_tokens_seen": 3098160,
"step": 4130
},
{
"epoch": 6.3910355486862445,
"grad_norm": 0.4578975737094879,
"learning_rate": 1.7361855419456507e-05,
"loss": 2.122,
"num_input_tokens_seen": 3101104,
"step": 4135
},
{
"epoch": 6.398763523956723,
"grad_norm": 0.5164878368377686,
"learning_rate": 1.729766867294719e-05,
"loss": 2.3874,
"num_input_tokens_seen": 3104752,
"step": 4140
},
{
"epoch": 6.406491499227203,
"grad_norm": 0.49974367022514343,
"learning_rate": 1.7233537975468646e-05,
"loss": 1.9827,
"num_input_tokens_seen": 3108592,
"step": 4145
},
{
"epoch": 6.414219474497681,
"grad_norm": 0.517787516117096,
"learning_rate": 1.7169463793693014e-05,
"loss": 2.1285,
"num_input_tokens_seen": 3112816,
"step": 4150
},
{
"epoch": 6.421947449768161,
"grad_norm": 0.5714470744132996,
"learning_rate": 1.7105446593881186e-05,
"loss": 2.2337,
"num_input_tokens_seen": 3116656,
"step": 4155
},
{
"epoch": 6.42967542503864,
"grad_norm": 0.44557783007621765,
"learning_rate": 1.704148684187937e-05,
"loss": 2.2485,
"num_input_tokens_seen": 3120496,
"step": 4160
},
{
"epoch": 6.437403400309119,
"grad_norm": 0.38803085684776306,
"learning_rate": 1.6977585003115777e-05,
"loss": 2.363,
"num_input_tokens_seen": 3124272,
"step": 4165
},
{
"epoch": 6.4451313755795985,
"grad_norm": 0.5083007216453552,
"learning_rate": 1.6913741542597145e-05,
"loss": 2.4348,
"num_input_tokens_seen": 3128048,
"step": 4170
},
{
"epoch": 6.452859350850077,
"grad_norm": 0.4518604576587677,
"learning_rate": 1.6849956924905435e-05,
"loss": 1.8019,
"num_input_tokens_seen": 3131376,
"step": 4175
},
{
"epoch": 6.460587326120557,
"grad_norm": 0.4489939510822296,
"learning_rate": 1.678623161419439e-05,
"loss": 2.3105,
"num_input_tokens_seen": 3135216,
"step": 4180
},
{
"epoch": 6.468315301391035,
"grad_norm": 0.4724332094192505,
"learning_rate": 1.6722566074186214e-05,
"loss": 2.2169,
"num_input_tokens_seen": 3138928,
"step": 4185
},
{
"epoch": 6.476043276661515,
"grad_norm": 0.4101565480232239,
"learning_rate": 1.665896076816812e-05,
"loss": 1.9876,
"num_input_tokens_seen": 3142768,
"step": 4190
},
{
"epoch": 6.483771251931993,
"grad_norm": 0.49106982350349426,
"learning_rate": 1.659541615898905e-05,
"loss": 1.9611,
"num_input_tokens_seen": 3146416,
"step": 4195
},
{
"epoch": 6.491499227202473,
"grad_norm": 0.49804726243019104,
"learning_rate": 1.6531932709056228e-05,
"loss": 1.9822,
"num_input_tokens_seen": 3149552,
"step": 4200
},
{
"epoch": 6.4992272024729525,
"grad_norm": 0.38408616185188293,
"learning_rate": 1.646851088033185e-05,
"loss": 1.9707,
"num_input_tokens_seen": 3153648,
"step": 4205
},
{
"epoch": 6.506955177743431,
"grad_norm": 0.387453556060791,
"learning_rate": 1.6405151134329687e-05,
"loss": 1.8969,
"num_input_tokens_seen": 3157744,
"step": 4210
},
{
"epoch": 6.510046367851623,
"eval_loss": 2.1331775188446045,
"eval_runtime": 9.8453,
"eval_samples_per_second": 58.403,
"eval_steps_per_second": 7.313,
"num_input_tokens_seen": 3159216,
"step": 4212
},
{
"epoch": 6.514683153013911,
"grad_norm": 0.4173484146595001,
"learning_rate": 1.6341853932111767e-05,
"loss": 2.1683,
"num_input_tokens_seen": 3161072,
"step": 4215
},
{
"epoch": 6.522411128284389,
"grad_norm": 0.5903739333152771,
"learning_rate": 1.627861973428496e-05,
"loss": 2.025,
"num_input_tokens_seen": 3164400,
"step": 4220
},
{
"epoch": 6.530139103554869,
"grad_norm": 0.5123242139816284,
"learning_rate": 1.6215449000997667e-05,
"loss": 2.1599,
"num_input_tokens_seen": 3167984,
"step": 4225
},
{
"epoch": 6.5378670788253475,
"grad_norm": 0.4895840883255005,
"learning_rate": 1.6152342191936483e-05,
"loss": 2.3419,
"num_input_tokens_seen": 3171824,
"step": 4230
},
{
"epoch": 6.545595054095827,
"grad_norm": 0.5733851790428162,
"learning_rate": 1.6089299766322812e-05,
"loss": 2.0644,
"num_input_tokens_seen": 3175728,
"step": 4235
},
{
"epoch": 6.553323029366306,
"grad_norm": 0.4447968900203705,
"learning_rate": 1.6026322182909575e-05,
"loss": 1.6845,
"num_input_tokens_seen": 3179248,
"step": 4240
},
{
"epoch": 6.561051004636785,
"grad_norm": 0.5073620080947876,
"learning_rate": 1.5963409899977804e-05,
"loss": 2.6895,
"num_input_tokens_seen": 3182768,
"step": 4245
},
{
"epoch": 6.568778979907265,
"grad_norm": 0.47167372703552246,
"learning_rate": 1.5900563375333388e-05,
"loss": 2.2403,
"num_input_tokens_seen": 3186544,
"step": 4250
},
{
"epoch": 6.576506955177743,
"grad_norm": 0.47486698627471924,
"learning_rate": 1.583778306630366e-05,
"loss": 1.6739,
"num_input_tokens_seen": 3190320,
"step": 4255
},
{
"epoch": 6.584234930448223,
"grad_norm": 0.6215065717697144,
"learning_rate": 1.5775069429734135e-05,
"loss": 2.2426,
"num_input_tokens_seen": 3194096,
"step": 4260
},
{
"epoch": 6.5919629057187015,
"grad_norm": 0.42234382033348083,
"learning_rate": 1.5712422921985157e-05,
"loss": 2.4955,
"num_input_tokens_seen": 3197872,
"step": 4265
},
{
"epoch": 6.599690880989181,
"grad_norm": 0.4679786264896393,
"learning_rate": 1.5649843998928585e-05,
"loss": 2.175,
"num_input_tokens_seen": 3201456,
"step": 4270
},
{
"epoch": 6.60741885625966,
"grad_norm": 0.43814560770988464,
"learning_rate": 1.558733311594444e-05,
"loss": 2.047,
"num_input_tokens_seen": 3205360,
"step": 4275
},
{
"epoch": 6.615146831530139,
"grad_norm": 0.45244526863098145,
"learning_rate": 1.5524890727917676e-05,
"loss": 2.0997,
"num_input_tokens_seen": 3208752,
"step": 4280
},
{
"epoch": 6.622874806800619,
"grad_norm": 0.45812249183654785,
"learning_rate": 1.546251728923476e-05,
"loss": 1.977,
"num_input_tokens_seen": 3212528,
"step": 4285
},
{
"epoch": 6.630602782071097,
"grad_norm": 0.4441308081150055,
"learning_rate": 1.5400213253780467e-05,
"loss": 1.9004,
"num_input_tokens_seen": 3216176,
"step": 4290
},
{
"epoch": 6.638330757341577,
"grad_norm": 0.5535650253295898,
"learning_rate": 1.5337979074934505e-05,
"loss": 2.1144,
"num_input_tokens_seen": 3220144,
"step": 4295
},
{
"epoch": 6.6460587326120555,
"grad_norm": 0.46812766790390015,
"learning_rate": 1.5275815205568264e-05,
"loss": 1.9624,
"num_input_tokens_seen": 3223920,
"step": 4300
},
{
"epoch": 6.653786707882535,
"grad_norm": 0.38829505443573,
"learning_rate": 1.5213722098041472e-05,
"loss": 2.0715,
"num_input_tokens_seen": 3227312,
"step": 4305
},
{
"epoch": 6.661514683153014,
"grad_norm": 0.5156342387199402,
"learning_rate": 1.5151700204198965e-05,
"loss": 1.5928,
"num_input_tokens_seen": 3231216,
"step": 4310
},
{
"epoch": 6.669242658423493,
"grad_norm": 0.417357474565506,
"learning_rate": 1.5089749975367324e-05,
"loss": 2.2893,
"num_input_tokens_seen": 3234608,
"step": 4315
},
{
"epoch": 6.676970633693972,
"grad_norm": 0.4816182255744934,
"learning_rate": 1.5027871862351671e-05,
"loss": 1.9179,
"num_input_tokens_seen": 3239088,
"step": 4320
},
{
"epoch": 6.684698608964451,
"grad_norm": 0.46260756254196167,
"learning_rate": 1.4966066315432331e-05,
"loss": 2.071,
"num_input_tokens_seen": 3242608,
"step": 4325
},
{
"epoch": 6.69242658423493,
"grad_norm": 0.4879600405693054,
"learning_rate": 1.4904333784361568e-05,
"loss": 1.8896,
"num_input_tokens_seen": 3246256,
"step": 4330
},
{
"epoch": 6.7001545595054095,
"grad_norm": 0.5067187547683716,
"learning_rate": 1.4842674718360323e-05,
"loss": 2.2088,
"num_input_tokens_seen": 3249648,
"step": 4335
},
{
"epoch": 6.707882534775889,
"grad_norm": 0.3779104948043823,
"learning_rate": 1.4781089566114953e-05,
"loss": 2.2161,
"num_input_tokens_seen": 3253232,
"step": 4340
},
{
"epoch": 6.715610510046368,
"grad_norm": 0.5668134689331055,
"learning_rate": 1.4719578775773924e-05,
"loss": 2.2821,
"num_input_tokens_seen": 3257072,
"step": 4345
},
{
"epoch": 6.723338485316847,
"grad_norm": 0.516550600528717,
"learning_rate": 1.465814279494461e-05,
"loss": 1.9797,
"num_input_tokens_seen": 3260400,
"step": 4350
},
{
"epoch": 6.731066460587326,
"grad_norm": 0.4872724115848541,
"learning_rate": 1.4596782070689971e-05,
"loss": 1.8425,
"num_input_tokens_seen": 3263856,
"step": 4355
},
{
"epoch": 6.738794435857805,
"grad_norm": 0.41736355423927307,
"learning_rate": 1.4535497049525371e-05,
"loss": 2.1885,
"num_input_tokens_seen": 3267888,
"step": 4360
},
{
"epoch": 6.746522411128284,
"grad_norm": 0.4729522168636322,
"learning_rate": 1.4474288177415245e-05,
"loss": 1.8801,
"num_input_tokens_seen": 3271472,
"step": 4365
},
{
"epoch": 6.7542503863987635,
"grad_norm": 0.5210955739021301,
"learning_rate": 1.4413155899769954e-05,
"loss": 2.24,
"num_input_tokens_seen": 3275184,
"step": 4370
},
{
"epoch": 6.761978361669243,
"grad_norm": 0.5247458219528198,
"learning_rate": 1.4352100661442448e-05,
"loss": 1.8741,
"num_input_tokens_seen": 3279408,
"step": 4375
},
{
"epoch": 6.769706336939722,
"grad_norm": 0.47709551453590393,
"learning_rate": 1.429112290672508e-05,
"loss": 1.6511,
"num_input_tokens_seen": 3282800,
"step": 4380
},
{
"epoch": 6.777434312210201,
"grad_norm": 0.5259020924568176,
"learning_rate": 1.4230223079346371e-05,
"loss": 2.294,
"num_input_tokens_seen": 3286128,
"step": 4385
},
{
"epoch": 6.78516228748068,
"grad_norm": 0.611424446105957,
"learning_rate": 1.4169401622467768e-05,
"loss": 2.2026,
"num_input_tokens_seen": 3290160,
"step": 4390
},
{
"epoch": 6.792890262751159,
"grad_norm": 0.5359120965003967,
"learning_rate": 1.4108658978680422e-05,
"loss": 1.62,
"num_input_tokens_seen": 3293616,
"step": 4395
},
{
"epoch": 6.800618238021638,
"grad_norm": 0.535717785358429,
"learning_rate": 1.4047995590001975e-05,
"loss": 2.0069,
"num_input_tokens_seen": 3297456,
"step": 4400
},
{
"epoch": 6.8083462132921175,
"grad_norm": 0.6203454732894897,
"learning_rate": 1.3987411897873321e-05,
"loss": 2.2569,
"num_input_tokens_seen": 3301296,
"step": 4405
},
{
"epoch": 6.816074188562597,
"grad_norm": 0.5075907707214355,
"learning_rate": 1.3926908343155462e-05,
"loss": 1.9357,
"num_input_tokens_seen": 3305072,
"step": 4410
},
{
"epoch": 6.823802163833076,
"grad_norm": 0.39903944730758667,
"learning_rate": 1.3866485366126169e-05,
"loss": 1.6716,
"num_input_tokens_seen": 3308784,
"step": 4415
},
{
"epoch": 6.831530139103555,
"grad_norm": 0.6213854551315308,
"learning_rate": 1.3806143406476938e-05,
"loss": 2.1229,
"num_input_tokens_seen": 3313072,
"step": 4420
},
{
"epoch": 6.839258114374034,
"grad_norm": 0.6845709681510925,
"learning_rate": 1.3745882903309637e-05,
"loss": 1.9561,
"num_input_tokens_seen": 3316912,
"step": 4425
},
{
"epoch": 6.846986089644513,
"grad_norm": 0.4705151617527008,
"learning_rate": 1.3685704295133451e-05,
"loss": 1.5797,
"num_input_tokens_seen": 3321584,
"step": 4430
},
{
"epoch": 6.854714064914992,
"grad_norm": 0.5121558308601379,
"learning_rate": 1.362560801986158e-05,
"loss": 2.215,
"num_input_tokens_seen": 3325168,
"step": 4435
},
{
"epoch": 6.8624420401854715,
"grad_norm": 0.45194992423057556,
"learning_rate": 1.356559451480811e-05,
"loss": 2.1466,
"num_input_tokens_seen": 3329584,
"step": 4440
},
{
"epoch": 6.87017001545595,
"grad_norm": 0.5940639972686768,
"learning_rate": 1.3505664216684824e-05,
"loss": 1.8275,
"num_input_tokens_seen": 3334064,
"step": 4445
},
{
"epoch": 6.87789799072643,
"grad_norm": 0.5700973272323608,
"learning_rate": 1.3445817561598002e-05,
"loss": 2.047,
"num_input_tokens_seen": 3338032,
"step": 4450
},
{
"epoch": 6.885625965996908,
"grad_norm": 0.44290101528167725,
"learning_rate": 1.3386054985045271e-05,
"loss": 2.1383,
"num_input_tokens_seen": 3341488,
"step": 4455
},
{
"epoch": 6.893353941267388,
"grad_norm": 0.34908556938171387,
"learning_rate": 1.3326376921912431e-05,
"loss": 1.6381,
"num_input_tokens_seen": 3344816,
"step": 4460
},
{
"epoch": 6.901081916537867,
"grad_norm": 0.4741649329662323,
"learning_rate": 1.3266783806470279e-05,
"loss": 1.8414,
"num_input_tokens_seen": 3348784,
"step": 4465
},
{
"epoch": 6.908809891808346,
"grad_norm": 0.432268887758255,
"learning_rate": 1.3207276072371466e-05,
"loss": 1.8426,
"num_input_tokens_seen": 3352624,
"step": 4470
},
{
"epoch": 6.916537867078826,
"grad_norm": 0.4336986839771271,
"learning_rate": 1.3147854152647315e-05,
"loss": 2.1196,
"num_input_tokens_seen": 3356784,
"step": 4475
},
{
"epoch": 6.924265842349304,
"grad_norm": 0.4205768406391144,
"learning_rate": 1.308851847970473e-05,
"loss": 1.763,
"num_input_tokens_seen": 3361008,
"step": 4480
},
{
"epoch": 6.931993817619784,
"grad_norm": 0.44432491064071655,
"learning_rate": 1.3029269485322937e-05,
"loss": 1.8207,
"num_input_tokens_seen": 3364912,
"step": 4485
},
{
"epoch": 6.939721792890262,
"grad_norm": 0.4959534704685211,
"learning_rate": 1.2970107600650483e-05,
"loss": 1.9731,
"num_input_tokens_seen": 3369072,
"step": 4490
},
{
"epoch": 6.947449768160742,
"grad_norm": 0.40225279331207275,
"learning_rate": 1.2911033256201965e-05,
"loss": 1.9828,
"num_input_tokens_seen": 3372464,
"step": 4495
},
{
"epoch": 6.955177743431221,
"grad_norm": 0.44361382722854614,
"learning_rate": 1.2852046881855015e-05,
"loss": 2.0853,
"num_input_tokens_seen": 3376240,
"step": 4500
},
{
"epoch": 6.9629057187017,
"grad_norm": 0.5393215417861938,
"learning_rate": 1.279314890684708e-05,
"loss": 1.9504,
"num_input_tokens_seen": 3380464,
"step": 4505
},
{
"epoch": 6.97063369397218,
"grad_norm": 0.7672788500785828,
"learning_rate": 1.2734339759772341e-05,
"loss": 2.0457,
"num_input_tokens_seen": 3383792,
"step": 4510
},
{
"epoch": 6.978361669242658,
"grad_norm": 0.4358603060245514,
"learning_rate": 1.2675619868578592e-05,
"loss": 2.0066,
"num_input_tokens_seen": 3387504,
"step": 4515
},
{
"epoch": 6.986089644513138,
"grad_norm": 0.48252299427986145,
"learning_rate": 1.2616989660564127e-05,
"loss": 1.7536,
"num_input_tokens_seen": 3390896,
"step": 4520
},
{
"epoch": 6.993817619783616,
"grad_norm": 0.688645601272583,
"learning_rate": 1.2558449562374614e-05,
"loss": 1.9599,
"num_input_tokens_seen": 3395248,
"step": 4525
},
{
"epoch": 7.001545595054096,
"grad_norm": 0.4696773290634155,
"learning_rate": 1.2500000000000006e-05,
"loss": 1.9528,
"num_input_tokens_seen": 3399200,
"step": 4530
},
{
"epoch": 7.0092735703245745,
"grad_norm": 0.40950486063957214,
"learning_rate": 1.2441641398771431e-05,
"loss": 1.7298,
"num_input_tokens_seen": 3402272,
"step": 4535
},
{
"epoch": 7.0108191653786704,
"eval_loss": 2.0646586418151855,
"eval_runtime": 9.85,
"eval_samples_per_second": 58.376,
"eval_steps_per_second": 7.31,
"num_input_tokens_seen": 3403040,
"step": 4536
},
{
"epoch": 7.017001545595054,
"grad_norm": 0.3565578758716583,
"learning_rate": 1.2383374183358135e-05,
"loss": 2.2223,
"num_input_tokens_seen": 3406304,
"step": 4540
},
{
"epoch": 7.024729520865534,
"grad_norm": 0.4340266287326813,
"learning_rate": 1.2325198777764297e-05,
"loss": 1.9125,
"num_input_tokens_seen": 3409824,
"step": 4545
},
{
"epoch": 7.032457496136012,
"grad_norm": 0.569847822189331,
"learning_rate": 1.2267115605326076e-05,
"loss": 2.0876,
"num_input_tokens_seen": 3413600,
"step": 4550
},
{
"epoch": 7.040185471406492,
"grad_norm": 0.5271437764167786,
"learning_rate": 1.2209125088708395e-05,
"loss": 2.2137,
"num_input_tokens_seen": 3417248,
"step": 4555
},
{
"epoch": 7.04791344667697,
"grad_norm": 0.5423272252082825,
"learning_rate": 1.2151227649901986e-05,
"loss": 1.9259,
"num_input_tokens_seen": 3420512,
"step": 4560
},
{
"epoch": 7.05564142194745,
"grad_norm": 0.36704373359680176,
"learning_rate": 1.2093423710220231e-05,
"loss": 1.9247,
"num_input_tokens_seen": 3424096,
"step": 4565
},
{
"epoch": 7.063369397217929,
"grad_norm": 0.5190703868865967,
"learning_rate": 1.203571369029614e-05,
"loss": 1.765,
"num_input_tokens_seen": 3427680,
"step": 4570
},
{
"epoch": 7.071097372488408,
"grad_norm": 0.8384991884231567,
"learning_rate": 1.1978098010079275e-05,
"loss": 1.9814,
"num_input_tokens_seen": 3431712,
"step": 4575
},
{
"epoch": 7.078825347758887,
"grad_norm": 0.5724890232086182,
"learning_rate": 1.1920577088832702e-05,
"loss": 1.8155,
"num_input_tokens_seen": 3434976,
"step": 4580
},
{
"epoch": 7.086553323029366,
"grad_norm": 0.6341719627380371,
"learning_rate": 1.1863151345129933e-05,
"loss": 1.9678,
"num_input_tokens_seen": 3438816,
"step": 4585
},
{
"epoch": 7.094281298299846,
"grad_norm": 0.7012672424316406,
"learning_rate": 1.1805821196851886e-05,
"loss": 1.8407,
"num_input_tokens_seen": 3442400,
"step": 4590
},
{
"epoch": 7.102009273570324,
"grad_norm": 0.5077885985374451,
"learning_rate": 1.1748587061183835e-05,
"loss": 2.2851,
"num_input_tokens_seen": 3445984,
"step": 4595
},
{
"epoch": 7.109737248840804,
"grad_norm": 0.5049794912338257,
"learning_rate": 1.1691449354612393e-05,
"loss": 2.0633,
"num_input_tokens_seen": 3449504,
"step": 4600
},
{
"epoch": 7.117465224111283,
"grad_norm": 0.5647863149642944,
"learning_rate": 1.163440849292245e-05,
"loss": 1.828,
"num_input_tokens_seen": 3453792,
"step": 4605
},
{
"epoch": 7.125193199381762,
"grad_norm": 0.4179993271827698,
"learning_rate": 1.1577464891194203e-05,
"loss": 1.7997,
"num_input_tokens_seen": 3457568,
"step": 4610
},
{
"epoch": 7.132921174652241,
"grad_norm": 0.6215378046035767,
"learning_rate": 1.1520618963800043e-05,
"loss": 2.0511,
"num_input_tokens_seen": 3461152,
"step": 4615
},
{
"epoch": 7.14064914992272,
"grad_norm": 0.442844033241272,
"learning_rate": 1.1463871124401657e-05,
"loss": 1.8991,
"num_input_tokens_seen": 3464864,
"step": 4620
},
{
"epoch": 7.1483771251932,
"grad_norm": 0.5186662077903748,
"learning_rate": 1.1407221785946892e-05,
"loss": 1.854,
"num_input_tokens_seen": 3468128,
"step": 4625
},
{
"epoch": 7.156105100463678,
"grad_norm": 0.6374059319496155,
"learning_rate": 1.1350671360666873e-05,
"loss": 1.9453,
"num_input_tokens_seen": 3471904,
"step": 4630
},
{
"epoch": 7.163833075734158,
"grad_norm": 0.4631650447845459,
"learning_rate": 1.1294220260072912e-05,
"loss": 1.8227,
"num_input_tokens_seen": 3475808,
"step": 4635
},
{
"epoch": 7.171561051004637,
"grad_norm": 0.44500941038131714,
"learning_rate": 1.1237868894953554e-05,
"loss": 1.9856,
"num_input_tokens_seen": 3479392,
"step": 4640
},
{
"epoch": 7.179289026275116,
"grad_norm": 0.4622701406478882,
"learning_rate": 1.1181617675371581e-05,
"loss": 1.8469,
"num_input_tokens_seen": 3483040,
"step": 4645
},
{
"epoch": 7.187017001545595,
"grad_norm": 0.5993435382843018,
"learning_rate": 1.112546701066102e-05,
"loss": 2.0912,
"num_input_tokens_seen": 3487264,
"step": 4650
},
{
"epoch": 7.194744976816074,
"grad_norm": 0.6248143911361694,
"learning_rate": 1.1069417309424176e-05,
"loss": 2.2415,
"num_input_tokens_seen": 3491488,
"step": 4655
},
{
"epoch": 7.202472952086553,
"grad_norm": 0.48824310302734375,
"learning_rate": 1.101346897952866e-05,
"loss": 1.8952,
"num_input_tokens_seen": 3495264,
"step": 4660
},
{
"epoch": 7.210200927357032,
"grad_norm": 0.5553814172744751,
"learning_rate": 1.0957622428104394e-05,
"loss": 1.7953,
"num_input_tokens_seen": 3498592,
"step": 4665
},
{
"epoch": 7.217928902627512,
"grad_norm": 0.65151047706604,
"learning_rate": 1.0901878061540712e-05,
"loss": 2.0573,
"num_input_tokens_seen": 3502560,
"step": 4670
},
{
"epoch": 7.225656877897991,
"grad_norm": 0.4101492464542389,
"learning_rate": 1.0846236285483296e-05,
"loss": 1.9602,
"num_input_tokens_seen": 3506528,
"step": 4675
},
{
"epoch": 7.23338485316847,
"grad_norm": 0.37585628032684326,
"learning_rate": 1.079069750483136e-05,
"loss": 1.5444,
"num_input_tokens_seen": 3510496,
"step": 4680
},
{
"epoch": 7.241112828438949,
"grad_norm": 0.46534785628318787,
"learning_rate": 1.0735262123734557e-05,
"loss": 2.1299,
"num_input_tokens_seen": 3513824,
"step": 4685
},
{
"epoch": 7.248840803709428,
"grad_norm": 0.5972572565078735,
"learning_rate": 1.067993054559018e-05,
"loss": 2.1535,
"num_input_tokens_seen": 3517920,
"step": 4690
},
{
"epoch": 7.256568778979907,
"grad_norm": 0.4854303002357483,
"learning_rate": 1.062470317304012e-05,
"loss": 1.9984,
"num_input_tokens_seen": 3521568,
"step": 4695
},
{
"epoch": 7.2642967542503865,
"grad_norm": 0.4328206479549408,
"learning_rate": 1.0569580407967983e-05,
"loss": 2.4716,
"num_input_tokens_seen": 3525536,
"step": 4700
},
{
"epoch": 7.272024729520865,
"grad_norm": 0.5262501239776611,
"learning_rate": 1.0514562651496162e-05,
"loss": 2.1431,
"num_input_tokens_seen": 3529312,
"step": 4705
},
{
"epoch": 7.279752704791345,
"grad_norm": 0.43528902530670166,
"learning_rate": 1.0459650303982912e-05,
"loss": 1.8515,
"num_input_tokens_seen": 3533536,
"step": 4710
},
{
"epoch": 7.287480680061824,
"grad_norm": 0.5224335789680481,
"learning_rate": 1.0404843765019436e-05,
"loss": 2.0884,
"num_input_tokens_seen": 3537184,
"step": 4715
},
{
"epoch": 7.295208655332303,
"grad_norm": 0.4448750615119934,
"learning_rate": 1.0350143433426981e-05,
"loss": 1.6565,
"num_input_tokens_seen": 3540960,
"step": 4720
},
{
"epoch": 7.302936630602782,
"grad_norm": 0.43913865089416504,
"learning_rate": 1.029554970725393e-05,
"loss": 1.9423,
"num_input_tokens_seen": 3545184,
"step": 4725
},
{
"epoch": 7.310664605873261,
"grad_norm": 0.43090900778770447,
"learning_rate": 1.0241062983772939e-05,
"loss": 1.8798,
"num_input_tokens_seen": 3549024,
"step": 4730
},
{
"epoch": 7.3183925811437405,
"grad_norm": 0.508183479309082,
"learning_rate": 1.0186683659477956e-05,
"loss": 1.6893,
"num_input_tokens_seen": 3553056,
"step": 4735
},
{
"epoch": 7.326120556414219,
"grad_norm": 0.6264638900756836,
"learning_rate": 1.0132412130081473e-05,
"loss": 1.9907,
"num_input_tokens_seen": 3557024,
"step": 4740
},
{
"epoch": 7.333848531684699,
"grad_norm": 0.5664180517196655,
"learning_rate": 1.0078248790511492e-05,
"loss": 1.7209,
"num_input_tokens_seen": 3560544,
"step": 4745
},
{
"epoch": 7.341576506955178,
"grad_norm": 0.45052483677864075,
"learning_rate": 1.0024194034908793e-05,
"loss": 1.7309,
"num_input_tokens_seen": 3564192,
"step": 4750
},
{
"epoch": 7.349304482225657,
"grad_norm": 0.4567320942878723,
"learning_rate": 9.970248256623976e-06,
"loss": 1.7103,
"num_input_tokens_seen": 3567648,
"step": 4755
},
{
"epoch": 7.357032457496136,
"grad_norm": 0.5011356472969055,
"learning_rate": 9.916411848214618e-06,
"loss": 2.2025,
"num_input_tokens_seen": 3571168,
"step": 4760
},
{
"epoch": 7.364760432766615,
"grad_norm": 0.3981023132801056,
"learning_rate": 9.86268520144244e-06,
"loss": 2.0211,
"num_input_tokens_seen": 3575520,
"step": 4765
},
{
"epoch": 7.3724884080370945,
"grad_norm": 0.863990068435669,
"learning_rate": 9.809068707270425e-06,
"loss": 1.8911,
"num_input_tokens_seen": 3579104,
"step": 4770
},
{
"epoch": 7.380216383307573,
"grad_norm": 0.5234079360961914,
"learning_rate": 9.755562755859996e-06,
"loss": 2.1423,
"num_input_tokens_seen": 3582944,
"step": 4775
},
{
"epoch": 7.387944358578053,
"grad_norm": 0.3961091935634613,
"learning_rate": 9.702167736568163e-06,
"loss": 1.8697,
"num_input_tokens_seen": 3586464,
"step": 4780
},
{
"epoch": 7.395672333848531,
"grad_norm": 0.4912174344062805,
"learning_rate": 9.6488840379447e-06,
"loss": 1.6679,
"num_input_tokens_seen": 3590304,
"step": 4785
},
{
"epoch": 7.403400309119011,
"grad_norm": 0.479442834854126,
"learning_rate": 9.59571204772931e-06,
"loss": 2.1609,
"num_input_tokens_seen": 3594016,
"step": 4790
},
{
"epoch": 7.41112828438949,
"grad_norm": 0.550214946269989,
"learning_rate": 9.5426521528488e-06,
"loss": 1.5704,
"num_input_tokens_seen": 3597536,
"step": 4795
},
{
"epoch": 7.418856259659969,
"grad_norm": 0.4294738471508026,
"learning_rate": 9.489704739414302e-06,
"loss": 2.0111,
"num_input_tokens_seen": 3601696,
"step": 4800
},
{
"epoch": 7.4265842349304485,
"grad_norm": 0.512971818447113,
"learning_rate": 9.436870192718372e-06,
"loss": 1.9792,
"num_input_tokens_seen": 3605408,
"step": 4805
},
{
"epoch": 7.434312210200927,
"grad_norm": 0.5744072198867798,
"learning_rate": 9.38414889723232e-06,
"loss": 1.6386,
"num_input_tokens_seen": 3609056,
"step": 4810
},
{
"epoch": 7.442040185471407,
"grad_norm": 0.4001683294773102,
"learning_rate": 9.331541236603267e-06,
"loss": 2.0762,
"num_input_tokens_seen": 3613024,
"step": 4815
},
{
"epoch": 7.449768160741885,
"grad_norm": 0.4479902386665344,
"learning_rate": 9.279047593651488e-06,
"loss": 2.1537,
"num_input_tokens_seen": 3617440,
"step": 4820
},
{
"epoch": 7.457496136012365,
"grad_norm": 0.4362366497516632,
"learning_rate": 9.226668350367528e-06,
"loss": 2.0465,
"num_input_tokens_seen": 3621984,
"step": 4825
},
{
"epoch": 7.4652241112828435,
"grad_norm": 0.4821748733520508,
"learning_rate": 9.174403887909466e-06,
"loss": 2.0988,
"num_input_tokens_seen": 3625696,
"step": 4830
},
{
"epoch": 7.472952086553323,
"grad_norm": 0.4055638313293457,
"learning_rate": 9.122254586600138e-06,
"loss": 1.8805,
"num_input_tokens_seen": 3628960,
"step": 4835
},
{
"epoch": 7.4806800618238025,
"grad_norm": 0.44100403785705566,
"learning_rate": 9.070220825924356e-06,
"loss": 1.894,
"num_input_tokens_seen": 3632416,
"step": 4840
},
{
"epoch": 7.488408037094281,
"grad_norm": 0.4403553903102875,
"learning_rate": 9.018302984526161e-06,
"loss": 2.2233,
"num_input_tokens_seen": 3636576,
"step": 4845
},
{
"epoch": 7.496136012364761,
"grad_norm": 0.4374490976333618,
"learning_rate": 8.966501440206063e-06,
"loss": 1.9553,
"num_input_tokens_seen": 3640288,
"step": 4850
},
{
"epoch": 7.503863987635239,
"grad_norm": 0.49835798144340515,
"learning_rate": 8.91481656991828e-06,
"loss": 1.8019,
"num_input_tokens_seen": 3644320,
"step": 4855
},
{
"epoch": 7.511591962905719,
"grad_norm": 0.559605598449707,
"learning_rate": 8.863248749768042e-06,
"loss": 2.0465,
"num_input_tokens_seen": 3648160,
"step": 4860
},
{
"epoch": 7.511591962905719,
"eval_loss": 2.0179080963134766,
"eval_runtime": 9.8471,
"eval_samples_per_second": 58.393,
"eval_steps_per_second": 7.312,
"num_input_tokens_seen": 3648160,
"step": 4860
},
{
"epoch": 7.5193199381761975,
"grad_norm": 0.6090003252029419,
"learning_rate": 8.811798355008753e-06,
"loss": 1.6761,
"num_input_tokens_seen": 3651808,
"step": 4865
},
{
"epoch": 7.527047913446677,
"grad_norm": 0.41798070073127747,
"learning_rate": 8.760465760039399e-06,
"loss": 1.7734,
"num_input_tokens_seen": 3655904,
"step": 4870
},
{
"epoch": 7.5347758887171565,
"grad_norm": 0.5017128586769104,
"learning_rate": 8.709251338401681e-06,
"loss": 1.7878,
"num_input_tokens_seen": 3659680,
"step": 4875
},
{
"epoch": 7.542503863987635,
"grad_norm": 0.6273453831672668,
"learning_rate": 8.658155462777418e-06,
"loss": 2.0905,
"num_input_tokens_seen": 3663264,
"step": 4880
},
{
"epoch": 7.550231839258115,
"grad_norm": 0.45074906945228577,
"learning_rate": 8.607178504985759e-06,
"loss": 1.9266,
"num_input_tokens_seen": 3667424,
"step": 4885
},
{
"epoch": 7.557959814528593,
"grad_norm": 0.41543567180633545,
"learning_rate": 8.556320835980503e-06,
"loss": 2.0426,
"num_input_tokens_seen": 3671008,
"step": 4890
},
{
"epoch": 7.565687789799073,
"grad_norm": 0.4987678825855255,
"learning_rate": 8.505582825847397e-06,
"loss": 1.7163,
"num_input_tokens_seen": 3674976,
"step": 4895
},
{
"epoch": 7.5734157650695515,
"grad_norm": 0.6415022015571594,
"learning_rate": 8.454964843801445e-06,
"loss": 1.9027,
"num_input_tokens_seen": 3678560,
"step": 4900
},
{
"epoch": 7.581143740340031,
"grad_norm": 0.5627877712249756,
"learning_rate": 8.404467258184223e-06,
"loss": 1.7147,
"num_input_tokens_seen": 3682144,
"step": 4905
},
{
"epoch": 7.58887171561051,
"grad_norm": 0.5032637119293213,
"learning_rate": 8.354090436461186e-06,
"loss": 1.7561,
"num_input_tokens_seen": 3685664,
"step": 4910
},
{
"epoch": 7.596599690880989,
"grad_norm": 0.4875337481498718,
"learning_rate": 8.303834745219007e-06,
"loss": 2.0047,
"num_input_tokens_seen": 3689248,
"step": 4915
},
{
"epoch": 7.604327666151468,
"grad_norm": 0.5122409462928772,
"learning_rate": 8.25370055016293e-06,
"loss": 2.0755,
"num_input_tokens_seen": 3693152,
"step": 4920
},
{
"epoch": 7.612055641421947,
"grad_norm": 0.4014969766139984,
"learning_rate": 8.203688216114027e-06,
"loss": 2.2292,
"num_input_tokens_seen": 3697184,
"step": 4925
},
{
"epoch": 7.619783616692427,
"grad_norm": 0.5143061280250549,
"learning_rate": 8.153798107006671e-06,
"loss": 2.078,
"num_input_tokens_seen": 3700768,
"step": 4930
},
{
"epoch": 7.6275115919629055,
"grad_norm": 0.4903722107410431,
"learning_rate": 8.10403058588575e-06,
"loss": 2.0966,
"num_input_tokens_seen": 3705184,
"step": 4935
},
{
"epoch": 7.635239567233385,
"grad_norm": 0.4987504780292511,
"learning_rate": 8.054386014904145e-06,
"loss": 1.6639,
"num_input_tokens_seen": 3709216,
"step": 4940
},
{
"epoch": 7.642967542503864,
"grad_norm": 0.6909155249595642,
"learning_rate": 8.004864755320016e-06,
"loss": 1.8366,
"num_input_tokens_seen": 3713248,
"step": 4945
},
{
"epoch": 7.650695517774343,
"grad_norm": 0.4917794466018677,
"learning_rate": 7.955467167494208e-06,
"loss": 1.694,
"num_input_tokens_seen": 3716832,
"step": 4950
},
{
"epoch": 7.658423493044822,
"grad_norm": 0.5293083786964417,
"learning_rate": 7.90619361088761e-06,
"loss": 1.9388,
"num_input_tokens_seen": 3720608,
"step": 4955
},
{
"epoch": 7.666151468315301,
"grad_norm": 0.4672509431838989,
"learning_rate": 7.857044444058562e-06,
"loss": 1.6769,
"num_input_tokens_seen": 3724256,
"step": 4960
},
{
"epoch": 7.673879443585781,
"grad_norm": 0.6263973116874695,
"learning_rate": 7.80802002466023e-06,
"loss": 1.9622,
"num_input_tokens_seen": 3728032,
"step": 4965
},
{
"epoch": 7.6816074188562595,
"grad_norm": 0.4829172194004059,
"learning_rate": 7.759120709437993e-06,
"loss": 2.198,
"num_input_tokens_seen": 3732640,
"step": 4970
},
{
"epoch": 7.689335394126739,
"grad_norm": 0.42988526821136475,
"learning_rate": 7.71034685422688e-06,
"loss": 1.8157,
"num_input_tokens_seen": 3735968,
"step": 4975
},
{
"epoch": 7.697063369397218,
"grad_norm": 0.3681195378303528,
"learning_rate": 7.661698813948953e-06,
"loss": 1.6465,
"num_input_tokens_seen": 3739808,
"step": 4980
},
{
"epoch": 7.704791344667697,
"grad_norm": 0.5963971018791199,
"learning_rate": 7.6131769426107165e-06,
"loss": 2.1698,
"num_input_tokens_seen": 3743456,
"step": 4985
},
{
"epoch": 7.712519319938176,
"grad_norm": 0.5191236734390259,
"learning_rate": 7.564781593300605e-06,
"loss": 2.0448,
"num_input_tokens_seen": 3746976,
"step": 4990
},
{
"epoch": 7.720247295208655,
"grad_norm": 0.41211771965026855,
"learning_rate": 7.516513118186294e-06,
"loss": 2.0512,
"num_input_tokens_seen": 3751072,
"step": 4995
},
{
"epoch": 7.727975270479135,
"grad_norm": 0.47517818212509155,
"learning_rate": 7.468371868512286e-06,
"loss": 2.1844,
"num_input_tokens_seen": 3755040,
"step": 5000
},
{
"epoch": 7.7357032457496135,
"grad_norm": 0.5340629816055298,
"learning_rate": 7.420358194597205e-06,
"loss": 2.0041,
"num_input_tokens_seen": 3759072,
"step": 5005
},
{
"epoch": 7.743431221020093,
"grad_norm": 0.500800371170044,
"learning_rate": 7.37247244583138e-06,
"loss": 2.3047,
"num_input_tokens_seen": 3762528,
"step": 5010
},
{
"epoch": 7.751159196290572,
"grad_norm": 0.46884685754776,
"learning_rate": 7.324714970674212e-06,
"loss": 1.9463,
"num_input_tokens_seen": 3766176,
"step": 5015
},
{
"epoch": 7.758887171561051,
"grad_norm": 0.419576495885849,
"learning_rate": 7.277086116651674e-06,
"loss": 1.9457,
"num_input_tokens_seen": 3769760,
"step": 5020
},
{
"epoch": 7.76661514683153,
"grad_norm": 0.42754513025283813,
"learning_rate": 7.229586230353777e-06,
"loss": 2.1629,
"num_input_tokens_seen": 3773408,
"step": 5025
},
{
"epoch": 7.774343122102009,
"grad_norm": 0.5824078917503357,
"learning_rate": 7.182215657432045e-06,
"loss": 2.1221,
"num_input_tokens_seen": 3776928,
"step": 5030
},
{
"epoch": 7.782071097372488,
"grad_norm": 0.5148240327835083,
"learning_rate": 7.134974742597015e-06,
"loss": 1.988,
"num_input_tokens_seen": 3780256,
"step": 5035
},
{
"epoch": 7.789799072642968,
"grad_norm": 0.5740344524383545,
"learning_rate": 7.087863829615698e-06,
"loss": 2.1161,
"num_input_tokens_seen": 3784096,
"step": 5040
},
{
"epoch": 7.797527047913446,
"grad_norm": 0.5428912043571472,
"learning_rate": 7.0408832613091034e-06,
"loss": 2.0609,
"num_input_tokens_seen": 3787296,
"step": 5045
},
{
"epoch": 7.805255023183926,
"grad_norm": 0.42774394154548645,
"learning_rate": 6.994033379549758e-06,
"loss": 2.2296,
"num_input_tokens_seen": 3791008,
"step": 5050
},
{
"epoch": 7.812982998454405,
"grad_norm": 0.4487341344356537,
"learning_rate": 6.947314525259147e-06,
"loss": 2.181,
"num_input_tokens_seen": 3794528,
"step": 5055
},
{
"epoch": 7.820710973724884,
"grad_norm": 0.360095739364624,
"learning_rate": 6.900727038405344e-06,
"loss": 1.9638,
"num_input_tokens_seen": 3798048,
"step": 5060
},
{
"epoch": 7.828438948995363,
"grad_norm": 0.5661271810531616,
"learning_rate": 6.854271258000414e-06,
"loss": 1.9825,
"num_input_tokens_seen": 3801760,
"step": 5065
},
{
"epoch": 7.836166924265842,
"grad_norm": 0.48149120807647705,
"learning_rate": 6.80794752209806e-06,
"loss": 1.9839,
"num_input_tokens_seen": 3805280,
"step": 5070
},
{
"epoch": 7.843894899536322,
"grad_norm": 0.4871172606945038,
"learning_rate": 6.761756167791083e-06,
"loss": 2.031,
"num_input_tokens_seen": 3808864,
"step": 5075
},
{
"epoch": 7.8516228748068,
"grad_norm": 0.5680673122406006,
"learning_rate": 6.715697531208967e-06,
"loss": 1.9571,
"num_input_tokens_seen": 3812512,
"step": 5080
},
{
"epoch": 7.85935085007728,
"grad_norm": 0.5056851506233215,
"learning_rate": 6.669771947515421e-06,
"loss": 1.9788,
"num_input_tokens_seen": 3816352,
"step": 5085
},
{
"epoch": 7.867078825347759,
"grad_norm": 0.41278329491615295,
"learning_rate": 6.6239797509059424e-06,
"loss": 1.6949,
"num_input_tokens_seen": 3820000,
"step": 5090
},
{
"epoch": 7.874806800618238,
"grad_norm": 0.43449851870536804,
"learning_rate": 6.578321274605384e-06,
"loss": 1.7573,
"num_input_tokens_seen": 3823584,
"step": 5095
},
{
"epoch": 7.882534775888717,
"grad_norm": 0.5349349975585938,
"learning_rate": 6.532796850865539e-06,
"loss": 1.8192,
"num_input_tokens_seen": 3827296,
"step": 5100
},
{
"epoch": 7.890262751159196,
"grad_norm": 0.5766939520835876,
"learning_rate": 6.4874068109626985e-06,
"loss": 1.9581,
"num_input_tokens_seen": 3830944,
"step": 5105
},
{
"epoch": 7.897990726429676,
"grad_norm": 0.5526953935623169,
"learning_rate": 6.442151485195275e-06,
"loss": 2.0384,
"num_input_tokens_seen": 3834784,
"step": 5110
},
{
"epoch": 7.905718701700154,
"grad_norm": 0.42307910323143005,
"learning_rate": 6.397031202881357e-06,
"loss": 1.7874,
"num_input_tokens_seen": 3838752,
"step": 5115
},
{
"epoch": 7.913446676970634,
"grad_norm": 0.5621873736381531,
"learning_rate": 6.352046292356381e-06,
"loss": 2.066,
"num_input_tokens_seen": 3842272,
"step": 5120
},
{
"epoch": 7.921174652241113,
"grad_norm": 0.5356681942939758,
"learning_rate": 6.307197080970634e-06,
"loss": 1.7912,
"num_input_tokens_seen": 3846368,
"step": 5125
},
{
"epoch": 7.928902627511592,
"grad_norm": 0.5139599442481995,
"learning_rate": 6.262483895087002e-06,
"loss": 2.0138,
"num_input_tokens_seen": 3850272,
"step": 5130
},
{
"epoch": 7.936630602782071,
"grad_norm": 0.45896539092063904,
"learning_rate": 6.21790706007846e-06,
"loss": 2.1746,
"num_input_tokens_seen": 3853600,
"step": 5135
},
{
"epoch": 7.94435857805255,
"grad_norm": 0.4682660698890686,
"learning_rate": 6.173466900325839e-06,
"loss": 1.8312,
"num_input_tokens_seen": 3857312,
"step": 5140
},
{
"epoch": 7.95208655332303,
"grad_norm": 0.49920710921287537,
"learning_rate": 6.129163739215352e-06,
"loss": 1.9881,
"num_input_tokens_seen": 3860960,
"step": 5145
},
{
"epoch": 7.959814528593508,
"grad_norm": 0.3703214228153229,
"learning_rate": 6.084997899136311e-06,
"loss": 1.7246,
"num_input_tokens_seen": 3864736,
"step": 5150
},
{
"epoch": 7.967542503863988,
"grad_norm": 0.43374818563461304,
"learning_rate": 6.040969701478743e-06,
"loss": 1.8493,
"num_input_tokens_seen": 3868640,
"step": 5155
},
{
"epoch": 7.975270479134466,
"grad_norm": 0.49463748931884766,
"learning_rate": 5.997079466631081e-06,
"loss": 1.6057,
"num_input_tokens_seen": 3872224,
"step": 5160
},
{
"epoch": 7.982998454404946,
"grad_norm": 0.4423999786376953,
"learning_rate": 5.953327513977805e-06,
"loss": 1.9082,
"num_input_tokens_seen": 3876512,
"step": 5165
},
{
"epoch": 7.990726429675425,
"grad_norm": 0.5020461082458496,
"learning_rate": 5.909714161897137e-06,
"loss": 1.885,
"num_input_tokens_seen": 3880224,
"step": 5170
},
{
"epoch": 7.998454404945904,
"grad_norm": 0.6084979772567749,
"learning_rate": 5.8662397277587125e-06,
"loss": 1.7686,
"num_input_tokens_seen": 3884384,
"step": 5175
},
{
"epoch": 8.006182380216384,
"grad_norm": 0.5096596479415894,
"learning_rate": 5.822904527921285e-06,
"loss": 1.6594,
"num_input_tokens_seen": 3887664,
"step": 5180
},
{
"epoch": 8.012364760432767,
"eval_loss": 1.9898368120193481,
"eval_runtime": 9.8217,
"eval_samples_per_second": 58.544,
"eval_steps_per_second": 7.331,
"num_input_tokens_seen": 3890608,
"step": 5184
},
{
"epoch": 8.013910355486862,
"grad_norm": 0.5536606311798096,
"learning_rate": 5.779708877730411e-06,
"loss": 1.7109,
"num_input_tokens_seen": 3891568,
"step": 5185
},
{
"epoch": 8.021638330757341,
"grad_norm": 0.4795739948749542,
"learning_rate": 5.73665309151615e-06,
"loss": 1.8976,
"num_input_tokens_seen": 3895472,
"step": 5190
},
{
"epoch": 8.029366306027821,
"grad_norm": 0.4783843457698822,
"learning_rate": 5.6937374825908e-06,
"loss": 2.0581,
"num_input_tokens_seen": 3899312,
"step": 5195
},
{
"epoch": 8.0370942812983,
"grad_norm": 0.4617125988006592,
"learning_rate": 5.650962363246592e-06,
"loss": 2.0033,
"num_input_tokens_seen": 3902448,
"step": 5200
},
{
"epoch": 8.044822256568779,
"grad_norm": 0.6532540917396545,
"learning_rate": 5.6083280447534585e-06,
"loss": 1.481,
"num_input_tokens_seen": 3906160,
"step": 5205
},
{
"epoch": 8.052550231839259,
"grad_norm": 0.511972963809967,
"learning_rate": 5.5658348373566815e-06,
"loss": 1.9637,
"num_input_tokens_seen": 3910064,
"step": 5210
},
{
"epoch": 8.060278207109738,
"grad_norm": 0.42409616708755493,
"learning_rate": 5.523483050274766e-06,
"loss": 1.7105,
"num_input_tokens_seen": 3913648,
"step": 5215
},
{
"epoch": 8.068006182380216,
"grad_norm": 0.46803751587867737,
"learning_rate": 5.481272991697045e-06,
"loss": 2.1197,
"num_input_tokens_seen": 3917296,
"step": 5220
},
{
"epoch": 8.075734157650695,
"grad_norm": 0.43307095766067505,
"learning_rate": 5.439204968781566e-06,
"loss": 1.851,
"num_input_tokens_seen": 3920624,
"step": 5225
},
{
"epoch": 8.083462132921175,
"grad_norm": 0.43012967705726624,
"learning_rate": 5.397279287652771e-06,
"loss": 1.797,
"num_input_tokens_seen": 3924272,
"step": 5230
},
{
"epoch": 8.091190108191654,
"grad_norm": 0.5172163248062134,
"learning_rate": 5.355496253399294e-06,
"loss": 1.7261,
"num_input_tokens_seen": 3928176,
"step": 5235
},
{
"epoch": 8.098918083462133,
"grad_norm": 0.5393996834754944,
"learning_rate": 5.313856170071754e-06,
"loss": 2.2175,
"num_input_tokens_seen": 3932400,
"step": 5240
},
{
"epoch": 8.106646058732611,
"grad_norm": 0.42386066913604736,
"learning_rate": 5.272359340680524e-06,
"loss": 2.1419,
"num_input_tokens_seen": 3936240,
"step": 5245
},
{
"epoch": 8.114374034003092,
"grad_norm": 0.44247618317604065,
"learning_rate": 5.231006067193539e-06,
"loss": 1.7462,
"num_input_tokens_seen": 3940080,
"step": 5250
},
{
"epoch": 8.12210200927357,
"grad_norm": 0.5681042075157166,
"learning_rate": 5.189796650534093e-06,
"loss": 2.2102,
"num_input_tokens_seen": 3944048,
"step": 5255
},
{
"epoch": 8.129829984544049,
"grad_norm": 0.4637432396411896,
"learning_rate": 5.1487313905786346e-06,
"loss": 1.6742,
"num_input_tokens_seen": 3947824,
"step": 5260
},
{
"epoch": 8.13755795981453,
"grad_norm": 0.5209901928901672,
"learning_rate": 5.107810586154637e-06,
"loss": 2.2237,
"num_input_tokens_seen": 3951280,
"step": 5265
},
{
"epoch": 8.145285935085008,
"grad_norm": 0.5490251779556274,
"learning_rate": 5.0670345350383346e-06,
"loss": 1.7778,
"num_input_tokens_seen": 3955248,
"step": 5270
},
{
"epoch": 8.153013910355487,
"grad_norm": 0.4561353027820587,
"learning_rate": 5.026403533952659e-06,
"loss": 2.2651,
"num_input_tokens_seen": 3959216,
"step": 5275
},
{
"epoch": 8.160741885625965,
"grad_norm": 0.5111833214759827,
"learning_rate": 4.98591787856498e-06,
"loss": 1.8427,
"num_input_tokens_seen": 3962608,
"step": 5280
},
{
"epoch": 8.168469860896446,
"grad_norm": 0.4493984580039978,
"learning_rate": 4.945577863485046e-06,
"loss": 1.8568,
"num_input_tokens_seen": 3966320,
"step": 5285
},
{
"epoch": 8.176197836166924,
"grad_norm": 0.5941608548164368,
"learning_rate": 4.905383782262768e-06,
"loss": 1.8362,
"num_input_tokens_seen": 3970160,
"step": 5290
},
{
"epoch": 8.183925811437403,
"grad_norm": 0.5745226740837097,
"learning_rate": 4.865335927386125e-06,
"loss": 1.8006,
"num_input_tokens_seen": 3973872,
"step": 5295
},
{
"epoch": 8.191653786707883,
"grad_norm": 0.49066871404647827,
"learning_rate": 4.825434590279015e-06,
"loss": 1.887,
"num_input_tokens_seen": 3977904,
"step": 5300
},
{
"epoch": 8.199381761978362,
"grad_norm": 0.6205068230628967,
"learning_rate": 4.785680061299153e-06,
"loss": 1.8471,
"num_input_tokens_seen": 3981552,
"step": 5305
},
{
"epoch": 8.20710973724884,
"grad_norm": 0.7366943955421448,
"learning_rate": 4.746072629735932e-06,
"loss": 1.8981,
"num_input_tokens_seen": 3985200,
"step": 5310
},
{
"epoch": 8.21483771251932,
"grad_norm": 0.613416850566864,
"learning_rate": 4.706612583808348e-06,
"loss": 1.8575,
"num_input_tokens_seen": 3989424,
"step": 5315
},
{
"epoch": 8.2225656877898,
"grad_norm": 0.41795915365219116,
"learning_rate": 4.6673002106628786e-06,
"loss": 1.6017,
"num_input_tokens_seen": 3993264,
"step": 5320
},
{
"epoch": 8.230293663060278,
"grad_norm": 0.5964676737785339,
"learning_rate": 4.628135796371402e-06,
"loss": 2.2189,
"num_input_tokens_seen": 3997232,
"step": 5325
},
{
"epoch": 8.238021638330757,
"grad_norm": 0.49453970789909363,
"learning_rate": 4.5891196259291165e-06,
"loss": 1.9771,
"num_input_tokens_seen": 4000816,
"step": 5330
},
{
"epoch": 8.245749613601237,
"grad_norm": 0.592045783996582,
"learning_rate": 4.550251983252485e-06,
"loss": 2.2162,
"num_input_tokens_seen": 4004720,
"step": 5335
},
{
"epoch": 8.253477588871716,
"grad_norm": 0.6761609315872192,
"learning_rate": 4.511533151177111e-06,
"loss": 1.7625,
"num_input_tokens_seen": 4008752,
"step": 5340
},
{
"epoch": 8.261205564142195,
"grad_norm": 0.6841716766357422,
"learning_rate": 4.472963411455764e-06,
"loss": 1.8694,
"num_input_tokens_seen": 4012272,
"step": 5345
},
{
"epoch": 8.268933539412673,
"grad_norm": 0.4839349687099457,
"learning_rate": 4.434543044756237e-06,
"loss": 1.9231,
"num_input_tokens_seen": 4015920,
"step": 5350
},
{
"epoch": 8.276661514683154,
"grad_norm": 0.6607086658477783,
"learning_rate": 4.396272330659398e-06,
"loss": 1.7466,
"num_input_tokens_seen": 4019504,
"step": 5355
},
{
"epoch": 8.284389489953632,
"grad_norm": 0.5170435309410095,
"learning_rate": 4.35815154765708e-06,
"loss": 2.2421,
"num_input_tokens_seen": 4023472,
"step": 5360
},
{
"epoch": 8.292117465224111,
"grad_norm": 0.6370357871055603,
"learning_rate": 4.32018097315009e-06,
"loss": 1.6066,
"num_input_tokens_seen": 4027120,
"step": 5365
},
{
"epoch": 8.29984544049459,
"grad_norm": 0.4809553027153015,
"learning_rate": 4.28236088344619e-06,
"loss": 2.2748,
"num_input_tokens_seen": 4031152,
"step": 5370
},
{
"epoch": 8.30757341576507,
"grad_norm": 0.44451555609703064,
"learning_rate": 4.244691553758076e-06,
"loss": 2.1046,
"num_input_tokens_seen": 4034864,
"step": 5375
},
{
"epoch": 8.315301391035549,
"grad_norm": 0.4132705628871918,
"learning_rate": 4.207173258201375e-06,
"loss": 1.739,
"num_input_tokens_seen": 4038384,
"step": 5380
},
{
"epoch": 8.323029366306027,
"grad_norm": 0.5284491181373596,
"learning_rate": 4.1698062697926645e-06,
"loss": 1.6406,
"num_input_tokens_seen": 4041840,
"step": 5385
},
{
"epoch": 8.330757341576508,
"grad_norm": 0.5363196730613708,
"learning_rate": 4.132590860447463e-06,
"loss": 2.1619,
"num_input_tokens_seen": 4045808,
"step": 5390
},
{
"epoch": 8.338485316846986,
"grad_norm": 0.43313705921173096,
"learning_rate": 4.095527300978297e-06,
"loss": 1.8315,
"num_input_tokens_seen": 4049648,
"step": 5395
},
{
"epoch": 8.346213292117465,
"grad_norm": 0.504666268825531,
"learning_rate": 4.05861586109264e-06,
"loss": 1.8599,
"num_input_tokens_seen": 4053680,
"step": 5400
},
{
"epoch": 8.353941267387944,
"grad_norm": 0.6796596050262451,
"learning_rate": 4.021856809391075e-06,
"loss": 1.8656,
"num_input_tokens_seen": 4056944,
"step": 5405
},
{
"epoch": 8.361669242658424,
"grad_norm": 0.3838791847229004,
"learning_rate": 3.985250413365213e-06,
"loss": 1.8435,
"num_input_tokens_seen": 4061040,
"step": 5410
},
{
"epoch": 8.369397217928903,
"grad_norm": 0.492909699678421,
"learning_rate": 3.948796939395849e-06,
"loss": 1.9219,
"num_input_tokens_seen": 4065456,
"step": 5415
},
{
"epoch": 8.377125193199381,
"grad_norm": 0.42821991443634033,
"learning_rate": 3.912496652750958e-06,
"loss": 1.8958,
"num_input_tokens_seen": 4069296,
"step": 5420
},
{
"epoch": 8.384853168469862,
"grad_norm": 0.5378724932670593,
"learning_rate": 3.8763498175837965e-06,
"loss": 1.7691,
"num_input_tokens_seen": 4072880,
"step": 5425
},
{
"epoch": 8.39258114374034,
"grad_norm": 0.5229602456092834,
"learning_rate": 3.840356696930969e-06,
"loss": 1.9893,
"num_input_tokens_seen": 4076528,
"step": 5430
},
{
"epoch": 8.400309119010819,
"grad_norm": 0.5723522305488586,
"learning_rate": 3.8045175527105127e-06,
"loss": 1.7024,
"num_input_tokens_seen": 4079856,
"step": 5435
},
{
"epoch": 8.408037094281298,
"grad_norm": 0.45156463980674744,
"learning_rate": 3.7688326457200025e-06,
"loss": 1.9089,
"num_input_tokens_seen": 4083504,
"step": 5440
},
{
"epoch": 8.415765069551778,
"grad_norm": 0.3913841247558594,
"learning_rate": 3.7333022356346365e-06,
"loss": 1.6517,
"num_input_tokens_seen": 4086576,
"step": 5445
},
{
"epoch": 8.423493044822257,
"grad_norm": 0.6192737817764282,
"learning_rate": 3.6979265810053566e-06,
"loss": 1.9048,
"num_input_tokens_seen": 4090352,
"step": 5450
},
{
"epoch": 8.431221020092735,
"grad_norm": 0.3701123595237732,
"learning_rate": 3.6627059392569883e-06,
"loss": 1.8114,
"num_input_tokens_seen": 4094576,
"step": 5455
},
{
"epoch": 8.438948995363216,
"grad_norm": 0.5786436200141907,
"learning_rate": 3.6276405666863023e-06,
"loss": 2.2703,
"num_input_tokens_seen": 4098160,
"step": 5460
},
{
"epoch": 8.446676970633694,
"grad_norm": 0.5972141623497009,
"learning_rate": 3.592730718460241e-06,
"loss": 1.8991,
"num_input_tokens_seen": 4102192,
"step": 5465
},
{
"epoch": 8.454404945904173,
"grad_norm": 0.6472261548042297,
"learning_rate": 3.5579766486139643e-06,
"loss": 2.0854,
"num_input_tokens_seen": 4105904,
"step": 5470
},
{
"epoch": 8.462132921174652,
"grad_norm": 0.5761744976043701,
"learning_rate": 3.523378610049091e-06,
"loss": 1.6798,
"num_input_tokens_seen": 4109808,
"step": 5475
},
{
"epoch": 8.469860896445132,
"grad_norm": 0.618687093257904,
"learning_rate": 3.4889368545317963e-06,
"loss": 2.0827,
"num_input_tokens_seen": 4113520,
"step": 5480
},
{
"epoch": 8.47758887171561,
"grad_norm": 0.5880991220474243,
"learning_rate": 3.4546516326910027e-06,
"loss": 1.8648,
"num_input_tokens_seen": 4117104,
"step": 5485
},
{
"epoch": 8.48531684698609,
"grad_norm": 0.43996042013168335,
"learning_rate": 3.420523194016556e-06,
"loss": 2.0635,
"num_input_tokens_seen": 4120816,
"step": 5490
},
{
"epoch": 8.493044822256568,
"grad_norm": 0.4868849217891693,
"learning_rate": 3.386551786857409e-06,
"loss": 1.7675,
"num_input_tokens_seen": 4124784,
"step": 5495
},
{
"epoch": 8.500772797527048,
"grad_norm": 0.4164232313632965,
"learning_rate": 3.3527376584198104e-06,
"loss": 1.8835,
"num_input_tokens_seen": 4128880,
"step": 5500
},
{
"epoch": 8.508500772797527,
"grad_norm": 0.6961061358451843,
"learning_rate": 3.3190810547655105e-06,
"loss": 2.1859,
"num_input_tokens_seen": 4133040,
"step": 5505
},
{
"epoch": 8.513137557959814,
"eval_loss": 1.9703497886657715,
"eval_runtime": 9.8287,
"eval_samples_per_second": 58.502,
"eval_steps_per_second": 7.325,
"num_input_tokens_seen": 4134704,
"step": 5508
},
{
"epoch": 8.516228748068006,
"grad_norm": 0.4723997414112091,
"learning_rate": 3.2855822208099683e-06,
"loss": 1.6973,
"num_input_tokens_seen": 4136240,
"step": 5510
},
{
"epoch": 8.523956723338486,
"grad_norm": 0.5245389342308044,
"learning_rate": 3.2522414003205713e-06,
"loss": 1.8515,
"num_input_tokens_seen": 4140144,
"step": 5515
},
{
"epoch": 8.531684698608965,
"grad_norm": 0.48844999074935913,
"learning_rate": 3.2190588359148537e-06,
"loss": 1.6946,
"num_input_tokens_seen": 4143536,
"step": 5520
},
{
"epoch": 8.539412673879443,
"grad_norm": 0.5237007737159729,
"learning_rate": 3.1860347690587573e-06,
"loss": 2.2678,
"num_input_tokens_seen": 4147440,
"step": 5525
},
{
"epoch": 8.547140649149922,
"grad_norm": 0.6524617671966553,
"learning_rate": 3.153169440064818e-06,
"loss": 1.9823,
"num_input_tokens_seen": 4151024,
"step": 5530
},
{
"epoch": 8.554868624420402,
"grad_norm": 0.4519035518169403,
"learning_rate": 3.1204630880904944e-06,
"loss": 1.8902,
"num_input_tokens_seen": 4154672,
"step": 5535
},
{
"epoch": 8.562596599690881,
"grad_norm": 0.6340076327323914,
"learning_rate": 3.0879159511363525e-06,
"loss": 1.7432,
"num_input_tokens_seen": 4158512,
"step": 5540
},
{
"epoch": 8.57032457496136,
"grad_norm": 0.5062048435211182,
"learning_rate": 3.0555282660443914e-06,
"loss": 1.9369,
"num_input_tokens_seen": 4162224,
"step": 5545
},
{
"epoch": 8.578052550231838,
"grad_norm": 0.5006168484687805,
"learning_rate": 3.0233002684962872e-06,
"loss": 1.9589,
"num_input_tokens_seen": 4165872,
"step": 5550
},
{
"epoch": 8.585780525502319,
"grad_norm": 0.48547643423080444,
"learning_rate": 2.9912321930116836e-06,
"loss": 1.8006,
"num_input_tokens_seen": 4170160,
"step": 5555
},
{
"epoch": 8.593508500772797,
"grad_norm": 0.4373511075973511,
"learning_rate": 2.9593242729464926e-06,
"loss": 1.7253,
"num_input_tokens_seen": 4173872,
"step": 5560
},
{
"epoch": 8.601236476043276,
"grad_norm": 0.5265281796455383,
"learning_rate": 2.927576740491195e-06,
"loss": 1.9011,
"num_input_tokens_seen": 4177712,
"step": 5565
},
{
"epoch": 8.608964451313756,
"grad_norm": 0.4482889175415039,
"learning_rate": 2.8959898266691434e-06,
"loss": 1.5769,
"num_input_tokens_seen": 4182192,
"step": 5570
},
{
"epoch": 8.616692426584235,
"grad_norm": 0.4474455714225769,
"learning_rate": 2.8645637613348904e-06,
"loss": 1.7703,
"num_input_tokens_seen": 4185776,
"step": 5575
},
{
"epoch": 8.624420401854714,
"grad_norm": 0.553152859210968,
"learning_rate": 2.833298773172502e-06,
"loss": 2.245,
"num_input_tokens_seen": 4189488,
"step": 5580
},
{
"epoch": 8.632148377125194,
"grad_norm": 0.5896114110946655,
"learning_rate": 2.8021950896939266e-06,
"loss": 2.0839,
"num_input_tokens_seen": 4193264,
"step": 5585
},
{
"epoch": 8.639876352395673,
"grad_norm": 0.44060713052749634,
"learning_rate": 2.7712529372372814e-06,
"loss": 1.8894,
"num_input_tokens_seen": 4196912,
"step": 5590
},
{
"epoch": 8.647604327666151,
"grad_norm": 0.40510526299476624,
"learning_rate": 2.7404725409652747e-06,
"loss": 2.1538,
"num_input_tokens_seen": 4200752,
"step": 5595
},
{
"epoch": 8.65533230293663,
"grad_norm": 0.41664084792137146,
"learning_rate": 2.7098541248635007e-06,
"loss": 1.8733,
"num_input_tokens_seen": 4204656,
"step": 5600
},
{
"epoch": 8.66306027820711,
"grad_norm": 0.5716778039932251,
"learning_rate": 2.679397911738868e-06,
"loss": 2.0691,
"num_input_tokens_seen": 4208112,
"step": 5605
},
{
"epoch": 8.670788253477589,
"grad_norm": 0.8286293148994446,
"learning_rate": 2.6491041232179352e-06,
"loss": 2.0023,
"num_input_tokens_seen": 4212016,
"step": 5610
},
{
"epoch": 8.678516228748068,
"grad_norm": 0.461260050535202,
"learning_rate": 2.618972979745324e-06,
"loss": 1.7957,
"num_input_tokens_seen": 4215856,
"step": 5615
},
{
"epoch": 8.686244204018546,
"grad_norm": 0.44486209750175476,
"learning_rate": 2.589004700582101e-06,
"loss": 1.9745,
"num_input_tokens_seen": 4219376,
"step": 5620
},
{
"epoch": 8.693972179289027,
"grad_norm": 0.4456034302711487,
"learning_rate": 2.559199503804183e-06,
"loss": 1.9242,
"num_input_tokens_seen": 4222640,
"step": 5625
},
{
"epoch": 8.701700154559505,
"grad_norm": 0.5015110969543457,
"learning_rate": 2.529557606300764e-06,
"loss": 1.4574,
"num_input_tokens_seen": 4225904,
"step": 5630
},
{
"epoch": 8.709428129829984,
"grad_norm": 0.45508769154548645,
"learning_rate": 2.5000792237727165e-06,
"loss": 2.1172,
"num_input_tokens_seen": 4230576,
"step": 5635
},
{
"epoch": 8.717156105100464,
"grad_norm": 0.4284096360206604,
"learning_rate": 2.470764570731038e-06,
"loss": 1.9638,
"num_input_tokens_seen": 4234544,
"step": 5640
},
{
"epoch": 8.724884080370943,
"grad_norm": 0.4829674959182739,
"learning_rate": 2.4416138604952952e-06,
"loss": 1.931,
"num_input_tokens_seen": 4238320,
"step": 5645
},
{
"epoch": 8.732612055641422,
"grad_norm": 0.5861620306968689,
"learning_rate": 2.4126273051920277e-06,
"loss": 1.562,
"num_input_tokens_seen": 4241840,
"step": 5650
},
{
"epoch": 8.7403400309119,
"grad_norm": 0.4229719340801239,
"learning_rate": 2.383805115753279e-06,
"loss": 1.8194,
"num_input_tokens_seen": 4245232,
"step": 5655
},
{
"epoch": 8.74806800618238,
"grad_norm": 0.47962188720703125,
"learning_rate": 2.355147501914981e-06,
"loss": 2.1567,
"num_input_tokens_seen": 4249072,
"step": 5660
},
{
"epoch": 8.75579598145286,
"grad_norm": 0.40144792199134827,
"learning_rate": 2.326654672215503e-06,
"loss": 1.7857,
"num_input_tokens_seen": 4252336,
"step": 5665
},
{
"epoch": 8.763523956723338,
"grad_norm": 0.5309076905250549,
"learning_rate": 2.298326833994069e-06,
"loss": 1.8052,
"num_input_tokens_seen": 4256176,
"step": 5670
},
{
"epoch": 8.771251931993817,
"grad_norm": 0.4937804937362671,
"learning_rate": 2.270164193389296e-06,
"loss": 1.9206,
"num_input_tokens_seen": 4259696,
"step": 5675
},
{
"epoch": 8.778979907264297,
"grad_norm": 0.5504066348075867,
"learning_rate": 2.2421669553376654e-06,
"loss": 2.0706,
"num_input_tokens_seen": 4263088,
"step": 5680
},
{
"epoch": 8.786707882534776,
"grad_norm": 0.5018023252487183,
"learning_rate": 2.214335323572045e-06,
"loss": 1.7451,
"num_input_tokens_seen": 4266864,
"step": 5685
},
{
"epoch": 8.794435857805254,
"grad_norm": 0.5152173638343811,
"learning_rate": 2.1866695006202086e-06,
"loss": 1.722,
"num_input_tokens_seen": 4270704,
"step": 5690
},
{
"epoch": 8.802163833075735,
"grad_norm": 0.5058028101921082,
"learning_rate": 2.15916968780335e-06,
"loss": 1.8042,
"num_input_tokens_seen": 4273904,
"step": 5695
},
{
"epoch": 8.809891808346213,
"grad_norm": 0.4861660599708557,
"learning_rate": 2.1318360852346285e-06,
"loss": 1.737,
"num_input_tokens_seen": 4278192,
"step": 5700
},
{
"epoch": 8.817619783616692,
"grad_norm": 0.5702298283576965,
"learning_rate": 2.1046688918177128e-06,
"loss": 1.8842,
"num_input_tokens_seen": 4281520,
"step": 5705
},
{
"epoch": 8.825347758887172,
"grad_norm": 0.46663275361061096,
"learning_rate": 2.077668305245317e-06,
"loss": 1.6112,
"num_input_tokens_seen": 4285040,
"step": 5710
},
{
"epoch": 8.833075734157651,
"grad_norm": 0.4473027288913727,
"learning_rate": 2.050834521997802e-06,
"loss": 1.8675,
"num_input_tokens_seen": 4288432,
"step": 5715
},
{
"epoch": 8.84080370942813,
"grad_norm": 0.4665966033935547,
"learning_rate": 2.024167737341684e-06,
"loss": 1.7568,
"num_input_tokens_seen": 4291696,
"step": 5720
},
{
"epoch": 8.848531684698608,
"grad_norm": 0.5533444285392761,
"learning_rate": 1.99766814532828e-06,
"loss": 2.436,
"num_input_tokens_seen": 4295536,
"step": 5725
},
{
"epoch": 8.856259659969089,
"grad_norm": 0.5889791250228882,
"learning_rate": 1.9713359387922378e-06,
"loss": 1.6759,
"num_input_tokens_seen": 4299248,
"step": 5730
},
{
"epoch": 8.863987635239567,
"grad_norm": 0.5730096697807312,
"learning_rate": 1.9451713093501855e-06,
"loss": 2.1711,
"num_input_tokens_seen": 4303088,
"step": 5735
},
{
"epoch": 8.871715610510046,
"grad_norm": 0.534789502620697,
"learning_rate": 1.9191744473992913e-06,
"loss": 1.5128,
"num_input_tokens_seen": 4306736,
"step": 5740
},
{
"epoch": 8.879443585780525,
"grad_norm": 0.4847296178340912,
"learning_rate": 1.8933455421159014e-06,
"loss": 1.7468,
"num_input_tokens_seen": 4310384,
"step": 5745
},
{
"epoch": 8.887171561051005,
"grad_norm": 0.5766510963439941,
"learning_rate": 1.8676847814541654e-06,
"loss": 1.6574,
"num_input_tokens_seen": 4313648,
"step": 5750
},
{
"epoch": 8.894899536321484,
"grad_norm": 0.5227906703948975,
"learning_rate": 1.8421923521446587e-06,
"loss": 1.7965,
"num_input_tokens_seen": 4317488,
"step": 5755
},
{
"epoch": 8.902627511591962,
"grad_norm": 0.4841407537460327,
"learning_rate": 1.8168684396930285e-06,
"loss": 2.2404,
"num_input_tokens_seen": 4321392,
"step": 5760
},
{
"epoch": 8.910355486862443,
"grad_norm": 0.46578606963157654,
"learning_rate": 1.7917132283786386e-06,
"loss": 1.6578,
"num_input_tokens_seen": 4325360,
"step": 5765
},
{
"epoch": 8.918083462132921,
"grad_norm": 0.48235538601875305,
"learning_rate": 1.7667269012532406e-06,
"loss": 2.2661,
"num_input_tokens_seen": 4328752,
"step": 5770
},
{
"epoch": 8.9258114374034,
"grad_norm": 0.4867675304412842,
"learning_rate": 1.7419096401396357e-06,
"loss": 2.2521,
"num_input_tokens_seen": 4332464,
"step": 5775
},
{
"epoch": 8.933539412673879,
"grad_norm": 0.4980892241001129,
"learning_rate": 1.7172616256303288e-06,
"loss": 1.8225,
"num_input_tokens_seen": 4336496,
"step": 5780
},
{
"epoch": 8.94126738794436,
"grad_norm": 0.42616257071495056,
"learning_rate": 1.6927830370862736e-06,
"loss": 1.645,
"num_input_tokens_seen": 4340208,
"step": 5785
},
{
"epoch": 8.948995363214838,
"grad_norm": 0.44497111439704895,
"learning_rate": 1.6684740526354853e-06,
"loss": 1.8208,
"num_input_tokens_seen": 4343472,
"step": 5790
},
{
"epoch": 8.956723338485316,
"grad_norm": 0.4740997850894928,
"learning_rate": 1.6443348491718274e-06,
"loss": 1.9233,
"num_input_tokens_seen": 4347376,
"step": 5795
},
{
"epoch": 8.964451313755795,
"grad_norm": 0.5786256194114685,
"learning_rate": 1.6203656023536629e-06,
"loss": 1.871,
"num_input_tokens_seen": 4351152,
"step": 5800
},
{
"epoch": 8.972179289026275,
"grad_norm": 0.5951398015022278,
"learning_rate": 1.5965664866026047e-06,
"loss": 1.6606,
"num_input_tokens_seen": 4355184,
"step": 5805
},
{
"epoch": 8.979907264296754,
"grad_norm": 0.5134670734405518,
"learning_rate": 1.57293767510224e-06,
"loss": 1.8239,
"num_input_tokens_seen": 4358640,
"step": 5810
},
{
"epoch": 8.987635239567233,
"grad_norm": 0.3904499411582947,
"learning_rate": 1.5494793397968694e-06,
"loss": 1.918,
"num_input_tokens_seen": 4362288,
"step": 5815
},
{
"epoch": 8.995363214837713,
"grad_norm": 0.4581204354763031,
"learning_rate": 1.5261916513902603e-06,
"loss": 1.7761,
"num_input_tokens_seen": 4366768,
"step": 5820
},
{
"epoch": 9.003091190108192,
"grad_norm": 0.4983641505241394,
"learning_rate": 1.5030747793443989e-06,
"loss": 1.991,
"num_input_tokens_seen": 4369936,
"step": 5825
},
{
"epoch": 9.01081916537867,
"grad_norm": 0.45775794982910156,
"learning_rate": 1.4801288918782574e-06,
"loss": 1.9089,
"num_input_tokens_seen": 4374288,
"step": 5830
},
{
"epoch": 9.013910355486862,
"eval_loss": 1.9623879194259644,
"eval_runtime": 9.8422,
"eval_samples_per_second": 58.422,
"eval_steps_per_second": 7.315,
"num_input_tokens_seen": 4375824,
"step": 5832
},
{
"epoch": 9.018547140649149,
"grad_norm": 0.5283871293067932,
"learning_rate": 1.4573541559665754e-06,
"loss": 1.6235,
"num_input_tokens_seen": 4378064,
"step": 5835
},
{
"epoch": 9.02627511591963,
"grad_norm": 0.49816635251045227,
"learning_rate": 1.4347507373386331e-06,
"loss": 1.7438,
"num_input_tokens_seen": 4381712,
"step": 5840
},
{
"epoch": 9.034003091190108,
"grad_norm": 0.5023284554481506,
"learning_rate": 1.412318800477072e-06,
"loss": 2.245,
"num_input_tokens_seen": 4385424,
"step": 5845
},
{
"epoch": 9.041731066460587,
"grad_norm": 0.4554198086261749,
"learning_rate": 1.3900585086166513e-06,
"loss": 2.0172,
"num_input_tokens_seen": 4389776,
"step": 5850
},
{
"epoch": 9.049459041731067,
"grad_norm": 0.5572301745414734,
"learning_rate": 1.3679700237431203e-06,
"loss": 1.9297,
"num_input_tokens_seen": 4393488,
"step": 5855
},
{
"epoch": 9.057187017001546,
"grad_norm": 0.4626171290874481,
"learning_rate": 1.3460535065919738e-06,
"loss": 1.8209,
"num_input_tokens_seen": 4397968,
"step": 5860
},
{
"epoch": 9.064914992272024,
"grad_norm": 0.5806289315223694,
"learning_rate": 1.324309116647346e-06,
"loss": 2.168,
"num_input_tokens_seen": 4401936,
"step": 5865
},
{
"epoch": 9.072642967542503,
"grad_norm": 0.39930564165115356,
"learning_rate": 1.3027370121408034e-06,
"loss": 2.0342,
"num_input_tokens_seen": 4405328,
"step": 5870
},
{
"epoch": 9.080370942812984,
"grad_norm": 0.5544669032096863,
"learning_rate": 1.2813373500502128e-06,
"loss": 1.7005,
"num_input_tokens_seen": 4408976,
"step": 5875
},
{
"epoch": 9.088098918083462,
"grad_norm": 0.6343225240707397,
"learning_rate": 1.2601102860986008e-06,
"loss": 1.5891,
"num_input_tokens_seen": 4413136,
"step": 5880
},
{
"epoch": 9.09582689335394,
"grad_norm": 0.4393240213394165,
"learning_rate": 1.2390559747530062e-06,
"loss": 1.9156,
"num_input_tokens_seen": 4416784,
"step": 5885
},
{
"epoch": 9.103554868624421,
"grad_norm": 0.43060049414634705,
"learning_rate": 1.2181745692233766e-06,
"loss": 1.9855,
"num_input_tokens_seen": 4420688,
"step": 5890
},
{
"epoch": 9.1112828438949,
"grad_norm": 0.506955087184906,
"learning_rate": 1.1974662214614379e-06,
"loss": 2.047,
"num_input_tokens_seen": 4424656,
"step": 5895
},
{
"epoch": 9.119010819165378,
"grad_norm": 0.43633195757865906,
"learning_rate": 1.1769310821595907e-06,
"loss": 2.0666,
"num_input_tokens_seen": 4428432,
"step": 5900
},
{
"epoch": 9.126738794435857,
"grad_norm": 0.4248088598251343,
"learning_rate": 1.156569300749827e-06,
"loss": 1.4644,
"num_input_tokens_seen": 4431440,
"step": 5905
},
{
"epoch": 9.134466769706338,
"grad_norm": 0.44746243953704834,
"learning_rate": 1.1363810254026108e-06,
"loss": 1.9164,
"num_input_tokens_seen": 4435344,
"step": 5910
},
{
"epoch": 9.142194744976816,
"grad_norm": 0.5325887799263,
"learning_rate": 1.1163664030258536e-06,
"loss": 1.8609,
"num_input_tokens_seen": 4439184,
"step": 5915
},
{
"epoch": 9.149922720247295,
"grad_norm": 0.6899775266647339,
"learning_rate": 1.0965255792637768e-06,
"loss": 1.7784,
"num_input_tokens_seen": 4442832,
"step": 5920
},
{
"epoch": 9.157650695517773,
"grad_norm": 0.509699821472168,
"learning_rate": 1.0768586984959167e-06,
"loss": 1.7023,
"num_input_tokens_seen": 4446672,
"step": 5925
},
{
"epoch": 9.165378670788254,
"grad_norm": 0.5541631579399109,
"learning_rate": 1.0573659038360301e-06,
"loss": 1.8195,
"num_input_tokens_seen": 4450576,
"step": 5930
},
{
"epoch": 9.173106646058732,
"grad_norm": 0.4798594117164612,
"learning_rate": 1.0380473371310762e-06,
"loss": 1.8747,
"num_input_tokens_seen": 4454160,
"step": 5935
},
{
"epoch": 9.180834621329211,
"grad_norm": 0.5438898801803589,
"learning_rate": 1.0189031389601672e-06,
"loss": 1.7346,
"num_input_tokens_seen": 4457168,
"step": 5940
},
{
"epoch": 9.188562596599692,
"grad_norm": 0.5284126996994019,
"learning_rate": 9.999334486335636e-07,
"loss": 2.1073,
"num_input_tokens_seen": 4460752,
"step": 5945
},
{
"epoch": 9.19629057187017,
"grad_norm": 0.4561935067176819,
"learning_rate": 9.81138404191645e-07,
"loss": 1.9123,
"num_input_tokens_seen": 4464400,
"step": 5950
},
{
"epoch": 9.204018547140649,
"grad_norm": 0.4085952341556549,
"learning_rate": 9.625181424039147e-07,
"loss": 1.7345,
"num_input_tokens_seen": 4467856,
"step": 5955
},
{
"epoch": 9.211746522411127,
"grad_norm": 0.48208120465278625,
"learning_rate": 9.440727987679976e-07,
"loss": 2.0286,
"num_input_tokens_seen": 4471696,
"step": 5960
},
{
"epoch": 9.219474497681608,
"grad_norm": 0.5661913752555847,
"learning_rate": 9.25802507508669e-07,
"loss": 2.1419,
"num_input_tokens_seen": 4475472,
"step": 5965
},
{
"epoch": 9.227202472952087,
"grad_norm": 0.4500444531440735,
"learning_rate": 9.077074015768516e-07,
"loss": 1.5232,
"num_input_tokens_seen": 4478864,
"step": 5970
},
{
"epoch": 9.234930448222565,
"grad_norm": 0.4850940704345703,
"learning_rate": 8.897876126486793e-07,
"loss": 1.8175,
"num_input_tokens_seen": 4482896,
"step": 5975
},
{
"epoch": 9.242658423493046,
"grad_norm": 0.5343092083930969,
"learning_rate": 8.720432711245064e-07,
"loss": 2.2061,
"num_input_tokens_seen": 4486544,
"step": 5980
},
{
"epoch": 9.250386398763524,
"grad_norm": 0.5756357312202454,
"learning_rate": 8.544745061279891e-07,
"loss": 2.0404,
"num_input_tokens_seen": 4490576,
"step": 5985
},
{
"epoch": 9.258114374034003,
"grad_norm": 0.49154385924339294,
"learning_rate": 8.370814455051279e-07,
"loss": 2.0676,
"num_input_tokens_seen": 4494224,
"step": 5990
},
{
"epoch": 9.265842349304481,
"grad_norm": 0.5070616006851196,
"learning_rate": 8.198642158233377e-07,
"loss": 1.7215,
"num_input_tokens_seen": 4498064,
"step": 5995
},
{
"epoch": 9.273570324574962,
"grad_norm": 0.531429648399353,
"learning_rate": 8.028229423705375e-07,
"loss": 2.2743,
"num_input_tokens_seen": 4501776,
"step": 6000
},
{
"epoch": 9.28129829984544,
"grad_norm": 0.4259873032569885,
"learning_rate": 7.859577491542259e-07,
"loss": 1.6607,
"num_input_tokens_seen": 4505296,
"step": 6005
},
{
"epoch": 9.28902627511592,
"grad_norm": 0.6347371935844421,
"learning_rate": 7.692687589005876e-07,
"loss": 1.7177,
"num_input_tokens_seen": 4509136,
"step": 6010
},
{
"epoch": 9.2967542503864,
"grad_norm": 0.550849974155426,
"learning_rate": 7.527560930535971e-07,
"loss": 1.6047,
"num_input_tokens_seen": 4512912,
"step": 6015
},
{
"epoch": 9.304482225656878,
"grad_norm": 0.4533255100250244,
"learning_rate": 7.364198717741355e-07,
"loss": 1.7645,
"num_input_tokens_seen": 4516944,
"step": 6020
},
{
"epoch": 9.312210200927357,
"grad_norm": 0.49000996351242065,
"learning_rate": 7.20260213939114e-07,
"loss": 1.9519,
"num_input_tokens_seen": 4520208,
"step": 6025
},
{
"epoch": 9.319938176197835,
"grad_norm": 0.5339989066123962,
"learning_rate": 7.042772371406131e-07,
"loss": 2.0867,
"num_input_tokens_seen": 4524176,
"step": 6030
},
{
"epoch": 9.327666151468316,
"grad_norm": 0.4784291386604309,
"learning_rate": 6.884710576850306e-07,
"loss": 1.6407,
"num_input_tokens_seen": 4527632,
"step": 6035
},
{
"epoch": 9.335394126738795,
"grad_norm": 0.45697057247161865,
"learning_rate": 6.728417905922074e-07,
"loss": 2.3036,
"num_input_tokens_seen": 4531408,
"step": 6040
},
{
"epoch": 9.343122102009273,
"grad_norm": 0.4117041528224945,
"learning_rate": 6.573895495946447e-07,
"loss": 1.8848,
"num_input_tokens_seen": 4535184,
"step": 6045
},
{
"epoch": 9.350850077279752,
"grad_norm": 0.5256591439247131,
"learning_rate": 6.421144471366103e-07,
"loss": 1.9648,
"num_input_tokens_seen": 4539536,
"step": 6050
},
{
"epoch": 9.358578052550232,
"grad_norm": 0.4190206825733185,
"learning_rate": 6.270165943733807e-07,
"loss": 1.7893,
"num_input_tokens_seen": 4543184,
"step": 6055
},
{
"epoch": 9.36630602782071,
"grad_norm": 0.4589918851852417,
"learning_rate": 6.120961011703924e-07,
"loss": 1.8888,
"num_input_tokens_seen": 4546896,
"step": 6060
},
{
"epoch": 9.37403400309119,
"grad_norm": 0.5328862071037292,
"learning_rate": 5.973530761024582e-07,
"loss": 1.7838,
"num_input_tokens_seen": 4550736,
"step": 6065
},
{
"epoch": 9.38176197836167,
"grad_norm": 0.40755224227905273,
"learning_rate": 5.827876264529741e-07,
"loss": 1.8243,
"num_input_tokens_seen": 4554512,
"step": 6070
},
{
"epoch": 9.389489953632149,
"grad_norm": 0.43924909830093384,
"learning_rate": 5.683998582131395e-07,
"loss": 1.7699,
"num_input_tokens_seen": 4558288,
"step": 6075
},
{
"epoch": 9.397217928902627,
"grad_norm": 0.43658119440078735,
"learning_rate": 5.541898760811848e-07,
"loss": 1.9572,
"num_input_tokens_seen": 4562576,
"step": 6080
},
{
"epoch": 9.404945904173106,
"grad_norm": 0.468034565448761,
"learning_rate": 5.401577834616145e-07,
"loss": 1.5804,
"num_input_tokens_seen": 4566352,
"step": 6085
},
{
"epoch": 9.412673879443586,
"grad_norm": 0.5616998076438904,
"learning_rate": 5.26303682464438e-07,
"loss": 1.8576,
"num_input_tokens_seen": 4570384,
"step": 6090
},
{
"epoch": 9.420401854714065,
"grad_norm": 0.6615525484085083,
"learning_rate": 5.126276739044617e-07,
"loss": 1.7841,
"num_input_tokens_seen": 4574032,
"step": 6095
},
{
"epoch": 9.428129829984544,
"grad_norm": 0.4598679542541504,
"learning_rate": 4.991298573005038e-07,
"loss": 1.718,
"num_input_tokens_seen": 4578000,
"step": 6100
},
{
"epoch": 9.435857805255024,
"grad_norm": 0.4575481116771698,
"learning_rate": 4.858103308747225e-07,
"loss": 1.6118,
"num_input_tokens_seen": 4581904,
"step": 6105
},
{
"epoch": 9.443585780525503,
"grad_norm": 0.5747670531272888,
"learning_rate": 4.726691915518694e-07,
"loss": 1.9901,
"num_input_tokens_seen": 4585552,
"step": 6110
},
{
"epoch": 9.451313755795981,
"grad_norm": 0.5150761604309082,
"learning_rate": 4.597065349585844e-07,
"loss": 1.843,
"num_input_tokens_seen": 4589264,
"step": 6115
},
{
"epoch": 9.45904173106646,
"grad_norm": 0.4432414174079895,
"learning_rate": 4.4692245542272417e-07,
"loss": 1.8543,
"num_input_tokens_seen": 4593552,
"step": 6120
},
{
"epoch": 9.46676970633694,
"grad_norm": 0.4179718792438507,
"learning_rate": 4.3431704597264313e-07,
"loss": 1.9304,
"num_input_tokens_seen": 4597264,
"step": 6125
},
{
"epoch": 9.474497681607419,
"grad_norm": 0.439662903547287,
"learning_rate": 4.218903983365469e-07,
"loss": 1.8635,
"num_input_tokens_seen": 4600976,
"step": 6130
},
{
"epoch": 9.482225656877898,
"grad_norm": 0.6849305033683777,
"learning_rate": 4.096426029417982e-07,
"loss": 1.86,
"num_input_tokens_seen": 4604240,
"step": 6135
},
{
"epoch": 9.489953632148378,
"grad_norm": 0.39302560687065125,
"learning_rate": 3.975737489142845e-07,
"loss": 1.5986,
"num_input_tokens_seen": 4608016,
"step": 6140
},
{
"epoch": 9.497681607418857,
"grad_norm": 0.5208292007446289,
"learning_rate": 3.8568392407774544e-07,
"loss": 1.4805,
"num_input_tokens_seen": 4611664,
"step": 6145
},
{
"epoch": 9.505409582689335,
"grad_norm": 0.6550456285476685,
"learning_rate": 3.7397321495314666e-07,
"loss": 1.7417,
"num_input_tokens_seen": 4615760,
"step": 6150
},
{
"epoch": 9.513137557959814,
"grad_norm": 0.4527139663696289,
"learning_rate": 3.624417067580543e-07,
"loss": 1.9414,
"num_input_tokens_seen": 4619472,
"step": 6155
},
{
"epoch": 9.51468315301391,
"eval_loss": 1.9586824178695679,
"eval_runtime": 9.8493,
"eval_samples_per_second": 58.38,
"eval_steps_per_second": 7.31,
"num_input_tokens_seen": 4620240,
"step": 6156
},
{
"epoch": 9.520865533230294,
"grad_norm": 0.46713271737098694,
"learning_rate": 3.5108948340600024e-07,
"loss": 2.0021,
"num_input_tokens_seen": 4623248,
"step": 6160
},
{
"epoch": 9.528593508500773,
"grad_norm": 0.5319985747337341,
"learning_rate": 3.399166275058874e-07,
"loss": 2.0089,
"num_input_tokens_seen": 4627152,
"step": 6165
},
{
"epoch": 9.536321483771252,
"grad_norm": 0.5472551584243774,
"learning_rate": 3.289232203613768e-07,
"loss": 2.0052,
"num_input_tokens_seen": 4630992,
"step": 6170
},
{
"epoch": 9.54404945904173,
"grad_norm": 0.3944055438041687,
"learning_rate": 3.181093419702991e-07,
"loss": 1.6991,
"num_input_tokens_seen": 4634512,
"step": 6175
},
{
"epoch": 9.55177743431221,
"grad_norm": 0.4235127866268158,
"learning_rate": 3.074750710240798e-07,
"loss": 1.965,
"num_input_tokens_seen": 4638160,
"step": 6180
},
{
"epoch": 9.55950540958269,
"grad_norm": 0.5224214196205139,
"learning_rate": 2.97020484907154e-07,
"loss": 1.5174,
"num_input_tokens_seen": 4641232,
"step": 6185
},
{
"epoch": 9.567233384853168,
"grad_norm": 0.48956823348999023,
"learning_rate": 2.8674565969641633e-07,
"loss": 1.9404,
"num_input_tokens_seen": 4645456,
"step": 6190
},
{
"epoch": 9.574961360123648,
"grad_norm": 0.3983800411224365,
"learning_rate": 2.766506701606525e-07,
"loss": 2.085,
"num_input_tokens_seen": 4649360,
"step": 6195
},
{
"epoch": 9.582689335394127,
"grad_norm": 0.46103692054748535,
"learning_rate": 2.667355897600088e-07,
"loss": 1.7913,
"num_input_tokens_seen": 4653392,
"step": 6200
},
{
"epoch": 9.590417310664606,
"grad_norm": 0.48668143153190613,
"learning_rate": 2.5700049064545373e-07,
"loss": 1.7479,
"num_input_tokens_seen": 4656720,
"step": 6205
},
{
"epoch": 9.598145285935084,
"grad_norm": 0.4885518252849579,
"learning_rate": 2.4744544365824793e-07,
"loss": 2.0991,
"num_input_tokens_seen": 4660048,
"step": 6210
},
{
"epoch": 9.605873261205565,
"grad_norm": 0.5907134413719177,
"learning_rate": 2.3807051832943071e-07,
"loss": 1.9784,
"num_input_tokens_seen": 4663824,
"step": 6215
},
{
"epoch": 9.613601236476043,
"grad_norm": 0.469518780708313,
"learning_rate": 2.288757828793231e-07,
"loss": 1.9894,
"num_input_tokens_seen": 4667408,
"step": 6220
},
{
"epoch": 9.621329211746522,
"grad_norm": 0.37500569224357605,
"learning_rate": 2.1986130421701445e-07,
"loss": 1.8386,
"num_input_tokens_seen": 4671120,
"step": 6225
},
{
"epoch": 9.629057187017002,
"grad_norm": 0.569591760635376,
"learning_rate": 2.1102714793989063e-07,
"loss": 1.5365,
"num_input_tokens_seen": 4674768,
"step": 6230
},
{
"epoch": 9.636785162287481,
"grad_norm": 0.5992723703384399,
"learning_rate": 2.0237337833315384e-07,
"loss": 1.9869,
"num_input_tokens_seen": 4678416,
"step": 6235
},
{
"epoch": 9.64451313755796,
"grad_norm": 0.4954376816749573,
"learning_rate": 1.9390005836934232e-07,
"loss": 1.5203,
"num_input_tokens_seen": 4682000,
"step": 6240
},
{
"epoch": 9.652241112828438,
"grad_norm": 0.46121746301651,
"learning_rate": 1.8560724970789202e-07,
"loss": 1.7654,
"num_input_tokens_seen": 4685584,
"step": 6245
},
{
"epoch": 9.659969088098919,
"grad_norm": 0.43793782591819763,
"learning_rate": 1.7749501269467282e-07,
"loss": 1.5659,
"num_input_tokens_seen": 4688912,
"step": 6250
},
{
"epoch": 9.667697063369397,
"grad_norm": 0.4031641483306885,
"learning_rate": 1.6956340636155033e-07,
"loss": 2.1642,
"num_input_tokens_seen": 4692176,
"step": 6255
},
{
"epoch": 9.675425038639876,
"grad_norm": 0.5168727040290833,
"learning_rate": 1.6181248842597196e-07,
"loss": 1.7044,
"num_input_tokens_seen": 4696336,
"step": 6260
},
{
"epoch": 9.683153013910356,
"grad_norm": 0.390639066696167,
"learning_rate": 1.5424231529052035e-07,
"loss": 1.7278,
"num_input_tokens_seen": 4700496,
"step": 6265
},
{
"epoch": 9.690880989180835,
"grad_norm": 0.376815527677536,
"learning_rate": 1.4685294204253296e-07,
"loss": 1.6015,
"num_input_tokens_seen": 4703952,
"step": 6270
},
{
"epoch": 9.698608964451314,
"grad_norm": 0.43562066555023193,
"learning_rate": 1.3964442245367193e-07,
"loss": 2.5439,
"num_input_tokens_seen": 4707984,
"step": 6275
},
{
"epoch": 9.706336939721792,
"grad_norm": 0.8875200748443604,
"learning_rate": 1.3261680897955765e-07,
"loss": 2.0866,
"num_input_tokens_seen": 4711632,
"step": 6280
},
{
"epoch": 9.714064914992273,
"grad_norm": 0.5330743789672852,
"learning_rate": 1.2577015275937188e-07,
"loss": 1.9099,
"num_input_tokens_seen": 4715088,
"step": 6285
},
{
"epoch": 9.721792890262751,
"grad_norm": 0.4825518727302551,
"learning_rate": 1.1910450361548587e-07,
"loss": 1.708,
"num_input_tokens_seen": 4718864,
"step": 6290
},
{
"epoch": 9.72952086553323,
"grad_norm": 0.531721830368042,
"learning_rate": 1.1261991005311334e-07,
"loss": 1.7221,
"num_input_tokens_seen": 4721808,
"step": 6295
},
{
"epoch": 9.737248840803709,
"grad_norm": 0.5634981989860535,
"learning_rate": 1.0631641925993307e-07,
"loss": 1.8634,
"num_input_tokens_seen": 4725520,
"step": 6300
},
{
"epoch": 9.744976816074189,
"grad_norm": 0.5600470900535583,
"learning_rate": 1.0019407710576967e-07,
"loss": 1.5357,
"num_input_tokens_seen": 4729616,
"step": 6305
},
{
"epoch": 9.752704791344668,
"grad_norm": 0.4644871652126312,
"learning_rate": 9.425292814224107e-08,
"loss": 1.7279,
"num_input_tokens_seen": 4733200,
"step": 6310
},
{
"epoch": 9.760432766615146,
"grad_norm": 0.4587952792644501,
"learning_rate": 8.849301560244494e-08,
"loss": 1.9601,
"num_input_tokens_seen": 4736784,
"step": 6315
},
{
"epoch": 9.768160741885627,
"grad_norm": 0.4871441423892975,
"learning_rate": 8.291438140064223e-08,
"loss": 1.9649,
"num_input_tokens_seen": 4740688,
"step": 6320
},
{
"epoch": 9.775888717156105,
"grad_norm": 0.38628652691841125,
"learning_rate": 7.751706613194909e-08,
"loss": 2.15,
"num_input_tokens_seen": 4744400,
"step": 6325
},
{
"epoch": 9.783616692426584,
"grad_norm": 0.46603044867515564,
"learning_rate": 7.230110907204269e-08,
"loss": 1.904,
"num_input_tokens_seen": 4747984,
"step": 6330
},
{
"epoch": 9.791344667697063,
"grad_norm": 0.6037775278091431,
"learning_rate": 6.726654817687805e-08,
"loss": 1.4794,
"num_input_tokens_seen": 4752080,
"step": 6335
},
{
"epoch": 9.799072642967543,
"grad_norm": 0.5536606907844543,
"learning_rate": 6.241342008241336e-08,
"loss": 2.1317,
"num_input_tokens_seen": 4756112,
"step": 6340
},
{
"epoch": 9.806800618238022,
"grad_norm": 0.49296537041664124,
"learning_rate": 5.774176010432952e-08,
"loss": 1.6372,
"num_input_tokens_seen": 4759632,
"step": 6345
},
{
"epoch": 9.8145285935085,
"grad_norm": 0.4221416413784027,
"learning_rate": 5.3251602237797126e-08,
"loss": 2.142,
"num_input_tokens_seen": 4763344,
"step": 6350
},
{
"epoch": 9.82225656877898,
"grad_norm": 0.4220753610134125,
"learning_rate": 4.8942979157201586e-08,
"loss": 1.7889,
"num_input_tokens_seen": 4766736,
"step": 6355
},
{
"epoch": 9.82998454404946,
"grad_norm": 0.4357856512069702,
"learning_rate": 4.481592221593223e-08,
"loss": 2.2538,
"num_input_tokens_seen": 4770576,
"step": 6360
},
{
"epoch": 9.837712519319938,
"grad_norm": 0.45150232315063477,
"learning_rate": 4.087046144613249e-08,
"loss": 2.0142,
"num_input_tokens_seen": 4774352,
"step": 6365
},
{
"epoch": 9.845440494590417,
"grad_norm": 0.3519168198108673,
"learning_rate": 3.7106625558494534e-08,
"loss": 1.8235,
"num_input_tokens_seen": 4778256,
"step": 6370
},
{
"epoch": 9.853168469860897,
"grad_norm": 0.44991186261177063,
"learning_rate": 3.352444194203996e-08,
"loss": 1.8635,
"num_input_tokens_seen": 4782544,
"step": 6375
},
{
"epoch": 9.860896445131376,
"grad_norm": 0.4822252690792084,
"learning_rate": 3.012393666393665e-08,
"loss": 2.2841,
"num_input_tokens_seen": 4786000,
"step": 6380
},
{
"epoch": 9.868624420401854,
"grad_norm": 0.4566858410835266,
"learning_rate": 2.690513446929055e-08,
"loss": 1.936,
"num_input_tokens_seen": 4789584,
"step": 6385
},
{
"epoch": 9.876352395672335,
"grad_norm": 0.5575479865074158,
"learning_rate": 2.3868058780979198e-08,
"loss": 2.047,
"num_input_tokens_seen": 4793424,
"step": 6390
},
{
"epoch": 9.884080370942813,
"grad_norm": 0.41943296790122986,
"learning_rate": 2.101273169946849e-08,
"loss": 1.5426,
"num_input_tokens_seen": 4796880,
"step": 6395
},
{
"epoch": 9.891808346213292,
"grad_norm": 0.4676728844642639,
"learning_rate": 1.833917400266838e-08,
"loss": 2.3344,
"num_input_tokens_seen": 4800848,
"step": 6400
},
{
"epoch": 9.89953632148377,
"grad_norm": 0.5828633308410645,
"learning_rate": 1.5847405145769102e-08,
"loss": 1.692,
"num_input_tokens_seen": 4804432,
"step": 6405
},
{
"epoch": 9.907264296754251,
"grad_norm": 0.38569849729537964,
"learning_rate": 1.353744326109685e-08,
"loss": 1.9036,
"num_input_tokens_seen": 4808592,
"step": 6410
},
{
"epoch": 9.91499227202473,
"grad_norm": 0.5596803426742554,
"learning_rate": 1.1409305157999983e-08,
"loss": 1.97,
"num_input_tokens_seen": 4812240,
"step": 6415
},
{
"epoch": 9.922720247295208,
"grad_norm": 0.5255624651908875,
"learning_rate": 9.463006322707468e-09,
"loss": 1.9222,
"num_input_tokens_seen": 4815952,
"step": 6420
},
{
"epoch": 9.930448222565687,
"grad_norm": 0.5257275700569153,
"learning_rate": 7.698560918226183e-09,
"loss": 1.9876,
"num_input_tokens_seen": 4819856,
"step": 6425
},
{
"epoch": 9.938176197836167,
"grad_norm": 0.5967457294464111,
"learning_rate": 6.115981784229896e-09,
"loss": 2.1135,
"num_input_tokens_seen": 4823568,
"step": 6430
},
{
"epoch": 9.945904173106646,
"grad_norm": 0.4129837453365326,
"learning_rate": 4.715280436981551e-09,
"loss": 1.6423,
"num_input_tokens_seen": 4827344,
"step": 6435
},
{
"epoch": 9.953632148377125,
"grad_norm": 0.5613869428634644,
"learning_rate": 3.4964670692277934e-09,
"loss": 2.0167,
"num_input_tokens_seen": 4831184,
"step": 6440
},
{
"epoch": 9.961360123647605,
"grad_norm": 0.5644814372062683,
"learning_rate": 2.4595505501434633e-09,
"loss": 2.3765,
"num_input_tokens_seen": 4835152,
"step": 6445
},
{
"epoch": 9.969088098918084,
"grad_norm": 0.4961523413658142,
"learning_rate": 1.6045384252594275e-09,
"loss": 1.8894,
"num_input_tokens_seen": 4838800,
"step": 6450
},
{
"epoch": 9.976816074188562,
"grad_norm": 0.6483544707298279,
"learning_rate": 9.314369164042936e-10,
"loss": 1.875,
"num_input_tokens_seen": 4842064,
"step": 6455
},
{
"epoch": 9.984544049459041,
"grad_norm": 0.44400128722190857,
"learning_rate": 4.402509216655526e-10,
"loss": 1.5224,
"num_input_tokens_seen": 4845776,
"step": 6460
},
{
"epoch": 9.992272024729521,
"grad_norm": 0.46386170387268066,
"learning_rate": 1.3098401535072137e-10,
"loss": 1.6659,
"num_input_tokens_seen": 4849296,
"step": 6465
},
{
"epoch": 10.0,
"grad_norm": 0.5652341842651367,
"learning_rate": 3.6384479595863445e-12,
"loss": 2.0513,
"num_input_tokens_seen": 4852608,
"step": 6470
},
{
"epoch": 10.0,
"num_input_tokens_seen": 4852608,
"step": 6470,
"total_flos": 2.1851074501646746e+17,
"train_loss": 3.185584316518981,
"train_runtime": 2159.8077,
"train_samples_per_second": 23.956,
"train_steps_per_second": 2.996
}
],
"logging_steps": 5,
"max_steps": 6470,
"num_input_tokens_seen": 4852608,
"num_train_epochs": 10,
"save_steps": 324,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1851074501646746e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}