{
"best_global_step": 5058,
"best_metric": 0.18196314573287964,
"best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_rte_1754652145/checkpoint-5058",
"epoch": 10.0,
"eval_steps": 281,
"global_step": 5610,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008912655971479501,
"grad_norm": 2.640630006790161,
"learning_rate": 3.5650623885918005e-07,
"loss": 11.4646,
"num_input_tokens_seen": 3168,
"step": 5
},
{
"epoch": 0.017825311942959002,
"grad_norm": 2.218021869659424,
"learning_rate": 8.021390374331552e-07,
"loss": 11.5893,
"num_input_tokens_seen": 6272,
"step": 10
},
{
"epoch": 0.026737967914438502,
"grad_norm": 2.5081377029418945,
"learning_rate": 1.2477718360071302e-06,
"loss": 11.4013,
"num_input_tokens_seen": 10144,
"step": 15
},
{
"epoch": 0.035650623885918005,
"grad_norm": 2.2221429347991943,
"learning_rate": 1.6934046345811053e-06,
"loss": 11.4758,
"num_input_tokens_seen": 13536,
"step": 20
},
{
"epoch": 0.044563279857397504,
"grad_norm": 2.0649468898773193,
"learning_rate": 2.1390374331550802e-06,
"loss": 11.3651,
"num_input_tokens_seen": 16128,
"step": 25
},
{
"epoch": 0.053475935828877004,
"grad_norm": 2.8866872787475586,
"learning_rate": 2.5846702317290554e-06,
"loss": 11.4745,
"num_input_tokens_seen": 18784,
"step": 30
},
{
"epoch": 0.062388591800356503,
"grad_norm": 2.091982841491699,
"learning_rate": 3.0303030303030305e-06,
"loss": 11.5827,
"num_input_tokens_seen": 22336,
"step": 35
},
{
"epoch": 0.07130124777183601,
"grad_norm": 2.3348405361175537,
"learning_rate": 3.4759358288770056e-06,
"loss": 11.5288,
"num_input_tokens_seen": 25408,
"step": 40
},
{
"epoch": 0.08021390374331551,
"grad_norm": 2.6505680084228516,
"learning_rate": 3.92156862745098e-06,
"loss": 11.3178,
"num_input_tokens_seen": 27968,
"step": 45
},
{
"epoch": 0.08912655971479501,
"grad_norm": 2.241699457168579,
"learning_rate": 4.3672014260249555e-06,
"loss": 11.2239,
"num_input_tokens_seen": 30752,
"step": 50
},
{
"epoch": 0.09803921568627451,
"grad_norm": 2.149437427520752,
"learning_rate": 4.812834224598931e-06,
"loss": 11.3085,
"num_input_tokens_seen": 33376,
"step": 55
},
{
"epoch": 0.10695187165775401,
"grad_norm": 2.2778542041778564,
"learning_rate": 5.258467023172906e-06,
"loss": 11.3491,
"num_input_tokens_seen": 37280,
"step": 60
},
{
"epoch": 0.11586452762923351,
"grad_norm": 2.1370596885681152,
"learning_rate": 5.704099821746881e-06,
"loss": 11.0892,
"num_input_tokens_seen": 40640,
"step": 65
},
{
"epoch": 0.12477718360071301,
"grad_norm": 2.1671693325042725,
"learning_rate": 6.149732620320856e-06,
"loss": 11.4158,
"num_input_tokens_seen": 44128,
"step": 70
},
{
"epoch": 0.13368983957219252,
"grad_norm": 2.1441879272460938,
"learning_rate": 6.59536541889483e-06,
"loss": 11.0242,
"num_input_tokens_seen": 47648,
"step": 75
},
{
"epoch": 0.14260249554367202,
"grad_norm": 2.2412052154541016,
"learning_rate": 7.040998217468805e-06,
"loss": 10.8869,
"num_input_tokens_seen": 50816,
"step": 80
},
{
"epoch": 0.15151515151515152,
"grad_norm": 2.3039534091949463,
"learning_rate": 7.4866310160427806e-06,
"loss": 11.031,
"num_input_tokens_seen": 53728,
"step": 85
},
{
"epoch": 0.16042780748663102,
"grad_norm": 2.3388712406158447,
"learning_rate": 7.932263814616755e-06,
"loss": 10.9959,
"num_input_tokens_seen": 57056,
"step": 90
},
{
"epoch": 0.16934046345811052,
"grad_norm": 2.324082851409912,
"learning_rate": 8.377896613190733e-06,
"loss": 10.8078,
"num_input_tokens_seen": 59808,
"step": 95
},
{
"epoch": 0.17825311942959002,
"grad_norm": 2.343338966369629,
"learning_rate": 8.823529411764707e-06,
"loss": 10.6152,
"num_input_tokens_seen": 62848,
"step": 100
},
{
"epoch": 0.18716577540106952,
"grad_norm": 2.170870542526245,
"learning_rate": 9.269162210338681e-06,
"loss": 10.7252,
"num_input_tokens_seen": 65856,
"step": 105
},
{
"epoch": 0.19607843137254902,
"grad_norm": 2.1757500171661377,
"learning_rate": 9.714795008912657e-06,
"loss": 10.702,
"num_input_tokens_seen": 68672,
"step": 110
},
{
"epoch": 0.20499108734402852,
"grad_norm": 2.319809675216675,
"learning_rate": 1.0160427807486631e-05,
"loss": 10.7596,
"num_input_tokens_seen": 71840,
"step": 115
},
{
"epoch": 0.21390374331550802,
"grad_norm": 2.85723876953125,
"learning_rate": 1.0606060606060607e-05,
"loss": 10.6329,
"num_input_tokens_seen": 74624,
"step": 120
},
{
"epoch": 0.22281639928698752,
"grad_norm": 2.3634092807769775,
"learning_rate": 1.1051693404634582e-05,
"loss": 10.694,
"num_input_tokens_seen": 78080,
"step": 125
},
{
"epoch": 0.23172905525846701,
"grad_norm": 2.2238471508026123,
"learning_rate": 1.1497326203208558e-05,
"loss": 10.4616,
"num_input_tokens_seen": 81408,
"step": 130
},
{
"epoch": 0.24064171122994651,
"grad_norm": 2.2605199813842773,
"learning_rate": 1.1942959001782532e-05,
"loss": 10.2704,
"num_input_tokens_seen": 84192,
"step": 135
},
{
"epoch": 0.24955436720142601,
"grad_norm": 2.334446668624878,
"learning_rate": 1.2388591800356506e-05,
"loss": 10.1217,
"num_input_tokens_seen": 87264,
"step": 140
},
{
"epoch": 0.25846702317290554,
"grad_norm": 2.1008996963500977,
"learning_rate": 1.2834224598930484e-05,
"loss": 9.9505,
"num_input_tokens_seen": 90336,
"step": 145
},
{
"epoch": 0.26737967914438504,
"grad_norm": 2.1396262645721436,
"learning_rate": 1.3279857397504458e-05,
"loss": 9.9953,
"num_input_tokens_seen": 93760,
"step": 150
},
{
"epoch": 0.27629233511586454,
"grad_norm": 1.9306892156600952,
"learning_rate": 1.3725490196078432e-05,
"loss": 10.0273,
"num_input_tokens_seen": 97120,
"step": 155
},
{
"epoch": 0.28520499108734404,
"grad_norm": 2.2339835166931152,
"learning_rate": 1.4171122994652408e-05,
"loss": 9.8194,
"num_input_tokens_seen": 100160,
"step": 160
},
{
"epoch": 0.29411764705882354,
"grad_norm": 2.1370038986206055,
"learning_rate": 1.4616755793226383e-05,
"loss": 9.7234,
"num_input_tokens_seen": 103136,
"step": 165
},
{
"epoch": 0.30303030303030304,
"grad_norm": 2.2204971313476562,
"learning_rate": 1.5062388591800359e-05,
"loss": 9.4737,
"num_input_tokens_seen": 105696,
"step": 170
},
{
"epoch": 0.31194295900178254,
"grad_norm": 2.0649607181549072,
"learning_rate": 1.5508021390374333e-05,
"loss": 9.299,
"num_input_tokens_seen": 108800,
"step": 175
},
{
"epoch": 0.32085561497326204,
"grad_norm": 2.166388511657715,
"learning_rate": 1.5953654188948307e-05,
"loss": 9.3115,
"num_input_tokens_seen": 111808,
"step": 180
},
{
"epoch": 0.32976827094474154,
"grad_norm": 2.0328972339630127,
"learning_rate": 1.639928698752228e-05,
"loss": 9.3707,
"num_input_tokens_seen": 114944,
"step": 185
},
{
"epoch": 0.33868092691622104,
"grad_norm": 2.443514347076416,
"learning_rate": 1.684491978609626e-05,
"loss": 8.9663,
"num_input_tokens_seen": 118112,
"step": 190
},
{
"epoch": 0.34759358288770054,
"grad_norm": 2.0616464614868164,
"learning_rate": 1.7290552584670233e-05,
"loss": 8.9474,
"num_input_tokens_seen": 120896,
"step": 195
},
{
"epoch": 0.35650623885918004,
"grad_norm": 2.2355945110321045,
"learning_rate": 1.7736185383244208e-05,
"loss": 8.6637,
"num_input_tokens_seen": 123904,
"step": 200
},
{
"epoch": 0.36541889483065954,
"grad_norm": 2.044498920440674,
"learning_rate": 1.8181818181818182e-05,
"loss": 8.6211,
"num_input_tokens_seen": 127008,
"step": 205
},
{
"epoch": 0.37433155080213903,
"grad_norm": 2.1903281211853027,
"learning_rate": 1.862745098039216e-05,
"loss": 8.4521,
"num_input_tokens_seen": 129984,
"step": 210
},
{
"epoch": 0.38324420677361853,
"grad_norm": 2.253875255584717,
"learning_rate": 1.9073083778966134e-05,
"loss": 8.4635,
"num_input_tokens_seen": 133152,
"step": 215
},
{
"epoch": 0.39215686274509803,
"grad_norm": 2.23766827583313,
"learning_rate": 1.951871657754011e-05,
"loss": 8.4012,
"num_input_tokens_seen": 136096,
"step": 220
},
{
"epoch": 0.40106951871657753,
"grad_norm": 2.4483225345611572,
"learning_rate": 1.9964349376114083e-05,
"loss": 8.019,
"num_input_tokens_seen": 139136,
"step": 225
},
{
"epoch": 0.40998217468805703,
"grad_norm": 2.141366958618164,
"learning_rate": 2.0409982174688057e-05,
"loss": 8.2362,
"num_input_tokens_seen": 142080,
"step": 230
},
{
"epoch": 0.41889483065953653,
"grad_norm": 2.049794912338257,
"learning_rate": 2.0855614973262035e-05,
"loss": 8.3716,
"num_input_tokens_seen": 145824,
"step": 235
},
{
"epoch": 0.42780748663101603,
"grad_norm": 2.0718395709991455,
"learning_rate": 2.130124777183601e-05,
"loss": 7.742,
"num_input_tokens_seen": 149280,
"step": 240
},
{
"epoch": 0.43672014260249553,
"grad_norm": 2.133650064468384,
"learning_rate": 2.1746880570409983e-05,
"loss": 7.7851,
"num_input_tokens_seen": 152544,
"step": 245
},
{
"epoch": 0.44563279857397503,
"grad_norm": 2.0652763843536377,
"learning_rate": 2.2192513368983957e-05,
"loss": 7.4258,
"num_input_tokens_seen": 156416,
"step": 250
},
{
"epoch": 0.45454545454545453,
"grad_norm": 1.8414599895477295,
"learning_rate": 2.2638146167557932e-05,
"loss": 7.1734,
"num_input_tokens_seen": 159712,
"step": 255
},
{
"epoch": 0.46345811051693403,
"grad_norm": 2.0587077140808105,
"learning_rate": 2.308377896613191e-05,
"loss": 6.8801,
"num_input_tokens_seen": 162400,
"step": 260
},
{
"epoch": 0.47237076648841353,
"grad_norm": 1.8652368783950806,
"learning_rate": 2.3529411764705884e-05,
"loss": 7.0346,
"num_input_tokens_seen": 166048,
"step": 265
},
{
"epoch": 0.48128342245989303,
"grad_norm": 1.6939105987548828,
"learning_rate": 2.3975044563279858e-05,
"loss": 6.5944,
"num_input_tokens_seen": 168576,
"step": 270
},
{
"epoch": 0.49019607843137253,
"grad_norm": 1.9076436758041382,
"learning_rate": 2.4420677361853832e-05,
"loss": 6.7204,
"num_input_tokens_seen": 172320,
"step": 275
},
{
"epoch": 0.49910873440285203,
"grad_norm": 1.65463387966156,
"learning_rate": 2.4866310160427807e-05,
"loss": 6.7786,
"num_input_tokens_seen": 175424,
"step": 280
},
{
"epoch": 0.5008912655971479,
"eval_loss": 6.320615768432617,
"eval_runtime": 4.2449,
"eval_samples_per_second": 58.659,
"eval_steps_per_second": 14.841,
"num_input_tokens_seen": 176032,
"step": 281
},
{
"epoch": 0.5080213903743316,
"grad_norm": 2.3921778202056885,
"learning_rate": 2.5311942959001784e-05,
"loss": 6.4536,
"num_input_tokens_seen": 178016,
"step": 285
},
{
"epoch": 0.5169340463458111,
"grad_norm": 1.5767650604248047,
"learning_rate": 2.575757575757576e-05,
"loss": 6.7214,
"num_input_tokens_seen": 181888,
"step": 290
},
{
"epoch": 0.5258467023172906,
"grad_norm": 1.6409612894058228,
"learning_rate": 2.6203208556149733e-05,
"loss": 6.0779,
"num_input_tokens_seen": 184960,
"step": 295
},
{
"epoch": 0.5347593582887701,
"grad_norm": 1.5643103122711182,
"learning_rate": 2.6648841354723707e-05,
"loss": 5.8182,
"num_input_tokens_seen": 187488,
"step": 300
},
{
"epoch": 0.5436720142602496,
"grad_norm": 1.7608228921890259,
"learning_rate": 2.7094474153297685e-05,
"loss": 6.2207,
"num_input_tokens_seen": 191232,
"step": 305
},
{
"epoch": 0.5525846702317291,
"grad_norm": 1.5273125171661377,
"learning_rate": 2.754010695187166e-05,
"loss": 5.8064,
"num_input_tokens_seen": 194272,
"step": 310
},
{
"epoch": 0.5614973262032086,
"grad_norm": 1.3673619031906128,
"learning_rate": 2.7985739750445633e-05,
"loss": 5.7312,
"num_input_tokens_seen": 197184,
"step": 315
},
{
"epoch": 0.5704099821746881,
"grad_norm": 1.3092046976089478,
"learning_rate": 2.8431372549019608e-05,
"loss": 5.36,
"num_input_tokens_seen": 199840,
"step": 320
},
{
"epoch": 0.5793226381461676,
"grad_norm": 1.5241113901138306,
"learning_rate": 2.8877005347593582e-05,
"loss": 5.6509,
"num_input_tokens_seen": 203008,
"step": 325
},
{
"epoch": 0.5882352941176471,
"grad_norm": 1.2224637269973755,
"learning_rate": 2.932263814616756e-05,
"loss": 5.3917,
"num_input_tokens_seen": 206400,
"step": 330
},
{
"epoch": 0.5971479500891266,
"grad_norm": 1.1933878660202026,
"learning_rate": 2.9768270944741534e-05,
"loss": 5.2637,
"num_input_tokens_seen": 209440,
"step": 335
},
{
"epoch": 0.6060606060606061,
"grad_norm": 1.1900209188461304,
"learning_rate": 3.0213903743315508e-05,
"loss": 5.4659,
"num_input_tokens_seen": 212736,
"step": 340
},
{
"epoch": 0.6149732620320856,
"grad_norm": 1.3414652347564697,
"learning_rate": 3.065953654188948e-05,
"loss": 5.324,
"num_input_tokens_seen": 216096,
"step": 345
},
{
"epoch": 0.6238859180035651,
"grad_norm": 1.1607022285461426,
"learning_rate": 3.110516934046346e-05,
"loss": 5.2878,
"num_input_tokens_seen": 219200,
"step": 350
},
{
"epoch": 0.6327985739750446,
"grad_norm": 1.153671383857727,
"learning_rate": 3.155080213903743e-05,
"loss": 4.9444,
"num_input_tokens_seen": 221952,
"step": 355
},
{
"epoch": 0.6417112299465241,
"grad_norm": 1.139689326286316,
"learning_rate": 3.199643493761141e-05,
"loss": 4.891,
"num_input_tokens_seen": 225376,
"step": 360
},
{
"epoch": 0.6506238859180036,
"grad_norm": 1.0437010526657104,
"learning_rate": 3.2442067736185386e-05,
"loss": 4.9337,
"num_input_tokens_seen": 228736,
"step": 365
},
{
"epoch": 0.6595365418894831,
"grad_norm": 1.2458043098449707,
"learning_rate": 3.288770053475936e-05,
"loss": 4.7023,
"num_input_tokens_seen": 231648,
"step": 370
},
{
"epoch": 0.6684491978609626,
"grad_norm": 1.0675745010375977,
"learning_rate": 3.3333333333333335e-05,
"loss": 4.594,
"num_input_tokens_seen": 234976,
"step": 375
},
{
"epoch": 0.6773618538324421,
"grad_norm": 1.0720183849334717,
"learning_rate": 3.3778966131907306e-05,
"loss": 4.82,
"num_input_tokens_seen": 238368,
"step": 380
},
{
"epoch": 0.6862745098039216,
"grad_norm": 1.044710636138916,
"learning_rate": 3.4224598930481284e-05,
"loss": 4.5563,
"num_input_tokens_seen": 241440,
"step": 385
},
{
"epoch": 0.6951871657754011,
"grad_norm": 1.0943641662597656,
"learning_rate": 3.467023172905526e-05,
"loss": 4.5969,
"num_input_tokens_seen": 244448,
"step": 390
},
{
"epoch": 0.7040998217468806,
"grad_norm": 1.082396149635315,
"learning_rate": 3.511586452762923e-05,
"loss": 4.3737,
"num_input_tokens_seen": 246880,
"step": 395
},
{
"epoch": 0.7130124777183601,
"grad_norm": 1.1410984992980957,
"learning_rate": 3.556149732620321e-05,
"loss": 4.3754,
"num_input_tokens_seen": 250240,
"step": 400
},
{
"epoch": 0.7219251336898396,
"grad_norm": 1.1234968900680542,
"learning_rate": 3.600713012477718e-05,
"loss": 4.3313,
"num_input_tokens_seen": 253184,
"step": 405
},
{
"epoch": 0.7308377896613191,
"grad_norm": 1.2889167070388794,
"learning_rate": 3.645276292335116e-05,
"loss": 4.1676,
"num_input_tokens_seen": 255968,
"step": 410
},
{
"epoch": 0.7397504456327986,
"grad_norm": 0.9909088611602783,
"learning_rate": 3.6898395721925136e-05,
"loss": 4.1332,
"num_input_tokens_seen": 258688,
"step": 415
},
{
"epoch": 0.7486631016042781,
"grad_norm": 1.12320077419281,
"learning_rate": 3.734402852049911e-05,
"loss": 4.1551,
"num_input_tokens_seen": 262240,
"step": 420
},
{
"epoch": 0.7575757575757576,
"grad_norm": 1.1998422145843506,
"learning_rate": 3.7789661319073085e-05,
"loss": 4.1066,
"num_input_tokens_seen": 265952,
"step": 425
},
{
"epoch": 0.7664884135472371,
"grad_norm": 1.6095830202102661,
"learning_rate": 3.8235294117647055e-05,
"loss": 4.3427,
"num_input_tokens_seen": 269312,
"step": 430
},
{
"epoch": 0.7754010695187166,
"grad_norm": 1.1973387002944946,
"learning_rate": 3.868092691622103e-05,
"loss": 4.0544,
"num_input_tokens_seen": 272128,
"step": 435
},
{
"epoch": 0.7843137254901961,
"grad_norm": 1.13062584400177,
"learning_rate": 3.912655971479501e-05,
"loss": 4.0524,
"num_input_tokens_seen": 275552,
"step": 440
},
{
"epoch": 0.7932263814616756,
"grad_norm": 1.095451831817627,
"learning_rate": 3.957219251336899e-05,
"loss": 3.9436,
"num_input_tokens_seen": 278720,
"step": 445
},
{
"epoch": 0.8021390374331551,
"grad_norm": 0.9978923201560974,
"learning_rate": 4.0017825311942966e-05,
"loss": 3.6121,
"num_input_tokens_seen": 281536,
"step": 450
},
{
"epoch": 0.8110516934046346,
"grad_norm": 1.036067008972168,
"learning_rate": 4.046345811051694e-05,
"loss": 3.8184,
"num_input_tokens_seen": 284672,
"step": 455
},
{
"epoch": 0.8199643493761141,
"grad_norm": 0.8888896107673645,
"learning_rate": 4.0909090909090915e-05,
"loss": 3.7184,
"num_input_tokens_seen": 288416,
"step": 460
},
{
"epoch": 0.8288770053475936,
"grad_norm": 0.8882661461830139,
"learning_rate": 4.1354723707664886e-05,
"loss": 3.6762,
"num_input_tokens_seen": 291232,
"step": 465
},
{
"epoch": 0.8377896613190731,
"grad_norm": 1.3067046403884888,
"learning_rate": 4.180035650623886e-05,
"loss": 3.7256,
"num_input_tokens_seen": 294784,
"step": 470
},
{
"epoch": 0.8467023172905526,
"grad_norm": 1.1890095472335815,
"learning_rate": 4.224598930481284e-05,
"loss": 3.4105,
"num_input_tokens_seen": 297632,
"step": 475
},
{
"epoch": 0.8556149732620321,
"grad_norm": 0.9891613125801086,
"learning_rate": 4.269162210338681e-05,
"loss": 3.2745,
"num_input_tokens_seen": 300416,
"step": 480
},
{
"epoch": 0.8645276292335116,
"grad_norm": 0.9931787848472595,
"learning_rate": 4.313725490196079e-05,
"loss": 3.1763,
"num_input_tokens_seen": 303232,
"step": 485
},
{
"epoch": 0.8734402852049911,
"grad_norm": 0.8934875130653381,
"learning_rate": 4.358288770053476e-05,
"loss": 3.2828,
"num_input_tokens_seen": 306144,
"step": 490
},
{
"epoch": 0.8823529411764706,
"grad_norm": 1.265254259109497,
"learning_rate": 4.402852049910874e-05,
"loss": 3.2048,
"num_input_tokens_seen": 308576,
"step": 495
},
{
"epoch": 0.8912655971479501,
"grad_norm": 1.0396374464035034,
"learning_rate": 4.4474153297682716e-05,
"loss": 3.197,
"num_input_tokens_seen": 312000,
"step": 500
},
{
"epoch": 0.9001782531194296,
"grad_norm": 0.8916023373603821,
"learning_rate": 4.491978609625669e-05,
"loss": 2.9296,
"num_input_tokens_seen": 314848,
"step": 505
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.1076226234436035,
"learning_rate": 4.5365418894830664e-05,
"loss": 3.0006,
"num_input_tokens_seen": 318112,
"step": 510
},
{
"epoch": 0.9180035650623886,
"grad_norm": 1.0348403453826904,
"learning_rate": 4.5811051693404635e-05,
"loss": 3.2128,
"num_input_tokens_seen": 321152,
"step": 515
},
{
"epoch": 0.9269162210338681,
"grad_norm": 0.9368388056755066,
"learning_rate": 4.625668449197861e-05,
"loss": 2.6109,
"num_input_tokens_seen": 323552,
"step": 520
},
{
"epoch": 0.9358288770053476,
"grad_norm": 0.9401017427444458,
"learning_rate": 4.670231729055259e-05,
"loss": 2.6761,
"num_input_tokens_seen": 326112,
"step": 525
},
{
"epoch": 0.9447415329768271,
"grad_norm": 1.0641679763793945,
"learning_rate": 4.714795008912656e-05,
"loss": 2.7169,
"num_input_tokens_seen": 328800,
"step": 530
},
{
"epoch": 0.9536541889483066,
"grad_norm": 1.1021815538406372,
"learning_rate": 4.759358288770054e-05,
"loss": 3.1103,
"num_input_tokens_seen": 332512,
"step": 535
},
{
"epoch": 0.9625668449197861,
"grad_norm": 0.8338248133659363,
"learning_rate": 4.803921568627452e-05,
"loss": 2.3949,
"num_input_tokens_seen": 335360,
"step": 540
},
{
"epoch": 0.9714795008912656,
"grad_norm": 1.311125636100769,
"learning_rate": 4.848484848484849e-05,
"loss": 2.9292,
"num_input_tokens_seen": 339488,
"step": 545
},
{
"epoch": 0.9803921568627451,
"grad_norm": 0.993326723575592,
"learning_rate": 4.8930481283422465e-05,
"loss": 2.2154,
"num_input_tokens_seen": 342176,
"step": 550
},
{
"epoch": 0.9893048128342246,
"grad_norm": 1.0523838996887207,
"learning_rate": 4.9376114081996436e-05,
"loss": 2.6187,
"num_input_tokens_seen": 345568,
"step": 555
},
{
"epoch": 0.9982174688057041,
"grad_norm": 1.2461936473846436,
"learning_rate": 4.9821746880570414e-05,
"loss": 2.0606,
"num_input_tokens_seen": 348000,
"step": 560
},
{
"epoch": 1.0017825311942958,
"eval_loss": 2.2780375480651855,
"eval_runtime": 4.2492,
"eval_samples_per_second": 58.599,
"eval_steps_per_second": 14.826,
"num_input_tokens_seen": 349200,
"step": 562
},
{
"epoch": 1.0071301247771836,
"grad_norm": 0.8942297697067261,
"learning_rate": 4.99999564446608e-05,
"loss": 2.598,
"num_input_tokens_seen": 350960,
"step": 565
},
{
"epoch": 1.0160427807486632,
"grad_norm": 0.9582070708274841,
"learning_rate": 4.9999690273693036e-05,
"loss": 2.0767,
"num_input_tokens_seen": 354288,
"step": 570
},
{
"epoch": 1.0249554367201426,
"grad_norm": 1.0559678077697754,
"learning_rate": 4.999918213174131e-05,
"loss": 2.1588,
"num_input_tokens_seen": 357648,
"step": 575
},
{
"epoch": 1.0338680926916222,
"grad_norm": 1.2316597700119019,
"learning_rate": 4.9998432023723915e-05,
"loss": 2.0186,
"num_input_tokens_seen": 360496,
"step": 580
},
{
"epoch": 1.0427807486631016,
"grad_norm": 1.1366970539093018,
"learning_rate": 4.9997439956901106e-05,
"loss": 2.0455,
"num_input_tokens_seen": 363376,
"step": 585
},
{
"epoch": 1.0516934046345812,
"grad_norm": 1.041366696357727,
"learning_rate": 4.999620594087507e-05,
"loss": 1.995,
"num_input_tokens_seen": 366320,
"step": 590
},
{
"epoch": 1.0606060606060606,
"grad_norm": 0.9262757301330566,
"learning_rate": 4.999472998758978e-05,
"loss": 1.912,
"num_input_tokens_seen": 369488,
"step": 595
},
{
"epoch": 1.0695187165775402,
"grad_norm": 1.3618220090866089,
"learning_rate": 4.999301211133095e-05,
"loss": 1.8174,
"num_input_tokens_seen": 372656,
"step": 600
},
{
"epoch": 1.0784313725490196,
"grad_norm": 0.9017401337623596,
"learning_rate": 4.999105232872582e-05,
"loss": 1.7304,
"num_input_tokens_seen": 376048,
"step": 605
},
{
"epoch": 1.0873440285204992,
"grad_norm": 1.131372332572937,
"learning_rate": 4.998885065874305e-05,
"loss": 2.0501,
"num_input_tokens_seen": 379472,
"step": 610
},
{
"epoch": 1.0962566844919786,
"grad_norm": 0.743751585483551,
"learning_rate": 4.9986407122692504e-05,
"loss": 1.6725,
"num_input_tokens_seen": 382288,
"step": 615
},
{
"epoch": 1.1051693404634582,
"grad_norm": 1.2746849060058594,
"learning_rate": 4.998372174422507e-05,
"loss": 1.5424,
"num_input_tokens_seen": 385392,
"step": 620
},
{
"epoch": 1.1140819964349375,
"grad_norm": 1.250909686088562,
"learning_rate": 4.998079454933244e-05,
"loss": 1.9679,
"num_input_tokens_seen": 389200,
"step": 625
},
{
"epoch": 1.1229946524064172,
"grad_norm": 0.8632287979125977,
"learning_rate": 4.99776255663468e-05,
"loss": 1.2718,
"num_input_tokens_seen": 391664,
"step": 630
},
{
"epoch": 1.1319073083778965,
"grad_norm": 0.773535966873169,
"learning_rate": 4.997421482594059e-05,
"loss": 1.3693,
"num_input_tokens_seen": 394416,
"step": 635
},
{
"epoch": 1.1408199643493762,
"grad_norm": 1.104138731956482,
"learning_rate": 4.997056236112625e-05,
"loss": 1.9817,
"num_input_tokens_seen": 399248,
"step": 640
},
{
"epoch": 1.1497326203208555,
"grad_norm": 0.7540408372879028,
"learning_rate": 4.9966668207255826e-05,
"loss": 1.2948,
"num_input_tokens_seen": 402032,
"step": 645
},
{
"epoch": 1.1586452762923352,
"grad_norm": 0.9450183510780334,
"learning_rate": 4.996253240202069e-05,
"loss": 1.2707,
"num_input_tokens_seen": 405296,
"step": 650
},
{
"epoch": 1.1675579322638145,
"grad_norm": 1.1226730346679688,
"learning_rate": 4.9958154985451114e-05,
"loss": 1.2088,
"num_input_tokens_seen": 408400,
"step": 655
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.9736111760139465,
"learning_rate": 4.995353599991595e-05,
"loss": 1.4309,
"num_input_tokens_seen": 412016,
"step": 660
},
{
"epoch": 1.1853832442067735,
"grad_norm": 0.9693507552146912,
"learning_rate": 4.994867549012215e-05,
"loss": 1.2743,
"num_input_tokens_seen": 415504,
"step": 665
},
{
"epoch": 1.1942959001782532,
"grad_norm": 1.0443888902664185,
"learning_rate": 4.99435735031144e-05,
"loss": 1.1155,
"num_input_tokens_seen": 418448,
"step": 670
},
{
"epoch": 1.2032085561497325,
"grad_norm": 1.0174163579940796,
"learning_rate": 4.993823008827465e-05,
"loss": 1.092,
"num_input_tokens_seen": 421168,
"step": 675
},
{
"epoch": 1.2121212121212122,
"grad_norm": 0.7569769620895386,
"learning_rate": 4.9932645297321555e-05,
"loss": 0.9307,
"num_input_tokens_seen": 423632,
"step": 680
},
{
"epoch": 1.2210338680926915,
"grad_norm": 0.7273694276809692,
"learning_rate": 4.9926819184310103e-05,
"loss": 0.9791,
"num_input_tokens_seen": 426640,
"step": 685
},
{
"epoch": 1.2299465240641712,
"grad_norm": 0.952115535736084,
"learning_rate": 4.9920751805631e-05,
"loss": 1.1522,
"num_input_tokens_seen": 430032,
"step": 690
},
{
"epoch": 1.2388591800356505,
"grad_norm": 1.1709868907928467,
"learning_rate": 4.991444322001014e-05,
"loss": 1.0973,
"num_input_tokens_seen": 433008,
"step": 695
},
{
"epoch": 1.2477718360071302,
"grad_norm": 0.6561676263809204,
"learning_rate": 4.99078934885081e-05,
"loss": 1.0868,
"num_input_tokens_seen": 436400,
"step": 700
},
{
"epoch": 1.2566844919786098,
"grad_norm": 0.8287897109985352,
"learning_rate": 4.990110267451944e-05,
"loss": 0.8352,
"num_input_tokens_seen": 439248,
"step": 705
},
{
"epoch": 1.2655971479500892,
"grad_norm": 0.9313675165176392,
"learning_rate": 4.989407084377218e-05,
"loss": 0.8707,
"num_input_tokens_seen": 442416,
"step": 710
},
{
"epoch": 1.2745098039215685,
"grad_norm": 0.9105520844459534,
"learning_rate": 4.988679806432712e-05,
"loss": 0.9153,
"num_input_tokens_seen": 445616,
"step": 715
},
{
"epoch": 1.2834224598930482,
"grad_norm": 0.7386419773101807,
"learning_rate": 4.9879284406577195e-05,
"loss": 0.7514,
"num_input_tokens_seen": 448528,
"step": 720
},
{
"epoch": 1.2923351158645278,
"grad_norm": 0.8464149236679077,
"learning_rate": 4.98715299432468e-05,
"loss": 0.897,
"num_input_tokens_seen": 451664,
"step": 725
},
{
"epoch": 1.3012477718360071,
"grad_norm": 0.7016708254814148,
"learning_rate": 4.986353474939106e-05,
"loss": 0.9608,
"num_input_tokens_seen": 455120,
"step": 730
},
{
"epoch": 1.3101604278074865,
"grad_norm": 0.7350292801856995,
"learning_rate": 4.9855298902395134e-05,
"loss": 0.8485,
"num_input_tokens_seen": 458352,
"step": 735
},
{
"epoch": 1.3190730837789661,
"grad_norm": 0.657071053981781,
"learning_rate": 4.9846822481973455e-05,
"loss": 0.9055,
"num_input_tokens_seen": 461488,
"step": 740
},
{
"epoch": 1.3279857397504458,
"grad_norm": 0.7406115531921387,
"learning_rate": 4.9838105570168946e-05,
"loss": 0.9068,
"num_input_tokens_seen": 464848,
"step": 745
},
{
"epoch": 1.3368983957219251,
"grad_norm": 0.9874480962753296,
"learning_rate": 4.982914825135224e-05,
"loss": 1.0902,
"num_input_tokens_seen": 468944,
"step": 750
},
{
"epoch": 1.3458110516934045,
"grad_norm": 0.7415845990180969,
"learning_rate": 4.981995061222087e-05,
"loss": 0.6795,
"num_input_tokens_seen": 471312,
"step": 755
},
{
"epoch": 1.3547237076648841,
"grad_norm": 0.6649575233459473,
"learning_rate": 4.98105127417984e-05,
"loss": 0.6273,
"num_input_tokens_seen": 474128,
"step": 760
},
{
"epoch": 1.3636363636363638,
"grad_norm": 1.0872315168380737,
"learning_rate": 4.9800834731433596e-05,
"loss": 0.5981,
"num_input_tokens_seen": 476592,
"step": 765
},
{
"epoch": 1.3725490196078431,
"grad_norm": 0.7500861287117004,
"learning_rate": 4.9790916674799526e-05,
"loss": 1.014,
"num_input_tokens_seen": 480240,
"step": 770
},
{
"epoch": 1.3814616755793225,
"grad_norm": 1.2134431600570679,
"learning_rate": 4.9780758667892656e-05,
"loss": 0.681,
"num_input_tokens_seen": 483472,
"step": 775
},
{
"epoch": 1.3903743315508021,
"grad_norm": 0.8633726835250854,
"learning_rate": 4.977036080903193e-05,
"loss": 0.6929,
"num_input_tokens_seen": 486768,
"step": 780
},
{
"epoch": 1.3992869875222818,
"grad_norm": 0.903477668762207,
"learning_rate": 4.975972319885779e-05,
"loss": 0.5834,
"num_input_tokens_seen": 489392,
"step": 785
},
{
"epoch": 1.4081996434937611,
"grad_norm": 0.7039727568626404,
"learning_rate": 4.974884594033123e-05,
"loss": 0.7406,
"num_input_tokens_seen": 492560,
"step": 790
},
{
"epoch": 1.4171122994652405,
"grad_norm": 0.9972723126411438,
"learning_rate": 4.9737729138732805e-05,
"loss": 0.5558,
"num_input_tokens_seen": 495344,
"step": 795
},
{
"epoch": 1.4260249554367201,
"grad_norm": 1.2662111520767212,
"learning_rate": 4.972637290166158e-05,
"loss": 0.6374,
"num_input_tokens_seen": 498128,
"step": 800
},
{
"epoch": 1.4349376114081998,
"grad_norm": 1.4038677215576172,
"learning_rate": 4.97147773390341e-05,
"loss": 0.8173,
"num_input_tokens_seen": 501488,
"step": 805
},
{
"epoch": 1.4438502673796791,
"grad_norm": 0.730514407157898,
"learning_rate": 4.9702942563083356e-05,
"loss": 0.5782,
"num_input_tokens_seen": 504272,
"step": 810
},
{
"epoch": 1.4527629233511585,
"grad_norm": 0.5917222499847412,
"learning_rate": 4.969086868835765e-05,
"loss": 0.4533,
"num_input_tokens_seen": 506672,
"step": 815
},
{
"epoch": 1.4616755793226381,
"grad_norm": 0.49027279019355774,
"learning_rate": 4.967855583171954e-05,
"loss": 0.4866,
"num_input_tokens_seen": 509232,
"step": 820
},
{
"epoch": 1.4705882352941178,
"grad_norm": 1.1144423484802246,
"learning_rate": 4.9666004112344656e-05,
"loss": 0.7116,
"num_input_tokens_seen": 512528,
"step": 825
},
{
"epoch": 1.4795008912655971,
"grad_norm": 0.6267158389091492,
"learning_rate": 4.965321365172057e-05,
"loss": 0.576,
"num_input_tokens_seen": 514896,
"step": 830
},
{
"epoch": 1.4884135472370765,
"grad_norm": 0.8494957089424133,
"learning_rate": 4.9640184573645646e-05,
"loss": 0.6064,
"num_input_tokens_seen": 518384,
"step": 835
},
{
"epoch": 1.4973262032085561,
"grad_norm": 1.1032313108444214,
"learning_rate": 4.962691700422778e-05,
"loss": 0.8595,
"num_input_tokens_seen": 522448,
"step": 840
},
{
"epoch": 1.5026737967914439,
"eval_loss": 0.5879648327827454,
"eval_runtime": 4.2487,
"eval_samples_per_second": 58.606,
"eval_steps_per_second": 14.828,
"num_input_tokens_seen": 524208,
"step": 843
},
{
"epoch": 1.5062388591800357,
"grad_norm": 0.7947481274604797,
"learning_rate": 4.9613411071883267e-05,
"loss": 0.4532,
"num_input_tokens_seen": 525264,
"step": 845
},
{
"epoch": 1.5151515151515151,
"grad_norm": 0.6550034284591675,
"learning_rate": 4.959966690733544e-05,
"loss": 0.7043,
"num_input_tokens_seen": 528528,
"step": 850
},
{
"epoch": 1.5240641711229945,
"grad_norm": 1.126085877418518,
"learning_rate": 4.958568464361353e-05,
"loss": 0.6396,
"num_input_tokens_seen": 531536,
"step": 855
},
{
"epoch": 1.5329768270944741,
"grad_norm": 0.6209072470664978,
"learning_rate": 4.9571464416051294e-05,
"loss": 0.5435,
"num_input_tokens_seen": 534704,
"step": 860
},
{
"epoch": 1.5418894830659537,
"grad_norm": 0.5790075063705444,
"learning_rate": 4.955700636228573e-05,
"loss": 0.359,
"num_input_tokens_seen": 537264,
"step": 865
},
{
"epoch": 1.5508021390374331,
"grad_norm": 0.9781410694122314,
"learning_rate": 4.954231062225576e-05,
"loss": 0.6823,
"num_input_tokens_seen": 541328,
"step": 870
},
{
"epoch": 1.5597147950089125,
"grad_norm": 0.7598072290420532,
"learning_rate": 4.9527377338200855e-05,
"loss": 0.4973,
"num_input_tokens_seen": 544496,
"step": 875
},
{
"epoch": 1.5686274509803921,
"grad_norm": 0.8549111485481262,
"learning_rate": 4.951220665465964e-05,
"loss": 0.6291,
"num_input_tokens_seen": 547696,
"step": 880
},
{
"epoch": 1.5775401069518717,
"grad_norm": 0.7234603762626648,
"learning_rate": 4.949679871846857e-05,
"loss": 0.4632,
"num_input_tokens_seen": 550416,
"step": 885
},
{
"epoch": 1.5864527629233511,
"grad_norm": 0.5888731479644775,
"learning_rate": 4.948115367876043e-05,
"loss": 0.5336,
"num_input_tokens_seen": 553968,
"step": 890
},
{
"epoch": 1.5953654188948305,
"grad_norm": 0.8173357844352722,
"learning_rate": 4.94652716869629e-05,
"loss": 0.3634,
"num_input_tokens_seen": 556656,
"step": 895
},
{
"epoch": 1.6042780748663101,
"grad_norm": 0.5093280673027039,
"learning_rate": 4.944915289679716e-05,
"loss": 0.3877,
"num_input_tokens_seen": 559536,
"step": 900
},
{
"epoch": 1.6131907308377897,
"grad_norm": 0.9982839226722717,
"learning_rate": 4.94327974642763e-05,
"loss": 0.5395,
"num_input_tokens_seen": 562704,
"step": 905
},
{
"epoch": 1.6221033868092691,
"grad_norm": 1.0210356712341309,
"learning_rate": 4.94162055477039e-05,
"loss": 0.5995,
"num_input_tokens_seen": 566352,
"step": 910
},
{
"epoch": 1.6310160427807485,
"grad_norm": 1.2152962684631348,
"learning_rate": 4.939937730767243e-05,
"loss": 0.5234,
"num_input_tokens_seen": 569584,
"step": 915
},
{
"epoch": 1.6399286987522281,
"grad_norm": 0.8112650513648987,
"learning_rate": 4.9382312907061755e-05,
"loss": 0.3781,
"num_input_tokens_seen": 571824,
"step": 920
},
{
"epoch": 1.6488413547237077,
"grad_norm": 0.8025038838386536,
"learning_rate": 4.9365012511037514e-05,
"loss": 0.5397,
"num_input_tokens_seen": 575248,
"step": 925
},
{
"epoch": 1.6577540106951871,
"grad_norm": 1.2283076047897339,
"learning_rate": 4.934747628704952e-05,
"loss": 0.4426,
"num_input_tokens_seen": 578032,
"step": 930
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.8238184452056885,
"learning_rate": 4.932970440483018e-05,
"loss": 0.4614,
"num_input_tokens_seen": 581744,
"step": 935
},
{
"epoch": 1.6755793226381461,
"grad_norm": 0.7958811521530151,
"learning_rate": 4.931169703639282e-05,
"loss": 0.4136,
"num_input_tokens_seen": 584880,
"step": 940
},
{
"epoch": 1.6844919786096257,
"grad_norm": 1.2087262868881226,
"learning_rate": 4.929345435603003e-05,
"loss": 0.4801,
"num_input_tokens_seen": 587856,
"step": 945
},
{
"epoch": 1.6934046345811051,
"grad_norm": 0.7868252992630005,
"learning_rate": 4.9274976540311956e-05,
"loss": 0.5347,
"num_input_tokens_seen": 590928,
"step": 950
},
{
"epoch": 1.7023172905525845,
"grad_norm": 0.9967821836471558,
"learning_rate": 4.9256263768084635e-05,
"loss": 0.37,
"num_input_tokens_seen": 594096,
"step": 955
},
{
"epoch": 1.7112299465240641,
"grad_norm": 0.8641761541366577,
"learning_rate": 4.923731622046823e-05,
"loss": 0.3977,
"num_input_tokens_seen": 597136,
"step": 960
},
{
"epoch": 1.7201426024955437,
"grad_norm": 0.6801542639732361,
"learning_rate": 4.9218134080855273e-05,
"loss": 0.5575,
"num_input_tokens_seen": 600912,
"step": 965
},
{
"epoch": 1.7290552584670231,
"grad_norm": 0.9356634616851807,
"learning_rate": 4.919871753490891e-05,
"loss": 0.5977,
"num_input_tokens_seen": 604240,
"step": 970
},
{
"epoch": 1.7379679144385025,
"grad_norm": 0.898560106754303,
"learning_rate": 4.917906677056111e-05,
"loss": 0.4074,
"num_input_tokens_seen": 607248,
"step": 975
},
{
"epoch": 1.7468805704099821,
"grad_norm": 0.7507029175758362,
"learning_rate": 4.9159181978010814e-05,
"loss": 0.4812,
"num_input_tokens_seen": 610736,
"step": 980
},
{
"epoch": 1.7557932263814617,
"grad_norm": 0.9444867372512817,
"learning_rate": 4.9139063349722113e-05,
"loss": 0.4682,
"num_input_tokens_seen": 614128,
"step": 985
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.9318161010742188,
"learning_rate": 4.911871108042241e-05,
"loss": 0.4571,
"num_input_tokens_seen": 617232,
"step": 990
},
{
"epoch": 1.7736185383244205,
"grad_norm": 0.7218228578567505,
"learning_rate": 4.909812536710048e-05,
"loss": 0.5007,
"num_input_tokens_seen": 620880,
"step": 995
},
{
"epoch": 1.7825311942959001,
"grad_norm": 0.7028499841690063,
"learning_rate": 4.9077306409004585e-05,
"loss": 0.6674,
"num_input_tokens_seen": 624368,
"step": 1000
},
{
"epoch": 1.7914438502673797,
"grad_norm": 0.5062604546546936,
"learning_rate": 4.9056254407640604e-05,
"loss": 0.3413,
"num_input_tokens_seen": 627152,
"step": 1005
},
{
"epoch": 1.8003565062388591,
"grad_norm": 0.49366044998168945,
"learning_rate": 4.903496956676998e-05,
"loss": 0.3736,
"num_input_tokens_seen": 629680,
"step": 1010
},
{
"epoch": 1.8092691622103387,
"grad_norm": 0.6387802958488464,
"learning_rate": 4.901345209240784e-05,
"loss": 0.3377,
"num_input_tokens_seen": 632848,
"step": 1015
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.8644296526908875,
"learning_rate": 4.8991702192820924e-05,
"loss": 0.4588,
"num_input_tokens_seen": 635920,
"step": 1020
},
{
"epoch": 1.8270944741532977,
"grad_norm": 0.4941517114639282,
"learning_rate": 4.896972007852563e-05,
"loss": 0.3705,
"num_input_tokens_seen": 639056,
"step": 1025
},
{
"epoch": 1.8360071301247771,
"grad_norm": 0.5460651516914368,
"learning_rate": 4.894750596228594e-05,
"loss": 0.3389,
"num_input_tokens_seen": 642192,
"step": 1030
},
{
"epoch": 1.8449197860962567,
"grad_norm": 0.7782461643218994,
"learning_rate": 4.8925060059111394e-05,
"loss": 0.4158,
"num_input_tokens_seen": 645488,
"step": 1035
},
{
"epoch": 1.8538324420677363,
"grad_norm": 0.5338404178619385,
"learning_rate": 4.890238258625496e-05,
"loss": 0.3644,
"num_input_tokens_seen": 648336,
"step": 1040
},
{
"epoch": 1.8627450980392157,
"grad_norm": 0.8528239727020264,
"learning_rate": 4.887947376321099e-05,
"loss": 0.3682,
"num_input_tokens_seen": 651696,
"step": 1045
},
{
"epoch": 1.8716577540106951,
"grad_norm": 0.4754684865474701,
"learning_rate": 4.885633381171304e-05,
"loss": 0.3467,
"num_input_tokens_seen": 654640,
"step": 1050
},
{
"epoch": 1.8805704099821747,
"grad_norm": 0.9799590110778809,
"learning_rate": 4.883296295573176e-05,
"loss": 0.511,
"num_input_tokens_seen": 658128,
"step": 1055
},
{
"epoch": 1.8894830659536543,
"grad_norm": 0.6689459085464478,
"learning_rate": 4.880936142147271e-05,
"loss": 0.3246,
"num_input_tokens_seen": 660848,
"step": 1060
},
{
"epoch": 1.8983957219251337,
"grad_norm": 0.7261871099472046,
"learning_rate": 4.878552943737418e-05,
"loss": 0.2685,
"num_input_tokens_seen": 663120,
"step": 1065
},
{
"epoch": 1.9073083778966131,
"grad_norm": 0.7026433944702148,
"learning_rate": 4.876146723410498e-05,
"loss": 0.3756,
"num_input_tokens_seen": 666288,
"step": 1070
},
{
"epoch": 1.9162210338680927,
"grad_norm": 1.4159960746765137,
"learning_rate": 4.873717504456219e-05,
"loss": 0.3687,
"num_input_tokens_seen": 669360,
"step": 1075
},
{
"epoch": 1.9251336898395723,
"grad_norm": 0.7870906591415405,
"learning_rate": 4.8712653103868916e-05,
"loss": 0.2532,
"num_input_tokens_seen": 671344,
"step": 1080
},
{
"epoch": 1.9340463458110517,
"grad_norm": 0.8793025612831116,
"learning_rate": 4.868790164937204e-05,
"loss": 0.3925,
"num_input_tokens_seen": 674672,
"step": 1085
},
{
"epoch": 1.9429590017825311,
"grad_norm": 0.40374019742012024,
"learning_rate": 4.8662920920639866e-05,
"loss": 0.3251,
"num_input_tokens_seen": 677968,
"step": 1090
},
{
"epoch": 1.9518716577540107,
"grad_norm": 0.5041529536247253,
"learning_rate": 4.8637711159459855e-05,
"loss": 0.3022,
"num_input_tokens_seen": 680560,
"step": 1095
},
{
"epoch": 1.9607843137254903,
"grad_norm": 1.0466898679733276,
"learning_rate": 4.8612272609836263e-05,
"loss": 0.3464,
"num_input_tokens_seen": 683824,
"step": 1100
},
{
"epoch": 1.9696969696969697,
"grad_norm": 0.8734254240989685,
"learning_rate": 4.858660551798778e-05,
"loss": 0.4663,
"num_input_tokens_seen": 687216,
"step": 1105
},
{
"epoch": 1.9786096256684491,
"grad_norm": 0.589005172252655,
"learning_rate": 4.856071013234513e-05,
"loss": 0.3396,
"num_input_tokens_seen": 690128,
"step": 1110
},
{
"epoch": 1.9875222816399287,
"grad_norm": 0.570462167263031,
"learning_rate": 4.85345867035487e-05,
"loss": 0.3839,
"num_input_tokens_seen": 693232,
"step": 1115
},
{
"epoch": 1.9964349376114083,
"grad_norm": 0.9086877107620239,
"learning_rate": 4.8508235484446095e-05,
"loss": 0.4327,
"num_input_tokens_seen": 696880,
"step": 1120
},
{
"epoch": 2.0035650623885917,
"eval_loss": 0.37957677245140076,
"eval_runtime": 4.2451,
"eval_samples_per_second": 58.656,
"eval_steps_per_second": 14.841,
"num_input_tokens_seen": 699264,
"step": 1124
},
{
"epoch": 2.0053475935828877,
"grad_norm": 0.9719306826591492,
"learning_rate": 4.8481656730089695e-05,
"loss": 0.4008,
"num_input_tokens_seen": 700096,
"step": 1125
},
{
"epoch": 2.014260249554367,
"grad_norm": 0.9481471180915833,
"learning_rate": 4.8454850697734174e-05,
"loss": 0.4113,
"num_input_tokens_seen": 703360,
"step": 1130
},
{
"epoch": 2.0231729055258465,
"grad_norm": 0.7257654666900635,
"learning_rate": 4.842781764683403e-05,
"loss": 0.3966,
"num_input_tokens_seen": 706624,
"step": 1135
},
{
"epoch": 2.0320855614973263,
"grad_norm": 0.8015730977058411,
"learning_rate": 4.8400557839041064e-05,
"loss": 0.3069,
"num_input_tokens_seen": 709472,
"step": 1140
},
{
"epoch": 2.0409982174688057,
"grad_norm": 0.43969354033470154,
"learning_rate": 4.837307153820184e-05,
"loss": 0.337,
"num_input_tokens_seen": 713152,
"step": 1145
},
{
"epoch": 2.049910873440285,
"grad_norm": 0.934760570526123,
"learning_rate": 4.8345359010355155e-05,
"loss": 0.3539,
"num_input_tokens_seen": 716480,
"step": 1150
},
{
"epoch": 2.0588235294117645,
"grad_norm": 0.4905712306499481,
"learning_rate": 4.831742052372943e-05,
"loss": 0.3069,
"num_input_tokens_seen": 719104,
"step": 1155
},
{
"epoch": 2.0677361853832443,
"grad_norm": 0.6868427395820618,
"learning_rate": 4.828925634874014e-05,
"loss": 0.3006,
"num_input_tokens_seen": 722016,
"step": 1160
},
{
"epoch": 2.0766488413547237,
"grad_norm": 0.6591427326202393,
"learning_rate": 4.8260866757987177e-05,
"loss": 0.2809,
"num_input_tokens_seen": 725184,
"step": 1165
},
{
"epoch": 2.085561497326203,
"grad_norm": 1.2832831144332886,
"learning_rate": 4.823225202625226e-05,
"loss": 0.3441,
"num_input_tokens_seen": 728352,
"step": 1170
},
{
"epoch": 2.0944741532976825,
"grad_norm": 0.7174959182739258,
"learning_rate": 4.820341243049618e-05,
"loss": 0.4048,
"num_input_tokens_seen": 731712,
"step": 1175
},
{
"epoch": 2.1033868092691623,
"grad_norm": 0.6431313157081604,
"learning_rate": 4.8174348249856236e-05,
"loss": 0.3201,
"num_input_tokens_seen": 734880,
"step": 1180
},
{
"epoch": 2.1122994652406417,
"grad_norm": 0.658487856388092,
"learning_rate": 4.814505976564343e-05,
"loss": 0.3509,
"num_input_tokens_seen": 737728,
"step": 1185
},
{
"epoch": 2.121212121212121,
"grad_norm": 0.7958409786224365,
"learning_rate": 4.8115547261339824e-05,
"loss": 0.3429,
"num_input_tokens_seen": 741376,
"step": 1190
},
{
"epoch": 2.1301247771836005,
"grad_norm": 0.6729584336280823,
"learning_rate": 4.808581102259573e-05,
"loss": 0.2909,
"num_input_tokens_seen": 744256,
"step": 1195
},
{
"epoch": 2.1390374331550803,
"grad_norm": 0.740015983581543,
"learning_rate": 4.8055851337227006e-05,
"loss": 0.2479,
"num_input_tokens_seen": 746944,
"step": 1200
},
{
"epoch": 2.1479500891265597,
"grad_norm": 0.5458919405937195,
"learning_rate": 4.802566849521222e-05,
"loss": 0.2943,
"num_input_tokens_seen": 750272,
"step": 1205
},
{
"epoch": 2.156862745098039,
"grad_norm": 0.508515477180481,
"learning_rate": 4.799526278868987e-05,
"loss": 0.2486,
"num_input_tokens_seen": 753024,
"step": 1210
},
{
"epoch": 2.165775401069519,
"grad_norm": 0.8448687791824341,
"learning_rate": 4.796463451195554e-05,
"loss": 0.388,
"num_input_tokens_seen": 756576,
"step": 1215
},
{
"epoch": 2.1746880570409983,
"grad_norm": 0.5762525200843811,
"learning_rate": 4.7933783961459094e-05,
"loss": 0.3068,
"num_input_tokens_seen": 759680,
"step": 1220
},
{
"epoch": 2.1836007130124777,
"grad_norm": 0.6639679670333862,
"learning_rate": 4.790271143580174e-05,
"loss": 0.331,
"num_input_tokens_seen": 762880,
"step": 1225
},
{
"epoch": 2.192513368983957,
"grad_norm": 0.5362179279327393,
"learning_rate": 4.7871417235733196e-05,
"loss": 0.2964,
"num_input_tokens_seen": 765920,
"step": 1230
},
{
"epoch": 2.2014260249554365,
"grad_norm": 0.5786792039871216,
"learning_rate": 4.783990166414875e-05,
"loss": 0.4138,
"num_input_tokens_seen": 769728,
"step": 1235
},
{
"epoch": 2.2103386809269163,
"grad_norm": 0.47215279936790466,
"learning_rate": 4.780816502608632e-05,
"loss": 0.3199,
"num_input_tokens_seen": 772832,
"step": 1240
},
{
"epoch": 2.2192513368983957,
"grad_norm": 0.4350599944591522,
"learning_rate": 4.777620762872355e-05,
"loss": 0.3148,
"num_input_tokens_seen": 776352,
"step": 1245
},
{
"epoch": 2.228163992869875,
"grad_norm": 0.6416548490524292,
"learning_rate": 4.774402978137479e-05,
"loss": 0.3055,
"num_input_tokens_seen": 779456,
"step": 1250
},
{
"epoch": 2.237076648841355,
"grad_norm": 0.2961161434650421,
"learning_rate": 4.7711631795488096e-05,
"loss": 0.2604,
"num_input_tokens_seen": 782112,
"step": 1255
},
{
"epoch": 2.2459893048128343,
"grad_norm": 0.5333968997001648,
"learning_rate": 4.767901398464227e-05,
"loss": 0.346,
"num_input_tokens_seen": 784864,
"step": 1260
},
{
"epoch": 2.2549019607843137,
"grad_norm": 0.7181191444396973,
"learning_rate": 4.7646176664543763e-05,
"loss": 0.2688,
"num_input_tokens_seen": 787936,
"step": 1265
},
{
"epoch": 2.263814616755793,
"grad_norm": 1.1632299423217773,
"learning_rate": 4.761312015302367e-05,
"loss": 0.2973,
"num_input_tokens_seen": 790976,
"step": 1270
},
{
"epoch": 2.2727272727272725,
"grad_norm": 1.0037575960159302,
"learning_rate": 4.757984477003462e-05,
"loss": 0.3304,
"num_input_tokens_seen": 794016,
"step": 1275
},
{
"epoch": 2.2816399286987523,
"grad_norm": 0.6830529570579529,
"learning_rate": 4.7546350837647666e-05,
"loss": 0.2141,
"num_input_tokens_seen": 796864,
"step": 1280
},
{
"epoch": 2.2905525846702317,
"grad_norm": 0.7043412327766418,
"learning_rate": 4.7512638680049245e-05,
"loss": 0.3195,
"num_input_tokens_seen": 800096,
"step": 1285
},
{
"epoch": 2.299465240641711,
"grad_norm": 0.6342535018920898,
"learning_rate": 4.7478708623537956e-05,
"loss": 0.2506,
"num_input_tokens_seen": 803392,
"step": 1290
},
{
"epoch": 2.308377896613191,
"grad_norm": 1.047386646270752,
"learning_rate": 4.7444560996521415e-05,
"loss": 0.3365,
"num_input_tokens_seen": 806400,
"step": 1295
},
{
"epoch": 2.3172905525846703,
"grad_norm": 1.372889518737793,
"learning_rate": 4.741019612951312e-05,
"loss": 0.4817,
"num_input_tokens_seen": 809568,
"step": 1300
},
{
"epoch": 2.3262032085561497,
"grad_norm": 0.4855256974697113,
"learning_rate": 4.737561435512923e-05,
"loss": 0.2226,
"num_input_tokens_seen": 812768,
"step": 1305
},
{
"epoch": 2.335115864527629,
"grad_norm": 0.5740591287612915,
"learning_rate": 4.734081600808531e-05,
"loss": 0.2448,
"num_input_tokens_seen": 815968,
"step": 1310
},
{
"epoch": 2.344028520499109,
"grad_norm": 0.5068109631538391,
"learning_rate": 4.7305801425193165e-05,
"loss": 0.2175,
"num_input_tokens_seen": 818976,
"step": 1315
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.9766526818275452,
"learning_rate": 4.727057094535749e-05,
"loss": 0.2615,
"num_input_tokens_seen": 821760,
"step": 1320
},
{
"epoch": 2.3618538324420677,
"grad_norm": 0.5878629684448242,
"learning_rate": 4.72351249095727e-05,
"loss": 0.3121,
"num_input_tokens_seen": 824288,
"step": 1325
},
{
"epoch": 2.370766488413547,
"grad_norm": 0.8109356760978699,
"learning_rate": 4.7199463660919514e-05,
"loss": 0.3045,
"num_input_tokens_seen": 827424,
"step": 1330
},
{
"epoch": 2.379679144385027,
"grad_norm": 0.6713225245475769,
"learning_rate": 4.7163587544561705e-05,
"loss": 0.2503,
"num_input_tokens_seen": 830176,
"step": 1335
},
{
"epoch": 2.3885918003565063,
"grad_norm": 0.7476429343223572,
"learning_rate": 4.7127496907742734e-05,
"loss": 0.357,
"num_input_tokens_seen": 833664,
"step": 1340
},
{
"epoch": 2.3975044563279857,
"grad_norm": 1.1430628299713135,
"learning_rate": 4.709119209978242e-05,
"loss": 0.3525,
"num_input_tokens_seen": 836736,
"step": 1345
},
{
"epoch": 2.406417112299465,
"grad_norm": 0.5232317447662354,
"learning_rate": 4.7054673472073506e-05,
"loss": 0.3624,
"num_input_tokens_seen": 840160,
"step": 1350
},
{
"epoch": 2.415329768270945,
"grad_norm": 0.9793670773506165,
"learning_rate": 4.7017941378078314e-05,
"loss": 0.3082,
"num_input_tokens_seen": 843168,
"step": 1355
},
{
"epoch": 2.4242424242424243,
"grad_norm": 0.6311604380607605,
"learning_rate": 4.698099617332528e-05,
"loss": 0.2339,
"num_input_tokens_seen": 845952,
"step": 1360
},
{
"epoch": 2.4331550802139037,
"grad_norm": 0.9364222288131714,
"learning_rate": 4.694383821540555e-05,
"loss": 0.2302,
"num_input_tokens_seen": 848448,
"step": 1365
},
{
"epoch": 2.442067736185383,
"grad_norm": 1.2326656579971313,
"learning_rate": 4.690646786396945e-05,
"loss": 0.2639,
"num_input_tokens_seen": 851552,
"step": 1370
},
{
"epoch": 2.450980392156863,
"grad_norm": 0.7579092979431152,
"learning_rate": 4.686888548072312e-05,
"loss": 0.3276,
"num_input_tokens_seen": 854752,
"step": 1375
},
{
"epoch": 2.4598930481283423,
"grad_norm": 0.9993529915809631,
"learning_rate": 4.683109142942492e-05,
"loss": 0.2741,
"num_input_tokens_seen": 857600,
"step": 1380
},
{
"epoch": 2.4688057040998217,
"grad_norm": 0.5094732642173767,
"learning_rate": 4.679308607588192e-05,
"loss": 0.4073,
"num_input_tokens_seen": 861248,
"step": 1385
},
{
"epoch": 2.477718360071301,
"grad_norm": 0.6214059591293335,
"learning_rate": 4.6754869787946386e-05,
"loss": 0.3205,
"num_input_tokens_seen": 865056,
"step": 1390
},
{
"epoch": 2.486631016042781,
"grad_norm": 0.432815283536911,
"learning_rate": 4.6716442935512214e-05,
"loss": 0.2478,
"num_input_tokens_seen": 867936,
"step": 1395
},
{
"epoch": 2.4955436720142603,
"grad_norm": 0.5354329347610474,
"learning_rate": 4.6677805890511354e-05,
"loss": 0.2816,
"num_input_tokens_seen": 871136,
"step": 1400
},
{
"epoch": 2.5044563279857397,
"grad_norm": 0.5837387442588806,
"learning_rate": 4.663895902691018e-05,
"loss": 0.239,
"num_input_tokens_seen": 873600,
"step": 1405
},
{
"epoch": 2.5044563279857397,
"eval_loss": 0.286673367023468,
"eval_runtime": 4.2516,
"eval_samples_per_second": 58.566,
"eval_steps_per_second": 14.818,
"num_input_tokens_seen": 873600,
"step": 1405
},
{
"epoch": 2.5133689839572195,
"grad_norm": 0.48573535680770874,
"learning_rate": 4.659990272070591e-05,
"loss": 0.31,
"num_input_tokens_seen": 877152,
"step": 1410
},
{
"epoch": 2.522281639928699,
"grad_norm": 0.5476496815681458,
"learning_rate": 4.656063734992294e-05,
"loss": 0.2718,
"num_input_tokens_seen": 880096,
"step": 1415
},
{
"epoch": 2.5311942959001783,
"grad_norm": 0.5417474508285522,
"learning_rate": 4.6521163294609196e-05,
"loss": 0.2433,
"num_input_tokens_seen": 882944,
"step": 1420
},
{
"epoch": 2.5401069518716577,
"grad_norm": 0.7648299932479858,
"learning_rate": 4.6481480936832444e-05,
"loss": 0.3607,
"num_input_tokens_seen": 886848,
"step": 1425
},
{
"epoch": 2.549019607843137,
"grad_norm": 0.6219758987426758,
"learning_rate": 4.644159066067662e-05,
"loss": 0.2771,
"num_input_tokens_seen": 890272,
"step": 1430
},
{
"epoch": 2.557932263814617,
"grad_norm": 0.6586949825286865,
"learning_rate": 4.640149285223806e-05,
"loss": 0.2683,
"num_input_tokens_seen": 893600,
"step": 1435
},
{
"epoch": 2.5668449197860963,
"grad_norm": 1.156497836112976,
"learning_rate": 4.636118789962184e-05,
"loss": 0.2513,
"num_input_tokens_seen": 896448,
"step": 1440
},
{
"epoch": 2.5757575757575757,
"grad_norm": 0.6117565631866455,
"learning_rate": 4.632067619293795e-05,
"loss": 0.2491,
"num_input_tokens_seen": 899424,
"step": 1445
},
{
"epoch": 2.5846702317290555,
"grad_norm": 0.6213181614875793,
"learning_rate": 4.6279958124297554e-05,
"loss": 0.2476,
"num_input_tokens_seen": 902624,
"step": 1450
},
{
"epoch": 2.593582887700535,
"grad_norm": 0.8394727110862732,
"learning_rate": 4.623903408780916e-05,
"loss": 0.2327,
"num_input_tokens_seen": 905568,
"step": 1455
},
{
"epoch": 2.6024955436720143,
"grad_norm": 0.65825355052948,
"learning_rate": 4.619790447957488e-05,
"loss": 0.321,
"num_input_tokens_seen": 908960,
"step": 1460
},
{
"epoch": 2.6114081996434937,
"grad_norm": 0.7782941460609436,
"learning_rate": 4.615656969768649e-05,
"loss": 0.2843,
"num_input_tokens_seen": 912640,
"step": 1465
},
{
"epoch": 2.620320855614973,
"grad_norm": 0.8492444157600403,
"learning_rate": 4.611503014222168e-05,
"loss": 0.2464,
"num_input_tokens_seen": 915328,
"step": 1470
},
{
"epoch": 2.629233511586453,
"grad_norm": 1.3704971075057983,
"learning_rate": 4.6073286215240105e-05,
"loss": 0.2942,
"num_input_tokens_seen": 918656,
"step": 1475
},
{
"epoch": 2.6381461675579323,
"grad_norm": 0.8433835506439209,
"learning_rate": 4.6031338320779534e-05,
"loss": 0.2215,
"num_input_tokens_seen": 921344,
"step": 1480
},
{
"epoch": 2.6470588235294117,
"grad_norm": 0.5805216431617737,
"learning_rate": 4.598918686485193e-05,
"loss": 0.2321,
"num_input_tokens_seen": 924192,
"step": 1485
},
{
"epoch": 2.6559714795008915,
"grad_norm": 0.4831686317920685,
"learning_rate": 4.594683225543952e-05,
"loss": 0.2957,
"num_input_tokens_seen": 927424,
"step": 1490
},
{
"epoch": 2.664884135472371,
"grad_norm": 0.7766821980476379,
"learning_rate": 4.590427490249084e-05,
"loss": 0.2587,
"num_input_tokens_seen": 930080,
"step": 1495
},
{
"epoch": 2.6737967914438503,
"grad_norm": 0.4486106038093567,
"learning_rate": 4.5861515217916785e-05,
"loss": 0.202,
"num_input_tokens_seen": 932768,
"step": 1500
},
{
"epoch": 2.6827094474153297,
"grad_norm": 0.43728289008140564,
"learning_rate": 4.581855361558659e-05,
"loss": 0.2685,
"num_input_tokens_seen": 935904,
"step": 1505
},
{
"epoch": 2.691622103386809,
"grad_norm": 0.5914068222045898,
"learning_rate": 4.577539051132386e-05,
"loss": 0.2218,
"num_input_tokens_seen": 938784,
"step": 1510
},
{
"epoch": 2.700534759358289,
"grad_norm": 0.4907556176185608,
"learning_rate": 4.573202632290252e-05,
"loss": 0.2022,
"num_input_tokens_seen": 941280,
"step": 1515
},
{
"epoch": 2.7094474153297683,
"grad_norm": 0.7610965967178345,
"learning_rate": 4.568846147004279e-05,
"loss": 0.2046,
"num_input_tokens_seen": 944672,
"step": 1520
},
{
"epoch": 2.7183600713012477,
"grad_norm": 0.7069556713104248,
"learning_rate": 4.5644696374407105e-05,
"loss": 0.2896,
"num_input_tokens_seen": 948032,
"step": 1525
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.7775002121925354,
"learning_rate": 4.560073145959602e-05,
"loss": 0.322,
"num_input_tokens_seen": 952000,
"step": 1530
},
{
"epoch": 2.736185383244207,
"grad_norm": 0.5535850524902344,
"learning_rate": 4.555656715114419e-05,
"loss": 0.278,
"num_input_tokens_seen": 955456,
"step": 1535
},
{
"epoch": 2.7450980392156863,
"grad_norm": 0.6513121724128723,
"learning_rate": 4.551220387651615e-05,
"loss": 0.2629,
"num_input_tokens_seen": 959232,
"step": 1540
},
{
"epoch": 2.7540106951871657,
"grad_norm": 0.5215713977813721,
"learning_rate": 4.546764206510221e-05,
"loss": 0.2042,
"num_input_tokens_seen": 962304,
"step": 1545
},
{
"epoch": 2.762923351158645,
"grad_norm": 0.5402376651763916,
"learning_rate": 4.542288214821433e-05,
"loss": 0.213,
"num_input_tokens_seen": 965344,
"step": 1550
},
{
"epoch": 2.771836007130125,
"grad_norm": 1.1007705926895142,
"learning_rate": 4.5377924559081946e-05,
"loss": 0.1996,
"num_input_tokens_seen": 968032,
"step": 1555
},
{
"epoch": 2.7807486631016043,
"grad_norm": 0.5571001172065735,
"learning_rate": 4.533276973284771e-05,
"loss": 0.2281,
"num_input_tokens_seen": 970624,
"step": 1560
},
{
"epoch": 2.7896613190730837,
"grad_norm": 0.7429901361465454,
"learning_rate": 4.528741810656336e-05,
"loss": 0.2868,
"num_input_tokens_seen": 973760,
"step": 1565
},
{
"epoch": 2.7985739750445635,
"grad_norm": 0.3642044961452484,
"learning_rate": 4.5241870119185426e-05,
"loss": 0.2662,
"num_input_tokens_seen": 976480,
"step": 1570
},
{
"epoch": 2.807486631016043,
"grad_norm": 0.5374373197555542,
"learning_rate": 4.519612621157103e-05,
"loss": 0.241,
"num_input_tokens_seen": 979328,
"step": 1575
},
{
"epoch": 2.8163992869875223,
"grad_norm": 0.9241515398025513,
"learning_rate": 4.515018682647359e-05,
"loss": 0.2839,
"num_input_tokens_seen": 982624,
"step": 1580
},
{
"epoch": 2.8253119429590017,
"grad_norm": 0.6853222846984863,
"learning_rate": 4.510405240853854e-05,
"loss": 0.2158,
"num_input_tokens_seen": 985664,
"step": 1585
},
{
"epoch": 2.834224598930481,
"grad_norm": 0.5483903884887695,
"learning_rate": 4.505772340429905e-05,
"loss": 0.2571,
"num_input_tokens_seen": 989024,
"step": 1590
},
{
"epoch": 2.843137254901961,
"grad_norm": 0.4872891902923584,
"learning_rate": 4.501120026217164e-05,
"loss": 0.2331,
"num_input_tokens_seen": 992160,
"step": 1595
},
{
"epoch": 2.8520499108734403,
"grad_norm": 0.5892439484596252,
"learning_rate": 4.496448343245192e-05,
"loss": 0.2645,
"num_input_tokens_seen": 995328,
"step": 1600
},
{
"epoch": 2.8609625668449197,
"grad_norm": 0.6122104525566101,
"learning_rate": 4.4917573367310184e-05,
"loss": 0.3106,
"num_input_tokens_seen": 999136,
"step": 1605
},
{
"epoch": 2.8698752228163995,
"grad_norm": 0.657755970954895,
"learning_rate": 4.4870470520787035e-05,
"loss": 0.2123,
"num_input_tokens_seen": 1001920,
"step": 1610
},
{
"epoch": 2.878787878787879,
"grad_norm": 0.6398863196372986,
"learning_rate": 4.482317534878901e-05,
"loss": 0.385,
"num_input_tokens_seen": 1005632,
"step": 1615
},
{
"epoch": 2.8877005347593583,
"grad_norm": 0.9357530474662781,
"learning_rate": 4.477568830908415e-05,
"loss": 0.2565,
"num_input_tokens_seen": 1009408,
"step": 1620
},
{
"epoch": 2.8966131907308377,
"grad_norm": 0.767514705657959,
"learning_rate": 4.4728009861297586e-05,
"loss": 0.2551,
"num_input_tokens_seen": 1012448,
"step": 1625
},
{
"epoch": 2.905525846702317,
"grad_norm": 0.5800440311431885,
"learning_rate": 4.468014046690707e-05,
"loss": 0.2587,
"num_input_tokens_seen": 1015616,
"step": 1630
},
{
"epoch": 2.914438502673797,
"grad_norm": 0.487104207277298,
"learning_rate": 4.463208058923851e-05,
"loss": 0.2677,
"num_input_tokens_seen": 1018944,
"step": 1635
},
{
"epoch": 2.9233511586452763,
"grad_norm": 0.799360454082489,
"learning_rate": 4.458383069346152e-05,
"loss": 0.2031,
"num_input_tokens_seen": 1021696,
"step": 1640
},
{
"epoch": 2.9322638146167557,
"grad_norm": 0.5832977890968323,
"learning_rate": 4.453539124658486e-05,
"loss": 0.2505,
"num_input_tokens_seen": 1024832,
"step": 1645
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.7471289038658142,
"learning_rate": 4.4486762717451975e-05,
"loss": 0.2521,
"num_input_tokens_seen": 1027712,
"step": 1650
},
{
"epoch": 2.950089126559715,
"grad_norm": 0.5479772090911865,
"learning_rate": 4.443794557673641e-05,
"loss": 0.2542,
"num_input_tokens_seen": 1031040,
"step": 1655
},
{
"epoch": 2.9590017825311943,
"grad_norm": 0.5916025042533875,
"learning_rate": 4.43889402969373e-05,
"loss": 0.1892,
"num_input_tokens_seen": 1033440,
"step": 1660
},
{
"epoch": 2.9679144385026737,
"grad_norm": 0.7155612111091614,
"learning_rate": 4.4339747352374726e-05,
"loss": 0.2661,
"num_input_tokens_seen": 1036864,
"step": 1665
},
{
"epoch": 2.976827094474153,
"grad_norm": 0.4465028941631317,
"learning_rate": 4.4290367219185206e-05,
"loss": 0.2583,
"num_input_tokens_seen": 1039808,
"step": 1670
},
{
"epoch": 2.985739750445633,
"grad_norm": 0.5775701999664307,
"learning_rate": 4.424080037531705e-05,
"loss": 0.2162,
"num_input_tokens_seen": 1043200,
"step": 1675
},
{
"epoch": 2.9946524064171123,
"grad_norm": 0.49966952204704285,
"learning_rate": 4.4191047300525704e-05,
"loss": 0.1902,
"num_input_tokens_seen": 1045504,
"step": 1680
},
{
"epoch": 3.0035650623885917,
"grad_norm": 0.5228843092918396,
"learning_rate": 4.414110847636916e-05,
"loss": 0.196,
"num_input_tokens_seen": 1047768,
"step": 1685
},
{
"epoch": 3.0053475935828877,
"eval_loss": 0.2455865740776062,
"eval_runtime": 4.252,
"eval_samples_per_second": 58.561,
"eval_steps_per_second": 14.817,
"num_input_tokens_seen": 1048184,
"step": 1686
},
{
"epoch": 3.0124777183600715,
"grad_norm": 0.3864419162273407,
"learning_rate": 4.409098438620326e-05,
"loss": 0.1859,
"num_input_tokens_seen": 1050456,
"step": 1690
},
{
"epoch": 3.021390374331551,
"grad_norm": 0.7427952885627747,
"learning_rate": 4.404067551517703e-05,
"loss": 0.2342,
"num_input_tokens_seen": 1053592,
"step": 1695
},
{
"epoch": 3.0303030303030303,
"grad_norm": 0.8005133867263794,
"learning_rate": 4.399018235022799e-05,
"loss": 0.2547,
"num_input_tokens_seen": 1056664,
"step": 1700
},
{
"epoch": 3.0392156862745097,
"grad_norm": 0.42377611994743347,
"learning_rate": 4.393950538007743e-05,
"loss": 0.2227,
"num_input_tokens_seen": 1059384,
"step": 1705
},
{
"epoch": 3.0481283422459895,
"grad_norm": 0.4982529878616333,
"learning_rate": 4.3888645095225675e-05,
"loss": 0.1863,
"num_input_tokens_seen": 1062168,
"step": 1710
},
{
"epoch": 3.057040998217469,
"grad_norm": 0.9931812882423401,
"learning_rate": 4.383760198794734e-05,
"loss": 0.2083,
"num_input_tokens_seen": 1064952,
"step": 1715
},
{
"epoch": 3.0659536541889483,
"grad_norm": 0.6572649478912354,
"learning_rate": 4.37863765522866e-05,
"loss": 0.1863,
"num_input_tokens_seen": 1067416,
"step": 1720
},
{
"epoch": 3.0748663101604277,
"grad_norm": 0.6921285390853882,
"learning_rate": 4.3734969284052345e-05,
"loss": 0.2354,
"num_input_tokens_seen": 1070552,
"step": 1725
},
{
"epoch": 3.0837789661319075,
"grad_norm": 0.7747342586517334,
"learning_rate": 4.368338068081343e-05,
"loss": 0.3332,
"num_input_tokens_seen": 1074136,
"step": 1730
},
{
"epoch": 3.092691622103387,
"grad_norm": 1.056235432624817,
"learning_rate": 4.3631611241893874e-05,
"loss": 0.2396,
"num_input_tokens_seen": 1077848,
"step": 1735
},
{
"epoch": 3.1016042780748663,
"grad_norm": 0.7865013480186462,
"learning_rate": 4.3579661468367924e-05,
"loss": 0.2057,
"num_input_tokens_seen": 1080664,
"step": 1740
},
{
"epoch": 3.1105169340463457,
"grad_norm": 0.6681080460548401,
"learning_rate": 4.352753186305536e-05,
"loss": 0.2823,
"num_input_tokens_seen": 1083992,
"step": 1745
},
{
"epoch": 3.1194295900178255,
"grad_norm": 0.4991186559200287,
"learning_rate": 4.347522293051648e-05,
"loss": 0.2609,
"num_input_tokens_seen": 1087800,
"step": 1750
},
{
"epoch": 3.128342245989305,
"grad_norm": 0.5108634829521179,
"learning_rate": 4.3422735177047324e-05,
"loss": 0.2318,
"num_input_tokens_seen": 1090776,
"step": 1755
},
{
"epoch": 3.1372549019607843,
"grad_norm": 1.343435525894165,
"learning_rate": 4.337006911067473e-05,
"loss": 0.2593,
"num_input_tokens_seen": 1093624,
"step": 1760
},
{
"epoch": 3.1461675579322637,
"grad_norm": 0.7029876708984375,
"learning_rate": 4.331722524115139e-05,
"loss": 0.1993,
"num_input_tokens_seen": 1096472,
"step": 1765
},
{
"epoch": 3.1550802139037435,
"grad_norm": 0.5673936605453491,
"learning_rate": 4.3264204079950975e-05,
"loss": 0.2703,
"num_input_tokens_seen": 1099736,
"step": 1770
},
{
"epoch": 3.163992869875223,
"grad_norm": 0.49642717838287354,
"learning_rate": 4.321100614026315e-05,
"loss": 0.3485,
"num_input_tokens_seen": 1103384,
"step": 1775
},
{
"epoch": 3.1729055258467023,
"grad_norm": 0.7280632257461548,
"learning_rate": 4.31576319369886e-05,
"loss": 0.2451,
"num_input_tokens_seen": 1106520,
"step": 1780
},
{
"epoch": 3.1818181818181817,
"grad_norm": 0.642463207244873,
"learning_rate": 4.310408198673406e-05,
"loss": 0.2062,
"num_input_tokens_seen": 1109208,
"step": 1785
},
{
"epoch": 3.1907308377896615,
"grad_norm": 0.7189128994941711,
"learning_rate": 4.305035680780732e-05,
"loss": 0.2478,
"num_input_tokens_seen": 1112536,
"step": 1790
},
{
"epoch": 3.199643493761141,
"grad_norm": 1.2781462669372559,
"learning_rate": 4.299645692021221e-05,
"loss": 0.2381,
"num_input_tokens_seen": 1115992,
"step": 1795
},
{
"epoch": 3.2085561497326203,
"grad_norm": 0.598044753074646,
"learning_rate": 4.294238284564354e-05,
"loss": 0.2208,
"num_input_tokens_seen": 1119192,
"step": 1800
},
{
"epoch": 3.2174688057040997,
"grad_norm": 0.6014571189880371,
"learning_rate": 4.2888135107482067e-05,
"loss": 0.2393,
"num_input_tokens_seen": 1122552,
"step": 1805
},
{
"epoch": 3.2263814616755795,
"grad_norm": 0.8126239776611328,
"learning_rate": 4.283371423078945e-05,
"loss": 0.2321,
"num_input_tokens_seen": 1126072,
"step": 1810
},
{
"epoch": 3.235294117647059,
"grad_norm": 0.6001937985420227,
"learning_rate": 4.277912074230312e-05,
"loss": 0.1901,
"num_input_tokens_seen": 1128792,
"step": 1815
},
{
"epoch": 3.2442067736185383,
"grad_norm": 0.6077953577041626,
"learning_rate": 4.272435517043125e-05,
"loss": 0.2166,
"num_input_tokens_seen": 1132152,
"step": 1820
},
{
"epoch": 3.2531194295900177,
"grad_norm": 0.38485997915267944,
"learning_rate": 4.2669418045247576e-05,
"loss": 0.2028,
"num_input_tokens_seen": 1135064,
"step": 1825
},
{
"epoch": 3.2620320855614975,
"grad_norm": 0.5066972970962524,
"learning_rate": 4.2614309898486297e-05,
"loss": 0.247,
"num_input_tokens_seen": 1137976,
"step": 1830
},
{
"epoch": 3.270944741532977,
"grad_norm": 0.5907444357872009,
"learning_rate": 4.25590312635369e-05,
"loss": 0.1952,
"num_input_tokens_seen": 1141080,
"step": 1835
},
{
"epoch": 3.2798573975044563,
"grad_norm": 0.6255643963813782,
"learning_rate": 4.250358267543907e-05,
"loss": 0.2124,
"num_input_tokens_seen": 1144376,
"step": 1840
},
{
"epoch": 3.2887700534759357,
"grad_norm": 0.9536407589912415,
"learning_rate": 4.244796467087741e-05,
"loss": 0.23,
"num_input_tokens_seen": 1147224,
"step": 1845
},
{
"epoch": 3.2976827094474155,
"grad_norm": 0.7920709252357483,
"learning_rate": 4.2392177788176335e-05,
"loss": 0.2005,
"num_input_tokens_seen": 1150360,
"step": 1850
},
{
"epoch": 3.306595365418895,
"grad_norm": 0.4633888602256775,
"learning_rate": 4.2336222567294804e-05,
"loss": 0.1962,
"num_input_tokens_seen": 1153688,
"step": 1855
},
{
"epoch": 3.3155080213903743,
"grad_norm": 0.384843111038208,
"learning_rate": 4.228009954982112e-05,
"loss": 0.2039,
"num_input_tokens_seen": 1157016,
"step": 1860
},
{
"epoch": 3.3244206773618536,
"grad_norm": 0.4141569435596466,
"learning_rate": 4.22238092789677e-05,
"loss": 0.2075,
"num_input_tokens_seen": 1159768,
"step": 1865
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.5076260566711426,
"learning_rate": 4.2167352299565746e-05,
"loss": 0.198,
"num_input_tokens_seen": 1162520,
"step": 1870
},
{
"epoch": 3.342245989304813,
"grad_norm": 0.6106960773468018,
"learning_rate": 4.21107291580601e-05,
"loss": 0.1931,
"num_input_tokens_seen": 1165336,
"step": 1875
},
{
"epoch": 3.3511586452762923,
"grad_norm": 0.49231547117233276,
"learning_rate": 4.205394040250382e-05,
"loss": 0.2574,
"num_input_tokens_seen": 1168632,
"step": 1880
},
{
"epoch": 3.3600713012477716,
"grad_norm": 0.5341747403144836,
"learning_rate": 4.199698658255298e-05,
"loss": 0.2002,
"num_input_tokens_seen": 1171352,
"step": 1885
},
{
"epoch": 3.3689839572192515,
"grad_norm": 0.5527672171592712,
"learning_rate": 4.193986824946125e-05,
"loss": 0.2148,
"num_input_tokens_seen": 1174360,
"step": 1890
},
{
"epoch": 3.377896613190731,
"grad_norm": 0.5493122935295105,
"learning_rate": 4.188258595607468e-05,
"loss": 0.2173,
"num_input_tokens_seen": 1177368,
"step": 1895
},
{
"epoch": 3.3868092691622103,
"grad_norm": 0.6076507568359375,
"learning_rate": 4.182514025682625e-05,
"loss": 0.2365,
"num_input_tokens_seen": 1180824,
"step": 1900
},
{
"epoch": 3.3957219251336896,
"grad_norm": 0.38345441222190857,
"learning_rate": 4.176753170773052e-05,
"loss": 0.237,
"num_input_tokens_seen": 1183544,
"step": 1905
},
{
"epoch": 3.4046345811051695,
"grad_norm": 0.8067929744720459,
"learning_rate": 4.170976086637832e-05,
"loss": 0.1945,
"num_input_tokens_seen": 1185848,
"step": 1910
},
{
"epoch": 3.413547237076649,
"grad_norm": 0.5404775142669678,
"learning_rate": 4.1651828291931264e-05,
"loss": 0.1856,
"num_input_tokens_seen": 1189176,
"step": 1915
},
{
"epoch": 3.4224598930481283,
"grad_norm": 0.6067723631858826,
"learning_rate": 4.159373454511636e-05,
"loss": 0.2464,
"num_input_tokens_seen": 1192984,
"step": 1920
},
{
"epoch": 3.431372549019608,
"grad_norm": 0.6056991815567017,
"learning_rate": 4.1535480188220636e-05,
"loss": 0.2909,
"num_input_tokens_seen": 1196888,
"step": 1925
},
{
"epoch": 3.4402852049910875,
"grad_norm": 0.7518835067749023,
"learning_rate": 4.1477065785085634e-05,
"loss": 0.2496,
"num_input_tokens_seen": 1200792,
"step": 1930
},
{
"epoch": 3.449197860962567,
"grad_norm": 0.41140249371528625,
"learning_rate": 4.141849190110199e-05,
"loss": 0.2267,
"num_input_tokens_seen": 1203832,
"step": 1935
},
{
"epoch": 3.4581105169340463,
"grad_norm": 0.44746679067611694,
"learning_rate": 4.1359759103203935e-05,
"loss": 0.215,
"num_input_tokens_seen": 1207160,
"step": 1940
},
{
"epoch": 3.4670231729055256,
"grad_norm": 0.7266998291015625,
"learning_rate": 4.130086795986383e-05,
"loss": 0.2169,
"num_input_tokens_seen": 1210616,
"step": 1945
},
{
"epoch": 3.4759358288770055,
"grad_norm": 0.5968104600906372,
"learning_rate": 4.124181904108664e-05,
"loss": 0.1875,
"num_input_tokens_seen": 1213528,
"step": 1950
},
{
"epoch": 3.484848484848485,
"grad_norm": 0.5463330149650574,
"learning_rate": 4.1182612918404466e-05,
"loss": 0.1969,
"num_input_tokens_seen": 1216568,
"step": 1955
},
{
"epoch": 3.4937611408199643,
"grad_norm": 0.6442824006080627,
"learning_rate": 4.1123250164870955e-05,
"loss": 0.3184,
"num_input_tokens_seen": 1219896,
"step": 1960
},
{
"epoch": 3.502673796791444,
"grad_norm": 0.701900064945221,
"learning_rate": 4.1063731355055763e-05,
"loss": 0.2079,
"num_input_tokens_seen": 1222904,
"step": 1965
},
{
"epoch": 3.5062388591800357,
"eval_loss": 0.22395405173301697,
"eval_runtime": 4.2462,
"eval_samples_per_second": 58.641,
"eval_steps_per_second": 14.837,
"num_input_tokens_seen": 1223864,
"step": 1967
},
{
"epoch": 3.5115864527629235,
"grad_norm": 0.39802566170692444,
"learning_rate": 4.100405706503904e-05,
"loss": 0.158,
"num_input_tokens_seen": 1225496,
"step": 1970
},
{
"epoch": 3.520499108734403,
"grad_norm": 0.7380387783050537,
"learning_rate": 4.094422787240581e-05,
"loss": 0.1725,
"num_input_tokens_seen": 1228280,
"step": 1975
},
{
"epoch": 3.5294117647058822,
"grad_norm": 0.6759628653526306,
"learning_rate": 4.088424435624038e-05,
"loss": 0.2052,
"num_input_tokens_seen": 1231288,
"step": 1980
},
{
"epoch": 3.5383244206773616,
"grad_norm": 1.158799409866333,
"learning_rate": 4.082410709712077e-05,
"loss": 0.2018,
"num_input_tokens_seen": 1234456,
"step": 1985
},
{
"epoch": 3.5472370766488415,
"grad_norm": 0.7307495474815369,
"learning_rate": 4.0763816677113064e-05,
"loss": 0.2669,
"num_input_tokens_seen": 1237912,
"step": 1990
},
{
"epoch": 3.556149732620321,
"grad_norm": 0.9738561511039734,
"learning_rate": 4.070337367976578e-05,
"loss": 0.2444,
"num_input_tokens_seen": 1240984,
"step": 1995
},
{
"epoch": 3.5650623885918002,
"grad_norm": 0.5394619703292847,
"learning_rate": 4.064277869010421e-05,
"loss": 0.2265,
"num_input_tokens_seen": 1244280,
"step": 2000
},
{
"epoch": 3.57397504456328,
"grad_norm": 0.7028752565383911,
"learning_rate": 4.058203229462482e-05,
"loss": 0.2192,
"num_input_tokens_seen": 1246904,
"step": 2005
},
{
"epoch": 3.5828877005347595,
"grad_norm": 1.353464126586914,
"learning_rate": 4.052113508128948e-05,
"loss": 0.2313,
"num_input_tokens_seen": 1249880,
"step": 2010
},
{
"epoch": 3.591800356506239,
"grad_norm": 0.8846970796585083,
"learning_rate": 4.0460087639519836e-05,
"loss": 0.1889,
"num_input_tokens_seen": 1252408,
"step": 2015
},
{
"epoch": 3.6007130124777182,
"grad_norm": 1.0351589918136597,
"learning_rate": 4.039889056019159e-05,
"loss": 0.2567,
"num_input_tokens_seen": 1255800,
"step": 2020
},
{
"epoch": 3.6096256684491976,
"grad_norm": 0.6438773274421692,
"learning_rate": 4.03375444356288e-05,
"loss": 0.2018,
"num_input_tokens_seen": 1259160,
"step": 2025
},
{
"epoch": 3.6185383244206775,
"grad_norm": 0.8322818279266357,
"learning_rate": 4.0276049859598084e-05,
"loss": 0.2269,
"num_input_tokens_seen": 1262488,
"step": 2030
},
{
"epoch": 3.627450980392157,
"grad_norm": 0.5302309393882751,
"learning_rate": 4.021440742730295e-05,
"loss": 0.2032,
"num_input_tokens_seen": 1265368,
"step": 2035
},
{
"epoch": 3.6363636363636362,
"grad_norm": 0.8041933178901672,
"learning_rate": 4.015261773537799e-05,
"loss": 0.2316,
"num_input_tokens_seen": 1269112,
"step": 2040
},
{
"epoch": 3.645276292335116,
"grad_norm": 0.5872630476951599,
"learning_rate": 4.009068138188311e-05,
"loss": 0.2389,
"num_input_tokens_seen": 1272408,
"step": 2045
},
{
"epoch": 3.6541889483065955,
"grad_norm": 0.5462104678153992,
"learning_rate": 4.002859896629776e-05,
"loss": 0.1955,
"num_input_tokens_seen": 1275640,
"step": 2050
},
{
"epoch": 3.663101604278075,
"grad_norm": 0.7330032587051392,
"learning_rate": 3.99663710895151e-05,
"loss": 0.2116,
"num_input_tokens_seen": 1278616,
"step": 2055
},
{
"epoch": 3.6720142602495542,
"grad_norm": 0.5604473352432251,
"learning_rate": 3.990399835383623e-05,
"loss": 0.2285,
"num_input_tokens_seen": 1281624,
"step": 2060
},
{
"epoch": 3.6809269162210336,
"grad_norm": 0.49228572845458984,
"learning_rate": 3.984148136296431e-05,
"loss": 0.2026,
"num_input_tokens_seen": 1284216,
"step": 2065
},
{
"epoch": 3.6898395721925135,
"grad_norm": 0.8332962393760681,
"learning_rate": 3.977882072199874e-05,
"loss": 0.2028,
"num_input_tokens_seen": 1286808,
"step": 2070
},
{
"epoch": 3.698752228163993,
"grad_norm": 0.6717101335525513,
"learning_rate": 3.971601703742932e-05,
"loss": 0.2117,
"num_input_tokens_seen": 1289944,
"step": 2075
},
{
"epoch": 3.7076648841354722,
"grad_norm": 0.6963510513305664,
"learning_rate": 3.965307091713037e-05,
"loss": 0.1899,
"num_input_tokens_seen": 1292856,
"step": 2080
},
{
"epoch": 3.716577540106952,
"grad_norm": 0.771668553352356,
"learning_rate": 3.95899829703548e-05,
"loss": 0.2491,
"num_input_tokens_seen": 1296792,
"step": 2085
},
{
"epoch": 3.7254901960784315,
"grad_norm": 0.9969800710678101,
"learning_rate": 3.9526753807728295e-05,
"loss": 0.2512,
"num_input_tokens_seen": 1299800,
"step": 2090
},
{
"epoch": 3.734402852049911,
"grad_norm": 0.5737549066543579,
"learning_rate": 3.946338404124334e-05,
"loss": 0.1831,
"num_input_tokens_seen": 1302648,
"step": 2095
},
{
"epoch": 3.7433155080213902,
"grad_norm": 0.5544306039810181,
"learning_rate": 3.939987428425331e-05,
"loss": 0.1678,
"num_input_tokens_seen": 1305016,
"step": 2100
},
{
"epoch": 3.7522281639928696,
"grad_norm": 0.4125676155090332,
"learning_rate": 3.933622515146658e-05,
"loss": 0.1715,
"num_input_tokens_seen": 1308024,
"step": 2105
},
{
"epoch": 3.7611408199643495,
"grad_norm": 0.6266154646873474,
"learning_rate": 3.9272437258940494e-05,
"loss": 0.2112,
"num_input_tokens_seen": 1310552,
"step": 2110
},
{
"epoch": 3.770053475935829,
"grad_norm": 0.44769471883773804,
"learning_rate": 3.9208511224075484e-05,
"loss": 0.2325,
"num_input_tokens_seen": 1313656,
"step": 2115
},
{
"epoch": 3.7789661319073082,
"grad_norm": 0.5761722922325134,
"learning_rate": 3.914444766560902e-05,
"loss": 0.2712,
"num_input_tokens_seen": 1316728,
"step": 2120
},
{
"epoch": 3.787878787878788,
"grad_norm": 0.556746780872345,
"learning_rate": 3.908024720360968e-05,
"loss": 0.2286,
"num_input_tokens_seen": 1320344,
"step": 2125
},
{
"epoch": 3.7967914438502675,
"grad_norm": 0.45677894353866577,
"learning_rate": 3.9015910459471126e-05,
"loss": 0.196,
"num_input_tokens_seen": 1323416,
"step": 2130
},
{
"epoch": 3.805704099821747,
"grad_norm": 0.6750150322914124,
"learning_rate": 3.8951438055906084e-05,
"loss": 0.1779,
"num_input_tokens_seen": 1326360,
"step": 2135
},
{
"epoch": 3.8146167557932262,
"grad_norm": 0.9360057711601257,
"learning_rate": 3.888683061694032e-05,
"loss": 0.2523,
"num_input_tokens_seen": 1329944,
"step": 2140
},
{
"epoch": 3.8235294117647056,
"grad_norm": 0.4923909604549408,
"learning_rate": 3.882208876790661e-05,
"loss": 0.1995,
"num_input_tokens_seen": 1333080,
"step": 2145
},
{
"epoch": 3.8324420677361855,
"grad_norm": 0.6493288278579712,
"learning_rate": 3.8757213135438655e-05,
"loss": 0.1972,
"num_input_tokens_seen": 1336504,
"step": 2150
},
{
"epoch": 3.841354723707665,
"grad_norm": 0.5835461616516113,
"learning_rate": 3.869220434746509e-05,
"loss": 0.2229,
"num_input_tokens_seen": 1339704,
"step": 2155
},
{
"epoch": 3.8502673796791442,
"grad_norm": 0.6278809309005737,
"learning_rate": 3.862706303320329e-05,
"loss": 0.2137,
"num_input_tokens_seen": 1343032,
"step": 2160
},
{
"epoch": 3.859180035650624,
"grad_norm": 0.7989611625671387,
"learning_rate": 3.856178982315342e-05,
"loss": 0.2522,
"num_input_tokens_seen": 1346104,
"step": 2165
},
{
"epoch": 3.8680926916221035,
"grad_norm": 0.4888596534729004,
"learning_rate": 3.849638534909219e-05,
"loss": 0.1977,
"num_input_tokens_seen": 1348984,
"step": 2170
},
{
"epoch": 3.877005347593583,
"grad_norm": 0.590801477432251,
"learning_rate": 3.843085024406686e-05,
"loss": 0.2031,
"num_input_tokens_seen": 1351480,
"step": 2175
},
{
"epoch": 3.8859180035650622,
"grad_norm": 0.6255959868431091,
"learning_rate": 3.836518514238903e-05,
"loss": 0.2707,
"num_input_tokens_seen": 1355448,
"step": 2180
},
{
"epoch": 3.8948306595365416,
"grad_norm": 0.5446547269821167,
"learning_rate": 3.8299390679628555e-05,
"loss": 0.1831,
"num_input_tokens_seen": 1358392,
"step": 2185
},
{
"epoch": 3.9037433155080214,
"grad_norm": 0.5819702744483948,
"learning_rate": 3.8233467492607354e-05,
"loss": 0.2039,
"num_input_tokens_seen": 1361368,
"step": 2190
},
{
"epoch": 3.912655971479501,
"grad_norm": 0.5366934537887573,
"learning_rate": 3.816741621939327e-05,
"loss": 0.1955,
"num_input_tokens_seen": 1364536,
"step": 2195
},
{
"epoch": 3.9215686274509802,
"grad_norm": 1.1435610055923462,
"learning_rate": 3.81012374992939e-05,
"loss": 0.2049,
"num_input_tokens_seen": 1367800,
"step": 2200
},
{
"epoch": 3.93048128342246,
"grad_norm": 0.5551317930221558,
"learning_rate": 3.803493197285036e-05,
"loss": 0.2268,
"num_input_tokens_seen": 1371224,
"step": 2205
},
{
"epoch": 3.9393939393939394,
"grad_norm": 1.10652756690979,
"learning_rate": 3.7968500281831146e-05,
"loss": 0.1848,
"num_input_tokens_seen": 1373944,
"step": 2210
},
{
"epoch": 3.948306595365419,
"grad_norm": 0.9579757452011108,
"learning_rate": 3.79019430692259e-05,
"loss": 0.2114,
"num_input_tokens_seen": 1377240,
"step": 2215
},
{
"epoch": 3.9572192513368982,
"grad_norm": 0.42045828700065613,
"learning_rate": 3.783526097923915e-05,
"loss": 0.2034,
"num_input_tokens_seen": 1380248,
"step": 2220
},
{
"epoch": 3.966131907308378,
"grad_norm": 0.6384634375572205,
"learning_rate": 3.7768454657284154e-05,
"loss": 0.1566,
"num_input_tokens_seen": 1382712,
"step": 2225
},
{
"epoch": 3.9750445632798574,
"grad_norm": 0.9116731882095337,
"learning_rate": 3.770152474997657e-05,
"loss": 0.2102,
"num_input_tokens_seen": 1385976,
"step": 2230
},
{
"epoch": 3.983957219251337,
"grad_norm": 0.6810240149497986,
"learning_rate": 3.763447190512824e-05,
"loss": 0.2052,
"num_input_tokens_seen": 1389624,
"step": 2235
},
{
"epoch": 3.9928698752228167,
"grad_norm": 0.3541090488433838,
"learning_rate": 3.7567296771740925e-05,
"loss": 0.244,
"num_input_tokens_seen": 1392728,
"step": 2240
},
{
"epoch": 4.001782531194296,
"grad_norm": 1.0409997701644897,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.2358,
"num_input_tokens_seen": 1395704,
"step": 2245
},
{
"epoch": 4.007130124777183,
"eval_loss": 0.21653257310390472,
"eval_runtime": 4.2509,
"eval_samples_per_second": 58.576,
"eval_steps_per_second": 14.82,
"num_input_tokens_seen": 1397624,
"step": 2248
},
{
"epoch": 4.010695187165775,
"grad_norm": 0.5523825287818909,
"learning_rate": 3.743258224126819e-05,
"loss": 0.1735,
"num_input_tokens_seen": 1398584,
"step": 2250
},
{
"epoch": 4.019607843137255,
"grad_norm": 0.7276411652565002,
"learning_rate": 3.736504414807922e-05,
"loss": 0.1992,
"num_input_tokens_seen": 1401784,
"step": 2255
},
{
"epoch": 4.028520499108734,
"grad_norm": 0.36699721217155457,
"learning_rate": 3.729738637413156e-05,
"loss": 0.1728,
"num_input_tokens_seen": 1404312,
"step": 2260
},
{
"epoch": 4.037433155080214,
"grad_norm": 0.7663154006004333,
"learning_rate": 3.722960957428203e-05,
"loss": 0.1866,
"num_input_tokens_seen": 1407352,
"step": 2265
},
{
"epoch": 4.046345811051693,
"grad_norm": 0.4959503412246704,
"learning_rate": 3.716171440453952e-05,
"loss": 0.1823,
"num_input_tokens_seen": 1410648,
"step": 2270
},
{
"epoch": 4.055258467023173,
"grad_norm": 0.6325064897537231,
"learning_rate": 3.709370152205863e-05,
"loss": 0.1698,
"num_input_tokens_seen": 1413816,
"step": 2275
},
{
"epoch": 4.064171122994653,
"grad_norm": 0.4548736810684204,
"learning_rate": 3.7025571585133254e-05,
"loss": 0.1626,
"num_input_tokens_seen": 1416024,
"step": 2280
},
{
"epoch": 4.073083778966132,
"grad_norm": 0.3842249810695648,
"learning_rate": 3.69573252531903e-05,
"loss": 0.1929,
"num_input_tokens_seen": 1419128,
"step": 2285
},
{
"epoch": 4.081996434937611,
"grad_norm": 0.6341343522071838,
"learning_rate": 3.6888963186783224e-05,
"loss": 0.1625,
"num_input_tokens_seen": 1421720,
"step": 2290
},
{
"epoch": 4.090909090909091,
"grad_norm": 0.5091090798377991,
"learning_rate": 3.682048604758567e-05,
"loss": 0.1771,
"num_input_tokens_seen": 1424632,
"step": 2295
},
{
"epoch": 4.09982174688057,
"grad_norm": 0.24424993991851807,
"learning_rate": 3.67518944983851e-05,
"loss": 0.1739,
"num_input_tokens_seen": 1427480,
"step": 2300
},
{
"epoch": 4.10873440285205,
"grad_norm": 0.589100182056427,
"learning_rate": 3.668318920307632e-05,
"loss": 0.2092,
"num_input_tokens_seen": 1430296,
"step": 2305
},
{
"epoch": 4.117647058823529,
"grad_norm": 0.41250258684158325,
"learning_rate": 3.6614370826655074e-05,
"loss": 0.1714,
"num_input_tokens_seen": 1432920,
"step": 2310
},
{
"epoch": 4.126559714795009,
"grad_norm": 0.7590497136116028,
"learning_rate": 3.654544003521164e-05,
"loss": 0.2039,
"num_input_tokens_seen": 1435544,
"step": 2315
},
{
"epoch": 4.135472370766489,
"grad_norm": 0.8127907514572144,
"learning_rate": 3.647639749592433e-05,
"loss": 0.1583,
"num_input_tokens_seen": 1438040,
"step": 2320
},
{
"epoch": 4.144385026737968,
"grad_norm": 0.6445732712745667,
"learning_rate": 3.640724387705308e-05,
"loss": 0.2149,
"num_input_tokens_seen": 1441528,
"step": 2325
},
{
"epoch": 4.153297682709447,
"grad_norm": 0.44771522283554077,
"learning_rate": 3.633797984793294e-05,
"loss": 0.1543,
"num_input_tokens_seen": 1444920,
"step": 2330
},
{
"epoch": 4.162210338680927,
"grad_norm": 0.47167617082595825,
"learning_rate": 3.626860607896764e-05,
"loss": 0.2014,
"num_input_tokens_seen": 1447896,
"step": 2335
},
{
"epoch": 4.171122994652406,
"grad_norm": 0.49547502398490906,
"learning_rate": 3.6199123241623046e-05,
"loss": 0.2085,
"num_input_tokens_seen": 1451256,
"step": 2340
},
{
"epoch": 4.180035650623886,
"grad_norm": 0.5464377403259277,
"learning_rate": 3.6129532008420715e-05,
"loss": 0.1821,
"num_input_tokens_seen": 1454136,
"step": 2345
},
{
"epoch": 4.188948306595365,
"grad_norm": 0.44719406962394714,
"learning_rate": 3.605983305293137e-05,
"loss": 0.1703,
"num_input_tokens_seen": 1456504,
"step": 2350
},
{
"epoch": 4.197860962566845,
"grad_norm": 0.905034065246582,
"learning_rate": 3.599002704976835e-05,
"loss": 0.1734,
"num_input_tokens_seen": 1459768,
"step": 2355
},
{
"epoch": 4.206773618538325,
"grad_norm": 0.3426745533943176,
"learning_rate": 3.592011467458113e-05,
"loss": 0.1501,
"num_input_tokens_seen": 1462392,
"step": 2360
},
{
"epoch": 4.215686274509804,
"grad_norm": 1.105431318283081,
"learning_rate": 3.585009660404873e-05,
"loss": 0.2289,
"num_input_tokens_seen": 1466040,
"step": 2365
},
{
"epoch": 4.224598930481283,
"grad_norm": 0.6577187776565552,
"learning_rate": 3.577997351587322e-05,
"loss": 0.2166,
"num_input_tokens_seen": 1469208,
"step": 2370
},
{
"epoch": 4.233511586452763,
"grad_norm": 0.5719982981681824,
"learning_rate": 3.5709746088773085e-05,
"loss": 0.222,
"num_input_tokens_seen": 1472536,
"step": 2375
},
{
"epoch": 4.242424242424242,
"grad_norm": 0.4010562598705292,
"learning_rate": 3.563941500247676e-05,
"loss": 0.1836,
"num_input_tokens_seen": 1475608,
"step": 2380
},
{
"epoch": 4.251336898395722,
"grad_norm": 0.6845771074295044,
"learning_rate": 3.5568980937715945e-05,
"loss": 0.1762,
"num_input_tokens_seen": 1479256,
"step": 2385
},
{
"epoch": 4.260249554367201,
"grad_norm": 0.5753139853477478,
"learning_rate": 3.54984445762191e-05,
"loss": 0.2054,
"num_input_tokens_seen": 1483064,
"step": 2390
},
{
"epoch": 4.269162210338681,
"grad_norm": 0.586729109287262,
"learning_rate": 3.5427806600704785e-05,
"loss": 0.1733,
"num_input_tokens_seen": 1485880,
"step": 2395
},
{
"epoch": 4.278074866310161,
"grad_norm": 0.5614349842071533,
"learning_rate": 3.535706769487509e-05,
"loss": 0.1777,
"num_input_tokens_seen": 1489208,
"step": 2400
},
{
"epoch": 4.28698752228164,
"grad_norm": 0.6715386509895325,
"learning_rate": 3.5286228543409004e-05,
"loss": 0.1883,
"num_input_tokens_seen": 1492216,
"step": 2405
},
{
"epoch": 4.295900178253119,
"grad_norm": 0.5051096677780151,
"learning_rate": 3.5215289831955786e-05,
"loss": 0.2037,
"num_input_tokens_seen": 1495960,
"step": 2410
},
{
"epoch": 4.304812834224599,
"grad_norm": 0.8140228390693665,
"learning_rate": 3.514425224712835e-05,
"loss": 0.1892,
"num_input_tokens_seen": 1498584,
"step": 2415
},
{
"epoch": 4.313725490196078,
"grad_norm": 0.45702996850013733,
"learning_rate": 3.507311647649657e-05,
"loss": 0.179,
"num_input_tokens_seen": 1501880,
"step": 2420
},
{
"epoch": 4.322638146167558,
"grad_norm": 0.6330050230026245,
"learning_rate": 3.5001883208580665e-05,
"loss": 0.1901,
"num_input_tokens_seen": 1505112,
"step": 2425
},
{
"epoch": 4.331550802139038,
"grad_norm": 0.5689657330513,
"learning_rate": 3.493055313284456e-05,
"loss": 0.2295,
"num_input_tokens_seen": 1507768,
"step": 2430
},
{
"epoch": 4.340463458110517,
"grad_norm": 0.9648520946502686,
"learning_rate": 3.485912693968913e-05,
"loss": 0.2049,
"num_input_tokens_seen": 1511224,
"step": 2435
},
{
"epoch": 4.349376114081997,
"grad_norm": 0.4425726532936096,
"learning_rate": 3.478760532044561e-05,
"loss": 0.2032,
"num_input_tokens_seen": 1514456,
"step": 2440
},
{
"epoch": 4.358288770053476,
"grad_norm": 0.5605233311653137,
"learning_rate": 3.471598896736881e-05,
"loss": 0.207,
"num_input_tokens_seen": 1517400,
"step": 2445
},
{
"epoch": 4.367201426024955,
"grad_norm": 0.5907042622566223,
"learning_rate": 3.464427857363052e-05,
"loss": 0.2018,
"num_input_tokens_seen": 1520664,
"step": 2450
},
{
"epoch": 4.376114081996435,
"grad_norm": 0.8678156137466431,
"learning_rate": 3.457247483331272e-05,
"loss": 0.2408,
"num_input_tokens_seen": 1523960,
"step": 2455
},
{
"epoch": 4.385026737967914,
"grad_norm": 0.4271613359451294,
"learning_rate": 3.4500578441400876e-05,
"loss": 0.1568,
"num_input_tokens_seen": 1526616,
"step": 2460
},
{
"epoch": 4.393939393939394,
"grad_norm": 1.1846132278442383,
"learning_rate": 3.4428590093777244e-05,
"loss": 0.3417,
"num_input_tokens_seen": 1530808,
"step": 2465
},
{
"epoch": 4.402852049910873,
"grad_norm": 0.49708229303359985,
"learning_rate": 3.43565104872141e-05,
"loss": 0.1599,
"num_input_tokens_seen": 1533336,
"step": 2470
},
{
"epoch": 4.411764705882353,
"grad_norm": 0.35631561279296875,
"learning_rate": 3.428434031936704e-05,
"loss": 0.1646,
"num_input_tokens_seen": 1535864,
"step": 2475
},
{
"epoch": 4.420677361853833,
"grad_norm": 0.6264846324920654,
"learning_rate": 3.421208028876815e-05,
"loss": 0.2114,
"num_input_tokens_seen": 1539192,
"step": 2480
},
{
"epoch": 4.429590017825312,
"grad_norm": 0.3950527310371399,
"learning_rate": 3.413973109481935e-05,
"loss": 0.227,
"num_input_tokens_seen": 1542712,
"step": 2485
},
{
"epoch": 4.438502673796791,
"grad_norm": 0.7369870543479919,
"learning_rate": 3.406729343778552e-05,
"loss": 0.1871,
"num_input_tokens_seen": 1545272,
"step": 2490
},
{
"epoch": 4.447415329768271,
"grad_norm": 0.549528956413269,
"learning_rate": 3.3994768018787815e-05,
"loss": 0.3024,
"num_input_tokens_seen": 1549464,
"step": 2495
},
{
"epoch": 4.45632798573975,
"grad_norm": 0.5840650796890259,
"learning_rate": 3.392215553979679e-05,
"loss": 0.2244,
"num_input_tokens_seen": 1552280,
"step": 2500
},
{
"epoch": 4.46524064171123,
"grad_norm": 0.399300754070282,
"learning_rate": 3.38494567036257e-05,
"loss": 0.2032,
"num_input_tokens_seen": 1555448,
"step": 2505
},
{
"epoch": 4.47415329768271,
"grad_norm": 0.47554269433021545,
"learning_rate": 3.3776672213923587e-05,
"loss": 0.2211,
"num_input_tokens_seen": 1559480,
"step": 2510
},
{
"epoch": 4.483065953654189,
"grad_norm": 0.3855815827846527,
"learning_rate": 3.370380277516858e-05,
"loss": 0.1718,
"num_input_tokens_seen": 1562872,
"step": 2515
},
{
"epoch": 4.491978609625669,
"grad_norm": 0.5743004679679871,
"learning_rate": 3.3630849092661e-05,
"loss": 0.183,
"num_input_tokens_seen": 1565752,
"step": 2520
},
{
"epoch": 4.500891265597148,
"grad_norm": 0.527409553527832,
"learning_rate": 3.355781187251657e-05,
"loss": 0.1778,
"num_input_tokens_seen": 1568600,
"step": 2525
},
{
"epoch": 4.508021390374331,
"eval_loss": 0.2118549942970276,
"eval_runtime": 4.2596,
"eval_samples_per_second": 58.457,
"eval_steps_per_second": 14.79,
"num_input_tokens_seen": 1570936,
"step": 2529
},
{
"epoch": 4.509803921568627,
"grad_norm": 0.39879217743873596,
"learning_rate": 3.3484691821659584e-05,
"loss": 0.1747,
"num_input_tokens_seen": 1571512,
"step": 2530
},
{
"epoch": 4.518716577540107,
"grad_norm": 0.5035882592201233,
"learning_rate": 3.3411489647816016e-05,
"loss": 0.1871,
"num_input_tokens_seen": 1574232,
"step": 2535
},
{
"epoch": 4.527629233511586,
"grad_norm": 1.1074864864349365,
"learning_rate": 3.3338206059506736e-05,
"loss": 0.2403,
"num_input_tokens_seen": 1577816,
"step": 2540
},
{
"epoch": 4.536541889483066,
"grad_norm": 0.8603164553642273,
"learning_rate": 3.326484176604061e-05,
"loss": 0.2662,
"num_input_tokens_seen": 1581368,
"step": 2545
},
{
"epoch": 4.545454545454545,
"grad_norm": 0.43185243010520935,
"learning_rate": 3.3191397477507655e-05,
"loss": 0.1828,
"num_input_tokens_seen": 1583800,
"step": 2550
},
{
"epoch": 4.554367201426025,
"grad_norm": 0.397795170545578,
"learning_rate": 3.3117873904772123e-05,
"loss": 0.206,
"num_input_tokens_seen": 1587384,
"step": 2555
},
{
"epoch": 4.563279857397505,
"grad_norm": 0.7756383419036865,
"learning_rate": 3.30442717594657e-05,
"loss": 0.1919,
"num_input_tokens_seen": 1590328,
"step": 2560
},
{
"epoch": 4.572192513368984,
"grad_norm": 0.7332653999328613,
"learning_rate": 3.297059175398056e-05,
"loss": 0.2376,
"num_input_tokens_seen": 1594136,
"step": 2565
},
{
"epoch": 4.581105169340463,
"grad_norm": 0.541881799697876,
"learning_rate": 3.289683460146244e-05,
"loss": 0.1923,
"num_input_tokens_seen": 1597656,
"step": 2570
},
{
"epoch": 4.590017825311943,
"grad_norm": 0.48139122128486633,
"learning_rate": 3.282300101580386e-05,
"loss": 0.198,
"num_input_tokens_seen": 1600536,
"step": 2575
},
{
"epoch": 4.598930481283422,
"grad_norm": 0.7859025001525879,
"learning_rate": 3.274909171163706e-05,
"loss": 0.1965,
"num_input_tokens_seen": 1603832,
"step": 2580
},
{
"epoch": 4.607843137254902,
"grad_norm": 0.8468954563140869,
"learning_rate": 3.2675107404327194e-05,
"loss": 0.1882,
"num_input_tokens_seen": 1607480,
"step": 2585
},
{
"epoch": 4.616755793226382,
"grad_norm": 0.6784586310386658,
"learning_rate": 3.2601048809965355e-05,
"loss": 0.187,
"num_input_tokens_seen": 1610296,
"step": 2590
},
{
"epoch": 4.625668449197861,
"grad_norm": 0.4848667085170746,
"learning_rate": 3.2526916645361666e-05,
"loss": 0.1797,
"num_input_tokens_seen": 1613336,
"step": 2595
},
{
"epoch": 4.634581105169341,
"grad_norm": 0.4509483575820923,
"learning_rate": 3.2452711628038324e-05,
"loss": 0.159,
"num_input_tokens_seen": 1616152,
"step": 2600
},
{
"epoch": 4.64349376114082,
"grad_norm": 0.9891667366027832,
"learning_rate": 3.2378434476222666e-05,
"loss": 0.2153,
"num_input_tokens_seen": 1620024,
"step": 2605
},
{
"epoch": 4.652406417112299,
"grad_norm": 0.45274657011032104,
"learning_rate": 3.2304085908840244e-05,
"loss": 0.1975,
"num_input_tokens_seen": 1623544,
"step": 2610
},
{
"epoch": 4.661319073083779,
"grad_norm": 0.5668216943740845,
"learning_rate": 3.222966664550777e-05,
"loss": 0.1748,
"num_input_tokens_seen": 1626296,
"step": 2615
},
{
"epoch": 4.670231729055258,
"grad_norm": 0.6975745558738708,
"learning_rate": 3.2155177406526304e-05,
"loss": 0.1868,
"num_input_tokens_seen": 1629336,
"step": 2620
},
{
"epoch": 4.6791443850267385,
"grad_norm": 0.7208099961280823,
"learning_rate": 3.208061891287414e-05,
"loss": 0.214,
"num_input_tokens_seen": 1632888,
"step": 2625
},
{
"epoch": 4.688057040998218,
"grad_norm": 0.41192349791526794,
"learning_rate": 3.200599188619989e-05,
"loss": 0.1753,
"num_input_tokens_seen": 1635768,
"step": 2630
},
{
"epoch": 4.696969696969697,
"grad_norm": 1.2426398992538452,
"learning_rate": 3.1931297048815534e-05,
"loss": 0.2339,
"num_input_tokens_seen": 1639256,
"step": 2635
},
{
"epoch": 4.705882352941177,
"grad_norm": 0.4843774735927582,
"learning_rate": 3.185653512368933e-05,
"loss": 0.2591,
"num_input_tokens_seen": 1643128,
"step": 2640
},
{
"epoch": 4.714795008912656,
"grad_norm": 0.6016537547111511,
"learning_rate": 3.178170683443893e-05,
"loss": 0.1748,
"num_input_tokens_seen": 1646424,
"step": 2645
},
{
"epoch": 4.723707664884135,
"grad_norm": 0.5028678178787231,
"learning_rate": 3.1706812905324276e-05,
"loss": 0.1844,
"num_input_tokens_seen": 1649240,
"step": 2650
},
{
"epoch": 4.732620320855615,
"grad_norm": 0.694146454334259,
"learning_rate": 3.1631854061240684e-05,
"loss": 0.1668,
"num_input_tokens_seen": 1652184,
"step": 2655
},
{
"epoch": 4.741532976827094,
"grad_norm": 0.6105802655220032,
"learning_rate": 3.155683102771173e-05,
"loss": 0.2189,
"num_input_tokens_seen": 1655480,
"step": 2660
},
{
"epoch": 4.750445632798574,
"grad_norm": 0.8289818167686462,
"learning_rate": 3.1481744530882305e-05,
"loss": 0.2437,
"num_input_tokens_seen": 1659352,
"step": 2665
},
{
"epoch": 4.759358288770054,
"grad_norm": 0.5131431221961975,
"learning_rate": 3.1406595297511566e-05,
"loss": 0.1756,
"num_input_tokens_seen": 1661976,
"step": 2670
},
{
"epoch": 4.768270944741533,
"grad_norm": 0.6698647737503052,
"learning_rate": 3.133138405496587e-05,
"loss": 0.1713,
"num_input_tokens_seen": 1664504,
"step": 2675
},
{
"epoch": 4.777183600713013,
"grad_norm": 0.5975663065910339,
"learning_rate": 3.125611153121178e-05,
"loss": 0.1763,
"num_input_tokens_seen": 1667288,
"step": 2680
},
{
"epoch": 4.786096256684492,
"grad_norm": 0.5346847772598267,
"learning_rate": 3.118077845480897e-05,
"loss": 0.1686,
"num_input_tokens_seen": 1670360,
"step": 2685
},
{
"epoch": 4.795008912655971,
"grad_norm": 0.5491595268249512,
"learning_rate": 3.110538555490324e-05,
"loss": 0.1884,
"num_input_tokens_seen": 1673624,
"step": 2690
},
{
"epoch": 4.803921568627451,
"grad_norm": 0.35313117504119873,
"learning_rate": 3.1029933561219375e-05,
"loss": 0.1675,
"num_input_tokens_seen": 1676440,
"step": 2695
},
{
"epoch": 4.81283422459893,
"grad_norm": 0.5857532024383545,
"learning_rate": 3.095442320405418e-05,
"loss": 0.1637,
"num_input_tokens_seen": 1679448,
"step": 2700
},
{
"epoch": 4.8217468805704105,
"grad_norm": 0.6775690913200378,
"learning_rate": 3.0878855214269293e-05,
"loss": 0.1642,
"num_input_tokens_seen": 1682520,
"step": 2705
},
{
"epoch": 4.83065953654189,
"grad_norm": 0.5732465386390686,
"learning_rate": 3.0803230323284225e-05,
"loss": 0.1834,
"num_input_tokens_seen": 1685656,
"step": 2710
},
{
"epoch": 4.839572192513369,
"grad_norm": 1.1239274740219116,
"learning_rate": 3.0727549263069224e-05,
"loss": 0.2211,
"num_input_tokens_seen": 1688856,
"step": 2715
},
{
"epoch": 4.848484848484849,
"grad_norm": 0.8710312247276306,
"learning_rate": 3.065181276613817e-05,
"loss": 0.1483,
"num_input_tokens_seen": 1691768,
"step": 2720
},
{
"epoch": 4.857397504456328,
"grad_norm": 0.28014299273490906,
"learning_rate": 3.057602156554155e-05,
"loss": 0.1538,
"num_input_tokens_seen": 1694488,
"step": 2725
},
{
"epoch": 4.866310160427807,
"grad_norm": 0.5496522784233093,
"learning_rate": 3.0500176394859293e-05,
"loss": 0.2051,
"num_input_tokens_seen": 1697752,
"step": 2730
},
{
"epoch": 4.875222816399287,
"grad_norm": 0.673943817615509,
"learning_rate": 3.042427798819373e-05,
"loss": 0.1897,
"num_input_tokens_seen": 1700408,
"step": 2735
},
{
"epoch": 4.884135472370766,
"grad_norm": 0.7624504566192627,
"learning_rate": 3.0348327080162435e-05,
"loss": 0.1842,
"num_input_tokens_seen": 1703512,
"step": 2740
},
{
"epoch": 4.893048128342246,
"grad_norm": 0.5836613774299622,
"learning_rate": 3.0272324405891172e-05,
"loss": 0.1811,
"num_input_tokens_seen": 1707032,
"step": 2745
},
{
"epoch": 4.901960784313726,
"grad_norm": 0.6330267190933228,
"learning_rate": 3.0196270701006706e-05,
"loss": 0.1925,
"num_input_tokens_seen": 1710328,
"step": 2750
},
{
"epoch": 4.910873440285205,
"grad_norm": 0.764445960521698,
"learning_rate": 3.012016670162977e-05,
"loss": 0.1888,
"num_input_tokens_seen": 1712632,
"step": 2755
},
{
"epoch": 4.919786096256685,
"grad_norm": 0.3074583113193512,
"learning_rate": 3.0044013144367866e-05,
"loss": 0.2241,
"num_input_tokens_seen": 1716344,
"step": 2760
},
{
"epoch": 4.928698752228164,
"grad_norm": 0.4822777509689331,
"learning_rate": 2.996781076630816e-05,
"loss": 0.1661,
"num_input_tokens_seen": 1718712,
"step": 2765
},
{
"epoch": 4.937611408199643,
"grad_norm": 0.56252521276474,
"learning_rate": 2.9891560305010392e-05,
"loss": 0.1863,
"num_input_tokens_seen": 1722328,
"step": 2770
},
{
"epoch": 4.946524064171123,
"grad_norm": 0.5701931118965149,
"learning_rate": 2.9815262498499657e-05,
"loss": 0.2022,
"num_input_tokens_seen": 1725464,
"step": 2775
},
{
"epoch": 4.955436720142602,
"grad_norm": 0.6118953227996826,
"learning_rate": 2.9738918085259314e-05,
"loss": 0.1703,
"num_input_tokens_seen": 1728472,
"step": 2780
},
{
"epoch": 4.9643493761140824,
"grad_norm": 0.43155810236930847,
"learning_rate": 2.9662527804223827e-05,
"loss": 0.1658,
"num_input_tokens_seen": 1731160,
"step": 2785
},
{
"epoch": 4.973262032085562,
"grad_norm": 0.622303307056427,
"learning_rate": 2.9586092394771637e-05,
"loss": 0.2174,
"num_input_tokens_seen": 1734264,
"step": 2790
},
{
"epoch": 4.982174688057041,
"grad_norm": 0.592126727104187,
"learning_rate": 2.950961259671793e-05,
"loss": 0.1573,
"num_input_tokens_seen": 1737144,
"step": 2795
},
{
"epoch": 4.991087344028521,
"grad_norm": 0.4473949372768402,
"learning_rate": 2.943308915030757e-05,
"loss": 0.1619,
"num_input_tokens_seen": 1740664,
"step": 2800
},
{
"epoch": 5.0,
"grad_norm": 1.4496628046035767,
"learning_rate": 2.935652279620788e-05,
"loss": 0.194,
"num_input_tokens_seen": 1743216,
"step": 2805
},
{
"epoch": 5.008912655971479,
"grad_norm": 0.5206677913665771,
"learning_rate": 2.9279914275501473e-05,
"loss": 0.2055,
"num_input_tokens_seen": 1746384,
"step": 2810
},
{
"epoch": 5.008912655971479,
"eval_loss": 0.19685669243335724,
"eval_runtime": 4.2355,
"eval_samples_per_second": 58.788,
"eval_steps_per_second": 14.874,
"num_input_tokens_seen": 1746384,
"step": 2810
},
{
"epoch": 5.017825311942959,
"grad_norm": 0.46784770488739014,
"learning_rate": 2.9203264329679115e-05,
"loss": 0.1835,
"num_input_tokens_seen": 1749680,
"step": 2815
},
{
"epoch": 5.026737967914438,
"grad_norm": 0.9836930632591248,
"learning_rate": 2.9126573700632504e-05,
"loss": 0.1855,
"num_input_tokens_seen": 1753104,
"step": 2820
},
{
"epoch": 5.035650623885918,
"grad_norm": 0.48144713044166565,
"learning_rate": 2.9049843130647112e-05,
"loss": 0.1857,
"num_input_tokens_seen": 1756112,
"step": 2825
},
{
"epoch": 5.044563279857398,
"grad_norm": 0.49128931760787964,
"learning_rate": 2.8973073362394998e-05,
"loss": 0.1802,
"num_input_tokens_seen": 1759344,
"step": 2830
},
{
"epoch": 5.053475935828877,
"grad_norm": 0.4599247872829437,
"learning_rate": 2.8896265138927638e-05,
"loss": 0.1939,
"num_input_tokens_seen": 1762288,
"step": 2835
},
{
"epoch": 5.062388591800357,
"grad_norm": 0.4987725615501404,
"learning_rate": 2.881941920366868e-05,
"loss": 0.1583,
"num_input_tokens_seen": 1765072,
"step": 2840
},
{
"epoch": 5.071301247771836,
"grad_norm": 0.4939536452293396,
"learning_rate": 2.8742536300406804e-05,
"loss": 0.2022,
"num_input_tokens_seen": 1767952,
"step": 2845
},
{
"epoch": 5.080213903743315,
"grad_norm": 0.2937607765197754,
"learning_rate": 2.8665617173288516e-05,
"loss": 0.1696,
"num_input_tokens_seen": 1770896,
"step": 2850
},
{
"epoch": 5.089126559714795,
"grad_norm": 0.6866093277931213,
"learning_rate": 2.8588662566810893e-05,
"loss": 0.1683,
"num_input_tokens_seen": 1773840,
"step": 2855
},
{
"epoch": 5.098039215686274,
"grad_norm": 0.5026021003723145,
"learning_rate": 2.851167322581445e-05,
"loss": 0.1924,
"num_input_tokens_seen": 1776720,
"step": 2860
},
{
"epoch": 5.106951871657754,
"grad_norm": 0.5058155059814453,
"learning_rate": 2.8434649895475877e-05,
"loss": 0.1572,
"num_input_tokens_seen": 1779088,
"step": 2865
},
{
"epoch": 5.115864527629234,
"grad_norm": 0.47404804825782776,
"learning_rate": 2.8357593321300856e-05,
"loss": 0.1753,
"num_input_tokens_seen": 1781776,
"step": 2870
},
{
"epoch": 5.124777183600713,
"grad_norm": 0.5163501501083374,
"learning_rate": 2.828050424911683e-05,
"loss": 0.1685,
"num_input_tokens_seen": 1784720,
"step": 2875
},
{
"epoch": 5.133689839572193,
"grad_norm": 0.6680046319961548,
"learning_rate": 2.8203383425065787e-05,
"loss": 0.1854,
"num_input_tokens_seen": 1787856,
"step": 2880
},
{
"epoch": 5.142602495543672,
"grad_norm": 0.47441810369491577,
"learning_rate": 2.812623159559704e-05,
"loss": 0.1793,
"num_input_tokens_seen": 1791088,
"step": 2885
},
{
"epoch": 5.151515151515151,
"grad_norm": 0.4247751533985138,
"learning_rate": 2.8049049507460003e-05,
"loss": 0.2227,
"num_input_tokens_seen": 1795056,
"step": 2890
},
{
"epoch": 5.160427807486631,
"grad_norm": 0.4086715281009674,
"learning_rate": 2.7971837907696973e-05,
"loss": 0.2894,
"num_input_tokens_seen": 1798928,
"step": 2895
},
{
"epoch": 5.16934046345811,
"grad_norm": 0.48060083389282227,
"learning_rate": 2.7894597543635863e-05,
"loss": 0.1778,
"num_input_tokens_seen": 1802384,
"step": 2900
},
{
"epoch": 5.17825311942959,
"grad_norm": 0.5457305312156677,
"learning_rate": 2.781732916288303e-05,
"loss": 0.1873,
"num_input_tokens_seen": 1805616,
"step": 2905
},
{
"epoch": 5.18716577540107,
"grad_norm": 0.7138332724571228,
"learning_rate": 2.774003351331597e-05,
"loss": 0.1532,
"num_input_tokens_seen": 1809008,
"step": 2910
},
{
"epoch": 5.196078431372549,
"grad_norm": 0.5133665204048157,
"learning_rate": 2.7662711343076135e-05,
"loss": 0.1604,
"num_input_tokens_seen": 1812784,
"step": 2915
},
{
"epoch": 5.204991087344029,
"grad_norm": 0.48487603664398193,
"learning_rate": 2.7585363400561658e-05,
"loss": 0.155,
"num_input_tokens_seen": 1815248,
"step": 2920
},
{
"epoch": 5.213903743315508,
"grad_norm": 0.5267552137374878,
"learning_rate": 2.7507990434420126e-05,
"loss": 0.186,
"num_input_tokens_seen": 1818032,
"step": 2925
},
{
"epoch": 5.222816399286987,
"grad_norm": 0.45045390725135803,
"learning_rate": 2.7430593193541325e-05,
"loss": 0.1804,
"num_input_tokens_seen": 1821232,
"step": 2930
},
{
"epoch": 5.231729055258467,
"grad_norm": 0.5850667953491211,
"learning_rate": 2.7353172427049995e-05,
"loss": 0.2057,
"num_input_tokens_seen": 1824784,
"step": 2935
},
{
"epoch": 5.240641711229946,
"grad_norm": 0.4316384792327881,
"learning_rate": 2.7275728884298596e-05,
"loss": 0.1754,
"num_input_tokens_seen": 1827088,
"step": 2940
},
{
"epoch": 5.249554367201426,
"grad_norm": 0.350407212972641,
"learning_rate": 2.719826331486e-05,
"loss": 0.1627,
"num_input_tokens_seen": 1829328,
"step": 2945
},
{
"epoch": 5.258467023172906,
"grad_norm": 0.6626913547515869,
"learning_rate": 2.7120776468520314e-05,
"loss": 0.2147,
"num_input_tokens_seen": 1833136,
"step": 2950
},
{
"epoch": 5.267379679144385,
"grad_norm": 0.711764931678772,
"learning_rate": 2.7043269095271573e-05,
"loss": 0.185,
"num_input_tokens_seen": 1835632,
"step": 2955
},
{
"epoch": 5.276292335115865,
"grad_norm": 0.5972061157226562,
"learning_rate": 2.6965741945304467e-05,
"loss": 0.199,
"num_input_tokens_seen": 1838992,
"step": 2960
},
{
"epoch": 5.285204991087344,
"grad_norm": 0.9157897233963013,
"learning_rate": 2.6888195769001146e-05,
"loss": 0.1782,
"num_input_tokens_seen": 1841840,
"step": 2965
},
{
"epoch": 5.294117647058823,
"grad_norm": 0.4935537874698639,
"learning_rate": 2.681063131692787e-05,
"loss": 0.1843,
"num_input_tokens_seen": 1844560,
"step": 2970
},
{
"epoch": 5.303030303030303,
"grad_norm": 0.5020252466201782,
"learning_rate": 2.673304933982783e-05,
"loss": 0.1891,
"num_input_tokens_seen": 1848624,
"step": 2975
},
{
"epoch": 5.311942959001782,
"grad_norm": 0.5348985195159912,
"learning_rate": 2.6655450588613806e-05,
"loss": 0.1925,
"num_input_tokens_seen": 1851952,
"step": 2980
},
{
"epoch": 5.320855614973262,
"grad_norm": 0.42828452587127686,
"learning_rate": 2.657783581436097e-05,
"loss": 0.2381,
"num_input_tokens_seen": 1855696,
"step": 2985
},
{
"epoch": 5.329768270944742,
"grad_norm": 0.6298767328262329,
"learning_rate": 2.6500205768299535e-05,
"loss": 0.193,
"num_input_tokens_seen": 1859408,
"step": 2990
},
{
"epoch": 5.338680926916221,
"grad_norm": 0.6732975244522095,
"learning_rate": 2.642256120180758e-05,
"loss": 0.1508,
"num_input_tokens_seen": 1861936,
"step": 2995
},
{
"epoch": 5.347593582887701,
"grad_norm": 0.6173202991485596,
"learning_rate": 2.6344902866403687e-05,
"loss": 0.1724,
"num_input_tokens_seen": 1864624,
"step": 3000
},
{
"epoch": 5.35650623885918,
"grad_norm": 0.4392896890640259,
"learning_rate": 2.6267231513739726e-05,
"loss": 0.2092,
"num_input_tokens_seen": 1867600,
"step": 3005
},
{
"epoch": 5.365418894830659,
"grad_norm": 0.621001660823822,
"learning_rate": 2.6189547895593562e-05,
"loss": 0.1982,
"num_input_tokens_seen": 1870672,
"step": 3010
},
{
"epoch": 5.374331550802139,
"grad_norm": 0.5161955952644348,
"learning_rate": 2.611185276386176e-05,
"loss": 0.1923,
"num_input_tokens_seen": 1874160,
"step": 3015
},
{
"epoch": 5.383244206773618,
"grad_norm": 0.5126301050186157,
"learning_rate": 2.6034146870552346e-05,
"loss": 0.1906,
"num_input_tokens_seen": 1877616,
"step": 3020
},
{
"epoch": 5.392156862745098,
"grad_norm": 0.6807987093925476,
"learning_rate": 2.595643096777748e-05,
"loss": 0.1862,
"num_input_tokens_seen": 1880432,
"step": 3025
},
{
"epoch": 5.401069518716578,
"grad_norm": 0.6361598372459412,
"learning_rate": 2.5878705807746245e-05,
"loss": 0.2137,
"num_input_tokens_seen": 1884528,
"step": 3030
},
{
"epoch": 5.409982174688057,
"grad_norm": 0.6302884221076965,
"learning_rate": 2.580097214275727e-05,
"loss": 0.1688,
"num_input_tokens_seen": 1887152,
"step": 3035
},
{
"epoch": 5.418894830659537,
"grad_norm": 0.5410829186439514,
"learning_rate": 2.5723230725191554e-05,
"loss": 0.1772,
"num_input_tokens_seen": 1890032,
"step": 3040
},
{
"epoch": 5.427807486631016,
"grad_norm": 0.5092021822929382,
"learning_rate": 2.5645482307505108e-05,
"loss": 0.1677,
"num_input_tokens_seen": 1892304,
"step": 3045
},
{
"epoch": 5.436720142602495,
"grad_norm": 0.7809433937072754,
"learning_rate": 2.55677276422217e-05,
"loss": 0.1875,
"num_input_tokens_seen": 1895728,
"step": 3050
},
{
"epoch": 5.445632798573975,
"grad_norm": 0.43497583270072937,
"learning_rate": 2.548996748192556e-05,
"loss": 0.167,
"num_input_tokens_seen": 1898384,
"step": 3055
},
{
"epoch": 5.454545454545454,
"grad_norm": 0.36343979835510254,
"learning_rate": 2.541220257925412e-05,
"loss": 0.1719,
"num_input_tokens_seen": 1901104,
"step": 3060
},
{
"epoch": 5.463458110516934,
"grad_norm": 0.6379041075706482,
"learning_rate": 2.5334433686890702e-05,
"loss": 0.1879,
"num_input_tokens_seen": 1904976,
"step": 3065
},
{
"epoch": 5.472370766488414,
"grad_norm": 0.501068651676178,
"learning_rate": 2.5256661557557247e-05,
"loss": 0.1898,
"num_input_tokens_seen": 1908688,
"step": 3070
},
{
"epoch": 5.481283422459893,
"grad_norm": 0.4064844250679016,
"learning_rate": 2.517888694400704e-05,
"loss": 0.1471,
"num_input_tokens_seen": 1911792,
"step": 3075
},
{
"epoch": 5.490196078431373,
"grad_norm": 0.7375326156616211,
"learning_rate": 2.5101110599017374e-05,
"loss": 0.223,
"num_input_tokens_seen": 1915248,
"step": 3080
},
{
"epoch": 5.499108734402852,
"grad_norm": 0.7120162844657898,
"learning_rate": 2.502333327538235e-05,
"loss": 0.1666,
"num_input_tokens_seen": 1918544,
"step": 3085
},
{
"epoch": 5.508021390374331,
"grad_norm": 0.4658108353614807,
"learning_rate": 2.4945555725905502e-05,
"loss": 0.2039,
"num_input_tokens_seen": 1922032,
"step": 3090
},
{
"epoch": 5.509803921568627,
"eval_loss": 0.19006255269050598,
"eval_runtime": 4.2606,
"eval_samples_per_second": 58.442,
"eval_steps_per_second": 14.787,
"num_input_tokens_seen": 1922384,
"step": 3091
},
{
"epoch": 5.516934046345811,
"grad_norm": 0.6522291898727417,
"learning_rate": 2.4867778703392554e-05,
"loss": 0.1586,
"num_input_tokens_seen": 1924400,
"step": 3095
},
{
"epoch": 5.52584670231729,
"grad_norm": 0.5256299376487732,
"learning_rate": 2.479000296064417e-05,
"loss": 0.2169,
"num_input_tokens_seen": 1927376,
"step": 3100
},
{
"epoch": 5.53475935828877,
"grad_norm": 0.5868116021156311,
"learning_rate": 2.4712229250448567e-05,
"loss": 0.1768,
"num_input_tokens_seen": 1930352,
"step": 3105
},
{
"epoch": 5.54367201426025,
"grad_norm": 0.6082111597061157,
"learning_rate": 2.4634458325574323e-05,
"loss": 0.2153,
"num_input_tokens_seen": 1933680,
"step": 3110
},
{
"epoch": 5.552584670231729,
"grad_norm": 0.5021962523460388,
"learning_rate": 2.4556690938763062e-05,
"loss": 0.1667,
"num_input_tokens_seen": 1937488,
"step": 3115
},
{
"epoch": 5.561497326203209,
"grad_norm": 0.5544887781143188,
"learning_rate": 2.4478927842722154e-05,
"loss": 0.1854,
"num_input_tokens_seen": 1940368,
"step": 3120
},
{
"epoch": 5.570409982174688,
"grad_norm": 0.6153222322463989,
"learning_rate": 2.4401169790117427e-05,
"loss": 0.1775,
"num_input_tokens_seen": 1943728,
"step": 3125
},
{
"epoch": 5.579322638146167,
"grad_norm": 0.7217985987663269,
"learning_rate": 2.4323417533565916e-05,
"loss": 0.1929,
"num_input_tokens_seen": 1946832,
"step": 3130
},
{
"epoch": 5.588235294117647,
"grad_norm": 0.5232107639312744,
"learning_rate": 2.424567182562854e-05,
"loss": 0.205,
"num_input_tokens_seen": 1949904,
"step": 3135
},
{
"epoch": 5.597147950089127,
"grad_norm": 0.5853015184402466,
"learning_rate": 2.4167933418802837e-05,
"loss": 0.1431,
"num_input_tokens_seen": 1952432,
"step": 3140
},
{
"epoch": 5.606060606060606,
"grad_norm": 0.7414368391036987,
"learning_rate": 2.4090203065515695e-05,
"loss": 0.1622,
"num_input_tokens_seen": 1955216,
"step": 3145
},
{
"epoch": 5.614973262032086,
"grad_norm": 0.4388047456741333,
"learning_rate": 2.4012481518116022e-05,
"loss": 0.1707,
"num_input_tokens_seen": 1958096,
"step": 3150
},
{
"epoch": 5.623885918003565,
"grad_norm": 0.5946722626686096,
"learning_rate": 2.3934769528867513e-05,
"loss": 0.198,
"num_input_tokens_seen": 1961456,
"step": 3155
},
{
"epoch": 5.632798573975045,
"grad_norm": 0.4028293192386627,
"learning_rate": 2.385706784994135e-05,
"loss": 0.162,
"num_input_tokens_seen": 1964272,
"step": 3160
},
{
"epoch": 5.641711229946524,
"grad_norm": 0.4915693700313568,
"learning_rate": 2.3779377233408923e-05,
"loss": 0.192,
"num_input_tokens_seen": 1967120,
"step": 3165
},
{
"epoch": 5.650623885918003,
"grad_norm": 0.4452253580093384,
"learning_rate": 2.3701698431234528e-05,
"loss": 0.1601,
"num_input_tokens_seen": 1969872,
"step": 3170
},
{
"epoch": 5.659536541889483,
"grad_norm": 0.5284585356712341,
"learning_rate": 2.362403219526815e-05,
"loss": 0.1605,
"num_input_tokens_seen": 1972944,
"step": 3175
},
{
"epoch": 5.668449197860962,
"grad_norm": 0.48784369230270386,
"learning_rate": 2.3546379277238107e-05,
"loss": 0.1533,
"num_input_tokens_seen": 1975888,
"step": 3180
},
{
"epoch": 5.677361853832442,
"grad_norm": 0.5844167470932007,
"learning_rate": 2.3468740428743833e-05,
"loss": 0.1903,
"num_input_tokens_seen": 1979088,
"step": 3185
},
{
"epoch": 5.686274509803922,
"grad_norm": 0.6798781752586365,
"learning_rate": 2.339111640124859e-05,
"loss": 0.171,
"num_input_tokens_seen": 1981520,
"step": 3190
},
{
"epoch": 5.695187165775401,
"grad_norm": 0.8696448802947998,
"learning_rate": 2.3313507946072172e-05,
"loss": 0.1648,
"num_input_tokens_seen": 1984880,
"step": 3195
},
{
"epoch": 5.704099821746881,
"grad_norm": 0.4180395007133484,
"learning_rate": 2.323591581438365e-05,
"loss": 0.1617,
"num_input_tokens_seen": 1987440,
"step": 3200
},
{
"epoch": 5.71301247771836,
"grad_norm": 0.6146518588066101,
"learning_rate": 2.3158340757194116e-05,
"loss": 0.1963,
"num_input_tokens_seen": 1990640,
"step": 3205
},
{
"epoch": 5.721925133689839,
"grad_norm": 0.8348390460014343,
"learning_rate": 2.3080783525349388e-05,
"loss": 0.1653,
"num_input_tokens_seen": 1993808,
"step": 3210
},
{
"epoch": 5.730837789661319,
"grad_norm": 0.7081406712532043,
"learning_rate": 2.3003244869522743e-05,
"loss": 0.1779,
"num_input_tokens_seen": 1996688,
"step": 3215
},
{
"epoch": 5.739750445632799,
"grad_norm": 0.5054243206977844,
"learning_rate": 2.2925725540207688e-05,
"loss": 0.1565,
"num_input_tokens_seen": 1999696,
"step": 3220
},
{
"epoch": 5.748663101604278,
"grad_norm": 0.5454304814338684,
"learning_rate": 2.2848226287710645e-05,
"loss": 0.1536,
"num_input_tokens_seen": 2002032,
"step": 3225
},
{
"epoch": 5.757575757575758,
"grad_norm": 0.6999877095222473,
"learning_rate": 2.277074786214372e-05,
"loss": 0.1683,
"num_input_tokens_seen": 2005584,
"step": 3230
},
{
"epoch": 5.766488413547237,
"grad_norm": 0.765386164188385,
"learning_rate": 2.2693291013417453e-05,
"loss": 0.1592,
"num_input_tokens_seen": 2008176,
"step": 3235
},
{
"epoch": 5.775401069518717,
"grad_norm": 0.7968612909317017,
"learning_rate": 2.2615856491233513e-05,
"loss": 0.3207,
"num_input_tokens_seen": 2011376,
"step": 3240
},
{
"epoch": 5.784313725490196,
"grad_norm": 0.3482127785682678,
"learning_rate": 2.2538445045077488e-05,
"loss": 0.1455,
"num_input_tokens_seen": 2014224,
"step": 3245
},
{
"epoch": 5.793226381461675,
"grad_norm": 0.5806959271430969,
"learning_rate": 2.246105742421162e-05,
"loss": 0.1741,
"num_input_tokens_seen": 2016912,
"step": 3250
},
{
"epoch": 5.802139037433155,
"grad_norm": 0.7654284834861755,
"learning_rate": 2.2383694377667543e-05,
"loss": 0.1575,
"num_input_tokens_seen": 2020048,
"step": 3255
},
{
"epoch": 5.811051693404634,
"grad_norm": 0.642106831073761,
"learning_rate": 2.2306356654239012e-05,
"loss": 0.1756,
"num_input_tokens_seen": 2023216,
"step": 3260
},
{
"epoch": 5.819964349376114,
"grad_norm": 0.43349790573120117,
"learning_rate": 2.222904500247473e-05,
"loss": 0.1924,
"num_input_tokens_seen": 2026928,
"step": 3265
},
{
"epoch": 5.828877005347594,
"grad_norm": 0.4377082884311676,
"learning_rate": 2.2151760170671004e-05,
"loss": 0.1696,
"num_input_tokens_seen": 2029584,
"step": 3270
},
{
"epoch": 5.837789661319073,
"grad_norm": 0.40771257877349854,
"learning_rate": 2.207450290686458e-05,
"loss": 0.1603,
"num_input_tokens_seen": 2032720,
"step": 3275
},
{
"epoch": 5.846702317290553,
"grad_norm": 0.5143370628356934,
"learning_rate": 2.1997273958825375e-05,
"loss": 0.1845,
"num_input_tokens_seen": 2036176,
"step": 3280
},
{
"epoch": 5.855614973262032,
"grad_norm": 0.5394704341888428,
"learning_rate": 2.1920074074049225e-05,
"loss": 0.1801,
"num_input_tokens_seen": 2039632,
"step": 3285
},
{
"epoch": 5.864527629233511,
"grad_norm": 0.6020737290382385,
"learning_rate": 2.1842903999750665e-05,
"loss": 0.1862,
"num_input_tokens_seen": 2043184,
"step": 3290
},
{
"epoch": 5.873440285204991,
"grad_norm": 0.7539795637130737,
"learning_rate": 2.1765764482855715e-05,
"loss": 0.1628,
"num_input_tokens_seen": 2046416,
"step": 3295
},
{
"epoch": 5.882352941176471,
"grad_norm": 0.6914777755737305,
"learning_rate": 2.1688656269994612e-05,
"loss": 0.1768,
"num_input_tokens_seen": 2049008,
"step": 3300
},
{
"epoch": 5.89126559714795,
"grad_norm": 1.2212262153625488,
"learning_rate": 2.1611580107494597e-05,
"loss": 0.1982,
"num_input_tokens_seen": 2052656,
"step": 3305
},
{
"epoch": 5.90017825311943,
"grad_norm": 0.5432605743408203,
"learning_rate": 2.153453674137272e-05,
"loss": 0.1885,
"num_input_tokens_seen": 2055888,
"step": 3310
},
{
"epoch": 5.909090909090909,
"grad_norm": 0.5268386006355286,
"learning_rate": 2.1457526917328588e-05,
"loss": 0.1492,
"num_input_tokens_seen": 2059056,
"step": 3315
},
{
"epoch": 5.918003565062389,
"grad_norm": 0.8248959183692932,
"learning_rate": 2.1380551380737128e-05,
"loss": 0.1755,
"num_input_tokens_seen": 2062096,
"step": 3320
},
{
"epoch": 5.926916221033868,
"grad_norm": 0.5520910024642944,
"learning_rate": 2.130361087664145e-05,
"loss": 0.1899,
"num_input_tokens_seen": 2065168,
"step": 3325
},
{
"epoch": 5.935828877005347,
"grad_norm": 0.5292351841926575,
"learning_rate": 2.122670614974555e-05,
"loss": 0.1983,
"num_input_tokens_seen": 2067856,
"step": 3330
},
{
"epoch": 5.944741532976827,
"grad_norm": 0.8153255581855774,
"learning_rate": 2.1149837944407136e-05,
"loss": 0.1517,
"num_input_tokens_seen": 2071056,
"step": 3335
},
{
"epoch": 5.953654188948306,
"grad_norm": 0.7868825197219849,
"learning_rate": 2.107300700463045e-05,
"loss": 0.193,
"num_input_tokens_seen": 2074192,
"step": 3340
},
{
"epoch": 5.962566844919786,
"grad_norm": 0.39180079102516174,
"learning_rate": 2.0996214074059034e-05,
"loss": 0.166,
"num_input_tokens_seen": 2077040,
"step": 3345
},
{
"epoch": 5.971479500891266,
"grad_norm": 0.5239204168319702,
"learning_rate": 2.0919459895968517e-05,
"loss": 0.1395,
"num_input_tokens_seen": 2079312,
"step": 3350
},
{
"epoch": 5.980392156862745,
"grad_norm": 0.4734959304332733,
"learning_rate": 2.084274521325948e-05,
"loss": 0.1701,
"num_input_tokens_seen": 2082864,
"step": 3355
},
{
"epoch": 5.989304812834225,
"grad_norm": 0.6230949759483337,
"learning_rate": 2.0766070768450206e-05,
"loss": 0.1928,
"num_input_tokens_seen": 2085872,
"step": 3360
},
{
"epoch": 5.998217468805704,
"grad_norm": 0.6036242246627808,
"learning_rate": 2.0689437303669508e-05,
"loss": 0.1673,
"num_input_tokens_seen": 2088272,
"step": 3365
},
{
"epoch": 6.007130124777183,
"grad_norm": 0.6001238822937012,
"learning_rate": 2.0612845560649603e-05,
"loss": 0.1752,
"num_input_tokens_seen": 2091232,
"step": 3370
},
{
"epoch": 6.010695187165775,
"eval_loss": 0.19044770300388336,
"eval_runtime": 4.266,
"eval_samples_per_second": 58.369,
"eval_steps_per_second": 14.768,
"num_input_tokens_seen": 2092320,
"step": 3372
},
{
"epoch": 6.016042780748663,
"grad_norm": 0.9030793309211731,
"learning_rate": 2.0536296280718825e-05,
"loss": 0.1664,
"num_input_tokens_seen": 2093952,
"step": 3375
},
{
"epoch": 6.024955436720143,
"grad_norm": 0.6371573209762573,
"learning_rate": 2.0459790204794545e-05,
"loss": 0.1941,
"num_input_tokens_seen": 2097728,
"step": 3380
},
{
"epoch": 6.033868092691622,
"grad_norm": 0.4168316125869751,
"learning_rate": 2.0383328073375955e-05,
"loss": 0.2223,
"num_input_tokens_seen": 2100736,
"step": 3385
},
{
"epoch": 6.042780748663102,
"grad_norm": 0.8262919187545776,
"learning_rate": 2.0306910626536926e-05,
"loss": 0.1762,
"num_input_tokens_seen": 2104032,
"step": 3390
},
{
"epoch": 6.051693404634581,
"grad_norm": 0.482316255569458,
"learning_rate": 2.0230538603918787e-05,
"loss": 0.1594,
"num_input_tokens_seen": 2107264,
"step": 3395
},
{
"epoch": 6.0606060606060606,
"grad_norm": 1.0964471101760864,
"learning_rate": 2.015421274472325e-05,
"loss": 0.1881,
"num_input_tokens_seen": 2110336,
"step": 3400
},
{
"epoch": 6.06951871657754,
"grad_norm": 0.49298667907714844,
"learning_rate": 2.0077933787705204e-05,
"loss": 0.151,
"num_input_tokens_seen": 2113248,
"step": 3405
},
{
"epoch": 6.078431372549019,
"grad_norm": 0.6304886341094971,
"learning_rate": 2.000170247116554e-05,
"loss": 0.1657,
"num_input_tokens_seen": 2116032,
"step": 3410
},
{
"epoch": 6.087344028520499,
"grad_norm": 0.4530024230480194,
"learning_rate": 1.9925519532944104e-05,
"loss": 0.1692,
"num_input_tokens_seen": 2118848,
"step": 3415
},
{
"epoch": 6.096256684491979,
"grad_norm": 0.5926321744918823,
"learning_rate": 1.9849385710412424e-05,
"loss": 0.3085,
"num_input_tokens_seen": 2122208,
"step": 3420
},
{
"epoch": 6.105169340463458,
"grad_norm": 0.5866901874542236,
"learning_rate": 1.977330174046667e-05,
"loss": 0.1675,
"num_input_tokens_seen": 2125248,
"step": 3425
},
{
"epoch": 6.114081996434938,
"grad_norm": 0.35337719321250916,
"learning_rate": 1.9697268359520506e-05,
"loss": 0.2589,
"num_input_tokens_seen": 2129248,
"step": 3430
},
{
"epoch": 6.122994652406417,
"grad_norm": 0.4666219651699066,
"learning_rate": 1.9621286303497915e-05,
"loss": 0.1709,
"num_input_tokens_seen": 2131904,
"step": 3435
},
{
"epoch": 6.1319073083778965,
"grad_norm": 0.6858420372009277,
"learning_rate": 1.954535630782612e-05,
"loss": 0.183,
"num_input_tokens_seen": 2135552,
"step": 3440
},
{
"epoch": 6.140819964349376,
"grad_norm": 0.41474148631095886,
"learning_rate": 1.9469479107428463e-05,
"loss": 0.1723,
"num_input_tokens_seen": 2138688,
"step": 3445
},
{
"epoch": 6.149732620320855,
"grad_norm": 0.60605388879776,
"learning_rate": 1.9393655436717283e-05,
"loss": 0.1506,
"num_input_tokens_seen": 2141248,
"step": 3450
},
{
"epoch": 6.158645276292335,
"grad_norm": 0.9076442122459412,
"learning_rate": 1.9317886029586778e-05,
"loss": 0.2039,
"num_input_tokens_seen": 2144768,
"step": 3455
},
{
"epoch": 6.167557932263815,
"grad_norm": 0.9373259544372559,
"learning_rate": 1.9242171619405986e-05,
"loss": 0.1797,
"num_input_tokens_seen": 2147552,
"step": 3460
},
{
"epoch": 6.176470588235294,
"grad_norm": 0.6851420998573303,
"learning_rate": 1.916651293901157e-05,
"loss": 0.1825,
"num_input_tokens_seen": 2151040,
"step": 3465
},
{
"epoch": 6.185383244206774,
"grad_norm": 0.6892784833908081,
"learning_rate": 1.909091072070083e-05,
"loss": 0.171,
"num_input_tokens_seen": 2155040,
"step": 3470
},
{
"epoch": 6.194295900178253,
"grad_norm": 0.6285828948020935,
"learning_rate": 1.9015365696224564e-05,
"loss": 0.158,
"num_input_tokens_seen": 2157824,
"step": 3475
},
{
"epoch": 6.2032085561497325,
"grad_norm": 0.5884494781494141,
"learning_rate": 1.893987859677997e-05,
"loss": 0.181,
"num_input_tokens_seen": 2160672,
"step": 3480
},
{
"epoch": 6.212121212121212,
"grad_norm": 0.7425735592842102,
"learning_rate": 1.886445015300362e-05,
"loss": 0.1473,
"num_input_tokens_seen": 2163552,
"step": 3485
},
{
"epoch": 6.221033868092691,
"grad_norm": 0.39105650782585144,
"learning_rate": 1.8789081094964347e-05,
"loss": 0.1441,
"num_input_tokens_seen": 2167456,
"step": 3490
},
{
"epoch": 6.229946524064171,
"grad_norm": 0.30422699451446533,
"learning_rate": 1.8713772152156205e-05,
"loss": 0.1294,
"num_input_tokens_seen": 2170560,
"step": 3495
},
{
"epoch": 6.238859180035651,
"grad_norm": 0.7964766621589661,
"learning_rate": 1.863852405349135e-05,
"loss": 0.1838,
"num_input_tokens_seen": 2173152,
"step": 3500
},
{
"epoch": 6.24777183600713,
"grad_norm": 0.6463519334793091,
"learning_rate": 1.856333752729311e-05,
"loss": 0.1637,
"num_input_tokens_seen": 2175808,
"step": 3505
},
{
"epoch": 6.25668449197861,
"grad_norm": 0.8007080554962158,
"learning_rate": 1.848821330128878e-05,
"loss": 0.1717,
"num_input_tokens_seen": 2178304,
"step": 3510
},
{
"epoch": 6.265597147950089,
"grad_norm": 1.0539445877075195,
"learning_rate": 1.8413152102602687e-05,
"loss": 0.1892,
"num_input_tokens_seen": 2181312,
"step": 3515
},
{
"epoch": 6.2745098039215685,
"grad_norm": 0.6273789405822754,
"learning_rate": 1.8338154657749128e-05,
"loss": 0.1699,
"num_input_tokens_seen": 2184128,
"step": 3520
},
{
"epoch": 6.283422459893048,
"grad_norm": 0.5192899703979492,
"learning_rate": 1.826322169262531e-05,
"loss": 0.1772,
"num_input_tokens_seen": 2187584,
"step": 3525
},
{
"epoch": 6.292335115864527,
"grad_norm": 0.6465858221054077,
"learning_rate": 1.818835393250434e-05,
"loss": 0.1814,
"num_input_tokens_seen": 2191168,
"step": 3530
},
{
"epoch": 6.301247771836007,
"grad_norm": 0.5996541380882263,
"learning_rate": 1.8113552102028236e-05,
"loss": 0.1888,
"num_input_tokens_seen": 2194880,
"step": 3535
},
{
"epoch": 6.310160427807487,
"grad_norm": 0.3005512058734894,
"learning_rate": 1.803881692520087e-05,
"loss": 0.1483,
"num_input_tokens_seen": 2197184,
"step": 3540
},
{
"epoch": 6.319073083778966,
"grad_norm": 0.4426136016845703,
"learning_rate": 1.796414912538095e-05,
"loss": 0.162,
"num_input_tokens_seen": 2200160,
"step": 3545
},
{
"epoch": 6.327985739750446,
"grad_norm": 0.7000912427902222,
"learning_rate": 1.7889549425275093e-05,
"loss": 0.1686,
"num_input_tokens_seen": 2203776,
"step": 3550
},
{
"epoch": 6.336898395721925,
"grad_norm": 0.5500680804252625,
"learning_rate": 1.7815018546930754e-05,
"loss": 0.1716,
"num_input_tokens_seen": 2207104,
"step": 3555
},
{
"epoch": 6.3458110516934045,
"grad_norm": 0.5378794074058533,
"learning_rate": 1.7740557211729258e-05,
"loss": 0.1653,
"num_input_tokens_seen": 2210400,
"step": 3560
},
{
"epoch": 6.354723707664884,
"grad_norm": 0.20100829005241394,
"learning_rate": 1.7666166140378852e-05,
"loss": 0.1604,
"num_input_tokens_seen": 2213728,
"step": 3565
},
{
"epoch": 6.363636363636363,
"grad_norm": 0.33214375376701355,
"learning_rate": 1.7591846052907673e-05,
"loss": 0.1524,
"num_input_tokens_seen": 2216416,
"step": 3570
},
{
"epoch": 6.372549019607844,
"grad_norm": 1.197052240371704,
"learning_rate": 1.7517597668656823e-05,
"loss": 0.1849,
"num_input_tokens_seen": 2219328,
"step": 3575
},
{
"epoch": 6.381461675579323,
"grad_norm": 0.704537034034729,
"learning_rate": 1.7443421706273395e-05,
"loss": 0.1927,
"num_input_tokens_seen": 2222496,
"step": 3580
},
{
"epoch": 6.390374331550802,
"grad_norm": 0.6272372007369995,
"learning_rate": 1.7369318883703506e-05,
"loss": 0.1855,
"num_input_tokens_seen": 2225504,
"step": 3585
},
{
"epoch": 6.399286987522282,
"grad_norm": 0.8482812643051147,
"learning_rate": 1.7295289918185348e-05,
"loss": 0.1753,
"num_input_tokens_seen": 2229312,
"step": 3590
},
{
"epoch": 6.408199643493761,
"grad_norm": 0.5499706864356995,
"learning_rate": 1.722133552624227e-05,
"loss": 0.1939,
"num_input_tokens_seen": 2232544,
"step": 3595
},
{
"epoch": 6.4171122994652405,
"grad_norm": 0.48051542043685913,
"learning_rate": 1.714745642367583e-05,
"loss": 0.1707,
"num_input_tokens_seen": 2235808,
"step": 3600
},
{
"epoch": 6.42602495543672,
"grad_norm": 1.0482089519500732,
"learning_rate": 1.707365332555883e-05,
"loss": 0.183,
"num_input_tokens_seen": 2239040,
"step": 3605
},
{
"epoch": 6.434937611408199,
"grad_norm": 0.5002045631408691,
"learning_rate": 1.699992694622847e-05,
"loss": 0.1476,
"num_input_tokens_seen": 2241728,
"step": 3610
},
{
"epoch": 6.443850267379679,
"grad_norm": 0.5338446497917175,
"learning_rate": 1.6926277999279372e-05,
"loss": 0.1712,
"num_input_tokens_seen": 2244928,
"step": 3615
},
{
"epoch": 6.452762923351159,
"grad_norm": 0.5092248320579529,
"learning_rate": 1.6852707197556677e-05,
"loss": 0.1569,
"num_input_tokens_seen": 2247936,
"step": 3620
},
{
"epoch": 6.461675579322638,
"grad_norm": 0.4300782382488251,
"learning_rate": 1.67792152531492e-05,
"loss": 0.1658,
"num_input_tokens_seen": 2250560,
"step": 3625
},
{
"epoch": 6.470588235294118,
"grad_norm": 0.3229581415653229,
"learning_rate": 1.6705802877382464e-05,
"loss": 0.1451,
"num_input_tokens_seen": 2253248,
"step": 3630
},
{
"epoch": 6.479500891265597,
"grad_norm": 0.5048878788948059,
"learning_rate": 1.6632470780811866e-05,
"loss": 0.1803,
"num_input_tokens_seen": 2256320,
"step": 3635
},
{
"epoch": 6.4884135472370765,
"grad_norm": 0.7852115631103516,
"learning_rate": 1.6559219673215784e-05,
"loss": 0.1825,
"num_input_tokens_seen": 2259168,
"step": 3640
},
{
"epoch": 6.497326203208556,
"grad_norm": 0.3399798572063446,
"learning_rate": 1.6486050263588702e-05,
"loss": 0.1856,
"num_input_tokens_seen": 2262240,
"step": 3645
},
{
"epoch": 6.506238859180035,
"grad_norm": 0.5445297360420227,
"learning_rate": 1.641296326013436e-05,
"loss": 0.2109,
"num_input_tokens_seen": 2265600,
"step": 3650
},
{
"epoch": 6.5115864527629235,
"eval_loss": 0.1881975382566452,
"eval_runtime": 4.2584,
"eval_samples_per_second": 58.472,
"eval_steps_per_second": 14.794,
"num_input_tokens_seen": 2267520,
"step": 3653
},
{
"epoch": 6.515151515151516,
"grad_norm": 0.33709490299224854,
"learning_rate": 1.633995937025889e-05,
"loss": 0.1652,
"num_input_tokens_seen": 2268768,
"step": 3655
},
{
"epoch": 6.524064171122995,
"grad_norm": 0.4406679570674896,
"learning_rate": 1.6267039300563965e-05,
"loss": 0.2093,
"num_input_tokens_seen": 2272256,
"step": 3660
},
{
"epoch": 6.532976827094474,
"grad_norm": 0.6629878878593445,
"learning_rate": 1.619420375683996e-05,
"loss": 0.1718,
"num_input_tokens_seen": 2275968,
"step": 3665
},
{
"epoch": 6.541889483065954,
"grad_norm": 0.665874183177948,
"learning_rate": 1.6121453444059153e-05,
"loss": 0.1913,
"num_input_tokens_seen": 2278784,
"step": 3670
},
{
"epoch": 6.550802139037433,
"grad_norm": 0.5533963441848755,
"learning_rate": 1.6048789066368858e-05,
"loss": 0.1798,
"num_input_tokens_seen": 2281472,
"step": 3675
},
{
"epoch": 6.5597147950089125,
"grad_norm": 0.40691274404525757,
"learning_rate": 1.5976211327084606e-05,
"loss": 0.1737,
"num_input_tokens_seen": 2284608,
"step": 3680
},
{
"epoch": 6.568627450980392,
"grad_norm": 0.7153930068016052,
"learning_rate": 1.59037209286834e-05,
"loss": 0.1607,
"num_input_tokens_seen": 2287296,
"step": 3685
},
{
"epoch": 6.577540106951871,
"grad_norm": 0.4068545401096344,
"learning_rate": 1.583131857279685e-05,
"loss": 0.1584,
"num_input_tokens_seen": 2290176,
"step": 3690
},
{
"epoch": 6.586452762923351,
"grad_norm": 0.5864424109458923,
"learning_rate": 1.57590049602044e-05,
"loss": 0.175,
"num_input_tokens_seen": 2292960,
"step": 3695
},
{
"epoch": 6.595365418894831,
"grad_norm": 0.729058027267456,
"learning_rate": 1.5686780790826574e-05,
"loss": 0.1749,
"num_input_tokens_seen": 2296192,
"step": 3700
},
{
"epoch": 6.60427807486631,
"grad_norm": 0.7947399616241455,
"learning_rate": 1.561464676371816e-05,
"loss": 0.1895,
"num_input_tokens_seen": 2300224,
"step": 3705
},
{
"epoch": 6.61319073083779,
"grad_norm": 0.5141013860702515,
"learning_rate": 1.5542603577061464e-05,
"loss": 0.1672,
"num_input_tokens_seen": 2303040,
"step": 3710
},
{
"epoch": 6.622103386809269,
"grad_norm": 0.7291932702064514,
"learning_rate": 1.5470651928159564e-05,
"loss": 0.1447,
"num_input_tokens_seen": 2305600,
"step": 3715
},
{
"epoch": 6.6310160427807485,
"grad_norm": 0.48628827929496765,
"learning_rate": 1.539879251342954e-05,
"loss": 0.1646,
"num_input_tokens_seen": 2308736,
"step": 3720
},
{
"epoch": 6.639928698752228,
"grad_norm": 0.6047589778900146,
"learning_rate": 1.5327026028395724e-05,
"loss": 0.1547,
"num_input_tokens_seen": 2311840,
"step": 3725
},
{
"epoch": 6.648841354723707,
"grad_norm": 0.5494013428688049,
"learning_rate": 1.5255353167683017e-05,
"loss": 0.1728,
"num_input_tokens_seen": 2315808,
"step": 3730
},
{
"epoch": 6.657754010695188,
"grad_norm": 0.6367866396903992,
"learning_rate": 1.5183774625010119e-05,
"loss": 0.1566,
"num_input_tokens_seen": 2319072,
"step": 3735
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.6009120345115662,
"learning_rate": 1.5112291093182818e-05,
"loss": 0.187,
"num_input_tokens_seen": 2323104,
"step": 3740
},
{
"epoch": 6.675579322638146,
"grad_norm": 0.5307632088661194,
"learning_rate": 1.5040903264087328e-05,
"loss": 0.174,
"num_input_tokens_seen": 2325984,
"step": 3745
},
{
"epoch": 6.684491978609626,
"grad_norm": 0.4566698372364044,
"learning_rate": 1.4969611828683517e-05,
"loss": 0.1415,
"num_input_tokens_seen": 2329152,
"step": 3750
},
{
"epoch": 6.693404634581105,
"grad_norm": 0.5744293928146362,
"learning_rate": 1.4898417476998289e-05,
"loss": 0.2178,
"num_input_tokens_seen": 2332768,
"step": 3755
},
{
"epoch": 6.7023172905525845,
"grad_norm": 0.4906589984893799,
"learning_rate": 1.4827320898118884e-05,
"loss": 0.1595,
"num_input_tokens_seen": 2335680,
"step": 3760
},
{
"epoch": 6.711229946524064,
"grad_norm": 0.643140435218811,
"learning_rate": 1.4756322780186193e-05,
"loss": 0.1865,
"num_input_tokens_seen": 2338656,
"step": 3765
},
{
"epoch": 6.720142602495543,
"grad_norm": 0.6035706996917725,
"learning_rate": 1.4685423810388094e-05,
"loss": 0.1639,
"num_input_tokens_seen": 2342016,
"step": 3770
},
{
"epoch": 6.729055258467023,
"grad_norm": 0.35557159781455994,
"learning_rate": 1.4614624674952842e-05,
"loss": 0.1617,
"num_input_tokens_seen": 2345120,
"step": 3775
},
{
"epoch": 6.737967914438503,
"grad_norm": 0.589004397392273,
"learning_rate": 1.4543926059142379e-05,
"loss": 0.1699,
"num_input_tokens_seen": 2348512,
"step": 3780
},
{
"epoch": 6.746880570409982,
"grad_norm": 0.4238247573375702,
"learning_rate": 1.4473328647245726e-05,
"loss": 0.1614,
"num_input_tokens_seen": 2350688,
"step": 3785
},
{
"epoch": 6.755793226381462,
"grad_norm": 0.6005486845970154,
"learning_rate": 1.4402833122572368e-05,
"loss": 0.1801,
"num_input_tokens_seen": 2353504,
"step": 3790
},
{
"epoch": 6.764705882352941,
"grad_norm": 0.6389063000679016,
"learning_rate": 1.4332440167445613e-05,
"loss": 0.1597,
"num_input_tokens_seen": 2356672,
"step": 3795
},
{
"epoch": 6.7736185383244205,
"grad_norm": 0.4916219115257263,
"learning_rate": 1.4262150463195981e-05,
"loss": 0.1759,
"num_input_tokens_seen": 2360288,
"step": 3800
},
{
"epoch": 6.7825311942959,
"grad_norm": 0.6930426359176636,
"learning_rate": 1.4191964690154702e-05,
"loss": 0.1552,
"num_input_tokens_seen": 2362944,
"step": 3805
},
{
"epoch": 6.791443850267379,
"grad_norm": 0.5594033598899841,
"learning_rate": 1.412188352764699e-05,
"loss": 0.1858,
"num_input_tokens_seen": 2366080,
"step": 3810
},
{
"epoch": 6.80035650623886,
"grad_norm": 0.6492391228675842,
"learning_rate": 1.4051907653985552e-05,
"loss": 0.1954,
"num_input_tokens_seen": 2369632,
"step": 3815
},
{
"epoch": 6.809269162210339,
"grad_norm": 0.7449959516525269,
"learning_rate": 1.3982037746464043e-05,
"loss": 0.1986,
"num_input_tokens_seen": 2373504,
"step": 3820
},
{
"epoch": 6.818181818181818,
"grad_norm": 0.6552306413650513,
"learning_rate": 1.3912274481350433e-05,
"loss": 0.1672,
"num_input_tokens_seen": 2376480,
"step": 3825
},
{
"epoch": 6.827094474153298,
"grad_norm": 0.5298140048980713,
"learning_rate": 1.3842618533880531e-05,
"loss": 0.1679,
"num_input_tokens_seen": 2379488,
"step": 3830
},
{
"epoch": 6.836007130124777,
"grad_norm": 0.6472254395484924,
"learning_rate": 1.3773070578251424e-05,
"loss": 0.179,
"num_input_tokens_seen": 2382496,
"step": 3835
},
{
"epoch": 6.8449197860962565,
"grad_norm": 0.5164865851402283,
"learning_rate": 1.3703631287614935e-05,
"loss": 0.1802,
"num_input_tokens_seen": 2386304,
"step": 3840
},
{
"epoch": 6.853832442067736,
"grad_norm": 0.4910835325717926,
"learning_rate": 1.363430133407112e-05,
"loss": 0.1772,
"num_input_tokens_seen": 2389504,
"step": 3845
},
{
"epoch": 6.862745098039216,
"grad_norm": 0.5745038986206055,
"learning_rate": 1.3565081388661782e-05,
"loss": 0.1634,
"num_input_tokens_seen": 2392320,
"step": 3850
},
{
"epoch": 6.871657754010696,
"grad_norm": 0.5505916476249695,
"learning_rate": 1.3495972121363968e-05,
"loss": 0.1739,
"num_input_tokens_seen": 2395648,
"step": 3855
},
{
"epoch": 6.880570409982175,
"grad_norm": 0.6166315674781799,
"learning_rate": 1.3426974201083439e-05,
"loss": 0.1693,
"num_input_tokens_seen": 2398080,
"step": 3860
},
{
"epoch": 6.889483065953654,
"grad_norm": 1.0031318664550781,
"learning_rate": 1.3358088295648274e-05,
"loss": 0.175,
"num_input_tokens_seen": 2400448,
"step": 3865
},
{
"epoch": 6.898395721925134,
"grad_norm": 0.43097200989723206,
"learning_rate": 1.328931507180233e-05,
"loss": 0.1634,
"num_input_tokens_seen": 2403424,
"step": 3870
},
{
"epoch": 6.907308377896613,
"grad_norm": 0.4086379110813141,
"learning_rate": 1.3220655195198847e-05,
"loss": 0.1469,
"num_input_tokens_seen": 2405984,
"step": 3875
},
{
"epoch": 6.9162210338680925,
"grad_norm": 0.40902405977249146,
"learning_rate": 1.3152109330393985e-05,
"loss": 0.1677,
"num_input_tokens_seen": 2409472,
"step": 3880
},
{
"epoch": 6.925133689839572,
"grad_norm": 0.629298985004425,
"learning_rate": 1.3083678140840366e-05,
"loss": 0.1898,
"num_input_tokens_seen": 2412384,
"step": 3885
},
{
"epoch": 6.934046345811051,
"grad_norm": 0.4956974387168884,
"learning_rate": 1.3015362288880678e-05,
"loss": 0.1628,
"num_input_tokens_seen": 2415328,
"step": 3890
},
{
"epoch": 6.942959001782532,
"grad_norm": 0.31115609407424927,
"learning_rate": 1.2947162435741278e-05,
"loss": 0.1869,
"num_input_tokens_seen": 2418848,
"step": 3895
},
{
"epoch": 6.951871657754011,
"grad_norm": 0.5426957011222839,
"learning_rate": 1.2879079241525783e-05,
"loss": 0.1615,
"num_input_tokens_seen": 2421824,
"step": 3900
},
{
"epoch": 6.96078431372549,
"grad_norm": 0.6043846011161804,
"learning_rate": 1.2811113365208627e-05,
"loss": 0.189,
"num_input_tokens_seen": 2424224,
"step": 3905
},
{
"epoch": 6.96969696969697,
"grad_norm": 0.48290809988975525,
"learning_rate": 1.2743265464628786e-05,
"loss": 0.1779,
"num_input_tokens_seen": 2427616,
"step": 3910
},
{
"epoch": 6.978609625668449,
"grad_norm": 0.5067238211631775,
"learning_rate": 1.2675536196483306e-05,
"loss": 0.1568,
"num_input_tokens_seen": 2430368,
"step": 3915
},
{
"epoch": 6.9875222816399285,
"grad_norm": 0.43254604935646057,
"learning_rate": 1.260792621632102e-05,
"loss": 0.1876,
"num_input_tokens_seen": 2433376,
"step": 3920
},
{
"epoch": 6.996434937611408,
"grad_norm": 0.8352137804031372,
"learning_rate": 1.2540436178536186e-05,
"loss": 0.186,
"num_input_tokens_seen": 2436608,
"step": 3925
},
{
"epoch": 7.005347593582887,
"grad_norm": 0.8926360011100769,
"learning_rate": 1.2473066736362124e-05,
"loss": 0.1554,
"num_input_tokens_seen": 2439064,
"step": 3930
},
{
"epoch": 7.0124777183600715,
"eval_loss": 0.18532642722129822,
"eval_runtime": 4.2481,
"eval_samples_per_second": 58.614,
"eval_steps_per_second": 14.83,
"num_input_tokens_seen": 2441688,
"step": 3934
},
{
"epoch": 7.0142602495543676,
"grad_norm": 0.40735986828804016,
"learning_rate": 1.2405818541864905e-05,
"loss": 0.1639,
"num_input_tokens_seen": 2442328,
"step": 3935
},
{
"epoch": 7.023172905525847,
"grad_norm": 0.8125144243240356,
"learning_rate": 1.2338692245937077e-05,
"loss": 0.1518,
"num_input_tokens_seen": 2445272,
"step": 3940
},
{
"epoch": 7.032085561497326,
"grad_norm": 0.352469801902771,
"learning_rate": 1.2271688498291335e-05,
"loss": 0.1499,
"num_input_tokens_seen": 2448216,
"step": 3945
},
{
"epoch": 7.040998217468806,
"grad_norm": 0.5842772722244263,
"learning_rate": 1.2204807947454203e-05,
"loss": 0.173,
"num_input_tokens_seen": 2451704,
"step": 3950
},
{
"epoch": 7.049910873440285,
"grad_norm": 0.8481732606887817,
"learning_rate": 1.2138051240759826e-05,
"loss": 0.1489,
"num_input_tokens_seen": 2454392,
"step": 3955
},
{
"epoch": 7.0588235294117645,
"grad_norm": 0.6517293453216553,
"learning_rate": 1.2071419024343633e-05,
"loss": 0.1674,
"num_input_tokens_seen": 2457112,
"step": 3960
},
{
"epoch": 7.067736185383244,
"grad_norm": 0.5270460844039917,
"learning_rate": 1.2004911943136143e-05,
"loss": 0.1551,
"num_input_tokens_seen": 2460312,
"step": 3965
},
{
"epoch": 7.076648841354723,
"grad_norm": 0.5227533578872681,
"learning_rate": 1.1938530640856696e-05,
"loss": 0.1572,
"num_input_tokens_seen": 2463224,
"step": 3970
},
{
"epoch": 7.0855614973262036,
"grad_norm": 0.29230085015296936,
"learning_rate": 1.1872275760007198e-05,
"loss": 0.1661,
"num_input_tokens_seen": 2466008,
"step": 3975
},
{
"epoch": 7.094474153297683,
"grad_norm": 0.5345339179039001,
"learning_rate": 1.1806147941865938e-05,
"loss": 0.1784,
"num_input_tokens_seen": 2469176,
"step": 3980
},
{
"epoch": 7.103386809269162,
"grad_norm": 0.4222520589828491,
"learning_rate": 1.1740147826481385e-05,
"loss": 0.1405,
"num_input_tokens_seen": 2472408,
"step": 3985
},
{
"epoch": 7.112299465240642,
"grad_norm": 0.5282605290412903,
"learning_rate": 1.1674276052665973e-05,
"loss": 0.1902,
"num_input_tokens_seen": 2475608,
"step": 3990
},
{
"epoch": 7.121212121212121,
"grad_norm": 0.4751206636428833,
"learning_rate": 1.1608533257989901e-05,
"loss": 0.1489,
"num_input_tokens_seen": 2478680,
"step": 3995
},
{
"epoch": 7.1301247771836005,
"grad_norm": 0.3280528783798218,
"learning_rate": 1.1542920078775018e-05,
"loss": 0.1666,
"num_input_tokens_seen": 2481592,
"step": 4000
},
{
"epoch": 7.13903743315508,
"grad_norm": 0.9430297017097473,
"learning_rate": 1.14774371500886e-05,
"loss": 0.2094,
"num_input_tokens_seen": 2485176,
"step": 4005
},
{
"epoch": 7.14795008912656,
"grad_norm": 0.27522483468055725,
"learning_rate": 1.141208510573725e-05,
"loss": 0.1596,
"num_input_tokens_seen": 2488152,
"step": 4010
},
{
"epoch": 7.1568627450980395,
"grad_norm": 0.5842289328575134,
"learning_rate": 1.1346864578260758e-05,
"loss": 0.1904,
"num_input_tokens_seen": 2491320,
"step": 4015
},
{
"epoch": 7.165775401069519,
"grad_norm": 0.38907817006111145,
"learning_rate": 1.1281776198925939e-05,
"loss": 0.1459,
"num_input_tokens_seen": 2493944,
"step": 4020
},
{
"epoch": 7.174688057040998,
"grad_norm": 0.31314197182655334,
"learning_rate": 1.121682059772056e-05,
"loss": 0.1407,
"num_input_tokens_seen": 2496664,
"step": 4025
},
{
"epoch": 7.183600713012478,
"grad_norm": 0.5018792748451233,
"learning_rate": 1.1151998403347244e-05,
"loss": 0.2596,
"num_input_tokens_seen": 2500216,
"step": 4030
},
{
"epoch": 7.192513368983957,
"grad_norm": 0.4724593162536621,
"learning_rate": 1.1087310243217386e-05,
"loss": 0.1538,
"num_input_tokens_seen": 2503544,
"step": 4035
},
{
"epoch": 7.2014260249554365,
"grad_norm": 0.647865891456604,
"learning_rate": 1.1022756743445028e-05,
"loss": 0.1738,
"num_input_tokens_seen": 2507160,
"step": 4040
},
{
"epoch": 7.210338680926916,
"grad_norm": 0.48006606101989746,
"learning_rate": 1.0958338528840893e-05,
"loss": 0.1834,
"num_input_tokens_seen": 2510232,
"step": 4045
},
{
"epoch": 7.219251336898395,
"grad_norm": 0.4462122917175293,
"learning_rate": 1.0894056222906226e-05,
"loss": 0.1348,
"num_input_tokens_seen": 2513144,
"step": 4050
},
{
"epoch": 7.2281639928698755,
"grad_norm": 0.48262760043144226,
"learning_rate": 1.0829910447826868e-05,
"loss": 0.1547,
"num_input_tokens_seen": 2516504,
"step": 4055
},
{
"epoch": 7.237076648841355,
"grad_norm": 0.5589674711227417,
"learning_rate": 1.0765901824467167e-05,
"loss": 0.1723,
"num_input_tokens_seen": 2518648,
"step": 4060
},
{
"epoch": 7.245989304812834,
"grad_norm": 0.4827505946159363,
"learning_rate": 1.0702030972363963e-05,
"loss": 0.1625,
"num_input_tokens_seen": 2521880,
"step": 4065
},
{
"epoch": 7.254901960784314,
"grad_norm": 0.5129882097244263,
"learning_rate": 1.063829850972065e-05,
"loss": 0.1871,
"num_input_tokens_seen": 2525336,
"step": 4070
},
{
"epoch": 7.263814616755793,
"grad_norm": 0.5441546440124512,
"learning_rate": 1.0574705053401127e-05,
"loss": 0.1591,
"num_input_tokens_seen": 2528184,
"step": 4075
},
{
"epoch": 7.2727272727272725,
"grad_norm": 0.42811569571495056,
"learning_rate": 1.0511251218923868e-05,
"loss": 0.1592,
"num_input_tokens_seen": 2530904,
"step": 4080
},
{
"epoch": 7.281639928698752,
"grad_norm": 0.43192997574806213,
"learning_rate": 1.0447937620455964e-05,
"loss": 0.178,
"num_input_tokens_seen": 2533656,
"step": 4085
},
{
"epoch": 7.290552584670232,
"grad_norm": 0.7238538265228271,
"learning_rate": 1.0384764870807149e-05,
"loss": 0.1817,
"num_input_tokens_seen": 2535928,
"step": 4090
},
{
"epoch": 7.2994652406417115,
"grad_norm": 0.4946947991847992,
"learning_rate": 1.0321733581423884e-05,
"loss": 0.1685,
"num_input_tokens_seen": 2539352,
"step": 4095
},
{
"epoch": 7.308377896613191,
"grad_norm": 0.5055748224258423,
"learning_rate": 1.025884436238346e-05,
"loss": 0.1722,
"num_input_tokens_seen": 2542456,
"step": 4100
},
{
"epoch": 7.31729055258467,
"grad_norm": 0.9246964454650879,
"learning_rate": 1.0196097822388075e-05,
"loss": 0.1772,
"num_input_tokens_seen": 2545816,
"step": 4105
},
{
"epoch": 7.32620320855615,
"grad_norm": 0.8303518891334534,
"learning_rate": 1.013349456875892e-05,
"loss": 0.1608,
"num_input_tokens_seen": 2548824,
"step": 4110
},
{
"epoch": 7.335115864527629,
"grad_norm": 0.5074154734611511,
"learning_rate": 1.0071035207430352e-05,
"loss": 0.1655,
"num_input_tokens_seen": 2552152,
"step": 4115
},
{
"epoch": 7.3440285204991085,
"grad_norm": 0.4153769910335541,
"learning_rate": 1.0008720342943966e-05,
"loss": 0.1643,
"num_input_tokens_seen": 2555768,
"step": 4120
},
{
"epoch": 7.352941176470588,
"grad_norm": 0.3799455165863037,
"learning_rate": 9.94655057844281e-06,
"loss": 0.1602,
"num_input_tokens_seen": 2558328,
"step": 4125
},
{
"epoch": 7.361853832442068,
"grad_norm": 0.6474289298057556,
"learning_rate": 9.884526515665508e-06,
"loss": 0.17,
"num_input_tokens_seen": 2561368,
"step": 4130
},
{
"epoch": 7.3707664884135475,
"grad_norm": 0.7523593902587891,
"learning_rate": 9.822648754940431e-06,
"loss": 0.156,
"num_input_tokens_seen": 2564056,
"step": 4135
},
{
"epoch": 7.379679144385027,
"grad_norm": 0.5380316972732544,
"learning_rate": 9.760917895179894e-06,
"loss": 0.1746,
"num_input_tokens_seen": 2566744,
"step": 4140
},
{
"epoch": 7.388591800356506,
"grad_norm": 1.0373018980026245,
"learning_rate": 9.699334533874386e-06,
"loss": 0.1959,
"num_input_tokens_seen": 2569656,
"step": 4145
},
{
"epoch": 7.397504456327986,
"grad_norm": 0.6027229428291321,
"learning_rate": 9.637899267086758e-06,
"loss": 0.1752,
"num_input_tokens_seen": 2573112,
"step": 4150
},
{
"epoch": 7.406417112299465,
"grad_norm": 0.5722499489784241,
"learning_rate": 9.576612689446444e-06,
"loss": 0.1712,
"num_input_tokens_seen": 2576952,
"step": 4155
},
{
"epoch": 7.4153297682709445,
"grad_norm": 0.5797430276870728,
"learning_rate": 9.515475394143742e-06,
"loss": 0.1445,
"num_input_tokens_seen": 2579896,
"step": 4160
},
{
"epoch": 7.424242424242424,
"grad_norm": 0.4454365670681,
"learning_rate": 9.45448797292403e-06,
"loss": 0.2141,
"num_input_tokens_seen": 2583544,
"step": 4165
},
{
"epoch": 7.433155080213904,
"grad_norm": 0.3823348879814148,
"learning_rate": 9.393651016082083e-06,
"loss": 0.154,
"num_input_tokens_seen": 2586200,
"step": 4170
},
{
"epoch": 7.4420677361853835,
"grad_norm": 0.44054359197616577,
"learning_rate": 9.332965112456337e-06,
"loss": 0.1803,
"num_input_tokens_seen": 2589496,
"step": 4175
},
{
"epoch": 7.450980392156863,
"grad_norm": 0.4444521963596344,
"learning_rate": 9.272430849423174e-06,
"loss": 0.1813,
"num_input_tokens_seen": 2591928,
"step": 4180
},
{
"epoch": 7.459893048128342,
"grad_norm": 0.6432741284370422,
"learning_rate": 9.21204881289125e-06,
"loss": 0.1793,
"num_input_tokens_seen": 2595064,
"step": 4185
},
{
"epoch": 7.468805704099822,
"grad_norm": 0.5586231350898743,
"learning_rate": 9.151819587295845e-06,
"loss": 0.162,
"num_input_tokens_seen": 2597944,
"step": 4190
},
{
"epoch": 7.477718360071301,
"grad_norm": 0.4838408827781677,
"learning_rate": 9.09174375559319e-06,
"loss": 0.1969,
"num_input_tokens_seen": 2601656,
"step": 4195
},
{
"epoch": 7.4866310160427805,
"grad_norm": 0.4085644483566284,
"learning_rate": 9.031821899254796e-06,
"loss": 0.1497,
"num_input_tokens_seen": 2604472,
"step": 4200
},
{
"epoch": 7.49554367201426,
"grad_norm": 0.3888384699821472,
"learning_rate": 8.972054598261892e-06,
"loss": 0.1631,
"num_input_tokens_seen": 2607992,
"step": 4205
},
{
"epoch": 7.50445632798574,
"grad_norm": 0.7054049372673035,
"learning_rate": 8.912442431099724e-06,
"loss": 0.1672,
"num_input_tokens_seen": 2611800,
"step": 4210
},
{
"epoch": 7.5133689839572195,
"grad_norm": 0.5162657499313354,
"learning_rate": 8.852985974752045e-06,
"loss": 0.1665,
"num_input_tokens_seen": 2614936,
"step": 4215
},
{
"epoch": 7.5133689839572195,
"eval_loss": 0.18536153435707092,
"eval_runtime": 4.2538,
"eval_samples_per_second": 58.536,
"eval_steps_per_second": 14.81,
"num_input_tokens_seen": 2614936,
"step": 4215
},
{
"epoch": 7.522281639928699,
"grad_norm": 0.357683002948761,
"learning_rate": 8.793685804695482e-06,
"loss": 0.2229,
"num_input_tokens_seen": 2618744,
"step": 4220
},
{
"epoch": 7.531194295900178,
"grad_norm": 0.4619935154914856,
"learning_rate": 8.734542494893955e-06,
"loss": 0.1613,
"num_input_tokens_seen": 2621496,
"step": 4225
},
{
"epoch": 7.540106951871658,
"grad_norm": 0.5771064758300781,
"learning_rate": 8.675556617793143e-06,
"loss": 0.1607,
"num_input_tokens_seen": 2624568,
"step": 4230
},
{
"epoch": 7.549019607843137,
"grad_norm": 0.5340394377708435,
"learning_rate": 8.616728744314956e-06,
"loss": 0.1969,
"num_input_tokens_seen": 2627832,
"step": 4235
},
{
"epoch": 7.5579322638146165,
"grad_norm": 0.5918867588043213,
"learning_rate": 8.558059443851998e-06,
"loss": 0.1702,
"num_input_tokens_seen": 2631160,
"step": 4240
},
{
"epoch": 7.566844919786096,
"grad_norm": 0.4290253520011902,
"learning_rate": 8.499549284262017e-06,
"loss": 0.158,
"num_input_tokens_seen": 2634488,
"step": 4245
},
{
"epoch": 7.575757575757576,
"grad_norm": 0.6583709120750427,
"learning_rate": 8.441198831862485e-06,
"loss": 0.1691,
"num_input_tokens_seen": 2637240,
"step": 4250
},
{
"epoch": 7.5846702317290555,
"grad_norm": 0.6762195825576782,
"learning_rate": 8.383008651425035e-06,
"loss": 0.1565,
"num_input_tokens_seen": 2639992,
"step": 4255
},
{
"epoch": 7.593582887700535,
"grad_norm": 0.29171764850616455,
"learning_rate": 8.32497930617006e-06,
"loss": 0.1893,
"num_input_tokens_seen": 2643832,
"step": 4260
},
{
"epoch": 7.602495543672014,
"grad_norm": 0.4991152286529541,
"learning_rate": 8.267111357761243e-06,
"loss": 0.1343,
"num_input_tokens_seen": 2646712,
"step": 4265
},
{
"epoch": 7.611408199643494,
"grad_norm": 0.6517699360847473,
"learning_rate": 8.209405366300088e-06,
"loss": 0.1455,
"num_input_tokens_seen": 2650072,
"step": 4270
},
{
"epoch": 7.620320855614973,
"grad_norm": 1.1518526077270508,
"learning_rate": 8.151861890320528e-06,
"loss": 0.1928,
"num_input_tokens_seen": 2653656,
"step": 4275
},
{
"epoch": 7.6292335115864525,
"grad_norm": 0.7069615721702576,
"learning_rate": 8.094481486783534e-06,
"loss": 0.2059,
"num_input_tokens_seen": 2657464,
"step": 4280
},
{
"epoch": 7.638146167557933,
"grad_norm": 0.3675689697265625,
"learning_rate": 8.0372647110717e-06,
"loss": 0.1825,
"num_input_tokens_seen": 2660568,
"step": 4285
},
{
"epoch": 7.647058823529412,
"grad_norm": 0.5671415328979492,
"learning_rate": 7.98021211698385e-06,
"loss": 0.1507,
"num_input_tokens_seen": 2663448,
"step": 4290
},
{
"epoch": 7.6559714795008915,
"grad_norm": 0.5237590074539185,
"learning_rate": 7.923324256729738e-06,
"loss": 0.1794,
"num_input_tokens_seen": 2666136,
"step": 4295
},
{
"epoch": 7.664884135472371,
"grad_norm": 0.6967838406562805,
"learning_rate": 7.866601680924633e-06,
"loss": 0.183,
"num_input_tokens_seen": 2669048,
"step": 4300
},
{
"epoch": 7.67379679144385,
"grad_norm": 0.48244914412498474,
"learning_rate": 7.810044938584038e-06,
"loss": 0.1663,
"num_input_tokens_seen": 2671800,
"step": 4305
},
{
"epoch": 7.68270944741533,
"grad_norm": 0.5121620893478394,
"learning_rate": 7.75365457711837e-06,
"loss": 0.1757,
"num_input_tokens_seen": 2675448,
"step": 4310
},
{
"epoch": 7.691622103386809,
"grad_norm": 0.5723910331726074,
"learning_rate": 7.697431142327632e-06,
"loss": 0.1654,
"num_input_tokens_seen": 2678392,
"step": 4315
},
{
"epoch": 7.7005347593582885,
"grad_norm": 0.4338489770889282,
"learning_rate": 7.641375178396151e-06,
"loss": 0.1645,
"num_input_tokens_seen": 2681112,
"step": 4320
},
{
"epoch": 7.709447415329768,
"grad_norm": 0.5260465145111084,
"learning_rate": 7.585487227887328e-06,
"loss": 0.1636,
"num_input_tokens_seen": 2684856,
"step": 4325
},
{
"epoch": 7.718360071301248,
"grad_norm": 0.37905287742614746,
"learning_rate": 7.529767831738366e-06,
"loss": 0.1682,
"num_input_tokens_seen": 2687576,
"step": 4330
},
{
"epoch": 7.7272727272727275,
"grad_norm": 0.5463063716888428,
"learning_rate": 7.474217529255018e-06,
"loss": 0.1472,
"num_input_tokens_seen": 2690328,
"step": 4335
},
{
"epoch": 7.736185383244207,
"grad_norm": 0.640016496181488,
"learning_rate": 7.4188368581064124e-06,
"loss": 0.17,
"num_input_tokens_seen": 2694168,
"step": 4340
},
{
"epoch": 7.745098039215686,
"grad_norm": 0.42445164918899536,
"learning_rate": 7.3636263543197945e-06,
"loss": 0.1617,
"num_input_tokens_seen": 2697208,
"step": 4345
},
{
"epoch": 7.754010695187166,
"grad_norm": 1.0092363357543945,
"learning_rate": 7.30858655227539e-06,
"loss": 0.182,
"num_input_tokens_seen": 2700376,
"step": 4350
},
{
"epoch": 7.762923351158645,
"grad_norm": 0.2814575433731079,
"learning_rate": 7.253717984701208e-06,
"loss": 0.1667,
"num_input_tokens_seen": 2703256,
"step": 4355
},
{
"epoch": 7.7718360071301245,
"grad_norm": 0.5186646580696106,
"learning_rate": 7.199021182667873e-06,
"loss": 0.1594,
"num_input_tokens_seen": 2705752,
"step": 4360
},
{
"epoch": 7.780748663101605,
"grad_norm": 0.4522174000740051,
"learning_rate": 7.1444966755834954e-06,
"loss": 0.1373,
"num_input_tokens_seen": 2708888,
"step": 4365
},
{
"epoch": 7.789661319073084,
"grad_norm": 0.4952068328857422,
"learning_rate": 7.0901449911885685e-06,
"loss": 0.159,
"num_input_tokens_seen": 2711576,
"step": 4370
},
{
"epoch": 7.7985739750445635,
"grad_norm": 0.47718411684036255,
"learning_rate": 7.035966655550838e-06,
"loss": 0.1856,
"num_input_tokens_seen": 2715000,
"step": 4375
},
{
"epoch": 7.807486631016043,
"grad_norm": 0.5538311004638672,
"learning_rate": 6.98196219306019e-06,
"loss": 0.1708,
"num_input_tokens_seen": 2717880,
"step": 4380
},
{
"epoch": 7.816399286987522,
"grad_norm": 0.40867936611175537,
"learning_rate": 6.928132126423636e-06,
"loss": 0.1424,
"num_input_tokens_seen": 2721240,
"step": 4385
},
{
"epoch": 7.825311942959002,
"grad_norm": 0.579886257648468,
"learning_rate": 6.8744769766601854e-06,
"loss": 0.1844,
"num_input_tokens_seen": 2724696,
"step": 4390
},
{
"epoch": 7.834224598930481,
"grad_norm": 0.4526924788951874,
"learning_rate": 6.820997263095849e-06,
"loss": 0.1754,
"num_input_tokens_seen": 2727960,
"step": 4395
},
{
"epoch": 7.8431372549019605,
"grad_norm": 0.5530297756195068,
"learning_rate": 6.767693503358608e-06,
"loss": 0.1816,
"num_input_tokens_seen": 2731000,
"step": 4400
},
{
"epoch": 7.85204991087344,
"grad_norm": 0.3621399700641632,
"learning_rate": 6.7145662133733715e-06,
"loss": 0.1751,
"num_input_tokens_seen": 2734264,
"step": 4405
},
{
"epoch": 7.86096256684492,
"grad_norm": 0.5544110536575317,
"learning_rate": 6.6616159073570135e-06,
"loss": 0.1635,
"num_input_tokens_seen": 2736664,
"step": 4410
},
{
"epoch": 7.8698752228163995,
"grad_norm": 0.504298985004425,
"learning_rate": 6.6088430978133914e-06,
"loss": 0.1685,
"num_input_tokens_seen": 2739672,
"step": 4415
},
{
"epoch": 7.878787878787879,
"grad_norm": 0.45025068521499634,
"learning_rate": 6.556248295528389e-06,
"loss": 0.1576,
"num_input_tokens_seen": 2742552,
"step": 4420
},
{
"epoch": 7.887700534759358,
"grad_norm": 0.9994719624519348,
"learning_rate": 6.5038320095649395e-06,
"loss": 0.1938,
"num_input_tokens_seen": 2745880,
"step": 4425
},
{
"epoch": 7.896613190730838,
"grad_norm": 0.5288066267967224,
"learning_rate": 6.451594747258155e-06,
"loss": 0.1818,
"num_input_tokens_seen": 2749912,
"step": 4430
},
{
"epoch": 7.905525846702317,
"grad_norm": 0.5786968469619751,
"learning_rate": 6.399537014210355e-06,
"loss": 0.1757,
"num_input_tokens_seen": 2753368,
"step": 4435
},
{
"epoch": 7.9144385026737964,
"grad_norm": 0.3910267651081085,
"learning_rate": 6.3476593142862275e-06,
"loss": 0.1794,
"num_input_tokens_seen": 2756568,
"step": 4440
},
{
"epoch": 7.923351158645277,
"grad_norm": 1.0030827522277832,
"learning_rate": 6.29596214960792e-06,
"loss": 0.1752,
"num_input_tokens_seen": 2759704,
"step": 4445
},
{
"epoch": 7.932263814616756,
"grad_norm": 0.41212958097457886,
"learning_rate": 6.244446020550182e-06,
"loss": 0.1709,
"num_input_tokens_seen": 2762584,
"step": 4450
},
{
"epoch": 7.9411764705882355,
"grad_norm": 0.5541166067123413,
"learning_rate": 6.193111425735515e-06,
"loss": 0.1763,
"num_input_tokens_seen": 2765752,
"step": 4455
},
{
"epoch": 7.950089126559715,
"grad_norm": 0.6690767407417297,
"learning_rate": 6.141958862029384e-06,
"loss": 0.1624,
"num_input_tokens_seen": 2768696,
"step": 4460
},
{
"epoch": 7.959001782531194,
"grad_norm": 0.5791964530944824,
"learning_rate": 6.090988824535374e-06,
"loss": 0.1844,
"num_input_tokens_seen": 2772120,
"step": 4465
},
{
"epoch": 7.967914438502674,
"grad_norm": 0.40184465050697327,
"learning_rate": 6.040201806590387e-06,
"loss": 0.1918,
"num_input_tokens_seen": 2775384,
"step": 4470
},
{
"epoch": 7.976827094474153,
"grad_norm": 0.4650464951992035,
"learning_rate": 5.989598299759919e-06,
"loss": 0.1778,
"num_input_tokens_seen": 2778520,
"step": 4475
},
{
"epoch": 7.9857397504456324,
"grad_norm": 0.5422367453575134,
"learning_rate": 5.939178793833233e-06,
"loss": 0.1734,
"num_input_tokens_seen": 2780888,
"step": 4480
},
{
"epoch": 7.994652406417112,
"grad_norm": 0.5420627593994141,
"learning_rate": 5.888943776818684e-06,
"loss": 0.1781,
"num_input_tokens_seen": 2784312,
"step": 4485
},
{
"epoch": 8.003565062388592,
"grad_norm": 0.465055912733078,
"learning_rate": 5.83889373493896e-06,
"loss": 0.1861,
"num_input_tokens_seen": 2787056,
"step": 4490
},
{
"epoch": 8.01247771836007,
"grad_norm": 0.8877488970756531,
"learning_rate": 5.789029152626374e-06,
"loss": 0.1686,
"num_input_tokens_seen": 2790288,
"step": 4495
},
{
"epoch": 8.014260249554367,
"eval_loss": 0.18306031823158264,
"eval_runtime": 4.2492,
"eval_samples_per_second": 58.599,
"eval_steps_per_second": 14.826,
"num_input_tokens_seen": 2790832,
"step": 4496
},
{
"epoch": 8.02139037433155,
"grad_norm": 0.3791468143463135,
"learning_rate": 5.73935051251818e-06,
"loss": 0.1626,
"num_input_tokens_seen": 2793136,
"step": 4500
},
{
"epoch": 8.030303030303031,
"grad_norm": 0.6450890302658081,
"learning_rate": 5.689858295451914e-06,
"loss": 0.1684,
"num_input_tokens_seen": 2796464,
"step": 4505
},
{
"epoch": 8.03921568627451,
"grad_norm": 0.36496949195861816,
"learning_rate": 5.640552980460742e-06,
"loss": 0.1524,
"num_input_tokens_seen": 2799344,
"step": 4510
},
{
"epoch": 8.04812834224599,
"grad_norm": 0.5503035187721252,
"learning_rate": 5.591435044768783e-06,
"loss": 0.1529,
"num_input_tokens_seen": 2801648,
"step": 4515
},
{
"epoch": 8.057040998217468,
"grad_norm": 0.4298340678215027,
"learning_rate": 5.542504963786552e-06,
"loss": 0.1769,
"num_input_tokens_seen": 2804976,
"step": 4520
},
{
"epoch": 8.065953654188949,
"grad_norm": 0.44245445728302,
"learning_rate": 5.493763211106293e-06,
"loss": 0.1543,
"num_input_tokens_seen": 2807472,
"step": 4525
},
{
"epoch": 8.074866310160427,
"grad_norm": 0.27881208062171936,
"learning_rate": 5.4452102584974545e-06,
"loss": 0.1436,
"num_input_tokens_seen": 2810768,
"step": 4530
},
{
"epoch": 8.083778966131907,
"grad_norm": 0.9025391340255737,
"learning_rate": 5.396846575902095e-06,
"loss": 0.1822,
"num_input_tokens_seen": 2814480,
"step": 4535
},
{
"epoch": 8.092691622103386,
"grad_norm": 0.33398008346557617,
"learning_rate": 5.348672631430318e-06,
"loss": 0.1551,
"num_input_tokens_seen": 2817968,
"step": 4540
},
{
"epoch": 8.101604278074866,
"grad_norm": 0.45554453134536743,
"learning_rate": 5.300688891355765e-06,
"loss": 0.1626,
"num_input_tokens_seen": 2820784,
"step": 4545
},
{
"epoch": 8.110516934046347,
"grad_norm": 0.38997194170951843,
"learning_rate": 5.252895820111112e-06,
"loss": 0.1377,
"num_input_tokens_seen": 2823824,
"step": 4550
},
{
"epoch": 8.119429590017825,
"grad_norm": 0.5823608040809631,
"learning_rate": 5.205293880283552e-06,
"loss": 0.1602,
"num_input_tokens_seen": 2826832,
"step": 4555
},
{
"epoch": 8.128342245989305,
"grad_norm": 0.6442610025405884,
"learning_rate": 5.157883532610305e-06,
"loss": 0.189,
"num_input_tokens_seen": 2830256,
"step": 4560
},
{
"epoch": 8.137254901960784,
"grad_norm": 0.6161116361618042,
"learning_rate": 5.110665235974219e-06,
"loss": 0.181,
"num_input_tokens_seen": 2832848,
"step": 4565
},
{
"epoch": 8.146167557932264,
"grad_norm": 0.5139124989509583,
"learning_rate": 5.06363944739924e-06,
"loss": 0.1593,
"num_input_tokens_seen": 2835664,
"step": 4570
},
{
"epoch": 8.155080213903743,
"grad_norm": 0.4244152903556824,
"learning_rate": 5.0168066220460715e-06,
"loss": 0.1533,
"num_input_tokens_seen": 2838864,
"step": 4575
},
{
"epoch": 8.163992869875223,
"grad_norm": 0.8236415386199951,
"learning_rate": 4.97016721320773e-06,
"loss": 0.1638,
"num_input_tokens_seen": 2841840,
"step": 4580
},
{
"epoch": 8.172905525846703,
"grad_norm": 0.6396406292915344,
"learning_rate": 4.9237216723051485e-06,
"loss": 0.1693,
"num_input_tokens_seen": 2844976,
"step": 4585
},
{
"epoch": 8.181818181818182,
"grad_norm": 0.41378054022789,
"learning_rate": 4.877470448882815e-06,
"loss": 0.1585,
"num_input_tokens_seen": 2847856,
"step": 4590
},
{
"epoch": 8.190730837789662,
"grad_norm": 0.5032555460929871,
"learning_rate": 4.831413990604447e-06,
"loss": 0.1465,
"num_input_tokens_seen": 2850192,
"step": 4595
},
{
"epoch": 8.19964349376114,
"grad_norm": 0.4285055994987488,
"learning_rate": 4.7855527432486336e-06,
"loss": 0.1517,
"num_input_tokens_seen": 2853008,
"step": 4600
},
{
"epoch": 8.20855614973262,
"grad_norm": 0.5328398942947388,
"learning_rate": 4.739887150704508e-06,
"loss": 0.2001,
"num_input_tokens_seen": 2856464,
"step": 4605
},
{
"epoch": 8.2174688057041,
"grad_norm": 0.45751845836639404,
"learning_rate": 4.694417654967492e-06,
"loss": 0.1507,
"num_input_tokens_seen": 2858864,
"step": 4610
},
{
"epoch": 8.22638146167558,
"grad_norm": 0.44036829471588135,
"learning_rate": 4.649144696134972e-06,
"loss": 0.1711,
"num_input_tokens_seen": 2861488,
"step": 4615
},
{
"epoch": 8.235294117647058,
"grad_norm": 0.4446769654750824,
"learning_rate": 4.6040687124020794e-06,
"loss": 0.168,
"num_input_tokens_seen": 2865136,
"step": 4620
},
{
"epoch": 8.244206773618538,
"grad_norm": 0.6855089068412781,
"learning_rate": 4.5591901400574285e-06,
"loss": 0.1646,
"num_input_tokens_seen": 2867984,
"step": 4625
},
{
"epoch": 8.253119429590019,
"grad_norm": 0.6599955558776855,
"learning_rate": 4.514509413478888e-06,
"loss": 0.1795,
"num_input_tokens_seen": 2871088,
"step": 4630
},
{
"epoch": 8.262032085561497,
"grad_norm": 0.42294609546661377,
"learning_rate": 4.470026965129384e-06,
"loss": 0.1433,
"num_input_tokens_seen": 2874352,
"step": 4635
},
{
"epoch": 8.270944741532977,
"grad_norm": 0.4342804551124573,
"learning_rate": 4.425743225552731e-06,
"loss": 0.1762,
"num_input_tokens_seen": 2877840,
"step": 4640
},
{
"epoch": 8.279857397504456,
"grad_norm": 0.5680054426193237,
"learning_rate": 4.381658623369445e-06,
"loss": 0.1532,
"num_input_tokens_seen": 2881456,
"step": 4645
},
{
"epoch": 8.288770053475936,
"grad_norm": 0.5137624740600586,
"learning_rate": 4.337773585272581e-06,
"loss": 0.1694,
"num_input_tokens_seen": 2884400,
"step": 4650
},
{
"epoch": 8.297682709447415,
"grad_norm": 0.3794878125190735,
"learning_rate": 4.294088536023652e-06,
"loss": 0.1475,
"num_input_tokens_seen": 2887536,
"step": 4655
},
{
"epoch": 8.306595365418895,
"grad_norm": 0.6075329184532166,
"learning_rate": 4.250603898448455e-06,
"loss": 0.1811,
"num_input_tokens_seen": 2890352,
"step": 4660
},
{
"epoch": 8.315508021390375,
"grad_norm": 0.45767733454704285,
"learning_rate": 4.2073200934330315e-06,
"loss": 0.1871,
"num_input_tokens_seen": 2893520,
"step": 4665
},
{
"epoch": 8.324420677361854,
"grad_norm": 0.46819356083869934,
"learning_rate": 4.164237539919577e-06,
"loss": 0.1842,
"num_input_tokens_seen": 2896048,
"step": 4670
},
{
"epoch": 8.333333333333334,
"grad_norm": 0.5235320329666138,
"learning_rate": 4.121356654902364e-06,
"loss": 0.164,
"num_input_tokens_seen": 2899472,
"step": 4675
},
{
"epoch": 8.342245989304812,
"grad_norm": 0.8180021047592163,
"learning_rate": 4.078677853423724e-06,
"loss": 0.1573,
"num_input_tokens_seen": 2902832,
"step": 4680
},
{
"epoch": 8.351158645276293,
"grad_norm": 0.9956904649734497,
"learning_rate": 4.036201548570049e-06,
"loss": 0.2367,
"num_input_tokens_seen": 2906576,
"step": 4685
},
{
"epoch": 8.360071301247771,
"grad_norm": 0.6165153980255127,
"learning_rate": 3.993928151467766e-06,
"loss": 0.1987,
"num_input_tokens_seen": 2909840,
"step": 4690
},
{
"epoch": 8.368983957219251,
"grad_norm": 0.48898622393608093,
"learning_rate": 3.951858071279352e-06,
"loss": 0.1454,
"num_input_tokens_seen": 2912752,
"step": 4695
},
{
"epoch": 8.37789661319073,
"grad_norm": 0.48024001717567444,
"learning_rate": 3.909991715199412e-06,
"loss": 0.1633,
"num_input_tokens_seen": 2915024,
"step": 4700
},
{
"epoch": 8.38680926916221,
"grad_norm": 0.4968958795070648,
"learning_rate": 3.8683294884506945e-06,
"loss": 0.1655,
"num_input_tokens_seen": 2918480,
"step": 4705
},
{
"epoch": 8.39572192513369,
"grad_norm": 0.5491753220558167,
"learning_rate": 3.826871794280193e-06,
"loss": 0.1729,
"num_input_tokens_seen": 2921712,
"step": 4710
},
{
"epoch": 8.404634581105169,
"grad_norm": 0.5808373093605042,
"learning_rate": 3.7856190339552513e-06,
"loss": 0.1851,
"num_input_tokens_seen": 2925040,
"step": 4715
},
{
"epoch": 8.41354723707665,
"grad_norm": 0.9629413485527039,
"learning_rate": 3.7445716067596503e-06,
"loss": 0.1578,
"num_input_tokens_seen": 2928112,
"step": 4720
},
{
"epoch": 8.422459893048128,
"grad_norm": 0.8614413142204285,
"learning_rate": 3.7037299099897586e-06,
"loss": 0.1865,
"num_input_tokens_seen": 2932368,
"step": 4725
},
{
"epoch": 8.431372549019608,
"grad_norm": 0.5639718770980835,
"learning_rate": 3.663094338950704e-06,
"loss": 0.1738,
"num_input_tokens_seen": 2935088,
"step": 4730
},
{
"epoch": 8.440285204991087,
"grad_norm": 0.5123082995414734,
"learning_rate": 3.6226652869525285e-06,
"loss": 0.1471,
"num_input_tokens_seen": 2937840,
"step": 4735
},
{
"epoch": 8.449197860962567,
"grad_norm": 0.5894414186477661,
"learning_rate": 3.5824431453063662e-06,
"loss": 0.1638,
"num_input_tokens_seen": 2941008,
"step": 4740
},
{
"epoch": 8.458110516934047,
"grad_norm": 0.34330514073371887,
"learning_rate": 3.5424283033207024e-06,
"loss": 0.1672,
"num_input_tokens_seen": 2944464,
"step": 4745
},
{
"epoch": 8.467023172905526,
"grad_norm": 0.37955033779144287,
"learning_rate": 3.5026211482975497e-06,
"loss": 0.1584,
"num_input_tokens_seen": 2947376,
"step": 4750
},
{
"epoch": 8.475935828877006,
"grad_norm": 0.9495477080345154,
"learning_rate": 3.463022065528748e-06,
"loss": 0.1767,
"num_input_tokens_seen": 2950480,
"step": 4755
},
{
"epoch": 8.484848484848484,
"grad_norm": 0.3263673782348633,
"learning_rate": 3.4236314382922103e-06,
"loss": 0.1429,
"num_input_tokens_seen": 2953392,
"step": 4760
},
{
"epoch": 8.493761140819965,
"grad_norm": 0.5537719130516052,
"learning_rate": 3.3844496478482064e-06,
"loss": 0.1588,
"num_input_tokens_seen": 2956272,
"step": 4765
},
{
"epoch": 8.502673796791443,
"grad_norm": 0.30169588327407837,
"learning_rate": 3.345477073435685e-06,
"loss": 0.167,
"num_input_tokens_seen": 2959056,
"step": 4770
},
{
"epoch": 8.511586452762923,
"grad_norm": 0.5430099964141846,
"learning_rate": 3.3067140922686174e-06,
"loss": 0.1655,
"num_input_tokens_seen": 2962480,
"step": 4775
},
{
"epoch": 8.515151515151516,
"eval_loss": 0.1827203780412674,
"eval_runtime": 4.2534,
"eval_samples_per_second": 58.541,
"eval_steps_per_second": 14.812,
"num_input_tokens_seen": 2963888,
"step": 4777
},
{
"epoch": 8.520499108734402,
"grad_norm": 0.44720202684402466,
"learning_rate": 3.268161079532317e-06,
"loss": 0.1494,
"num_input_tokens_seen": 2965360,
"step": 4780
},
{
"epoch": 8.529411764705882,
"grad_norm": 0.3062620162963867,
"learning_rate": 3.22981840837982e-06,
"loss": 0.1712,
"num_input_tokens_seen": 2968464,
"step": 4785
},
{
"epoch": 8.538324420677363,
"grad_norm": 0.9861251711845398,
"learning_rate": 3.1916864499282856e-06,
"loss": 0.1779,
"num_input_tokens_seen": 2972144,
"step": 4790
},
{
"epoch": 8.547237076648841,
"grad_norm": 0.43644168972969055,
"learning_rate": 3.1537655732553768e-06,
"loss": 0.1509,
"num_input_tokens_seen": 2974384,
"step": 4795
},
{
"epoch": 8.556149732620321,
"grad_norm": 0.5110581517219543,
"learning_rate": 3.1160561453957183e-06,
"loss": 0.1578,
"num_input_tokens_seen": 2977104,
"step": 4800
},
{
"epoch": 8.5650623885918,
"grad_norm": 0.5604438781738281,
"learning_rate": 3.078558531337336e-06,
"loss": 0.1694,
"num_input_tokens_seen": 2980464,
"step": 4805
},
{
"epoch": 8.57397504456328,
"grad_norm": 0.5687141418457031,
"learning_rate": 3.0412730940181015e-06,
"loss": 0.1643,
"num_input_tokens_seen": 2983248,
"step": 4810
},
{
"epoch": 8.582887700534759,
"grad_norm": 0.9281808137893677,
"learning_rate": 3.0042001943222376e-06,
"loss": 0.165,
"num_input_tokens_seen": 2986256,
"step": 4815
},
{
"epoch": 8.591800356506239,
"grad_norm": 0.6919686794281006,
"learning_rate": 2.967340191076834e-06,
"loss": 0.1902,
"num_input_tokens_seen": 2990256,
"step": 4820
},
{
"epoch": 8.60071301247772,
"grad_norm": 0.7080613374710083,
"learning_rate": 2.930693441048371e-06,
"loss": 0.149,
"num_input_tokens_seen": 2992592,
"step": 4825
},
{
"epoch": 8.609625668449198,
"grad_norm": 0.5117068886756897,
"learning_rate": 2.8942602989392386e-06,
"loss": 0.174,
"num_input_tokens_seen": 2995888,
"step": 4830
},
{
"epoch": 8.618538324420678,
"grad_norm": 0.37796565890312195,
"learning_rate": 2.858041117384341e-06,
"loss": 0.148,
"num_input_tokens_seen": 2999280,
"step": 4835
},
{
"epoch": 8.627450980392156,
"grad_norm": 0.6607238054275513,
"learning_rate": 2.8220362469476624e-06,
"loss": 0.1541,
"num_input_tokens_seen": 3002864,
"step": 4840
},
{
"epoch": 8.636363636363637,
"grad_norm": 0.4288221001625061,
"learning_rate": 2.7862460361188614e-06,
"loss": 0.1521,
"num_input_tokens_seen": 3004944,
"step": 4845
},
{
"epoch": 8.645276292335115,
"grad_norm": 0.49076348543167114,
"learning_rate": 2.750670831309957e-06,
"loss": 0.1682,
"num_input_tokens_seen": 3008464,
"step": 4850
},
{
"epoch": 8.654188948306595,
"grad_norm": 0.615407407283783,
"learning_rate": 2.7153109768518925e-06,
"loss": 0.171,
"num_input_tokens_seen": 3012240,
"step": 4855
},
{
"epoch": 8.663101604278076,
"grad_norm": 0.5121405124664307,
"learning_rate": 2.680166814991256e-06,
"loss": 0.1606,
"num_input_tokens_seen": 3015056,
"step": 4860
},
{
"epoch": 8.672014260249554,
"grad_norm": 0.7262160778045654,
"learning_rate": 2.645238685886961e-06,
"loss": 0.2009,
"num_input_tokens_seen": 3018160,
"step": 4865
},
{
"epoch": 8.680926916221035,
"grad_norm": 0.5012710690498352,
"learning_rate": 2.6105269276069573e-06,
"loss": 0.1641,
"num_input_tokens_seen": 3021392,
"step": 4870
},
{
"epoch": 8.689839572192513,
"grad_norm": 0.681621789932251,
"learning_rate": 2.5760318761249263e-06,
"loss": 0.1751,
"num_input_tokens_seen": 3024240,
"step": 4875
},
{
"epoch": 8.698752228163993,
"grad_norm": 0.4795394539833069,
"learning_rate": 2.541753865317076e-06,
"loss": 0.171,
"num_input_tokens_seen": 3026800,
"step": 4880
},
{
"epoch": 8.707664884135472,
"grad_norm": 0.4269944429397583,
"learning_rate": 2.507693226958871e-06,
"loss": 0.1673,
"num_input_tokens_seen": 3029968,
"step": 4885
},
{
"epoch": 8.716577540106952,
"grad_norm": 0.6113168597221375,
"learning_rate": 2.473850290721838e-06,
"loss": 0.1568,
"num_input_tokens_seen": 3032656,
"step": 4890
},
{
"epoch": 8.72549019607843,
"grad_norm": 0.5832796692848206,
"learning_rate": 2.4402253841703914e-06,
"loss": 0.1645,
"num_input_tokens_seen": 3035376,
"step": 4895
},
{
"epoch": 8.73440285204991,
"grad_norm": 0.4533407986164093,
"learning_rate": 2.4068188327586257e-06,
"loss": 0.1798,
"num_input_tokens_seen": 3038512,
"step": 4900
},
{
"epoch": 8.743315508021391,
"grad_norm": 0.6923168897628784,
"learning_rate": 2.373630959827186e-06,
"loss": 0.161,
"num_input_tokens_seen": 3041744,
"step": 4905
},
{
"epoch": 8.75222816399287,
"grad_norm": 0.5411429405212402,
"learning_rate": 2.3406620866001485e-06,
"loss": 0.1696,
"num_input_tokens_seen": 3045232,
"step": 4910
},
{
"epoch": 8.76114081996435,
"grad_norm": 0.40592697262763977,
"learning_rate": 2.3079125321818996e-06,
"loss": 0.1636,
"num_input_tokens_seen": 3047728,
"step": 4915
},
{
"epoch": 8.770053475935828,
"grad_norm": 0.7785168886184692,
"learning_rate": 2.275382613554031e-06,
"loss": 0.1534,
"num_input_tokens_seen": 3050864,
"step": 4920
},
{
"epoch": 8.778966131907309,
"grad_norm": 0.46840912103652954,
"learning_rate": 2.2430726455723113e-06,
"loss": 0.1651,
"num_input_tokens_seen": 3053680,
"step": 4925
},
{
"epoch": 8.787878787878787,
"grad_norm": 0.5858107209205627,
"learning_rate": 2.210982940963596e-06,
"loss": 0.1632,
"num_input_tokens_seen": 3057136,
"step": 4930
},
{
"epoch": 8.796791443850267,
"grad_norm": 0.8381409049034119,
"learning_rate": 2.1791138103228275e-06,
"loss": 0.1736,
"num_input_tokens_seen": 3060144,
"step": 4935
},
{
"epoch": 8.805704099821746,
"grad_norm": 0.4155525863170624,
"learning_rate": 2.1474655621100347e-06,
"loss": 0.1759,
"num_input_tokens_seen": 3063024,
"step": 4940
},
{
"epoch": 8.814616755793226,
"grad_norm": 0.7829816937446594,
"learning_rate": 2.116038502647319e-06,
"loss": 0.1736,
"num_input_tokens_seen": 3066320,
"step": 4945
},
{
"epoch": 8.823529411764707,
"grad_norm": 0.44637227058410645,
"learning_rate": 2.084832936115902e-06,
"loss": 0.1513,
"num_input_tokens_seen": 3069296,
"step": 4950
},
{
"epoch": 8.832442067736185,
"grad_norm": 0.49461662769317627,
"learning_rate": 2.0538491645531982e-06,
"loss": 0.1745,
"num_input_tokens_seen": 3071888,
"step": 4955
},
{
"epoch": 8.841354723707665,
"grad_norm": 0.5589842200279236,
"learning_rate": 2.0230874878498648e-06,
"loss": 0.2835,
"num_input_tokens_seen": 3075984,
"step": 4960
},
{
"epoch": 8.850267379679144,
"grad_norm": 0.544204592704773,
"learning_rate": 1.9925482037469188e-06,
"loss": 0.1654,
"num_input_tokens_seen": 3079152,
"step": 4965
},
{
"epoch": 8.859180035650624,
"grad_norm": 0.5478450059890747,
"learning_rate": 1.9622316078328566e-06,
"loss": 0.1682,
"num_input_tokens_seen": 3082544,
"step": 4970
},
{
"epoch": 8.868092691622103,
"grad_norm": 0.5605227947235107,
"learning_rate": 1.9321379935407697e-06,
"loss": 0.145,
"num_input_tokens_seen": 3085680,
"step": 4975
},
{
"epoch": 8.877005347593583,
"grad_norm": 0.5030500292778015,
"learning_rate": 1.9022676521455117e-06,
"loss": 0.1795,
"num_input_tokens_seen": 3089392,
"step": 4980
},
{
"epoch": 8.885918003565063,
"grad_norm": 0.6063732504844666,
"learning_rate": 1.8726208727609219e-06,
"loss": 0.1604,
"num_input_tokens_seen": 3092656,
"step": 4985
},
{
"epoch": 8.894830659536542,
"grad_norm": 0.6032387018203735,
"learning_rate": 1.8431979423369604e-06,
"loss": 0.1646,
"num_input_tokens_seen": 3095600,
"step": 4990
},
{
"epoch": 8.903743315508022,
"grad_norm": 0.4930381774902344,
"learning_rate": 1.8139991456569694e-06,
"loss": 0.1622,
"num_input_tokens_seen": 3098320,
"step": 4995
},
{
"epoch": 8.9126559714795,
"grad_norm": 0.8425898551940918,
"learning_rate": 1.7850247653349223e-06,
"loss": 0.1554,
"num_input_tokens_seen": 3101520,
"step": 5000
},
{
"epoch": 8.92156862745098,
"grad_norm": 0.6207576394081116,
"learning_rate": 1.7562750818126556e-06,
"loss": 0.1733,
"num_input_tokens_seen": 3104816,
"step": 5005
},
{
"epoch": 8.93048128342246,
"grad_norm": 0.5085470676422119,
"learning_rate": 1.727750373357187e-06,
"loss": 0.1686,
"num_input_tokens_seen": 3108176,
"step": 5010
},
{
"epoch": 8.93939393939394,
"grad_norm": 0.4193607568740845,
"learning_rate": 1.699450916058018e-06,
"loss": 0.1473,
"num_input_tokens_seen": 3111248,
"step": 5015
},
{
"epoch": 8.94830659536542,
"grad_norm": 0.3501569330692291,
"learning_rate": 1.6713769838244325e-06,
"loss": 0.154,
"num_input_tokens_seen": 3114224,
"step": 5020
},
{
"epoch": 8.957219251336898,
"grad_norm": 0.40926966071128845,
"learning_rate": 1.6435288483828748e-06,
"loss": 0.1529,
"num_input_tokens_seen": 3117232,
"step": 5025
},
{
"epoch": 8.966131907308379,
"grad_norm": 0.3181830644607544,
"learning_rate": 1.615906779274326e-06,
"loss": 0.2044,
"num_input_tokens_seen": 3120240,
"step": 5030
},
{
"epoch": 8.975044563279857,
"grad_norm": 0.9511982798576355,
"learning_rate": 1.588511043851662e-06,
"loss": 0.2427,
"num_input_tokens_seen": 3123792,
"step": 5035
},
{
"epoch": 8.983957219251337,
"grad_norm": 0.3971862494945526,
"learning_rate": 1.5613419072770864e-06,
"loss": 0.1803,
"num_input_tokens_seen": 3127184,
"step": 5040
},
{
"epoch": 8.992869875222816,
"grad_norm": 0.527430534362793,
"learning_rate": 1.534399632519573e-06,
"loss": 0.1621,
"num_input_tokens_seen": 3130480,
"step": 5045
},
{
"epoch": 9.001782531194296,
"grad_norm": 0.4454513490200043,
"learning_rate": 1.5076844803522922e-06,
"loss": 0.1472,
"num_input_tokens_seen": 3132712,
"step": 5050
},
{
"epoch": 9.010695187165775,
"grad_norm": 0.8424109816551208,
"learning_rate": 1.4811967093501189e-06,
"loss": 0.1594,
"num_input_tokens_seen": 3135400,
"step": 5055
},
{
"epoch": 9.016042780748663,
"eval_loss": 0.18196314573287964,
"eval_runtime": 4.2599,
"eval_samples_per_second": 58.452,
"eval_steps_per_second": 14.789,
"num_input_tokens_seen": 3137352,
"step": 5058
},
{
"epoch": 9.019607843137255,
"grad_norm": 0.8189364075660706,
"learning_rate": 1.4549365758871142e-06,
"loss": 0.1552,
"num_input_tokens_seen": 3138248,
"step": 5060
},
{
"epoch": 9.028520499108735,
"grad_norm": 0.40512701869010925,
"learning_rate": 1.4289043341340375e-06,
"loss": 0.1724,
"num_input_tokens_seen": 3141480,
"step": 5065
},
{
"epoch": 9.037433155080214,
"grad_norm": 0.5652516484260559,
"learning_rate": 1.4031002360558849e-06,
"loss": 0.1694,
"num_input_tokens_seen": 3144904,
"step": 5070
},
{
"epoch": 9.046345811051694,
"grad_norm": 0.5365282893180847,
"learning_rate": 1.377524531409491e-06,
"loss": 0.1725,
"num_input_tokens_seen": 3148968,
"step": 5075
},
{
"epoch": 9.055258467023172,
"grad_norm": 0.3831281065940857,
"learning_rate": 1.3521774677410476e-06,
"loss": 0.1522,
"num_input_tokens_seen": 3151912,
"step": 5080
},
{
"epoch": 9.064171122994653,
"grad_norm": 0.4094650149345398,
"learning_rate": 1.3270592903837503e-06,
"loss": 0.1649,
"num_input_tokens_seen": 3155080,
"step": 5085
},
{
"epoch": 9.073083778966131,
"grad_norm": 0.7728195786476135,
"learning_rate": 1.3021702424554221e-06,
"loss": 0.1512,
"num_input_tokens_seen": 3157768,
"step": 5090
},
{
"epoch": 9.081996434937611,
"grad_norm": 0.6765234470367432,
"learning_rate": 1.2775105648561352e-06,
"loss": 0.1841,
"num_input_tokens_seen": 3161224,
"step": 5095
},
{
"epoch": 9.090909090909092,
"grad_norm": 0.5181841254234314,
"learning_rate": 1.2530804962659098e-06,
"loss": 0.1716,
"num_input_tokens_seen": 3163944,
"step": 5100
},
{
"epoch": 9.09982174688057,
"grad_norm": 0.8874284625053406,
"learning_rate": 1.2288802731423883e-06,
"loss": 0.176,
"num_input_tokens_seen": 3166728,
"step": 5105
},
{
"epoch": 9.10873440285205,
"grad_norm": 0.6627284288406372,
"learning_rate": 1.2049101297185422e-06,
"loss": 0.1661,
"num_input_tokens_seen": 3170120,
"step": 5110
},
{
"epoch": 9.117647058823529,
"grad_norm": 0.7040612101554871,
"learning_rate": 1.1811702980004058e-06,
"loss": 0.1486,
"num_input_tokens_seen": 3173000,
"step": 5115
},
{
"epoch": 9.12655971479501,
"grad_norm": 0.6169217228889465,
"learning_rate": 1.1576610077648513e-06,
"loss": 0.1868,
"num_input_tokens_seen": 3176520,
"step": 5120
},
{
"epoch": 9.135472370766488,
"grad_norm": 0.464032381772995,
"learning_rate": 1.134382486557342e-06,
"loss": 0.1539,
"num_input_tokens_seen": 3179496,
"step": 5125
},
{
"epoch": 9.144385026737968,
"grad_norm": 0.679073691368103,
"learning_rate": 1.1113349596897331e-06,
"loss": 0.1429,
"num_input_tokens_seen": 3182248,
"step": 5130
},
{
"epoch": 9.153297682709447,
"grad_norm": 0.32752713561058044,
"learning_rate": 1.0885186502381017e-06,
"loss": 0.154,
"num_input_tokens_seen": 3184840,
"step": 5135
},
{
"epoch": 9.162210338680927,
"grad_norm": 0.6518117189407349,
"learning_rate": 1.0659337790405704e-06,
"loss": 0.1727,
"num_input_tokens_seen": 3187720,
"step": 5140
},
{
"epoch": 9.171122994652407,
"grad_norm": 0.6068860292434692,
"learning_rate": 1.0435805646951958e-06,
"loss": 0.1512,
"num_input_tokens_seen": 3190536,
"step": 5145
},
{
"epoch": 9.180035650623886,
"grad_norm": 0.42867806553840637,
"learning_rate": 1.0214592235578274e-06,
"loss": 0.162,
"num_input_tokens_seen": 3193608,
"step": 5150
},
{
"epoch": 9.188948306595366,
"grad_norm": 0.49051374197006226,
"learning_rate": 9.995699697400247e-07,
"loss": 0.181,
"num_input_tokens_seen": 3196936,
"step": 5155
},
{
"epoch": 9.197860962566844,
"grad_norm": 0.5725313425064087,
"learning_rate": 9.77913015106982e-07,
"loss": 0.1708,
"num_input_tokens_seen": 3200040,
"step": 5160
},
{
"epoch": 9.206773618538325,
"grad_norm": 0.9723972082138062,
"learning_rate": 9.564885692754793e-07,
"loss": 0.1814,
"num_input_tokens_seen": 3203240,
"step": 5165
},
{
"epoch": 9.215686274509803,
"grad_norm": 0.506613552570343,
"learning_rate": 9.352968396118628e-07,
"loss": 0.1726,
"num_input_tokens_seen": 3206376,
"step": 5170
},
{
"epoch": 9.224598930481283,
"grad_norm": 0.6921798586845398,
"learning_rate": 9.143380312300137e-07,
"loss": 0.1543,
"num_input_tokens_seen": 3209480,
"step": 5175
},
{
"epoch": 9.233511586452764,
"grad_norm": 0.5370962023735046,
"learning_rate": 8.936123469893892e-07,
"loss": 0.2448,
"num_input_tokens_seen": 3213448,
"step": 5180
},
{
"epoch": 9.242424242424242,
"grad_norm": 0.6006255745887756,
"learning_rate": 8.731199874930374e-07,
"loss": 0.1604,
"num_input_tokens_seen": 3216776,
"step": 5185
},
{
"epoch": 9.251336898395722,
"grad_norm": 0.5161803960800171,
"learning_rate": 8.528611510856766e-07,
"loss": 0.1543,
"num_input_tokens_seen": 3219752,
"step": 5190
},
{
"epoch": 9.260249554367201,
"grad_norm": 0.5216704607009888,
"learning_rate": 8.328360338517583e-07,
"loss": 0.1659,
"num_input_tokens_seen": 3223048,
"step": 5195
},
{
"epoch": 9.269162210338681,
"grad_norm": 0.43477028608322144,
"learning_rate": 8.130448296135768e-07,
"loss": 0.1847,
"num_input_tokens_seen": 3226984,
"step": 5200
},
{
"epoch": 9.27807486631016,
"grad_norm": 0.5066149234771729,
"learning_rate": 7.934877299293875e-07,
"loss": 0.1806,
"num_input_tokens_seen": 3230088,
"step": 5205
},
{
"epoch": 9.28698752228164,
"grad_norm": 0.9408987760543823,
"learning_rate": 7.741649240915666e-07,
"loss": 0.1692,
"num_input_tokens_seen": 3232840,
"step": 5210
},
{
"epoch": 9.29590017825312,
"grad_norm": 0.41510528326034546,
"learning_rate": 7.550765991247654e-07,
"loss": 0.144,
"num_input_tokens_seen": 3235944,
"step": 5215
},
{
"epoch": 9.304812834224599,
"grad_norm": 0.5157932043075562,
"learning_rate": 7.362229397840981e-07,
"loss": 0.1744,
"num_input_tokens_seen": 3238728,
"step": 5220
},
{
"epoch": 9.313725490196079,
"grad_norm": 0.44517961144447327,
"learning_rate": 7.17604128553373e-07,
"loss": 0.1478,
"num_input_tokens_seen": 3241256,
"step": 5225
},
{
"epoch": 9.322638146167558,
"grad_norm": 0.6294628977775574,
"learning_rate": 6.992203456432977e-07,
"loss": 0.1887,
"num_input_tokens_seen": 3244680,
"step": 5230
},
{
"epoch": 9.331550802139038,
"grad_norm": 0.3271355628967285,
"learning_rate": 6.810717689897633e-07,
"loss": 0.1474,
"num_input_tokens_seen": 3247560,
"step": 5235
},
{
"epoch": 9.340463458110516,
"grad_norm": 0.5900879502296448,
"learning_rate": 6.631585742521068e-07,
"loss": 0.1654,
"num_input_tokens_seen": 3251176,
"step": 5240
},
{
"epoch": 9.349376114081997,
"grad_norm": 1.2029948234558105,
"learning_rate": 6.454809348114044e-07,
"loss": 0.1985,
"num_input_tokens_seen": 3254152,
"step": 5245
},
{
"epoch": 9.358288770053475,
"grad_norm": 0.7293168902397156,
"learning_rate": 6.280390217688114e-07,
"loss": 0.1636,
"num_input_tokens_seen": 3256744,
"step": 5250
},
{
"epoch": 9.367201426024955,
"grad_norm": 0.28766605257987976,
"learning_rate": 6.108330039438892e-07,
"loss": 0.1729,
"num_input_tokens_seen": 3259400,
"step": 5255
},
{
"epoch": 9.376114081996436,
"grad_norm": 0.7399141788482666,
"learning_rate": 5.938630478729917e-07,
"loss": 0.1547,
"num_input_tokens_seen": 3262728,
"step": 5260
},
{
"epoch": 9.385026737967914,
"grad_norm": 0.45791682600975037,
"learning_rate": 5.771293178076286e-07,
"loss": 0.1693,
"num_input_tokens_seen": 3266376,
"step": 5265
},
{
"epoch": 9.393939393939394,
"grad_norm": 0.6668148636817932,
"learning_rate": 5.606319757128914e-07,
"loss": 0.169,
"num_input_tokens_seen": 3268808,
"step": 5270
},
{
"epoch": 9.402852049910873,
"grad_norm": 0.580091655254364,
"learning_rate": 5.443711812658792e-07,
"loss": 0.174,
"num_input_tokens_seen": 3272008,
"step": 5275
},
{
"epoch": 9.411764705882353,
"grad_norm": 0.47462576627731323,
"learning_rate": 5.283470918541616e-07,
"loss": 0.1395,
"num_input_tokens_seen": 3274920,
"step": 5280
},
{
"epoch": 9.420677361853832,
"grad_norm": 0.4406573474407196,
"learning_rate": 5.125598625742523e-07,
"loss": 0.1781,
"num_input_tokens_seen": 3278376,
"step": 5285
},
{
"epoch": 9.429590017825312,
"grad_norm": 0.4939647614955902,
"learning_rate": 4.970096462300927e-07,
"loss": 0.1745,
"num_input_tokens_seen": 3281704,
"step": 5290
},
{
"epoch": 9.43850267379679,
"grad_norm": 0.3747076988220215,
"learning_rate": 4.816965933315987e-07,
"loss": 0.1692,
"num_input_tokens_seen": 3285256,
"step": 5295
},
{
"epoch": 9.44741532976827,
"grad_norm": 0.5448613166809082,
"learning_rate": 4.6662085209318305e-07,
"loss": 0.1651,
"num_input_tokens_seen": 3288616,
"step": 5300
},
{
"epoch": 9.456327985739751,
"grad_norm": 0.5583840608596802,
"learning_rate": 4.517825684323324e-07,
"loss": 0.1549,
"num_input_tokens_seen": 3291752,
"step": 5305
},
{
"epoch": 9.46524064171123,
"grad_norm": 0.4584488272666931,
"learning_rate": 4.3718188596819086e-07,
"loss": 0.1519,
"num_input_tokens_seen": 3294344,
"step": 5310
},
{
"epoch": 9.47415329768271,
"grad_norm": 0.6175810694694519,
"learning_rate": 4.228189460201676e-07,
"loss": 0.1706,
"num_input_tokens_seen": 3297512,
"step": 5315
},
{
"epoch": 9.483065953654188,
"grad_norm": 0.5118115544319153,
"learning_rate": 4.086938876065732e-07,
"loss": 0.1538,
"num_input_tokens_seen": 3300296,
"step": 5320
},
{
"epoch": 9.491978609625669,
"grad_norm": 0.5376412868499756,
"learning_rate": 3.948068474432715e-07,
"loss": 0.274,
"num_input_tokens_seen": 3304360,
"step": 5325
},
{
"epoch": 9.500891265597147,
"grad_norm": 0.5221200585365295,
"learning_rate": 3.8115795994236313e-07,
"loss": 0.1658,
"num_input_tokens_seen": 3307304,
"step": 5330
},
{
"epoch": 9.509803921568627,
"grad_norm": 0.4227612316608429,
"learning_rate": 3.6774735721087085e-07,
"loss": 0.1618,
"num_input_tokens_seen": 3310536,
"step": 5335
},
{
"epoch": 9.516934046345812,
"eval_loss": 0.183439701795578,
"eval_runtime": 4.2535,
"eval_samples_per_second": 58.539,
"eval_steps_per_second": 14.811,
"num_input_tokens_seen": 3312648,
"step": 5339
},
{
"epoch": 9.518716577540108,
"grad_norm": 0.601445734500885,
"learning_rate": 3.5457516904947587e-07,
"loss": 0.1771,
"num_input_tokens_seen": 3313672,
"step": 5340
},
{
"epoch": 9.527629233511586,
"grad_norm": 0.5191211700439453,
"learning_rate": 3.416415229512443e-07,
"loss": 0.1688,
"num_input_tokens_seen": 3317224,
"step": 5345
},
{
"epoch": 9.536541889483066,
"grad_norm": 0.6869432330131531,
"learning_rate": 3.2894654410041417e-07,
"loss": 0.1661,
"num_input_tokens_seen": 3319848,
"step": 5350
},
{
"epoch": 9.545454545454545,
"grad_norm": 0.905884325504303,
"learning_rate": 3.1649035537117123e-07,
"loss": 0.1521,
"num_input_tokens_seen": 3322664,
"step": 5355
},
{
"epoch": 9.554367201426025,
"grad_norm": 0.5753766894340515,
"learning_rate": 3.042730773264557e-07,
"loss": 0.1512,
"num_input_tokens_seen": 3325928,
"step": 5360
},
{
"epoch": 9.563279857397504,
"grad_norm": 0.5148957967758179,
"learning_rate": 2.9229482821680197e-07,
"loss": 0.1496,
"num_input_tokens_seen": 3328680,
"step": 5365
},
{
"epoch": 9.572192513368984,
"grad_norm": 0.47426876425743103,
"learning_rate": 2.8055572397919784e-07,
"loss": 0.152,
"num_input_tokens_seen": 3331976,
"step": 5370
},
{
"epoch": 9.581105169340464,
"grad_norm": 0.5953306555747986,
"learning_rate": 2.690558782359576e-07,
"loss": 0.1609,
"num_input_tokens_seen": 3334888,
"step": 5375
},
{
"epoch": 9.590017825311943,
"grad_norm": 0.49842748045921326,
"learning_rate": 2.5779540229361745e-07,
"loss": 0.1822,
"num_input_tokens_seen": 3337960,
"step": 5380
},
{
"epoch": 9.598930481283423,
"grad_norm": 0.6325761079788208,
"learning_rate": 2.467744051418641e-07,
"loss": 0.155,
"num_input_tokens_seen": 3340936,
"step": 5385
},
{
"epoch": 9.607843137254902,
"grad_norm": 0.8439469933509827,
"learning_rate": 2.3599299345248292e-07,
"loss": 0.1561,
"num_input_tokens_seen": 3343784,
"step": 5390
},
{
"epoch": 9.616755793226382,
"grad_norm": 0.7139554619789124,
"learning_rate": 2.2545127157831413e-07,
"loss": 0.1669,
"num_input_tokens_seen": 3347016,
"step": 5395
},
{
"epoch": 9.62566844919786,
"grad_norm": 0.3963601291179657,
"learning_rate": 2.1514934155226208e-07,
"loss": 0.1412,
"num_input_tokens_seen": 3349800,
"step": 5400
},
{
"epoch": 9.63458110516934,
"grad_norm": 0.5459052324295044,
"learning_rate": 2.0508730308627933e-07,
"loss": 0.1527,
"num_input_tokens_seen": 3353640,
"step": 5405
},
{
"epoch": 9.643493761140821,
"grad_norm": 0.7221339344978333,
"learning_rate": 1.9526525357043136e-07,
"loss": 0.1708,
"num_input_tokens_seen": 3356904,
"step": 5410
},
{
"epoch": 9.6524064171123,
"grad_norm": 0.39834100008010864,
"learning_rate": 1.8568328807193337e-07,
"loss": 0.1623,
"num_input_tokens_seen": 3360232,
"step": 5415
},
{
"epoch": 9.66131907308378,
"grad_norm": 0.3296028673648834,
"learning_rate": 1.7634149933423993e-07,
"loss": 0.1723,
"num_input_tokens_seen": 3362824,
"step": 5420
},
{
"epoch": 9.670231729055258,
"grad_norm": 0.6187313199043274,
"learning_rate": 1.6723997777614574e-07,
"loss": 0.2013,
"num_input_tokens_seen": 3366152,
"step": 5425
},
{
"epoch": 9.679144385026738,
"grad_norm": 0.4088561236858368,
"learning_rate": 1.5837881149090294e-07,
"loss": 0.1668,
"num_input_tokens_seen": 3369192,
"step": 5430
},
{
"epoch": 9.688057040998217,
"grad_norm": 0.6721343994140625,
"learning_rate": 1.497580862453829e-07,
"loss": 0.1767,
"num_input_tokens_seen": 3372776,
"step": 5435
},
{
"epoch": 9.696969696969697,
"grad_norm": 0.6333170533180237,
"learning_rate": 1.4137788547923246e-07,
"loss": 0.1829,
"num_input_tokens_seen": 3376232,
"step": 5440
},
{
"epoch": 9.705882352941176,
"grad_norm": 0.6064999103546143,
"learning_rate": 1.3323829030407465e-07,
"loss": 0.1916,
"num_input_tokens_seen": 3379912,
"step": 5445
},
{
"epoch": 9.714795008912656,
"grad_norm": 0.5454294085502625,
"learning_rate": 1.2533937950272023e-07,
"loss": 0.1639,
"num_input_tokens_seen": 3382824,
"step": 5450
},
{
"epoch": 9.723707664884136,
"grad_norm": 0.4902726411819458,
"learning_rate": 1.176812295283991e-07,
"loss": 0.1577,
"num_input_tokens_seen": 3385640,
"step": 5455
},
{
"epoch": 9.732620320855615,
"grad_norm": 0.4689973294734955,
"learning_rate": 1.1026391450404128e-07,
"loss": 0.1652,
"num_input_tokens_seen": 3389672,
"step": 5460
},
{
"epoch": 9.741532976827095,
"grad_norm": 0.6127117276191711,
"learning_rate": 1.0308750622153307e-07,
"loss": 0.1815,
"num_input_tokens_seen": 3393096,
"step": 5465
},
{
"epoch": 9.750445632798574,
"grad_norm": 0.40860888361930847,
"learning_rate": 9.615207414103434e-08,
"loss": 0.149,
"num_input_tokens_seen": 3396136,
"step": 5470
},
{
"epoch": 9.759358288770054,
"grad_norm": 0.5143342018127441,
"learning_rate": 8.945768539031785e-08,
"loss": 0.1785,
"num_input_tokens_seen": 3399304,
"step": 5475
},
{
"epoch": 9.768270944741532,
"grad_norm": 0.599516749382019,
"learning_rate": 8.30044047640921e-08,
"loss": 0.1617,
"num_input_tokens_seen": 3402216,
"step": 5480
},
{
"epoch": 9.777183600713013,
"grad_norm": 0.37185174226760864,
"learning_rate": 7.679229472340176e-08,
"loss": 0.1554,
"num_input_tokens_seen": 3405096,
"step": 5485
},
{
"epoch": 9.786096256684491,
"grad_norm": 0.4413319528102875,
"learning_rate": 7.082141539500597e-08,
"loss": 0.1639,
"num_input_tokens_seen": 3407912,
"step": 5490
},
{
"epoch": 9.795008912655971,
"grad_norm": 0.7090705633163452,
"learning_rate": 6.509182457080376e-08,
"loss": 0.1679,
"num_input_tokens_seen": 3410856,
"step": 5495
},
{
"epoch": 9.803921568627452,
"grad_norm": 0.5437349677085876,
"learning_rate": 5.9603577707267875e-08,
"loss": 0.1559,
"num_input_tokens_seen": 3413928,
"step": 5500
},
{
"epoch": 9.81283422459893,
"grad_norm": 0.5729760527610779,
"learning_rate": 5.435672792491742e-08,
"loss": 0.1623,
"num_input_tokens_seen": 3417416,
"step": 5505
},
{
"epoch": 9.82174688057041,
"grad_norm": 0.38444051146507263,
"learning_rate": 4.935132600780157e-08,
"loss": 0.1769,
"num_input_tokens_seen": 3420136,
"step": 5510
},
{
"epoch": 9.830659536541889,
"grad_norm": 0.4345572292804718,
"learning_rate": 4.4587420402997235e-08,
"loss": 0.1537,
"num_input_tokens_seen": 3423272,
"step": 5515
},
{
"epoch": 9.83957219251337,
"grad_norm": 0.44134852290153503,
"learning_rate": 4.006505722015386e-08,
"loss": 0.1499,
"num_input_tokens_seen": 3426472,
"step": 5520
},
{
"epoch": 9.848484848484848,
"grad_norm": 0.6951932907104492,
"learning_rate": 3.578428023103819e-08,
"loss": 0.1725,
"num_input_tokens_seen": 3429992,
"step": 5525
},
{
"epoch": 9.857397504456328,
"grad_norm": 0.47553181648254395,
"learning_rate": 3.1745130869123566e-08,
"loss": 0.1554,
"num_input_tokens_seen": 3432456,
"step": 5530
},
{
"epoch": 9.866310160427808,
"grad_norm": 0.5962952375411987,
"learning_rate": 2.794764822916518e-08,
"loss": 0.1618,
"num_input_tokens_seen": 3434888,
"step": 5535
},
{
"epoch": 9.875222816399287,
"grad_norm": 0.4873346984386444,
"learning_rate": 2.4391869066844874e-08,
"loss": 0.1773,
"num_input_tokens_seen": 3437832,
"step": 5540
},
{
"epoch": 9.884135472370767,
"grad_norm": 0.65750652551651,
"learning_rate": 2.1077827798404726e-08,
"loss": 0.1697,
"num_input_tokens_seen": 3440872,
"step": 5545
},
{
"epoch": 9.893048128342246,
"grad_norm": 0.4054161012172699,
"learning_rate": 1.8005556500313993e-08,
"loss": 0.1495,
"num_input_tokens_seen": 3443784,
"step": 5550
},
{
"epoch": 9.901960784313726,
"grad_norm": 0.605219841003418,
"learning_rate": 1.51750849089638e-08,
"loss": 0.1643,
"num_input_tokens_seen": 3447592,
"step": 5555
},
{
"epoch": 9.910873440285204,
"grad_norm": 0.3572712540626526,
"learning_rate": 1.2586440420372936e-08,
"loss": 0.1714,
"num_input_tokens_seen": 3451048,
"step": 5560
},
{
"epoch": 9.919786096256685,
"grad_norm": 0.5080024600028992,
"learning_rate": 1.023964808992417e-08,
"loss": 0.1497,
"num_input_tokens_seen": 3453928,
"step": 5565
},
{
"epoch": 9.928698752228165,
"grad_norm": 0.5494665503501892,
"learning_rate": 8.134730632125554e-09,
"loss": 0.1739,
"num_input_tokens_seen": 3456968,
"step": 5570
},
{
"epoch": 9.937611408199643,
"grad_norm": 0.5445519089698792,
"learning_rate": 6.271708420385603e-09,
"loss": 0.1683,
"num_input_tokens_seen": 3460616,
"step": 5575
},
{
"epoch": 9.946524064171124,
"grad_norm": 0.4502975046634674,
"learning_rate": 4.650599486827334e-09,
"loss": 0.1625,
"num_input_tokens_seen": 3463592,
"step": 5580
},
{
"epoch": 9.955436720142602,
"grad_norm": 0.713843047618866,
"learning_rate": 3.2714195220912013e-09,
"loss": 0.1604,
"num_input_tokens_seen": 3466888,
"step": 5585
},
{
"epoch": 9.964349376114082,
"grad_norm": 0.457069456577301,
"learning_rate": 2.134181875204644e-09,
"loss": 0.1602,
"num_input_tokens_seen": 3470408,
"step": 5590
},
{
"epoch": 9.973262032085561,
"grad_norm": 0.5743651390075684,
"learning_rate": 1.2388975534460834e-09,
"loss": 0.1584,
"num_input_tokens_seen": 3473608,
"step": 5595
},
{
"epoch": 9.982174688057041,
"grad_norm": 0.41813942790031433,
"learning_rate": 5.855752222366783e-10,
"loss": 0.163,
"num_input_tokens_seen": 3476616,
"step": 5600
},
{
"epoch": 9.99108734402852,
"grad_norm": 0.4171542227268219,
"learning_rate": 1.7422120505705686e-10,
"loss": 0.1549,
"num_input_tokens_seen": 3479624,
"step": 5605
},
{
"epoch": 10.0,
"grad_norm": 1.7215794324874878,
"learning_rate": 4.839483383478616e-12,
"loss": 0.1694,
"num_input_tokens_seen": 3481336,
"step": 5610
},
{
"epoch": 10.0,
"num_input_tokens_seen": 3481336,
"step": 5610,
"total_flos": 1.5676298662753075e+17,
"train_loss": 0.9318533902924754,
"train_runtime": 970.4341,
"train_samples_per_second": 23.093,
"train_steps_per_second": 5.781
}
],
"logging_steps": 5,
"max_steps": 5610,
"num_input_tokens_seen": 3481336,
"num_train_epochs": 10,
"save_steps": 281,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.5676298662753075e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}