train_openbookqa_1754652174 / trainer_state.json
rbelanec's picture
End of training
0a47cbd verified
{
"best_global_step": 10602,
"best_metric": 0.6984838843345642,
"best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_openbookqa_1754652174/checkpoint-10602",
"epoch": 10.0,
"eval_steps": 558,
"global_step": 11160,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004480286738351254,
"grad_norm": 2.500624895095825,
"learning_rate": 1.7921146953405018e-07,
"loss": 11.4359,
"num_input_tokens_seen": 1792,
"step": 5
},
{
"epoch": 0.008960573476702509,
"grad_norm": 2.8934195041656494,
"learning_rate": 4.032258064516129e-07,
"loss": 11.1971,
"num_input_tokens_seen": 3776,
"step": 10
},
{
"epoch": 0.013440860215053764,
"grad_norm": 3.4254393577575684,
"learning_rate": 6.272401433691756e-07,
"loss": 10.9754,
"num_input_tokens_seen": 5632,
"step": 15
},
{
"epoch": 0.017921146953405017,
"grad_norm": 2.503443479537964,
"learning_rate": 8.512544802867385e-07,
"loss": 11.1833,
"num_input_tokens_seen": 7392,
"step": 20
},
{
"epoch": 0.022401433691756272,
"grad_norm": 2.3445799350738525,
"learning_rate": 1.0752688172043011e-06,
"loss": 11.1593,
"num_input_tokens_seen": 9312,
"step": 25
},
{
"epoch": 0.026881720430107527,
"grad_norm": 2.792506694793701,
"learning_rate": 1.2992831541218638e-06,
"loss": 11.0546,
"num_input_tokens_seen": 11104,
"step": 30
},
{
"epoch": 0.03136200716845878,
"grad_norm": 2.593705892562866,
"learning_rate": 1.5232974910394266e-06,
"loss": 11.0171,
"num_input_tokens_seen": 12928,
"step": 35
},
{
"epoch": 0.035842293906810034,
"grad_norm": 2.4072728157043457,
"learning_rate": 1.7473118279569893e-06,
"loss": 11.0708,
"num_input_tokens_seen": 14816,
"step": 40
},
{
"epoch": 0.04032258064516129,
"grad_norm": 2.383638858795166,
"learning_rate": 1.971326164874552e-06,
"loss": 11.3111,
"num_input_tokens_seen": 16832,
"step": 45
},
{
"epoch": 0.044802867383512544,
"grad_norm": 2.162566661834717,
"learning_rate": 2.1953405017921145e-06,
"loss": 11.2697,
"num_input_tokens_seen": 18560,
"step": 50
},
{
"epoch": 0.0492831541218638,
"grad_norm": 2.1702215671539307,
"learning_rate": 2.4193548387096776e-06,
"loss": 10.932,
"num_input_tokens_seen": 20416,
"step": 55
},
{
"epoch": 0.053763440860215055,
"grad_norm": 2.853740930557251,
"learning_rate": 2.6433691756272402e-06,
"loss": 10.8429,
"num_input_tokens_seen": 22144,
"step": 60
},
{
"epoch": 0.05824372759856631,
"grad_norm": 2.471186876296997,
"learning_rate": 2.867383512544803e-06,
"loss": 11.1565,
"num_input_tokens_seen": 24064,
"step": 65
},
{
"epoch": 0.06272401433691756,
"grad_norm": 2.382014274597168,
"learning_rate": 3.091397849462366e-06,
"loss": 11.0918,
"num_input_tokens_seen": 25920,
"step": 70
},
{
"epoch": 0.06720430107526881,
"grad_norm": 2.6635334491729736,
"learning_rate": 3.3154121863799286e-06,
"loss": 10.9299,
"num_input_tokens_seen": 27744,
"step": 75
},
{
"epoch": 0.07168458781362007,
"grad_norm": 2.3445963859558105,
"learning_rate": 3.5394265232974912e-06,
"loss": 11.2613,
"num_input_tokens_seen": 29664,
"step": 80
},
{
"epoch": 0.07616487455197132,
"grad_norm": 2.5724005699157715,
"learning_rate": 3.763440860215054e-06,
"loss": 10.9325,
"num_input_tokens_seen": 31552,
"step": 85
},
{
"epoch": 0.08064516129032258,
"grad_norm": 2.4035773277282715,
"learning_rate": 3.987455197132617e-06,
"loss": 10.9375,
"num_input_tokens_seen": 33536,
"step": 90
},
{
"epoch": 0.08512544802867383,
"grad_norm": 2.6278600692749023,
"learning_rate": 4.21146953405018e-06,
"loss": 11.0113,
"num_input_tokens_seen": 35360,
"step": 95
},
{
"epoch": 0.08960573476702509,
"grad_norm": 2.3494575023651123,
"learning_rate": 4.435483870967742e-06,
"loss": 11.3767,
"num_input_tokens_seen": 37216,
"step": 100
},
{
"epoch": 0.09408602150537634,
"grad_norm": 2.230680227279663,
"learning_rate": 4.659498207885305e-06,
"loss": 10.8387,
"num_input_tokens_seen": 39104,
"step": 105
},
{
"epoch": 0.0985663082437276,
"grad_norm": 2.627718448638916,
"learning_rate": 4.883512544802868e-06,
"loss": 10.819,
"num_input_tokens_seen": 40960,
"step": 110
},
{
"epoch": 0.10304659498207885,
"grad_norm": 2.3713722229003906,
"learning_rate": 5.1075268817204305e-06,
"loss": 10.9145,
"num_input_tokens_seen": 42880,
"step": 115
},
{
"epoch": 0.10752688172043011,
"grad_norm": 2.1317410469055176,
"learning_rate": 5.331541218637993e-06,
"loss": 10.9142,
"num_input_tokens_seen": 44768,
"step": 120
},
{
"epoch": 0.11200716845878136,
"grad_norm": 3.0103211402893066,
"learning_rate": 5.555555555555556e-06,
"loss": 10.8537,
"num_input_tokens_seen": 46528,
"step": 125
},
{
"epoch": 0.11648745519713262,
"grad_norm": 2.6022181510925293,
"learning_rate": 5.779569892473118e-06,
"loss": 10.5195,
"num_input_tokens_seen": 48544,
"step": 130
},
{
"epoch": 0.12096774193548387,
"grad_norm": 2.920191764831543,
"learning_rate": 6.003584229390681e-06,
"loss": 10.644,
"num_input_tokens_seen": 50432,
"step": 135
},
{
"epoch": 0.12544802867383512,
"grad_norm": 2.679093837738037,
"learning_rate": 6.227598566308244e-06,
"loss": 10.5572,
"num_input_tokens_seen": 52416,
"step": 140
},
{
"epoch": 0.12992831541218638,
"grad_norm": 2.975313901901245,
"learning_rate": 6.451612903225806e-06,
"loss": 10.418,
"num_input_tokens_seen": 54304,
"step": 145
},
{
"epoch": 0.13440860215053763,
"grad_norm": 2.0430831909179688,
"learning_rate": 6.67562724014337e-06,
"loss": 10.6238,
"num_input_tokens_seen": 56256,
"step": 150
},
{
"epoch": 0.1388888888888889,
"grad_norm": 2.5662593841552734,
"learning_rate": 6.8996415770609325e-06,
"loss": 10.529,
"num_input_tokens_seen": 58144,
"step": 155
},
{
"epoch": 0.14336917562724014,
"grad_norm": 2.524044990539551,
"learning_rate": 7.1236559139784956e-06,
"loss": 10.4028,
"num_input_tokens_seen": 60032,
"step": 160
},
{
"epoch": 0.1478494623655914,
"grad_norm": 2.0038323402404785,
"learning_rate": 7.347670250896058e-06,
"loss": 10.5609,
"num_input_tokens_seen": 61824,
"step": 165
},
{
"epoch": 0.15232974910394265,
"grad_norm": 2.9879722595214844,
"learning_rate": 7.571684587813621e-06,
"loss": 10.072,
"num_input_tokens_seen": 63712,
"step": 170
},
{
"epoch": 0.15681003584229392,
"grad_norm": 2.5288474559783936,
"learning_rate": 7.795698924731183e-06,
"loss": 10.3687,
"num_input_tokens_seen": 65600,
"step": 175
},
{
"epoch": 0.16129032258064516,
"grad_norm": 2.761035919189453,
"learning_rate": 8.019713261648744e-06,
"loss": 10.2835,
"num_input_tokens_seen": 67392,
"step": 180
},
{
"epoch": 0.16577060931899643,
"grad_norm": 2.7723684310913086,
"learning_rate": 8.24372759856631e-06,
"loss": 10.0884,
"num_input_tokens_seen": 69120,
"step": 185
},
{
"epoch": 0.17025089605734767,
"grad_norm": 2.9377973079681396,
"learning_rate": 8.46774193548387e-06,
"loss": 10.1739,
"num_input_tokens_seen": 71008,
"step": 190
},
{
"epoch": 0.17473118279569894,
"grad_norm": 2.2513413429260254,
"learning_rate": 8.691756272401434e-06,
"loss": 9.8862,
"num_input_tokens_seen": 72896,
"step": 195
},
{
"epoch": 0.17921146953405018,
"grad_norm": 2.589574098587036,
"learning_rate": 8.915770609318997e-06,
"loss": 10.0585,
"num_input_tokens_seen": 74880,
"step": 200
},
{
"epoch": 0.18369175627240145,
"grad_norm": 2.361954689025879,
"learning_rate": 9.13978494623656e-06,
"loss": 10.0316,
"num_input_tokens_seen": 76768,
"step": 205
},
{
"epoch": 0.1881720430107527,
"grad_norm": 1.8949062824249268,
"learning_rate": 9.363799283154121e-06,
"loss": 10.4095,
"num_input_tokens_seen": 78656,
"step": 210
},
{
"epoch": 0.19265232974910393,
"grad_norm": 2.3154962062835693,
"learning_rate": 9.587813620071686e-06,
"loss": 9.7952,
"num_input_tokens_seen": 80512,
"step": 215
},
{
"epoch": 0.1971326164874552,
"grad_norm": 2.7080109119415283,
"learning_rate": 9.811827956989247e-06,
"loss": 9.9051,
"num_input_tokens_seen": 82240,
"step": 220
},
{
"epoch": 0.20161290322580644,
"grad_norm": 1.9762256145477295,
"learning_rate": 1.003584229390681e-05,
"loss": 9.4651,
"num_input_tokens_seen": 84288,
"step": 225
},
{
"epoch": 0.2060931899641577,
"grad_norm": 2.146616220474243,
"learning_rate": 1.0259856630824373e-05,
"loss": 9.4182,
"num_input_tokens_seen": 86240,
"step": 230
},
{
"epoch": 0.21057347670250895,
"grad_norm": 2.200469970703125,
"learning_rate": 1.0483870967741936e-05,
"loss": 9.2961,
"num_input_tokens_seen": 87968,
"step": 235
},
{
"epoch": 0.21505376344086022,
"grad_norm": 2.0625269412994385,
"learning_rate": 1.0707885304659498e-05,
"loss": 9.3518,
"num_input_tokens_seen": 89920,
"step": 240
},
{
"epoch": 0.21953405017921146,
"grad_norm": 2.391050100326538,
"learning_rate": 1.0931899641577063e-05,
"loss": 9.1173,
"num_input_tokens_seen": 91776,
"step": 245
},
{
"epoch": 0.22401433691756273,
"grad_norm": 2.3285434246063232,
"learning_rate": 1.1155913978494624e-05,
"loss": 9.3096,
"num_input_tokens_seen": 93728,
"step": 250
},
{
"epoch": 0.22849462365591397,
"grad_norm": 2.0512301921844482,
"learning_rate": 1.1379928315412187e-05,
"loss": 9.4857,
"num_input_tokens_seen": 95552,
"step": 255
},
{
"epoch": 0.23297491039426524,
"grad_norm": 2.1123080253601074,
"learning_rate": 1.160394265232975e-05,
"loss": 9.0314,
"num_input_tokens_seen": 97440,
"step": 260
},
{
"epoch": 0.23745519713261648,
"grad_norm": 2.1909310817718506,
"learning_rate": 1.1827956989247313e-05,
"loss": 8.9132,
"num_input_tokens_seen": 99296,
"step": 265
},
{
"epoch": 0.24193548387096775,
"grad_norm": 1.8368756771087646,
"learning_rate": 1.2051971326164874e-05,
"loss": 9.3423,
"num_input_tokens_seen": 101184,
"step": 270
},
{
"epoch": 0.246415770609319,
"grad_norm": 2.124940872192383,
"learning_rate": 1.227598566308244e-05,
"loss": 9.1916,
"num_input_tokens_seen": 103136,
"step": 275
},
{
"epoch": 0.25089605734767023,
"grad_norm": 1.936184287071228,
"learning_rate": 1.25e-05,
"loss": 9.3744,
"num_input_tokens_seen": 105024,
"step": 280
},
{
"epoch": 0.2553763440860215,
"grad_norm": 2.1163675785064697,
"learning_rate": 1.2724014336917564e-05,
"loss": 9.2282,
"num_input_tokens_seen": 106880,
"step": 285
},
{
"epoch": 0.25985663082437277,
"grad_norm": 2.478421449661255,
"learning_rate": 1.2948028673835125e-05,
"loss": 8.8832,
"num_input_tokens_seen": 108640,
"step": 290
},
{
"epoch": 0.26433691756272404,
"grad_norm": 2.1263668537139893,
"learning_rate": 1.3172043010752688e-05,
"loss": 8.4789,
"num_input_tokens_seen": 110528,
"step": 295
},
{
"epoch": 0.26881720430107525,
"grad_norm": 1.8503817319869995,
"learning_rate": 1.3396057347670251e-05,
"loss": 8.5889,
"num_input_tokens_seen": 112480,
"step": 300
},
{
"epoch": 0.2732974910394265,
"grad_norm": 2.2405242919921875,
"learning_rate": 1.3620071684587816e-05,
"loss": 8.7637,
"num_input_tokens_seen": 114368,
"step": 305
},
{
"epoch": 0.2777777777777778,
"grad_norm": 1.857710838317871,
"learning_rate": 1.3844086021505376e-05,
"loss": 8.3565,
"num_input_tokens_seen": 116352,
"step": 310
},
{
"epoch": 0.28225806451612906,
"grad_norm": 2.657458782196045,
"learning_rate": 1.4068100358422939e-05,
"loss": 8.6305,
"num_input_tokens_seen": 118272,
"step": 315
},
{
"epoch": 0.2867383512544803,
"grad_norm": 2.1825602054595947,
"learning_rate": 1.4292114695340503e-05,
"loss": 8.4286,
"num_input_tokens_seen": 120192,
"step": 320
},
{
"epoch": 0.29121863799283154,
"grad_norm": 1.961053729057312,
"learning_rate": 1.4516129032258066e-05,
"loss": 8.1782,
"num_input_tokens_seen": 121952,
"step": 325
},
{
"epoch": 0.2956989247311828,
"grad_norm": 1.9111275672912598,
"learning_rate": 1.4740143369175626e-05,
"loss": 7.8948,
"num_input_tokens_seen": 123712,
"step": 330
},
{
"epoch": 0.300179211469534,
"grad_norm": 1.8738315105438232,
"learning_rate": 1.4964157706093191e-05,
"loss": 8.2841,
"num_input_tokens_seen": 125568,
"step": 335
},
{
"epoch": 0.3046594982078853,
"grad_norm": 2.4100537300109863,
"learning_rate": 1.5188172043010754e-05,
"loss": 8.3544,
"num_input_tokens_seen": 127584,
"step": 340
},
{
"epoch": 0.30913978494623656,
"grad_norm": 2.0588905811309814,
"learning_rate": 1.5412186379928317e-05,
"loss": 7.7921,
"num_input_tokens_seen": 129440,
"step": 345
},
{
"epoch": 0.31362007168458783,
"grad_norm": 2.321556568145752,
"learning_rate": 1.563620071684588e-05,
"loss": 8.171,
"num_input_tokens_seen": 131360,
"step": 350
},
{
"epoch": 0.31810035842293904,
"grad_norm": 2.302530527114868,
"learning_rate": 1.586021505376344e-05,
"loss": 7.9828,
"num_input_tokens_seen": 133312,
"step": 355
},
{
"epoch": 0.3225806451612903,
"grad_norm": 1.879820704460144,
"learning_rate": 1.6084229390681005e-05,
"loss": 8.0835,
"num_input_tokens_seen": 135200,
"step": 360
},
{
"epoch": 0.3270609318996416,
"grad_norm": 1.871687650680542,
"learning_rate": 1.630824372759857e-05,
"loss": 7.8038,
"num_input_tokens_seen": 136896,
"step": 365
},
{
"epoch": 0.33154121863799285,
"grad_norm": 1.9226675033569336,
"learning_rate": 1.653225806451613e-05,
"loss": 8.063,
"num_input_tokens_seen": 139008,
"step": 370
},
{
"epoch": 0.33602150537634407,
"grad_norm": 2.210846185684204,
"learning_rate": 1.6756272401433692e-05,
"loss": 8.1303,
"num_input_tokens_seen": 140832,
"step": 375
},
{
"epoch": 0.34050179211469533,
"grad_norm": 2.274174213409424,
"learning_rate": 1.6980286738351257e-05,
"loss": 7.7176,
"num_input_tokens_seen": 142688,
"step": 380
},
{
"epoch": 0.3449820788530466,
"grad_norm": 2.0752205848693848,
"learning_rate": 1.7204301075268818e-05,
"loss": 7.9585,
"num_input_tokens_seen": 144640,
"step": 385
},
{
"epoch": 0.34946236559139787,
"grad_norm": 1.955730676651001,
"learning_rate": 1.742831541218638e-05,
"loss": 8.0203,
"num_input_tokens_seen": 146496,
"step": 390
},
{
"epoch": 0.3539426523297491,
"grad_norm": 2.04298734664917,
"learning_rate": 1.7652329749103944e-05,
"loss": 7.8494,
"num_input_tokens_seen": 148288,
"step": 395
},
{
"epoch": 0.35842293906810035,
"grad_norm": 2.653252124786377,
"learning_rate": 1.7876344086021506e-05,
"loss": 7.5965,
"num_input_tokens_seen": 150208,
"step": 400
},
{
"epoch": 0.3629032258064516,
"grad_norm": 2.1470870971679688,
"learning_rate": 1.8100358422939067e-05,
"loss": 7.1931,
"num_input_tokens_seen": 152128,
"step": 405
},
{
"epoch": 0.3673835125448029,
"grad_norm": 2.616868019104004,
"learning_rate": 1.8324372759856632e-05,
"loss": 7.3095,
"num_input_tokens_seen": 153728,
"step": 410
},
{
"epoch": 0.3718637992831541,
"grad_norm": 1.9402117729187012,
"learning_rate": 1.8548387096774193e-05,
"loss": 7.0567,
"num_input_tokens_seen": 155648,
"step": 415
},
{
"epoch": 0.3763440860215054,
"grad_norm": 2.4077136516571045,
"learning_rate": 1.8772401433691758e-05,
"loss": 7.0942,
"num_input_tokens_seen": 157568,
"step": 420
},
{
"epoch": 0.38082437275985664,
"grad_norm": 2.162140369415283,
"learning_rate": 1.899641577060932e-05,
"loss": 7.0183,
"num_input_tokens_seen": 159392,
"step": 425
},
{
"epoch": 0.38530465949820786,
"grad_norm": 2.530998706817627,
"learning_rate": 1.922043010752688e-05,
"loss": 6.6967,
"num_input_tokens_seen": 161312,
"step": 430
},
{
"epoch": 0.3897849462365591,
"grad_norm": 2.8186707496643066,
"learning_rate": 1.9444444444444445e-05,
"loss": 6.5046,
"num_input_tokens_seen": 163168,
"step": 435
},
{
"epoch": 0.3942652329749104,
"grad_norm": 2.482224702835083,
"learning_rate": 1.966845878136201e-05,
"loss": 6.8594,
"num_input_tokens_seen": 165120,
"step": 440
},
{
"epoch": 0.39874551971326166,
"grad_norm": 2.4772849082946777,
"learning_rate": 1.989247311827957e-05,
"loss": 6.8452,
"num_input_tokens_seen": 166976,
"step": 445
},
{
"epoch": 0.4032258064516129,
"grad_norm": 2.0559003353118896,
"learning_rate": 2.0116487455197133e-05,
"loss": 6.7853,
"num_input_tokens_seen": 168768,
"step": 450
},
{
"epoch": 0.40770609318996415,
"grad_norm": 1.855859637260437,
"learning_rate": 2.0340501792114698e-05,
"loss": 6.8535,
"num_input_tokens_seen": 170592,
"step": 455
},
{
"epoch": 0.4121863799283154,
"grad_norm": 2.203948974609375,
"learning_rate": 2.056451612903226e-05,
"loss": 6.5722,
"num_input_tokens_seen": 172736,
"step": 460
},
{
"epoch": 0.4166666666666667,
"grad_norm": 1.9957222938537598,
"learning_rate": 2.078853046594982e-05,
"loss": 6.6833,
"num_input_tokens_seen": 174592,
"step": 465
},
{
"epoch": 0.4211469534050179,
"grad_norm": 1.826316475868225,
"learning_rate": 2.1012544802867385e-05,
"loss": 6.4222,
"num_input_tokens_seen": 176704,
"step": 470
},
{
"epoch": 0.42562724014336917,
"grad_norm": 1.9822869300842285,
"learning_rate": 2.1236559139784946e-05,
"loss": 6.1397,
"num_input_tokens_seen": 178816,
"step": 475
},
{
"epoch": 0.43010752688172044,
"grad_norm": 2.5274133682250977,
"learning_rate": 2.146057347670251e-05,
"loss": 6.353,
"num_input_tokens_seen": 180608,
"step": 480
},
{
"epoch": 0.4345878136200717,
"grad_norm": 2.535459518432617,
"learning_rate": 2.1684587813620073e-05,
"loss": 5.812,
"num_input_tokens_seen": 182368,
"step": 485
},
{
"epoch": 0.4390681003584229,
"grad_norm": 2.3390567302703857,
"learning_rate": 2.1908602150537634e-05,
"loss": 5.6701,
"num_input_tokens_seen": 184128,
"step": 490
},
{
"epoch": 0.4435483870967742,
"grad_norm": 1.7963190078735352,
"learning_rate": 2.21326164874552e-05,
"loss": 6.1912,
"num_input_tokens_seen": 186144,
"step": 495
},
{
"epoch": 0.44802867383512546,
"grad_norm": 2.1545753479003906,
"learning_rate": 2.235663082437276e-05,
"loss": 5.945,
"num_input_tokens_seen": 188160,
"step": 500
},
{
"epoch": 0.4525089605734767,
"grad_norm": 2.177809953689575,
"learning_rate": 2.258064516129032e-05,
"loss": 5.495,
"num_input_tokens_seen": 190016,
"step": 505
},
{
"epoch": 0.45698924731182794,
"grad_norm": 1.9128177165985107,
"learning_rate": 2.2804659498207886e-05,
"loss": 5.3772,
"num_input_tokens_seen": 191840,
"step": 510
},
{
"epoch": 0.4614695340501792,
"grad_norm": 1.9529114961624146,
"learning_rate": 2.302867383512545e-05,
"loss": 5.6659,
"num_input_tokens_seen": 193920,
"step": 515
},
{
"epoch": 0.4659498207885305,
"grad_norm": 2.681643486022949,
"learning_rate": 2.325268817204301e-05,
"loss": 5.3074,
"num_input_tokens_seen": 195744,
"step": 520
},
{
"epoch": 0.47043010752688175,
"grad_norm": 2.481017827987671,
"learning_rate": 2.3476702508960574e-05,
"loss": 4.8561,
"num_input_tokens_seen": 197632,
"step": 525
},
{
"epoch": 0.47491039426523296,
"grad_norm": 2.2446951866149902,
"learning_rate": 2.370071684587814e-05,
"loss": 5.0107,
"num_input_tokens_seen": 199424,
"step": 530
},
{
"epoch": 0.47939068100358423,
"grad_norm": 1.7352396249771118,
"learning_rate": 2.39247311827957e-05,
"loss": 4.8499,
"num_input_tokens_seen": 201280,
"step": 535
},
{
"epoch": 0.4838709677419355,
"grad_norm": 2.33585262298584,
"learning_rate": 2.414874551971326e-05,
"loss": 4.6192,
"num_input_tokens_seen": 203072,
"step": 540
},
{
"epoch": 0.4883512544802867,
"grad_norm": 2.1425092220306396,
"learning_rate": 2.4372759856630826e-05,
"loss": 5.0357,
"num_input_tokens_seen": 204992,
"step": 545
},
{
"epoch": 0.492831541218638,
"grad_norm": 2.2969155311584473,
"learning_rate": 2.4596774193548387e-05,
"loss": 4.7181,
"num_input_tokens_seen": 206912,
"step": 550
},
{
"epoch": 0.49731182795698925,
"grad_norm": 2.9185574054718018,
"learning_rate": 2.4820788530465952e-05,
"loss": 4.7201,
"num_input_tokens_seen": 208864,
"step": 555
},
{
"epoch": 0.5,
"eval_loss": 4.355785369873047,
"eval_runtime": 5.6351,
"eval_samples_per_second": 88.019,
"eval_steps_per_second": 22.005,
"num_input_tokens_seen": 210048,
"step": 558
},
{
"epoch": 0.5017921146953405,
"grad_norm": 2.5545475482940674,
"learning_rate": 2.5044802867383517e-05,
"loss": 4.6788,
"num_input_tokens_seen": 210816,
"step": 560
},
{
"epoch": 0.5062724014336918,
"grad_norm": 2.3069698810577393,
"learning_rate": 2.5268817204301075e-05,
"loss": 4.1564,
"num_input_tokens_seen": 212640,
"step": 565
},
{
"epoch": 0.510752688172043,
"grad_norm": 2.7041890621185303,
"learning_rate": 2.5492831541218636e-05,
"loss": 4.1372,
"num_input_tokens_seen": 214560,
"step": 570
},
{
"epoch": 0.5152329749103942,
"grad_norm": 2.012913227081299,
"learning_rate": 2.5716845878136204e-05,
"loss": 3.9504,
"num_input_tokens_seen": 216384,
"step": 575
},
{
"epoch": 0.5197132616487455,
"grad_norm": 2.3750064373016357,
"learning_rate": 2.5940860215053762e-05,
"loss": 3.7693,
"num_input_tokens_seen": 218304,
"step": 580
},
{
"epoch": 0.5241935483870968,
"grad_norm": 2.0457749366760254,
"learning_rate": 2.616487455197133e-05,
"loss": 3.8367,
"num_input_tokens_seen": 220128,
"step": 585
},
{
"epoch": 0.5286738351254481,
"grad_norm": 2.2593724727630615,
"learning_rate": 2.6388888888888892e-05,
"loss": 3.689,
"num_input_tokens_seen": 222016,
"step": 590
},
{
"epoch": 0.5331541218637993,
"grad_norm": 2.111712694168091,
"learning_rate": 2.661290322580645e-05,
"loss": 3.5935,
"num_input_tokens_seen": 224000,
"step": 595
},
{
"epoch": 0.5376344086021505,
"grad_norm": 2.8802244663238525,
"learning_rate": 2.6836917562724018e-05,
"loss": 3.5805,
"num_input_tokens_seen": 225920,
"step": 600
},
{
"epoch": 0.5421146953405018,
"grad_norm": 1.9827439785003662,
"learning_rate": 2.706093189964158e-05,
"loss": 3.2292,
"num_input_tokens_seen": 227712,
"step": 605
},
{
"epoch": 0.546594982078853,
"grad_norm": 2.3735995292663574,
"learning_rate": 2.728494623655914e-05,
"loss": 3.8282,
"num_input_tokens_seen": 229824,
"step": 610
},
{
"epoch": 0.5510752688172043,
"grad_norm": 2.136101007461548,
"learning_rate": 2.7508960573476705e-05,
"loss": 3.5909,
"num_input_tokens_seen": 231840,
"step": 615
},
{
"epoch": 0.5555555555555556,
"grad_norm": 2.7474825382232666,
"learning_rate": 2.7732974910394267e-05,
"loss": 3.5369,
"num_input_tokens_seen": 233856,
"step": 620
},
{
"epoch": 0.5600358422939068,
"grad_norm": 1.956363320350647,
"learning_rate": 2.7956989247311828e-05,
"loss": 2.6224,
"num_input_tokens_seen": 235680,
"step": 625
},
{
"epoch": 0.5645161290322581,
"grad_norm": 2.3791565895080566,
"learning_rate": 2.8181003584229393e-05,
"loss": 2.5847,
"num_input_tokens_seen": 237536,
"step": 630
},
{
"epoch": 0.5689964157706093,
"grad_norm": 2.644176483154297,
"learning_rate": 2.8405017921146954e-05,
"loss": 2.8541,
"num_input_tokens_seen": 239488,
"step": 635
},
{
"epoch": 0.5734767025089605,
"grad_norm": 2.3562235832214355,
"learning_rate": 2.862903225806452e-05,
"loss": 2.5187,
"num_input_tokens_seen": 241376,
"step": 640
},
{
"epoch": 0.5779569892473119,
"grad_norm": 2.059034824371338,
"learning_rate": 2.885304659498208e-05,
"loss": 2.3649,
"num_input_tokens_seen": 243200,
"step": 645
},
{
"epoch": 0.5824372759856631,
"grad_norm": 2.070953130722046,
"learning_rate": 2.9077060931899642e-05,
"loss": 2.2836,
"num_input_tokens_seen": 245056,
"step": 650
},
{
"epoch": 0.5869175627240143,
"grad_norm": 1.588262915611267,
"learning_rate": 2.9301075268817207e-05,
"loss": 2.0521,
"num_input_tokens_seen": 246880,
"step": 655
},
{
"epoch": 0.5913978494623656,
"grad_norm": 2.269887924194336,
"learning_rate": 2.9525089605734768e-05,
"loss": 2.0766,
"num_input_tokens_seen": 248832,
"step": 660
},
{
"epoch": 0.5958781362007168,
"grad_norm": 2.170715570449829,
"learning_rate": 2.974910394265233e-05,
"loss": 2.0889,
"num_input_tokens_seen": 250880,
"step": 665
},
{
"epoch": 0.600358422939068,
"grad_norm": 2.2083332538604736,
"learning_rate": 2.9973118279569894e-05,
"loss": 2.249,
"num_input_tokens_seen": 252864,
"step": 670
},
{
"epoch": 0.6048387096774194,
"grad_norm": 2.376215934753418,
"learning_rate": 3.0197132616487455e-05,
"loss": 1.6939,
"num_input_tokens_seen": 254656,
"step": 675
},
{
"epoch": 0.6093189964157706,
"grad_norm": 1.581056833267212,
"learning_rate": 3.0421146953405024e-05,
"loss": 1.9665,
"num_input_tokens_seen": 256640,
"step": 680
},
{
"epoch": 0.6137992831541219,
"grad_norm": 2.688706398010254,
"learning_rate": 3.0645161290322585e-05,
"loss": 1.8917,
"num_input_tokens_seen": 258560,
"step": 685
},
{
"epoch": 0.6182795698924731,
"grad_norm": 2.051927328109741,
"learning_rate": 3.0869175627240146e-05,
"loss": 1.9391,
"num_input_tokens_seen": 260544,
"step": 690
},
{
"epoch": 0.6227598566308243,
"grad_norm": 2.0645041465759277,
"learning_rate": 3.109318996415771e-05,
"loss": 1.6377,
"num_input_tokens_seen": 262368,
"step": 695
},
{
"epoch": 0.6272401433691757,
"grad_norm": 1.9354743957519531,
"learning_rate": 3.131720430107527e-05,
"loss": 1.6164,
"num_input_tokens_seen": 264288,
"step": 700
},
{
"epoch": 0.6317204301075269,
"grad_norm": 2.0850887298583984,
"learning_rate": 3.154121863799283e-05,
"loss": 1.3333,
"num_input_tokens_seen": 266112,
"step": 705
},
{
"epoch": 0.6362007168458781,
"grad_norm": 2.084470748901367,
"learning_rate": 3.17652329749104e-05,
"loss": 1.6501,
"num_input_tokens_seen": 268064,
"step": 710
},
{
"epoch": 0.6406810035842294,
"grad_norm": 2.0416452884674072,
"learning_rate": 3.198924731182796e-05,
"loss": 1.1101,
"num_input_tokens_seen": 269792,
"step": 715
},
{
"epoch": 0.6451612903225806,
"grad_norm": 1.9667887687683105,
"learning_rate": 3.221326164874552e-05,
"loss": 1.2862,
"num_input_tokens_seen": 271616,
"step": 720
},
{
"epoch": 0.649641577060932,
"grad_norm": 1.1966661214828491,
"learning_rate": 3.243727598566308e-05,
"loss": 1.4731,
"num_input_tokens_seen": 273568,
"step": 725
},
{
"epoch": 0.6541218637992832,
"grad_norm": 2.009204387664795,
"learning_rate": 3.2661290322580644e-05,
"loss": 1.3512,
"num_input_tokens_seen": 275552,
"step": 730
},
{
"epoch": 0.6586021505376344,
"grad_norm": 1.6872437000274658,
"learning_rate": 3.288530465949821e-05,
"loss": 1.2472,
"num_input_tokens_seen": 277472,
"step": 735
},
{
"epoch": 0.6630824372759857,
"grad_norm": 1.8811956644058228,
"learning_rate": 3.3109318996415774e-05,
"loss": 1.1825,
"num_input_tokens_seen": 279360,
"step": 740
},
{
"epoch": 0.6675627240143369,
"grad_norm": 1.574930191040039,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.2683,
"num_input_tokens_seen": 281440,
"step": 745
},
{
"epoch": 0.6720430107526881,
"grad_norm": 1.8590502738952637,
"learning_rate": 3.3557347670250896e-05,
"loss": 1.0297,
"num_input_tokens_seen": 283328,
"step": 750
},
{
"epoch": 0.6765232974910395,
"grad_norm": 2.326279878616333,
"learning_rate": 3.378136200716846e-05,
"loss": 1.0714,
"num_input_tokens_seen": 285184,
"step": 755
},
{
"epoch": 0.6810035842293907,
"grad_norm": 1.9387420415878296,
"learning_rate": 3.400537634408602e-05,
"loss": 0.9402,
"num_input_tokens_seen": 286944,
"step": 760
},
{
"epoch": 0.6854838709677419,
"grad_norm": 1.8371498584747314,
"learning_rate": 3.422939068100359e-05,
"loss": 1.3783,
"num_input_tokens_seen": 289120,
"step": 765
},
{
"epoch": 0.6899641577060932,
"grad_norm": 1.6254054307937622,
"learning_rate": 3.445340501792115e-05,
"loss": 1.1224,
"num_input_tokens_seen": 291008,
"step": 770
},
{
"epoch": 0.6944444444444444,
"grad_norm": 1.7399088144302368,
"learning_rate": 3.467741935483872e-05,
"loss": 1.228,
"num_input_tokens_seen": 292960,
"step": 775
},
{
"epoch": 0.6989247311827957,
"grad_norm": 1.1927127838134766,
"learning_rate": 3.490143369175627e-05,
"loss": 0.9204,
"num_input_tokens_seen": 294752,
"step": 780
},
{
"epoch": 0.703405017921147,
"grad_norm": 2.3804333209991455,
"learning_rate": 3.512544802867383e-05,
"loss": 1.131,
"num_input_tokens_seen": 296672,
"step": 785
},
{
"epoch": 0.7078853046594982,
"grad_norm": 1.4596134424209595,
"learning_rate": 3.53494623655914e-05,
"loss": 0.9233,
"num_input_tokens_seen": 298528,
"step": 790
},
{
"epoch": 0.7123655913978495,
"grad_norm": 2.27669095993042,
"learning_rate": 3.557347670250896e-05,
"loss": 1.386,
"num_input_tokens_seen": 300672,
"step": 795
},
{
"epoch": 0.7168458781362007,
"grad_norm": 2.4323675632476807,
"learning_rate": 3.5797491039426524e-05,
"loss": 0.9721,
"num_input_tokens_seen": 302528,
"step": 800
},
{
"epoch": 0.7213261648745519,
"grad_norm": 1.8626689910888672,
"learning_rate": 3.602150537634409e-05,
"loss": 1.1334,
"num_input_tokens_seen": 304512,
"step": 805
},
{
"epoch": 0.7258064516129032,
"grad_norm": 1.959444522857666,
"learning_rate": 3.624551971326165e-05,
"loss": 0.9517,
"num_input_tokens_seen": 306240,
"step": 810
},
{
"epoch": 0.7302867383512545,
"grad_norm": 1.3139394521713257,
"learning_rate": 3.6469534050179214e-05,
"loss": 0.9271,
"num_input_tokens_seen": 308160,
"step": 815
},
{
"epoch": 0.7347670250896058,
"grad_norm": 1.1890360116958618,
"learning_rate": 3.6693548387096776e-05,
"loss": 0.8931,
"num_input_tokens_seen": 309952,
"step": 820
},
{
"epoch": 0.739247311827957,
"grad_norm": 1.6261545419692993,
"learning_rate": 3.691756272401434e-05,
"loss": 0.8954,
"num_input_tokens_seen": 311936,
"step": 825
},
{
"epoch": 0.7437275985663082,
"grad_norm": 2.207946538925171,
"learning_rate": 3.7141577060931905e-05,
"loss": 1.0085,
"num_input_tokens_seen": 313760,
"step": 830
},
{
"epoch": 0.7482078853046595,
"grad_norm": 1.2214912176132202,
"learning_rate": 3.736559139784947e-05,
"loss": 0.8793,
"num_input_tokens_seen": 315456,
"step": 835
},
{
"epoch": 0.7526881720430108,
"grad_norm": 1.8236546516418457,
"learning_rate": 3.758960573476703e-05,
"loss": 0.8436,
"num_input_tokens_seen": 317312,
"step": 840
},
{
"epoch": 0.757168458781362,
"grad_norm": 1.6997966766357422,
"learning_rate": 3.781362007168459e-05,
"loss": 0.9173,
"num_input_tokens_seen": 319264,
"step": 845
},
{
"epoch": 0.7616487455197133,
"grad_norm": 1.9939361810684204,
"learning_rate": 3.803763440860215e-05,
"loss": 1.0545,
"num_input_tokens_seen": 321248,
"step": 850
},
{
"epoch": 0.7661290322580645,
"grad_norm": 1.4386049509048462,
"learning_rate": 3.826164874551971e-05,
"loss": 0.8228,
"num_input_tokens_seen": 323072,
"step": 855
},
{
"epoch": 0.7706093189964157,
"grad_norm": 1.3483433723449707,
"learning_rate": 3.848566308243728e-05,
"loss": 0.784,
"num_input_tokens_seen": 324896,
"step": 860
},
{
"epoch": 0.775089605734767,
"grad_norm": 1.7732473611831665,
"learning_rate": 3.870967741935484e-05,
"loss": 0.8639,
"num_input_tokens_seen": 326592,
"step": 865
},
{
"epoch": 0.7795698924731183,
"grad_norm": 2.9457969665527344,
"learning_rate": 3.89336917562724e-05,
"loss": 1.0969,
"num_input_tokens_seen": 328480,
"step": 870
},
{
"epoch": 0.7840501792114696,
"grad_norm": 1.3309738636016846,
"learning_rate": 3.9157706093189964e-05,
"loss": 0.814,
"num_input_tokens_seen": 330240,
"step": 875
},
{
"epoch": 0.7885304659498208,
"grad_norm": 1.2904449701309204,
"learning_rate": 3.9381720430107526e-05,
"loss": 0.8384,
"num_input_tokens_seen": 331936,
"step": 880
},
{
"epoch": 0.793010752688172,
"grad_norm": 1.127160906791687,
"learning_rate": 3.9605734767025094e-05,
"loss": 0.8072,
"num_input_tokens_seen": 333664,
"step": 885
},
{
"epoch": 0.7974910394265233,
"grad_norm": 1.7538950443267822,
"learning_rate": 3.9829749103942655e-05,
"loss": 0.7792,
"num_input_tokens_seen": 335488,
"step": 890
},
{
"epoch": 0.8019713261648745,
"grad_norm": 1.3736745119094849,
"learning_rate": 4.005376344086022e-05,
"loss": 0.8026,
"num_input_tokens_seen": 337280,
"step": 895
},
{
"epoch": 0.8064516129032258,
"grad_norm": 1.918084740638733,
"learning_rate": 4.027777777777778e-05,
"loss": 1.027,
"num_input_tokens_seen": 339360,
"step": 900
},
{
"epoch": 0.8109318996415771,
"grad_norm": 1.6451882123947144,
"learning_rate": 4.050179211469534e-05,
"loss": 0.7405,
"num_input_tokens_seen": 341184,
"step": 905
},
{
"epoch": 0.8154121863799283,
"grad_norm": 2.081998586654663,
"learning_rate": 4.072580645161291e-05,
"loss": 0.8656,
"num_input_tokens_seen": 343072,
"step": 910
},
{
"epoch": 0.8198924731182796,
"grad_norm": 3.3316433429718018,
"learning_rate": 4.094982078853047e-05,
"loss": 0.9319,
"num_input_tokens_seen": 344928,
"step": 915
},
{
"epoch": 0.8243727598566308,
"grad_norm": 1.1804288625717163,
"learning_rate": 4.117383512544803e-05,
"loss": 0.7355,
"num_input_tokens_seen": 346752,
"step": 920
},
{
"epoch": 0.828853046594982,
"grad_norm": 1.9487136602401733,
"learning_rate": 4.13978494623656e-05,
"loss": 0.8087,
"num_input_tokens_seen": 348608,
"step": 925
},
{
"epoch": 0.8333333333333334,
"grad_norm": 2.2469987869262695,
"learning_rate": 4.162186379928315e-05,
"loss": 1.0164,
"num_input_tokens_seen": 350496,
"step": 930
},
{
"epoch": 0.8378136200716846,
"grad_norm": 1.9513884782791138,
"learning_rate": 4.1845878136200714e-05,
"loss": 0.8033,
"num_input_tokens_seen": 352416,
"step": 935
},
{
"epoch": 0.8422939068100358,
"grad_norm": 2.2020530700683594,
"learning_rate": 4.206989247311828e-05,
"loss": 0.7324,
"num_input_tokens_seen": 354272,
"step": 940
},
{
"epoch": 0.8467741935483871,
"grad_norm": 1.5562139749526978,
"learning_rate": 4.2293906810035844e-05,
"loss": 0.8158,
"num_input_tokens_seen": 356064,
"step": 945
},
{
"epoch": 0.8512544802867383,
"grad_norm": 2.2040066719055176,
"learning_rate": 4.2517921146953405e-05,
"loss": 0.8879,
"num_input_tokens_seen": 358016,
"step": 950
},
{
"epoch": 0.8557347670250897,
"grad_norm": 1.9474464654922485,
"learning_rate": 4.2741935483870973e-05,
"loss": 0.8055,
"num_input_tokens_seen": 359808,
"step": 955
},
{
"epoch": 0.8602150537634409,
"grad_norm": 1.0472666025161743,
"learning_rate": 4.296594982078853e-05,
"loss": 0.7877,
"num_input_tokens_seen": 361664,
"step": 960
},
{
"epoch": 0.8646953405017921,
"grad_norm": 1.2779587507247925,
"learning_rate": 4.3189964157706096e-05,
"loss": 0.738,
"num_input_tokens_seen": 363488,
"step": 965
},
{
"epoch": 0.8691756272401434,
"grad_norm": 1.3081976175308228,
"learning_rate": 4.341397849462366e-05,
"loss": 0.9189,
"num_input_tokens_seen": 365632,
"step": 970
},
{
"epoch": 0.8736559139784946,
"grad_norm": 1.8249237537384033,
"learning_rate": 4.363799283154122e-05,
"loss": 0.8646,
"num_input_tokens_seen": 367616,
"step": 975
},
{
"epoch": 0.8781362007168458,
"grad_norm": 1.5187082290649414,
"learning_rate": 4.386200716845879e-05,
"loss": 0.7628,
"num_input_tokens_seen": 369408,
"step": 980
},
{
"epoch": 0.8826164874551972,
"grad_norm": 2.554736614227295,
"learning_rate": 4.408602150537635e-05,
"loss": 0.8075,
"num_input_tokens_seen": 371232,
"step": 985
},
{
"epoch": 0.8870967741935484,
"grad_norm": 1.244150161743164,
"learning_rate": 4.431003584229391e-05,
"loss": 0.8056,
"num_input_tokens_seen": 373088,
"step": 990
},
{
"epoch": 0.8915770609318996,
"grad_norm": 1.334649920463562,
"learning_rate": 4.453405017921147e-05,
"loss": 0.7364,
"num_input_tokens_seen": 374944,
"step": 995
},
{
"epoch": 0.8960573476702509,
"grad_norm": 1.1086010932922363,
"learning_rate": 4.475806451612903e-05,
"loss": 0.7582,
"num_input_tokens_seen": 376800,
"step": 1000
},
{
"epoch": 0.9005376344086021,
"grad_norm": 1.9798866510391235,
"learning_rate": 4.49820788530466e-05,
"loss": 0.8295,
"num_input_tokens_seen": 378720,
"step": 1005
},
{
"epoch": 0.9050179211469535,
"grad_norm": 1.3580180406570435,
"learning_rate": 4.520609318996416e-05,
"loss": 0.9695,
"num_input_tokens_seen": 380768,
"step": 1010
},
{
"epoch": 0.9094982078853047,
"grad_norm": 1.9761812686920166,
"learning_rate": 4.543010752688172e-05,
"loss": 0.8012,
"num_input_tokens_seen": 382752,
"step": 1015
},
{
"epoch": 0.9139784946236559,
"grad_norm": 1.539128065109253,
"learning_rate": 4.5654121863799285e-05,
"loss": 0.7739,
"num_input_tokens_seen": 384576,
"step": 1020
},
{
"epoch": 0.9184587813620072,
"grad_norm": 1.047807216644287,
"learning_rate": 4.5878136200716846e-05,
"loss": 0.7571,
"num_input_tokens_seen": 386368,
"step": 1025
},
{
"epoch": 0.9229390681003584,
"grad_norm": 0.7635663151741028,
"learning_rate": 4.610215053763441e-05,
"loss": 0.725,
"num_input_tokens_seen": 388192,
"step": 1030
},
{
"epoch": 0.9274193548387096,
"grad_norm": 0.9058603644371033,
"learning_rate": 4.6326164874551976e-05,
"loss": 0.6824,
"num_input_tokens_seen": 390016,
"step": 1035
},
{
"epoch": 0.931899641577061,
"grad_norm": 2.0875561237335205,
"learning_rate": 4.655017921146954e-05,
"loss": 0.7599,
"num_input_tokens_seen": 391904,
"step": 1040
},
{
"epoch": 0.9363799283154122,
"grad_norm": 1.1811761856079102,
"learning_rate": 4.67741935483871e-05,
"loss": 0.7755,
"num_input_tokens_seen": 393888,
"step": 1045
},
{
"epoch": 0.9408602150537635,
"grad_norm": 1.8433384895324707,
"learning_rate": 4.699820788530466e-05,
"loss": 1.0802,
"num_input_tokens_seen": 396000,
"step": 1050
},
{
"epoch": 0.9453405017921147,
"grad_norm": 1.3989745378494263,
"learning_rate": 4.722222222222222e-05,
"loss": 0.7628,
"num_input_tokens_seen": 397824,
"step": 1055
},
{
"epoch": 0.9498207885304659,
"grad_norm": 1.1494001150131226,
"learning_rate": 4.744623655913979e-05,
"loss": 0.7441,
"num_input_tokens_seen": 399648,
"step": 1060
},
{
"epoch": 0.9543010752688172,
"grad_norm": 1.8642150163650513,
"learning_rate": 4.767025089605735e-05,
"loss": 0.9036,
"num_input_tokens_seen": 401728,
"step": 1065
},
{
"epoch": 0.9587813620071685,
"grad_norm": 0.9621382355690002,
"learning_rate": 4.789426523297491e-05,
"loss": 0.7951,
"num_input_tokens_seen": 403616,
"step": 1070
},
{
"epoch": 0.9632616487455197,
"grad_norm": 1.2531604766845703,
"learning_rate": 4.811827956989248e-05,
"loss": 0.8256,
"num_input_tokens_seen": 405504,
"step": 1075
},
{
"epoch": 0.967741935483871,
"grad_norm": 1.0363707542419434,
"learning_rate": 4.8342293906810035e-05,
"loss": 0.7455,
"num_input_tokens_seen": 407392,
"step": 1080
},
{
"epoch": 0.9722222222222222,
"grad_norm": 1.4194499254226685,
"learning_rate": 4.8566308243727596e-05,
"loss": 0.7447,
"num_input_tokens_seen": 409216,
"step": 1085
},
{
"epoch": 0.9767025089605734,
"grad_norm": 1.6883561611175537,
"learning_rate": 4.8790322580645164e-05,
"loss": 0.8617,
"num_input_tokens_seen": 411104,
"step": 1090
},
{
"epoch": 0.9811827956989247,
"grad_norm": 1.1037418842315674,
"learning_rate": 4.9014336917562726e-05,
"loss": 0.697,
"num_input_tokens_seen": 412992,
"step": 1095
},
{
"epoch": 0.985663082437276,
"grad_norm": 1.2818868160247803,
"learning_rate": 4.9238351254480294e-05,
"loss": 0.7121,
"num_input_tokens_seen": 414656,
"step": 1100
},
{
"epoch": 0.9901433691756273,
"grad_norm": 1.3587709665298462,
"learning_rate": 4.9462365591397855e-05,
"loss": 0.8608,
"num_input_tokens_seen": 416736,
"step": 1105
},
{
"epoch": 0.9946236559139785,
"grad_norm": 1.685178518295288,
"learning_rate": 4.968637992831541e-05,
"loss": 0.7884,
"num_input_tokens_seen": 418496,
"step": 1110
},
{
"epoch": 0.9991039426523297,
"grad_norm": 1.2909343242645264,
"learning_rate": 4.991039426523298e-05,
"loss": 0.8136,
"num_input_tokens_seen": 420448,
"step": 1115
},
{
"epoch": 1.0,
"eval_loss": 0.8080337047576904,
"eval_runtime": 5.6144,
"eval_samples_per_second": 88.345,
"eval_steps_per_second": 22.086,
"num_input_tokens_seen": 420520,
"step": 1116
},
{
"epoch": 1.003584229390681,
"grad_norm": 1.6225780248641968,
"learning_rate": 4.9999988993763824e-05,
"loss": 0.819,
"num_input_tokens_seen": 422088,
"step": 1120
},
{
"epoch": 1.0080645161290323,
"grad_norm": 0.8704792857170105,
"learning_rate": 4.9999921733466727e-05,
"loss": 0.814,
"num_input_tokens_seen": 423912,
"step": 1125
},
{
"epoch": 1.0125448028673836,
"grad_norm": 1.1995881795883179,
"learning_rate": 4.9999793327612486e-05,
"loss": 0.7385,
"num_input_tokens_seen": 425768,
"step": 1130
},
{
"epoch": 1.0170250896057347,
"grad_norm": 1.436139464378357,
"learning_rate": 4.999960377651517e-05,
"loss": 0.7454,
"num_input_tokens_seen": 427528,
"step": 1135
},
{
"epoch": 1.021505376344086,
"grad_norm": 0.8228470087051392,
"learning_rate": 4.9999353080638376e-05,
"loss": 0.8053,
"num_input_tokens_seen": 429416,
"step": 1140
},
{
"epoch": 1.0259856630824373,
"grad_norm": 1.5793920755386353,
"learning_rate": 4.9999041240595276e-05,
"loss": 0.8063,
"num_input_tokens_seen": 431080,
"step": 1145
},
{
"epoch": 1.0304659498207884,
"grad_norm": 2.1269445419311523,
"learning_rate": 4.9998668257148576e-05,
"loss": 0.8313,
"num_input_tokens_seen": 432936,
"step": 1150
},
{
"epoch": 1.0349462365591398,
"grad_norm": 1.4320948123931885,
"learning_rate": 4.999823413121053e-05,
"loss": 0.8094,
"num_input_tokens_seen": 434984,
"step": 1155
},
{
"epoch": 1.039426523297491,
"grad_norm": 1.0124826431274414,
"learning_rate": 4.999773886384293e-05,
"loss": 0.6991,
"num_input_tokens_seen": 436744,
"step": 1160
},
{
"epoch": 1.0439068100358422,
"grad_norm": 1.4147486686706543,
"learning_rate": 4.9997182456257116e-05,
"loss": 0.7146,
"num_input_tokens_seen": 438568,
"step": 1165
},
{
"epoch": 1.0483870967741935,
"grad_norm": 1.1802654266357422,
"learning_rate": 4.999656490981397e-05,
"loss": 0.7963,
"num_input_tokens_seen": 440424,
"step": 1170
},
{
"epoch": 1.0528673835125448,
"grad_norm": 1.2537444829940796,
"learning_rate": 4.9995886226023913e-05,
"loss": 0.829,
"num_input_tokens_seen": 442504,
"step": 1175
},
{
"epoch": 1.0573476702508962,
"grad_norm": 1.1889104843139648,
"learning_rate": 4.999514640654688e-05,
"loss": 0.7126,
"num_input_tokens_seen": 444328,
"step": 1180
},
{
"epoch": 1.0618279569892473,
"grad_norm": 0.9470105767250061,
"learning_rate": 4.999434545319234e-05,
"loss": 0.8046,
"num_input_tokens_seen": 446312,
"step": 1185
},
{
"epoch": 1.0663082437275986,
"grad_norm": 1.1713849306106567,
"learning_rate": 4.999348336791929e-05,
"loss": 0.741,
"num_input_tokens_seen": 448232,
"step": 1190
},
{
"epoch": 1.07078853046595,
"grad_norm": 1.3088457584381104,
"learning_rate": 4.9992560152836264e-05,
"loss": 0.7323,
"num_input_tokens_seen": 450056,
"step": 1195
},
{
"epoch": 1.075268817204301,
"grad_norm": 1.7432241439819336,
"learning_rate": 4.999157581020126e-05,
"loss": 0.7173,
"num_input_tokens_seen": 451976,
"step": 1200
},
{
"epoch": 1.0797491039426523,
"grad_norm": 1.3616633415222168,
"learning_rate": 4.9990530342421835e-05,
"loss": 0.7661,
"num_input_tokens_seen": 454056,
"step": 1205
},
{
"epoch": 1.0842293906810037,
"grad_norm": 1.893376111984253,
"learning_rate": 4.998942375205502e-05,
"loss": 0.8518,
"num_input_tokens_seen": 455880,
"step": 1210
},
{
"epoch": 1.0887096774193548,
"grad_norm": 1.1770555973052979,
"learning_rate": 4.9988256041807334e-05,
"loss": 0.8027,
"num_input_tokens_seen": 457736,
"step": 1215
},
{
"epoch": 1.093189964157706,
"grad_norm": 0.9845167398452759,
"learning_rate": 4.998702721453481e-05,
"loss": 0.8092,
"num_input_tokens_seen": 459752,
"step": 1220
},
{
"epoch": 1.0976702508960574,
"grad_norm": 1.02119779586792,
"learning_rate": 4.998573727324295e-05,
"loss": 0.7272,
"num_input_tokens_seen": 461608,
"step": 1225
},
{
"epoch": 1.1021505376344085,
"grad_norm": 1.4152262210845947,
"learning_rate": 4.998438622108673e-05,
"loss": 0.7721,
"num_input_tokens_seen": 463464,
"step": 1230
},
{
"epoch": 1.1066308243727598,
"grad_norm": 1.7630904912948608,
"learning_rate": 4.9982974061370594e-05,
"loss": 0.8142,
"num_input_tokens_seen": 465416,
"step": 1235
},
{
"epoch": 1.1111111111111112,
"grad_norm": 1.0361242294311523,
"learning_rate": 4.9981500797548445e-05,
"loss": 0.75,
"num_input_tokens_seen": 467208,
"step": 1240
},
{
"epoch": 1.1155913978494623,
"grad_norm": 1.8716683387756348,
"learning_rate": 4.9979966433223627e-05,
"loss": 0.6942,
"num_input_tokens_seen": 469096,
"step": 1245
},
{
"epoch": 1.1200716845878136,
"grad_norm": 1.1575100421905518,
"learning_rate": 4.997837097214895e-05,
"loss": 0.7676,
"num_input_tokens_seen": 470984,
"step": 1250
},
{
"epoch": 1.124551971326165,
"grad_norm": 0.7588862180709839,
"learning_rate": 4.997671441822662e-05,
"loss": 0.7607,
"num_input_tokens_seen": 472904,
"step": 1255
},
{
"epoch": 1.129032258064516,
"grad_norm": 1.8226057291030884,
"learning_rate": 4.997499677550831e-05,
"loss": 0.7432,
"num_input_tokens_seen": 474792,
"step": 1260
},
{
"epoch": 1.1335125448028673,
"grad_norm": 1.1694135665893555,
"learning_rate": 4.997321804819506e-05,
"loss": 0.7216,
"num_input_tokens_seen": 476680,
"step": 1265
},
{
"epoch": 1.1379928315412187,
"grad_norm": 1.0002014636993408,
"learning_rate": 4.9971378240637345e-05,
"loss": 0.7637,
"num_input_tokens_seen": 478344,
"step": 1270
},
{
"epoch": 1.14247311827957,
"grad_norm": 1.5931822061538696,
"learning_rate": 4.9969477357335025e-05,
"loss": 0.8194,
"num_input_tokens_seen": 480328,
"step": 1275
},
{
"epoch": 1.146953405017921,
"grad_norm": 1.2760032415390015,
"learning_rate": 4.9967515402937334e-05,
"loss": 0.7717,
"num_input_tokens_seen": 482344,
"step": 1280
},
{
"epoch": 1.1514336917562724,
"grad_norm": 1.3313841819763184,
"learning_rate": 4.996549238224288e-05,
"loss": 0.7957,
"num_input_tokens_seen": 484296,
"step": 1285
},
{
"epoch": 1.1559139784946237,
"grad_norm": 1.0116024017333984,
"learning_rate": 4.996340830019962e-05,
"loss": 0.7225,
"num_input_tokens_seen": 486216,
"step": 1290
},
{
"epoch": 1.1603942652329748,
"grad_norm": 0.9009803533554077,
"learning_rate": 4.996126316190488e-05,
"loss": 0.7333,
"num_input_tokens_seen": 488040,
"step": 1295
},
{
"epoch": 1.1648745519713262,
"grad_norm": 1.6891072988510132,
"learning_rate": 4.995905697260528e-05,
"loss": 0.7796,
"num_input_tokens_seen": 489832,
"step": 1300
},
{
"epoch": 1.1693548387096775,
"grad_norm": 0.8419767022132874,
"learning_rate": 4.995678973769681e-05,
"loss": 0.7108,
"num_input_tokens_seen": 491688,
"step": 1305
},
{
"epoch": 1.1738351254480286,
"grad_norm": 1.250991940498352,
"learning_rate": 4.995446146272472e-05,
"loss": 0.7791,
"num_input_tokens_seen": 493736,
"step": 1310
},
{
"epoch": 1.17831541218638,
"grad_norm": 1.2484474182128906,
"learning_rate": 4.9952072153383575e-05,
"loss": 0.79,
"num_input_tokens_seen": 495656,
"step": 1315
},
{
"epoch": 1.1827956989247312,
"grad_norm": 0.9272834062576294,
"learning_rate": 4.994962181551725e-05,
"loss": 0.7649,
"num_input_tokens_seen": 497640,
"step": 1320
},
{
"epoch": 1.1872759856630823,
"grad_norm": 0.682406485080719,
"learning_rate": 4.994711045511881e-05,
"loss": 0.7139,
"num_input_tokens_seen": 499464,
"step": 1325
},
{
"epoch": 1.1917562724014337,
"grad_norm": 1.1172208786010742,
"learning_rate": 4.9944538078330646e-05,
"loss": 0.6943,
"num_input_tokens_seen": 501352,
"step": 1330
},
{
"epoch": 1.196236559139785,
"grad_norm": 1.0329699516296387,
"learning_rate": 4.994190469144434e-05,
"loss": 0.651,
"num_input_tokens_seen": 503240,
"step": 1335
},
{
"epoch": 1.2007168458781363,
"grad_norm": 1.6401331424713135,
"learning_rate": 4.993921030090072e-05,
"loss": 0.9447,
"num_input_tokens_seen": 505320,
"step": 1340
},
{
"epoch": 1.2051971326164874,
"grad_norm": 1.254267930984497,
"learning_rate": 4.99364549132898e-05,
"loss": 0.754,
"num_input_tokens_seen": 507336,
"step": 1345
},
{
"epoch": 1.2096774193548387,
"grad_norm": 1.2467966079711914,
"learning_rate": 4.993363853535079e-05,
"loss": 0.7379,
"num_input_tokens_seen": 509128,
"step": 1350
},
{
"epoch": 1.2141577060931898,
"grad_norm": 1.1022220849990845,
"learning_rate": 4.9930761173972076e-05,
"loss": 0.7379,
"num_input_tokens_seen": 510952,
"step": 1355
},
{
"epoch": 1.2186379928315412,
"grad_norm": 1.1572595834732056,
"learning_rate": 4.992782283619118e-05,
"loss": 0.7332,
"num_input_tokens_seen": 512808,
"step": 1360
},
{
"epoch": 1.2231182795698925,
"grad_norm": 0.8642338514328003,
"learning_rate": 4.99248235291948e-05,
"loss": 0.6583,
"num_input_tokens_seen": 514504,
"step": 1365
},
{
"epoch": 1.2275985663082438,
"grad_norm": 0.9974188208580017,
"learning_rate": 4.992176326031872e-05,
"loss": 0.8578,
"num_input_tokens_seen": 516360,
"step": 1370
},
{
"epoch": 1.232078853046595,
"grad_norm": 1.2274900674819946,
"learning_rate": 4.991864203704783e-05,
"loss": 0.762,
"num_input_tokens_seen": 518152,
"step": 1375
},
{
"epoch": 1.2365591397849462,
"grad_norm": 1.3409820795059204,
"learning_rate": 4.991545986701611e-05,
"loss": 0.7545,
"num_input_tokens_seen": 520072,
"step": 1380
},
{
"epoch": 1.2410394265232976,
"grad_norm": 1.1630853414535522,
"learning_rate": 4.991221675800662e-05,
"loss": 0.7108,
"num_input_tokens_seen": 521928,
"step": 1385
},
{
"epoch": 1.2455197132616487,
"grad_norm": 1.7423793077468872,
"learning_rate": 4.990891271795145e-05,
"loss": 0.8476,
"num_input_tokens_seen": 523880,
"step": 1390
},
{
"epoch": 1.25,
"grad_norm": 0.8607897758483887,
"learning_rate": 4.99055477549317e-05,
"loss": 0.6993,
"num_input_tokens_seen": 525832,
"step": 1395
},
{
"epoch": 1.2544802867383513,
"grad_norm": 0.9243642687797546,
"learning_rate": 4.990212187717753e-05,
"loss": 0.575,
"num_input_tokens_seen": 527560,
"step": 1400
},
{
"epoch": 1.2589605734767024,
"grad_norm": 0.6754842400550842,
"learning_rate": 4.9898635093068036e-05,
"loss": 0.7522,
"num_input_tokens_seen": 529480,
"step": 1405
},
{
"epoch": 1.2634408602150538,
"grad_norm": 1.0726746320724487,
"learning_rate": 4.98950874111313e-05,
"loss": 0.9473,
"num_input_tokens_seen": 531592,
"step": 1410
},
{
"epoch": 1.267921146953405,
"grad_norm": 0.930479109287262,
"learning_rate": 4.989147884004435e-05,
"loss": 0.7041,
"num_input_tokens_seen": 533480,
"step": 1415
},
{
"epoch": 1.2724014336917562,
"grad_norm": 0.8492618203163147,
"learning_rate": 4.988780938863314e-05,
"loss": 0.6708,
"num_input_tokens_seen": 535464,
"step": 1420
},
{
"epoch": 1.2768817204301075,
"grad_norm": 1.962171196937561,
"learning_rate": 4.9884079065872514e-05,
"loss": 0.7281,
"num_input_tokens_seen": 537512,
"step": 1425
},
{
"epoch": 1.2813620071684588,
"grad_norm": 1.0054094791412354,
"learning_rate": 4.988028788088622e-05,
"loss": 0.8596,
"num_input_tokens_seen": 539560,
"step": 1430
},
{
"epoch": 1.2858422939068102,
"grad_norm": 1.520097017288208,
"learning_rate": 4.9876435842946845e-05,
"loss": 0.7854,
"num_input_tokens_seen": 541448,
"step": 1435
},
{
"epoch": 1.2903225806451613,
"grad_norm": 1.189052939414978,
"learning_rate": 4.987252296147582e-05,
"loss": 0.6744,
"num_input_tokens_seen": 543336,
"step": 1440
},
{
"epoch": 1.2948028673835126,
"grad_norm": 1.1547536849975586,
"learning_rate": 4.986854924604339e-05,
"loss": 0.7349,
"num_input_tokens_seen": 545320,
"step": 1445
},
{
"epoch": 1.2992831541218637,
"grad_norm": 1.1133335828781128,
"learning_rate": 4.986451470636858e-05,
"loss": 0.7514,
"num_input_tokens_seen": 547240,
"step": 1450
},
{
"epoch": 1.303763440860215,
"grad_norm": 1.0307190418243408,
"learning_rate": 4.98604193523192e-05,
"loss": 0.7785,
"num_input_tokens_seen": 549000,
"step": 1455
},
{
"epoch": 1.3082437275985663,
"grad_norm": 1.0905498266220093,
"learning_rate": 4.985626319391178e-05,
"loss": 0.7204,
"num_input_tokens_seen": 550920,
"step": 1460
},
{
"epoch": 1.3127240143369177,
"grad_norm": 0.8858598470687866,
"learning_rate": 4.985204624131157e-05,
"loss": 0.7684,
"num_input_tokens_seen": 552744,
"step": 1465
},
{
"epoch": 1.3172043010752688,
"grad_norm": 1.4811769723892212,
"learning_rate": 4.984776850483254e-05,
"loss": 0.7303,
"num_input_tokens_seen": 554632,
"step": 1470
},
{
"epoch": 1.32168458781362,
"grad_norm": 1.123785376548767,
"learning_rate": 4.9843429994937284e-05,
"loss": 0.6639,
"num_input_tokens_seen": 556392,
"step": 1475
},
{
"epoch": 1.3261648745519714,
"grad_norm": 1.258150339126587,
"learning_rate": 4.983903072223708e-05,
"loss": 0.7914,
"num_input_tokens_seen": 558248,
"step": 1480
},
{
"epoch": 1.3306451612903225,
"grad_norm": 0.9963940978050232,
"learning_rate": 4.983457069749178e-05,
"loss": 0.7242,
"num_input_tokens_seen": 560200,
"step": 1485
},
{
"epoch": 1.3351254480286738,
"grad_norm": 1.2597713470458984,
"learning_rate": 4.983004993160986e-05,
"loss": 0.7417,
"num_input_tokens_seen": 562024,
"step": 1490
},
{
"epoch": 1.3396057347670252,
"grad_norm": 0.9808022975921631,
"learning_rate": 4.982546843564834e-05,
"loss": 0.7227,
"num_input_tokens_seen": 563848,
"step": 1495
},
{
"epoch": 1.3440860215053765,
"grad_norm": 1.9162483215332031,
"learning_rate": 4.982082622081279e-05,
"loss": 0.7643,
"num_input_tokens_seen": 565832,
"step": 1500
},
{
"epoch": 1.3485663082437276,
"grad_norm": 1.5381019115447998,
"learning_rate": 4.981612329845726e-05,
"loss": 0.7185,
"num_input_tokens_seen": 567688,
"step": 1505
},
{
"epoch": 1.353046594982079,
"grad_norm": 0.8574863076210022,
"learning_rate": 4.98113596800843e-05,
"loss": 0.6938,
"num_input_tokens_seen": 569544,
"step": 1510
},
{
"epoch": 1.35752688172043,
"grad_norm": 1.4968950748443604,
"learning_rate": 4.980653537734493e-05,
"loss": 0.7329,
"num_input_tokens_seen": 571432,
"step": 1515
},
{
"epoch": 1.3620071684587813,
"grad_norm": 1.1606332063674927,
"learning_rate": 4.9801650402038555e-05,
"loss": 0.7442,
"num_input_tokens_seen": 573256,
"step": 1520
},
{
"epoch": 1.3664874551971327,
"grad_norm": 1.2675437927246094,
"learning_rate": 4.979670476611301e-05,
"loss": 0.763,
"num_input_tokens_seen": 575208,
"step": 1525
},
{
"epoch": 1.370967741935484,
"grad_norm": 1.4575673341751099,
"learning_rate": 4.979169848166446e-05,
"loss": 0.8159,
"num_input_tokens_seen": 576936,
"step": 1530
},
{
"epoch": 1.375448028673835,
"grad_norm": 0.9934259653091431,
"learning_rate": 4.978663156093744e-05,
"loss": 0.6303,
"num_input_tokens_seen": 579016,
"step": 1535
},
{
"epoch": 1.3799283154121864,
"grad_norm": 0.9751392006874084,
"learning_rate": 4.978150401632477e-05,
"loss": 0.7635,
"num_input_tokens_seen": 580872,
"step": 1540
},
{
"epoch": 1.3844086021505375,
"grad_norm": 1.1100707054138184,
"learning_rate": 4.9776315860367564e-05,
"loss": 0.7501,
"num_input_tokens_seen": 582696,
"step": 1545
},
{
"epoch": 1.3888888888888888,
"grad_norm": 0.8194773197174072,
"learning_rate": 4.9771067105755145e-05,
"loss": 0.7135,
"num_input_tokens_seen": 584456,
"step": 1550
},
{
"epoch": 1.3933691756272402,
"grad_norm": 0.9508175253868103,
"learning_rate": 4.976575776532509e-05,
"loss": 0.7308,
"num_input_tokens_seen": 586408,
"step": 1555
},
{
"epoch": 1.3978494623655915,
"grad_norm": 1.265271782875061,
"learning_rate": 4.976038785206315e-05,
"loss": 0.7841,
"num_input_tokens_seen": 588296,
"step": 1560
},
{
"epoch": 1.4023297491039426,
"grad_norm": 1.7464754581451416,
"learning_rate": 4.9754957379103205e-05,
"loss": 0.7915,
"num_input_tokens_seen": 590280,
"step": 1565
},
{
"epoch": 1.406810035842294,
"grad_norm": 1.1853801012039185,
"learning_rate": 4.974946635972728e-05,
"loss": 0.7529,
"num_input_tokens_seen": 592104,
"step": 1570
},
{
"epoch": 1.4112903225806452,
"grad_norm": 0.9984325766563416,
"learning_rate": 4.974391480736546e-05,
"loss": 0.7774,
"num_input_tokens_seen": 593960,
"step": 1575
},
{
"epoch": 1.4157706093189963,
"grad_norm": 0.6899164915084839,
"learning_rate": 4.973830273559591e-05,
"loss": 0.7263,
"num_input_tokens_seen": 595720,
"step": 1580
},
{
"epoch": 1.4202508960573477,
"grad_norm": 1.1132155656814575,
"learning_rate": 4.97326301581448e-05,
"loss": 0.7447,
"num_input_tokens_seen": 597704,
"step": 1585
},
{
"epoch": 1.424731182795699,
"grad_norm": 1.076192855834961,
"learning_rate": 4.9726897088886294e-05,
"loss": 0.7111,
"num_input_tokens_seen": 599560,
"step": 1590
},
{
"epoch": 1.4292114695340503,
"grad_norm": 1.03714120388031,
"learning_rate": 4.972110354184249e-05,
"loss": 0.7496,
"num_input_tokens_seen": 601384,
"step": 1595
},
{
"epoch": 1.4336917562724014,
"grad_norm": 1.5453832149505615,
"learning_rate": 4.971524953118344e-05,
"loss": 0.7348,
"num_input_tokens_seen": 603176,
"step": 1600
},
{
"epoch": 1.4381720430107527,
"grad_norm": 0.8860936164855957,
"learning_rate": 4.9709335071227046e-05,
"loss": 0.7677,
"num_input_tokens_seen": 605064,
"step": 1605
},
{
"epoch": 1.4426523297491038,
"grad_norm": 1.1750874519348145,
"learning_rate": 4.970336017643907e-05,
"loss": 0.7877,
"num_input_tokens_seen": 606920,
"step": 1610
},
{
"epoch": 1.4471326164874552,
"grad_norm": 0.8579301834106445,
"learning_rate": 4.969732486143309e-05,
"loss": 0.7482,
"num_input_tokens_seen": 608712,
"step": 1615
},
{
"epoch": 1.4516129032258065,
"grad_norm": 1.032989501953125,
"learning_rate": 4.969122914097046e-05,
"loss": 0.7113,
"num_input_tokens_seen": 610600,
"step": 1620
},
{
"epoch": 1.4560931899641578,
"grad_norm": 0.986748218536377,
"learning_rate": 4.968507302996029e-05,
"loss": 0.7629,
"num_input_tokens_seen": 612488,
"step": 1625
},
{
"epoch": 1.460573476702509,
"grad_norm": 0.9742453098297119,
"learning_rate": 4.967885654345936e-05,
"loss": 0.7882,
"num_input_tokens_seen": 614344,
"step": 1630
},
{
"epoch": 1.4650537634408602,
"grad_norm": 1.1552369594573975,
"learning_rate": 4.9672579696672136e-05,
"loss": 0.7541,
"num_input_tokens_seen": 616168,
"step": 1635
},
{
"epoch": 1.4695340501792113,
"grad_norm": 0.7690085172653198,
"learning_rate": 4.966624250495075e-05,
"loss": 0.6982,
"num_input_tokens_seen": 618024,
"step": 1640
},
{
"epoch": 1.4740143369175627,
"grad_norm": 1.2485721111297607,
"learning_rate": 4.9659844983794855e-05,
"loss": 0.7561,
"num_input_tokens_seen": 619848,
"step": 1645
},
{
"epoch": 1.478494623655914,
"grad_norm": 0.8757264614105225,
"learning_rate": 4.965338714885173e-05,
"loss": 0.6676,
"num_input_tokens_seen": 621576,
"step": 1650
},
{
"epoch": 1.4829749103942653,
"grad_norm": 1.2202672958374023,
"learning_rate": 4.964686901591612e-05,
"loss": 0.7544,
"num_input_tokens_seen": 623592,
"step": 1655
},
{
"epoch": 1.4874551971326164,
"grad_norm": 1.16517174243927,
"learning_rate": 4.964029060093029e-05,
"loss": 0.6739,
"num_input_tokens_seen": 625384,
"step": 1660
},
{
"epoch": 1.4919354838709677,
"grad_norm": 0.7608532309532166,
"learning_rate": 4.96336519199839e-05,
"loss": 0.7081,
"num_input_tokens_seen": 627400,
"step": 1665
},
{
"epoch": 1.496415770609319,
"grad_norm": 0.8713858723640442,
"learning_rate": 4.9626952989314065e-05,
"loss": 0.7929,
"num_input_tokens_seen": 629192,
"step": 1670
},
{
"epoch": 1.5,
"eval_loss": 0.7311503291130066,
"eval_runtime": 5.6351,
"eval_samples_per_second": 88.019,
"eval_steps_per_second": 22.005,
"num_input_tokens_seen": 630888,
"step": 1674
},
{
"epoch": 1.5008960573476702,
"grad_norm": 1.2036266326904297,
"learning_rate": 4.962019382530521e-05,
"loss": 0.7689,
"num_input_tokens_seen": 631336,
"step": 1675
},
{
"epoch": 1.5053763440860215,
"grad_norm": 1.4354534149169922,
"learning_rate": 4.9613374444489095e-05,
"loss": 0.7565,
"num_input_tokens_seen": 633160,
"step": 1680
},
{
"epoch": 1.5098566308243728,
"grad_norm": 0.8775255680084229,
"learning_rate": 4.960649486354478e-05,
"loss": 0.815,
"num_input_tokens_seen": 635176,
"step": 1685
},
{
"epoch": 1.5143369175627241,
"grad_norm": 0.6111800670623779,
"learning_rate": 4.959955509929854e-05,
"loss": 0.6656,
"num_input_tokens_seen": 636936,
"step": 1690
},
{
"epoch": 1.5188172043010753,
"grad_norm": 1.351608157157898,
"learning_rate": 4.9592555168723875e-05,
"loss": 0.8207,
"num_input_tokens_seen": 638984,
"step": 1695
},
{
"epoch": 1.5232974910394266,
"grad_norm": 1.0694748163223267,
"learning_rate": 4.95854950889414e-05,
"loss": 0.7403,
"num_input_tokens_seen": 640904,
"step": 1700
},
{
"epoch": 1.5277777777777777,
"grad_norm": 0.8151997923851013,
"learning_rate": 4.957837487721889e-05,
"loss": 0.7493,
"num_input_tokens_seen": 642792,
"step": 1705
},
{
"epoch": 1.532258064516129,
"grad_norm": 0.8805826306343079,
"learning_rate": 4.957119455097117e-05,
"loss": 0.6942,
"num_input_tokens_seen": 644680,
"step": 1710
},
{
"epoch": 1.5367383512544803,
"grad_norm": 0.7584099769592285,
"learning_rate": 4.956395412776008e-05,
"loss": 0.7322,
"num_input_tokens_seen": 646504,
"step": 1715
},
{
"epoch": 1.5412186379928317,
"grad_norm": 0.8927046060562134,
"learning_rate": 4.955665362529448e-05,
"loss": 0.7146,
"num_input_tokens_seen": 648520,
"step": 1720
},
{
"epoch": 1.5456989247311828,
"grad_norm": 1.0460692644119263,
"learning_rate": 4.954929306143016e-05,
"loss": 0.7403,
"num_input_tokens_seen": 650312,
"step": 1725
},
{
"epoch": 1.550179211469534,
"grad_norm": 0.9621413350105286,
"learning_rate": 4.9541872454169794e-05,
"loss": 0.685,
"num_input_tokens_seen": 652200,
"step": 1730
},
{
"epoch": 1.5546594982078852,
"grad_norm": 1.074214220046997,
"learning_rate": 4.953439182166293e-05,
"loss": 0.806,
"num_input_tokens_seen": 654280,
"step": 1735
},
{
"epoch": 1.5591397849462365,
"grad_norm": 0.9384589195251465,
"learning_rate": 4.952685118220593e-05,
"loss": 0.7478,
"num_input_tokens_seen": 656168,
"step": 1740
},
{
"epoch": 1.5636200716845878,
"grad_norm": 0.9732498526573181,
"learning_rate": 4.951925055424191e-05,
"loss": 0.7457,
"num_input_tokens_seen": 657992,
"step": 1745
},
{
"epoch": 1.5681003584229392,
"grad_norm": 0.9830466508865356,
"learning_rate": 4.951158995636071e-05,
"loss": 0.7573,
"num_input_tokens_seen": 659720,
"step": 1750
},
{
"epoch": 1.5725806451612905,
"grad_norm": 0.8285174369812012,
"learning_rate": 4.9503869407298856e-05,
"loss": 0.7618,
"num_input_tokens_seen": 661768,
"step": 1755
},
{
"epoch": 1.5770609318996416,
"grad_norm": 0.9129826426506042,
"learning_rate": 4.94960889259395e-05,
"loss": 0.7081,
"num_input_tokens_seen": 663592,
"step": 1760
},
{
"epoch": 1.5815412186379927,
"grad_norm": 0.8629391193389893,
"learning_rate": 4.948824853131236e-05,
"loss": 0.7015,
"num_input_tokens_seen": 665384,
"step": 1765
},
{
"epoch": 1.586021505376344,
"grad_norm": 0.7808408737182617,
"learning_rate": 4.948034824259373e-05,
"loss": 0.6774,
"num_input_tokens_seen": 667400,
"step": 1770
},
{
"epoch": 1.5905017921146953,
"grad_norm": 1.3148083686828613,
"learning_rate": 4.947238807910637e-05,
"loss": 0.7496,
"num_input_tokens_seen": 669192,
"step": 1775
},
{
"epoch": 1.5949820788530467,
"grad_norm": 0.9059514403343201,
"learning_rate": 4.9464368060319465e-05,
"loss": 0.7509,
"num_input_tokens_seen": 671112,
"step": 1780
},
{
"epoch": 1.599462365591398,
"grad_norm": 0.9348905682563782,
"learning_rate": 4.9456288205848634e-05,
"loss": 0.714,
"num_input_tokens_seen": 672968,
"step": 1785
},
{
"epoch": 1.603942652329749,
"grad_norm": 0.8638232350349426,
"learning_rate": 4.944814853545583e-05,
"loss": 0.6887,
"num_input_tokens_seen": 674792,
"step": 1790
},
{
"epoch": 1.6084229390681004,
"grad_norm": 1.297763466835022,
"learning_rate": 4.9439949069049294e-05,
"loss": 0.7983,
"num_input_tokens_seen": 676776,
"step": 1795
},
{
"epoch": 1.6129032258064515,
"grad_norm": 1.1472688913345337,
"learning_rate": 4.943168982668352e-05,
"loss": 0.7423,
"num_input_tokens_seen": 678760,
"step": 1800
},
{
"epoch": 1.6173835125448028,
"grad_norm": 1.002719759941101,
"learning_rate": 4.9423370828559236e-05,
"loss": 0.6999,
"num_input_tokens_seen": 680584,
"step": 1805
},
{
"epoch": 1.6218637992831542,
"grad_norm": 0.9048038125038147,
"learning_rate": 4.941499209502327e-05,
"loss": 0.6823,
"num_input_tokens_seen": 682504,
"step": 1810
},
{
"epoch": 1.6263440860215055,
"grad_norm": 0.7667252421379089,
"learning_rate": 4.9406553646568594e-05,
"loss": 0.6943,
"num_input_tokens_seen": 684424,
"step": 1815
},
{
"epoch": 1.6308243727598566,
"grad_norm": 0.767292320728302,
"learning_rate": 4.939805550383421e-05,
"loss": 0.722,
"num_input_tokens_seen": 686248,
"step": 1820
},
{
"epoch": 1.635304659498208,
"grad_norm": 0.9602553844451904,
"learning_rate": 4.9389497687605135e-05,
"loss": 0.7781,
"num_input_tokens_seen": 688104,
"step": 1825
},
{
"epoch": 1.639784946236559,
"grad_norm": 0.936279296875,
"learning_rate": 4.938088021881233e-05,
"loss": 0.7545,
"num_input_tokens_seen": 689896,
"step": 1830
},
{
"epoch": 1.6442652329749103,
"grad_norm": 1.1077156066894531,
"learning_rate": 4.9372203118532655e-05,
"loss": 0.6995,
"num_input_tokens_seen": 691912,
"step": 1835
},
{
"epoch": 1.6487455197132617,
"grad_norm": 0.8677238821983337,
"learning_rate": 4.936346640798883e-05,
"loss": 0.7406,
"num_input_tokens_seen": 693640,
"step": 1840
},
{
"epoch": 1.653225806451613,
"grad_norm": 0.5871409773826599,
"learning_rate": 4.935467010854936e-05,
"loss": 0.6855,
"num_input_tokens_seen": 695528,
"step": 1845
},
{
"epoch": 1.6577060931899643,
"grad_norm": 0.7833765745162964,
"learning_rate": 4.9345814241728495e-05,
"loss": 0.6938,
"num_input_tokens_seen": 697256,
"step": 1850
},
{
"epoch": 1.6621863799283154,
"grad_norm": 0.8606950044631958,
"learning_rate": 4.933689882918618e-05,
"loss": 0.6943,
"num_input_tokens_seen": 699112,
"step": 1855
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.0058404207229614,
"learning_rate": 4.9327923892728e-05,
"loss": 0.7113,
"num_input_tokens_seen": 700904,
"step": 1860
},
{
"epoch": 1.6711469534050178,
"grad_norm": 0.9566648602485657,
"learning_rate": 4.9318889454305115e-05,
"loss": 0.6834,
"num_input_tokens_seen": 702632,
"step": 1865
},
{
"epoch": 1.6756272401433692,
"grad_norm": 0.9307464957237244,
"learning_rate": 4.930979553601423e-05,
"loss": 0.7581,
"num_input_tokens_seen": 704680,
"step": 1870
},
{
"epoch": 1.6801075268817205,
"grad_norm": 1.2653905153274536,
"learning_rate": 4.930064216009754e-05,
"loss": 0.7228,
"num_input_tokens_seen": 706792,
"step": 1875
},
{
"epoch": 1.6845878136200718,
"grad_norm": 0.9056828618049622,
"learning_rate": 4.929142934894262e-05,
"loss": 0.7146,
"num_input_tokens_seen": 708552,
"step": 1880
},
{
"epoch": 1.689068100358423,
"grad_norm": 0.7426489591598511,
"learning_rate": 4.928215712508245e-05,
"loss": 0.7212,
"num_input_tokens_seen": 710568,
"step": 1885
},
{
"epoch": 1.6935483870967742,
"grad_norm": 0.914968729019165,
"learning_rate": 4.9272825511195316e-05,
"loss": 0.664,
"num_input_tokens_seen": 712488,
"step": 1890
},
{
"epoch": 1.6980286738351253,
"grad_norm": 1.0682841539382935,
"learning_rate": 4.9263434530104755e-05,
"loss": 0.7293,
"num_input_tokens_seen": 714440,
"step": 1895
},
{
"epoch": 1.7025089605734767,
"grad_norm": 0.7572513222694397,
"learning_rate": 4.92539842047795e-05,
"loss": 0.713,
"num_input_tokens_seen": 716328,
"step": 1900
},
{
"epoch": 1.706989247311828,
"grad_norm": 1.7348966598510742,
"learning_rate": 4.924447455833346e-05,
"loss": 0.8395,
"num_input_tokens_seen": 718216,
"step": 1905
},
{
"epoch": 1.7114695340501793,
"grad_norm": 0.8259839415550232,
"learning_rate": 4.9234905614025594e-05,
"loss": 0.7374,
"num_input_tokens_seen": 719976,
"step": 1910
},
{
"epoch": 1.7159498207885304,
"grad_norm": 0.9698778986930847,
"learning_rate": 4.922527739525993e-05,
"loss": 0.6721,
"num_input_tokens_seen": 721928,
"step": 1915
},
{
"epoch": 1.7204301075268817,
"grad_norm": 0.6747263669967651,
"learning_rate": 4.9215589925585434e-05,
"loss": 0.7525,
"num_input_tokens_seen": 723752,
"step": 1920
},
{
"epoch": 1.7249103942652328,
"grad_norm": 0.4461309313774109,
"learning_rate": 4.9205843228696036e-05,
"loss": 0.7114,
"num_input_tokens_seen": 725480,
"step": 1925
},
{
"epoch": 1.7293906810035842,
"grad_norm": 0.6168416142463684,
"learning_rate": 4.9196037328430475e-05,
"loss": 0.6724,
"num_input_tokens_seen": 727400,
"step": 1930
},
{
"epoch": 1.7338709677419355,
"grad_norm": 0.8838000297546387,
"learning_rate": 4.918617224877232e-05,
"loss": 0.6928,
"num_input_tokens_seen": 729576,
"step": 1935
},
{
"epoch": 1.7383512544802868,
"grad_norm": 0.8864124417304993,
"learning_rate": 4.917624801384988e-05,
"loss": 0.7149,
"num_input_tokens_seen": 731528,
"step": 1940
},
{
"epoch": 1.7428315412186381,
"grad_norm": 0.8120636343955994,
"learning_rate": 4.916626464793616e-05,
"loss": 0.7634,
"num_input_tokens_seen": 733448,
"step": 1945
},
{
"epoch": 1.7473118279569892,
"grad_norm": 0.9845364689826965,
"learning_rate": 4.915622217544875e-05,
"loss": 0.7203,
"num_input_tokens_seen": 735304,
"step": 1950
},
{
"epoch": 1.7517921146953404,
"grad_norm": 0.8506798148155212,
"learning_rate": 4.9146120620949854e-05,
"loss": 0.7283,
"num_input_tokens_seen": 737032,
"step": 1955
},
{
"epoch": 1.7562724014336917,
"grad_norm": 0.7311829328536987,
"learning_rate": 4.9135960009146135e-05,
"loss": 0.7221,
"num_input_tokens_seen": 738856,
"step": 1960
},
{
"epoch": 1.760752688172043,
"grad_norm": 0.9475505948066711,
"learning_rate": 4.912574036488874e-05,
"loss": 0.7073,
"num_input_tokens_seen": 740712,
"step": 1965
},
{
"epoch": 1.7652329749103943,
"grad_norm": 1.0571333169937134,
"learning_rate": 4.9115461713173174e-05,
"loss": 0.7287,
"num_input_tokens_seen": 742568,
"step": 1970
},
{
"epoch": 1.7697132616487457,
"grad_norm": 1.2273980379104614,
"learning_rate": 4.910512407913926e-05,
"loss": 0.7846,
"num_input_tokens_seen": 744584,
"step": 1975
},
{
"epoch": 1.7741935483870968,
"grad_norm": 0.6895895600318909,
"learning_rate": 4.9094727488071114e-05,
"loss": 0.7451,
"num_input_tokens_seen": 746376,
"step": 1980
},
{
"epoch": 1.778673835125448,
"grad_norm": 0.7869203686714172,
"learning_rate": 4.9084271965397014e-05,
"loss": 0.8673,
"num_input_tokens_seen": 748488,
"step": 1985
},
{
"epoch": 1.7831541218637992,
"grad_norm": 1.0943074226379395,
"learning_rate": 4.907375753668939e-05,
"loss": 0.7509,
"num_input_tokens_seen": 750376,
"step": 1990
},
{
"epoch": 1.7876344086021505,
"grad_norm": 0.6461062431335449,
"learning_rate": 4.906318422766476e-05,
"loss": 0.6848,
"num_input_tokens_seen": 752104,
"step": 1995
},
{
"epoch": 1.7921146953405018,
"grad_norm": 0.6915934085845947,
"learning_rate": 4.9052552064183624e-05,
"loss": 0.7169,
"num_input_tokens_seen": 753960,
"step": 2000
},
{
"epoch": 1.7965949820788532,
"grad_norm": 0.842902660369873,
"learning_rate": 4.904186107225046e-05,
"loss": 0.7407,
"num_input_tokens_seen": 755944,
"step": 2005
},
{
"epoch": 1.8010752688172043,
"grad_norm": 1.1747803688049316,
"learning_rate": 4.903111127801361e-05,
"loss": 0.703,
"num_input_tokens_seen": 757832,
"step": 2010
},
{
"epoch": 1.8055555555555556,
"grad_norm": 0.8623597621917725,
"learning_rate": 4.902030270776524e-05,
"loss": 0.7459,
"num_input_tokens_seen": 759816,
"step": 2015
},
{
"epoch": 1.8100358422939067,
"grad_norm": 0.697680652141571,
"learning_rate": 4.9009435387941274e-05,
"loss": 0.696,
"num_input_tokens_seen": 761640,
"step": 2020
},
{
"epoch": 1.814516129032258,
"grad_norm": 0.7466960549354553,
"learning_rate": 4.899850934512134e-05,
"loss": 0.756,
"num_input_tokens_seen": 763400,
"step": 2025
},
{
"epoch": 1.8189964157706093,
"grad_norm": 0.8614512085914612,
"learning_rate": 4.898752460602866e-05,
"loss": 0.7388,
"num_input_tokens_seen": 765288,
"step": 2030
},
{
"epoch": 1.8234767025089607,
"grad_norm": 0.7374971508979797,
"learning_rate": 4.897648119753006e-05,
"loss": 0.7338,
"num_input_tokens_seen": 767144,
"step": 2035
},
{
"epoch": 1.827956989247312,
"grad_norm": 1.1002193689346313,
"learning_rate": 4.8965379146635816e-05,
"loss": 0.7455,
"num_input_tokens_seen": 769064,
"step": 2040
},
{
"epoch": 1.832437275985663,
"grad_norm": 0.6678388118743896,
"learning_rate": 4.895421848049968e-05,
"loss": 0.7326,
"num_input_tokens_seen": 770856,
"step": 2045
},
{
"epoch": 1.8369175627240142,
"grad_norm": 0.7533695697784424,
"learning_rate": 4.894299922641873e-05,
"loss": 0.6876,
"num_input_tokens_seen": 772776,
"step": 2050
},
{
"epoch": 1.8413978494623655,
"grad_norm": 0.649446964263916,
"learning_rate": 4.893172141183335e-05,
"loss": 0.7192,
"num_input_tokens_seen": 774728,
"step": 2055
},
{
"epoch": 1.8458781362007168,
"grad_norm": 1.1061700582504272,
"learning_rate": 4.892038506432717e-05,
"loss": 0.6923,
"num_input_tokens_seen": 776712,
"step": 2060
},
{
"epoch": 1.8503584229390682,
"grad_norm": 0.44784870743751526,
"learning_rate": 4.890899021162696e-05,
"loss": 0.7245,
"num_input_tokens_seen": 778472,
"step": 2065
},
{
"epoch": 1.8548387096774195,
"grad_norm": 0.8359610438346863,
"learning_rate": 4.8897536881602594e-05,
"loss": 0.6979,
"num_input_tokens_seen": 780360,
"step": 2070
},
{
"epoch": 1.8593189964157706,
"grad_norm": 0.5439364910125732,
"learning_rate": 4.888602510226697e-05,
"loss": 0.7232,
"num_input_tokens_seen": 782280,
"step": 2075
},
{
"epoch": 1.863799283154122,
"grad_norm": 0.8178055882453918,
"learning_rate": 4.8874454901775936e-05,
"loss": 0.7403,
"num_input_tokens_seen": 784136,
"step": 2080
},
{
"epoch": 1.868279569892473,
"grad_norm": 0.6543580889701843,
"learning_rate": 4.8862826308428244e-05,
"loss": 0.6871,
"num_input_tokens_seen": 785960,
"step": 2085
},
{
"epoch": 1.8727598566308243,
"grad_norm": 0.7824105024337769,
"learning_rate": 4.885113935066545e-05,
"loss": 0.6954,
"num_input_tokens_seen": 787848,
"step": 2090
},
{
"epoch": 1.8772401433691757,
"grad_norm": 1.0091453790664673,
"learning_rate": 4.883939405707186e-05,
"loss": 0.68,
"num_input_tokens_seen": 789800,
"step": 2095
},
{
"epoch": 1.881720430107527,
"grad_norm": 0.7868355512619019,
"learning_rate": 4.882759045637449e-05,
"loss": 0.71,
"num_input_tokens_seen": 791592,
"step": 2100
},
{
"epoch": 1.886200716845878,
"grad_norm": 1.3903027772903442,
"learning_rate": 4.88157285774429e-05,
"loss": 0.7023,
"num_input_tokens_seen": 793544,
"step": 2105
},
{
"epoch": 1.8906810035842294,
"grad_norm": 0.9901719689369202,
"learning_rate": 4.8803808449289264e-05,
"loss": 0.7471,
"num_input_tokens_seen": 795368,
"step": 2110
},
{
"epoch": 1.8951612903225805,
"grad_norm": 0.870309054851532,
"learning_rate": 4.879183010106817e-05,
"loss": 0.7189,
"num_input_tokens_seen": 797128,
"step": 2115
},
{
"epoch": 1.8996415770609318,
"grad_norm": 0.7967744469642639,
"learning_rate": 4.877979356207663e-05,
"loss": 0.7321,
"num_input_tokens_seen": 798888,
"step": 2120
},
{
"epoch": 1.9041218637992832,
"grad_norm": 1.005323886871338,
"learning_rate": 4.876769886175396e-05,
"loss": 0.6991,
"num_input_tokens_seen": 800616,
"step": 2125
},
{
"epoch": 1.9086021505376345,
"grad_norm": 0.9401888251304626,
"learning_rate": 4.8755546029681746e-05,
"loss": 0.7486,
"num_input_tokens_seen": 802472,
"step": 2130
},
{
"epoch": 1.9130824372759858,
"grad_norm": 0.8822266459465027,
"learning_rate": 4.874333509558375e-05,
"loss": 0.6868,
"num_input_tokens_seen": 804328,
"step": 2135
},
{
"epoch": 1.917562724014337,
"grad_norm": 1.493293285369873,
"learning_rate": 4.873106608932585e-05,
"loss": 0.6651,
"num_input_tokens_seen": 806152,
"step": 2140
},
{
"epoch": 1.922043010752688,
"grad_norm": 1.1211999654769897,
"learning_rate": 4.871873904091593e-05,
"loss": 0.7402,
"num_input_tokens_seen": 808168,
"step": 2145
},
{
"epoch": 1.9265232974910393,
"grad_norm": 0.7824253439903259,
"learning_rate": 4.870635398050387e-05,
"loss": 0.7018,
"num_input_tokens_seen": 810056,
"step": 2150
},
{
"epoch": 1.9310035842293907,
"grad_norm": 0.8798695802688599,
"learning_rate": 4.8693910938381404e-05,
"loss": 0.6747,
"num_input_tokens_seen": 812008,
"step": 2155
},
{
"epoch": 1.935483870967742,
"grad_norm": 0.33410248160362244,
"learning_rate": 4.868140994498211e-05,
"loss": 0.7464,
"num_input_tokens_seen": 813736,
"step": 2160
},
{
"epoch": 1.9399641577060933,
"grad_norm": 0.9553130865097046,
"learning_rate": 4.86688510308813e-05,
"loss": 0.7589,
"num_input_tokens_seen": 815752,
"step": 2165
},
{
"epoch": 1.9444444444444444,
"grad_norm": 0.8246096968650818,
"learning_rate": 4.865623422679593e-05,
"loss": 0.7059,
"num_input_tokens_seen": 817544,
"step": 2170
},
{
"epoch": 1.9489247311827957,
"grad_norm": 0.8371939063072205,
"learning_rate": 4.864355956358454e-05,
"loss": 0.6801,
"num_input_tokens_seen": 819432,
"step": 2175
},
{
"epoch": 1.9534050179211468,
"grad_norm": 1.350062608718872,
"learning_rate": 4.8630827072247244e-05,
"loss": 0.7161,
"num_input_tokens_seen": 821224,
"step": 2180
},
{
"epoch": 1.9578853046594982,
"grad_norm": 0.7525457143783569,
"learning_rate": 4.8618036783925516e-05,
"loss": 0.7676,
"num_input_tokens_seen": 822984,
"step": 2185
},
{
"epoch": 1.9623655913978495,
"grad_norm": 1.0451446771621704,
"learning_rate": 4.860518872990223e-05,
"loss": 0.751,
"num_input_tokens_seen": 824968,
"step": 2190
},
{
"epoch": 1.9668458781362008,
"grad_norm": 0.6209150552749634,
"learning_rate": 4.859228294160155e-05,
"loss": 0.7335,
"num_input_tokens_seen": 826984,
"step": 2195
},
{
"epoch": 1.971326164874552,
"grad_norm": 0.9934207201004028,
"learning_rate": 4.857931945058884e-05,
"loss": 0.7283,
"num_input_tokens_seen": 828872,
"step": 2200
},
{
"epoch": 1.9758064516129032,
"grad_norm": 0.7372912168502808,
"learning_rate": 4.856629828857059e-05,
"loss": 0.7357,
"num_input_tokens_seen": 830760,
"step": 2205
},
{
"epoch": 1.9802867383512543,
"grad_norm": 0.6201682090759277,
"learning_rate": 4.855321948739435e-05,
"loss": 0.6704,
"num_input_tokens_seen": 832712,
"step": 2210
},
{
"epoch": 1.9847670250896057,
"grad_norm": 0.7720775604248047,
"learning_rate": 4.8540083079048645e-05,
"loss": 0.7008,
"num_input_tokens_seen": 834696,
"step": 2215
},
{
"epoch": 1.989247311827957,
"grad_norm": 1.0468122959136963,
"learning_rate": 4.85268890956629e-05,
"loss": 0.7516,
"num_input_tokens_seen": 836648,
"step": 2220
},
{
"epoch": 1.9937275985663083,
"grad_norm": 1.0690462589263916,
"learning_rate": 4.851363756950733e-05,
"loss": 0.7224,
"num_input_tokens_seen": 838760,
"step": 2225
},
{
"epoch": 1.9982078853046596,
"grad_norm": 0.727118968963623,
"learning_rate": 4.8500328532992945e-05,
"loss": 0.7296,
"num_input_tokens_seen": 840584,
"step": 2230
},
{
"epoch": 2.0,
"eval_loss": 0.7162764072418213,
"eval_runtime": 5.6096,
"eval_samples_per_second": 88.419,
"eval_steps_per_second": 22.105,
"num_input_tokens_seen": 841024,
"step": 2232
},
{
"epoch": 2.002688172043011,
"grad_norm": 1.104372262954712,
"learning_rate": 4.848696201867138e-05,
"loss": 0.7025,
"num_input_tokens_seen": 842272,
"step": 2235
},
{
"epoch": 2.007168458781362,
"grad_norm": 0.6588714122772217,
"learning_rate": 4.847353805923484e-05,
"loss": 0.7179,
"num_input_tokens_seen": 844000,
"step": 2240
},
{
"epoch": 2.011648745519713,
"grad_norm": 0.5846701860427856,
"learning_rate": 4.846005668751605e-05,
"loss": 0.7021,
"num_input_tokens_seen": 845760,
"step": 2245
},
{
"epoch": 2.0161290322580645,
"grad_norm": 0.8954629302024841,
"learning_rate": 4.844651793648817e-05,
"loss": 0.7324,
"num_input_tokens_seen": 847776,
"step": 2250
},
{
"epoch": 2.020609318996416,
"grad_norm": 1.2610913515090942,
"learning_rate": 4.843292183926466e-05,
"loss": 0.7149,
"num_input_tokens_seen": 849728,
"step": 2255
},
{
"epoch": 2.025089605734767,
"grad_norm": 0.9264258742332458,
"learning_rate": 4.841926842909928e-05,
"loss": 0.7172,
"num_input_tokens_seen": 851584,
"step": 2260
},
{
"epoch": 2.0295698924731185,
"grad_norm": 0.6855342388153076,
"learning_rate": 4.840555773938594e-05,
"loss": 0.7209,
"num_input_tokens_seen": 853408,
"step": 2265
},
{
"epoch": 2.0340501792114694,
"grad_norm": 0.8659264445304871,
"learning_rate": 4.839178980365866e-05,
"loss": 0.6889,
"num_input_tokens_seen": 855360,
"step": 2270
},
{
"epoch": 2.0385304659498207,
"grad_norm": 1.0739582777023315,
"learning_rate": 4.8377964655591465e-05,
"loss": 0.7711,
"num_input_tokens_seen": 857504,
"step": 2275
},
{
"epoch": 2.043010752688172,
"grad_norm": 1.0123777389526367,
"learning_rate": 4.8364082328998314e-05,
"loss": 0.7176,
"num_input_tokens_seen": 859360,
"step": 2280
},
{
"epoch": 2.0474910394265233,
"grad_norm": 0.9168586134910583,
"learning_rate": 4.835014285783303e-05,
"loss": 0.7695,
"num_input_tokens_seen": 861312,
"step": 2285
},
{
"epoch": 2.0519713261648747,
"grad_norm": 0.8716160655021667,
"learning_rate": 4.833614627618918e-05,
"loss": 0.7196,
"num_input_tokens_seen": 863168,
"step": 2290
},
{
"epoch": 2.056451612903226,
"grad_norm": 0.6699190139770508,
"learning_rate": 4.832209261830002e-05,
"loss": 0.7157,
"num_input_tokens_seen": 865184,
"step": 2295
},
{
"epoch": 2.060931899641577,
"grad_norm": 0.715002715587616,
"learning_rate": 4.8307981918538405e-05,
"loss": 0.6378,
"num_input_tokens_seen": 867168,
"step": 2300
},
{
"epoch": 2.065412186379928,
"grad_norm": 0.9280583262443542,
"learning_rate": 4.829381421141671e-05,
"loss": 0.6811,
"num_input_tokens_seen": 869056,
"step": 2305
},
{
"epoch": 2.0698924731182795,
"grad_norm": 0.8694478273391724,
"learning_rate": 4.827958953158675e-05,
"loss": 0.734,
"num_input_tokens_seen": 870816,
"step": 2310
},
{
"epoch": 2.074372759856631,
"grad_norm": 0.6347289681434631,
"learning_rate": 4.8265307913839655e-05,
"loss": 0.716,
"num_input_tokens_seen": 872736,
"step": 2315
},
{
"epoch": 2.078853046594982,
"grad_norm": 0.4250599443912506,
"learning_rate": 4.825096939310584e-05,
"loss": 0.7348,
"num_input_tokens_seen": 874656,
"step": 2320
},
{
"epoch": 2.0833333333333335,
"grad_norm": 0.9700059294700623,
"learning_rate": 4.823657400445489e-05,
"loss": 0.8272,
"num_input_tokens_seen": 876576,
"step": 2325
},
{
"epoch": 2.0878136200716844,
"grad_norm": 0.7163282036781311,
"learning_rate": 4.822212178309548e-05,
"loss": 0.6818,
"num_input_tokens_seen": 878528,
"step": 2330
},
{
"epoch": 2.0922939068100357,
"grad_norm": 1.27232825756073,
"learning_rate": 4.820761276437527e-05,
"loss": 0.7249,
"num_input_tokens_seen": 880288,
"step": 2335
},
{
"epoch": 2.096774193548387,
"grad_norm": 0.8018732666969299,
"learning_rate": 4.819304698378089e-05,
"loss": 0.7493,
"num_input_tokens_seen": 882304,
"step": 2340
},
{
"epoch": 2.1012544802867383,
"grad_norm": 0.8478394150733948,
"learning_rate": 4.817842447693771e-05,
"loss": 0.6688,
"num_input_tokens_seen": 884256,
"step": 2345
},
{
"epoch": 2.1057347670250897,
"grad_norm": 1.056063175201416,
"learning_rate": 4.816374527960994e-05,
"loss": 0.6616,
"num_input_tokens_seen": 886240,
"step": 2350
},
{
"epoch": 2.110215053763441,
"grad_norm": 0.9384861588478088,
"learning_rate": 4.8149009427700377e-05,
"loss": 0.6951,
"num_input_tokens_seen": 888064,
"step": 2355
},
{
"epoch": 2.1146953405017923,
"grad_norm": 0.5180307626724243,
"learning_rate": 4.813421695725041e-05,
"loss": 0.7293,
"num_input_tokens_seen": 889728,
"step": 2360
},
{
"epoch": 2.119175627240143,
"grad_norm": 0.7397670149803162,
"learning_rate": 4.81193679044399e-05,
"loss": 0.7085,
"num_input_tokens_seen": 891520,
"step": 2365
},
{
"epoch": 2.1236559139784945,
"grad_norm": 0.6285805702209473,
"learning_rate": 4.810446230558714e-05,
"loss": 0.7045,
"num_input_tokens_seen": 893344,
"step": 2370
},
{
"epoch": 2.128136200716846,
"grad_norm": 0.7491189241409302,
"learning_rate": 4.8089500197148654e-05,
"loss": 0.683,
"num_input_tokens_seen": 895328,
"step": 2375
},
{
"epoch": 2.132616487455197,
"grad_norm": 0.5812940001487732,
"learning_rate": 4.807448161571922e-05,
"loss": 0.729,
"num_input_tokens_seen": 897248,
"step": 2380
},
{
"epoch": 2.1370967741935485,
"grad_norm": 0.6962029337882996,
"learning_rate": 4.805940659803174e-05,
"loss": 0.7535,
"num_input_tokens_seen": 899200,
"step": 2385
},
{
"epoch": 2.1415770609319,
"grad_norm": 0.5583885312080383,
"learning_rate": 4.804427518095715e-05,
"loss": 0.6988,
"num_input_tokens_seen": 901120,
"step": 2390
},
{
"epoch": 2.1460573476702507,
"grad_norm": 0.740242600440979,
"learning_rate": 4.802908740150431e-05,
"loss": 0.681,
"num_input_tokens_seen": 903040,
"step": 2395
},
{
"epoch": 2.150537634408602,
"grad_norm": 0.7208986878395081,
"learning_rate": 4.801384329681996e-05,
"loss": 0.712,
"num_input_tokens_seen": 904960,
"step": 2400
},
{
"epoch": 2.1550179211469533,
"grad_norm": 0.8246489763259888,
"learning_rate": 4.799854290418858e-05,
"loss": 0.7185,
"num_input_tokens_seen": 906816,
"step": 2405
},
{
"epoch": 2.1594982078853047,
"grad_norm": 0.5951177477836609,
"learning_rate": 4.798318626103233e-05,
"loss": 0.6935,
"num_input_tokens_seen": 908544,
"step": 2410
},
{
"epoch": 2.163978494623656,
"grad_norm": 1.0811342000961304,
"learning_rate": 4.7967773404910946e-05,
"loss": 0.7317,
"num_input_tokens_seen": 910336,
"step": 2415
},
{
"epoch": 2.1684587813620073,
"grad_norm": 0.9689534902572632,
"learning_rate": 4.7952304373521644e-05,
"loss": 0.7447,
"num_input_tokens_seen": 912288,
"step": 2420
},
{
"epoch": 2.1729390681003586,
"grad_norm": 0.4089421331882477,
"learning_rate": 4.793677920469906e-05,
"loss": 0.6866,
"num_input_tokens_seen": 914144,
"step": 2425
},
{
"epoch": 2.1774193548387095,
"grad_norm": 0.618166983127594,
"learning_rate": 4.7921197936415106e-05,
"loss": 0.7001,
"num_input_tokens_seen": 915904,
"step": 2430
},
{
"epoch": 2.181899641577061,
"grad_norm": 0.492492139339447,
"learning_rate": 4.7905560606778924e-05,
"loss": 0.7071,
"num_input_tokens_seen": 917824,
"step": 2435
},
{
"epoch": 2.186379928315412,
"grad_norm": 0.6676193475723267,
"learning_rate": 4.7889867254036755e-05,
"loss": 0.6964,
"num_input_tokens_seen": 919744,
"step": 2440
},
{
"epoch": 2.1908602150537635,
"grad_norm": 0.7295692563056946,
"learning_rate": 4.787411791657188e-05,
"loss": 0.7325,
"num_input_tokens_seen": 921728,
"step": 2445
},
{
"epoch": 2.195340501792115,
"grad_norm": 0.6862680315971375,
"learning_rate": 4.785831263290449e-05,
"loss": 0.7055,
"num_input_tokens_seen": 923648,
"step": 2450
},
{
"epoch": 2.199820788530466,
"grad_norm": 0.7587245106697083,
"learning_rate": 4.784245144169162e-05,
"loss": 0.673,
"num_input_tokens_seen": 925536,
"step": 2455
},
{
"epoch": 2.204301075268817,
"grad_norm": 0.6514589786529541,
"learning_rate": 4.782653438172705e-05,
"loss": 0.7201,
"num_input_tokens_seen": 927392,
"step": 2460
},
{
"epoch": 2.2087813620071683,
"grad_norm": 0.9270156025886536,
"learning_rate": 4.781056149194121e-05,
"loss": 0.6743,
"num_input_tokens_seen": 929536,
"step": 2465
},
{
"epoch": 2.2132616487455197,
"grad_norm": 0.795536458492279,
"learning_rate": 4.779453281140107e-05,
"loss": 0.719,
"num_input_tokens_seen": 931520,
"step": 2470
},
{
"epoch": 2.217741935483871,
"grad_norm": 0.9114360809326172,
"learning_rate": 4.777844837931005e-05,
"loss": 0.7193,
"num_input_tokens_seen": 933504,
"step": 2475
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.509145200252533,
"learning_rate": 4.776230823500793e-05,
"loss": 0.662,
"num_input_tokens_seen": 935360,
"step": 2480
},
{
"epoch": 2.2267025089605736,
"grad_norm": 1.2724803686141968,
"learning_rate": 4.7746112417970766e-05,
"loss": 0.6978,
"num_input_tokens_seen": 937440,
"step": 2485
},
{
"epoch": 2.2311827956989245,
"grad_norm": 1.0436537265777588,
"learning_rate": 4.772986096781078e-05,
"loss": 0.7792,
"num_input_tokens_seen": 939424,
"step": 2490
},
{
"epoch": 2.235663082437276,
"grad_norm": 0.8516407608985901,
"learning_rate": 4.771355392427624e-05,
"loss": 0.7294,
"num_input_tokens_seen": 941312,
"step": 2495
},
{
"epoch": 2.240143369175627,
"grad_norm": 0.859889805316925,
"learning_rate": 4.769719132725141e-05,
"loss": 0.7053,
"num_input_tokens_seen": 943264,
"step": 2500
},
{
"epoch": 2.2446236559139785,
"grad_norm": 0.8790197968482971,
"learning_rate": 4.768077321675643e-05,
"loss": 0.6957,
"num_input_tokens_seen": 945248,
"step": 2505
},
{
"epoch": 2.24910394265233,
"grad_norm": 0.7509116530418396,
"learning_rate": 4.766429963294719e-05,
"loss": 0.7072,
"num_input_tokens_seen": 947168,
"step": 2510
},
{
"epoch": 2.253584229390681,
"grad_norm": 0.7757908701896667,
"learning_rate": 4.7647770616115265e-05,
"loss": 0.7193,
"num_input_tokens_seen": 948960,
"step": 2515
},
{
"epoch": 2.258064516129032,
"grad_norm": 0.6172046065330505,
"learning_rate": 4.763118620668785e-05,
"loss": 0.7676,
"num_input_tokens_seen": 950688,
"step": 2520
},
{
"epoch": 2.2625448028673834,
"grad_norm": 0.7593435049057007,
"learning_rate": 4.761454644522757e-05,
"loss": 0.6887,
"num_input_tokens_seen": 952608,
"step": 2525
},
{
"epoch": 2.2670250896057347,
"grad_norm": 1.0903900861740112,
"learning_rate": 4.759785137243245e-05,
"loss": 0.7673,
"num_input_tokens_seen": 954368,
"step": 2530
},
{
"epoch": 2.271505376344086,
"grad_norm": 0.9267560243606567,
"learning_rate": 4.758110102913581e-05,
"loss": 0.7019,
"num_input_tokens_seen": 956160,
"step": 2535
},
{
"epoch": 2.2759856630824373,
"grad_norm": 1.1232478618621826,
"learning_rate": 4.7564295456306136e-05,
"loss": 0.7269,
"num_input_tokens_seen": 957984,
"step": 2540
},
{
"epoch": 2.2804659498207887,
"grad_norm": 1.0440434217453003,
"learning_rate": 4.7547434695047e-05,
"loss": 0.7089,
"num_input_tokens_seen": 959872,
"step": 2545
},
{
"epoch": 2.28494623655914,
"grad_norm": 0.6118893027305603,
"learning_rate": 4.7530518786596954e-05,
"loss": 0.696,
"num_input_tokens_seen": 961664,
"step": 2550
},
{
"epoch": 2.289426523297491,
"grad_norm": 0.8722130656242371,
"learning_rate": 4.7513547772329446e-05,
"loss": 0.727,
"num_input_tokens_seen": 963712,
"step": 2555
},
{
"epoch": 2.293906810035842,
"grad_norm": 0.5364380478858948,
"learning_rate": 4.749652169375268e-05,
"loss": 0.7026,
"num_input_tokens_seen": 965696,
"step": 2560
},
{
"epoch": 2.2983870967741935,
"grad_norm": 0.7415496706962585,
"learning_rate": 4.747944059250955e-05,
"loss": 0.6811,
"num_input_tokens_seen": 967488,
"step": 2565
},
{
"epoch": 2.302867383512545,
"grad_norm": 0.612433910369873,
"learning_rate": 4.746230451037752e-05,
"loss": 0.6918,
"num_input_tokens_seen": 969344,
"step": 2570
},
{
"epoch": 2.307347670250896,
"grad_norm": 0.6766025424003601,
"learning_rate": 4.7445113489268544e-05,
"loss": 0.6934,
"num_input_tokens_seen": 971168,
"step": 2575
},
{
"epoch": 2.3118279569892475,
"grad_norm": 1.016257882118225,
"learning_rate": 4.7427867571228926e-05,
"loss": 0.7136,
"num_input_tokens_seen": 973088,
"step": 2580
},
{
"epoch": 2.3163082437275984,
"grad_norm": 0.9015939235687256,
"learning_rate": 4.741056679843926e-05,
"loss": 0.6865,
"num_input_tokens_seen": 975008,
"step": 2585
},
{
"epoch": 2.3207885304659497,
"grad_norm": 0.5278044939041138,
"learning_rate": 4.739321121321428e-05,
"loss": 0.6557,
"num_input_tokens_seen": 976864,
"step": 2590
},
{
"epoch": 2.325268817204301,
"grad_norm": 1.0323004722595215,
"learning_rate": 4.737580085800282e-05,
"loss": 0.7846,
"num_input_tokens_seen": 978752,
"step": 2595
},
{
"epoch": 2.3297491039426523,
"grad_norm": 0.5666264295578003,
"learning_rate": 4.735833577538762e-05,
"loss": 0.7187,
"num_input_tokens_seen": 980576,
"step": 2600
},
{
"epoch": 2.3342293906810037,
"grad_norm": 0.5789869427680969,
"learning_rate": 4.734081600808531e-05,
"loss": 0.7199,
"num_input_tokens_seen": 982336,
"step": 2605
},
{
"epoch": 2.338709677419355,
"grad_norm": 0.9192356467247009,
"learning_rate": 4.732324159894627e-05,
"loss": 0.7349,
"num_input_tokens_seen": 984064,
"step": 2610
},
{
"epoch": 2.3431899641577063,
"grad_norm": 0.880268394947052,
"learning_rate": 4.730561259095451e-05,
"loss": 0.7277,
"num_input_tokens_seen": 985888,
"step": 2615
},
{
"epoch": 2.347670250896057,
"grad_norm": 0.5334110260009766,
"learning_rate": 4.728792902722759e-05,
"loss": 0.6812,
"num_input_tokens_seen": 987712,
"step": 2620
},
{
"epoch": 2.3521505376344085,
"grad_norm": 0.7519697546958923,
"learning_rate": 4.7270190951016493e-05,
"loss": 0.6801,
"num_input_tokens_seen": 989568,
"step": 2625
},
{
"epoch": 2.35663082437276,
"grad_norm": 0.8907373547554016,
"learning_rate": 4.7252398405705535e-05,
"loss": 0.6876,
"num_input_tokens_seen": 991584,
"step": 2630
},
{
"epoch": 2.361111111111111,
"grad_norm": 0.7968729138374329,
"learning_rate": 4.723455143481227e-05,
"loss": 0.7279,
"num_input_tokens_seen": 993472,
"step": 2635
},
{
"epoch": 2.3655913978494625,
"grad_norm": 0.799196183681488,
"learning_rate": 4.721665008198734e-05,
"loss": 0.691,
"num_input_tokens_seen": 995296,
"step": 2640
},
{
"epoch": 2.370071684587814,
"grad_norm": 0.653005063533783,
"learning_rate": 4.719869439101442e-05,
"loss": 0.7456,
"num_input_tokens_seen": 997120,
"step": 2645
},
{
"epoch": 2.3745519713261647,
"grad_norm": 0.7391387224197388,
"learning_rate": 4.718068440581007e-05,
"loss": 0.7312,
"num_input_tokens_seen": 999104,
"step": 2650
},
{
"epoch": 2.379032258064516,
"grad_norm": 0.9343587756156921,
"learning_rate": 4.7162620170423655e-05,
"loss": 0.7184,
"num_input_tokens_seen": 1000864,
"step": 2655
},
{
"epoch": 2.3835125448028673,
"grad_norm": 0.5467570424079895,
"learning_rate": 4.714450172903722e-05,
"loss": 0.7674,
"num_input_tokens_seen": 1002976,
"step": 2660
},
{
"epoch": 2.3879928315412187,
"grad_norm": 0.6304900050163269,
"learning_rate": 4.712632912596538e-05,
"loss": 0.7117,
"num_input_tokens_seen": 1004672,
"step": 2665
},
{
"epoch": 2.39247311827957,
"grad_norm": 0.8206930756568909,
"learning_rate": 4.710810240565526e-05,
"loss": 0.6894,
"num_input_tokens_seen": 1006560,
"step": 2670
},
{
"epoch": 2.3969534050179213,
"grad_norm": 0.7391961812973022,
"learning_rate": 4.7089821612686295e-05,
"loss": 0.7055,
"num_input_tokens_seen": 1008384,
"step": 2675
},
{
"epoch": 2.4014336917562726,
"grad_norm": 0.6682542562484741,
"learning_rate": 4.707148679177021e-05,
"loss": 0.6728,
"num_input_tokens_seen": 1010208,
"step": 2680
},
{
"epoch": 2.4059139784946235,
"grad_norm": 0.6063985824584961,
"learning_rate": 4.705309798775084e-05,
"loss": 0.7021,
"num_input_tokens_seen": 1012128,
"step": 2685
},
{
"epoch": 2.410394265232975,
"grad_norm": 0.8216093182563782,
"learning_rate": 4.703465524560409e-05,
"loss": 0.722,
"num_input_tokens_seen": 1014080,
"step": 2690
},
{
"epoch": 2.414874551971326,
"grad_norm": 0.9102132320404053,
"learning_rate": 4.7016158610437764e-05,
"loss": 0.7014,
"num_input_tokens_seen": 1015968,
"step": 2695
},
{
"epoch": 2.4193548387096775,
"grad_norm": 0.6346727609634399,
"learning_rate": 4.69976081274915e-05,
"loss": 0.6802,
"num_input_tokens_seen": 1017696,
"step": 2700
},
{
"epoch": 2.423835125448029,
"grad_norm": 0.7000402808189392,
"learning_rate": 4.6979003842136596e-05,
"loss": 0.7651,
"num_input_tokens_seen": 1019552,
"step": 2705
},
{
"epoch": 2.4283154121863797,
"grad_norm": 0.549882173538208,
"learning_rate": 4.6960345799875995e-05,
"loss": 0.8167,
"num_input_tokens_seen": 1021344,
"step": 2710
},
{
"epoch": 2.432795698924731,
"grad_norm": 0.6052292585372925,
"learning_rate": 4.694163404634408e-05,
"loss": 0.6887,
"num_input_tokens_seen": 1023136,
"step": 2715
},
{
"epoch": 2.4372759856630823,
"grad_norm": 0.6004948616027832,
"learning_rate": 4.692286862730663e-05,
"loss": 0.6855,
"num_input_tokens_seen": 1024960,
"step": 2720
},
{
"epoch": 2.4417562724014337,
"grad_norm": 0.6895211338996887,
"learning_rate": 4.690404958866066e-05,
"loss": 0.7451,
"num_input_tokens_seen": 1026720,
"step": 2725
},
{
"epoch": 2.446236559139785,
"grad_norm": 0.6713605523109436,
"learning_rate": 4.6885176976434344e-05,
"loss": 0.7132,
"num_input_tokens_seen": 1028544,
"step": 2730
},
{
"epoch": 2.4507168458781363,
"grad_norm": 0.7282506823539734,
"learning_rate": 4.6866250836786876e-05,
"loss": 0.689,
"num_input_tokens_seen": 1030368,
"step": 2735
},
{
"epoch": 2.4551971326164876,
"grad_norm": 0.8884916305541992,
"learning_rate": 4.684727121600838e-05,
"loss": 0.6809,
"num_input_tokens_seen": 1032224,
"step": 2740
},
{
"epoch": 2.4596774193548385,
"grad_norm": 0.7573866844177246,
"learning_rate": 4.6828238160519775e-05,
"loss": 0.6593,
"num_input_tokens_seen": 1034112,
"step": 2745
},
{
"epoch": 2.46415770609319,
"grad_norm": 0.7106537222862244,
"learning_rate": 4.680915171687269e-05,
"loss": 0.6905,
"num_input_tokens_seen": 1036000,
"step": 2750
},
{
"epoch": 2.468637992831541,
"grad_norm": 0.5046974420547485,
"learning_rate": 4.6790011931749314e-05,
"loss": 0.6927,
"num_input_tokens_seen": 1037888,
"step": 2755
},
{
"epoch": 2.4731182795698925,
"grad_norm": 0.8967843055725098,
"learning_rate": 4.6770818851962305e-05,
"loss": 0.7133,
"num_input_tokens_seen": 1039776,
"step": 2760
},
{
"epoch": 2.477598566308244,
"grad_norm": 0.784045398235321,
"learning_rate": 4.675157252445467e-05,
"loss": 0.6573,
"num_input_tokens_seen": 1041600,
"step": 2765
},
{
"epoch": 2.482078853046595,
"grad_norm": 0.8381885290145874,
"learning_rate": 4.673227299629966e-05,
"loss": 0.7263,
"num_input_tokens_seen": 1043456,
"step": 2770
},
{
"epoch": 2.486559139784946,
"grad_norm": 0.8245952725410461,
"learning_rate": 4.6712920314700624e-05,
"loss": 0.6746,
"num_input_tokens_seen": 1045248,
"step": 2775
},
{
"epoch": 2.4910394265232974,
"grad_norm": 0.6422320008277893,
"learning_rate": 4.6693514526990955e-05,
"loss": 0.7433,
"num_input_tokens_seen": 1047168,
"step": 2780
},
{
"epoch": 2.4955197132616487,
"grad_norm": 0.6076728701591492,
"learning_rate": 4.6674055680633885e-05,
"loss": 0.6742,
"num_input_tokens_seen": 1049056,
"step": 2785
},
{
"epoch": 2.5,
"grad_norm": 0.5336353182792664,
"learning_rate": 4.665454382322246e-05,
"loss": 0.8036,
"num_input_tokens_seen": 1051168,
"step": 2790
},
{
"epoch": 2.5,
"eval_loss": 0.724970281124115,
"eval_runtime": 5.6399,
"eval_samples_per_second": 87.945,
"eval_steps_per_second": 21.986,
"num_input_tokens_seen": 1051168,
"step": 2790
},
{
"epoch": 2.5044802867383513,
"grad_norm": 1.0776666402816772,
"learning_rate": 4.663497900247936e-05,
"loss": 0.7213,
"num_input_tokens_seen": 1053120,
"step": 2795
},
{
"epoch": 2.5089605734767026,
"grad_norm": 0.7473271489143372,
"learning_rate": 4.6615361266256805e-05,
"loss": 0.7244,
"num_input_tokens_seen": 1055008,
"step": 2800
},
{
"epoch": 2.513440860215054,
"grad_norm": 0.6820984482765198,
"learning_rate": 4.6595690662536436e-05,
"loss": 0.6848,
"num_input_tokens_seen": 1056832,
"step": 2805
},
{
"epoch": 2.517921146953405,
"grad_norm": 0.7136480808258057,
"learning_rate": 4.657596723942923e-05,
"loss": 0.7369,
"num_input_tokens_seen": 1058656,
"step": 2810
},
{
"epoch": 2.522401433691756,
"grad_norm": 0.6265502572059631,
"learning_rate": 4.65561910451753e-05,
"loss": 0.7132,
"num_input_tokens_seen": 1060416,
"step": 2815
},
{
"epoch": 2.5268817204301075,
"grad_norm": 0.6327486634254456,
"learning_rate": 4.653636212814386e-05,
"loss": 0.7276,
"num_input_tokens_seen": 1062176,
"step": 2820
},
{
"epoch": 2.531362007168459,
"grad_norm": 0.6677072644233704,
"learning_rate": 4.651648053683308e-05,
"loss": 0.75,
"num_input_tokens_seen": 1064032,
"step": 2825
},
{
"epoch": 2.53584229390681,
"grad_norm": 0.6885439157485962,
"learning_rate": 4.649654631986994e-05,
"loss": 0.6952,
"num_input_tokens_seen": 1065920,
"step": 2830
},
{
"epoch": 2.540322580645161,
"grad_norm": 0.4123775362968445,
"learning_rate": 4.6476559526010146e-05,
"loss": 0.6645,
"num_input_tokens_seen": 1067840,
"step": 2835
},
{
"epoch": 2.5448028673835124,
"grad_norm": 0.5762872099876404,
"learning_rate": 4.6456520204137996e-05,
"loss": 0.7147,
"num_input_tokens_seen": 1069824,
"step": 2840
},
{
"epoch": 2.5492831541218637,
"grad_norm": 0.6282514333724976,
"learning_rate": 4.643642840326627e-05,
"loss": 0.7152,
"num_input_tokens_seen": 1071744,
"step": 2845
},
{
"epoch": 2.553763440860215,
"grad_norm": 0.6361386775970459,
"learning_rate": 4.64162841725361e-05,
"loss": 0.7435,
"num_input_tokens_seen": 1073536,
"step": 2850
},
{
"epoch": 2.5582437275985663,
"grad_norm": 0.7141626477241516,
"learning_rate": 4.639608756121684e-05,
"loss": 0.6694,
"num_input_tokens_seen": 1075424,
"step": 2855
},
{
"epoch": 2.5627240143369177,
"grad_norm": 0.8432585597038269,
"learning_rate": 4.637583861870596e-05,
"loss": 0.6899,
"num_input_tokens_seen": 1077472,
"step": 2860
},
{
"epoch": 2.567204301075269,
"grad_norm": 0.6747820377349854,
"learning_rate": 4.635553739452895e-05,
"loss": 0.694,
"num_input_tokens_seen": 1079296,
"step": 2865
},
{
"epoch": 2.5716845878136203,
"grad_norm": 0.9020037651062012,
"learning_rate": 4.6335183938339125e-05,
"loss": 0.6956,
"num_input_tokens_seen": 1081152,
"step": 2870
},
{
"epoch": 2.576164874551971,
"grad_norm": 0.7527491450309753,
"learning_rate": 4.631477829991761e-05,
"loss": 0.7206,
"num_input_tokens_seen": 1083168,
"step": 2875
},
{
"epoch": 2.5806451612903225,
"grad_norm": 0.6851952075958252,
"learning_rate": 4.629432052917309e-05,
"loss": 0.7044,
"num_input_tokens_seen": 1084992,
"step": 2880
},
{
"epoch": 2.585125448028674,
"grad_norm": 0.6493790745735168,
"learning_rate": 4.627381067614182e-05,
"loss": 0.7422,
"num_input_tokens_seen": 1086784,
"step": 2885
},
{
"epoch": 2.589605734767025,
"grad_norm": 0.5780056715011597,
"learning_rate": 4.625324879098741e-05,
"loss": 0.6984,
"num_input_tokens_seen": 1088608,
"step": 2890
},
{
"epoch": 2.5940860215053765,
"grad_norm": 0.7665016651153564,
"learning_rate": 4.6232634924000725e-05,
"loss": 0.7409,
"num_input_tokens_seen": 1090592,
"step": 2895
},
{
"epoch": 2.5985663082437274,
"grad_norm": 0.7732678651809692,
"learning_rate": 4.621196912559978e-05,
"loss": 0.6971,
"num_input_tokens_seen": 1092448,
"step": 2900
},
{
"epoch": 2.6030465949820787,
"grad_norm": 0.5451002717018127,
"learning_rate": 4.619125144632961e-05,
"loss": 0.7078,
"num_input_tokens_seen": 1094368,
"step": 2905
},
{
"epoch": 2.60752688172043,
"grad_norm": 0.7645034193992615,
"learning_rate": 4.617048193686213e-05,
"loss": 0.7346,
"num_input_tokens_seen": 1096288,
"step": 2910
},
{
"epoch": 2.6120071684587813,
"grad_norm": 0.7834355235099792,
"learning_rate": 4.614966064799603e-05,
"loss": 0.6801,
"num_input_tokens_seen": 1098240,
"step": 2915
},
{
"epoch": 2.6164874551971327,
"grad_norm": 0.6444031000137329,
"learning_rate": 4.612878763065664e-05,
"loss": 0.746,
"num_input_tokens_seen": 1100096,
"step": 2920
},
{
"epoch": 2.620967741935484,
"grad_norm": 0.6403073668479919,
"learning_rate": 4.610786293589581e-05,
"loss": 0.7428,
"num_input_tokens_seen": 1101984,
"step": 2925
},
{
"epoch": 2.6254480286738353,
"grad_norm": 0.7044591307640076,
"learning_rate": 4.608688661489179e-05,
"loss": 0.6958,
"num_input_tokens_seen": 1104000,
"step": 2930
},
{
"epoch": 2.6299283154121866,
"grad_norm": 0.7843032479286194,
"learning_rate": 4.60658587189491e-05,
"loss": 0.7658,
"num_input_tokens_seen": 1106080,
"step": 2935
},
{
"epoch": 2.6344086021505375,
"grad_norm": 0.4566028416156769,
"learning_rate": 4.604477929949837e-05,
"loss": 0.691,
"num_input_tokens_seen": 1108096,
"step": 2940
},
{
"epoch": 2.638888888888889,
"grad_norm": 0.7375583052635193,
"learning_rate": 4.60236484080963e-05,
"loss": 0.7131,
"num_input_tokens_seen": 1109952,
"step": 2945
},
{
"epoch": 2.64336917562724,
"grad_norm": 0.7825741767883301,
"learning_rate": 4.600246609642546e-05,
"loss": 0.6957,
"num_input_tokens_seen": 1111840,
"step": 2950
},
{
"epoch": 2.6478494623655915,
"grad_norm": 0.8638558983802795,
"learning_rate": 4.598123241629416e-05,
"loss": 0.7269,
"num_input_tokens_seen": 1113600,
"step": 2955
},
{
"epoch": 2.652329749103943,
"grad_norm": 0.5645026564598083,
"learning_rate": 4.5959947419636394e-05,
"loss": 0.6928,
"num_input_tokens_seen": 1115424,
"step": 2960
},
{
"epoch": 2.6568100358422937,
"grad_norm": 0.791883111000061,
"learning_rate": 4.593861115851163e-05,
"loss": 0.6988,
"num_input_tokens_seen": 1117376,
"step": 2965
},
{
"epoch": 2.661290322580645,
"grad_norm": 0.5525398850440979,
"learning_rate": 4.5917223685104735e-05,
"loss": 0.6782,
"num_input_tokens_seen": 1119232,
"step": 2970
},
{
"epoch": 2.6657706093189963,
"grad_norm": 0.6524225473403931,
"learning_rate": 4.5895785051725836e-05,
"loss": 0.6991,
"num_input_tokens_seen": 1121184,
"step": 2975
},
{
"epoch": 2.6702508960573477,
"grad_norm": 0.723503589630127,
"learning_rate": 4.587429531081019e-05,
"loss": 0.6962,
"num_input_tokens_seen": 1123424,
"step": 2980
},
{
"epoch": 2.674731182795699,
"grad_norm": 0.5620132088661194,
"learning_rate": 4.5852754514918034e-05,
"loss": 0.737,
"num_input_tokens_seen": 1125152,
"step": 2985
},
{
"epoch": 2.6792114695340503,
"grad_norm": 0.5039889216423035,
"learning_rate": 4.58311627167345e-05,
"loss": 0.6667,
"num_input_tokens_seen": 1127136,
"step": 2990
},
{
"epoch": 2.6836917562724016,
"grad_norm": 0.963306188583374,
"learning_rate": 4.580951996906946e-05,
"loss": 0.7496,
"num_input_tokens_seen": 1128992,
"step": 2995
},
{
"epoch": 2.688172043010753,
"grad_norm": 0.9899599552154541,
"learning_rate": 4.578782632485738e-05,
"loss": 0.7067,
"num_input_tokens_seen": 1130976,
"step": 3000
},
{
"epoch": 2.692652329749104,
"grad_norm": 0.47623762488365173,
"learning_rate": 4.576608183715724e-05,
"loss": 0.6723,
"num_input_tokens_seen": 1132832,
"step": 3005
},
{
"epoch": 2.697132616487455,
"grad_norm": 0.6253779530525208,
"learning_rate": 4.574428655915235e-05,
"loss": 0.7055,
"num_input_tokens_seen": 1134720,
"step": 3010
},
{
"epoch": 2.7016129032258065,
"grad_norm": 0.9512951970100403,
"learning_rate": 4.572244054415026e-05,
"loss": 0.7123,
"num_input_tokens_seen": 1136576,
"step": 3015
},
{
"epoch": 2.706093189964158,
"grad_norm": 0.702843427658081,
"learning_rate": 4.570054384558259e-05,
"loss": 0.6806,
"num_input_tokens_seen": 1138560,
"step": 3020
},
{
"epoch": 2.7105734767025087,
"grad_norm": 0.6525705456733704,
"learning_rate": 4.5678596517004966e-05,
"loss": 0.6627,
"num_input_tokens_seen": 1140480,
"step": 3025
},
{
"epoch": 2.71505376344086,
"grad_norm": 0.4482388496398926,
"learning_rate": 4.56565986120968e-05,
"loss": 0.7051,
"num_input_tokens_seen": 1142432,
"step": 3030
},
{
"epoch": 2.7195340501792113,
"grad_norm": 0.8427887558937073,
"learning_rate": 4.563455018466125e-05,
"loss": 0.7249,
"num_input_tokens_seen": 1144256,
"step": 3035
},
{
"epoch": 2.7240143369175627,
"grad_norm": 0.4986298084259033,
"learning_rate": 4.5612451288624996e-05,
"loss": 0.6847,
"num_input_tokens_seen": 1146240,
"step": 3040
},
{
"epoch": 2.728494623655914,
"grad_norm": 0.8530718684196472,
"learning_rate": 4.559030197803819e-05,
"loss": 0.6835,
"num_input_tokens_seen": 1148192,
"step": 3045
},
{
"epoch": 2.7329749103942653,
"grad_norm": 0.47099852561950684,
"learning_rate": 4.5568102307074286e-05,
"loss": 0.7198,
"num_input_tokens_seen": 1149984,
"step": 3050
},
{
"epoch": 2.7374551971326166,
"grad_norm": 0.5832663774490356,
"learning_rate": 4.554585233002989e-05,
"loss": 0.6821,
"num_input_tokens_seen": 1151872,
"step": 3055
},
{
"epoch": 2.741935483870968,
"grad_norm": 0.6409208178520203,
"learning_rate": 4.552355210132467e-05,
"loss": 0.7222,
"num_input_tokens_seen": 1153696,
"step": 3060
},
{
"epoch": 2.746415770609319,
"grad_norm": 0.6105261445045471,
"learning_rate": 4.550120167550119e-05,
"loss": 0.7136,
"num_input_tokens_seen": 1155584,
"step": 3065
},
{
"epoch": 2.75089605734767,
"grad_norm": 0.7974156141281128,
"learning_rate": 4.54788011072248e-05,
"loss": 0.6708,
"num_input_tokens_seen": 1157376,
"step": 3070
},
{
"epoch": 2.7553763440860215,
"grad_norm": 0.8578731417655945,
"learning_rate": 4.545635045128347e-05,
"loss": 0.713,
"num_input_tokens_seen": 1159104,
"step": 3075
},
{
"epoch": 2.759856630824373,
"grad_norm": 0.9953461289405823,
"learning_rate": 4.5433849762587685e-05,
"loss": 0.7076,
"num_input_tokens_seen": 1161024,
"step": 3080
},
{
"epoch": 2.764336917562724,
"grad_norm": 0.7679957747459412,
"learning_rate": 4.541129909617031e-05,
"loss": 0.7383,
"num_input_tokens_seen": 1162848,
"step": 3085
},
{
"epoch": 2.768817204301075,
"grad_norm": 0.6889281272888184,
"learning_rate": 4.5388698507186445e-05,
"loss": 0.6939,
"num_input_tokens_seen": 1164608,
"step": 3090
},
{
"epoch": 2.7732974910394264,
"grad_norm": 0.894231915473938,
"learning_rate": 4.536604805091327e-05,
"loss": 0.7089,
"num_input_tokens_seen": 1166368,
"step": 3095
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.6850932240486145,
"learning_rate": 4.534334778274997e-05,
"loss": 0.6971,
"num_input_tokens_seen": 1168064,
"step": 3100
},
{
"epoch": 2.782258064516129,
"grad_norm": 0.7918106913566589,
"learning_rate": 4.532059775821752e-05,
"loss": 0.6858,
"num_input_tokens_seen": 1170016,
"step": 3105
},
{
"epoch": 2.7867383512544803,
"grad_norm": 0.6311814785003662,
"learning_rate": 4.529779803295863e-05,
"loss": 0.7439,
"num_input_tokens_seen": 1171712,
"step": 3110
},
{
"epoch": 2.7912186379928317,
"grad_norm": 0.8450077772140503,
"learning_rate": 4.527494866273753e-05,
"loss": 0.6845,
"num_input_tokens_seen": 1173536,
"step": 3115
},
{
"epoch": 2.795698924731183,
"grad_norm": 0.6848275065422058,
"learning_rate": 4.525204970343991e-05,
"loss": 0.7427,
"num_input_tokens_seen": 1175456,
"step": 3120
},
{
"epoch": 2.8001792114695343,
"grad_norm": 0.6643990874290466,
"learning_rate": 4.5229101211072736e-05,
"loss": 0.7146,
"num_input_tokens_seen": 1177536,
"step": 3125
},
{
"epoch": 2.804659498207885,
"grad_norm": 1.0588222742080688,
"learning_rate": 4.52061032417641e-05,
"loss": 0.7177,
"num_input_tokens_seen": 1179328,
"step": 3130
},
{
"epoch": 2.8091397849462365,
"grad_norm": 0.5510256886482239,
"learning_rate": 4.518305585176313e-05,
"loss": 0.6954,
"num_input_tokens_seen": 1181152,
"step": 3135
},
{
"epoch": 2.813620071684588,
"grad_norm": 0.8167834877967834,
"learning_rate": 4.5159959097439833e-05,
"loss": 0.7174,
"num_input_tokens_seen": 1183104,
"step": 3140
},
{
"epoch": 2.818100358422939,
"grad_norm": 0.6767428517341614,
"learning_rate": 4.513681303528493e-05,
"loss": 0.6817,
"num_input_tokens_seen": 1184960,
"step": 3145
},
{
"epoch": 2.8225806451612905,
"grad_norm": 0.5507485866546631,
"learning_rate": 4.511361772190975e-05,
"loss": 0.7098,
"num_input_tokens_seen": 1186784,
"step": 3150
},
{
"epoch": 2.8270609318996414,
"grad_norm": 0.5543942451477051,
"learning_rate": 4.50903732140461e-05,
"loss": 0.7044,
"num_input_tokens_seen": 1188960,
"step": 3155
},
{
"epoch": 2.8315412186379927,
"grad_norm": 0.5798549056053162,
"learning_rate": 4.506707956854608e-05,
"loss": 0.6974,
"num_input_tokens_seen": 1190784,
"step": 3160
},
{
"epoch": 2.836021505376344,
"grad_norm": 0.9505072236061096,
"learning_rate": 4.5043736842382e-05,
"loss": 0.6779,
"num_input_tokens_seen": 1192896,
"step": 3165
},
{
"epoch": 2.8405017921146953,
"grad_norm": 0.7640702128410339,
"learning_rate": 4.5020345092646176e-05,
"loss": 0.7244,
"num_input_tokens_seen": 1195008,
"step": 3170
},
{
"epoch": 2.8449820788530467,
"grad_norm": 0.6997788548469543,
"learning_rate": 4.4996904376550876e-05,
"loss": 0.7405,
"num_input_tokens_seen": 1196800,
"step": 3175
},
{
"epoch": 2.849462365591398,
"grad_norm": 0.5832765698432922,
"learning_rate": 4.497341475142808e-05,
"loss": 0.7037,
"num_input_tokens_seen": 1198688,
"step": 3180
},
{
"epoch": 2.8539426523297493,
"grad_norm": 0.6882241368293762,
"learning_rate": 4.494987627472943e-05,
"loss": 0.6921,
"num_input_tokens_seen": 1200704,
"step": 3185
},
{
"epoch": 2.8584229390681006,
"grad_norm": 0.7053371667861938,
"learning_rate": 4.492628900402604e-05,
"loss": 0.726,
"num_input_tokens_seen": 1202560,
"step": 3190
},
{
"epoch": 2.8629032258064515,
"grad_norm": 0.6048314571380615,
"learning_rate": 4.4902652997008365e-05,
"loss": 0.6922,
"num_input_tokens_seen": 1204448,
"step": 3195
},
{
"epoch": 2.867383512544803,
"grad_norm": 0.7072091698646545,
"learning_rate": 4.487896831148605e-05,
"loss": 0.7175,
"num_input_tokens_seen": 1206400,
"step": 3200
},
{
"epoch": 2.871863799283154,
"grad_norm": 0.7911497950553894,
"learning_rate": 4.48552350053878e-05,
"loss": 0.7205,
"num_input_tokens_seen": 1208160,
"step": 3205
},
{
"epoch": 2.8763440860215055,
"grad_norm": 0.3948129713535309,
"learning_rate": 4.483145313676127e-05,
"loss": 0.7346,
"num_input_tokens_seen": 1209920,
"step": 3210
},
{
"epoch": 2.8808243727598564,
"grad_norm": 0.6734813451766968,
"learning_rate": 4.480762276377284e-05,
"loss": 0.7002,
"num_input_tokens_seen": 1211872,
"step": 3215
},
{
"epoch": 2.8853046594982077,
"grad_norm": 0.48924365639686584,
"learning_rate": 4.4783743944707576e-05,
"loss": 0.7089,
"num_input_tokens_seen": 1213856,
"step": 3220
},
{
"epoch": 2.889784946236559,
"grad_norm": 0.7001926898956299,
"learning_rate": 4.475981673796899e-05,
"loss": 0.7029,
"num_input_tokens_seen": 1215680,
"step": 3225
},
{
"epoch": 2.8942652329749103,
"grad_norm": 1.0040634870529175,
"learning_rate": 4.473584120207896e-05,
"loss": 0.6916,
"num_input_tokens_seen": 1217600,
"step": 3230
},
{
"epoch": 2.8987455197132617,
"grad_norm": 0.7664744257926941,
"learning_rate": 4.471181739567758e-05,
"loss": 0.7127,
"num_input_tokens_seen": 1219488,
"step": 3235
},
{
"epoch": 2.903225806451613,
"grad_norm": 0.5811721682548523,
"learning_rate": 4.468774537752299e-05,
"loss": 0.6816,
"num_input_tokens_seen": 1221216,
"step": 3240
},
{
"epoch": 2.9077060931899643,
"grad_norm": 0.5584621429443359,
"learning_rate": 4.466362520649125e-05,
"loss": 0.7139,
"num_input_tokens_seen": 1222944,
"step": 3245
},
{
"epoch": 2.9121863799283156,
"grad_norm": 0.7363925576210022,
"learning_rate": 4.463945694157621e-05,
"loss": 0.7161,
"num_input_tokens_seen": 1224832,
"step": 3250
},
{
"epoch": 2.9166666666666665,
"grad_norm": 0.6875536441802979,
"learning_rate": 4.461524064188931e-05,
"loss": 0.7205,
"num_input_tokens_seen": 1226560,
"step": 3255
},
{
"epoch": 2.921146953405018,
"grad_norm": 0.6339446902275085,
"learning_rate": 4.459097636665953e-05,
"loss": 0.7823,
"num_input_tokens_seen": 1228480,
"step": 3260
},
{
"epoch": 2.925627240143369,
"grad_norm": 0.8035193085670471,
"learning_rate": 4.456666417523314e-05,
"loss": 0.7181,
"num_input_tokens_seen": 1230208,
"step": 3265
},
{
"epoch": 2.9301075268817205,
"grad_norm": 0.6746560335159302,
"learning_rate": 4.4542304127073644e-05,
"loss": 0.7073,
"num_input_tokens_seen": 1232160,
"step": 3270
},
{
"epoch": 2.934587813620072,
"grad_norm": 0.603693425655365,
"learning_rate": 4.451789628176155e-05,
"loss": 0.7055,
"num_input_tokens_seen": 1234112,
"step": 3275
},
{
"epoch": 2.9390681003584227,
"grad_norm": 0.6682401299476624,
"learning_rate": 4.449344069899433e-05,
"loss": 0.7182,
"num_input_tokens_seen": 1236064,
"step": 3280
},
{
"epoch": 2.943548387096774,
"grad_norm": 0.5182085633277893,
"learning_rate": 4.446893743858615e-05,
"loss": 0.7015,
"num_input_tokens_seen": 1237856,
"step": 3285
},
{
"epoch": 2.9480286738351253,
"grad_norm": 0.5174866914749146,
"learning_rate": 4.4444386560467836e-05,
"loss": 0.6907,
"num_input_tokens_seen": 1239968,
"step": 3290
},
{
"epoch": 2.9525089605734767,
"grad_norm": 0.6247062087059021,
"learning_rate": 4.441978812468666e-05,
"loss": 0.7432,
"num_input_tokens_seen": 1241760,
"step": 3295
},
{
"epoch": 2.956989247311828,
"grad_norm": 0.7549983859062195,
"learning_rate": 4.439514219140621e-05,
"loss": 0.7235,
"num_input_tokens_seen": 1243840,
"step": 3300
},
{
"epoch": 2.9614695340501793,
"grad_norm": 0.7973654270172119,
"learning_rate": 4.4370448820906246e-05,
"loss": 0.7258,
"num_input_tokens_seen": 1245664,
"step": 3305
},
{
"epoch": 2.9659498207885306,
"grad_norm": 0.4042164385318756,
"learning_rate": 4.434570807358255e-05,
"loss": 0.6954,
"num_input_tokens_seen": 1247488,
"step": 3310
},
{
"epoch": 2.970430107526882,
"grad_norm": 0.8879252672195435,
"learning_rate": 4.4320920009946795e-05,
"loss": 0.7188,
"num_input_tokens_seen": 1249280,
"step": 3315
},
{
"epoch": 2.974910394265233,
"grad_norm": 0.6141677498817444,
"learning_rate": 4.4296084690626356e-05,
"loss": 0.6683,
"num_input_tokens_seen": 1251136,
"step": 3320
},
{
"epoch": 2.979390681003584,
"grad_norm": 0.5475640892982483,
"learning_rate": 4.427120217636421e-05,
"loss": 0.6596,
"num_input_tokens_seen": 1253024,
"step": 3325
},
{
"epoch": 2.9838709677419355,
"grad_norm": 0.6818305850028992,
"learning_rate": 4.424627252801874e-05,
"loss": 0.6488,
"num_input_tokens_seen": 1254848,
"step": 3330
},
{
"epoch": 2.988351254480287,
"grad_norm": 0.688703715801239,
"learning_rate": 4.422129580656365e-05,
"loss": 0.6964,
"num_input_tokens_seen": 1256704,
"step": 3335
},
{
"epoch": 2.992831541218638,
"grad_norm": 0.6664419770240784,
"learning_rate": 4.419627207308773e-05,
"loss": 0.6693,
"num_input_tokens_seen": 1258624,
"step": 3340
},
{
"epoch": 2.997311827956989,
"grad_norm": 0.6489902138710022,
"learning_rate": 4.4171201388794795e-05,
"loss": 0.7289,
"num_input_tokens_seen": 1260480,
"step": 3345
},
{
"epoch": 3.0,
"eval_loss": 0.7142015099525452,
"eval_runtime": 5.6222,
"eval_samples_per_second": 88.222,
"eval_steps_per_second": 22.056,
"num_input_tokens_seen": 1261304,
"step": 3348
},
{
"epoch": 3.0017921146953404,
"grad_norm": 0.6640289425849915,
"learning_rate": 4.414608381500347e-05,
"loss": 0.6641,
"num_input_tokens_seen": 1262008,
"step": 3350
},
{
"epoch": 3.0062724014336917,
"grad_norm": 0.6687781810760498,
"learning_rate": 4.4120919413147054e-05,
"loss": 0.7156,
"num_input_tokens_seen": 1263800,
"step": 3355
},
{
"epoch": 3.010752688172043,
"grad_norm": 0.6788121461868286,
"learning_rate": 4.409570824477341e-05,
"loss": 0.7282,
"num_input_tokens_seen": 1265592,
"step": 3360
},
{
"epoch": 3.0152329749103943,
"grad_norm": 0.6610774993896484,
"learning_rate": 4.407045037154478e-05,
"loss": 0.6751,
"num_input_tokens_seen": 1267512,
"step": 3365
},
{
"epoch": 3.0197132616487457,
"grad_norm": 0.6137348413467407,
"learning_rate": 4.40451458552376e-05,
"loss": 0.7057,
"num_input_tokens_seen": 1269400,
"step": 3370
},
{
"epoch": 3.024193548387097,
"grad_norm": 0.6485475897789001,
"learning_rate": 4.4019794757742426e-05,
"loss": 0.6814,
"num_input_tokens_seen": 1271192,
"step": 3375
},
{
"epoch": 3.028673835125448,
"grad_norm": 1.0241650342941284,
"learning_rate": 4.3994397141063734e-05,
"loss": 0.7126,
"num_input_tokens_seen": 1273080,
"step": 3380
},
{
"epoch": 3.033154121863799,
"grad_norm": 0.6707364320755005,
"learning_rate": 4.3968953067319777e-05,
"loss": 0.6491,
"num_input_tokens_seen": 1275000,
"step": 3385
},
{
"epoch": 3.0376344086021505,
"grad_norm": 0.7968290448188782,
"learning_rate": 4.394346259874242e-05,
"loss": 0.7002,
"num_input_tokens_seen": 1276856,
"step": 3390
},
{
"epoch": 3.042114695340502,
"grad_norm": 0.5572385191917419,
"learning_rate": 4.3917925797677025e-05,
"loss": 0.6943,
"num_input_tokens_seen": 1278648,
"step": 3395
},
{
"epoch": 3.046594982078853,
"grad_norm": 0.6464095711708069,
"learning_rate": 4.389234272658227e-05,
"loss": 0.6567,
"num_input_tokens_seen": 1280504,
"step": 3400
},
{
"epoch": 3.0510752688172045,
"grad_norm": 0.698445200920105,
"learning_rate": 4.386671344802998e-05,
"loss": 0.729,
"num_input_tokens_seen": 1282488,
"step": 3405
},
{
"epoch": 3.0555555555555554,
"grad_norm": 0.7717545628547668,
"learning_rate": 4.384103802470502e-05,
"loss": 0.7305,
"num_input_tokens_seen": 1284312,
"step": 3410
},
{
"epoch": 3.0600358422939067,
"grad_norm": 0.8240836262702942,
"learning_rate": 4.381531651940511e-05,
"loss": 0.6899,
"num_input_tokens_seen": 1286200,
"step": 3415
},
{
"epoch": 3.064516129032258,
"grad_norm": 0.6516638398170471,
"learning_rate": 4.378954899504068e-05,
"loss": 0.6686,
"num_input_tokens_seen": 1288088,
"step": 3420
},
{
"epoch": 3.0689964157706093,
"grad_norm": 0.5957766771316528,
"learning_rate": 4.3763735514634706e-05,
"loss": 0.7081,
"num_input_tokens_seen": 1290232,
"step": 3425
},
{
"epoch": 3.0734767025089607,
"grad_norm": 0.5282344222068787,
"learning_rate": 4.3737876141322576e-05,
"loss": 0.7534,
"num_input_tokens_seen": 1292184,
"step": 3430
},
{
"epoch": 3.077956989247312,
"grad_norm": 0.6102969646453857,
"learning_rate": 4.371197093835192e-05,
"loss": 0.6726,
"num_input_tokens_seen": 1294168,
"step": 3435
},
{
"epoch": 3.0824372759856633,
"grad_norm": 0.8353556394577026,
"learning_rate": 4.368601996908246e-05,
"loss": 0.6673,
"num_input_tokens_seen": 1296088,
"step": 3440
},
{
"epoch": 3.086917562724014,
"grad_norm": 0.5756303071975708,
"learning_rate": 4.366002329698585e-05,
"loss": 0.6629,
"num_input_tokens_seen": 1297816,
"step": 3445
},
{
"epoch": 3.0913978494623655,
"grad_norm": 0.8167845606803894,
"learning_rate": 4.3633980985645526e-05,
"loss": 0.7104,
"num_input_tokens_seen": 1299704,
"step": 3450
},
{
"epoch": 3.095878136200717,
"grad_norm": 0.9127864837646484,
"learning_rate": 4.360789309875656e-05,
"loss": 0.7214,
"num_input_tokens_seen": 1301656,
"step": 3455
},
{
"epoch": 3.100358422939068,
"grad_norm": 0.5587136149406433,
"learning_rate": 4.358175970012549e-05,
"loss": 0.6937,
"num_input_tokens_seen": 1303608,
"step": 3460
},
{
"epoch": 3.1048387096774195,
"grad_norm": 0.5719638466835022,
"learning_rate": 4.3555580853670154e-05,
"loss": 0.6916,
"num_input_tokens_seen": 1305432,
"step": 3465
},
{
"epoch": 3.109318996415771,
"grad_norm": 0.4955412447452545,
"learning_rate": 4.352935662341956e-05,
"loss": 0.7134,
"num_input_tokens_seen": 1307288,
"step": 3470
},
{
"epoch": 3.1137992831541217,
"grad_norm": 0.837181806564331,
"learning_rate": 4.350308707351372e-05,
"loss": 0.6982,
"num_input_tokens_seen": 1309272,
"step": 3475
},
{
"epoch": 3.118279569892473,
"grad_norm": 0.4352196753025055,
"learning_rate": 4.347677226820349e-05,
"loss": 0.6825,
"num_input_tokens_seen": 1311128,
"step": 3480
},
{
"epoch": 3.1227598566308243,
"grad_norm": 0.4888545870780945,
"learning_rate": 4.3450412271850406e-05,
"loss": 0.7204,
"num_input_tokens_seen": 1312856,
"step": 3485
},
{
"epoch": 3.1272401433691757,
"grad_norm": 0.6269586682319641,
"learning_rate": 4.342400714892653e-05,
"loss": 0.6848,
"num_input_tokens_seen": 1315000,
"step": 3490
},
{
"epoch": 3.131720430107527,
"grad_norm": 0.6699550747871399,
"learning_rate": 4.339755696401431e-05,
"loss": 0.6998,
"num_input_tokens_seen": 1316792,
"step": 3495
},
{
"epoch": 3.1362007168458783,
"grad_norm": 0.5314154028892517,
"learning_rate": 4.337106178180639e-05,
"loss": 0.6819,
"num_input_tokens_seen": 1318616,
"step": 3500
},
{
"epoch": 3.140681003584229,
"grad_norm": 0.7012357711791992,
"learning_rate": 4.3344521667105486e-05,
"loss": 0.7421,
"num_input_tokens_seen": 1320504,
"step": 3505
},
{
"epoch": 3.1451612903225805,
"grad_norm": 0.6760140657424927,
"learning_rate": 4.331793668482421e-05,
"loss": 0.6886,
"num_input_tokens_seen": 1322488,
"step": 3510
},
{
"epoch": 3.149641577060932,
"grad_norm": 0.5820931196212769,
"learning_rate": 4.329130689998491e-05,
"loss": 0.7029,
"num_input_tokens_seen": 1324440,
"step": 3515
},
{
"epoch": 3.154121863799283,
"grad_norm": 0.5365394353866577,
"learning_rate": 4.3264632377719496e-05,
"loss": 0.737,
"num_input_tokens_seen": 1326488,
"step": 3520
},
{
"epoch": 3.1586021505376345,
"grad_norm": 0.5414961576461792,
"learning_rate": 4.323791318326932e-05,
"loss": 0.6979,
"num_input_tokens_seen": 1328536,
"step": 3525
},
{
"epoch": 3.163082437275986,
"grad_norm": 0.5694983005523682,
"learning_rate": 4.3211149381984996e-05,
"loss": 0.6969,
"num_input_tokens_seen": 1330328,
"step": 3530
},
{
"epoch": 3.1675627240143367,
"grad_norm": 0.5108627080917358,
"learning_rate": 4.318434103932622e-05,
"loss": 0.7004,
"num_input_tokens_seen": 1332280,
"step": 3535
},
{
"epoch": 3.172043010752688,
"grad_norm": 0.4905160665512085,
"learning_rate": 4.315748822086164e-05,
"loss": 0.6608,
"num_input_tokens_seen": 1334360,
"step": 3540
},
{
"epoch": 3.1765232974910393,
"grad_norm": 0.7809849977493286,
"learning_rate": 4.3130590992268695e-05,
"loss": 0.7581,
"num_input_tokens_seen": 1336472,
"step": 3545
},
{
"epoch": 3.1810035842293907,
"grad_norm": 0.48875778913497925,
"learning_rate": 4.3103649419333424e-05,
"loss": 0.6954,
"num_input_tokens_seen": 1338296,
"step": 3550
},
{
"epoch": 3.185483870967742,
"grad_norm": 0.6410282254219055,
"learning_rate": 4.307666356795033e-05,
"loss": 0.7318,
"num_input_tokens_seen": 1340216,
"step": 3555
},
{
"epoch": 3.1899641577060933,
"grad_norm": 0.9984332323074341,
"learning_rate": 4.3049633504122215e-05,
"loss": 0.6837,
"num_input_tokens_seen": 1341912,
"step": 3560
},
{
"epoch": 3.1944444444444446,
"grad_norm": 0.5605589747428894,
"learning_rate": 4.302255929396003e-05,
"loss": 0.714,
"num_input_tokens_seen": 1343672,
"step": 3565
},
{
"epoch": 3.1989247311827955,
"grad_norm": 0.7751408815383911,
"learning_rate": 4.299544100368268e-05,
"loss": 0.6955,
"num_input_tokens_seen": 1345528,
"step": 3570
},
{
"epoch": 3.203405017921147,
"grad_norm": 0.663055956363678,
"learning_rate": 4.2968278699616885e-05,
"loss": 0.6538,
"num_input_tokens_seen": 1347640,
"step": 3575
},
{
"epoch": 3.207885304659498,
"grad_norm": 0.41530367732048035,
"learning_rate": 4.294107244819704e-05,
"loss": 0.6929,
"num_input_tokens_seen": 1349432,
"step": 3580
},
{
"epoch": 3.2123655913978495,
"grad_norm": 0.6339878439903259,
"learning_rate": 4.291382231596499e-05,
"loss": 0.7599,
"num_input_tokens_seen": 1351160,
"step": 3585
},
{
"epoch": 3.216845878136201,
"grad_norm": 0.8388494849205017,
"learning_rate": 4.2886528369569935e-05,
"loss": 0.6925,
"num_input_tokens_seen": 1353016,
"step": 3590
},
{
"epoch": 3.221326164874552,
"grad_norm": 0.5189142227172852,
"learning_rate": 4.285919067576822e-05,
"loss": 0.7088,
"num_input_tokens_seen": 1354904,
"step": 3595
},
{
"epoch": 3.225806451612903,
"grad_norm": 0.6881794929504395,
"learning_rate": 4.283180930142322e-05,
"loss": 0.6506,
"num_input_tokens_seen": 1356792,
"step": 3600
},
{
"epoch": 3.2302867383512543,
"grad_norm": 0.550651490688324,
"learning_rate": 4.280438431350508e-05,
"loss": 0.7122,
"num_input_tokens_seen": 1358680,
"step": 3605
},
{
"epoch": 3.2347670250896057,
"grad_norm": 0.5976690053939819,
"learning_rate": 4.2776915779090674e-05,
"loss": 0.6498,
"num_input_tokens_seen": 1360536,
"step": 3610
},
{
"epoch": 3.239247311827957,
"grad_norm": 0.8103978633880615,
"learning_rate": 4.274940376536338e-05,
"loss": 0.716,
"num_input_tokens_seen": 1362424,
"step": 3615
},
{
"epoch": 3.2437275985663083,
"grad_norm": 0.35558953881263733,
"learning_rate": 4.272184833961289e-05,
"loss": 0.6674,
"num_input_tokens_seen": 1364408,
"step": 3620
},
{
"epoch": 3.2482078853046596,
"grad_norm": 0.592622697353363,
"learning_rate": 4.269424956923509e-05,
"loss": 0.7252,
"num_input_tokens_seen": 1366200,
"step": 3625
},
{
"epoch": 3.252688172043011,
"grad_norm": 0.6038530468940735,
"learning_rate": 4.2666607521731883e-05,
"loss": 0.6721,
"num_input_tokens_seen": 1368024,
"step": 3630
},
{
"epoch": 3.257168458781362,
"grad_norm": 0.5127895474433899,
"learning_rate": 4.2638922264711026e-05,
"loss": 0.6845,
"num_input_tokens_seen": 1369784,
"step": 3635
},
{
"epoch": 3.261648745519713,
"grad_norm": 0.6586855053901672,
"learning_rate": 4.2611193865885926e-05,
"loss": 0.7232,
"num_input_tokens_seen": 1371608,
"step": 3640
},
{
"epoch": 3.2661290322580645,
"grad_norm": 0.671576738357544,
"learning_rate": 4.258342239307554e-05,
"loss": 0.7184,
"num_input_tokens_seen": 1373400,
"step": 3645
},
{
"epoch": 3.270609318996416,
"grad_norm": 0.5418992042541504,
"learning_rate": 4.255560791420417e-05,
"loss": 0.6709,
"num_input_tokens_seen": 1375256,
"step": 3650
},
{
"epoch": 3.275089605734767,
"grad_norm": 0.5688634514808655,
"learning_rate": 4.2527750497301323e-05,
"loss": 0.6669,
"num_input_tokens_seen": 1377336,
"step": 3655
},
{
"epoch": 3.279569892473118,
"grad_norm": 0.6521787047386169,
"learning_rate": 4.249985021050147e-05,
"loss": 0.7181,
"num_input_tokens_seen": 1379064,
"step": 3660
},
{
"epoch": 3.2840501792114694,
"grad_norm": 0.6241593360900879,
"learning_rate": 4.247190712204398e-05,
"loss": 0.6542,
"num_input_tokens_seen": 1380920,
"step": 3665
},
{
"epoch": 3.2885304659498207,
"grad_norm": 0.5417748093605042,
"learning_rate": 4.2443921300272895e-05,
"loss": 0.7291,
"num_input_tokens_seen": 1382872,
"step": 3670
},
{
"epoch": 3.293010752688172,
"grad_norm": 0.6573375463485718,
"learning_rate": 4.241589281363678e-05,
"loss": 0.7441,
"num_input_tokens_seen": 1384888,
"step": 3675
},
{
"epoch": 3.2974910394265233,
"grad_norm": 0.8385305404663086,
"learning_rate": 4.2387821730688545e-05,
"loss": 0.6885,
"num_input_tokens_seen": 1386776,
"step": 3680
},
{
"epoch": 3.3019713261648747,
"grad_norm": 0.8915187120437622,
"learning_rate": 4.2359708120085286e-05,
"loss": 0.7465,
"num_input_tokens_seen": 1388600,
"step": 3685
},
{
"epoch": 3.306451612903226,
"grad_norm": 0.48122549057006836,
"learning_rate": 4.233155205058811e-05,
"loss": 0.6854,
"num_input_tokens_seen": 1390488,
"step": 3690
},
{
"epoch": 3.3109318996415773,
"grad_norm": 0.6241974830627441,
"learning_rate": 4.230335359106198e-05,
"loss": 0.712,
"num_input_tokens_seen": 1392344,
"step": 3695
},
{
"epoch": 3.315412186379928,
"grad_norm": 0.8159958124160767,
"learning_rate": 4.227511281047552e-05,
"loss": 0.7353,
"num_input_tokens_seen": 1394296,
"step": 3700
},
{
"epoch": 3.3198924731182795,
"grad_norm": 0.8499752283096313,
"learning_rate": 4.22468297779009e-05,
"loss": 0.7385,
"num_input_tokens_seen": 1396216,
"step": 3705
},
{
"epoch": 3.324372759856631,
"grad_norm": 0.5092227458953857,
"learning_rate": 4.2218504562513584e-05,
"loss": 0.6918,
"num_input_tokens_seen": 1398136,
"step": 3710
},
{
"epoch": 3.328853046594982,
"grad_norm": 0.7707375884056091,
"learning_rate": 4.219013723359224e-05,
"loss": 0.7062,
"num_input_tokens_seen": 1400088,
"step": 3715
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.5665374398231506,
"learning_rate": 4.216172786051854e-05,
"loss": 0.7163,
"num_input_tokens_seen": 1402040,
"step": 3720
},
{
"epoch": 3.3378136200716844,
"grad_norm": 0.5855923295021057,
"learning_rate": 4.213327651277697e-05,
"loss": 0.6933,
"num_input_tokens_seen": 1403960,
"step": 3725
},
{
"epoch": 3.3422939068100357,
"grad_norm": 0.5904120206832886,
"learning_rate": 4.2104783259954687e-05,
"loss": 0.6721,
"num_input_tokens_seen": 1405848,
"step": 3730
},
{
"epoch": 3.346774193548387,
"grad_norm": 0.5156244039535522,
"learning_rate": 4.207624817174135e-05,
"loss": 0.6761,
"num_input_tokens_seen": 1407736,
"step": 3735
},
{
"epoch": 3.3512544802867383,
"grad_norm": 0.5167236328125,
"learning_rate": 4.204767131792892e-05,
"loss": 0.7573,
"num_input_tokens_seen": 1409624,
"step": 3740
},
{
"epoch": 3.3557347670250897,
"grad_norm": 0.46987155079841614,
"learning_rate": 4.201905276841153e-05,
"loss": 0.7128,
"num_input_tokens_seen": 1411480,
"step": 3745
},
{
"epoch": 3.360215053763441,
"grad_norm": 0.5643429160118103,
"learning_rate": 4.199039259318529e-05,
"loss": 0.7207,
"num_input_tokens_seen": 1413400,
"step": 3750
},
{
"epoch": 3.3646953405017923,
"grad_norm": 0.7398551106452942,
"learning_rate": 4.196169086234811e-05,
"loss": 0.7291,
"num_input_tokens_seen": 1415224,
"step": 3755
},
{
"epoch": 3.369175627240143,
"grad_norm": 0.6347612142562866,
"learning_rate": 4.193294764609954e-05,
"loss": 0.7287,
"num_input_tokens_seen": 1416952,
"step": 3760
},
{
"epoch": 3.3736559139784945,
"grad_norm": 0.434487909078598,
"learning_rate": 4.190416301474059e-05,
"loss": 0.7096,
"num_input_tokens_seen": 1418840,
"step": 3765
},
{
"epoch": 3.378136200716846,
"grad_norm": 0.6980918645858765,
"learning_rate": 4.18753370386736e-05,
"loss": 0.6996,
"num_input_tokens_seen": 1420536,
"step": 3770
},
{
"epoch": 3.382616487455197,
"grad_norm": 0.6351816058158875,
"learning_rate": 4.184646978840198e-05,
"loss": 0.7126,
"num_input_tokens_seen": 1422456,
"step": 3775
},
{
"epoch": 3.3870967741935485,
"grad_norm": 0.5904970169067383,
"learning_rate": 4.181756133453013e-05,
"loss": 0.7314,
"num_input_tokens_seen": 1424312,
"step": 3780
},
{
"epoch": 3.3915770609319,
"grad_norm": 0.663182258605957,
"learning_rate": 4.17886117477632e-05,
"loss": 0.6684,
"num_input_tokens_seen": 1426360,
"step": 3785
},
{
"epoch": 3.3960573476702507,
"grad_norm": 0.7417337894439697,
"learning_rate": 4.175962109890696e-05,
"loss": 0.685,
"num_input_tokens_seen": 1428152,
"step": 3790
},
{
"epoch": 3.400537634408602,
"grad_norm": 0.6670061945915222,
"learning_rate": 4.173058945886762e-05,
"loss": 0.6656,
"num_input_tokens_seen": 1430360,
"step": 3795
},
{
"epoch": 3.4050179211469533,
"grad_norm": 0.41760730743408203,
"learning_rate": 4.1701516898651614e-05,
"loss": 0.7181,
"num_input_tokens_seen": 1432184,
"step": 3800
},
{
"epoch": 3.4094982078853047,
"grad_norm": 0.6389894485473633,
"learning_rate": 4.1672403489365505e-05,
"loss": 0.7196,
"num_input_tokens_seen": 1434008,
"step": 3805
},
{
"epoch": 3.413978494623656,
"grad_norm": 0.622952401638031,
"learning_rate": 4.164324930221571e-05,
"loss": 0.7128,
"num_input_tokens_seen": 1435928,
"step": 3810
},
{
"epoch": 3.4184587813620073,
"grad_norm": 0.8242074251174927,
"learning_rate": 4.161405440850844e-05,
"loss": 0.7395,
"num_input_tokens_seen": 1437784,
"step": 3815
},
{
"epoch": 3.4229390681003586,
"grad_norm": 0.541373610496521,
"learning_rate": 4.1584818879649426e-05,
"loss": 0.6827,
"num_input_tokens_seen": 1439640,
"step": 3820
},
{
"epoch": 3.4274193548387095,
"grad_norm": 0.5152111053466797,
"learning_rate": 4.1555542787143795e-05,
"loss": 0.7267,
"num_input_tokens_seen": 1441496,
"step": 3825
},
{
"epoch": 3.431899641577061,
"grad_norm": 0.6609454154968262,
"learning_rate": 4.1526226202595915e-05,
"loss": 0.7206,
"num_input_tokens_seen": 1443512,
"step": 3830
},
{
"epoch": 3.436379928315412,
"grad_norm": 0.6851304769515991,
"learning_rate": 4.1496869197709146e-05,
"loss": 0.708,
"num_input_tokens_seen": 1445432,
"step": 3835
},
{
"epoch": 3.4408602150537635,
"grad_norm": 0.6026902794837952,
"learning_rate": 4.1467471844285724e-05,
"loss": 0.7032,
"num_input_tokens_seen": 1447384,
"step": 3840
},
{
"epoch": 3.445340501792115,
"grad_norm": 0.6418375968933105,
"learning_rate": 4.14380342142266e-05,
"loss": 0.6866,
"num_input_tokens_seen": 1449208,
"step": 3845
},
{
"epoch": 3.449820788530466,
"grad_norm": 0.5678339600563049,
"learning_rate": 4.1408556379531186e-05,
"loss": 0.7173,
"num_input_tokens_seen": 1451064,
"step": 3850
},
{
"epoch": 3.454301075268817,
"grad_norm": 0.5131914019584656,
"learning_rate": 4.137903841229727e-05,
"loss": 0.7137,
"num_input_tokens_seen": 1452856,
"step": 3855
},
{
"epoch": 3.4587813620071683,
"grad_norm": 0.39637306332588196,
"learning_rate": 4.1349480384720765e-05,
"loss": 0.6941,
"num_input_tokens_seen": 1454712,
"step": 3860
},
{
"epoch": 3.4632616487455197,
"grad_norm": 0.8247675895690918,
"learning_rate": 4.13198823690956e-05,
"loss": 0.6861,
"num_input_tokens_seen": 1456600,
"step": 3865
},
{
"epoch": 3.467741935483871,
"grad_norm": 0.4988982379436493,
"learning_rate": 4.1290244437813475e-05,
"loss": 0.7023,
"num_input_tokens_seen": 1458552,
"step": 3870
},
{
"epoch": 3.4722222222222223,
"grad_norm": 0.7563645839691162,
"learning_rate": 4.126056666336373e-05,
"loss": 0.7156,
"num_input_tokens_seen": 1460408,
"step": 3875
},
{
"epoch": 3.4767025089605736,
"grad_norm": 0.5761010646820068,
"learning_rate": 4.123084911833315e-05,
"loss": 0.7287,
"num_input_tokens_seen": 1462392,
"step": 3880
},
{
"epoch": 3.481182795698925,
"grad_norm": 0.6941600441932678,
"learning_rate": 4.120109187540581e-05,
"loss": 0.7154,
"num_input_tokens_seen": 1464184,
"step": 3885
},
{
"epoch": 3.485663082437276,
"grad_norm": 0.6207517981529236,
"learning_rate": 4.117129500736286e-05,
"loss": 0.711,
"num_input_tokens_seen": 1466040,
"step": 3890
},
{
"epoch": 3.490143369175627,
"grad_norm": 0.4753372073173523,
"learning_rate": 4.114145858708236e-05,
"loss": 0.7232,
"num_input_tokens_seen": 1467960,
"step": 3895
},
{
"epoch": 3.4946236559139785,
"grad_norm": 0.6294930577278137,
"learning_rate": 4.111158268753914e-05,
"loss": 0.6971,
"num_input_tokens_seen": 1469944,
"step": 3900
},
{
"epoch": 3.49910394265233,
"grad_norm": 0.8148292303085327,
"learning_rate": 4.108166738180455e-05,
"loss": 0.698,
"num_input_tokens_seen": 1471736,
"step": 3905
},
{
"epoch": 3.5,
"eval_loss": 0.7066304683685303,
"eval_runtime": 5.6339,
"eval_samples_per_second": 88.038,
"eval_steps_per_second": 22.009,
"num_input_tokens_seen": 1472152,
"step": 3906
},
{
"epoch": 3.503584229390681,
"grad_norm": 0.5032709836959839,
"learning_rate": 4.105171274304637e-05,
"loss": 0.7026,
"num_input_tokens_seen": 1473624,
"step": 3910
},
{
"epoch": 3.508064516129032,
"grad_norm": 0.5328845381736755,
"learning_rate": 4.102171884452852e-05,
"loss": 0.6933,
"num_input_tokens_seen": 1475480,
"step": 3915
},
{
"epoch": 3.5125448028673834,
"grad_norm": 0.664138674736023,
"learning_rate": 4.099168575961099e-05,
"loss": 0.7192,
"num_input_tokens_seen": 1477336,
"step": 3920
},
{
"epoch": 3.5170250896057347,
"grad_norm": 0.3400190472602844,
"learning_rate": 4.096161356174959e-05,
"loss": 0.6938,
"num_input_tokens_seen": 1479256,
"step": 3925
},
{
"epoch": 3.521505376344086,
"grad_norm": 0.73305344581604,
"learning_rate": 4.093150232449581e-05,
"loss": 0.6948,
"num_input_tokens_seen": 1481080,
"step": 3930
},
{
"epoch": 3.5259856630824373,
"grad_norm": 0.5535285472869873,
"learning_rate": 4.0901352121496613e-05,
"loss": 0.6842,
"num_input_tokens_seen": 1483128,
"step": 3935
},
{
"epoch": 3.5304659498207887,
"grad_norm": 0.507737398147583,
"learning_rate": 4.087116302649428e-05,
"loss": 0.6932,
"num_input_tokens_seen": 1484984,
"step": 3940
},
{
"epoch": 3.53494623655914,
"grad_norm": 0.4982096254825592,
"learning_rate": 4.0840935113326184e-05,
"loss": 0.6843,
"num_input_tokens_seen": 1486744,
"step": 3945
},
{
"epoch": 3.5394265232974913,
"grad_norm": 0.5703076124191284,
"learning_rate": 4.081066845592467e-05,
"loss": 0.7112,
"num_input_tokens_seen": 1488632,
"step": 3950
},
{
"epoch": 3.543906810035842,
"grad_norm": 0.6137137413024902,
"learning_rate": 4.0780363128316844e-05,
"loss": 0.7193,
"num_input_tokens_seen": 1490584,
"step": 3955
},
{
"epoch": 3.5483870967741935,
"grad_norm": 0.5267996191978455,
"learning_rate": 4.0750019204624356e-05,
"loss": 0.6742,
"num_input_tokens_seen": 1492472,
"step": 3960
},
{
"epoch": 3.552867383512545,
"grad_norm": 0.7842523455619812,
"learning_rate": 4.071963675906331e-05,
"loss": 0.7559,
"num_input_tokens_seen": 1494488,
"step": 3965
},
{
"epoch": 3.557347670250896,
"grad_norm": 0.6850671768188477,
"learning_rate": 4.0689215865944e-05,
"loss": 0.6919,
"num_input_tokens_seen": 1496504,
"step": 3970
},
{
"epoch": 3.561827956989247,
"grad_norm": 0.990515947341919,
"learning_rate": 4.0658756599670735e-05,
"loss": 0.7051,
"num_input_tokens_seen": 1498392,
"step": 3975
},
{
"epoch": 3.5663082437275984,
"grad_norm": 0.663224458694458,
"learning_rate": 4.062825903474172e-05,
"loss": 0.6949,
"num_input_tokens_seen": 1500376,
"step": 3980
},
{
"epoch": 3.5707885304659497,
"grad_norm": 0.8304319381713867,
"learning_rate": 4.059772324574881e-05,
"loss": 0.7059,
"num_input_tokens_seen": 1502200,
"step": 3985
},
{
"epoch": 3.575268817204301,
"grad_norm": 0.6099317669868469,
"learning_rate": 4.056714930737735e-05,
"loss": 0.7065,
"num_input_tokens_seen": 1503928,
"step": 3990
},
{
"epoch": 3.5797491039426523,
"grad_norm": 0.5851802229881287,
"learning_rate": 4.053653729440599e-05,
"loss": 0.7104,
"num_input_tokens_seen": 1505816,
"step": 3995
},
{
"epoch": 3.5842293906810037,
"grad_norm": 0.6147633790969849,
"learning_rate": 4.05058872817065e-05,
"loss": 0.6802,
"num_input_tokens_seen": 1507608,
"step": 4000
},
{
"epoch": 3.588709677419355,
"grad_norm": 0.6019237637519836,
"learning_rate": 4.047519934424362e-05,
"loss": 0.6765,
"num_input_tokens_seen": 1509560,
"step": 4005
},
{
"epoch": 3.5931899641577063,
"grad_norm": 0.6119495034217834,
"learning_rate": 4.044447355707483e-05,
"loss": 0.7309,
"num_input_tokens_seen": 1511672,
"step": 4010
},
{
"epoch": 3.597670250896057,
"grad_norm": 0.875652015209198,
"learning_rate": 4.0413709995350145e-05,
"loss": 0.7415,
"num_input_tokens_seen": 1513560,
"step": 4015
},
{
"epoch": 3.6021505376344085,
"grad_norm": 1.135251522064209,
"learning_rate": 4.038290873431203e-05,
"loss": 0.7534,
"num_input_tokens_seen": 1515544,
"step": 4020
},
{
"epoch": 3.60663082437276,
"grad_norm": 0.5745335221290588,
"learning_rate": 4.035206984929513e-05,
"loss": 0.741,
"num_input_tokens_seen": 1517496,
"step": 4025
},
{
"epoch": 3.611111111111111,
"grad_norm": 0.878966212272644,
"learning_rate": 4.032119341572612e-05,
"loss": 0.704,
"num_input_tokens_seen": 1519448,
"step": 4030
},
{
"epoch": 3.6155913978494625,
"grad_norm": 0.5013412833213806,
"learning_rate": 4.0290279509123483e-05,
"loss": 0.7011,
"num_input_tokens_seen": 1521272,
"step": 4035
},
{
"epoch": 3.6200716845878134,
"grad_norm": 0.5498907566070557,
"learning_rate": 4.02593282050974e-05,
"loss": 0.6896,
"num_input_tokens_seen": 1523128,
"step": 4040
},
{
"epoch": 3.6245519713261647,
"grad_norm": 0.5444872379302979,
"learning_rate": 4.022833957934949e-05,
"loss": 0.7108,
"num_input_tokens_seen": 1525144,
"step": 4045
},
{
"epoch": 3.629032258064516,
"grad_norm": 0.5091925859451294,
"learning_rate": 4.019731370767267e-05,
"loss": 0.6913,
"num_input_tokens_seen": 1526968,
"step": 4050
},
{
"epoch": 3.6335125448028673,
"grad_norm": 0.3739386200904846,
"learning_rate": 4.016625066595092e-05,
"loss": 0.688,
"num_input_tokens_seen": 1528760,
"step": 4055
},
{
"epoch": 3.6379928315412187,
"grad_norm": 0.6722102761268616,
"learning_rate": 4.013515053015918e-05,
"loss": 0.69,
"num_input_tokens_seen": 1530584,
"step": 4060
},
{
"epoch": 3.64247311827957,
"grad_norm": 0.7019551992416382,
"learning_rate": 4.010401337636309e-05,
"loss": 0.7203,
"num_input_tokens_seen": 1532312,
"step": 4065
},
{
"epoch": 3.6469534050179213,
"grad_norm": 0.539631724357605,
"learning_rate": 4.007283928071882e-05,
"loss": 0.6904,
"num_input_tokens_seen": 1534008,
"step": 4070
},
{
"epoch": 3.6514336917562726,
"grad_norm": 0.7080748081207275,
"learning_rate": 4.0041628319472926e-05,
"loss": 0.6979,
"num_input_tokens_seen": 1535896,
"step": 4075
},
{
"epoch": 3.6559139784946235,
"grad_norm": 0.5636100172996521,
"learning_rate": 4.001038056896211e-05,
"loss": 0.7193,
"num_input_tokens_seen": 1537752,
"step": 4080
},
{
"epoch": 3.660394265232975,
"grad_norm": 0.5630587935447693,
"learning_rate": 3.9979096105613035e-05,
"loss": 0.6734,
"num_input_tokens_seen": 1539640,
"step": 4085
},
{
"epoch": 3.664874551971326,
"grad_norm": 0.7142034769058228,
"learning_rate": 3.99477750059422e-05,
"loss": 0.6964,
"num_input_tokens_seen": 1541528,
"step": 4090
},
{
"epoch": 3.6693548387096775,
"grad_norm": 0.5816643238067627,
"learning_rate": 3.991641734655568e-05,
"loss": 0.6701,
"num_input_tokens_seen": 1543448,
"step": 4095
},
{
"epoch": 3.673835125448029,
"grad_norm": 0.7215139269828796,
"learning_rate": 3.988502320414897e-05,
"loss": 0.7094,
"num_input_tokens_seen": 1545240,
"step": 4100
},
{
"epoch": 3.6783154121863797,
"grad_norm": 0.6312930583953857,
"learning_rate": 3.985359265550682e-05,
"loss": 0.7388,
"num_input_tokens_seen": 1547096,
"step": 4105
},
{
"epoch": 3.682795698924731,
"grad_norm": 0.49201858043670654,
"learning_rate": 3.9822125777502995e-05,
"loss": 0.7318,
"num_input_tokens_seen": 1549016,
"step": 4110
},
{
"epoch": 3.6872759856630823,
"grad_norm": 0.6607922315597534,
"learning_rate": 3.979062264710012e-05,
"loss": 0.708,
"num_input_tokens_seen": 1550968,
"step": 4115
},
{
"epoch": 3.6917562724014337,
"grad_norm": 0.6152588129043579,
"learning_rate": 3.975908334134952e-05,
"loss": 0.7058,
"num_input_tokens_seen": 1552760,
"step": 4120
},
{
"epoch": 3.696236559139785,
"grad_norm": 0.5260980129241943,
"learning_rate": 3.9727507937390954e-05,
"loss": 0.7266,
"num_input_tokens_seen": 1554680,
"step": 4125
},
{
"epoch": 3.7007168458781363,
"grad_norm": 0.5318123698234558,
"learning_rate": 3.969589651245249e-05,
"loss": 0.715,
"num_input_tokens_seen": 1556536,
"step": 4130
},
{
"epoch": 3.7051971326164876,
"grad_norm": 0.6568738222122192,
"learning_rate": 3.9664249143850304e-05,
"loss": 0.6898,
"num_input_tokens_seen": 1558424,
"step": 4135
},
{
"epoch": 3.709677419354839,
"grad_norm": 0.9819836020469666,
"learning_rate": 3.9632565908988476e-05,
"loss": 0.7165,
"num_input_tokens_seen": 1560344,
"step": 4140
},
{
"epoch": 3.71415770609319,
"grad_norm": 0.6370415687561035,
"learning_rate": 3.960084688535881e-05,
"loss": 0.6916,
"num_input_tokens_seen": 1562264,
"step": 4145
},
{
"epoch": 3.718637992831541,
"grad_norm": 0.8133038878440857,
"learning_rate": 3.956909215054066e-05,
"loss": 0.7061,
"num_input_tokens_seen": 1564120,
"step": 4150
},
{
"epoch": 3.7231182795698925,
"grad_norm": 0.46259617805480957,
"learning_rate": 3.953730178220067e-05,
"loss": 0.6822,
"num_input_tokens_seen": 1566072,
"step": 4155
},
{
"epoch": 3.727598566308244,
"grad_norm": 0.6947198510169983,
"learning_rate": 3.9505475858092705e-05,
"loss": 0.7145,
"num_input_tokens_seen": 1567992,
"step": 4160
},
{
"epoch": 3.732078853046595,
"grad_norm": 0.7821716070175171,
"learning_rate": 3.947361445605755e-05,
"loss": 0.6979,
"num_input_tokens_seen": 1569816,
"step": 4165
},
{
"epoch": 3.736559139784946,
"grad_norm": 0.6598497629165649,
"learning_rate": 3.944171765402279e-05,
"loss": 0.6687,
"num_input_tokens_seen": 1571672,
"step": 4170
},
{
"epoch": 3.7410394265232974,
"grad_norm": 0.6169228553771973,
"learning_rate": 3.9409785530002565e-05,
"loss": 0.7181,
"num_input_tokens_seen": 1573496,
"step": 4175
},
{
"epoch": 3.7455197132616487,
"grad_norm": 0.41680485010147095,
"learning_rate": 3.937781816209742e-05,
"loss": 0.7108,
"num_input_tokens_seen": 1575416,
"step": 4180
},
{
"epoch": 3.75,
"grad_norm": 0.5349394679069519,
"learning_rate": 3.934581562849411e-05,
"loss": 0.6989,
"num_input_tokens_seen": 1577272,
"step": 4185
},
{
"epoch": 3.7544802867383513,
"grad_norm": 0.49851375818252563,
"learning_rate": 3.931377800746538e-05,
"loss": 0.7065,
"num_input_tokens_seen": 1579064,
"step": 4190
},
{
"epoch": 3.7589605734767026,
"grad_norm": 0.7621869444847107,
"learning_rate": 3.928170537736981e-05,
"loss": 0.7245,
"num_input_tokens_seen": 1580984,
"step": 4195
},
{
"epoch": 3.763440860215054,
"grad_norm": 0.5499773025512695,
"learning_rate": 3.924959781665159e-05,
"loss": 0.7089,
"num_input_tokens_seen": 1583096,
"step": 4200
},
{
"epoch": 3.767921146953405,
"grad_norm": 0.7189744710922241,
"learning_rate": 3.921745540384038e-05,
"loss": 0.6925,
"num_input_tokens_seen": 1584824,
"step": 4205
},
{
"epoch": 3.772401433691756,
"grad_norm": 0.5173475742340088,
"learning_rate": 3.918527821755101e-05,
"loss": 0.679,
"num_input_tokens_seen": 1586872,
"step": 4210
},
{
"epoch": 3.7768817204301075,
"grad_norm": 0.620393693447113,
"learning_rate": 3.915306633648345e-05,
"loss": 0.7302,
"num_input_tokens_seen": 1588696,
"step": 4215
},
{
"epoch": 3.781362007168459,
"grad_norm": 0.6762012243270874,
"learning_rate": 3.9120819839422456e-05,
"loss": 0.7044,
"num_input_tokens_seen": 1590712,
"step": 4220
},
{
"epoch": 3.78584229390681,
"grad_norm": 0.6276195049285889,
"learning_rate": 3.908853880523748e-05,
"loss": 0.6695,
"num_input_tokens_seen": 1592472,
"step": 4225
},
{
"epoch": 3.790322580645161,
"grad_norm": 0.6157204508781433,
"learning_rate": 3.905622331288246e-05,
"loss": 0.7026,
"num_input_tokens_seen": 1594168,
"step": 4230
},
{
"epoch": 3.7948028673835124,
"grad_norm": 0.5470080375671387,
"learning_rate": 3.9023873441395574e-05,
"loss": 0.7001,
"num_input_tokens_seen": 1595992,
"step": 4235
},
{
"epoch": 3.7992831541218637,
"grad_norm": 0.5555064082145691,
"learning_rate": 3.899148926989912e-05,
"loss": 0.7048,
"num_input_tokens_seen": 1597720,
"step": 4240
},
{
"epoch": 3.803763440860215,
"grad_norm": 0.780737578868866,
"learning_rate": 3.895907087759926e-05,
"loss": 0.6902,
"num_input_tokens_seen": 1599736,
"step": 4245
},
{
"epoch": 3.8082437275985663,
"grad_norm": 0.507416844367981,
"learning_rate": 3.8926618343785876e-05,
"loss": 0.6769,
"num_input_tokens_seen": 1601496,
"step": 4250
},
{
"epoch": 3.8127240143369177,
"grad_norm": 0.5080125331878662,
"learning_rate": 3.8894131747832354e-05,
"loss": 0.7197,
"num_input_tokens_seen": 1603352,
"step": 4255
},
{
"epoch": 3.817204301075269,
"grad_norm": 0.6002953052520752,
"learning_rate": 3.886161116919537e-05,
"loss": 0.677,
"num_input_tokens_seen": 1605208,
"step": 4260
},
{
"epoch": 3.8216845878136203,
"grad_norm": 0.514525294303894,
"learning_rate": 3.8829056687414735e-05,
"loss": 0.6942,
"num_input_tokens_seen": 1607128,
"step": 4265
},
{
"epoch": 3.826164874551971,
"grad_norm": 0.7737773060798645,
"learning_rate": 3.8796468382113184e-05,
"loss": 0.7222,
"num_input_tokens_seen": 1609176,
"step": 4270
},
{
"epoch": 3.8306451612903225,
"grad_norm": 0.6488903760910034,
"learning_rate": 3.876384633299616e-05,
"loss": 0.6904,
"num_input_tokens_seen": 1611096,
"step": 4275
},
{
"epoch": 3.835125448028674,
"grad_norm": 0.6453109383583069,
"learning_rate": 3.873119061985164e-05,
"loss": 0.6992,
"num_input_tokens_seen": 1613048,
"step": 4280
},
{
"epoch": 3.839605734767025,
"grad_norm": 0.7293371558189392,
"learning_rate": 3.869850132254996e-05,
"loss": 0.7087,
"num_input_tokens_seen": 1615128,
"step": 4285
},
{
"epoch": 3.8440860215053765,
"grad_norm": 0.7813587188720703,
"learning_rate": 3.866577852104358e-05,
"loss": 0.6819,
"num_input_tokens_seen": 1616952,
"step": 4290
},
{
"epoch": 3.8485663082437274,
"grad_norm": 0.6214764714241028,
"learning_rate": 3.86330222953669e-05,
"loss": 0.666,
"num_input_tokens_seen": 1618840,
"step": 4295
},
{
"epoch": 3.8530465949820787,
"grad_norm": 0.41636016964912415,
"learning_rate": 3.860023272563609e-05,
"loss": 0.7116,
"num_input_tokens_seen": 1620760,
"step": 4300
},
{
"epoch": 3.85752688172043,
"grad_norm": 0.6706348657608032,
"learning_rate": 3.856740989204884e-05,
"loss": 0.721,
"num_input_tokens_seen": 1622648,
"step": 4305
},
{
"epoch": 3.8620071684587813,
"grad_norm": 0.5346550941467285,
"learning_rate": 3.8534553874884244e-05,
"loss": 0.6626,
"num_input_tokens_seen": 1624632,
"step": 4310
},
{
"epoch": 3.8664874551971327,
"grad_norm": 0.46775901317596436,
"learning_rate": 3.850166475450252e-05,
"loss": 0.6639,
"num_input_tokens_seen": 1626520,
"step": 4315
},
{
"epoch": 3.870967741935484,
"grad_norm": 0.49902352690696716,
"learning_rate": 3.846874261134485e-05,
"loss": 0.7102,
"num_input_tokens_seen": 1628536,
"step": 4320
},
{
"epoch": 3.8754480286738353,
"grad_norm": 0.7217360138893127,
"learning_rate": 3.843578752593323e-05,
"loss": 0.7223,
"num_input_tokens_seen": 1630488,
"step": 4325
},
{
"epoch": 3.8799283154121866,
"grad_norm": 0.6418315172195435,
"learning_rate": 3.840279957887017e-05,
"loss": 0.7037,
"num_input_tokens_seen": 1632344,
"step": 4330
},
{
"epoch": 3.8844086021505375,
"grad_norm": 0.7038565278053284,
"learning_rate": 3.836977885083858e-05,
"loss": 0.7187,
"num_input_tokens_seen": 1634296,
"step": 4335
},
{
"epoch": 3.888888888888889,
"grad_norm": 0.4795782268047333,
"learning_rate": 3.833672542260156e-05,
"loss": 0.6854,
"num_input_tokens_seen": 1636312,
"step": 4340
},
{
"epoch": 3.89336917562724,
"grad_norm": 0.5679958462715149,
"learning_rate": 3.830363937500216e-05,
"loss": 0.7072,
"num_input_tokens_seen": 1638296,
"step": 4345
},
{
"epoch": 3.8978494623655915,
"grad_norm": 0.5424354076385498,
"learning_rate": 3.827052078896323e-05,
"loss": 0.6753,
"num_input_tokens_seen": 1640248,
"step": 4350
},
{
"epoch": 3.902329749103943,
"grad_norm": 0.7104454636573792,
"learning_rate": 3.8237369745487205e-05,
"loss": 0.7172,
"num_input_tokens_seen": 1642040,
"step": 4355
},
{
"epoch": 3.9068100358422937,
"grad_norm": 0.5328916311264038,
"learning_rate": 3.820418632565589e-05,
"loss": 0.6794,
"num_input_tokens_seen": 1643736,
"step": 4360
},
{
"epoch": 3.911290322580645,
"grad_norm": 0.6578375697135925,
"learning_rate": 3.817097061063028e-05,
"loss": 0.662,
"num_input_tokens_seen": 1645784,
"step": 4365
},
{
"epoch": 3.9157706093189963,
"grad_norm": 0.7673133015632629,
"learning_rate": 3.81377226816504e-05,
"loss": 0.7265,
"num_input_tokens_seen": 1647480,
"step": 4370
},
{
"epoch": 3.9202508960573477,
"grad_norm": 0.5228795409202576,
"learning_rate": 3.8104442620035e-05,
"loss": 0.7221,
"num_input_tokens_seen": 1649336,
"step": 4375
},
{
"epoch": 3.924731182795699,
"grad_norm": 0.7645145654678345,
"learning_rate": 3.8071130507181466e-05,
"loss": 0.7152,
"num_input_tokens_seen": 1651192,
"step": 4380
},
{
"epoch": 3.9292114695340503,
"grad_norm": 0.4934726357460022,
"learning_rate": 3.803778642456553e-05,
"loss": 0.6892,
"num_input_tokens_seen": 1653080,
"step": 4385
},
{
"epoch": 3.9336917562724016,
"grad_norm": 0.3990285396575928,
"learning_rate": 3.800441045374119e-05,
"loss": 0.7042,
"num_input_tokens_seen": 1655000,
"step": 4390
},
{
"epoch": 3.938172043010753,
"grad_norm": 0.563335657119751,
"learning_rate": 3.797100267634038e-05,
"loss": 0.6995,
"num_input_tokens_seen": 1656824,
"step": 4395
},
{
"epoch": 3.942652329749104,
"grad_norm": 0.6097021698951721,
"learning_rate": 3.7937563174072826e-05,
"loss": 0.673,
"num_input_tokens_seen": 1658712,
"step": 4400
},
{
"epoch": 3.947132616487455,
"grad_norm": 0.4125150740146637,
"learning_rate": 3.790409202872588e-05,
"loss": 0.6597,
"num_input_tokens_seen": 1660568,
"step": 4405
},
{
"epoch": 3.9516129032258065,
"grad_norm": 0.558273434638977,
"learning_rate": 3.787058932216427e-05,
"loss": 0.6884,
"num_input_tokens_seen": 1662392,
"step": 4410
},
{
"epoch": 3.956093189964158,
"grad_norm": 0.6978607773780823,
"learning_rate": 3.783705513632992e-05,
"loss": 0.7311,
"num_input_tokens_seen": 1664088,
"step": 4415
},
{
"epoch": 3.9605734767025087,
"grad_norm": 0.600098729133606,
"learning_rate": 3.780348955324173e-05,
"loss": 0.6842,
"num_input_tokens_seen": 1665912,
"step": 4420
},
{
"epoch": 3.96505376344086,
"grad_norm": 0.7134581804275513,
"learning_rate": 3.7769892654995444e-05,
"loss": 0.7465,
"num_input_tokens_seen": 1667832,
"step": 4425
},
{
"epoch": 3.9695340501792113,
"grad_norm": 0.5726223587989807,
"learning_rate": 3.773626452376332e-05,
"loss": 0.7285,
"num_input_tokens_seen": 1669816,
"step": 4430
},
{
"epoch": 3.9740143369175627,
"grad_norm": 0.6757382750511169,
"learning_rate": 3.7702605241794073e-05,
"loss": 0.7042,
"num_input_tokens_seen": 1671608,
"step": 4435
},
{
"epoch": 3.978494623655914,
"grad_norm": 0.6381019949913025,
"learning_rate": 3.7668914891412574e-05,
"loss": 0.6666,
"num_input_tokens_seen": 1673400,
"step": 4440
},
{
"epoch": 3.9829749103942653,
"grad_norm": 0.553852915763855,
"learning_rate": 3.7635193555019697e-05,
"loss": 0.7053,
"num_input_tokens_seen": 1675192,
"step": 4445
},
{
"epoch": 3.9874551971326166,
"grad_norm": 0.6171552538871765,
"learning_rate": 3.760144131509209e-05,
"loss": 0.7065,
"num_input_tokens_seen": 1677048,
"step": 4450
},
{
"epoch": 3.991935483870968,
"grad_norm": 0.6333886981010437,
"learning_rate": 3.756765825418199e-05,
"loss": 0.6822,
"num_input_tokens_seen": 1679128,
"step": 4455
},
{
"epoch": 3.996415770609319,
"grad_norm": 0.5539998412132263,
"learning_rate": 3.7533844454917025e-05,
"loss": 0.7177,
"num_input_tokens_seen": 1680856,
"step": 4460
},
{
"epoch": 4.0,
"eval_loss": 0.704715371131897,
"eval_runtime": 5.621,
"eval_samples_per_second": 88.241,
"eval_steps_per_second": 22.06,
"num_input_tokens_seen": 1682016,
"step": 4464
},
{
"epoch": 4.000896057347671,
"grad_norm": 0.3961438834667206,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.7105,
"num_input_tokens_seen": 1682336,
"step": 4465
},
{
"epoch": 4.005376344086022,
"grad_norm": 0.6253134608268738,
"learning_rate": 3.746612497220869e-05,
"loss": 0.724,
"num_input_tokens_seen": 1684096,
"step": 4470
},
{
"epoch": 4.009856630824372,
"grad_norm": 0.6570338010787964,
"learning_rate": 3.743221945439566e-05,
"loss": 0.6865,
"num_input_tokens_seen": 1686112,
"step": 4475
},
{
"epoch": 4.014336917562724,
"grad_norm": 0.6149647235870361,
"learning_rate": 3.739828352948803e-05,
"loss": 0.7226,
"num_input_tokens_seen": 1687872,
"step": 4480
},
{
"epoch": 4.018817204301075,
"grad_norm": 0.5701264142990112,
"learning_rate": 3.736431728048731e-05,
"loss": 0.7126,
"num_input_tokens_seen": 1689760,
"step": 4485
},
{
"epoch": 4.023297491039426,
"grad_norm": 0.7899668216705322,
"learning_rate": 3.733032079046916e-05,
"loss": 0.739,
"num_input_tokens_seen": 1691584,
"step": 4490
},
{
"epoch": 4.027777777777778,
"grad_norm": 0.3633069097995758,
"learning_rate": 3.7296294142583225e-05,
"loss": 0.6703,
"num_input_tokens_seen": 1693376,
"step": 4495
},
{
"epoch": 4.032258064516129,
"grad_norm": 0.43072450160980225,
"learning_rate": 3.726223742005289e-05,
"loss": 0.7192,
"num_input_tokens_seen": 1695232,
"step": 4500
},
{
"epoch": 4.03673835125448,
"grad_norm": 0.3624493479728699,
"learning_rate": 3.7228150706175116e-05,
"loss": 0.6964,
"num_input_tokens_seen": 1697088,
"step": 4505
},
{
"epoch": 4.041218637992832,
"grad_norm": 0.46872445940971375,
"learning_rate": 3.7194034084320195e-05,
"loss": 0.6599,
"num_input_tokens_seen": 1699008,
"step": 4510
},
{
"epoch": 4.045698924731183,
"grad_norm": 0.6037322878837585,
"learning_rate": 3.715988763793158e-05,
"loss": 0.7385,
"num_input_tokens_seen": 1701216,
"step": 4515
},
{
"epoch": 4.050179211469534,
"grad_norm": 0.46617811918258667,
"learning_rate": 3.7125711450525704e-05,
"loss": 0.6784,
"num_input_tokens_seen": 1703040,
"step": 4520
},
{
"epoch": 4.054659498207886,
"grad_norm": 0.460159033536911,
"learning_rate": 3.7091505605691674e-05,
"loss": 0.6943,
"num_input_tokens_seen": 1704800,
"step": 4525
},
{
"epoch": 4.059139784946237,
"grad_norm": 0.553875207901001,
"learning_rate": 3.705727018709118e-05,
"loss": 0.7053,
"num_input_tokens_seen": 1706592,
"step": 4530
},
{
"epoch": 4.063620071684587,
"grad_norm": 0.7636135816574097,
"learning_rate": 3.702300527845825e-05,
"loss": 0.7173,
"num_input_tokens_seen": 1708544,
"step": 4535
},
{
"epoch": 4.068100358422939,
"grad_norm": 0.7820340394973755,
"learning_rate": 3.6988710963598993e-05,
"loss": 0.8042,
"num_input_tokens_seen": 1710720,
"step": 4540
},
{
"epoch": 4.07258064516129,
"grad_norm": 0.4480503499507904,
"learning_rate": 3.695438732639149e-05,
"loss": 0.7068,
"num_input_tokens_seen": 1712480,
"step": 4545
},
{
"epoch": 4.077060931899641,
"grad_norm": 0.7662056088447571,
"learning_rate": 3.6920034450785526e-05,
"loss": 0.7124,
"num_input_tokens_seen": 1714368,
"step": 4550
},
{
"epoch": 4.081541218637993,
"grad_norm": 0.7628973722457886,
"learning_rate": 3.688565242080238e-05,
"loss": 0.6843,
"num_input_tokens_seen": 1716256,
"step": 4555
},
{
"epoch": 4.086021505376344,
"grad_norm": 0.5160735845565796,
"learning_rate": 3.6851241320534665e-05,
"loss": 0.6879,
"num_input_tokens_seen": 1718208,
"step": 4560
},
{
"epoch": 4.090501792114695,
"grad_norm": 0.4380353093147278,
"learning_rate": 3.681680123414606e-05,
"loss": 0.7175,
"num_input_tokens_seen": 1719936,
"step": 4565
},
{
"epoch": 4.094982078853047,
"grad_norm": 0.5784623622894287,
"learning_rate": 3.678233224587118e-05,
"loss": 0.6923,
"num_input_tokens_seen": 1721856,
"step": 4570
},
{
"epoch": 4.099462365591398,
"grad_norm": 0.5189318656921387,
"learning_rate": 3.6747834440015294e-05,
"loss": 0.7124,
"num_input_tokens_seen": 1723808,
"step": 4575
},
{
"epoch": 4.103942652329749,
"grad_norm": 0.6294048428535461,
"learning_rate": 3.671330790095417e-05,
"loss": 0.6556,
"num_input_tokens_seen": 1725696,
"step": 4580
},
{
"epoch": 4.108422939068101,
"grad_norm": 0.5305403470993042,
"learning_rate": 3.667875271313386e-05,
"loss": 0.719,
"num_input_tokens_seen": 1727584,
"step": 4585
},
{
"epoch": 4.112903225806452,
"grad_norm": 0.7114974856376648,
"learning_rate": 3.664416896107047e-05,
"loss": 0.7206,
"num_input_tokens_seen": 1729568,
"step": 4590
},
{
"epoch": 4.117383512544803,
"grad_norm": 0.4398999512195587,
"learning_rate": 3.660955672934998e-05,
"loss": 0.6858,
"num_input_tokens_seen": 1731328,
"step": 4595
},
{
"epoch": 4.121863799283154,
"grad_norm": 0.550876259803772,
"learning_rate": 3.657491610262802e-05,
"loss": 0.6637,
"num_input_tokens_seen": 1733344,
"step": 4600
},
{
"epoch": 4.126344086021505,
"grad_norm": 0.7345083951950073,
"learning_rate": 3.654024716562968e-05,
"loss": 0.6746,
"num_input_tokens_seen": 1735232,
"step": 4605
},
{
"epoch": 4.130824372759856,
"grad_norm": 0.5149017572402954,
"learning_rate": 3.650555000314927e-05,
"loss": 0.6852,
"num_input_tokens_seen": 1737248,
"step": 4610
},
{
"epoch": 4.135304659498208,
"grad_norm": 0.7252482175827026,
"learning_rate": 3.6470824700050155e-05,
"loss": 0.7039,
"num_input_tokens_seen": 1739264,
"step": 4615
},
{
"epoch": 4.139784946236559,
"grad_norm": 0.5596062541007996,
"learning_rate": 3.643607134126452e-05,
"loss": 0.7211,
"num_input_tokens_seen": 1741184,
"step": 4620
},
{
"epoch": 4.14426523297491,
"grad_norm": 0.5212526321411133,
"learning_rate": 3.6401290011793185e-05,
"loss": 0.6673,
"num_input_tokens_seen": 1742976,
"step": 4625
},
{
"epoch": 4.148745519713262,
"grad_norm": 0.5234253406524658,
"learning_rate": 3.636648079670534e-05,
"loss": 0.6706,
"num_input_tokens_seen": 1744832,
"step": 4630
},
{
"epoch": 4.153225806451613,
"grad_norm": 0.6067260503768921,
"learning_rate": 3.6331643781138426e-05,
"loss": 0.7152,
"num_input_tokens_seen": 1746592,
"step": 4635
},
{
"epoch": 4.157706093189964,
"grad_norm": 0.8035642504692078,
"learning_rate": 3.629677905029785e-05,
"loss": 0.6924,
"num_input_tokens_seen": 1748384,
"step": 4640
},
{
"epoch": 4.162186379928316,
"grad_norm": 0.5164393782615662,
"learning_rate": 3.626188668945683e-05,
"loss": 0.7261,
"num_input_tokens_seen": 1750272,
"step": 4645
},
{
"epoch": 4.166666666666667,
"grad_norm": 0.3963463306427002,
"learning_rate": 3.622696678395613e-05,
"loss": 0.6577,
"num_input_tokens_seen": 1752128,
"step": 4650
},
{
"epoch": 4.171146953405018,
"grad_norm": 0.7539615631103516,
"learning_rate": 3.619201941920389e-05,
"loss": 0.6858,
"num_input_tokens_seen": 1754112,
"step": 4655
},
{
"epoch": 4.175627240143369,
"grad_norm": 0.8456557393074036,
"learning_rate": 3.615704468067545e-05,
"loss": 0.7315,
"num_input_tokens_seen": 1755936,
"step": 4660
},
{
"epoch": 4.18010752688172,
"grad_norm": 0.5629037618637085,
"learning_rate": 3.612204265391306e-05,
"loss": 0.71,
"num_input_tokens_seen": 1757792,
"step": 4665
},
{
"epoch": 4.184587813620071,
"grad_norm": 0.3057938516139984,
"learning_rate": 3.608701342452573e-05,
"loss": 0.6738,
"num_input_tokens_seen": 1759680,
"step": 4670
},
{
"epoch": 4.189068100358423,
"grad_norm": 0.7656651735305786,
"learning_rate": 3.605195707818898e-05,
"loss": 0.7177,
"num_input_tokens_seen": 1761568,
"step": 4675
},
{
"epoch": 4.193548387096774,
"grad_norm": 0.837243914604187,
"learning_rate": 3.6016873700644685e-05,
"loss": 0.7361,
"num_input_tokens_seen": 1763488,
"step": 4680
},
{
"epoch": 4.198028673835125,
"grad_norm": 0.49771350622177124,
"learning_rate": 3.598176337770082e-05,
"loss": 0.7004,
"num_input_tokens_seen": 1765440,
"step": 4685
},
{
"epoch": 4.202508960573477,
"grad_norm": 0.6064881086349487,
"learning_rate": 3.594662619523127e-05,
"loss": 0.6879,
"num_input_tokens_seen": 1767296,
"step": 4690
},
{
"epoch": 4.206989247311828,
"grad_norm": 0.45831379294395447,
"learning_rate": 3.5911462239175595e-05,
"loss": 0.6758,
"num_input_tokens_seen": 1769248,
"step": 4695
},
{
"epoch": 4.211469534050179,
"grad_norm": 0.3446964919567108,
"learning_rate": 3.587627159553886e-05,
"loss": 0.6909,
"num_input_tokens_seen": 1770976,
"step": 4700
},
{
"epoch": 4.215949820788531,
"grad_norm": 0.4302808940410614,
"learning_rate": 3.5841054350391386e-05,
"loss": 0.6985,
"num_input_tokens_seen": 1772960,
"step": 4705
},
{
"epoch": 4.220430107526882,
"grad_norm": 0.737852156162262,
"learning_rate": 3.580581058986858e-05,
"loss": 0.7044,
"num_input_tokens_seen": 1774752,
"step": 4710
},
{
"epoch": 4.224910394265233,
"grad_norm": 0.5260282754898071,
"learning_rate": 3.5770540400170675e-05,
"loss": 0.6993,
"num_input_tokens_seen": 1776800,
"step": 4715
},
{
"epoch": 4.229390681003585,
"grad_norm": 0.5473984479904175,
"learning_rate": 3.573524386756256e-05,
"loss": 0.7054,
"num_input_tokens_seen": 1778752,
"step": 4720
},
{
"epoch": 4.233870967741935,
"grad_norm": 0.45589348673820496,
"learning_rate": 3.569992107837356e-05,
"loss": 0.6958,
"num_input_tokens_seen": 1780736,
"step": 4725
},
{
"epoch": 4.238351254480286,
"grad_norm": 0.5350434184074402,
"learning_rate": 3.56645721189972e-05,
"loss": 0.6803,
"num_input_tokens_seen": 1782688,
"step": 4730
},
{
"epoch": 4.242831541218638,
"grad_norm": 0.761256992816925,
"learning_rate": 3.562919707589102e-05,
"loss": 0.7072,
"num_input_tokens_seen": 1784416,
"step": 4735
},
{
"epoch": 4.247311827956989,
"grad_norm": 0.6957365870475769,
"learning_rate": 3.5593796035576373e-05,
"loss": 0.668,
"num_input_tokens_seen": 1786432,
"step": 4740
},
{
"epoch": 4.25179211469534,
"grad_norm": 0.5861120223999023,
"learning_rate": 3.555836908463817e-05,
"loss": 0.6732,
"num_input_tokens_seen": 1788352,
"step": 4745
},
{
"epoch": 4.256272401433692,
"grad_norm": 0.5068464279174805,
"learning_rate": 3.552291630972472e-05,
"loss": 0.6916,
"num_input_tokens_seen": 1790208,
"step": 4750
},
{
"epoch": 4.260752688172043,
"grad_norm": 0.38968732953071594,
"learning_rate": 3.5487437797547476e-05,
"loss": 0.7119,
"num_input_tokens_seen": 1792032,
"step": 4755
},
{
"epoch": 4.265232974910394,
"grad_norm": 0.5201514363288879,
"learning_rate": 3.545193363488085e-05,
"loss": 0.7201,
"num_input_tokens_seen": 1793792,
"step": 4760
},
{
"epoch": 4.269713261648746,
"grad_norm": 0.6711928248405457,
"learning_rate": 3.5416403908561966e-05,
"loss": 0.7008,
"num_input_tokens_seen": 1795712,
"step": 4765
},
{
"epoch": 4.274193548387097,
"grad_norm": 0.5274402499198914,
"learning_rate": 3.538084870549052e-05,
"loss": 0.7237,
"num_input_tokens_seen": 1797536,
"step": 4770
},
{
"epoch": 4.278673835125448,
"grad_norm": 0.5370923280715942,
"learning_rate": 3.534526811262848e-05,
"loss": 0.7051,
"num_input_tokens_seen": 1799392,
"step": 4775
},
{
"epoch": 4.2831541218638,
"grad_norm": 0.5397687554359436,
"learning_rate": 3.530966221699992e-05,
"loss": 0.7353,
"num_input_tokens_seen": 1801184,
"step": 4780
},
{
"epoch": 4.287634408602151,
"grad_norm": 0.5232270956039429,
"learning_rate": 3.5274031105690826e-05,
"loss": 0.7115,
"num_input_tokens_seen": 1803104,
"step": 4785
},
{
"epoch": 4.292114695340501,
"grad_norm": 0.6073275208473206,
"learning_rate": 3.523837486584881e-05,
"loss": 0.6764,
"num_input_tokens_seen": 1805024,
"step": 4790
},
{
"epoch": 4.296594982078853,
"grad_norm": 0.5557631254196167,
"learning_rate": 3.5202693584682986e-05,
"loss": 0.6845,
"num_input_tokens_seen": 1806848,
"step": 4795
},
{
"epoch": 4.301075268817204,
"grad_norm": 0.45816871523857117,
"learning_rate": 3.51669873494637e-05,
"loss": 0.7111,
"num_input_tokens_seen": 1808832,
"step": 4800
},
{
"epoch": 4.305555555555555,
"grad_norm": 0.3697778880596161,
"learning_rate": 3.513125624752232e-05,
"loss": 0.7015,
"num_input_tokens_seen": 1810656,
"step": 4805
},
{
"epoch": 4.310035842293907,
"grad_norm": 0.5849930644035339,
"learning_rate": 3.509550036625106e-05,
"loss": 0.7011,
"num_input_tokens_seen": 1812512,
"step": 4810
},
{
"epoch": 4.314516129032258,
"grad_norm": 0.597547709941864,
"learning_rate": 3.5059719793102716e-05,
"loss": 0.7366,
"num_input_tokens_seen": 1814336,
"step": 4815
},
{
"epoch": 4.318996415770609,
"grad_norm": 0.6377572417259216,
"learning_rate": 3.502391461559049e-05,
"loss": 0.7249,
"num_input_tokens_seen": 1816288,
"step": 4820
},
{
"epoch": 4.323476702508961,
"grad_norm": 0.43655264377593994,
"learning_rate": 3.498808492128776e-05,
"loss": 0.7186,
"num_input_tokens_seen": 1818144,
"step": 4825
},
{
"epoch": 4.327956989247312,
"grad_norm": 0.5869845747947693,
"learning_rate": 3.495223079782785e-05,
"loss": 0.6415,
"num_input_tokens_seen": 1820224,
"step": 4830
},
{
"epoch": 4.332437275985663,
"grad_norm": 0.3732021152973175,
"learning_rate": 3.491635233290387e-05,
"loss": 0.6636,
"num_input_tokens_seen": 1822048,
"step": 4835
},
{
"epoch": 4.336917562724015,
"grad_norm": 0.5236073136329651,
"learning_rate": 3.488044961426843e-05,
"loss": 0.6605,
"num_input_tokens_seen": 1824000,
"step": 4840
},
{
"epoch": 4.341397849462366,
"grad_norm": 0.4687046408653259,
"learning_rate": 3.484452272973347e-05,
"loss": 0.6923,
"num_input_tokens_seen": 1825856,
"step": 4845
},
{
"epoch": 4.345878136200717,
"grad_norm": 0.4935377240180969,
"learning_rate": 3.480857176717005e-05,
"loss": 0.7204,
"num_input_tokens_seen": 1827776,
"step": 4850
},
{
"epoch": 4.350358422939068,
"grad_norm": 0.5924071073532104,
"learning_rate": 3.4772596814508104e-05,
"loss": 0.7215,
"num_input_tokens_seen": 1829600,
"step": 4855
},
{
"epoch": 4.354838709677419,
"grad_norm": 0.42385607957839966,
"learning_rate": 3.473659795973626e-05,
"loss": 0.6406,
"num_input_tokens_seen": 1831360,
"step": 4860
},
{
"epoch": 4.35931899641577,
"grad_norm": 0.41054651141166687,
"learning_rate": 3.470057529090159e-05,
"loss": 0.7,
"num_input_tokens_seen": 1833152,
"step": 4865
},
{
"epoch": 4.363799283154122,
"grad_norm": 0.7326763868331909,
"learning_rate": 3.46645288961094e-05,
"loss": 0.7314,
"num_input_tokens_seen": 1834976,
"step": 4870
},
{
"epoch": 4.368279569892473,
"grad_norm": 0.32546576857566833,
"learning_rate": 3.462845886352306e-05,
"loss": 0.6558,
"num_input_tokens_seen": 1837024,
"step": 4875
},
{
"epoch": 4.372759856630824,
"grad_norm": 0.563502848148346,
"learning_rate": 3.4592365281363734e-05,
"loss": 0.7179,
"num_input_tokens_seen": 1838848,
"step": 4880
},
{
"epoch": 4.377240143369176,
"grad_norm": 0.4499485492706299,
"learning_rate": 3.455624823791018e-05,
"loss": 0.685,
"num_input_tokens_seen": 1840640,
"step": 4885
},
{
"epoch": 4.381720430107527,
"grad_norm": 0.6014490723609924,
"learning_rate": 3.4520107821498544e-05,
"loss": 0.7341,
"num_input_tokens_seen": 1842752,
"step": 4890
},
{
"epoch": 4.386200716845878,
"grad_norm": 0.4780343174934387,
"learning_rate": 3.448394412052215e-05,
"loss": 0.7098,
"num_input_tokens_seen": 1844480,
"step": 4895
},
{
"epoch": 4.39068100358423,
"grad_norm": 0.39559486508369446,
"learning_rate": 3.444775722343124e-05,
"loss": 0.6782,
"num_input_tokens_seen": 1846240,
"step": 4900
},
{
"epoch": 4.395161290322581,
"grad_norm": 0.6034467220306396,
"learning_rate": 3.441154721873284e-05,
"loss": 0.7097,
"num_input_tokens_seen": 1848064,
"step": 4905
},
{
"epoch": 4.399641577060932,
"grad_norm": 0.5184746980667114,
"learning_rate": 3.437531419499043e-05,
"loss": 0.6952,
"num_input_tokens_seen": 1850048,
"step": 4910
},
{
"epoch": 4.404121863799283,
"grad_norm": 0.6733912229537964,
"learning_rate": 3.4339058240823843e-05,
"loss": 0.6948,
"num_input_tokens_seen": 1852032,
"step": 4915
},
{
"epoch": 4.408602150537634,
"grad_norm": 0.6464542746543884,
"learning_rate": 3.430277944490898e-05,
"loss": 0.7098,
"num_input_tokens_seen": 1853888,
"step": 4920
},
{
"epoch": 4.413082437275985,
"grad_norm": 0.39893773198127747,
"learning_rate": 3.42664778959776e-05,
"loss": 0.695,
"num_input_tokens_seen": 1855776,
"step": 4925
},
{
"epoch": 4.417562724014337,
"grad_norm": 0.6849373579025269,
"learning_rate": 3.423015368281711e-05,
"loss": 0.7076,
"num_input_tokens_seen": 1857600,
"step": 4930
},
{
"epoch": 4.422043010752688,
"grad_norm": 0.5464024543762207,
"learning_rate": 3.419380689427038e-05,
"loss": 0.7138,
"num_input_tokens_seen": 1859520,
"step": 4935
},
{
"epoch": 4.426523297491039,
"grad_norm": 0.532284677028656,
"learning_rate": 3.415743761923546e-05,
"loss": 0.6927,
"num_input_tokens_seen": 1861440,
"step": 4940
},
{
"epoch": 4.431003584229391,
"grad_norm": 0.45980408787727356,
"learning_rate": 3.412104594666541e-05,
"loss": 0.7577,
"num_input_tokens_seen": 1863200,
"step": 4945
},
{
"epoch": 4.435483870967742,
"grad_norm": 0.506994903087616,
"learning_rate": 3.408463196556807e-05,
"loss": 0.6934,
"num_input_tokens_seen": 1865120,
"step": 4950
},
{
"epoch": 4.439964157706093,
"grad_norm": 0.5589765310287476,
"learning_rate": 3.404819576500586e-05,
"loss": 0.7261,
"num_input_tokens_seen": 1867136,
"step": 4955
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.448844313621521,
"learning_rate": 3.401173743409552e-05,
"loss": 0.6661,
"num_input_tokens_seen": 1868864,
"step": 4960
},
{
"epoch": 4.448924731182796,
"grad_norm": 0.3504087030887604,
"learning_rate": 3.397525706200793e-05,
"loss": 0.7071,
"num_input_tokens_seen": 1870720,
"step": 4965
},
{
"epoch": 4.453405017921147,
"grad_norm": 0.44309166073799133,
"learning_rate": 3.393875473796787e-05,
"loss": 0.6904,
"num_input_tokens_seen": 1872640,
"step": 4970
},
{
"epoch": 4.457885304659499,
"grad_norm": 0.6156589984893799,
"learning_rate": 3.390223055125383e-05,
"loss": 0.7403,
"num_input_tokens_seen": 1874528,
"step": 4975
},
{
"epoch": 4.462365591397849,
"grad_norm": 0.5609185695648193,
"learning_rate": 3.3865684591197745e-05,
"loss": 0.7005,
"num_input_tokens_seen": 1876416,
"step": 4980
},
{
"epoch": 4.4668458781362,
"grad_norm": 0.5331929922103882,
"learning_rate": 3.3829116947184823e-05,
"loss": 0.6866,
"num_input_tokens_seen": 1878176,
"step": 4985
},
{
"epoch": 4.471326164874552,
"grad_norm": 0.49103447794914246,
"learning_rate": 3.379252770865331e-05,
"loss": 0.7062,
"num_input_tokens_seen": 1879968,
"step": 4990
},
{
"epoch": 4.475806451612903,
"grad_norm": 0.4910992681980133,
"learning_rate": 3.375591696509425e-05,
"loss": 0.7213,
"num_input_tokens_seen": 1881760,
"step": 4995
},
{
"epoch": 4.480286738351254,
"grad_norm": 0.8638359904289246,
"learning_rate": 3.371928480605131e-05,
"loss": 0.7325,
"num_input_tokens_seen": 1883648,
"step": 5000
},
{
"epoch": 4.484767025089606,
"grad_norm": 0.590151846408844,
"learning_rate": 3.3682631321120504e-05,
"loss": 0.669,
"num_input_tokens_seen": 1885696,
"step": 5005
},
{
"epoch": 4.489247311827957,
"grad_norm": 0.5797485709190369,
"learning_rate": 3.3645956599950044e-05,
"loss": 0.6884,
"num_input_tokens_seen": 1887488,
"step": 5010
},
{
"epoch": 4.493727598566308,
"grad_norm": 0.5620754361152649,
"learning_rate": 3.360926073224004e-05,
"loss": 0.6982,
"num_input_tokens_seen": 1889472,
"step": 5015
},
{
"epoch": 4.49820788530466,
"grad_norm": 0.5586421489715576,
"learning_rate": 3.3572543807742364e-05,
"loss": 0.6778,
"num_input_tokens_seen": 1891360,
"step": 5020
},
{
"epoch": 4.5,
"eval_loss": 0.701896607875824,
"eval_runtime": 5.6352,
"eval_samples_per_second": 88.018,
"eval_steps_per_second": 22.005,
"num_input_tokens_seen": 1892160,
"step": 5022
},
{
"epoch": 4.502688172043011,
"grad_norm": 0.6140494346618652,
"learning_rate": 3.3535805916260346e-05,
"loss": 0.7061,
"num_input_tokens_seen": 1893312,
"step": 5025
},
{
"epoch": 4.507168458781362,
"grad_norm": 0.5332615375518799,
"learning_rate": 3.3499047147648645e-05,
"loss": 0.6872,
"num_input_tokens_seen": 1895072,
"step": 5030
},
{
"epoch": 4.511648745519714,
"grad_norm": 0.43873006105422974,
"learning_rate": 3.346226759181294e-05,
"loss": 0.699,
"num_input_tokens_seen": 1896928,
"step": 5035
},
{
"epoch": 4.516129032258064,
"grad_norm": 0.5747973918914795,
"learning_rate": 3.342546733870977e-05,
"loss": 0.6896,
"num_input_tokens_seen": 1898816,
"step": 5040
},
{
"epoch": 4.520609318996415,
"grad_norm": 0.5600061416625977,
"learning_rate": 3.338864647834631e-05,
"loss": 0.6712,
"num_input_tokens_seen": 1900672,
"step": 5045
},
{
"epoch": 4.525089605734767,
"grad_norm": 0.4608553946018219,
"learning_rate": 3.335180510078012e-05,
"loss": 0.6479,
"num_input_tokens_seen": 1902528,
"step": 5050
},
{
"epoch": 4.529569892473118,
"grad_norm": 0.5437533855438232,
"learning_rate": 3.331494329611894e-05,
"loss": 0.7082,
"num_input_tokens_seen": 1904672,
"step": 5055
},
{
"epoch": 4.534050179211469,
"grad_norm": 0.8298529386520386,
"learning_rate": 3.327806115452046e-05,
"loss": 0.7172,
"num_input_tokens_seen": 1906720,
"step": 5060
},
{
"epoch": 4.538530465949821,
"grad_norm": 0.6532770991325378,
"learning_rate": 3.324115876619215e-05,
"loss": 0.7106,
"num_input_tokens_seen": 1908544,
"step": 5065
},
{
"epoch": 4.543010752688172,
"grad_norm": 0.6913199424743652,
"learning_rate": 3.3204236221390975e-05,
"loss": 0.7495,
"num_input_tokens_seen": 1910496,
"step": 5070
},
{
"epoch": 4.547491039426523,
"grad_norm": 0.5206092000007629,
"learning_rate": 3.316729361042319e-05,
"loss": 0.679,
"num_input_tokens_seen": 1912288,
"step": 5075
},
{
"epoch": 4.551971326164875,
"grad_norm": 0.46319350600242615,
"learning_rate": 3.3130331023644134e-05,
"loss": 0.7173,
"num_input_tokens_seen": 1914208,
"step": 5080
},
{
"epoch": 4.556451612903226,
"grad_norm": 0.7394029498100281,
"learning_rate": 3.309334855145803e-05,
"loss": 0.6766,
"num_input_tokens_seen": 1915968,
"step": 5085
},
{
"epoch": 4.560931899641577,
"grad_norm": 0.7033692002296448,
"learning_rate": 3.30563462843177e-05,
"loss": 0.7051,
"num_input_tokens_seen": 1917824,
"step": 5090
},
{
"epoch": 4.565412186379929,
"grad_norm": 0.7795708179473877,
"learning_rate": 3.301932431272439e-05,
"loss": 0.6573,
"num_input_tokens_seen": 1919808,
"step": 5095
},
{
"epoch": 4.56989247311828,
"grad_norm": 0.45599478483200073,
"learning_rate": 3.2982282727227565e-05,
"loss": 0.7036,
"num_input_tokens_seen": 1921728,
"step": 5100
},
{
"epoch": 4.574372759856631,
"grad_norm": 0.8516075611114502,
"learning_rate": 3.294522161842463e-05,
"loss": 0.7271,
"num_input_tokens_seen": 1923584,
"step": 5105
},
{
"epoch": 4.578853046594982,
"grad_norm": 0.6737523078918457,
"learning_rate": 3.2908141076960766e-05,
"loss": 0.727,
"num_input_tokens_seen": 1925504,
"step": 5110
},
{
"epoch": 4.583333333333333,
"grad_norm": 0.5484982132911682,
"learning_rate": 3.287104119352867e-05,
"loss": 0.6958,
"num_input_tokens_seen": 1927360,
"step": 5115
},
{
"epoch": 4.587813620071684,
"grad_norm": 0.6374887824058533,
"learning_rate": 3.283392205886833e-05,
"loss": 0.7413,
"num_input_tokens_seen": 1929376,
"step": 5120
},
{
"epoch": 4.592293906810036,
"grad_norm": 0.6388539671897888,
"learning_rate": 3.279678376376686e-05,
"loss": 0.6703,
"num_input_tokens_seen": 1931232,
"step": 5125
},
{
"epoch": 4.596774193548387,
"grad_norm": 0.5705585479736328,
"learning_rate": 3.2759626399058196e-05,
"loss": 0.6992,
"num_input_tokens_seen": 1933056,
"step": 5130
},
{
"epoch": 4.601254480286738,
"grad_norm": 0.45625385642051697,
"learning_rate": 3.2722450055622946e-05,
"loss": 0.6908,
"num_input_tokens_seen": 1935040,
"step": 5135
},
{
"epoch": 4.60573476702509,
"grad_norm": 0.4561392068862915,
"learning_rate": 3.268525482438813e-05,
"loss": 0.6669,
"num_input_tokens_seen": 1937056,
"step": 5140
},
{
"epoch": 4.610215053763441,
"grad_norm": 0.5542040467262268,
"learning_rate": 3.264804079632693e-05,
"loss": 0.6967,
"num_input_tokens_seen": 1938944,
"step": 5145
},
{
"epoch": 4.614695340501792,
"grad_norm": 0.4966046214103699,
"learning_rate": 3.2610808062458554e-05,
"loss": 0.7079,
"num_input_tokens_seen": 1940768,
"step": 5150
},
{
"epoch": 4.619175627240144,
"grad_norm": 0.5149814486503601,
"learning_rate": 3.257355671384794e-05,
"loss": 0.6761,
"num_input_tokens_seen": 1942560,
"step": 5155
},
{
"epoch": 4.623655913978495,
"grad_norm": 0.44616055488586426,
"learning_rate": 3.253628684160554e-05,
"loss": 0.7097,
"num_input_tokens_seen": 1944480,
"step": 5160
},
{
"epoch": 4.628136200716845,
"grad_norm": 0.7818517088890076,
"learning_rate": 3.2498998536887114e-05,
"loss": 0.7135,
"num_input_tokens_seen": 1946336,
"step": 5165
},
{
"epoch": 4.632616487455197,
"grad_norm": 0.46605023741722107,
"learning_rate": 3.246169189089354e-05,
"loss": 0.6895,
"num_input_tokens_seen": 1948064,
"step": 5170
},
{
"epoch": 4.637096774193548,
"grad_norm": 0.6193602681159973,
"learning_rate": 3.2424366994870515e-05,
"loss": 0.6853,
"num_input_tokens_seen": 1949952,
"step": 5175
},
{
"epoch": 4.641577060931899,
"grad_norm": 0.5214295387268066,
"learning_rate": 3.238702394010839e-05,
"loss": 0.6746,
"num_input_tokens_seen": 1951680,
"step": 5180
},
{
"epoch": 4.646057347670251,
"grad_norm": 0.4216688573360443,
"learning_rate": 3.234966281794193e-05,
"loss": 0.6882,
"num_input_tokens_seen": 1953472,
"step": 5185
},
{
"epoch": 4.650537634408602,
"grad_norm": 0.46460291743278503,
"learning_rate": 3.231228371975007e-05,
"loss": 0.7436,
"num_input_tokens_seen": 1955328,
"step": 5190
},
{
"epoch": 4.655017921146953,
"grad_norm": 0.6976000666618347,
"learning_rate": 3.2274886736955744e-05,
"loss": 0.6798,
"num_input_tokens_seen": 1957184,
"step": 5195
},
{
"epoch": 4.659498207885305,
"grad_norm": 0.5898621678352356,
"learning_rate": 3.223747196102561e-05,
"loss": 0.7066,
"num_input_tokens_seen": 1959040,
"step": 5200
},
{
"epoch": 4.663978494623656,
"grad_norm": 0.5544499754905701,
"learning_rate": 3.220003948346984e-05,
"loss": 0.6431,
"num_input_tokens_seen": 1961088,
"step": 5205
},
{
"epoch": 4.668458781362007,
"grad_norm": 0.8915784955024719,
"learning_rate": 3.216258939584192e-05,
"loss": 0.698,
"num_input_tokens_seen": 1962752,
"step": 5210
},
{
"epoch": 4.672939068100359,
"grad_norm": 0.7086608409881592,
"learning_rate": 3.2125121789738384e-05,
"loss": 0.7153,
"num_input_tokens_seen": 1964704,
"step": 5215
},
{
"epoch": 4.67741935483871,
"grad_norm": 0.4128365218639374,
"learning_rate": 3.2087636756798635e-05,
"loss": 0.7051,
"num_input_tokens_seen": 1966688,
"step": 5220
},
{
"epoch": 4.681899641577061,
"grad_norm": 0.577278196811676,
"learning_rate": 3.205013438870468e-05,
"loss": 0.7005,
"num_input_tokens_seen": 1968480,
"step": 5225
},
{
"epoch": 4.686379928315413,
"grad_norm": 0.3509978652000427,
"learning_rate": 3.201261477718093e-05,
"loss": 0.7304,
"num_input_tokens_seen": 1970304,
"step": 5230
},
{
"epoch": 4.690860215053764,
"grad_norm": 0.6244447827339172,
"learning_rate": 3.197507801399399e-05,
"loss": 0.709,
"num_input_tokens_seen": 1972224,
"step": 5235
},
{
"epoch": 4.695340501792114,
"grad_norm": 0.6282808780670166,
"learning_rate": 3.193752419095239e-05,
"loss": 0.7164,
"num_input_tokens_seen": 1974016,
"step": 5240
},
{
"epoch": 4.699820788530466,
"grad_norm": 0.6692128777503967,
"learning_rate": 3.18999533999064e-05,
"loss": 0.6799,
"num_input_tokens_seen": 1975840,
"step": 5245
},
{
"epoch": 4.704301075268817,
"grad_norm": 0.7175477147102356,
"learning_rate": 3.186236573274779e-05,
"loss": 0.6818,
"num_input_tokens_seen": 1977728,
"step": 5250
},
{
"epoch": 4.708781362007168,
"grad_norm": 0.5985310673713684,
"learning_rate": 3.1824761281409574e-05,
"loss": 0.6939,
"num_input_tokens_seen": 1979776,
"step": 5255
},
{
"epoch": 4.71326164874552,
"grad_norm": 0.5040144920349121,
"learning_rate": 3.178714013786587e-05,
"loss": 0.6917,
"num_input_tokens_seen": 1981728,
"step": 5260
},
{
"epoch": 4.717741935483871,
"grad_norm": 0.5648061037063599,
"learning_rate": 3.174950239413161e-05,
"loss": 0.7029,
"num_input_tokens_seen": 1983776,
"step": 5265
},
{
"epoch": 4.722222222222222,
"grad_norm": 0.5307470560073853,
"learning_rate": 3.171184814226228e-05,
"loss": 0.7299,
"num_input_tokens_seen": 1985632,
"step": 5270
},
{
"epoch": 4.726702508960574,
"grad_norm": 0.5408966541290283,
"learning_rate": 3.167417747435379e-05,
"loss": 0.7017,
"num_input_tokens_seen": 1987456,
"step": 5275
},
{
"epoch": 4.731182795698925,
"grad_norm": 0.603818416595459,
"learning_rate": 3.16364904825422e-05,
"loss": 0.7246,
"num_input_tokens_seen": 1989344,
"step": 5280
},
{
"epoch": 4.735663082437276,
"grad_norm": 0.8000281453132629,
"learning_rate": 3.1598787259003476e-05,
"loss": 0.6887,
"num_input_tokens_seen": 1991232,
"step": 5285
},
{
"epoch": 4.740143369175628,
"grad_norm": 0.74366295337677,
"learning_rate": 3.1561067895953276e-05,
"loss": 0.6743,
"num_input_tokens_seen": 1993216,
"step": 5290
},
{
"epoch": 4.744623655913978,
"grad_norm": 0.6478007435798645,
"learning_rate": 3.152333248564677e-05,
"loss": 0.692,
"num_input_tokens_seen": 1995040,
"step": 5295
},
{
"epoch": 4.749103942652329,
"grad_norm": 0.6803041696548462,
"learning_rate": 3.148558112037835e-05,
"loss": 0.6901,
"num_input_tokens_seen": 1996928,
"step": 5300
},
{
"epoch": 4.753584229390681,
"grad_norm": 0.4506027102470398,
"learning_rate": 3.1447813892481425e-05,
"loss": 0.7007,
"num_input_tokens_seen": 1998976,
"step": 5305
},
{
"epoch": 4.758064516129032,
"grad_norm": 0.4170258343219757,
"learning_rate": 3.141003089432822e-05,
"loss": 0.6954,
"num_input_tokens_seen": 2000864,
"step": 5310
},
{
"epoch": 4.762544802867383,
"grad_norm": 0.5882114171981812,
"learning_rate": 3.137223221832951e-05,
"loss": 0.6908,
"num_input_tokens_seen": 2002688,
"step": 5315
},
{
"epoch": 4.767025089605735,
"grad_norm": 0.4571791887283325,
"learning_rate": 3.133441795693445e-05,
"loss": 0.7135,
"num_input_tokens_seen": 2004864,
"step": 5320
},
{
"epoch": 4.771505376344086,
"grad_norm": 0.6060591340065002,
"learning_rate": 3.129658820263028e-05,
"loss": 0.7044,
"num_input_tokens_seen": 2006880,
"step": 5325
},
{
"epoch": 4.775985663082437,
"grad_norm": 0.5432692170143127,
"learning_rate": 3.125874304794214e-05,
"loss": 0.6767,
"num_input_tokens_seen": 2008704,
"step": 5330
},
{
"epoch": 4.780465949820789,
"grad_norm": 0.7855601906776428,
"learning_rate": 3.122088258543287e-05,
"loss": 0.7241,
"num_input_tokens_seen": 2010592,
"step": 5335
},
{
"epoch": 4.78494623655914,
"grad_norm": 0.5748709440231323,
"learning_rate": 3.1183006907702684e-05,
"loss": 0.7007,
"num_input_tokens_seen": 2012448,
"step": 5340
},
{
"epoch": 4.789426523297491,
"grad_norm": 0.6286282539367676,
"learning_rate": 3.114511610738907e-05,
"loss": 0.708,
"num_input_tokens_seen": 2014208,
"step": 5345
},
{
"epoch": 4.793906810035843,
"grad_norm": 0.5121644735336304,
"learning_rate": 3.110721027716649e-05,
"loss": 0.7078,
"num_input_tokens_seen": 2016032,
"step": 5350
},
{
"epoch": 4.798387096774194,
"grad_norm": 0.5875132083892822,
"learning_rate": 3.106928950974614e-05,
"loss": 0.6871,
"num_input_tokens_seen": 2017920,
"step": 5355
},
{
"epoch": 4.802867383512545,
"grad_norm": 0.5885509252548218,
"learning_rate": 3.103135389787578e-05,
"loss": 0.7209,
"num_input_tokens_seen": 2019936,
"step": 5360
},
{
"epoch": 4.807347670250896,
"grad_norm": 0.6623149514198303,
"learning_rate": 3.099340353433946e-05,
"loss": 0.7188,
"num_input_tokens_seen": 2021824,
"step": 5365
},
{
"epoch": 4.811827956989247,
"grad_norm": 0.5710822939872742,
"learning_rate": 3.095543851195732e-05,
"loss": 0.6984,
"num_input_tokens_seen": 2023904,
"step": 5370
},
{
"epoch": 4.816308243727598,
"grad_norm": 0.42584097385406494,
"learning_rate": 3.091745892358535e-05,
"loss": 0.7011,
"num_input_tokens_seen": 2025728,
"step": 5375
},
{
"epoch": 4.82078853046595,
"grad_norm": 0.5550757050514221,
"learning_rate": 3.087946486211515e-05,
"loss": 0.705,
"num_input_tokens_seen": 2027520,
"step": 5380
},
{
"epoch": 4.825268817204301,
"grad_norm": 0.5160370469093323,
"learning_rate": 3.084145642047374e-05,
"loss": 0.6821,
"num_input_tokens_seen": 2029568,
"step": 5385
},
{
"epoch": 4.829749103942652,
"grad_norm": 0.6525334119796753,
"learning_rate": 3.080343369162332e-05,
"loss": 0.6815,
"num_input_tokens_seen": 2031552,
"step": 5390
},
{
"epoch": 4.834229390681004,
"grad_norm": 0.4453662633895874,
"learning_rate": 3.076539676856101e-05,
"loss": 0.6928,
"num_input_tokens_seen": 2033472,
"step": 5395
},
{
"epoch": 4.838709677419355,
"grad_norm": 0.4875478744506836,
"learning_rate": 3.0727345744318645e-05,
"loss": 0.678,
"num_input_tokens_seen": 2035424,
"step": 5400
},
{
"epoch": 4.843189964157706,
"grad_norm": 0.409298837184906,
"learning_rate": 3.068928071196256e-05,
"loss": 0.7295,
"num_input_tokens_seen": 2037248,
"step": 5405
},
{
"epoch": 4.847670250896058,
"grad_norm": 0.458920955657959,
"learning_rate": 3.065120176459338e-05,
"loss": 0.6985,
"num_input_tokens_seen": 2039040,
"step": 5410
},
{
"epoch": 4.852150537634409,
"grad_norm": 0.4822891354560852,
"learning_rate": 3.0613108995345694e-05,
"loss": 0.6806,
"num_input_tokens_seen": 2041152,
"step": 5415
},
{
"epoch": 4.856630824372759,
"grad_norm": 0.7133973836898804,
"learning_rate": 3.057500249738796e-05,
"loss": 0.7237,
"num_input_tokens_seen": 2043072,
"step": 5420
},
{
"epoch": 4.861111111111111,
"grad_norm": 0.5981613993644714,
"learning_rate": 3.053688236392219e-05,
"loss": 0.6778,
"num_input_tokens_seen": 2045088,
"step": 5425
},
{
"epoch": 4.865591397849462,
"grad_norm": 0.6646333932876587,
"learning_rate": 3.0498748688183744e-05,
"loss": 0.7015,
"num_input_tokens_seen": 2046912,
"step": 5430
},
{
"epoch": 4.870071684587813,
"grad_norm": 0.5133581161499023,
"learning_rate": 3.046060156344111e-05,
"loss": 0.6655,
"num_input_tokens_seen": 2048768,
"step": 5435
},
{
"epoch": 4.874551971326165,
"grad_norm": 0.555539608001709,
"learning_rate": 3.0422441082995667e-05,
"loss": 0.6783,
"num_input_tokens_seen": 2050624,
"step": 5440
},
{
"epoch": 4.879032258064516,
"grad_norm": 0.5734941363334656,
"learning_rate": 3.0384267340181462e-05,
"loss": 0.6662,
"num_input_tokens_seen": 2052608,
"step": 5445
},
{
"epoch": 4.883512544802867,
"grad_norm": 0.3394169509410858,
"learning_rate": 3.0346080428364974e-05,
"loss": 0.6761,
"num_input_tokens_seen": 2054368,
"step": 5450
},
{
"epoch": 4.887992831541219,
"grad_norm": 0.5142562389373779,
"learning_rate": 3.0307880440944902e-05,
"loss": 0.7261,
"num_input_tokens_seen": 2056448,
"step": 5455
},
{
"epoch": 4.89247311827957,
"grad_norm": 0.5017949938774109,
"learning_rate": 3.026966747135192e-05,
"loss": 0.6861,
"num_input_tokens_seen": 2058368,
"step": 5460
},
{
"epoch": 4.896953405017921,
"grad_norm": 0.5257229804992676,
"learning_rate": 3.023144161304844e-05,
"loss": 0.67,
"num_input_tokens_seen": 2060256,
"step": 5465
},
{
"epoch": 4.901433691756273,
"grad_norm": 0.4861137866973877,
"learning_rate": 3.0193202959528426e-05,
"loss": 0.6471,
"num_input_tokens_seen": 2062240,
"step": 5470
},
{
"epoch": 4.905913978494624,
"grad_norm": 0.5103967785835266,
"learning_rate": 3.0154951604317118e-05,
"loss": 0.7269,
"num_input_tokens_seen": 2063968,
"step": 5475
},
{
"epoch": 4.910394265232975,
"grad_norm": 0.634955883026123,
"learning_rate": 3.0116687640970814e-05,
"loss": 0.7047,
"num_input_tokens_seen": 2065920,
"step": 5480
},
{
"epoch": 4.914874551971327,
"grad_norm": 0.7920023202896118,
"learning_rate": 3.0078411163076682e-05,
"loss": 0.7021,
"num_input_tokens_seen": 2067808,
"step": 5485
},
{
"epoch": 4.919354838709677,
"grad_norm": 0.49357402324676514,
"learning_rate": 3.0040122264252457e-05,
"loss": 0.636,
"num_input_tokens_seen": 2069888,
"step": 5490
},
{
"epoch": 4.923835125448028,
"grad_norm": 0.581206738948822,
"learning_rate": 3.0001821038146287e-05,
"loss": 0.7512,
"num_input_tokens_seen": 2071712,
"step": 5495
},
{
"epoch": 4.92831541218638,
"grad_norm": 0.5302388668060303,
"learning_rate": 2.9963507578436456e-05,
"loss": 0.7503,
"num_input_tokens_seen": 2073536,
"step": 5500
},
{
"epoch": 4.932795698924731,
"grad_norm": 0.655440628528595,
"learning_rate": 2.9925181978831163e-05,
"loss": 0.6823,
"num_input_tokens_seen": 2075392,
"step": 5505
},
{
"epoch": 4.937275985663082,
"grad_norm": 0.4810947775840759,
"learning_rate": 2.9886844333068314e-05,
"loss": 0.6949,
"num_input_tokens_seen": 2077280,
"step": 5510
},
{
"epoch": 4.941756272401434,
"grad_norm": 0.4489869773387909,
"learning_rate": 2.9848494734915276e-05,
"loss": 0.737,
"num_input_tokens_seen": 2079360,
"step": 5515
},
{
"epoch": 4.946236559139785,
"grad_norm": 0.6105610728263855,
"learning_rate": 2.9810133278168643e-05,
"loss": 0.7341,
"num_input_tokens_seen": 2081216,
"step": 5520
},
{
"epoch": 4.950716845878136,
"grad_norm": 0.4917219281196594,
"learning_rate": 2.9771760056654e-05,
"loss": 0.7096,
"num_input_tokens_seen": 2082944,
"step": 5525
},
{
"epoch": 4.955197132616488,
"grad_norm": 0.5548819899559021,
"learning_rate": 2.973337516422574e-05,
"loss": 0.7047,
"num_input_tokens_seen": 2084768,
"step": 5530
},
{
"epoch": 4.959677419354839,
"grad_norm": 0.5685299634933472,
"learning_rate": 2.9694978694766767e-05,
"loss": 0.6785,
"num_input_tokens_seen": 2086752,
"step": 5535
},
{
"epoch": 4.96415770609319,
"grad_norm": 0.36095836758613586,
"learning_rate": 2.9656570742188332e-05,
"loss": 0.7015,
"num_input_tokens_seen": 2088448,
"step": 5540
},
{
"epoch": 4.968637992831541,
"grad_norm": 0.5040276646614075,
"learning_rate": 2.961815140042974e-05,
"loss": 0.6876,
"num_input_tokens_seen": 2090432,
"step": 5545
},
{
"epoch": 4.973118279569892,
"grad_norm": 0.4736102521419525,
"learning_rate": 2.957972076345817e-05,
"loss": 0.6886,
"num_input_tokens_seen": 2092384,
"step": 5550
},
{
"epoch": 4.977598566308243,
"grad_norm": 0.47025778889656067,
"learning_rate": 2.9541278925268428e-05,
"loss": 0.6863,
"num_input_tokens_seen": 2094080,
"step": 5555
},
{
"epoch": 4.982078853046595,
"grad_norm": 0.43467360734939575,
"learning_rate": 2.950282597988272e-05,
"loss": 0.6994,
"num_input_tokens_seen": 2095776,
"step": 5560
},
{
"epoch": 4.986559139784946,
"grad_norm": 0.8185675740242004,
"learning_rate": 2.9464362021350395e-05,
"loss": 0.6944,
"num_input_tokens_seen": 2097664,
"step": 5565
},
{
"epoch": 4.991039426523297,
"grad_norm": 0.5016605854034424,
"learning_rate": 2.9425887143747773e-05,
"loss": 0.6658,
"num_input_tokens_seen": 2099456,
"step": 5570
},
{
"epoch": 4.995519713261649,
"grad_norm": 0.610438346862793,
"learning_rate": 2.938740144117784e-05,
"loss": 0.6839,
"num_input_tokens_seen": 2101312,
"step": 5575
},
{
"epoch": 5.0,
"grad_norm": 1.209800362586975,
"learning_rate": 2.93489050077701e-05,
"loss": 0.7394,
"num_input_tokens_seen": 2102920,
"step": 5580
},
{
"epoch": 5.0,
"eval_loss": 0.7092333436012268,
"eval_runtime": 5.6121,
"eval_samples_per_second": 88.38,
"eval_steps_per_second": 22.095,
"num_input_tokens_seen": 2102920,
"step": 5580
},
{
"epoch": 5.004480286738351,
"grad_norm": 0.7478780746459961,
"learning_rate": 2.9310397937680277e-05,
"loss": 0.6765,
"num_input_tokens_seen": 2104808,
"step": 5585
},
{
"epoch": 5.008960573476703,
"grad_norm": 0.39114946126937866,
"learning_rate": 2.9271880325090105e-05,
"loss": 0.68,
"num_input_tokens_seen": 2106568,
"step": 5590
},
{
"epoch": 5.013440860215054,
"grad_norm": 0.949701189994812,
"learning_rate": 2.9233352264207133e-05,
"loss": 0.7081,
"num_input_tokens_seen": 2108456,
"step": 5595
},
{
"epoch": 5.017921146953405,
"grad_norm": 0.525955319404602,
"learning_rate": 2.919481384926443e-05,
"loss": 0.6584,
"num_input_tokens_seen": 2110184,
"step": 5600
},
{
"epoch": 5.022401433691757,
"grad_norm": 0.5059929490089417,
"learning_rate": 2.9156265174520414e-05,
"loss": 0.7112,
"num_input_tokens_seen": 2112104,
"step": 5605
},
{
"epoch": 5.026881720430108,
"grad_norm": 0.5731722712516785,
"learning_rate": 2.911770633425858e-05,
"loss": 0.7132,
"num_input_tokens_seen": 2114056,
"step": 5610
},
{
"epoch": 5.031362007168458,
"grad_norm": 0.6896179914474487,
"learning_rate": 2.90791374227873e-05,
"loss": 0.7445,
"num_input_tokens_seen": 2115880,
"step": 5615
},
{
"epoch": 5.03584229390681,
"grad_norm": 0.6563051342964172,
"learning_rate": 2.9040558534439564e-05,
"loss": 0.6798,
"num_input_tokens_seen": 2117640,
"step": 5620
},
{
"epoch": 5.040322580645161,
"grad_norm": 0.4548991322517395,
"learning_rate": 2.9001969763572802e-05,
"loss": 0.6843,
"num_input_tokens_seen": 2119496,
"step": 5625
},
{
"epoch": 5.044802867383512,
"grad_norm": 0.5727083683013916,
"learning_rate": 2.8963371204568542e-05,
"loss": 0.6879,
"num_input_tokens_seen": 2121384,
"step": 5630
},
{
"epoch": 5.049283154121864,
"grad_norm": 0.6287766098976135,
"learning_rate": 2.892476295183232e-05,
"loss": 0.6761,
"num_input_tokens_seen": 2123336,
"step": 5635
},
{
"epoch": 5.053763440860215,
"grad_norm": 0.452092707157135,
"learning_rate": 2.888614509979336e-05,
"loss": 0.6833,
"num_input_tokens_seen": 2125064,
"step": 5640
},
{
"epoch": 5.058243727598566,
"grad_norm": 0.5629188418388367,
"learning_rate": 2.8847517742904352e-05,
"loss": 0.6902,
"num_input_tokens_seen": 2126920,
"step": 5645
},
{
"epoch": 5.062724014336918,
"grad_norm": 0.5243597626686096,
"learning_rate": 2.880888097564124e-05,
"loss": 0.7022,
"num_input_tokens_seen": 2128744,
"step": 5650
},
{
"epoch": 5.067204301075269,
"grad_norm": 0.39402908086776733,
"learning_rate": 2.877023489250299e-05,
"loss": 0.6651,
"num_input_tokens_seen": 2130664,
"step": 5655
},
{
"epoch": 5.07168458781362,
"grad_norm": 0.4814399778842926,
"learning_rate": 2.8731579588011343e-05,
"loss": 0.6876,
"num_input_tokens_seen": 2132520,
"step": 5660
},
{
"epoch": 5.076164874551972,
"grad_norm": 0.469891756772995,
"learning_rate": 2.8692915156710615e-05,
"loss": 0.7327,
"num_input_tokens_seen": 2134536,
"step": 5665
},
{
"epoch": 5.080645161290323,
"grad_norm": 0.49232617020606995,
"learning_rate": 2.8654241693167423e-05,
"loss": 0.6932,
"num_input_tokens_seen": 2136616,
"step": 5670
},
{
"epoch": 5.085125448028673,
"grad_norm": 0.7345511317253113,
"learning_rate": 2.8615559291970474e-05,
"loss": 0.6729,
"num_input_tokens_seen": 2138408,
"step": 5675
},
{
"epoch": 5.089605734767025,
"grad_norm": 0.7703377604484558,
"learning_rate": 2.8576868047730354e-05,
"loss": 0.7267,
"num_input_tokens_seen": 2140264,
"step": 5680
},
{
"epoch": 5.094086021505376,
"grad_norm": 0.5797245502471924,
"learning_rate": 2.8538168055079262e-05,
"loss": 0.6878,
"num_input_tokens_seen": 2142152,
"step": 5685
},
{
"epoch": 5.098566308243727,
"grad_norm": 0.8190505504608154,
"learning_rate": 2.8499459408670796e-05,
"loss": 0.6967,
"num_input_tokens_seen": 2144040,
"step": 5690
},
{
"epoch": 5.103046594982079,
"grad_norm": 0.5040394067764282,
"learning_rate": 2.846074220317973e-05,
"loss": 0.6745,
"num_input_tokens_seen": 2145896,
"step": 5695
},
{
"epoch": 5.10752688172043,
"grad_norm": 0.5397759675979614,
"learning_rate": 2.8422016533301753e-05,
"loss": 0.7206,
"num_input_tokens_seen": 2147720,
"step": 5700
},
{
"epoch": 5.112007168458781,
"grad_norm": 0.6920524835586548,
"learning_rate": 2.8383282493753283e-05,
"loss": 0.6896,
"num_input_tokens_seen": 2149704,
"step": 5705
},
{
"epoch": 5.116487455197133,
"grad_norm": 0.4486968219280243,
"learning_rate": 2.8344540179271178e-05,
"loss": 0.7165,
"num_input_tokens_seen": 2151592,
"step": 5710
},
{
"epoch": 5.120967741935484,
"grad_norm": 0.563451886177063,
"learning_rate": 2.830578968461256e-05,
"loss": 0.6802,
"num_input_tokens_seen": 2153320,
"step": 5715
},
{
"epoch": 5.125448028673835,
"grad_norm": 0.4947597086429596,
"learning_rate": 2.8267031104554552e-05,
"loss": 0.6844,
"num_input_tokens_seen": 2155144,
"step": 5720
},
{
"epoch": 5.129928315412187,
"grad_norm": 0.4879480302333832,
"learning_rate": 2.822826453389404e-05,
"loss": 0.6772,
"num_input_tokens_seen": 2156904,
"step": 5725
},
{
"epoch": 5.134408602150538,
"grad_norm": 0.675524115562439,
"learning_rate": 2.8189490067447473e-05,
"loss": 0.7385,
"num_input_tokens_seen": 2158792,
"step": 5730
},
{
"epoch": 5.138888888888889,
"grad_norm": 0.5710222125053406,
"learning_rate": 2.815070780005059e-05,
"loss": 0.6677,
"num_input_tokens_seen": 2160776,
"step": 5735
},
{
"epoch": 5.14336917562724,
"grad_norm": 0.3597255349159241,
"learning_rate": 2.811191782655823e-05,
"loss": 0.6658,
"num_input_tokens_seen": 2162568,
"step": 5740
},
{
"epoch": 5.147849462365591,
"grad_norm": 0.5168389678001404,
"learning_rate": 2.8073120241844077e-05,
"loss": 0.6692,
"num_input_tokens_seen": 2164488,
"step": 5745
},
{
"epoch": 5.152329749103942,
"grad_norm": 0.41304054856300354,
"learning_rate": 2.8034315140800414e-05,
"loss": 0.6983,
"num_input_tokens_seen": 2166184,
"step": 5750
},
{
"epoch": 5.156810035842294,
"grad_norm": 0.5995345711708069,
"learning_rate": 2.7995502618337933e-05,
"loss": 0.7489,
"num_input_tokens_seen": 2168040,
"step": 5755
},
{
"epoch": 5.161290322580645,
"grad_norm": 0.5592546463012695,
"learning_rate": 2.795668276938545e-05,
"loss": 0.6616,
"num_input_tokens_seen": 2170024,
"step": 5760
},
{
"epoch": 5.165770609318996,
"grad_norm": 0.37307098507881165,
"learning_rate": 2.7917855688889717e-05,
"loss": 0.7253,
"num_input_tokens_seen": 2171848,
"step": 5765
},
{
"epoch": 5.170250896057348,
"grad_norm": 0.4320157468318939,
"learning_rate": 2.787902147181517e-05,
"loss": 0.6866,
"num_input_tokens_seen": 2173608,
"step": 5770
},
{
"epoch": 5.174731182795699,
"grad_norm": 0.5110576152801514,
"learning_rate": 2.7840180213143712e-05,
"loss": 0.6937,
"num_input_tokens_seen": 2175336,
"step": 5775
},
{
"epoch": 5.17921146953405,
"grad_norm": 0.6609287261962891,
"learning_rate": 2.7801332007874437e-05,
"loss": 0.7321,
"num_input_tokens_seen": 2177192,
"step": 5780
},
{
"epoch": 5.183691756272402,
"grad_norm": 0.6040504574775696,
"learning_rate": 2.776247695102345e-05,
"loss": 0.691,
"num_input_tokens_seen": 2178952,
"step": 5785
},
{
"epoch": 5.188172043010753,
"grad_norm": 0.5164482593536377,
"learning_rate": 2.7723615137623637e-05,
"loss": 0.7194,
"num_input_tokens_seen": 2180968,
"step": 5790
},
{
"epoch": 5.192652329749104,
"grad_norm": 0.43215903639793396,
"learning_rate": 2.7684746662724363e-05,
"loss": 0.6952,
"num_input_tokens_seen": 2182792,
"step": 5795
},
{
"epoch": 5.197132616487456,
"grad_norm": 0.44016534090042114,
"learning_rate": 2.7645871621391305e-05,
"loss": 0.7246,
"num_input_tokens_seen": 2184648,
"step": 5800
},
{
"epoch": 5.201612903225806,
"grad_norm": 0.4214681386947632,
"learning_rate": 2.760699010870622e-05,
"loss": 0.6804,
"num_input_tokens_seen": 2186440,
"step": 5805
},
{
"epoch": 5.206093189964157,
"grad_norm": 0.48975706100463867,
"learning_rate": 2.7568102219766666e-05,
"loss": 0.7153,
"num_input_tokens_seen": 2188424,
"step": 5810
},
{
"epoch": 5.210573476702509,
"grad_norm": 0.6235063076019287,
"learning_rate": 2.7529208049685807e-05,
"loss": 0.7322,
"num_input_tokens_seen": 2190152,
"step": 5815
},
{
"epoch": 5.21505376344086,
"grad_norm": 0.4480155110359192,
"learning_rate": 2.7490307693592172e-05,
"loss": 0.7,
"num_input_tokens_seen": 2192072,
"step": 5820
},
{
"epoch": 5.219534050179211,
"grad_norm": 0.5600855946540833,
"learning_rate": 2.7451401246629403e-05,
"loss": 0.6852,
"num_input_tokens_seen": 2194056,
"step": 5825
},
{
"epoch": 5.224014336917563,
"grad_norm": 0.7299264073371887,
"learning_rate": 2.741248880395607e-05,
"loss": 0.6946,
"num_input_tokens_seen": 2195816,
"step": 5830
},
{
"epoch": 5.228494623655914,
"grad_norm": 0.46794813871383667,
"learning_rate": 2.7373570460745384e-05,
"loss": 0.6861,
"num_input_tokens_seen": 2197736,
"step": 5835
},
{
"epoch": 5.232974910394265,
"grad_norm": 0.5217588543891907,
"learning_rate": 2.7334646312184997e-05,
"loss": 0.6669,
"num_input_tokens_seen": 2199688,
"step": 5840
},
{
"epoch": 5.237455197132617,
"grad_norm": 0.44602036476135254,
"learning_rate": 2.7295716453476755e-05,
"loss": 0.6665,
"num_input_tokens_seen": 2201576,
"step": 5845
},
{
"epoch": 5.241935483870968,
"grad_norm": 0.5522176027297974,
"learning_rate": 2.7256780979836466e-05,
"loss": 0.6441,
"num_input_tokens_seen": 2203624,
"step": 5850
},
{
"epoch": 5.246415770609319,
"grad_norm": 0.43028005957603455,
"learning_rate": 2.721783998649369e-05,
"loss": 0.6738,
"num_input_tokens_seen": 2205448,
"step": 5855
},
{
"epoch": 5.250896057347671,
"grad_norm": 0.6127921938896179,
"learning_rate": 2.717889356869146e-05,
"loss": 0.7429,
"num_input_tokens_seen": 2207272,
"step": 5860
},
{
"epoch": 5.255376344086022,
"grad_norm": 0.5151240229606628,
"learning_rate": 2.71399418216861e-05,
"loss": 0.6823,
"num_input_tokens_seen": 2209160,
"step": 5865
},
{
"epoch": 5.259856630824372,
"grad_norm": 0.8400991559028625,
"learning_rate": 2.7100984840746956e-05,
"loss": 0.7006,
"num_input_tokens_seen": 2211080,
"step": 5870
},
{
"epoch": 5.264336917562724,
"grad_norm": 0.7540143728256226,
"learning_rate": 2.7062022721156177e-05,
"loss": 0.7426,
"num_input_tokens_seen": 2213032,
"step": 5875
},
{
"epoch": 5.268817204301075,
"grad_norm": 0.6295384764671326,
"learning_rate": 2.7023055558208487e-05,
"loss": 0.7187,
"num_input_tokens_seen": 2214824,
"step": 5880
},
{
"epoch": 5.273297491039426,
"grad_norm": 0.467625230550766,
"learning_rate": 2.6984083447210945e-05,
"loss": 0.6995,
"num_input_tokens_seen": 2216648,
"step": 5885
},
{
"epoch": 5.277777777777778,
"grad_norm": 0.5293396711349487,
"learning_rate": 2.6945106483482686e-05,
"loss": 0.6924,
"num_input_tokens_seen": 2218440,
"step": 5890
},
{
"epoch": 5.282258064516129,
"grad_norm": 0.6371461749076843,
"learning_rate": 2.690612476235475e-05,
"loss": 0.7196,
"num_input_tokens_seen": 2220424,
"step": 5895
},
{
"epoch": 5.28673835125448,
"grad_norm": 0.6132698059082031,
"learning_rate": 2.6867138379169802e-05,
"loss": 0.6934,
"num_input_tokens_seen": 2222152,
"step": 5900
},
{
"epoch": 5.291218637992832,
"grad_norm": 0.5086933374404907,
"learning_rate": 2.6828147429281902e-05,
"loss": 0.7014,
"num_input_tokens_seen": 2223976,
"step": 5905
},
{
"epoch": 5.295698924731183,
"grad_norm": 0.47041961550712585,
"learning_rate": 2.6789152008056272e-05,
"loss": 0.6988,
"num_input_tokens_seen": 2225960,
"step": 5910
},
{
"epoch": 5.300179211469534,
"grad_norm": 0.7675802707672119,
"learning_rate": 2.6750152210869095e-05,
"loss": 0.6973,
"num_input_tokens_seen": 2227912,
"step": 5915
},
{
"epoch": 5.304659498207886,
"grad_norm": 0.5190770030021667,
"learning_rate": 2.6711148133107233e-05,
"loss": 0.6921,
"num_input_tokens_seen": 2229736,
"step": 5920
},
{
"epoch": 5.309139784946237,
"grad_norm": 0.509989321231842,
"learning_rate": 2.6672139870168034e-05,
"loss": 0.6864,
"num_input_tokens_seen": 2231720,
"step": 5925
},
{
"epoch": 5.313620071684587,
"grad_norm": 0.5584254264831543,
"learning_rate": 2.6633127517459066e-05,
"loss": 0.6944,
"num_input_tokens_seen": 2233544,
"step": 5930
},
{
"epoch": 5.318100358422939,
"grad_norm": 0.4709128439426422,
"learning_rate": 2.6594111170397916e-05,
"loss": 0.6945,
"num_input_tokens_seen": 2235336,
"step": 5935
},
{
"epoch": 5.32258064516129,
"grad_norm": 0.498522013425827,
"learning_rate": 2.655509092441194e-05,
"loss": 0.6868,
"num_input_tokens_seen": 2237128,
"step": 5940
},
{
"epoch": 5.327060931899641,
"grad_norm": 0.7889726758003235,
"learning_rate": 2.6516066874938023e-05,
"loss": 0.7143,
"num_input_tokens_seen": 2239016,
"step": 5945
},
{
"epoch": 5.331541218637993,
"grad_norm": 0.5677915811538696,
"learning_rate": 2.6477039117422335e-05,
"loss": 0.6815,
"num_input_tokens_seen": 2240968,
"step": 5950
},
{
"epoch": 5.336021505376344,
"grad_norm": 0.6692419052124023,
"learning_rate": 2.6438007747320153e-05,
"loss": 0.6658,
"num_input_tokens_seen": 2242728,
"step": 5955
},
{
"epoch": 5.340501792114695,
"grad_norm": 0.5950575470924377,
"learning_rate": 2.639897286009556e-05,
"loss": 0.7028,
"num_input_tokens_seen": 2244584,
"step": 5960
},
{
"epoch": 5.344982078853047,
"grad_norm": 0.4070301949977875,
"learning_rate": 2.6359934551221267e-05,
"loss": 0.6951,
"num_input_tokens_seen": 2246408,
"step": 5965
},
{
"epoch": 5.349462365591398,
"grad_norm": 0.6000044345855713,
"learning_rate": 2.6320892916178326e-05,
"loss": 0.7226,
"num_input_tokens_seen": 2248456,
"step": 5970
},
{
"epoch": 5.353942652329749,
"grad_norm": 0.5580965280532837,
"learning_rate": 2.628184805045593e-05,
"loss": 0.7243,
"num_input_tokens_seen": 2250216,
"step": 5975
},
{
"epoch": 5.358422939068101,
"grad_norm": 0.5775971412658691,
"learning_rate": 2.6242800049551192e-05,
"loss": 0.6869,
"num_input_tokens_seen": 2252040,
"step": 5980
},
{
"epoch": 5.362903225806452,
"grad_norm": 0.7574295401573181,
"learning_rate": 2.620374900896889e-05,
"loss": 0.6809,
"num_input_tokens_seen": 2253992,
"step": 5985
},
{
"epoch": 5.367383512544803,
"grad_norm": 0.4660971164703369,
"learning_rate": 2.6164695024221215e-05,
"loss": 0.7012,
"num_input_tokens_seen": 2255816,
"step": 5990
},
{
"epoch": 5.371863799283154,
"grad_norm": 0.8073466420173645,
"learning_rate": 2.612563819082757e-05,
"loss": 0.6988,
"num_input_tokens_seen": 2257672,
"step": 5995
},
{
"epoch": 5.376344086021505,
"grad_norm": 0.7728441953659058,
"learning_rate": 2.6086578604314337e-05,
"loss": 0.6909,
"num_input_tokens_seen": 2259688,
"step": 6000
},
{
"epoch": 5.380824372759856,
"grad_norm": 0.665420651435852,
"learning_rate": 2.6047516360214623e-05,
"loss": 0.6906,
"num_input_tokens_seen": 2261512,
"step": 6005
},
{
"epoch": 5.385304659498208,
"grad_norm": 0.772237241268158,
"learning_rate": 2.6008451554068025e-05,
"loss": 0.6879,
"num_input_tokens_seen": 2263240,
"step": 6010
},
{
"epoch": 5.389784946236559,
"grad_norm": 0.6501445770263672,
"learning_rate": 2.5969384281420424e-05,
"loss": 0.6998,
"num_input_tokens_seen": 2265000,
"step": 6015
},
{
"epoch": 5.39426523297491,
"grad_norm": 0.49419528245925903,
"learning_rate": 2.593031463782371e-05,
"loss": 0.6816,
"num_input_tokens_seen": 2266792,
"step": 6020
},
{
"epoch": 5.398745519713262,
"grad_norm": 0.46163687109947205,
"learning_rate": 2.5891242718835614e-05,
"loss": 0.7022,
"num_input_tokens_seen": 2268648,
"step": 6025
},
{
"epoch": 5.403225806451613,
"grad_norm": 0.4298229515552521,
"learning_rate": 2.5852168620019385e-05,
"loss": 0.6706,
"num_input_tokens_seen": 2270472,
"step": 6030
},
{
"epoch": 5.407706093189964,
"grad_norm": 0.615990936756134,
"learning_rate": 2.5813092436943626e-05,
"loss": 0.7241,
"num_input_tokens_seen": 2272296,
"step": 6035
},
{
"epoch": 5.412186379928316,
"grad_norm": 0.5446776151657104,
"learning_rate": 2.577401426518204e-05,
"loss": 0.7074,
"num_input_tokens_seen": 2274248,
"step": 6040
},
{
"epoch": 5.416666666666667,
"grad_norm": 0.48909687995910645,
"learning_rate": 2.573493420031318e-05,
"loss": 0.6816,
"num_input_tokens_seen": 2276168,
"step": 6045
},
{
"epoch": 5.421146953405018,
"grad_norm": 0.45095184445381165,
"learning_rate": 2.569585233792027e-05,
"loss": 0.6593,
"num_input_tokens_seen": 2278056,
"step": 6050
},
{
"epoch": 5.425627240143369,
"grad_norm": 0.6048058867454529,
"learning_rate": 2.5656768773590854e-05,
"loss": 0.6883,
"num_input_tokens_seen": 2279944,
"step": 6055
},
{
"epoch": 5.43010752688172,
"grad_norm": 0.44659972190856934,
"learning_rate": 2.5617683602916714e-05,
"loss": 0.721,
"num_input_tokens_seen": 2281896,
"step": 6060
},
{
"epoch": 5.434587813620071,
"grad_norm": 0.3663732409477234,
"learning_rate": 2.5578596921493525e-05,
"loss": 0.7137,
"num_input_tokens_seen": 2283592,
"step": 6065
},
{
"epoch": 5.439068100358423,
"grad_norm": 0.6166502237319946,
"learning_rate": 2.553950882492066e-05,
"loss": 0.6721,
"num_input_tokens_seen": 2285640,
"step": 6070
},
{
"epoch": 5.443548387096774,
"grad_norm": 0.4628104865550995,
"learning_rate": 2.5500419408800953e-05,
"loss": 0.7151,
"num_input_tokens_seen": 2287464,
"step": 6075
},
{
"epoch": 5.448028673835125,
"grad_norm": 0.5418219566345215,
"learning_rate": 2.546132876874048e-05,
"loss": 0.6941,
"num_input_tokens_seen": 2289352,
"step": 6080
},
{
"epoch": 5.452508960573477,
"grad_norm": 0.5038431882858276,
"learning_rate": 2.5422237000348276e-05,
"loss": 0.7297,
"num_input_tokens_seen": 2291240,
"step": 6085
},
{
"epoch": 5.456989247311828,
"grad_norm": 0.7030521631240845,
"learning_rate": 2.5383144199236188e-05,
"loss": 0.6837,
"num_input_tokens_seen": 2293352,
"step": 6090
},
{
"epoch": 5.461469534050179,
"grad_norm": 0.6526070237159729,
"learning_rate": 2.5344050461018542e-05,
"loss": 0.6786,
"num_input_tokens_seen": 2295464,
"step": 6095
},
{
"epoch": 5.465949820788531,
"grad_norm": 0.5026730298995972,
"learning_rate": 2.530495588131197e-05,
"loss": 0.6907,
"num_input_tokens_seen": 2297160,
"step": 6100
},
{
"epoch": 5.470430107526882,
"grad_norm": 0.3277010917663574,
"learning_rate": 2.526586055573518e-05,
"loss": 0.6977,
"num_input_tokens_seen": 2299048,
"step": 6105
},
{
"epoch": 5.474910394265233,
"grad_norm": 0.6846584677696228,
"learning_rate": 2.5226764579908678e-05,
"loss": 0.6823,
"num_input_tokens_seen": 2300904,
"step": 6110
},
{
"epoch": 5.479390681003585,
"grad_norm": 0.4550936818122864,
"learning_rate": 2.5187668049454583e-05,
"loss": 0.6808,
"num_input_tokens_seen": 2302824,
"step": 6115
},
{
"epoch": 5.483870967741936,
"grad_norm": 0.600966215133667,
"learning_rate": 2.5148571059996346e-05,
"loss": 0.7128,
"num_input_tokens_seen": 2304648,
"step": 6120
},
{
"epoch": 5.488351254480286,
"grad_norm": 0.61279296875,
"learning_rate": 2.5109473707158565e-05,
"loss": 0.7259,
"num_input_tokens_seen": 2306760,
"step": 6125
},
{
"epoch": 5.492831541218638,
"grad_norm": 0.7445093989372253,
"learning_rate": 2.5070376086566704e-05,
"loss": 0.6921,
"num_input_tokens_seen": 2308648,
"step": 6130
},
{
"epoch": 5.497311827956989,
"grad_norm": 0.6505946516990662,
"learning_rate": 2.5031278293846922e-05,
"loss": 0.6996,
"num_input_tokens_seen": 2310728,
"step": 6135
},
{
"epoch": 5.5,
"eval_loss": 0.7000005841255188,
"eval_runtime": 5.6446,
"eval_samples_per_second": 87.871,
"eval_steps_per_second": 21.968,
"num_input_tokens_seen": 2311976,
"step": 6138
},
{
"epoch": 5.50179211469534,
"grad_norm": 0.5703749656677246,
"learning_rate": 2.4992180424625737e-05,
"loss": 0.6789,
"num_input_tokens_seen": 2312904,
"step": 6140
},
{
"epoch": 5.506272401433692,
"grad_norm": 0.3148249387741089,
"learning_rate": 2.4953082574529906e-05,
"loss": 0.7067,
"num_input_tokens_seen": 2314856,
"step": 6145
},
{
"epoch": 5.510752688172043,
"grad_norm": 0.41538357734680176,
"learning_rate": 2.491398483918612e-05,
"loss": 0.6485,
"num_input_tokens_seen": 2316808,
"step": 6150
},
{
"epoch": 5.515232974910394,
"grad_norm": 0.6810615658760071,
"learning_rate": 2.48748873142208e-05,
"loss": 0.6799,
"num_input_tokens_seen": 2318664,
"step": 6155
},
{
"epoch": 5.519713261648746,
"grad_norm": 0.5225232243537903,
"learning_rate": 2.4835790095259825e-05,
"loss": 0.6843,
"num_input_tokens_seen": 2320552,
"step": 6160
},
{
"epoch": 5.524193548387097,
"grad_norm": 0.7076472043991089,
"learning_rate": 2.479669327792835e-05,
"loss": 0.6802,
"num_input_tokens_seen": 2322632,
"step": 6165
},
{
"epoch": 5.528673835125448,
"grad_norm": 0.5768188834190369,
"learning_rate": 2.475759695785054e-05,
"loss": 0.6942,
"num_input_tokens_seen": 2324360,
"step": 6170
},
{
"epoch": 5.5331541218638,
"grad_norm": 0.623727023601532,
"learning_rate": 2.4718501230649355e-05,
"loss": 0.6683,
"num_input_tokens_seen": 2326184,
"step": 6175
},
{
"epoch": 5.53763440860215,
"grad_norm": 0.4721769392490387,
"learning_rate": 2.4679406191946285e-05,
"loss": 0.7199,
"num_input_tokens_seen": 2328072,
"step": 6180
},
{
"epoch": 5.542114695340501,
"grad_norm": 0.47976237535476685,
"learning_rate": 2.464031193736116e-05,
"loss": 0.6867,
"num_input_tokens_seen": 2329960,
"step": 6185
},
{
"epoch": 5.546594982078853,
"grad_norm": 0.5275446772575378,
"learning_rate": 2.4601218562511856e-05,
"loss": 0.7152,
"num_input_tokens_seen": 2331816,
"step": 6190
},
{
"epoch": 5.551075268817204,
"grad_norm": 0.5743386745452881,
"learning_rate": 2.4562126163014134e-05,
"loss": 0.7023,
"num_input_tokens_seen": 2333800,
"step": 6195
},
{
"epoch": 5.555555555555555,
"grad_norm": 0.3725605607032776,
"learning_rate": 2.452303483448136e-05,
"loss": 0.6987,
"num_input_tokens_seen": 2335624,
"step": 6200
},
{
"epoch": 5.560035842293907,
"grad_norm": 0.7053723931312561,
"learning_rate": 2.4483944672524263e-05,
"loss": 0.671,
"num_input_tokens_seen": 2337544,
"step": 6205
},
{
"epoch": 5.564516129032258,
"grad_norm": 0.3978196382522583,
"learning_rate": 2.444485577275075e-05,
"loss": 0.7045,
"num_input_tokens_seen": 2339400,
"step": 6210
},
{
"epoch": 5.568996415770609,
"grad_norm": 0.46860271692276,
"learning_rate": 2.44057682307656e-05,
"loss": 0.6753,
"num_input_tokens_seen": 2341128,
"step": 6215
},
{
"epoch": 5.573476702508961,
"grad_norm": 0.6284614205360413,
"learning_rate": 2.436668214217031e-05,
"loss": 0.6944,
"num_input_tokens_seen": 2342920,
"step": 6220
},
{
"epoch": 5.577956989247312,
"grad_norm": 0.541385293006897,
"learning_rate": 2.4327597602562792e-05,
"loss": 0.6794,
"num_input_tokens_seen": 2344968,
"step": 6225
},
{
"epoch": 5.582437275985663,
"grad_norm": 0.41632750630378723,
"learning_rate": 2.428851470753719e-05,
"loss": 0.6832,
"num_input_tokens_seen": 2346824,
"step": 6230
},
{
"epoch": 5.586917562724015,
"grad_norm": 0.36444324254989624,
"learning_rate": 2.4249433552683627e-05,
"loss": 0.6629,
"num_input_tokens_seen": 2348712,
"step": 6235
},
{
"epoch": 5.591397849462366,
"grad_norm": 0.4200541377067566,
"learning_rate": 2.4210354233587955e-05,
"loss": 0.7115,
"num_input_tokens_seen": 2350600,
"step": 6240
},
{
"epoch": 5.595878136200717,
"grad_norm": 0.5005070567131042,
"learning_rate": 2.417127684583154e-05,
"loss": 0.6852,
"num_input_tokens_seen": 2352584,
"step": 6245
},
{
"epoch": 5.600358422939068,
"grad_norm": 0.4710138440132141,
"learning_rate": 2.413220148499103e-05,
"loss": 0.7178,
"num_input_tokens_seen": 2354408,
"step": 6250
},
{
"epoch": 5.604838709677419,
"grad_norm": 0.6707773804664612,
"learning_rate": 2.409312824663811e-05,
"loss": 0.7461,
"num_input_tokens_seen": 2356264,
"step": 6255
},
{
"epoch": 5.60931899641577,
"grad_norm": 0.5251260995864868,
"learning_rate": 2.405405722633928e-05,
"loss": 0.6999,
"num_input_tokens_seen": 2358152,
"step": 6260
},
{
"epoch": 5.613799283154122,
"grad_norm": 0.46387919783592224,
"learning_rate": 2.4014988519655618e-05,
"loss": 0.71,
"num_input_tokens_seen": 2359912,
"step": 6265
},
{
"epoch": 5.618279569892473,
"grad_norm": 0.7532406449317932,
"learning_rate": 2.3975922222142517e-05,
"loss": 0.6983,
"num_input_tokens_seen": 2361864,
"step": 6270
},
{
"epoch": 5.622759856630824,
"grad_norm": 0.591415286064148,
"learning_rate": 2.3936858429349508e-05,
"loss": 0.6857,
"num_input_tokens_seen": 2363784,
"step": 6275
},
{
"epoch": 5.627240143369176,
"grad_norm": 0.45360586047172546,
"learning_rate": 2.389779723681999e-05,
"loss": 0.6659,
"num_input_tokens_seen": 2365608,
"step": 6280
},
{
"epoch": 5.631720430107527,
"grad_norm": 0.43823105096817017,
"learning_rate": 2.3858738740090995e-05,
"loss": 0.7003,
"num_input_tokens_seen": 2367496,
"step": 6285
},
{
"epoch": 5.636200716845878,
"grad_norm": 0.5166615843772888,
"learning_rate": 2.3819683034692953e-05,
"loss": 0.6941,
"num_input_tokens_seen": 2369416,
"step": 6290
},
{
"epoch": 5.64068100358423,
"grad_norm": 0.5909369587898254,
"learning_rate": 2.3780630216149506e-05,
"loss": 0.6664,
"num_input_tokens_seen": 2371336,
"step": 6295
},
{
"epoch": 5.645161290322581,
"grad_norm": 0.6770111322402954,
"learning_rate": 2.374158037997717e-05,
"loss": 0.702,
"num_input_tokens_seen": 2373416,
"step": 6300
},
{
"epoch": 5.649641577060932,
"grad_norm": 0.6585894227027893,
"learning_rate": 2.3702533621685228e-05,
"loss": 0.7382,
"num_input_tokens_seen": 2375304,
"step": 6305
},
{
"epoch": 5.654121863799283,
"grad_norm": 0.4873889088630676,
"learning_rate": 2.36634900367754e-05,
"loss": 0.6871,
"num_input_tokens_seen": 2377192,
"step": 6310
},
{
"epoch": 5.658602150537634,
"grad_norm": 0.7726801037788391,
"learning_rate": 2.3624449720741654e-05,
"loss": 0.7019,
"num_input_tokens_seen": 2379080,
"step": 6315
},
{
"epoch": 5.663082437275985,
"grad_norm": 0.8256939649581909,
"learning_rate": 2.3585412769069984e-05,
"loss": 0.6638,
"num_input_tokens_seen": 2381384,
"step": 6320
},
{
"epoch": 5.667562724014337,
"grad_norm": 0.4363030195236206,
"learning_rate": 2.3546379277238107e-05,
"loss": 0.6887,
"num_input_tokens_seen": 2383304,
"step": 6325
},
{
"epoch": 5.672043010752688,
"grad_norm": 0.6827439665794373,
"learning_rate": 2.3507349340715322e-05,
"loss": 0.7208,
"num_input_tokens_seen": 2385128,
"step": 6330
},
{
"epoch": 5.676523297491039,
"grad_norm": 0.43318870663642883,
"learning_rate": 2.3468323054962213e-05,
"loss": 0.6828,
"num_input_tokens_seen": 2386952,
"step": 6335
},
{
"epoch": 5.681003584229391,
"grad_norm": 0.6294808983802795,
"learning_rate": 2.3429300515430437e-05,
"loss": 0.6895,
"num_input_tokens_seen": 2388872,
"step": 6340
},
{
"epoch": 5.685483870967742,
"grad_norm": 0.43699830770492554,
"learning_rate": 2.3390281817562496e-05,
"loss": 0.6953,
"num_input_tokens_seen": 2390888,
"step": 6345
},
{
"epoch": 5.689964157706093,
"grad_norm": 0.3608168065547943,
"learning_rate": 2.335126705679149e-05,
"loss": 0.6976,
"num_input_tokens_seen": 2392712,
"step": 6350
},
{
"epoch": 5.694444444444445,
"grad_norm": 0.5301570296287537,
"learning_rate": 2.331225632854087e-05,
"loss": 0.6999,
"num_input_tokens_seen": 2394728,
"step": 6355
},
{
"epoch": 5.698924731182796,
"grad_norm": 0.5568897128105164,
"learning_rate": 2.327324972822426e-05,
"loss": 0.7094,
"num_input_tokens_seen": 2396648,
"step": 6360
},
{
"epoch": 5.703405017921147,
"grad_norm": 0.4521498680114746,
"learning_rate": 2.3234247351245177e-05,
"loss": 0.6585,
"num_input_tokens_seen": 2398632,
"step": 6365
},
{
"epoch": 5.707885304659499,
"grad_norm": 0.46761560440063477,
"learning_rate": 2.3195249292996786e-05,
"loss": 0.6876,
"num_input_tokens_seen": 2400616,
"step": 6370
},
{
"epoch": 5.71236559139785,
"grad_norm": 0.4455620348453522,
"learning_rate": 2.3156255648861723e-05,
"loss": 0.6895,
"num_input_tokens_seen": 2402472,
"step": 6375
},
{
"epoch": 5.7168458781362,
"grad_norm": 0.6404704451560974,
"learning_rate": 2.3117266514211788e-05,
"loss": 0.7024,
"num_input_tokens_seen": 2404392,
"step": 6380
},
{
"epoch": 5.721326164874552,
"grad_norm": 0.7674428224563599,
"learning_rate": 2.3078281984407787e-05,
"loss": 0.683,
"num_input_tokens_seen": 2406312,
"step": 6385
},
{
"epoch": 5.725806451612903,
"grad_norm": 0.48401227593421936,
"learning_rate": 2.3039302154799256e-05,
"loss": 0.6813,
"num_input_tokens_seen": 2408168,
"step": 6390
},
{
"epoch": 5.730286738351254,
"grad_norm": 0.4482485055923462,
"learning_rate": 2.300032712072422e-05,
"loss": 0.6582,
"num_input_tokens_seen": 2409992,
"step": 6395
},
{
"epoch": 5.734767025089606,
"grad_norm": 0.7896308898925781,
"learning_rate": 2.2961356977508984e-05,
"loss": 0.6966,
"num_input_tokens_seen": 2411944,
"step": 6400
},
{
"epoch": 5.739247311827957,
"grad_norm": 0.6187811493873596,
"learning_rate": 2.2922391820467905e-05,
"loss": 0.7247,
"num_input_tokens_seen": 2413928,
"step": 6405
},
{
"epoch": 5.743727598566308,
"grad_norm": 0.4433819055557251,
"learning_rate": 2.2883431744903115e-05,
"loss": 0.7091,
"num_input_tokens_seen": 2415848,
"step": 6410
},
{
"epoch": 5.74820788530466,
"grad_norm": 0.40984046459198,
"learning_rate": 2.284447684610434e-05,
"loss": 0.7202,
"num_input_tokens_seen": 2417704,
"step": 6415
},
{
"epoch": 5.752688172043011,
"grad_norm": 0.45450559258461,
"learning_rate": 2.2805527219348632e-05,
"loss": 0.7115,
"num_input_tokens_seen": 2419656,
"step": 6420
},
{
"epoch": 5.757168458781362,
"grad_norm": 0.8165116906166077,
"learning_rate": 2.276658295990016e-05,
"loss": 0.7439,
"num_input_tokens_seen": 2421512,
"step": 6425
},
{
"epoch": 5.761648745519714,
"grad_norm": 0.616450846195221,
"learning_rate": 2.272764416300997e-05,
"loss": 0.6797,
"num_input_tokens_seen": 2423272,
"step": 6430
},
{
"epoch": 5.766129032258064,
"grad_norm": 0.5471182465553284,
"learning_rate": 2.2688710923915718e-05,
"loss": 0.6858,
"num_input_tokens_seen": 2425288,
"step": 6435
},
{
"epoch": 5.770609318996415,
"grad_norm": 0.8255714178085327,
"learning_rate": 2.264978333784149e-05,
"loss": 0.6955,
"num_input_tokens_seen": 2427112,
"step": 6440
},
{
"epoch": 5.775089605734767,
"grad_norm": 0.5268316268920898,
"learning_rate": 2.261086149999755e-05,
"loss": 0.7015,
"num_input_tokens_seen": 2428968,
"step": 6445
},
{
"epoch": 5.779569892473118,
"grad_norm": 0.7851718664169312,
"learning_rate": 2.257194550558009e-05,
"loss": 0.7278,
"num_input_tokens_seen": 2431048,
"step": 6450
},
{
"epoch": 5.784050179211469,
"grad_norm": 0.43519654870033264,
"learning_rate": 2.253303544977101e-05,
"loss": 0.6455,
"num_input_tokens_seen": 2432904,
"step": 6455
},
{
"epoch": 5.788530465949821,
"grad_norm": 0.4320980906486511,
"learning_rate": 2.249413142773771e-05,
"loss": 0.7036,
"num_input_tokens_seen": 2434856,
"step": 6460
},
{
"epoch": 5.793010752688172,
"grad_norm": 0.49086955189704895,
"learning_rate": 2.245523353463278e-05,
"loss": 0.7149,
"num_input_tokens_seen": 2436680,
"step": 6465
},
{
"epoch": 5.797491039426523,
"grad_norm": 0.9313246011734009,
"learning_rate": 2.2416341865593875e-05,
"loss": 0.7261,
"num_input_tokens_seen": 2438344,
"step": 6470
},
{
"epoch": 5.801971326164875,
"grad_norm": 0.36511847376823425,
"learning_rate": 2.2377456515743396e-05,
"loss": 0.72,
"num_input_tokens_seen": 2440168,
"step": 6475
},
{
"epoch": 5.806451612903226,
"grad_norm": 0.3627917766571045,
"learning_rate": 2.2338577580188296e-05,
"loss": 0.6819,
"num_input_tokens_seen": 2442056,
"step": 6480
},
{
"epoch": 5.810931899641577,
"grad_norm": 0.4386759102344513,
"learning_rate": 2.2299705154019846e-05,
"loss": 0.7089,
"num_input_tokens_seen": 2443976,
"step": 6485
},
{
"epoch": 5.815412186379929,
"grad_norm": 0.5388379693031311,
"learning_rate": 2.2260839332313375e-05,
"loss": 0.7013,
"num_input_tokens_seen": 2445832,
"step": 6490
},
{
"epoch": 5.81989247311828,
"grad_norm": 0.5991097092628479,
"learning_rate": 2.222198021012809e-05,
"loss": 0.7096,
"num_input_tokens_seen": 2447720,
"step": 6495
},
{
"epoch": 5.824372759856631,
"grad_norm": 0.5087167620658875,
"learning_rate": 2.218312788250678e-05,
"loss": 0.6658,
"num_input_tokens_seen": 2449704,
"step": 6500
},
{
"epoch": 5.828853046594982,
"grad_norm": 0.5476199388504028,
"learning_rate": 2.2144282444475638e-05,
"loss": 0.6827,
"num_input_tokens_seen": 2451592,
"step": 6505
},
{
"epoch": 5.833333333333333,
"grad_norm": 0.5977054238319397,
"learning_rate": 2.2105443991044006e-05,
"loss": 0.7112,
"num_input_tokens_seen": 2453640,
"step": 6510
},
{
"epoch": 5.837813620071684,
"grad_norm": 0.5165858864784241,
"learning_rate": 2.206661261720414e-05,
"loss": 0.6908,
"num_input_tokens_seen": 2455496,
"step": 6515
},
{
"epoch": 5.842293906810036,
"grad_norm": 0.8256833553314209,
"learning_rate": 2.2027788417930962e-05,
"loss": 0.6984,
"num_input_tokens_seen": 2457320,
"step": 6520
},
{
"epoch": 5.846774193548387,
"grad_norm": 0.7374931573867798,
"learning_rate": 2.1988971488181862e-05,
"loss": 0.7022,
"num_input_tokens_seen": 2459400,
"step": 6525
},
{
"epoch": 5.851254480286738,
"grad_norm": 0.42575767636299133,
"learning_rate": 2.1950161922896452e-05,
"loss": 0.7119,
"num_input_tokens_seen": 2461352,
"step": 6530
},
{
"epoch": 5.85573476702509,
"grad_norm": 0.8485537767410278,
"learning_rate": 2.1911359816996342e-05,
"loss": 0.6922,
"num_input_tokens_seen": 2463112,
"step": 6535
},
{
"epoch": 5.860215053763441,
"grad_norm": 0.5161400437355042,
"learning_rate": 2.1872565265384867e-05,
"loss": 0.6798,
"num_input_tokens_seen": 2465096,
"step": 6540
},
{
"epoch": 5.864695340501792,
"grad_norm": 0.6514114141464233,
"learning_rate": 2.1833778362946914e-05,
"loss": 0.7127,
"num_input_tokens_seen": 2467016,
"step": 6545
},
{
"epoch": 5.869175627240144,
"grad_norm": 0.5723159909248352,
"learning_rate": 2.179499920454864e-05,
"loss": 0.6659,
"num_input_tokens_seen": 2468680,
"step": 6550
},
{
"epoch": 5.873655913978495,
"grad_norm": 0.6084119081497192,
"learning_rate": 2.1756227885037277e-05,
"loss": 0.6798,
"num_input_tokens_seen": 2470632,
"step": 6555
},
{
"epoch": 5.878136200716845,
"grad_norm": 0.4714006781578064,
"learning_rate": 2.1717464499240882e-05,
"loss": 0.6966,
"num_input_tokens_seen": 2472616,
"step": 6560
},
{
"epoch": 5.882616487455197,
"grad_norm": 0.41111356019973755,
"learning_rate": 2.16787091419681e-05,
"loss": 0.6927,
"num_input_tokens_seen": 2474536,
"step": 6565
},
{
"epoch": 5.887096774193548,
"grad_norm": 0.7911174297332764,
"learning_rate": 2.1639961908007962e-05,
"loss": 0.6812,
"num_input_tokens_seen": 2476616,
"step": 6570
},
{
"epoch": 5.891577060931899,
"grad_norm": 0.3497937023639679,
"learning_rate": 2.160122289212958e-05,
"loss": 0.7069,
"num_input_tokens_seen": 2478504,
"step": 6575
},
{
"epoch": 5.896057347670251,
"grad_norm": 0.45918330550193787,
"learning_rate": 2.1562492189082023e-05,
"loss": 0.7093,
"num_input_tokens_seen": 2480296,
"step": 6580
},
{
"epoch": 5.900537634408602,
"grad_norm": 0.5026075839996338,
"learning_rate": 2.1523769893593997e-05,
"loss": 0.6777,
"num_input_tokens_seen": 2482312,
"step": 6585
},
{
"epoch": 5.905017921146953,
"grad_norm": 0.41753828525543213,
"learning_rate": 2.1485056100373646e-05,
"loss": 0.6774,
"num_input_tokens_seen": 2483976,
"step": 6590
},
{
"epoch": 5.909498207885305,
"grad_norm": 0.6379095911979675,
"learning_rate": 2.1446350904108346e-05,
"loss": 0.7132,
"num_input_tokens_seen": 2485704,
"step": 6595
},
{
"epoch": 5.913978494623656,
"grad_norm": 0.7349073886871338,
"learning_rate": 2.14076543994644e-05,
"loss": 0.6919,
"num_input_tokens_seen": 2487688,
"step": 6600
},
{
"epoch": 5.918458781362007,
"grad_norm": 0.7176439762115479,
"learning_rate": 2.1368966681086892e-05,
"loss": 0.7006,
"num_input_tokens_seen": 2489512,
"step": 6605
},
{
"epoch": 5.922939068100359,
"grad_norm": 0.687764048576355,
"learning_rate": 2.1330287843599393e-05,
"loss": 0.7288,
"num_input_tokens_seen": 2491464,
"step": 6610
},
{
"epoch": 5.92741935483871,
"grad_norm": 0.5664499998092651,
"learning_rate": 2.1291617981603766e-05,
"loss": 0.7268,
"num_input_tokens_seen": 2493416,
"step": 6615
},
{
"epoch": 5.931899641577061,
"grad_norm": 0.5868696570396423,
"learning_rate": 2.1252957189679927e-05,
"loss": 0.7139,
"num_input_tokens_seen": 2495272,
"step": 6620
},
{
"epoch": 5.936379928315413,
"grad_norm": 0.6797670125961304,
"learning_rate": 2.1214305562385592e-05,
"loss": 0.7032,
"num_input_tokens_seen": 2497192,
"step": 6625
},
{
"epoch": 5.940860215053764,
"grad_norm": 0.5387040376663208,
"learning_rate": 2.1175663194256056e-05,
"loss": 0.7143,
"num_input_tokens_seen": 2499080,
"step": 6630
},
{
"epoch": 5.945340501792114,
"grad_norm": 0.3677046000957489,
"learning_rate": 2.113703017980399e-05,
"loss": 0.687,
"num_input_tokens_seen": 2500872,
"step": 6635
},
{
"epoch": 5.949820788530466,
"grad_norm": 0.47968631982803345,
"learning_rate": 2.1098406613519178e-05,
"loss": 0.7149,
"num_input_tokens_seen": 2502760,
"step": 6640
},
{
"epoch": 5.954301075268817,
"grad_norm": 0.5777232050895691,
"learning_rate": 2.10597925898683e-05,
"loss": 0.6951,
"num_input_tokens_seen": 2504680,
"step": 6645
},
{
"epoch": 5.958781362007168,
"grad_norm": 0.44781193137168884,
"learning_rate": 2.102118820329469e-05,
"loss": 0.705,
"num_input_tokens_seen": 2506504,
"step": 6650
},
{
"epoch": 5.96326164874552,
"grad_norm": 0.41459017992019653,
"learning_rate": 2.09825935482181e-05,
"loss": 0.7021,
"num_input_tokens_seen": 2508328,
"step": 6655
},
{
"epoch": 5.967741935483871,
"grad_norm": 0.607573926448822,
"learning_rate": 2.09440087190345e-05,
"loss": 0.7145,
"num_input_tokens_seen": 2510280,
"step": 6660
},
{
"epoch": 5.972222222222222,
"grad_norm": 0.3400748074054718,
"learning_rate": 2.0905433810115828e-05,
"loss": 0.6999,
"num_input_tokens_seen": 2512264,
"step": 6665
},
{
"epoch": 5.976702508960574,
"grad_norm": 0.7823965549468994,
"learning_rate": 2.0866868915809733e-05,
"loss": 0.6573,
"num_input_tokens_seen": 2514216,
"step": 6670
},
{
"epoch": 5.981182795698925,
"grad_norm": 0.4284115433692932,
"learning_rate": 2.0828314130439408e-05,
"loss": 0.6815,
"num_input_tokens_seen": 2516104,
"step": 6675
},
{
"epoch": 5.985663082437276,
"grad_norm": 0.5594421625137329,
"learning_rate": 2.0789769548303303e-05,
"loss": 0.7105,
"num_input_tokens_seen": 2518120,
"step": 6680
},
{
"epoch": 5.990143369175628,
"grad_norm": 0.8436473608016968,
"learning_rate": 2.0751235263674893e-05,
"loss": 0.7038,
"num_input_tokens_seen": 2519880,
"step": 6685
},
{
"epoch": 5.994623655913978,
"grad_norm": 0.5207362771034241,
"learning_rate": 2.0712711370802495e-05,
"loss": 0.6996,
"num_input_tokens_seen": 2521800,
"step": 6690
},
{
"epoch": 5.999103942652329,
"grad_norm": 0.46800780296325684,
"learning_rate": 2.0674197963908997e-05,
"loss": 0.7012,
"num_input_tokens_seen": 2523592,
"step": 6695
},
{
"epoch": 6.0,
"eval_loss": 0.701448380947113,
"eval_runtime": 5.6297,
"eval_samples_per_second": 88.104,
"eval_steps_per_second": 22.026,
"num_input_tokens_seen": 2523672,
"step": 6696
},
{
"epoch": 6.003584229390681,
"grad_norm": 0.5440098643302917,
"learning_rate": 2.0635695137191646e-05,
"loss": 0.6747,
"num_input_tokens_seen": 2525048,
"step": 6700
},
{
"epoch": 6.008064516129032,
"grad_norm": 0.577639102935791,
"learning_rate": 2.0597202984821815e-05,
"loss": 0.6818,
"num_input_tokens_seen": 2526776,
"step": 6705
},
{
"epoch": 6.012544802867383,
"grad_norm": 0.5155590772628784,
"learning_rate": 2.0558721600944754e-05,
"loss": 0.6864,
"num_input_tokens_seen": 2528696,
"step": 6710
},
{
"epoch": 6.017025089605735,
"grad_norm": 0.5763823390007019,
"learning_rate": 2.0520251079679373e-05,
"loss": 0.7004,
"num_input_tokens_seen": 2530520,
"step": 6715
},
{
"epoch": 6.021505376344086,
"grad_norm": 0.6298096179962158,
"learning_rate": 2.048179151511804e-05,
"loss": 0.7013,
"num_input_tokens_seen": 2532344,
"step": 6720
},
{
"epoch": 6.025985663082437,
"grad_norm": 0.5589931607246399,
"learning_rate": 2.0443343001326303e-05,
"loss": 0.7173,
"num_input_tokens_seen": 2534264,
"step": 6725
},
{
"epoch": 6.030465949820789,
"grad_norm": 0.4019322097301483,
"learning_rate": 2.04049056323427e-05,
"loss": 0.6726,
"num_input_tokens_seen": 2535992,
"step": 6730
},
{
"epoch": 6.03494623655914,
"grad_norm": 0.7917940020561218,
"learning_rate": 2.0366479502178497e-05,
"loss": 0.6833,
"num_input_tokens_seen": 2537944,
"step": 6735
},
{
"epoch": 6.039426523297491,
"grad_norm": 0.5238997340202332,
"learning_rate": 2.0328064704817458e-05,
"loss": 0.6622,
"num_input_tokens_seen": 2539864,
"step": 6740
},
{
"epoch": 6.043906810035843,
"grad_norm": 0.4306890368461609,
"learning_rate": 2.028966133421565e-05,
"loss": 0.6686,
"num_input_tokens_seen": 2541784,
"step": 6745
},
{
"epoch": 6.048387096774194,
"grad_norm": 0.3549671471118927,
"learning_rate": 2.0251269484301193e-05,
"loss": 0.6855,
"num_input_tokens_seen": 2543640,
"step": 6750
},
{
"epoch": 6.052867383512544,
"grad_norm": 0.5074473023414612,
"learning_rate": 2.021288924897402e-05,
"loss": 0.6812,
"num_input_tokens_seen": 2545656,
"step": 6755
},
{
"epoch": 6.057347670250896,
"grad_norm": 0.597404420375824,
"learning_rate": 2.0174520722105673e-05,
"loss": 0.6669,
"num_input_tokens_seen": 2547448,
"step": 6760
},
{
"epoch": 6.061827956989247,
"grad_norm": 0.503865122795105,
"learning_rate": 2.0136163997539017e-05,
"loss": 0.6961,
"num_input_tokens_seen": 2549272,
"step": 6765
},
{
"epoch": 6.066308243727598,
"grad_norm": 0.44585174322128296,
"learning_rate": 2.0097819169088096e-05,
"loss": 0.6521,
"num_input_tokens_seen": 2551032,
"step": 6770
},
{
"epoch": 6.07078853046595,
"grad_norm": 0.5672586560249329,
"learning_rate": 2.0059486330537835e-05,
"loss": 0.6468,
"num_input_tokens_seen": 2552824,
"step": 6775
},
{
"epoch": 6.075268817204301,
"grad_norm": 0.575846791267395,
"learning_rate": 2.0021165575643837e-05,
"loss": 0.6824,
"num_input_tokens_seen": 2554520,
"step": 6780
},
{
"epoch": 6.079749103942652,
"grad_norm": 0.4153701364994049,
"learning_rate": 1.998285699813215e-05,
"loss": 0.7352,
"num_input_tokens_seen": 2556376,
"step": 6785
},
{
"epoch": 6.084229390681004,
"grad_norm": 0.8207941651344299,
"learning_rate": 1.9944560691699057e-05,
"loss": 0.744,
"num_input_tokens_seen": 2558200,
"step": 6790
},
{
"epoch": 6.088709677419355,
"grad_norm": 0.5624127388000488,
"learning_rate": 1.9906276750010792e-05,
"loss": 0.7281,
"num_input_tokens_seen": 2560312,
"step": 6795
},
{
"epoch": 6.093189964157706,
"grad_norm": 0.6707472801208496,
"learning_rate": 1.9868005266703364e-05,
"loss": 0.692,
"num_input_tokens_seen": 2562328,
"step": 6800
},
{
"epoch": 6.097670250896058,
"grad_norm": 0.6541929244995117,
"learning_rate": 1.982974633538232e-05,
"loss": 0.6828,
"num_input_tokens_seen": 2564248,
"step": 6805
},
{
"epoch": 6.102150537634409,
"grad_norm": 0.6280264258384705,
"learning_rate": 1.9791500049622505e-05,
"loss": 0.6971,
"num_input_tokens_seen": 2566296,
"step": 6810
},
{
"epoch": 6.10663082437276,
"grad_norm": 0.5198401212692261,
"learning_rate": 1.975326650296782e-05,
"loss": 0.6392,
"num_input_tokens_seen": 2568376,
"step": 6815
},
{
"epoch": 6.111111111111111,
"grad_norm": 0.6566698551177979,
"learning_rate": 1.9715045788931037e-05,
"loss": 0.7099,
"num_input_tokens_seen": 2570328,
"step": 6820
},
{
"epoch": 6.115591397849462,
"grad_norm": 0.6096717715263367,
"learning_rate": 1.967683800099349e-05,
"loss": 0.6999,
"num_input_tokens_seen": 2572184,
"step": 6825
},
{
"epoch": 6.120071684587813,
"grad_norm": 0.5329359173774719,
"learning_rate": 1.9638643232604957e-05,
"loss": 0.693,
"num_input_tokens_seen": 2573944,
"step": 6830
},
{
"epoch": 6.124551971326165,
"grad_norm": 0.5989028215408325,
"learning_rate": 1.9600461577183344e-05,
"loss": 0.6944,
"num_input_tokens_seen": 2575864,
"step": 6835
},
{
"epoch": 6.129032258064516,
"grad_norm": 0.5565618872642517,
"learning_rate": 1.9562293128114473e-05,
"loss": 0.6789,
"num_input_tokens_seen": 2577656,
"step": 6840
},
{
"epoch": 6.133512544802867,
"grad_norm": 0.4710071384906769,
"learning_rate": 1.95241379787519e-05,
"loss": 0.6942,
"num_input_tokens_seen": 2579416,
"step": 6845
},
{
"epoch": 6.137992831541219,
"grad_norm": 0.3683546781539917,
"learning_rate": 1.9485996222416607e-05,
"loss": 0.6928,
"num_input_tokens_seen": 2581208,
"step": 6850
},
{
"epoch": 6.14247311827957,
"grad_norm": 0.47386008501052856,
"learning_rate": 1.944786795239686e-05,
"loss": 0.6891,
"num_input_tokens_seen": 2583096,
"step": 6855
},
{
"epoch": 6.146953405017921,
"grad_norm": 0.6793469190597534,
"learning_rate": 1.9409753261947927e-05,
"loss": 0.6871,
"num_input_tokens_seen": 2585112,
"step": 6860
},
{
"epoch": 6.151433691756273,
"grad_norm": 0.46531930565834045,
"learning_rate": 1.9371652244291842e-05,
"loss": 0.6862,
"num_input_tokens_seen": 2586968,
"step": 6865
},
{
"epoch": 6.155913978494624,
"grad_norm": 0.47625985741615295,
"learning_rate": 1.9333564992617232e-05,
"loss": 0.691,
"num_input_tokens_seen": 2588760,
"step": 6870
},
{
"epoch": 6.160394265232975,
"grad_norm": 0.7655602693557739,
"learning_rate": 1.9295491600079035e-05,
"loss": 0.7103,
"num_input_tokens_seen": 2590680,
"step": 6875
},
{
"epoch": 6.164874551971327,
"grad_norm": 0.5129245519638062,
"learning_rate": 1.925743215979829e-05,
"loss": 0.6551,
"num_input_tokens_seen": 2592824,
"step": 6880
},
{
"epoch": 6.169354838709677,
"grad_norm": 0.5057689547538757,
"learning_rate": 1.9219386764861908e-05,
"loss": 0.6699,
"num_input_tokens_seen": 2594648,
"step": 6885
},
{
"epoch": 6.173835125448028,
"grad_norm": 0.47186315059661865,
"learning_rate": 1.9181355508322462e-05,
"loss": 0.7018,
"num_input_tokens_seen": 2596536,
"step": 6890
},
{
"epoch": 6.17831541218638,
"grad_norm": 0.6571786403656006,
"learning_rate": 1.914333848319795e-05,
"loss": 0.6503,
"num_input_tokens_seen": 2598424,
"step": 6895
},
{
"epoch": 6.182795698924731,
"grad_norm": 0.34855401515960693,
"learning_rate": 1.9105335782471534e-05,
"loss": 0.6945,
"num_input_tokens_seen": 2600216,
"step": 6900
},
{
"epoch": 6.187275985663082,
"grad_norm": 0.6422795057296753,
"learning_rate": 1.9067347499091364e-05,
"loss": 0.7219,
"num_input_tokens_seen": 2601944,
"step": 6905
},
{
"epoch": 6.191756272401434,
"grad_norm": 0.696559488773346,
"learning_rate": 1.9029373725970313e-05,
"loss": 0.7042,
"num_input_tokens_seen": 2603896,
"step": 6910
},
{
"epoch": 6.196236559139785,
"grad_norm": 0.6045291423797607,
"learning_rate": 1.8991414555985783e-05,
"loss": 0.6727,
"num_input_tokens_seen": 2605880,
"step": 6915
},
{
"epoch": 6.200716845878136,
"grad_norm": 0.5362111926078796,
"learning_rate": 1.895347008197945e-05,
"loss": 0.6899,
"num_input_tokens_seen": 2607672,
"step": 6920
},
{
"epoch": 6.205197132616488,
"grad_norm": 0.5272240042686462,
"learning_rate": 1.891554039675703e-05,
"loss": 0.6914,
"num_input_tokens_seen": 2609496,
"step": 6925
},
{
"epoch": 6.209677419354839,
"grad_norm": 0.6496487855911255,
"learning_rate": 1.8877625593088104e-05,
"loss": 0.6956,
"num_input_tokens_seen": 2611320,
"step": 6930
},
{
"epoch": 6.21415770609319,
"grad_norm": 0.5585481524467468,
"learning_rate": 1.8839725763705814e-05,
"loss": 0.664,
"num_input_tokens_seen": 2613304,
"step": 6935
},
{
"epoch": 6.218637992831542,
"grad_norm": 0.5600648522377014,
"learning_rate": 1.880184100130671e-05,
"loss": 0.6873,
"num_input_tokens_seen": 2615128,
"step": 6940
},
{
"epoch": 6.223118279569892,
"grad_norm": 0.49623891711235046,
"learning_rate": 1.876397139855047e-05,
"loss": 0.6787,
"num_input_tokens_seen": 2617016,
"step": 6945
},
{
"epoch": 6.227598566308243,
"grad_norm": 0.648080587387085,
"learning_rate": 1.8726117048059704e-05,
"loss": 0.6754,
"num_input_tokens_seen": 2618840,
"step": 6950
},
{
"epoch": 6.232078853046595,
"grad_norm": 0.611798107624054,
"learning_rate": 1.8688278042419734e-05,
"loss": 0.6912,
"num_input_tokens_seen": 2620664,
"step": 6955
},
{
"epoch": 6.236559139784946,
"grad_norm": 0.633701741695404,
"learning_rate": 1.8650454474178298e-05,
"loss": 0.7054,
"num_input_tokens_seen": 2622360,
"step": 6960
},
{
"epoch": 6.241039426523297,
"grad_norm": 0.698072612285614,
"learning_rate": 1.8612646435845443e-05,
"loss": 0.6942,
"num_input_tokens_seen": 2624120,
"step": 6965
},
{
"epoch": 6.245519713261649,
"grad_norm": 0.6582928895950317,
"learning_rate": 1.857485401989318e-05,
"loss": 0.6985,
"num_input_tokens_seen": 2625976,
"step": 6970
},
{
"epoch": 6.25,
"grad_norm": 0.6447840332984924,
"learning_rate": 1.853707731875534e-05,
"loss": 0.7099,
"num_input_tokens_seen": 2627896,
"step": 6975
},
{
"epoch": 6.254480286738351,
"grad_norm": 0.4497109651565552,
"learning_rate": 1.849931642482732e-05,
"loss": 0.6981,
"num_input_tokens_seen": 2629752,
"step": 6980
},
{
"epoch": 6.258960573476703,
"grad_norm": 0.7298440337181091,
"learning_rate": 1.8461571430465834e-05,
"loss": 0.7398,
"num_input_tokens_seen": 2631608,
"step": 6985
},
{
"epoch": 6.263440860215054,
"grad_norm": 0.46043136715888977,
"learning_rate": 1.8423842427988722e-05,
"loss": 0.6922,
"num_input_tokens_seen": 2633528,
"step": 6990
},
{
"epoch": 6.267921146953405,
"grad_norm": 0.5057744979858398,
"learning_rate": 1.83861295096747e-05,
"loss": 0.7101,
"num_input_tokens_seen": 2635544,
"step": 6995
},
{
"epoch": 6.272401433691757,
"grad_norm": 0.6320658922195435,
"learning_rate": 1.8348432767763162e-05,
"loss": 0.6801,
"num_input_tokens_seen": 2637496,
"step": 7000
},
{
"epoch": 6.276881720430108,
"grad_norm": 0.34551823139190674,
"learning_rate": 1.8310752294453924e-05,
"loss": 0.6804,
"num_input_tokens_seen": 2639320,
"step": 7005
},
{
"epoch": 6.281362007168458,
"grad_norm": 0.574415922164917,
"learning_rate": 1.8273088181907034e-05,
"loss": 0.6867,
"num_input_tokens_seen": 2641176,
"step": 7010
},
{
"epoch": 6.28584229390681,
"grad_norm": 0.5269017815589905,
"learning_rate": 1.823544052224247e-05,
"loss": 0.6902,
"num_input_tokens_seen": 2642936,
"step": 7015
},
{
"epoch": 6.290322580645161,
"grad_norm": 0.4298408031463623,
"learning_rate": 1.8197809407540028e-05,
"loss": 0.6957,
"num_input_tokens_seen": 2644696,
"step": 7020
},
{
"epoch": 6.294802867383512,
"grad_norm": 0.6325519680976868,
"learning_rate": 1.816019492983902e-05,
"loss": 0.6596,
"num_input_tokens_seen": 2646520,
"step": 7025
},
{
"epoch": 6.299283154121864,
"grad_norm": 0.39390936493873596,
"learning_rate": 1.812259718113805e-05,
"loss": 0.6794,
"num_input_tokens_seen": 2648312,
"step": 7030
},
{
"epoch": 6.303763440860215,
"grad_norm": 0.557681143283844,
"learning_rate": 1.8085016253394817e-05,
"loss": 0.6849,
"num_input_tokens_seen": 2650200,
"step": 7035
},
{
"epoch": 6.308243727598566,
"grad_norm": 0.8681994676589966,
"learning_rate": 1.8047452238525896e-05,
"loss": 0.6985,
"num_input_tokens_seen": 2651992,
"step": 7040
},
{
"epoch": 6.312724014336918,
"grad_norm": 0.6332396864891052,
"learning_rate": 1.8009905228406458e-05,
"loss": 0.7095,
"num_input_tokens_seen": 2653848,
"step": 7045
},
{
"epoch": 6.317204301075269,
"grad_norm": 0.5211427807807922,
"learning_rate": 1.797237531487012e-05,
"loss": 0.6843,
"num_input_tokens_seen": 2655672,
"step": 7050
},
{
"epoch": 6.32168458781362,
"grad_norm": 0.7297235727310181,
"learning_rate": 1.7934862589708657e-05,
"loss": 0.6913,
"num_input_tokens_seen": 2657432,
"step": 7055
},
{
"epoch": 6.326164874551972,
"grad_norm": 0.46372517943382263,
"learning_rate": 1.789736714467182e-05,
"loss": 0.6807,
"num_input_tokens_seen": 2659256,
"step": 7060
},
{
"epoch": 6.330645161290323,
"grad_norm": 0.5800030827522278,
"learning_rate": 1.7859889071467102e-05,
"loss": 0.6791,
"num_input_tokens_seen": 2661144,
"step": 7065
},
{
"epoch": 6.335125448028673,
"grad_norm": 0.5289621949195862,
"learning_rate": 1.7822428461759483e-05,
"loss": 0.692,
"num_input_tokens_seen": 2662904,
"step": 7070
},
{
"epoch": 6.339605734767025,
"grad_norm": 0.6062299609184265,
"learning_rate": 1.778498540717124e-05,
"loss": 0.6937,
"num_input_tokens_seen": 2664728,
"step": 7075
},
{
"epoch": 6.344086021505376,
"grad_norm": 0.4943602383136749,
"learning_rate": 1.7747559999281723e-05,
"loss": 0.7078,
"num_input_tokens_seen": 2666616,
"step": 7080
},
{
"epoch": 6.348566308243727,
"grad_norm": 0.4620401859283447,
"learning_rate": 1.771015232962712e-05,
"loss": 0.6676,
"num_input_tokens_seen": 2668376,
"step": 7085
},
{
"epoch": 6.353046594982079,
"grad_norm": 0.7000113129615784,
"learning_rate": 1.7672762489700227e-05,
"loss": 0.6619,
"num_input_tokens_seen": 2670168,
"step": 7090
},
{
"epoch": 6.35752688172043,
"grad_norm": 0.4342782497406006,
"learning_rate": 1.7635390570950246e-05,
"loss": 0.7001,
"num_input_tokens_seen": 2672120,
"step": 7095
},
{
"epoch": 6.362007168458781,
"grad_norm": 0.7346124053001404,
"learning_rate": 1.7598036664782508e-05,
"loss": 0.6692,
"num_input_tokens_seen": 2674232,
"step": 7100
},
{
"epoch": 6.366487455197133,
"grad_norm": 0.8609234690666199,
"learning_rate": 1.7560700862558325e-05,
"loss": 0.7069,
"num_input_tokens_seen": 2676120,
"step": 7105
},
{
"epoch": 6.370967741935484,
"grad_norm": 0.580582857131958,
"learning_rate": 1.7523383255594735e-05,
"loss": 0.7049,
"num_input_tokens_seen": 2678072,
"step": 7110
},
{
"epoch": 6.375448028673835,
"grad_norm": 0.6592505574226379,
"learning_rate": 1.7486083935164244e-05,
"loss": 0.7161,
"num_input_tokens_seen": 2679960,
"step": 7115
},
{
"epoch": 6.379928315412187,
"grad_norm": 0.6857872605323792,
"learning_rate": 1.7448802992494657e-05,
"loss": 0.6875,
"num_input_tokens_seen": 2681816,
"step": 7120
},
{
"epoch": 6.384408602150538,
"grad_norm": 0.5294128060340881,
"learning_rate": 1.7411540518768805e-05,
"loss": 0.6824,
"num_input_tokens_seen": 2683768,
"step": 7125
},
{
"epoch": 6.388888888888889,
"grad_norm": 0.6431249380111694,
"learning_rate": 1.737429660512437e-05,
"loss": 0.7197,
"num_input_tokens_seen": 2685464,
"step": 7130
},
{
"epoch": 6.393369175627241,
"grad_norm": 0.6430848240852356,
"learning_rate": 1.733707134265363e-05,
"loss": 0.6713,
"num_input_tokens_seen": 2687384,
"step": 7135
},
{
"epoch": 6.397849462365591,
"grad_norm": 0.4698632061481476,
"learning_rate": 1.7299864822403257e-05,
"loss": 0.6624,
"num_input_tokens_seen": 2689176,
"step": 7140
},
{
"epoch": 6.402329749103942,
"grad_norm": 0.7135388851165771,
"learning_rate": 1.7262677135374053e-05,
"loss": 0.7079,
"num_input_tokens_seen": 2691000,
"step": 7145
},
{
"epoch": 6.406810035842294,
"grad_norm": 0.48572251200675964,
"learning_rate": 1.72255083725208e-05,
"loss": 0.7285,
"num_input_tokens_seen": 2692824,
"step": 7150
},
{
"epoch": 6.411290322580645,
"grad_norm": 0.4359692335128784,
"learning_rate": 1.7188358624751954e-05,
"loss": 0.7156,
"num_input_tokens_seen": 2694648,
"step": 7155
},
{
"epoch": 6.415770609318996,
"grad_norm": 0.3980657160282135,
"learning_rate": 1.7151227982929477e-05,
"loss": 0.659,
"num_input_tokens_seen": 2696760,
"step": 7160
},
{
"epoch": 6.420250896057348,
"grad_norm": 0.5434011220932007,
"learning_rate": 1.711411653786861e-05,
"loss": 0.731,
"num_input_tokens_seen": 2698680,
"step": 7165
},
{
"epoch": 6.424731182795699,
"grad_norm": 0.7666317820549011,
"learning_rate": 1.7077024380337646e-05,
"loss": 0.7375,
"num_input_tokens_seen": 2700568,
"step": 7170
},
{
"epoch": 6.42921146953405,
"grad_norm": 0.7653113007545471,
"learning_rate": 1.7039951601057692e-05,
"loss": 0.6526,
"num_input_tokens_seen": 2702360,
"step": 7175
},
{
"epoch": 6.433691756272402,
"grad_norm": 0.45610731840133667,
"learning_rate": 1.7002898290702454e-05,
"loss": 0.7237,
"num_input_tokens_seen": 2704376,
"step": 7180
},
{
"epoch": 6.438172043010753,
"grad_norm": 0.46774065494537354,
"learning_rate": 1.6965864539898026e-05,
"loss": 0.6886,
"num_input_tokens_seen": 2706200,
"step": 7185
},
{
"epoch": 6.442652329749104,
"grad_norm": 0.5820562839508057,
"learning_rate": 1.6928850439222666e-05,
"loss": 0.6965,
"num_input_tokens_seen": 2708088,
"step": 7190
},
{
"epoch": 6.447132616487455,
"grad_norm": 0.9814329743385315,
"learning_rate": 1.689185607920658e-05,
"loss": 0.6905,
"num_input_tokens_seen": 2709912,
"step": 7195
},
{
"epoch": 6.451612903225806,
"grad_norm": 0.5103081464767456,
"learning_rate": 1.685488155033167e-05,
"loss": 0.6837,
"num_input_tokens_seen": 2711672,
"step": 7200
},
{
"epoch": 6.456093189964157,
"grad_norm": 0.5958290696144104,
"learning_rate": 1.681792694303136e-05,
"loss": 0.6738,
"num_input_tokens_seen": 2713528,
"step": 7205
},
{
"epoch": 6.460573476702509,
"grad_norm": 0.5504493713378906,
"learning_rate": 1.6780992347690313e-05,
"loss": 0.6801,
"num_input_tokens_seen": 2715416,
"step": 7210
},
{
"epoch": 6.46505376344086,
"grad_norm": 0.47812798619270325,
"learning_rate": 1.6744077854644282e-05,
"loss": 0.7178,
"num_input_tokens_seen": 2717464,
"step": 7215
},
{
"epoch": 6.469534050179211,
"grad_norm": 0.5012235641479492,
"learning_rate": 1.6707183554179846e-05,
"loss": 0.6902,
"num_input_tokens_seen": 2719352,
"step": 7220
},
{
"epoch": 6.474014336917563,
"grad_norm": 0.6582107543945312,
"learning_rate": 1.6670309536534172e-05,
"loss": 0.6875,
"num_input_tokens_seen": 2721368,
"step": 7225
},
{
"epoch": 6.478494623655914,
"grad_norm": 0.7172203063964844,
"learning_rate": 1.6633455891894858e-05,
"loss": 0.6896,
"num_input_tokens_seen": 2723320,
"step": 7230
},
{
"epoch": 6.482974910394265,
"grad_norm": 0.6096293330192566,
"learning_rate": 1.659662271039963e-05,
"loss": 0.7123,
"num_input_tokens_seen": 2725240,
"step": 7235
},
{
"epoch": 6.487455197132617,
"grad_norm": 0.6388385891914368,
"learning_rate": 1.65598100821362e-05,
"loss": 0.6414,
"num_input_tokens_seen": 2727288,
"step": 7240
},
{
"epoch": 6.491935483870968,
"grad_norm": 0.7723689079284668,
"learning_rate": 1.652301809714199e-05,
"loss": 0.7895,
"num_input_tokens_seen": 2729080,
"step": 7245
},
{
"epoch": 6.496415770609319,
"grad_norm": 0.5337994694709778,
"learning_rate": 1.648624684540394e-05,
"loss": 0.6863,
"num_input_tokens_seen": 2730904,
"step": 7250
},
{
"epoch": 6.5,
"eval_loss": 0.7022318840026855,
"eval_runtime": 5.6438,
"eval_samples_per_second": 87.884,
"eval_steps_per_second": 21.971,
"num_input_tokens_seen": 2732440,
"step": 7254
},
{
"epoch": 6.500896057347671,
"grad_norm": 0.5980244278907776,
"learning_rate": 1.6449496416858284e-05,
"loss": 0.6843,
"num_input_tokens_seen": 2732792,
"step": 7255
},
{
"epoch": 6.505376344086022,
"grad_norm": 0.7844721078872681,
"learning_rate": 1.6412766901390314e-05,
"loss": 0.6785,
"num_input_tokens_seen": 2734616,
"step": 7260
},
{
"epoch": 6.509856630824372,
"grad_norm": 0.6142382025718689,
"learning_rate": 1.6376058388834183e-05,
"loss": 0.7065,
"num_input_tokens_seen": 2736472,
"step": 7265
},
{
"epoch": 6.514336917562724,
"grad_norm": 0.3665355145931244,
"learning_rate": 1.633937096897266e-05,
"loss": 0.7276,
"num_input_tokens_seen": 2738360,
"step": 7270
},
{
"epoch": 6.518817204301075,
"grad_norm": 0.5914114713668823,
"learning_rate": 1.630270473153695e-05,
"loss": 0.7268,
"num_input_tokens_seen": 2740408,
"step": 7275
},
{
"epoch": 6.523297491039426,
"grad_norm": 0.5145869255065918,
"learning_rate": 1.6266059766206425e-05,
"loss": 0.6954,
"num_input_tokens_seen": 2742168,
"step": 7280
},
{
"epoch": 6.527777777777778,
"grad_norm": 0.6934862732887268,
"learning_rate": 1.6229436162608448e-05,
"loss": 0.6979,
"num_input_tokens_seen": 2743928,
"step": 7285
},
{
"epoch": 6.532258064516129,
"grad_norm": 0.6275290250778198,
"learning_rate": 1.619283401031811e-05,
"loss": 0.688,
"num_input_tokens_seen": 2745944,
"step": 7290
},
{
"epoch": 6.53673835125448,
"grad_norm": 0.49510547518730164,
"learning_rate": 1.6156253398858058e-05,
"loss": 0.6983,
"num_input_tokens_seen": 2747960,
"step": 7295
},
{
"epoch": 6.541218637992832,
"grad_norm": 0.569757878780365,
"learning_rate": 1.6119694417698246e-05,
"loss": 0.6844,
"num_input_tokens_seen": 2749848,
"step": 7300
},
{
"epoch": 6.545698924731183,
"grad_norm": 0.46662643551826477,
"learning_rate": 1.6083157156255733e-05,
"loss": 0.713,
"num_input_tokens_seen": 2751704,
"step": 7305
},
{
"epoch": 6.550179211469534,
"grad_norm": 0.46720123291015625,
"learning_rate": 1.6046641703894434e-05,
"loss": 0.7148,
"num_input_tokens_seen": 2753528,
"step": 7310
},
{
"epoch": 6.554659498207886,
"grad_norm": 0.37194913625717163,
"learning_rate": 1.6010148149924956e-05,
"loss": 0.7162,
"num_input_tokens_seen": 2755320,
"step": 7315
},
{
"epoch": 6.559139784946236,
"grad_norm": 0.4675387442111969,
"learning_rate": 1.5973676583604298e-05,
"loss": 0.683,
"num_input_tokens_seen": 2757368,
"step": 7320
},
{
"epoch": 6.563620071684587,
"grad_norm": 0.5463271141052246,
"learning_rate": 1.5937227094135733e-05,
"loss": 0.6915,
"num_input_tokens_seen": 2759224,
"step": 7325
},
{
"epoch": 6.568100358422939,
"grad_norm": 0.7810373902320862,
"learning_rate": 1.5900799770668495e-05,
"loss": 0.73,
"num_input_tokens_seen": 2761112,
"step": 7330
},
{
"epoch": 6.57258064516129,
"grad_norm": 0.5256034135818481,
"learning_rate": 1.5864394702297636e-05,
"loss": 0.6704,
"num_input_tokens_seen": 2763096,
"step": 7335
},
{
"epoch": 6.577060931899641,
"grad_norm": 0.46397244930267334,
"learning_rate": 1.5828011978063765e-05,
"loss": 0.706,
"num_input_tokens_seen": 2764888,
"step": 7340
},
{
"epoch": 6.581541218637993,
"grad_norm": 0.5735636949539185,
"learning_rate": 1.5791651686952823e-05,
"loss": 0.676,
"num_input_tokens_seen": 2766776,
"step": 7345
},
{
"epoch": 6.586021505376344,
"grad_norm": 0.6215652227401733,
"learning_rate": 1.575531391789591e-05,
"loss": 0.6988,
"num_input_tokens_seen": 2768760,
"step": 7350
},
{
"epoch": 6.590501792114695,
"grad_norm": 0.39189133048057556,
"learning_rate": 1.5718998759769025e-05,
"loss": 0.7245,
"num_input_tokens_seen": 2770584,
"step": 7355
},
{
"epoch": 6.594982078853047,
"grad_norm": 0.39220893383026123,
"learning_rate": 1.5682706301392867e-05,
"loss": 0.69,
"num_input_tokens_seen": 2772408,
"step": 7360
},
{
"epoch": 6.599462365591398,
"grad_norm": 0.5382081866264343,
"learning_rate": 1.564643663153263e-05,
"loss": 0.7117,
"num_input_tokens_seen": 2774328,
"step": 7365
},
{
"epoch": 6.603942652329749,
"grad_norm": 0.4528568685054779,
"learning_rate": 1.561018983889775e-05,
"loss": 0.6928,
"num_input_tokens_seen": 2776120,
"step": 7370
},
{
"epoch": 6.608422939068101,
"grad_norm": 0.5559325814247131,
"learning_rate": 1.557396601214171e-05,
"loss": 0.7035,
"num_input_tokens_seen": 2778040,
"step": 7375
},
{
"epoch": 6.612903225806452,
"grad_norm": 0.5265352725982666,
"learning_rate": 1.5537765239861838e-05,
"loss": 0.7336,
"num_input_tokens_seen": 2779928,
"step": 7380
},
{
"epoch": 6.617383512544803,
"grad_norm": 0.46773481369018555,
"learning_rate": 1.550158761059907e-05,
"loss": 0.7086,
"num_input_tokens_seen": 2781656,
"step": 7385
},
{
"epoch": 6.621863799283155,
"grad_norm": 0.4599006772041321,
"learning_rate": 1.5465433212837726e-05,
"loss": 0.6835,
"num_input_tokens_seen": 2783544,
"step": 7390
},
{
"epoch": 6.626344086021505,
"grad_norm": 0.5777242183685303,
"learning_rate": 1.542930213500533e-05,
"loss": 0.6867,
"num_input_tokens_seen": 2785304,
"step": 7395
},
{
"epoch": 6.630824372759856,
"grad_norm": 0.5781882405281067,
"learning_rate": 1.5393194465472337e-05,
"loss": 0.6471,
"num_input_tokens_seen": 2787256,
"step": 7400
},
{
"epoch": 6.635304659498208,
"grad_norm": 0.4682949483394623,
"learning_rate": 1.535711029255197e-05,
"loss": 0.7062,
"num_input_tokens_seen": 2789144,
"step": 7405
},
{
"epoch": 6.639784946236559,
"grad_norm": 0.3891659080982208,
"learning_rate": 1.532104970449999e-05,
"loss": 0.6508,
"num_input_tokens_seen": 2791000,
"step": 7410
},
{
"epoch": 6.64426523297491,
"grad_norm": 0.5683819055557251,
"learning_rate": 1.5285012789514446e-05,
"loss": 0.6994,
"num_input_tokens_seen": 2793016,
"step": 7415
},
{
"epoch": 6.648745519713262,
"grad_norm": 0.4903818666934967,
"learning_rate": 1.5248999635735516e-05,
"loss": 0.7113,
"num_input_tokens_seen": 2795000,
"step": 7420
},
{
"epoch": 6.653225806451613,
"grad_norm": 0.6192786693572998,
"learning_rate": 1.5213010331245259e-05,
"loss": 0.7565,
"num_input_tokens_seen": 2796984,
"step": 7425
},
{
"epoch": 6.657706093189964,
"grad_norm": 0.6645828485488892,
"learning_rate": 1.5177044964067372e-05,
"loss": 0.6786,
"num_input_tokens_seen": 2798872,
"step": 7430
},
{
"epoch": 6.662186379928316,
"grad_norm": 0.6101380586624146,
"learning_rate": 1.5141103622167041e-05,
"loss": 0.6987,
"num_input_tokens_seen": 2800888,
"step": 7435
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.6002082228660583,
"learning_rate": 1.5105186393450665e-05,
"loss": 0.7221,
"num_input_tokens_seen": 2802776,
"step": 7440
},
{
"epoch": 6.671146953405018,
"grad_norm": 0.44711893796920776,
"learning_rate": 1.5069293365765685e-05,
"loss": 0.6928,
"num_input_tokens_seen": 2804504,
"step": 7445
},
{
"epoch": 6.675627240143369,
"grad_norm": 0.4706989824771881,
"learning_rate": 1.5033424626900353e-05,
"loss": 0.7042,
"num_input_tokens_seen": 2806424,
"step": 7450
},
{
"epoch": 6.68010752688172,
"grad_norm": 0.4829060435295105,
"learning_rate": 1.4997580264583488e-05,
"loss": 0.6754,
"num_input_tokens_seen": 2808312,
"step": 7455
},
{
"epoch": 6.684587813620071,
"grad_norm": 0.40096986293792725,
"learning_rate": 1.4961760366484307e-05,
"loss": 0.6713,
"num_input_tokens_seen": 2810232,
"step": 7460
},
{
"epoch": 6.689068100358423,
"grad_norm": 0.6513391137123108,
"learning_rate": 1.492596502021219e-05,
"loss": 0.6462,
"num_input_tokens_seen": 2812248,
"step": 7465
},
{
"epoch": 6.693548387096774,
"grad_norm": 0.5434605479240417,
"learning_rate": 1.4890194313316478e-05,
"loss": 0.6619,
"num_input_tokens_seen": 2814168,
"step": 7470
},
{
"epoch": 6.698028673835125,
"grad_norm": 0.5546020865440369,
"learning_rate": 1.4854448333286222e-05,
"loss": 0.6814,
"num_input_tokens_seen": 2816088,
"step": 7475
},
{
"epoch": 6.702508960573477,
"grad_norm": 0.5495643615722656,
"learning_rate": 1.4818727167550025e-05,
"loss": 0.7079,
"num_input_tokens_seen": 2817944,
"step": 7480
},
{
"epoch": 6.706989247311828,
"grad_norm": 0.5469850897789001,
"learning_rate": 1.478303090347577e-05,
"loss": 0.6782,
"num_input_tokens_seen": 2819928,
"step": 7485
},
{
"epoch": 6.711469534050179,
"grad_norm": 0.41618314385414124,
"learning_rate": 1.474735962837045e-05,
"loss": 0.7023,
"num_input_tokens_seen": 2821880,
"step": 7490
},
{
"epoch": 6.715949820788531,
"grad_norm": 0.4919305145740509,
"learning_rate": 1.4711713429479945e-05,
"loss": 0.6803,
"num_input_tokens_seen": 2823800,
"step": 7495
},
{
"epoch": 6.720430107526882,
"grad_norm": 0.7194111347198486,
"learning_rate": 1.4676092393988791e-05,
"loss": 0.674,
"num_input_tokens_seen": 2825656,
"step": 7500
},
{
"epoch": 6.724910394265233,
"grad_norm": 0.4722954332828522,
"learning_rate": 1.4640496609019993e-05,
"loss": 0.6954,
"num_input_tokens_seen": 2827512,
"step": 7505
},
{
"epoch": 6.729390681003585,
"grad_norm": 0.3946280777454376,
"learning_rate": 1.4604926161634768e-05,
"loss": 0.6972,
"num_input_tokens_seen": 2829336,
"step": 7510
},
{
"epoch": 6.733870967741936,
"grad_norm": 0.35536396503448486,
"learning_rate": 1.45693811388324e-05,
"loss": 0.696,
"num_input_tokens_seen": 2831224,
"step": 7515
},
{
"epoch": 6.738351254480286,
"grad_norm": 0.5042831301689148,
"learning_rate": 1.4533861627549953e-05,
"loss": 0.6795,
"num_input_tokens_seen": 2833176,
"step": 7520
},
{
"epoch": 6.742831541218638,
"grad_norm": 0.6712019443511963,
"learning_rate": 1.4498367714662128e-05,
"loss": 0.6755,
"num_input_tokens_seen": 2835096,
"step": 7525
},
{
"epoch": 6.747311827956989,
"grad_norm": 0.431209921836853,
"learning_rate": 1.4462899486980994e-05,
"loss": 0.6954,
"num_input_tokens_seen": 2836952,
"step": 7530
},
{
"epoch": 6.75179211469534,
"grad_norm": 0.55506432056427,
"learning_rate": 1.4427457031255803e-05,
"loss": 0.682,
"num_input_tokens_seen": 2838936,
"step": 7535
},
{
"epoch": 6.756272401433692,
"grad_norm": 0.6920475363731384,
"learning_rate": 1.4392040434172773e-05,
"loss": 0.7126,
"num_input_tokens_seen": 2840888,
"step": 7540
},
{
"epoch": 6.760752688172043,
"grad_norm": 0.775012731552124,
"learning_rate": 1.4356649782354872e-05,
"loss": 0.6911,
"num_input_tokens_seen": 2842776,
"step": 7545
},
{
"epoch": 6.765232974910394,
"grad_norm": 0.41614946722984314,
"learning_rate": 1.432128516236163e-05,
"loss": 0.6598,
"num_input_tokens_seen": 2844568,
"step": 7550
},
{
"epoch": 6.769713261648746,
"grad_norm": 0.4207038879394531,
"learning_rate": 1.4285946660688888e-05,
"loss": 0.697,
"num_input_tokens_seen": 2846328,
"step": 7555
},
{
"epoch": 6.774193548387097,
"grad_norm": 0.5408599972724915,
"learning_rate": 1.4250634363768601e-05,
"loss": 0.7036,
"num_input_tokens_seen": 2848216,
"step": 7560
},
{
"epoch": 6.778673835125448,
"grad_norm": 0.617653489112854,
"learning_rate": 1.4215348357968669e-05,
"loss": 0.7178,
"num_input_tokens_seen": 2850104,
"step": 7565
},
{
"epoch": 6.7831541218638,
"grad_norm": 0.6825722455978394,
"learning_rate": 1.4180088729592633e-05,
"loss": 0.6723,
"num_input_tokens_seen": 2851960,
"step": 7570
},
{
"epoch": 6.78763440860215,
"grad_norm": 0.48724040389060974,
"learning_rate": 1.4144855564879553e-05,
"loss": 0.7048,
"num_input_tokens_seen": 2853944,
"step": 7575
},
{
"epoch": 6.792114695340501,
"grad_norm": 0.5684961080551147,
"learning_rate": 1.410964895000377e-05,
"loss": 0.7017,
"num_input_tokens_seen": 2855672,
"step": 7580
},
{
"epoch": 6.796594982078853,
"grad_norm": 0.586720883846283,
"learning_rate": 1.4074468971074673e-05,
"loss": 0.696,
"num_input_tokens_seen": 2857496,
"step": 7585
},
{
"epoch": 6.801075268817204,
"grad_norm": 0.44053155183792114,
"learning_rate": 1.4039315714136502e-05,
"loss": 0.7209,
"num_input_tokens_seen": 2859320,
"step": 7590
},
{
"epoch": 6.805555555555555,
"grad_norm": 0.537570595741272,
"learning_rate": 1.4004189265168149e-05,
"loss": 0.6767,
"num_input_tokens_seen": 2861272,
"step": 7595
},
{
"epoch": 6.810035842293907,
"grad_norm": 0.4736902117729187,
"learning_rate": 1.3969089710082927e-05,
"loss": 0.6851,
"num_input_tokens_seen": 2863256,
"step": 7600
},
{
"epoch": 6.814516129032258,
"grad_norm": 0.4806761145591736,
"learning_rate": 1.3934017134728397e-05,
"loss": 0.6803,
"num_input_tokens_seen": 2865048,
"step": 7605
},
{
"epoch": 6.818996415770609,
"grad_norm": 0.563230037689209,
"learning_rate": 1.3898971624886101e-05,
"loss": 0.7188,
"num_input_tokens_seen": 2866904,
"step": 7610
},
{
"epoch": 6.823476702508961,
"grad_norm": 0.6949727535247803,
"learning_rate": 1.386395326627139e-05,
"loss": 0.6784,
"num_input_tokens_seen": 2868920,
"step": 7615
},
{
"epoch": 6.827956989247312,
"grad_norm": 0.5937641263008118,
"learning_rate": 1.3828962144533242e-05,
"loss": 0.6957,
"num_input_tokens_seen": 2870840,
"step": 7620
},
{
"epoch": 6.832437275985663,
"grad_norm": 0.5545377135276794,
"learning_rate": 1.379399834525395e-05,
"loss": 0.7275,
"num_input_tokens_seen": 2872792,
"step": 7625
},
{
"epoch": 6.836917562724015,
"grad_norm": 0.5834463238716125,
"learning_rate": 1.3759061953949054e-05,
"loss": 0.6668,
"num_input_tokens_seen": 2874552,
"step": 7630
},
{
"epoch": 6.841397849462366,
"grad_norm": 0.6372240781784058,
"learning_rate": 1.3724153056067013e-05,
"loss": 0.6821,
"num_input_tokens_seen": 2876312,
"step": 7635
},
{
"epoch": 6.845878136200717,
"grad_norm": 0.4398591220378876,
"learning_rate": 1.3689271736989046e-05,
"loss": 0.6746,
"num_input_tokens_seen": 2878200,
"step": 7640
},
{
"epoch": 6.850358422939068,
"grad_norm": 0.43143922090530396,
"learning_rate": 1.3654418082028956e-05,
"loss": 0.692,
"num_input_tokens_seen": 2879992,
"step": 7645
},
{
"epoch": 6.854838709677419,
"grad_norm": 0.5556138753890991,
"learning_rate": 1.3619592176432816e-05,
"loss": 0.7245,
"num_input_tokens_seen": 2881816,
"step": 7650
},
{
"epoch": 6.85931899641577,
"grad_norm": 0.8755431771278381,
"learning_rate": 1.3584794105378904e-05,
"loss": 0.7197,
"num_input_tokens_seen": 2883832,
"step": 7655
},
{
"epoch": 6.863799283154122,
"grad_norm": 0.5189265608787537,
"learning_rate": 1.3550023953977367e-05,
"loss": 0.6686,
"num_input_tokens_seen": 2885848,
"step": 7660
},
{
"epoch": 6.868279569892473,
"grad_norm": 0.6444762945175171,
"learning_rate": 1.3515281807270075e-05,
"loss": 0.7059,
"num_input_tokens_seen": 2887864,
"step": 7665
},
{
"epoch": 6.872759856630824,
"grad_norm": 0.60382479429245,
"learning_rate": 1.3480567750230433e-05,
"loss": 0.7083,
"num_input_tokens_seen": 2889816,
"step": 7670
},
{
"epoch": 6.877240143369176,
"grad_norm": 0.5019137859344482,
"learning_rate": 1.344588186776311e-05,
"loss": 0.7427,
"num_input_tokens_seen": 2891608,
"step": 7675
},
{
"epoch": 6.881720430107527,
"grad_norm": 0.37922972440719604,
"learning_rate": 1.3411224244703873e-05,
"loss": 0.7213,
"num_input_tokens_seen": 2893528,
"step": 7680
},
{
"epoch": 6.886200716845878,
"grad_norm": 0.537884533405304,
"learning_rate": 1.3376594965819378e-05,
"loss": 0.6898,
"num_input_tokens_seen": 2895576,
"step": 7685
},
{
"epoch": 6.89068100358423,
"grad_norm": 0.4329852759838104,
"learning_rate": 1.3341994115806943e-05,
"loss": 0.6824,
"num_input_tokens_seen": 2897592,
"step": 7690
},
{
"epoch": 6.895161290322581,
"grad_norm": 0.36487624049186707,
"learning_rate": 1.3307421779294377e-05,
"loss": 0.6868,
"num_input_tokens_seen": 2899384,
"step": 7695
},
{
"epoch": 6.899641577060932,
"grad_norm": 0.5098947286605835,
"learning_rate": 1.3272878040839742e-05,
"loss": 0.6745,
"num_input_tokens_seen": 2901240,
"step": 7700
},
{
"epoch": 6.904121863799283,
"grad_norm": 0.4954400360584259,
"learning_rate": 1.3238362984931113e-05,
"loss": 0.6972,
"num_input_tokens_seen": 2903224,
"step": 7705
},
{
"epoch": 6.908602150537634,
"grad_norm": 0.3625122904777527,
"learning_rate": 1.3203876695986478e-05,
"loss": 0.691,
"num_input_tokens_seen": 2905112,
"step": 7710
},
{
"epoch": 6.913082437275985,
"grad_norm": 0.5607591271400452,
"learning_rate": 1.3169419258353433e-05,
"loss": 0.7021,
"num_input_tokens_seen": 2907192,
"step": 7715
},
{
"epoch": 6.917562724014337,
"grad_norm": 0.4479368031024933,
"learning_rate": 1.313499075630899e-05,
"loss": 0.709,
"num_input_tokens_seen": 2909016,
"step": 7720
},
{
"epoch": 6.922043010752688,
"grad_norm": 0.6518425345420837,
"learning_rate": 1.3100591274059431e-05,
"loss": 0.6698,
"num_input_tokens_seen": 2910968,
"step": 7725
},
{
"epoch": 6.926523297491039,
"grad_norm": 0.48006075620651245,
"learning_rate": 1.3066220895740039e-05,
"loss": 0.7108,
"num_input_tokens_seen": 2913080,
"step": 7730
},
{
"epoch": 6.931003584229391,
"grad_norm": 0.5145459771156311,
"learning_rate": 1.3031879705414907e-05,
"loss": 0.6862,
"num_input_tokens_seen": 2914968,
"step": 7735
},
{
"epoch": 6.935483870967742,
"grad_norm": 0.35982221364974976,
"learning_rate": 1.2997567787076747e-05,
"loss": 0.6826,
"num_input_tokens_seen": 2916824,
"step": 7740
},
{
"epoch": 6.939964157706093,
"grad_norm": 0.5295773148536682,
"learning_rate": 1.296328522464667e-05,
"loss": 0.7037,
"num_input_tokens_seen": 2919032,
"step": 7745
},
{
"epoch": 6.944444444444445,
"grad_norm": 0.4160950183868408,
"learning_rate": 1.2929032101974009e-05,
"loss": 0.6743,
"num_input_tokens_seen": 2920920,
"step": 7750
},
{
"epoch": 6.948924731182796,
"grad_norm": 0.502730667591095,
"learning_rate": 1.289480850283607e-05,
"loss": 0.6956,
"num_input_tokens_seen": 2923032,
"step": 7755
},
{
"epoch": 6.953405017921147,
"grad_norm": 0.5727908611297607,
"learning_rate": 1.2860614510937955e-05,
"loss": 0.6978,
"num_input_tokens_seen": 2924856,
"step": 7760
},
{
"epoch": 6.957885304659499,
"grad_norm": 0.5175768136978149,
"learning_rate": 1.2826450209912355e-05,
"loss": 0.7139,
"num_input_tokens_seen": 2926680,
"step": 7765
},
{
"epoch": 6.96236559139785,
"grad_norm": 0.63689786195755,
"learning_rate": 1.2792315683319328e-05,
"loss": 0.6853,
"num_input_tokens_seen": 2928568,
"step": 7770
},
{
"epoch": 6.9668458781362,
"grad_norm": 0.6053296327590942,
"learning_rate": 1.2758211014646143e-05,
"loss": 0.7121,
"num_input_tokens_seen": 2930424,
"step": 7775
},
{
"epoch": 6.971326164874552,
"grad_norm": 0.4488166272640228,
"learning_rate": 1.2724136287307009e-05,
"loss": 0.6899,
"num_input_tokens_seen": 2932376,
"step": 7780
},
{
"epoch": 6.975806451612903,
"grad_norm": 0.5750177502632141,
"learning_rate": 1.2690091584642916e-05,
"loss": 0.7093,
"num_input_tokens_seen": 2934072,
"step": 7785
},
{
"epoch": 6.980286738351254,
"grad_norm": 0.4811491370201111,
"learning_rate": 1.2656076989921417e-05,
"loss": 0.6774,
"num_input_tokens_seen": 2935896,
"step": 7790
},
{
"epoch": 6.984767025089606,
"grad_norm": 0.6178358793258667,
"learning_rate": 1.2622092586336415e-05,
"loss": 0.6656,
"num_input_tokens_seen": 2937720,
"step": 7795
},
{
"epoch": 6.989247311827957,
"grad_norm": 0.814631998538971,
"learning_rate": 1.2588138457008e-05,
"loss": 0.694,
"num_input_tokens_seen": 2939480,
"step": 7800
},
{
"epoch": 6.993727598566308,
"grad_norm": 0.555487871170044,
"learning_rate": 1.2554214684982191e-05,
"loss": 0.6775,
"num_input_tokens_seen": 2941304,
"step": 7805
},
{
"epoch": 6.99820788530466,
"grad_norm": 0.4906958341598511,
"learning_rate": 1.2520321353230769e-05,
"loss": 0.6765,
"num_input_tokens_seen": 2943256,
"step": 7810
},
{
"epoch": 7.0,
"eval_loss": 0.7003687620162964,
"eval_runtime": 5.6201,
"eval_samples_per_second": 88.255,
"eval_steps_per_second": 22.064,
"num_input_tokens_seen": 2943688,
"step": 7812
},
{
"epoch": 7.002688172043011,
"grad_norm": 0.6357385516166687,
"learning_rate": 1.248645854465105e-05,
"loss": 0.6548,
"num_input_tokens_seen": 2944680,
"step": 7815
},
{
"epoch": 7.007168458781362,
"grad_norm": 0.4822031557559967,
"learning_rate": 1.2452626342065702e-05,
"loss": 0.683,
"num_input_tokens_seen": 2946696,
"step": 7820
},
{
"epoch": 7.011648745519714,
"grad_norm": 0.5621390342712402,
"learning_rate": 1.2418824828222559e-05,
"loss": 0.6629,
"num_input_tokens_seen": 2948616,
"step": 7825
},
{
"epoch": 7.016129032258065,
"grad_norm": 0.5329768061637878,
"learning_rate": 1.2385054085794361e-05,
"loss": 0.6876,
"num_input_tokens_seen": 2950472,
"step": 7830
},
{
"epoch": 7.020609318996415,
"grad_norm": 0.49358680844306946,
"learning_rate": 1.2351314197378597e-05,
"loss": 0.7007,
"num_input_tokens_seen": 2952392,
"step": 7835
},
{
"epoch": 7.025089605734767,
"grad_norm": 0.5468727946281433,
"learning_rate": 1.2317605245497323e-05,
"loss": 0.6655,
"num_input_tokens_seen": 2954248,
"step": 7840
},
{
"epoch": 7.029569892473118,
"grad_norm": 0.5755560994148254,
"learning_rate": 1.2283927312596874e-05,
"loss": 0.6909,
"num_input_tokens_seen": 2956072,
"step": 7845
},
{
"epoch": 7.034050179211469,
"grad_norm": 0.48634764552116394,
"learning_rate": 1.2250280481047746e-05,
"loss": 0.6717,
"num_input_tokens_seen": 2958024,
"step": 7850
},
{
"epoch": 7.038530465949821,
"grad_norm": 0.7252983450889587,
"learning_rate": 1.2216664833144386e-05,
"loss": 0.7125,
"num_input_tokens_seen": 2959816,
"step": 7855
},
{
"epoch": 7.043010752688172,
"grad_norm": 0.7096565961837769,
"learning_rate": 1.2183080451104937e-05,
"loss": 0.7341,
"num_input_tokens_seen": 2961672,
"step": 7860
},
{
"epoch": 7.047491039426523,
"grad_norm": 0.5406831502914429,
"learning_rate": 1.2149527417071107e-05,
"loss": 0.6632,
"num_input_tokens_seen": 2963688,
"step": 7865
},
{
"epoch": 7.051971326164875,
"grad_norm": 0.5515265464782715,
"learning_rate": 1.2116005813107891e-05,
"loss": 0.6838,
"num_input_tokens_seen": 2965576,
"step": 7870
},
{
"epoch": 7.056451612903226,
"grad_norm": 0.42072048783302307,
"learning_rate": 1.2082515721203427e-05,
"loss": 0.6974,
"num_input_tokens_seen": 2967464,
"step": 7875
},
{
"epoch": 7.060931899641577,
"grad_norm": 0.46907660365104675,
"learning_rate": 1.2049057223268807e-05,
"loss": 0.6855,
"num_input_tokens_seen": 2969416,
"step": 7880
},
{
"epoch": 7.065412186379929,
"grad_norm": 0.47050419449806213,
"learning_rate": 1.2015630401137812e-05,
"loss": 0.6843,
"num_input_tokens_seen": 2971304,
"step": 7885
},
{
"epoch": 7.06989247311828,
"grad_norm": 0.3717748820781708,
"learning_rate": 1.198223533656676e-05,
"loss": 0.6866,
"num_input_tokens_seen": 2973096,
"step": 7890
},
{
"epoch": 7.07437275985663,
"grad_norm": 0.690002977848053,
"learning_rate": 1.1948872111234327e-05,
"loss": 0.669,
"num_input_tokens_seen": 2975080,
"step": 7895
},
{
"epoch": 7.078853046594982,
"grad_norm": 0.6414685845375061,
"learning_rate": 1.191554080674125e-05,
"loss": 0.6924,
"num_input_tokens_seen": 2977064,
"step": 7900
},
{
"epoch": 7.083333333333333,
"grad_norm": 0.6106062531471252,
"learning_rate": 1.188224150461026e-05,
"loss": 0.6997,
"num_input_tokens_seen": 2979016,
"step": 7905
},
{
"epoch": 7.087813620071684,
"grad_norm": 0.7638527750968933,
"learning_rate": 1.1848974286285774e-05,
"loss": 0.6863,
"num_input_tokens_seen": 2980904,
"step": 7910
},
{
"epoch": 7.092293906810036,
"grad_norm": 0.5625856518745422,
"learning_rate": 1.181573923313375e-05,
"loss": 0.6968,
"num_input_tokens_seen": 2982792,
"step": 7915
},
{
"epoch": 7.096774193548387,
"grad_norm": 0.5783429145812988,
"learning_rate": 1.1782536426441498e-05,
"loss": 0.6761,
"num_input_tokens_seen": 2984552,
"step": 7920
},
{
"epoch": 7.101254480286738,
"grad_norm": 0.4028991162776947,
"learning_rate": 1.17493659474174e-05,
"loss": 0.6894,
"num_input_tokens_seen": 2986280,
"step": 7925
},
{
"epoch": 7.10573476702509,
"grad_norm": 0.5815252661705017,
"learning_rate": 1.1716227877190839e-05,
"loss": 0.6844,
"num_input_tokens_seen": 2988200,
"step": 7930
},
{
"epoch": 7.110215053763441,
"grad_norm": 0.5259630084037781,
"learning_rate": 1.1683122296811883e-05,
"loss": 0.663,
"num_input_tokens_seen": 2989928,
"step": 7935
},
{
"epoch": 7.114695340501792,
"grad_norm": 0.6334452629089355,
"learning_rate": 1.1650049287251147e-05,
"loss": 0.6699,
"num_input_tokens_seen": 2991752,
"step": 7940
},
{
"epoch": 7.119175627240144,
"grad_norm": 0.5388948917388916,
"learning_rate": 1.1617008929399606e-05,
"loss": 0.6883,
"num_input_tokens_seen": 2993640,
"step": 7945
},
{
"epoch": 7.123655913978495,
"grad_norm": 0.712680459022522,
"learning_rate": 1.1584001304068349e-05,
"loss": 0.6778,
"num_input_tokens_seen": 2995528,
"step": 7950
},
{
"epoch": 7.128136200716846,
"grad_norm": 0.5485901236534119,
"learning_rate": 1.155102649198841e-05,
"loss": 0.6874,
"num_input_tokens_seen": 2997512,
"step": 7955
},
{
"epoch": 7.132616487455197,
"grad_norm": 0.5746596455574036,
"learning_rate": 1.1518084573810575e-05,
"loss": 0.7271,
"num_input_tokens_seen": 2999272,
"step": 7960
},
{
"epoch": 7.137096774193548,
"grad_norm": 0.5184845924377441,
"learning_rate": 1.1485175630105163e-05,
"loss": 0.6905,
"num_input_tokens_seen": 3001160,
"step": 7965
},
{
"epoch": 7.141577060931899,
"grad_norm": 0.49086540937423706,
"learning_rate": 1.1452299741361875e-05,
"loss": 0.6594,
"num_input_tokens_seen": 3003048,
"step": 7970
},
{
"epoch": 7.146057347670251,
"grad_norm": 0.548001229763031,
"learning_rate": 1.141945698798954e-05,
"loss": 0.7119,
"num_input_tokens_seen": 3005224,
"step": 7975
},
{
"epoch": 7.150537634408602,
"grad_norm": 0.47451603412628174,
"learning_rate": 1.1386647450315924e-05,
"loss": 0.6799,
"num_input_tokens_seen": 3007112,
"step": 7980
},
{
"epoch": 7.155017921146953,
"grad_norm": 0.6413478851318359,
"learning_rate": 1.1353871208587602e-05,
"loss": 0.7233,
"num_input_tokens_seen": 3009000,
"step": 7985
},
{
"epoch": 7.159498207885305,
"grad_norm": 0.6764240264892578,
"learning_rate": 1.132112834296967e-05,
"loss": 0.6607,
"num_input_tokens_seen": 3010920,
"step": 7990
},
{
"epoch": 7.163978494623656,
"grad_norm": 0.4934854209423065,
"learning_rate": 1.1288418933545624e-05,
"loss": 0.7034,
"num_input_tokens_seen": 3012936,
"step": 7995
},
{
"epoch": 7.168458781362007,
"grad_norm": 0.32125502824783325,
"learning_rate": 1.1255743060317115e-05,
"loss": 0.6992,
"num_input_tokens_seen": 3014600,
"step": 8000
},
{
"epoch": 7.172939068100359,
"grad_norm": 0.5356723070144653,
"learning_rate": 1.1223100803203767e-05,
"loss": 0.6913,
"num_input_tokens_seen": 3016552,
"step": 8005
},
{
"epoch": 7.17741935483871,
"grad_norm": 0.3970068395137787,
"learning_rate": 1.1190492242042989e-05,
"loss": 0.7154,
"num_input_tokens_seen": 3018376,
"step": 8010
},
{
"epoch": 7.181899641577061,
"grad_norm": 0.5137478113174438,
"learning_rate": 1.1157917456589778e-05,
"loss": 0.7028,
"num_input_tokens_seen": 3020296,
"step": 8015
},
{
"epoch": 7.186379928315413,
"grad_norm": 0.6350739598274231,
"learning_rate": 1.1125376526516511e-05,
"loss": 0.6737,
"num_input_tokens_seen": 3022120,
"step": 8020
},
{
"epoch": 7.190860215053763,
"grad_norm": 0.5553760528564453,
"learning_rate": 1.109286953141279e-05,
"loss": 0.6878,
"num_input_tokens_seen": 3023816,
"step": 8025
},
{
"epoch": 7.195340501792114,
"grad_norm": 0.42521554231643677,
"learning_rate": 1.1060396550785182e-05,
"loss": 0.6979,
"num_input_tokens_seen": 3025672,
"step": 8030
},
{
"epoch": 7.199820788530466,
"grad_norm": 0.5245974659919739,
"learning_rate": 1.1027957664057079e-05,
"loss": 0.7418,
"num_input_tokens_seen": 3027496,
"step": 8035
},
{
"epoch": 7.204301075268817,
"grad_norm": 0.4561592638492584,
"learning_rate": 1.099555295056848e-05,
"loss": 0.6645,
"num_input_tokens_seen": 3029288,
"step": 8040
},
{
"epoch": 7.208781362007168,
"grad_norm": 0.46861767768859863,
"learning_rate": 1.0963182489575797e-05,
"loss": 0.6933,
"num_input_tokens_seen": 3031080,
"step": 8045
},
{
"epoch": 7.21326164874552,
"grad_norm": 0.4887754023075104,
"learning_rate": 1.0930846360251684e-05,
"loss": 0.6598,
"num_input_tokens_seen": 3033128,
"step": 8050
},
{
"epoch": 7.217741935483871,
"grad_norm": 0.7280912399291992,
"learning_rate": 1.0898544641684816e-05,
"loss": 0.6929,
"num_input_tokens_seen": 3035144,
"step": 8055
},
{
"epoch": 7.222222222222222,
"grad_norm": 0.5493155121803284,
"learning_rate": 1.0866277412879695e-05,
"loss": 0.7104,
"num_input_tokens_seen": 3037032,
"step": 8060
},
{
"epoch": 7.226702508960574,
"grad_norm": 0.3439621925354004,
"learning_rate": 1.0834044752756478e-05,
"loss": 0.6971,
"num_input_tokens_seen": 3038952,
"step": 8065
},
{
"epoch": 7.231182795698925,
"grad_norm": 0.5505177974700928,
"learning_rate": 1.0801846740150759e-05,
"loss": 0.6735,
"num_input_tokens_seen": 3041000,
"step": 8070
},
{
"epoch": 7.235663082437276,
"grad_norm": 0.44457921385765076,
"learning_rate": 1.0769683453813426e-05,
"loss": 0.7112,
"num_input_tokens_seen": 3042824,
"step": 8075
},
{
"epoch": 7.240143369175628,
"grad_norm": 0.41731348633766174,
"learning_rate": 1.0737554972410391e-05,
"loss": 0.6924,
"num_input_tokens_seen": 3044648,
"step": 8080
},
{
"epoch": 7.244623655913978,
"grad_norm": 0.6889303922653198,
"learning_rate": 1.0705461374522463e-05,
"loss": 0.6767,
"num_input_tokens_seen": 3046664,
"step": 8085
},
{
"epoch": 7.249103942652329,
"grad_norm": 0.4339003264904022,
"learning_rate": 1.0673402738645116e-05,
"loss": 0.7029,
"num_input_tokens_seen": 3048456,
"step": 8090
},
{
"epoch": 7.253584229390681,
"grad_norm": 0.5571053624153137,
"learning_rate": 1.0641379143188321e-05,
"loss": 0.7222,
"num_input_tokens_seen": 3050408,
"step": 8095
},
{
"epoch": 7.258064516129032,
"grad_norm": 0.5285934805870056,
"learning_rate": 1.060939066647636e-05,
"loss": 0.6845,
"num_input_tokens_seen": 3052264,
"step": 8100
},
{
"epoch": 7.262544802867383,
"grad_norm": 0.5844146013259888,
"learning_rate": 1.0577437386747601e-05,
"loss": 0.6693,
"num_input_tokens_seen": 3054184,
"step": 8105
},
{
"epoch": 7.267025089605735,
"grad_norm": 0.5706028342247009,
"learning_rate": 1.054551938215432e-05,
"loss": 0.7184,
"num_input_tokens_seen": 3056008,
"step": 8110
},
{
"epoch": 7.271505376344086,
"grad_norm": 0.6662572622299194,
"learning_rate": 1.0513636730762558e-05,
"loss": 0.6994,
"num_input_tokens_seen": 3057992,
"step": 8115
},
{
"epoch": 7.275985663082437,
"grad_norm": 0.5411928296089172,
"learning_rate": 1.0481789510551821e-05,
"loss": 0.6712,
"num_input_tokens_seen": 3059720,
"step": 8120
},
{
"epoch": 7.280465949820789,
"grad_norm": 0.46258223056793213,
"learning_rate": 1.044997779941502e-05,
"loss": 0.6983,
"num_input_tokens_seen": 3061576,
"step": 8125
},
{
"epoch": 7.28494623655914,
"grad_norm": 0.5718064308166504,
"learning_rate": 1.0418201675158182e-05,
"loss": 0.707,
"num_input_tokens_seen": 3063368,
"step": 8130
},
{
"epoch": 7.289426523297491,
"grad_norm": 0.5663242936134338,
"learning_rate": 1.0386461215500296e-05,
"loss": 0.6284,
"num_input_tokens_seen": 3065128,
"step": 8135
},
{
"epoch": 7.293906810035843,
"grad_norm": 0.647124171257019,
"learning_rate": 1.0354756498073156e-05,
"loss": 0.6919,
"num_input_tokens_seen": 3067144,
"step": 8140
},
{
"epoch": 7.298387096774194,
"grad_norm": 0.5559285283088684,
"learning_rate": 1.032308760042108e-05,
"loss": 0.6971,
"num_input_tokens_seen": 3069064,
"step": 8145
},
{
"epoch": 7.302867383512545,
"grad_norm": 0.5535130500793457,
"learning_rate": 1.0291454600000805e-05,
"loss": 0.6837,
"num_input_tokens_seen": 3071048,
"step": 8150
},
{
"epoch": 7.307347670250896,
"grad_norm": 0.48422160744667053,
"learning_rate": 1.0259857574181292e-05,
"loss": 0.6874,
"num_input_tokens_seen": 3073032,
"step": 8155
},
{
"epoch": 7.311827956989247,
"grad_norm": 0.4725182354450226,
"learning_rate": 1.0228296600243483e-05,
"loss": 0.666,
"num_input_tokens_seen": 3074824,
"step": 8160
},
{
"epoch": 7.316308243727598,
"grad_norm": 0.5510170459747314,
"learning_rate": 1.0196771755380145e-05,
"loss": 0.6881,
"num_input_tokens_seen": 3076712,
"step": 8165
},
{
"epoch": 7.32078853046595,
"grad_norm": 0.22946563363075256,
"learning_rate": 1.016528311669571e-05,
"loss": 0.7115,
"num_input_tokens_seen": 3078536,
"step": 8170
},
{
"epoch": 7.325268817204301,
"grad_norm": 0.559217631816864,
"learning_rate": 1.0133830761206e-05,
"loss": 0.6647,
"num_input_tokens_seen": 3080424,
"step": 8175
},
{
"epoch": 7.329749103942652,
"grad_norm": 0.6456106305122375,
"learning_rate": 1.0102414765838156e-05,
"loss": 0.6888,
"num_input_tokens_seen": 3082472,
"step": 8180
},
{
"epoch": 7.334229390681004,
"grad_norm": 0.635503351688385,
"learning_rate": 1.0071035207430352e-05,
"loss": 0.6336,
"num_input_tokens_seen": 3084328,
"step": 8185
},
{
"epoch": 7.338709677419355,
"grad_norm": 0.519087553024292,
"learning_rate": 1.0039692162731637e-05,
"loss": 0.6786,
"num_input_tokens_seen": 3086120,
"step": 8190
},
{
"epoch": 7.343189964157706,
"grad_norm": 0.32819828391075134,
"learning_rate": 1.0008385708401802e-05,
"loss": 0.6819,
"num_input_tokens_seen": 3087976,
"step": 8195
},
{
"epoch": 7.347670250896058,
"grad_norm": 0.5701029896736145,
"learning_rate": 9.977115921011071e-06,
"loss": 0.6909,
"num_input_tokens_seen": 3089864,
"step": 8200
},
{
"epoch": 7.352150537634409,
"grad_norm": 0.5487130880355835,
"learning_rate": 9.945882877040053e-06,
"loss": 0.6849,
"num_input_tokens_seen": 3091688,
"step": 8205
},
{
"epoch": 7.356630824372759,
"grad_norm": 0.4447524845600128,
"learning_rate": 9.914686652879454e-06,
"loss": 0.7049,
"num_input_tokens_seen": 3093480,
"step": 8210
},
{
"epoch": 7.361111111111111,
"grad_norm": 0.7242264747619629,
"learning_rate": 9.883527324829925e-06,
"loss": 0.6908,
"num_input_tokens_seen": 3095368,
"step": 8215
},
{
"epoch": 7.365591397849462,
"grad_norm": 0.44289588928222656,
"learning_rate": 9.8524049691019e-06,
"loss": 0.6813,
"num_input_tokens_seen": 3097224,
"step": 8220
},
{
"epoch": 7.370071684587813,
"grad_norm": 0.49530482292175293,
"learning_rate": 9.821319661815359e-06,
"loss": 0.7155,
"num_input_tokens_seen": 3099016,
"step": 8225
},
{
"epoch": 7.374551971326165,
"grad_norm": 0.7250574827194214,
"learning_rate": 9.790271478999677e-06,
"loss": 0.689,
"num_input_tokens_seen": 3100904,
"step": 8230
},
{
"epoch": 7.379032258064516,
"grad_norm": 0.6142673492431641,
"learning_rate": 9.759260496593434e-06,
"loss": 0.6448,
"num_input_tokens_seen": 3102696,
"step": 8235
},
{
"epoch": 7.383512544802867,
"grad_norm": 0.4833540916442871,
"learning_rate": 9.728286790444206e-06,
"loss": 0.6913,
"num_input_tokens_seen": 3104488,
"step": 8240
},
{
"epoch": 7.387992831541219,
"grad_norm": 0.5866033434867859,
"learning_rate": 9.697350436308427e-06,
"loss": 0.7112,
"num_input_tokens_seen": 3106440,
"step": 8245
},
{
"epoch": 7.39247311827957,
"grad_norm": 0.5501033663749695,
"learning_rate": 9.666451509851158e-06,
"loss": 0.6574,
"num_input_tokens_seen": 3108264,
"step": 8250
},
{
"epoch": 7.396953405017921,
"grad_norm": 0.5121339559555054,
"learning_rate": 9.635590086645906e-06,
"loss": 0.7593,
"num_input_tokens_seen": 3110120,
"step": 8255
},
{
"epoch": 7.401433691756273,
"grad_norm": 0.5490121841430664,
"learning_rate": 9.604766242174474e-06,
"loss": 0.6737,
"num_input_tokens_seen": 3111912,
"step": 8260
},
{
"epoch": 7.405913978494624,
"grad_norm": 0.49912509322166443,
"learning_rate": 9.573980051826731e-06,
"loss": 0.6849,
"num_input_tokens_seen": 3113832,
"step": 8265
},
{
"epoch": 7.410394265232975,
"grad_norm": 0.4087314009666443,
"learning_rate": 9.54323159090048e-06,
"loss": 0.6895,
"num_input_tokens_seen": 3115624,
"step": 8270
},
{
"epoch": 7.414874551971327,
"grad_norm": 0.40049365162849426,
"learning_rate": 9.512520934601225e-06,
"loss": 0.6722,
"num_input_tokens_seen": 3117544,
"step": 8275
},
{
"epoch": 7.419354838709677,
"grad_norm": 0.490710586309433,
"learning_rate": 9.481848158041998e-06,
"loss": 0.6829,
"num_input_tokens_seen": 3119464,
"step": 8280
},
{
"epoch": 7.423835125448028,
"grad_norm": 0.6339983940124512,
"learning_rate": 9.4512133362432e-06,
"loss": 0.6974,
"num_input_tokens_seen": 3121224,
"step": 8285
},
{
"epoch": 7.42831541218638,
"grad_norm": 0.49414268136024475,
"learning_rate": 9.4206165441324e-06,
"loss": 0.7126,
"num_input_tokens_seen": 3123080,
"step": 8290
},
{
"epoch": 7.432795698924731,
"grad_norm": 0.43187791109085083,
"learning_rate": 9.390057856544129e-06,
"loss": 0.7088,
"num_input_tokens_seen": 3125000,
"step": 8295
},
{
"epoch": 7.437275985663082,
"grad_norm": 0.5625995993614197,
"learning_rate": 9.359537348219768e-06,
"loss": 0.6566,
"num_input_tokens_seen": 3127080,
"step": 8300
},
{
"epoch": 7.441756272401434,
"grad_norm": 0.45806175470352173,
"learning_rate": 9.329055093807268e-06,
"loss": 0.6758,
"num_input_tokens_seen": 3129032,
"step": 8305
},
{
"epoch": 7.446236559139785,
"grad_norm": 0.5392464995384216,
"learning_rate": 9.298611167861062e-06,
"loss": 0.7007,
"num_input_tokens_seen": 3130792,
"step": 8310
},
{
"epoch": 7.450716845878136,
"grad_norm": 0.7194018363952637,
"learning_rate": 9.2682056448418e-06,
"loss": 0.7235,
"num_input_tokens_seen": 3132776,
"step": 8315
},
{
"epoch": 7.455197132616488,
"grad_norm": 0.3086296021938324,
"learning_rate": 9.237838599116208e-06,
"loss": 0.6978,
"num_input_tokens_seen": 3134728,
"step": 8320
},
{
"epoch": 7.459677419354839,
"grad_norm": 0.6527276635169983,
"learning_rate": 9.207510104956944e-06,
"loss": 0.7155,
"num_input_tokens_seen": 3136616,
"step": 8325
},
{
"epoch": 7.46415770609319,
"grad_norm": 0.6955182552337646,
"learning_rate": 9.17722023654233e-06,
"loss": 0.7086,
"num_input_tokens_seen": 3138632,
"step": 8330
},
{
"epoch": 7.468637992831542,
"grad_norm": 0.5681894421577454,
"learning_rate": 9.146969067956238e-06,
"loss": 0.6894,
"num_input_tokens_seen": 3140456,
"step": 8335
},
{
"epoch": 7.473118279569892,
"grad_norm": 0.640601634979248,
"learning_rate": 9.116756673187878e-06,
"loss": 0.6617,
"num_input_tokens_seen": 3142312,
"step": 8340
},
{
"epoch": 7.477598566308243,
"grad_norm": 0.34400156140327454,
"learning_rate": 9.08658312613163e-06,
"loss": 0.6795,
"num_input_tokens_seen": 3144360,
"step": 8345
},
{
"epoch": 7.482078853046595,
"grad_norm": 0.6456219553947449,
"learning_rate": 9.056448500586865e-06,
"loss": 0.7127,
"num_input_tokens_seen": 3146152,
"step": 8350
},
{
"epoch": 7.486559139784946,
"grad_norm": 0.5124385952949524,
"learning_rate": 9.026352870257748e-06,
"loss": 0.6962,
"num_input_tokens_seen": 3148040,
"step": 8355
},
{
"epoch": 7.491039426523297,
"grad_norm": 0.6023479700088501,
"learning_rate": 8.996296308753069e-06,
"loss": 0.6879,
"num_input_tokens_seen": 3149864,
"step": 8360
},
{
"epoch": 7.495519713261649,
"grad_norm": 0.4037036597728729,
"learning_rate": 8.966278889586086e-06,
"loss": 0.6976,
"num_input_tokens_seen": 3151720,
"step": 8365
},
{
"epoch": 7.5,
"grad_norm": 0.7624744772911072,
"learning_rate": 8.936300686174268e-06,
"loss": 0.7108,
"num_input_tokens_seen": 3153640,
"step": 8370
},
{
"epoch": 7.5,
"eval_loss": 0.7001423835754395,
"eval_runtime": 5.6376,
"eval_samples_per_second": 87.98,
"eval_steps_per_second": 21.995,
"num_input_tokens_seen": 3153640,
"step": 8370
},
{
"epoch": 7.504480286738351,
"grad_norm": 0.4228692650794983,
"learning_rate": 8.906361771839227e-06,
"loss": 0.6946,
"num_input_tokens_seen": 3155496,
"step": 8375
},
{
"epoch": 7.508960573476703,
"grad_norm": 0.7325205206871033,
"learning_rate": 8.876462219806456e-06,
"loss": 0.6861,
"num_input_tokens_seen": 3157448,
"step": 8380
},
{
"epoch": 7.513440860215054,
"grad_norm": 0.6230577230453491,
"learning_rate": 8.846602103205157e-06,
"loss": 0.6706,
"num_input_tokens_seen": 3159496,
"step": 8385
},
{
"epoch": 7.517921146953405,
"grad_norm": 0.6926305294036865,
"learning_rate": 8.816781495068125e-06,
"loss": 0.6744,
"num_input_tokens_seen": 3161320,
"step": 8390
},
{
"epoch": 7.522401433691757,
"grad_norm": 0.606583297252655,
"learning_rate": 8.787000468331463e-06,
"loss": 0.7136,
"num_input_tokens_seen": 3163144,
"step": 8395
},
{
"epoch": 7.526881720430108,
"grad_norm": 0.5779367089271545,
"learning_rate": 8.757259095834525e-06,
"loss": 0.7018,
"num_input_tokens_seen": 3164904,
"step": 8400
},
{
"epoch": 7.531362007168459,
"grad_norm": 0.588382363319397,
"learning_rate": 8.72755745031964e-06,
"loss": 0.6922,
"num_input_tokens_seen": 3166696,
"step": 8405
},
{
"epoch": 7.53584229390681,
"grad_norm": 0.4489257335662842,
"learning_rate": 8.697895604431974e-06,
"loss": 0.7202,
"num_input_tokens_seen": 3168456,
"step": 8410
},
{
"epoch": 7.540322580645161,
"grad_norm": 0.5579271912574768,
"learning_rate": 8.668273630719373e-06,
"loss": 0.7056,
"num_input_tokens_seen": 3170344,
"step": 8415
},
{
"epoch": 7.544802867383512,
"grad_norm": 0.6759196519851685,
"learning_rate": 8.638691601632152e-06,
"loss": 0.684,
"num_input_tokens_seen": 3172232,
"step": 8420
},
{
"epoch": 7.549283154121864,
"grad_norm": 0.45993009209632874,
"learning_rate": 8.609149589522894e-06,
"loss": 0.6925,
"num_input_tokens_seen": 3174056,
"step": 8425
},
{
"epoch": 7.553763440860215,
"grad_norm": 0.5422730445861816,
"learning_rate": 8.579647666646361e-06,
"loss": 0.6939,
"num_input_tokens_seen": 3175944,
"step": 8430
},
{
"epoch": 7.558243727598566,
"grad_norm": 0.48670727014541626,
"learning_rate": 8.550185905159227e-06,
"loss": 0.6682,
"num_input_tokens_seen": 3177896,
"step": 8435
},
{
"epoch": 7.562724014336918,
"grad_norm": 0.6719427108764648,
"learning_rate": 8.520764377119964e-06,
"loss": 0.7217,
"num_input_tokens_seen": 3179912,
"step": 8440
},
{
"epoch": 7.567204301075269,
"grad_norm": 0.5485569834709167,
"learning_rate": 8.491383154488628e-06,
"loss": 0.7005,
"num_input_tokens_seen": 3181736,
"step": 8445
},
{
"epoch": 7.57168458781362,
"grad_norm": 0.47630730271339417,
"learning_rate": 8.462042309126664e-06,
"loss": 0.707,
"num_input_tokens_seen": 3183592,
"step": 8450
},
{
"epoch": 7.576164874551972,
"grad_norm": 0.6303143501281738,
"learning_rate": 8.432741912796821e-06,
"loss": 0.6824,
"num_input_tokens_seen": 3185448,
"step": 8455
},
{
"epoch": 7.580645161290323,
"grad_norm": 0.5015650391578674,
"learning_rate": 8.403482037162873e-06,
"loss": 0.688,
"num_input_tokens_seen": 3187368,
"step": 8460
},
{
"epoch": 7.585125448028673,
"grad_norm": 0.690214216709137,
"learning_rate": 8.374262753789493e-06,
"loss": 0.6917,
"num_input_tokens_seen": 3189192,
"step": 8465
},
{
"epoch": 7.589605734767025,
"grad_norm": 0.47096434235572815,
"learning_rate": 8.345084134142098e-06,
"loss": 0.6957,
"num_input_tokens_seen": 3191112,
"step": 8470
},
{
"epoch": 7.594086021505376,
"grad_norm": 0.5318554639816284,
"learning_rate": 8.31594624958662e-06,
"loss": 0.7068,
"num_input_tokens_seen": 3192808,
"step": 8475
},
{
"epoch": 7.598566308243727,
"grad_norm": 0.808319628238678,
"learning_rate": 8.286849171389366e-06,
"loss": 0.6443,
"num_input_tokens_seen": 3194632,
"step": 8480
},
{
"epoch": 7.603046594982079,
"grad_norm": 0.44692009687423706,
"learning_rate": 8.257792970716846e-06,
"loss": 0.7021,
"num_input_tokens_seen": 3196488,
"step": 8485
},
{
"epoch": 7.60752688172043,
"grad_norm": 0.5783076882362366,
"learning_rate": 8.228777718635575e-06,
"loss": 0.6777,
"num_input_tokens_seen": 3198408,
"step": 8490
},
{
"epoch": 7.612007168458781,
"grad_norm": 0.5471696257591248,
"learning_rate": 8.19980348611194e-06,
"loss": 0.6884,
"num_input_tokens_seen": 3200264,
"step": 8495
},
{
"epoch": 7.616487455197133,
"grad_norm": 0.5129026174545288,
"learning_rate": 8.170870344011982e-06,
"loss": 0.7057,
"num_input_tokens_seen": 3202120,
"step": 8500
},
{
"epoch": 7.620967741935484,
"grad_norm": 0.5552482008934021,
"learning_rate": 8.141978363101243e-06,
"loss": 0.7117,
"num_input_tokens_seen": 3203976,
"step": 8505
},
{
"epoch": 7.625448028673835,
"grad_norm": 0.7430400252342224,
"learning_rate": 8.1131276140446e-06,
"loss": 0.7236,
"num_input_tokens_seen": 3205832,
"step": 8510
},
{
"epoch": 7.629928315412187,
"grad_norm": 0.5252755880355835,
"learning_rate": 8.084318167406063e-06,
"loss": 0.699,
"num_input_tokens_seen": 3207816,
"step": 8515
},
{
"epoch": 7.634408602150538,
"grad_norm": 0.5251414775848389,
"learning_rate": 8.055550093648665e-06,
"loss": 0.7018,
"num_input_tokens_seen": 3209768,
"step": 8520
},
{
"epoch": 7.638888888888889,
"grad_norm": 0.3986772298812866,
"learning_rate": 8.026823463134206e-06,
"loss": 0.6745,
"num_input_tokens_seen": 3211464,
"step": 8525
},
{
"epoch": 7.643369175627241,
"grad_norm": 0.4646606743335724,
"learning_rate": 7.99813834612314e-06,
"loss": 0.6786,
"num_input_tokens_seen": 3213320,
"step": 8530
},
{
"epoch": 7.647849462365591,
"grad_norm": 0.5674442648887634,
"learning_rate": 7.969494812774392e-06,
"loss": 0.6596,
"num_input_tokens_seen": 3215272,
"step": 8535
},
{
"epoch": 7.652329749103942,
"grad_norm": 0.42505696415901184,
"learning_rate": 7.940892933145156e-06,
"loss": 0.667,
"num_input_tokens_seen": 3217256,
"step": 8540
},
{
"epoch": 7.656810035842294,
"grad_norm": 0.5792336463928223,
"learning_rate": 7.91233277719079e-06,
"loss": 0.6752,
"num_input_tokens_seen": 3219016,
"step": 8545
},
{
"epoch": 7.661290322580645,
"grad_norm": 0.5960796475410461,
"learning_rate": 7.883814414764566e-06,
"loss": 0.7255,
"num_input_tokens_seen": 3220680,
"step": 8550
},
{
"epoch": 7.665770609318996,
"grad_norm": 0.8227483034133911,
"learning_rate": 7.855337915617548e-06,
"loss": 0.6573,
"num_input_tokens_seen": 3222344,
"step": 8555
},
{
"epoch": 7.670250896057348,
"grad_norm": 0.6761976480484009,
"learning_rate": 7.82690334939841e-06,
"loss": 0.6926,
"num_input_tokens_seen": 3224168,
"step": 8560
},
{
"epoch": 7.674731182795699,
"grad_norm": 0.4717724025249481,
"learning_rate": 7.798510785653263e-06,
"loss": 0.6832,
"num_input_tokens_seen": 3225992,
"step": 8565
},
{
"epoch": 7.67921146953405,
"grad_norm": 0.4982898533344269,
"learning_rate": 7.770160293825498e-06,
"loss": 0.6951,
"num_input_tokens_seen": 3227912,
"step": 8570
},
{
"epoch": 7.683691756272402,
"grad_norm": 0.5173410177230835,
"learning_rate": 7.741851943255596e-06,
"loss": 0.7161,
"num_input_tokens_seen": 3229736,
"step": 8575
},
{
"epoch": 7.688172043010753,
"grad_norm": 0.9301579594612122,
"learning_rate": 7.713585803180956e-06,
"loss": 0.6635,
"num_input_tokens_seen": 3231720,
"step": 8580
},
{
"epoch": 7.692652329749104,
"grad_norm": 0.5519044995307922,
"learning_rate": 7.685361942735777e-06,
"loss": 0.7005,
"num_input_tokens_seen": 3233640,
"step": 8585
},
{
"epoch": 7.697132616487455,
"grad_norm": 0.34752702713012695,
"learning_rate": 7.657180430950794e-06,
"loss": 0.7004,
"num_input_tokens_seen": 3235400,
"step": 8590
},
{
"epoch": 7.701612903225806,
"grad_norm": 0.5915330648422241,
"learning_rate": 7.629041336753193e-06,
"loss": 0.7062,
"num_input_tokens_seen": 3237384,
"step": 8595
},
{
"epoch": 7.706093189964157,
"grad_norm": 0.6914758682250977,
"learning_rate": 7.600944728966433e-06,
"loss": 0.6589,
"num_input_tokens_seen": 3239496,
"step": 8600
},
{
"epoch": 7.710573476702509,
"grad_norm": 0.3407343029975891,
"learning_rate": 7.572890676310026e-06,
"loss": 0.6834,
"num_input_tokens_seen": 3241128,
"step": 8605
},
{
"epoch": 7.71505376344086,
"grad_norm": 0.7424318194389343,
"learning_rate": 7.544879247399417e-06,
"loss": 0.7125,
"num_input_tokens_seen": 3242920,
"step": 8610
},
{
"epoch": 7.719534050179211,
"grad_norm": 0.900806725025177,
"learning_rate": 7.516910510745795e-06,
"loss": 0.6943,
"num_input_tokens_seen": 3244680,
"step": 8615
},
{
"epoch": 7.724014336917563,
"grad_norm": 0.6076695322990417,
"learning_rate": 7.48898453475593e-06,
"loss": 0.6976,
"num_input_tokens_seen": 3246728,
"step": 8620
},
{
"epoch": 7.728494623655914,
"grad_norm": 0.4813416600227356,
"learning_rate": 7.46110138773202e-06,
"loss": 0.6916,
"num_input_tokens_seen": 3248712,
"step": 8625
},
{
"epoch": 7.732974910394265,
"grad_norm": 0.4648270905017853,
"learning_rate": 7.433261137871497e-06,
"loss": 0.6958,
"num_input_tokens_seen": 3250568,
"step": 8630
},
{
"epoch": 7.737455197132617,
"grad_norm": 0.49301114678382874,
"learning_rate": 7.405463853266869e-06,
"loss": 0.6908,
"num_input_tokens_seen": 3252328,
"step": 8635
},
{
"epoch": 7.741935483870968,
"grad_norm": 0.7146629691123962,
"learning_rate": 7.377709601905594e-06,
"loss": 0.6818,
"num_input_tokens_seen": 3254248,
"step": 8640
},
{
"epoch": 7.746415770609319,
"grad_norm": 0.49218234419822693,
"learning_rate": 7.349998451669812e-06,
"loss": 0.6951,
"num_input_tokens_seen": 3256040,
"step": 8645
},
{
"epoch": 7.750896057347671,
"grad_norm": 0.43367111682891846,
"learning_rate": 7.3223304703363135e-06,
"loss": 0.7164,
"num_input_tokens_seen": 3257960,
"step": 8650
},
{
"epoch": 7.755376344086022,
"grad_norm": 0.43355652689933777,
"learning_rate": 7.294705725576267e-06,
"loss": 0.6817,
"num_input_tokens_seen": 3259880,
"step": 8655
},
{
"epoch": 7.759856630824372,
"grad_norm": 0.42015159130096436,
"learning_rate": 7.2671242849550905e-06,
"loss": 0.685,
"num_input_tokens_seen": 3261960,
"step": 8660
},
{
"epoch": 7.764336917562724,
"grad_norm": 0.5154469013214111,
"learning_rate": 7.239586215932323e-06,
"loss": 0.7112,
"num_input_tokens_seen": 3263784,
"step": 8665
},
{
"epoch": 7.768817204301075,
"grad_norm": 0.4720630347728729,
"learning_rate": 7.212091585861363e-06,
"loss": 0.6855,
"num_input_tokens_seen": 3265640,
"step": 8670
},
{
"epoch": 7.773297491039426,
"grad_norm": 0.4294196367263794,
"learning_rate": 7.184640461989431e-06,
"loss": 0.6866,
"num_input_tokens_seen": 3267368,
"step": 8675
},
{
"epoch": 7.777777777777778,
"grad_norm": 0.6301731467247009,
"learning_rate": 7.157232911457293e-06,
"loss": 0.709,
"num_input_tokens_seen": 3269096,
"step": 8680
},
{
"epoch": 7.782258064516129,
"grad_norm": 0.638538122177124,
"learning_rate": 7.12986900129915e-06,
"loss": 0.6918,
"num_input_tokens_seen": 3270920,
"step": 8685
},
{
"epoch": 7.78673835125448,
"grad_norm": 0.5257901549339294,
"learning_rate": 7.10254879844249e-06,
"loss": 0.6933,
"num_input_tokens_seen": 3272840,
"step": 8690
},
{
"epoch": 7.791218637992832,
"grad_norm": 0.6461403965950012,
"learning_rate": 7.075272369707878e-06,
"loss": 0.6791,
"num_input_tokens_seen": 3274824,
"step": 8695
},
{
"epoch": 7.795698924731183,
"grad_norm": 0.6005759835243225,
"learning_rate": 7.048039781808816e-06,
"loss": 0.6913,
"num_input_tokens_seen": 3276808,
"step": 8700
},
{
"epoch": 7.800179211469534,
"grad_norm": 0.5737102627754211,
"learning_rate": 7.020851101351583e-06,
"loss": 0.647,
"num_input_tokens_seen": 3279144,
"step": 8705
},
{
"epoch": 7.804659498207886,
"grad_norm": 0.6555394530296326,
"learning_rate": 6.993706394835062e-06,
"loss": 0.6987,
"num_input_tokens_seen": 3281128,
"step": 8710
},
{
"epoch": 7.809139784946236,
"grad_norm": 0.6129330396652222,
"learning_rate": 6.966605728650602e-06,
"loss": 0.7193,
"num_input_tokens_seen": 3282952,
"step": 8715
},
{
"epoch": 7.813620071684587,
"grad_norm": 0.4754336476325989,
"learning_rate": 6.939549169081827e-06,
"loss": 0.6926,
"num_input_tokens_seen": 3284904,
"step": 8720
},
{
"epoch": 7.818100358422939,
"grad_norm": 0.6485406756401062,
"learning_rate": 6.912536782304454e-06,
"loss": 0.6967,
"num_input_tokens_seen": 3286760,
"step": 8725
},
{
"epoch": 7.82258064516129,
"grad_norm": 0.5529478788375854,
"learning_rate": 6.885568634386217e-06,
"loss": 0.68,
"num_input_tokens_seen": 3288584,
"step": 8730
},
{
"epoch": 7.827060931899641,
"grad_norm": 0.4761482775211334,
"learning_rate": 6.858644791286603e-06,
"loss": 0.6578,
"num_input_tokens_seen": 3290632,
"step": 8735
},
{
"epoch": 7.831541218637993,
"grad_norm": 0.6546998023986816,
"learning_rate": 6.83176531885675e-06,
"loss": 0.7015,
"num_input_tokens_seen": 3292488,
"step": 8740
},
{
"epoch": 7.836021505376344,
"grad_norm": 0.4401530921459198,
"learning_rate": 6.804930282839295e-06,
"loss": 0.7002,
"num_input_tokens_seen": 3294376,
"step": 8745
},
{
"epoch": 7.840501792114695,
"grad_norm": 0.6239868402481079,
"learning_rate": 6.778139748868159e-06,
"loss": 0.6898,
"num_input_tokens_seen": 3296360,
"step": 8750
},
{
"epoch": 7.844982078853047,
"grad_norm": 0.6737553477287292,
"learning_rate": 6.751393782468438e-06,
"loss": 0.6658,
"num_input_tokens_seen": 3298152,
"step": 8755
},
{
"epoch": 7.849462365591398,
"grad_norm": 0.5571895837783813,
"learning_rate": 6.7246924490562135e-06,
"loss": 0.6665,
"num_input_tokens_seen": 3300104,
"step": 8760
},
{
"epoch": 7.853942652329749,
"grad_norm": 0.4849938750267029,
"learning_rate": 6.6980358139384e-06,
"loss": 0.6841,
"num_input_tokens_seen": 3301928,
"step": 8765
},
{
"epoch": 7.858422939068101,
"grad_norm": 0.4987306296825409,
"learning_rate": 6.671423942312608e-06,
"loss": 0.7039,
"num_input_tokens_seen": 3303816,
"step": 8770
},
{
"epoch": 7.862903225806452,
"grad_norm": 0.45879921317100525,
"learning_rate": 6.6448568992669434e-06,
"loss": 0.6825,
"num_input_tokens_seen": 3305704,
"step": 8775
},
{
"epoch": 7.867383512544803,
"grad_norm": 0.5061188340187073,
"learning_rate": 6.6183347497798755e-06,
"loss": 0.7011,
"num_input_tokens_seen": 3307656,
"step": 8780
},
{
"epoch": 7.871863799283155,
"grad_norm": 0.4768591523170471,
"learning_rate": 6.591857558720071e-06,
"loss": 0.6836,
"num_input_tokens_seen": 3309608,
"step": 8785
},
{
"epoch": 7.876344086021505,
"grad_norm": 0.3989751636981964,
"learning_rate": 6.565425390846233e-06,
"loss": 0.7049,
"num_input_tokens_seen": 3311368,
"step": 8790
},
{
"epoch": 7.880824372759856,
"grad_norm": 0.8028482794761658,
"learning_rate": 6.539038310806958e-06,
"loss": 0.7282,
"num_input_tokens_seen": 3313352,
"step": 8795
},
{
"epoch": 7.885304659498208,
"grad_norm": 0.45631352066993713,
"learning_rate": 6.512696383140551e-06,
"loss": 0.6603,
"num_input_tokens_seen": 3315240,
"step": 8800
},
{
"epoch": 7.889784946236559,
"grad_norm": 0.5273287296295166,
"learning_rate": 6.48639967227489e-06,
"loss": 0.7098,
"num_input_tokens_seen": 3317032,
"step": 8805
},
{
"epoch": 7.89426523297491,
"grad_norm": 0.5012726783752441,
"learning_rate": 6.460148242527253e-06,
"loss": 0.6982,
"num_input_tokens_seen": 3319048,
"step": 8810
},
{
"epoch": 7.898745519713262,
"grad_norm": 0.9831424951553345,
"learning_rate": 6.4339421581041725e-06,
"loss": 0.699,
"num_input_tokens_seen": 3320936,
"step": 8815
},
{
"epoch": 7.903225806451613,
"grad_norm": 0.6299842596054077,
"learning_rate": 6.407781483101283e-06,
"loss": 0.6759,
"num_input_tokens_seen": 3322760,
"step": 8820
},
{
"epoch": 7.907706093189964,
"grad_norm": 0.5512109994888306,
"learning_rate": 6.38166628150314e-06,
"loss": 0.6783,
"num_input_tokens_seen": 3324584,
"step": 8825
},
{
"epoch": 7.912186379928316,
"grad_norm": 0.6959258317947388,
"learning_rate": 6.355596617183091e-06,
"loss": 0.7322,
"num_input_tokens_seen": 3326600,
"step": 8830
},
{
"epoch": 7.916666666666667,
"grad_norm": 0.47916871309280396,
"learning_rate": 6.329572553903096e-06,
"loss": 0.705,
"num_input_tokens_seen": 3328456,
"step": 8835
},
{
"epoch": 7.921146953405018,
"grad_norm": 0.7595999240875244,
"learning_rate": 6.303594155313583e-06,
"loss": 0.6839,
"num_input_tokens_seen": 3330472,
"step": 8840
},
{
"epoch": 7.925627240143369,
"grad_norm": 0.48006659746170044,
"learning_rate": 6.277661484953309e-06,
"loss": 0.6823,
"num_input_tokens_seen": 3332488,
"step": 8845
},
{
"epoch": 7.93010752688172,
"grad_norm": 0.6921666860580444,
"learning_rate": 6.251774606249172e-06,
"loss": 0.6787,
"num_input_tokens_seen": 3334376,
"step": 8850
},
{
"epoch": 7.934587813620071,
"grad_norm": 0.6116976141929626,
"learning_rate": 6.225933582516069e-06,
"loss": 0.7175,
"num_input_tokens_seen": 3336264,
"step": 8855
},
{
"epoch": 7.939068100358423,
"grad_norm": 0.4193064570426941,
"learning_rate": 6.200138476956766e-06,
"loss": 0.6833,
"num_input_tokens_seen": 3338024,
"step": 8860
},
{
"epoch": 7.943548387096774,
"grad_norm": 0.48867300152778625,
"learning_rate": 6.174389352661686e-06,
"loss": 0.6892,
"num_input_tokens_seen": 3340008,
"step": 8865
},
{
"epoch": 7.948028673835125,
"grad_norm": 0.7177107930183411,
"learning_rate": 6.148686272608809e-06,
"loss": 0.7311,
"num_input_tokens_seen": 3341864,
"step": 8870
},
{
"epoch": 7.952508960573477,
"grad_norm": 0.5652689933776855,
"learning_rate": 6.12302929966351e-06,
"loss": 0.6862,
"num_input_tokens_seen": 3343752,
"step": 8875
},
{
"epoch": 7.956989247311828,
"grad_norm": 0.4583767354488373,
"learning_rate": 6.097418496578369e-06,
"loss": 0.6801,
"num_input_tokens_seen": 3345672,
"step": 8880
},
{
"epoch": 7.961469534050179,
"grad_norm": 0.38024917244911194,
"learning_rate": 6.0718539259930766e-06,
"loss": 0.6886,
"num_input_tokens_seen": 3347624,
"step": 8885
},
{
"epoch": 7.965949820788531,
"grad_norm": 0.7006999850273132,
"learning_rate": 6.046335650434201e-06,
"loss": 0.7096,
"num_input_tokens_seen": 3349480,
"step": 8890
},
{
"epoch": 7.970430107526882,
"grad_norm": 0.6173056364059448,
"learning_rate": 6.020863732315108e-06,
"loss": 0.6821,
"num_input_tokens_seen": 3351400,
"step": 8895
},
{
"epoch": 7.974910394265233,
"grad_norm": 0.5322303771972656,
"learning_rate": 5.9954382339357905e-06,
"loss": 0.6849,
"num_input_tokens_seen": 3353352,
"step": 8900
},
{
"epoch": 7.979390681003585,
"grad_norm": 0.5544753670692444,
"learning_rate": 5.970059217482685e-06,
"loss": 0.6994,
"num_input_tokens_seen": 3355176,
"step": 8905
},
{
"epoch": 7.983870967741936,
"grad_norm": 0.5962495803833008,
"learning_rate": 5.944726745028545e-06,
"loss": 0.6769,
"num_input_tokens_seen": 3357224,
"step": 8910
},
{
"epoch": 7.988351254480286,
"grad_norm": 0.7063138484954834,
"learning_rate": 5.919440878532312e-06,
"loss": 0.6867,
"num_input_tokens_seen": 3358984,
"step": 8915
},
{
"epoch": 7.992831541218638,
"grad_norm": 0.6204646825790405,
"learning_rate": 5.894201679838885e-06,
"loss": 0.6851,
"num_input_tokens_seen": 3361032,
"step": 8920
},
{
"epoch": 7.997311827956989,
"grad_norm": 0.5719345808029175,
"learning_rate": 5.869009210679074e-06,
"loss": 0.7097,
"num_input_tokens_seen": 3363048,
"step": 8925
},
{
"epoch": 8.0,
"eval_loss": 0.6994269490242004,
"eval_runtime": 5.6261,
"eval_samples_per_second": 88.16,
"eval_steps_per_second": 22.04,
"num_input_tokens_seen": 3363864,
"step": 8928
},
{
"epoch": 8.001792114695341,
"grad_norm": 0.42449355125427246,
"learning_rate": 5.8438635326693664e-06,
"loss": 0.6755,
"num_input_tokens_seen": 3364600,
"step": 8930
},
{
"epoch": 8.006272401433693,
"grad_norm": 0.5668836236000061,
"learning_rate": 5.818764707311811e-06,
"loss": 0.6986,
"num_input_tokens_seen": 3366360,
"step": 8935
},
{
"epoch": 8.010752688172044,
"grad_norm": 0.6535866856575012,
"learning_rate": 5.7937127959938806e-06,
"loss": 0.7205,
"num_input_tokens_seen": 3368312,
"step": 8940
},
{
"epoch": 8.015232974910393,
"grad_norm": 0.507878303527832,
"learning_rate": 5.768707859988267e-06,
"loss": 0.6678,
"num_input_tokens_seen": 3370200,
"step": 8945
},
{
"epoch": 8.019713261648745,
"grad_norm": 0.6113376021385193,
"learning_rate": 5.7437499604528125e-06,
"loss": 0.7054,
"num_input_tokens_seen": 3372056,
"step": 8950
},
{
"epoch": 8.024193548387096,
"grad_norm": 0.6190397143363953,
"learning_rate": 5.7188391584302895e-06,
"loss": 0.6893,
"num_input_tokens_seen": 3373976,
"step": 8955
},
{
"epoch": 8.028673835125447,
"grad_norm": 0.4994731545448303,
"learning_rate": 5.693975514848271e-06,
"loss": 0.6849,
"num_input_tokens_seen": 3375960,
"step": 8960
},
{
"epoch": 8.033154121863799,
"grad_norm": 0.703480064868927,
"learning_rate": 5.669159090519019e-06,
"loss": 0.7006,
"num_input_tokens_seen": 3377880,
"step": 8965
},
{
"epoch": 8.03763440860215,
"grad_norm": 0.559497594833374,
"learning_rate": 5.644389946139278e-06,
"loss": 0.6633,
"num_input_tokens_seen": 3379768,
"step": 8970
},
{
"epoch": 8.042114695340501,
"grad_norm": 0.5711075663566589,
"learning_rate": 5.6196681422901634e-06,
"loss": 0.6687,
"num_input_tokens_seen": 3381560,
"step": 8975
},
{
"epoch": 8.046594982078853,
"grad_norm": 0.6641356348991394,
"learning_rate": 5.594993739437007e-06,
"loss": 0.6936,
"num_input_tokens_seen": 3383544,
"step": 8980
},
{
"epoch": 8.051075268817204,
"grad_norm": 0.7046183347702026,
"learning_rate": 5.5703667979291915e-06,
"loss": 0.6733,
"num_input_tokens_seen": 3385272,
"step": 8985
},
{
"epoch": 8.055555555555555,
"grad_norm": 0.9060787558555603,
"learning_rate": 5.545787378000039e-06,
"loss": 0.6793,
"num_input_tokens_seen": 3387256,
"step": 8990
},
{
"epoch": 8.060035842293907,
"grad_norm": 0.48104485869407654,
"learning_rate": 5.521255539766637e-06,
"loss": 0.6987,
"num_input_tokens_seen": 3389144,
"step": 8995
},
{
"epoch": 8.064516129032258,
"grad_norm": 0.5199661254882812,
"learning_rate": 5.4967713432296674e-06,
"loss": 0.6867,
"num_input_tokens_seen": 3390904,
"step": 9000
},
{
"epoch": 8.06899641577061,
"grad_norm": 0.4922904074192047,
"learning_rate": 5.472334848273328e-06,
"loss": 0.6865,
"num_input_tokens_seen": 3392792,
"step": 9005
},
{
"epoch": 8.07347670250896,
"grad_norm": 0.44281700253486633,
"learning_rate": 5.44794611466512e-06,
"loss": 0.686,
"num_input_tokens_seen": 3394744,
"step": 9010
},
{
"epoch": 8.077956989247312,
"grad_norm": 0.4282800257205963,
"learning_rate": 5.4236052020557535e-06,
"loss": 0.6553,
"num_input_tokens_seen": 3396632,
"step": 9015
},
{
"epoch": 8.082437275985663,
"grad_norm": 0.7372511625289917,
"learning_rate": 5.399312169978949e-06,
"loss": 0.6899,
"num_input_tokens_seen": 3398424,
"step": 9020
},
{
"epoch": 8.086917562724015,
"grad_norm": 0.48595067858695984,
"learning_rate": 5.375067077851337e-06,
"loss": 0.6586,
"num_input_tokens_seen": 3400312,
"step": 9025
},
{
"epoch": 8.091397849462366,
"grad_norm": 0.5720658898353577,
"learning_rate": 5.350869984972287e-06,
"loss": 0.7233,
"num_input_tokens_seen": 3402200,
"step": 9030
},
{
"epoch": 8.095878136200717,
"grad_norm": 0.38736122846603394,
"learning_rate": 5.326720950523772e-06,
"loss": 0.6877,
"num_input_tokens_seen": 3404152,
"step": 9035
},
{
"epoch": 8.100358422939069,
"grad_norm": 0.5513134002685547,
"learning_rate": 5.302620033570222e-06,
"loss": 0.7067,
"num_input_tokens_seen": 3405912,
"step": 9040
},
{
"epoch": 8.10483870967742,
"grad_norm": 0.5928265452384949,
"learning_rate": 5.27856729305839e-06,
"loss": 0.713,
"num_input_tokens_seen": 3407672,
"step": 9045
},
{
"epoch": 8.109318996415771,
"grad_norm": 0.838718056678772,
"learning_rate": 5.254562787817183e-06,
"loss": 0.6896,
"num_input_tokens_seen": 3409496,
"step": 9050
},
{
"epoch": 8.113799283154123,
"grad_norm": 0.5603805184364319,
"learning_rate": 5.23060657655754e-06,
"loss": 0.6567,
"num_input_tokens_seen": 3411352,
"step": 9055
},
{
"epoch": 8.118279569892474,
"grad_norm": 0.5363320112228394,
"learning_rate": 5.206698717872277e-06,
"loss": 0.6803,
"num_input_tokens_seen": 3413432,
"step": 9060
},
{
"epoch": 8.122759856630825,
"grad_norm": 0.5360172390937805,
"learning_rate": 5.1828392702359504e-06,
"loss": 0.7181,
"num_input_tokens_seen": 3415320,
"step": 9065
},
{
"epoch": 8.127240143369175,
"grad_norm": 0.5474600791931152,
"learning_rate": 5.159028292004717e-06,
"loss": 0.6764,
"num_input_tokens_seen": 3417240,
"step": 9070
},
{
"epoch": 8.131720430107526,
"grad_norm": 0.5112743377685547,
"learning_rate": 5.1352658414161785e-06,
"loss": 0.6878,
"num_input_tokens_seen": 3419192,
"step": 9075
},
{
"epoch": 8.136200716845877,
"grad_norm": 0.784028172492981,
"learning_rate": 5.111551976589249e-06,
"loss": 0.6983,
"num_input_tokens_seen": 3421208,
"step": 9080
},
{
"epoch": 8.140681003584229,
"grad_norm": 0.6619753837585449,
"learning_rate": 5.087886755524005e-06,
"loss": 0.695,
"num_input_tokens_seen": 3423064,
"step": 9085
},
{
"epoch": 8.14516129032258,
"grad_norm": 0.4583960771560669,
"learning_rate": 5.064270236101548e-06,
"loss": 0.7061,
"num_input_tokens_seen": 3424984,
"step": 9090
},
{
"epoch": 8.149641577060931,
"grad_norm": 0.6008018851280212,
"learning_rate": 5.040702476083883e-06,
"loss": 0.6968,
"num_input_tokens_seen": 3426936,
"step": 9095
},
{
"epoch": 8.154121863799283,
"grad_norm": 0.4817352592945099,
"learning_rate": 5.0171835331137365e-06,
"loss": 0.691,
"num_input_tokens_seen": 3428696,
"step": 9100
},
{
"epoch": 8.158602150537634,
"grad_norm": 0.5647047162055969,
"learning_rate": 4.993713464714433e-06,
"loss": 0.6788,
"num_input_tokens_seen": 3430744,
"step": 9105
},
{
"epoch": 8.163082437275985,
"grad_norm": 0.503331184387207,
"learning_rate": 4.970292328289794e-06,
"loss": 0.6684,
"num_input_tokens_seen": 3432696,
"step": 9110
},
{
"epoch": 8.167562724014337,
"grad_norm": 0.39130842685699463,
"learning_rate": 4.946920181123904e-06,
"loss": 0.6576,
"num_input_tokens_seen": 3434424,
"step": 9115
},
{
"epoch": 8.172043010752688,
"grad_norm": 0.6829502582550049,
"learning_rate": 4.9235970803810845e-06,
"loss": 0.7095,
"num_input_tokens_seen": 3436312,
"step": 9120
},
{
"epoch": 8.17652329749104,
"grad_norm": 0.5101547241210938,
"learning_rate": 4.900323083105668e-06,
"loss": 0.6655,
"num_input_tokens_seen": 3438328,
"step": 9125
},
{
"epoch": 8.18100358422939,
"grad_norm": 0.3391803205013275,
"learning_rate": 4.877098246221881e-06,
"loss": 0.6881,
"num_input_tokens_seen": 3440088,
"step": 9130
},
{
"epoch": 8.185483870967742,
"grad_norm": 0.4887087643146515,
"learning_rate": 4.853922626533749e-06,
"loss": 0.6946,
"num_input_tokens_seen": 3441912,
"step": 9135
},
{
"epoch": 8.189964157706093,
"grad_norm": 0.48914480209350586,
"learning_rate": 4.830796280724873e-06,
"loss": 0.6883,
"num_input_tokens_seen": 3443832,
"step": 9140
},
{
"epoch": 8.194444444444445,
"grad_norm": 0.4121975302696228,
"learning_rate": 4.807719265358377e-06,
"loss": 0.6959,
"num_input_tokens_seen": 3445720,
"step": 9145
},
{
"epoch": 8.198924731182796,
"grad_norm": 0.4341879189014435,
"learning_rate": 4.7846916368767094e-06,
"loss": 0.6814,
"num_input_tokens_seen": 3447544,
"step": 9150
},
{
"epoch": 8.203405017921147,
"grad_norm": 0.6341235041618347,
"learning_rate": 4.761713451601532e-06,
"loss": 0.6919,
"num_input_tokens_seen": 3449400,
"step": 9155
},
{
"epoch": 8.207885304659499,
"grad_norm": 0.3464028835296631,
"learning_rate": 4.738784765733586e-06,
"loss": 0.6876,
"num_input_tokens_seen": 3451256,
"step": 9160
},
{
"epoch": 8.21236559139785,
"grad_norm": 0.4516923129558563,
"learning_rate": 4.715905635352541e-06,
"loss": 0.6937,
"num_input_tokens_seen": 3453240,
"step": 9165
},
{
"epoch": 8.216845878136201,
"grad_norm": 0.5117191672325134,
"learning_rate": 4.6930761164168395e-06,
"loss": 0.7015,
"num_input_tokens_seen": 3455064,
"step": 9170
},
{
"epoch": 8.221326164874553,
"grad_norm": 0.5112479329109192,
"learning_rate": 4.670296264763618e-06,
"loss": 0.6783,
"num_input_tokens_seen": 3456888,
"step": 9175
},
{
"epoch": 8.225806451612904,
"grad_norm": 0.6857882738113403,
"learning_rate": 4.6475661361085195e-06,
"loss": 0.7114,
"num_input_tokens_seen": 3458776,
"step": 9180
},
{
"epoch": 8.230286738351255,
"grad_norm": 0.6740723848342896,
"learning_rate": 4.624885786045563e-06,
"loss": 0.6879,
"num_input_tokens_seen": 3460600,
"step": 9185
},
{
"epoch": 8.234767025089607,
"grad_norm": 0.5737828016281128,
"learning_rate": 4.602255270047048e-06,
"loss": 0.68,
"num_input_tokens_seen": 3462552,
"step": 9190
},
{
"epoch": 8.239247311827956,
"grad_norm": 0.5475903749465942,
"learning_rate": 4.579674643463341e-06,
"loss": 0.7221,
"num_input_tokens_seen": 3464568,
"step": 9195
},
{
"epoch": 8.243727598566307,
"grad_norm": 0.43242955207824707,
"learning_rate": 4.557143961522836e-06,
"loss": 0.6986,
"num_input_tokens_seen": 3466328,
"step": 9200
},
{
"epoch": 8.248207885304659,
"grad_norm": 0.6515691876411438,
"learning_rate": 4.534663279331744e-06,
"loss": 0.6631,
"num_input_tokens_seen": 3468248,
"step": 9205
},
{
"epoch": 8.25268817204301,
"grad_norm": 0.6092105507850647,
"learning_rate": 4.512232651873982e-06,
"loss": 0.6971,
"num_input_tokens_seen": 3470200,
"step": 9210
},
{
"epoch": 8.257168458781361,
"grad_norm": 0.6402620077133179,
"learning_rate": 4.489852134011061e-06,
"loss": 0.6802,
"num_input_tokens_seen": 3472184,
"step": 9215
},
{
"epoch": 8.261648745519713,
"grad_norm": 0.4998003840446472,
"learning_rate": 4.46752178048192e-06,
"loss": 0.6781,
"num_input_tokens_seen": 3474008,
"step": 9220
},
{
"epoch": 8.266129032258064,
"grad_norm": 0.548748254776001,
"learning_rate": 4.445241645902804e-06,
"loss": 0.6829,
"num_input_tokens_seen": 3475896,
"step": 9225
},
{
"epoch": 8.270609318996415,
"grad_norm": 0.5525373220443726,
"learning_rate": 4.423011784767133e-06,
"loss": 0.683,
"num_input_tokens_seen": 3477880,
"step": 9230
},
{
"epoch": 8.275089605734767,
"grad_norm": 0.7804484963417053,
"learning_rate": 4.400832251445361e-06,
"loss": 0.6933,
"num_input_tokens_seen": 3479832,
"step": 9235
},
{
"epoch": 8.279569892473118,
"grad_norm": 0.5897542834281921,
"learning_rate": 4.378703100184869e-06,
"loss": 0.677,
"num_input_tokens_seen": 3481976,
"step": 9240
},
{
"epoch": 8.28405017921147,
"grad_norm": 0.335835337638855,
"learning_rate": 4.35662438510979e-06,
"loss": 0.6744,
"num_input_tokens_seen": 3483832,
"step": 9245
},
{
"epoch": 8.28853046594982,
"grad_norm": 0.7230558395385742,
"learning_rate": 4.334596160220905e-06,
"loss": 0.696,
"num_input_tokens_seen": 3485720,
"step": 9250
},
{
"epoch": 8.293010752688172,
"grad_norm": 0.5177806615829468,
"learning_rate": 4.312618479395506e-06,
"loss": 0.717,
"num_input_tokens_seen": 3487640,
"step": 9255
},
{
"epoch": 8.297491039426523,
"grad_norm": 0.6857832670211792,
"learning_rate": 4.290691396387258e-06,
"loss": 0.6967,
"num_input_tokens_seen": 3489688,
"step": 9260
},
{
"epoch": 8.301971326164875,
"grad_norm": 0.443286269903183,
"learning_rate": 4.268814964826093e-06,
"loss": 0.7012,
"num_input_tokens_seen": 3491512,
"step": 9265
},
{
"epoch": 8.306451612903226,
"grad_norm": 0.6708531975746155,
"learning_rate": 4.24698923821803e-06,
"loss": 0.6813,
"num_input_tokens_seen": 3493560,
"step": 9270
},
{
"epoch": 8.310931899641577,
"grad_norm": 0.5990952253341675,
"learning_rate": 4.225214269945088e-06,
"loss": 0.7024,
"num_input_tokens_seen": 3495608,
"step": 9275
},
{
"epoch": 8.315412186379929,
"grad_norm": 0.5891165137290955,
"learning_rate": 4.203490113265138e-06,
"loss": 0.6855,
"num_input_tokens_seen": 3497464,
"step": 9280
},
{
"epoch": 8.31989247311828,
"grad_norm": 0.5316137671470642,
"learning_rate": 4.181816821311763e-06,
"loss": 0.6746,
"num_input_tokens_seen": 3499416,
"step": 9285
},
{
"epoch": 8.324372759856631,
"grad_norm": 0.7413282990455627,
"learning_rate": 4.160194447094162e-06,
"loss": 0.6805,
"num_input_tokens_seen": 3501400,
"step": 9290
},
{
"epoch": 8.328853046594983,
"grad_norm": 0.41643762588500977,
"learning_rate": 4.138623043496981e-06,
"loss": 0.7057,
"num_input_tokens_seen": 3503160,
"step": 9295
},
{
"epoch": 8.333333333333334,
"grad_norm": 0.5769673585891724,
"learning_rate": 4.1171026632802035e-06,
"loss": 0.6817,
"num_input_tokens_seen": 3504952,
"step": 9300
},
{
"epoch": 8.337813620071685,
"grad_norm": 0.5316203236579895,
"learning_rate": 4.095633359079024e-06,
"loss": 0.6856,
"num_input_tokens_seen": 3506680,
"step": 9305
},
{
"epoch": 8.342293906810037,
"grad_norm": 0.6140154600143433,
"learning_rate": 4.074215183403701e-06,
"loss": 0.6866,
"num_input_tokens_seen": 3508600,
"step": 9310
},
{
"epoch": 8.346774193548388,
"grad_norm": 0.440112441778183,
"learning_rate": 4.052848188639452e-06,
"loss": 0.6845,
"num_input_tokens_seen": 3510520,
"step": 9315
},
{
"epoch": 8.351254480286737,
"grad_norm": 0.4764098823070526,
"learning_rate": 4.031532427046322e-06,
"loss": 0.6773,
"num_input_tokens_seen": 3512312,
"step": 9320
},
{
"epoch": 8.355734767025089,
"grad_norm": 0.7385228276252747,
"learning_rate": 4.010267950759025e-06,
"loss": 0.6937,
"num_input_tokens_seen": 3514200,
"step": 9325
},
{
"epoch": 8.36021505376344,
"grad_norm": 0.41506174206733704,
"learning_rate": 3.989054811786874e-06,
"loss": 0.7099,
"num_input_tokens_seen": 3516024,
"step": 9330
},
{
"epoch": 8.364695340501791,
"grad_norm": 0.5923046469688416,
"learning_rate": 3.967893062013581e-06,
"loss": 0.6867,
"num_input_tokens_seen": 3517912,
"step": 9335
},
{
"epoch": 8.369175627240143,
"grad_norm": 0.470580518245697,
"learning_rate": 3.946782753197187e-06,
"loss": 0.6977,
"num_input_tokens_seen": 3519768,
"step": 9340
},
{
"epoch": 8.373655913978494,
"grad_norm": 0.4065123200416565,
"learning_rate": 3.925723936969927e-06,
"loss": 0.6779,
"num_input_tokens_seen": 3521560,
"step": 9345
},
{
"epoch": 8.378136200716845,
"grad_norm": 0.5575389266014099,
"learning_rate": 3.9047166648380844e-06,
"loss": 0.7009,
"num_input_tokens_seen": 3523448,
"step": 9350
},
{
"epoch": 8.382616487455197,
"grad_norm": 0.5877669453620911,
"learning_rate": 3.883760988181867e-06,
"loss": 0.6966,
"num_input_tokens_seen": 3525176,
"step": 9355
},
{
"epoch": 8.387096774193548,
"grad_norm": 0.6943244338035583,
"learning_rate": 3.862856958255304e-06,
"loss": 0.7053,
"num_input_tokens_seen": 3527128,
"step": 9360
},
{
"epoch": 8.3915770609319,
"grad_norm": 0.5657069087028503,
"learning_rate": 3.842004626186085e-06,
"loss": 0.6851,
"num_input_tokens_seen": 3528856,
"step": 9365
},
{
"epoch": 8.39605734767025,
"grad_norm": 0.40441882610321045,
"learning_rate": 3.821204042975482e-06,
"loss": 0.7008,
"num_input_tokens_seen": 3530648,
"step": 9370
},
{
"epoch": 8.400537634408602,
"grad_norm": 0.43344125151634216,
"learning_rate": 3.8004552594981815e-06,
"loss": 0.7028,
"num_input_tokens_seen": 3532376,
"step": 9375
},
{
"epoch": 8.405017921146953,
"grad_norm": 0.5527206659317017,
"learning_rate": 3.77975832650217e-06,
"loss": 0.7186,
"num_input_tokens_seen": 3534200,
"step": 9380
},
{
"epoch": 8.409498207885305,
"grad_norm": 0.5849681496620178,
"learning_rate": 3.7591132946086434e-06,
"loss": 0.6592,
"num_input_tokens_seen": 3536376,
"step": 9385
},
{
"epoch": 8.413978494623656,
"grad_norm": 0.6648504734039307,
"learning_rate": 3.7385202143118192e-06,
"loss": 0.681,
"num_input_tokens_seen": 3538392,
"step": 9390
},
{
"epoch": 8.418458781362007,
"grad_norm": 0.49053147435188293,
"learning_rate": 3.717979135978883e-06,
"loss": 0.6819,
"num_input_tokens_seen": 3540280,
"step": 9395
},
{
"epoch": 8.422939068100359,
"grad_norm": 0.6155134439468384,
"learning_rate": 3.697490109849816e-06,
"loss": 0.694,
"num_input_tokens_seen": 3542136,
"step": 9400
},
{
"epoch": 8.42741935483871,
"grad_norm": 0.583812415599823,
"learning_rate": 3.6770531860372853e-06,
"loss": 0.694,
"num_input_tokens_seen": 3543992,
"step": 9405
},
{
"epoch": 8.431899641577061,
"grad_norm": 0.4418407082557678,
"learning_rate": 3.6566684145265483e-06,
"loss": 0.6877,
"num_input_tokens_seen": 3545784,
"step": 9410
},
{
"epoch": 8.436379928315413,
"grad_norm": 0.6325966715812683,
"learning_rate": 3.636335845175265e-06,
"loss": 0.6729,
"num_input_tokens_seen": 3547800,
"step": 9415
},
{
"epoch": 8.440860215053764,
"grad_norm": 0.3695630729198456,
"learning_rate": 3.616055527713463e-06,
"loss": 0.7098,
"num_input_tokens_seen": 3549528,
"step": 9420
},
{
"epoch": 8.445340501792115,
"grad_norm": 0.6490418910980225,
"learning_rate": 3.595827511743341e-06,
"loss": 0.7014,
"num_input_tokens_seen": 3551416,
"step": 9425
},
{
"epoch": 8.449820788530467,
"grad_norm": 0.6782343983650208,
"learning_rate": 3.575651846739181e-06,
"loss": 0.6723,
"num_input_tokens_seen": 3553336,
"step": 9430
},
{
"epoch": 8.454301075268818,
"grad_norm": 0.6364790797233582,
"learning_rate": 3.5555285820472435e-06,
"loss": 0.6909,
"num_input_tokens_seen": 3555096,
"step": 9435
},
{
"epoch": 8.45878136200717,
"grad_norm": 0.5496588945388794,
"learning_rate": 3.5354577668856083e-06,
"loss": 0.6837,
"num_input_tokens_seen": 3556952,
"step": 9440
},
{
"epoch": 8.46326164874552,
"grad_norm": 0.6158499717712402,
"learning_rate": 3.5154394503440576e-06,
"loss": 0.686,
"num_input_tokens_seen": 3558776,
"step": 9445
},
{
"epoch": 8.46774193548387,
"grad_norm": 0.5592875480651855,
"learning_rate": 3.4954736813840095e-06,
"loss": 0.677,
"num_input_tokens_seen": 3560792,
"step": 9450
},
{
"epoch": 8.472222222222221,
"grad_norm": 0.5937447547912598,
"learning_rate": 3.47556050883833e-06,
"loss": 0.7008,
"num_input_tokens_seen": 3562680,
"step": 9455
},
{
"epoch": 8.476702508960573,
"grad_norm": 0.45481178164482117,
"learning_rate": 3.455699981411259e-06,
"loss": 0.6613,
"num_input_tokens_seen": 3564696,
"step": 9460
},
{
"epoch": 8.481182795698924,
"grad_norm": 0.61704421043396,
"learning_rate": 3.4358921476782714e-06,
"loss": 0.6897,
"num_input_tokens_seen": 3566520,
"step": 9465
},
{
"epoch": 8.485663082437275,
"grad_norm": 0.8740954995155334,
"learning_rate": 3.416137056085944e-06,
"loss": 0.6944,
"num_input_tokens_seen": 3568536,
"step": 9470
},
{
"epoch": 8.490143369175627,
"grad_norm": 0.5003806352615356,
"learning_rate": 3.3964347549518883e-06,
"loss": 0.6639,
"num_input_tokens_seen": 3570360,
"step": 9475
},
{
"epoch": 8.494623655913978,
"grad_norm": 0.7029122114181519,
"learning_rate": 3.376785292464574e-06,
"loss": 0.7039,
"num_input_tokens_seen": 3572280,
"step": 9480
},
{
"epoch": 8.49910394265233,
"grad_norm": 0.6127151250839233,
"learning_rate": 3.3571887166832434e-06,
"loss": 0.6701,
"num_input_tokens_seen": 3574200,
"step": 9485
},
{
"epoch": 8.5,
"eval_loss": 0.6988206505775452,
"eval_runtime": 5.621,
"eval_samples_per_second": 88.241,
"eval_steps_per_second": 22.06,
"num_input_tokens_seen": 3574616,
"step": 9486
},
{
"epoch": 8.50358422939068,
"grad_norm": 0.5318965315818787,
"learning_rate": 3.3376450755377958e-06,
"loss": 0.6762,
"num_input_tokens_seen": 3576344,
"step": 9490
},
{
"epoch": 8.508064516129032,
"grad_norm": 0.4716894030570984,
"learning_rate": 3.3181544168286503e-06,
"loss": 0.6827,
"num_input_tokens_seen": 3578296,
"step": 9495
},
{
"epoch": 8.512544802867383,
"grad_norm": 0.46207571029663086,
"learning_rate": 3.298716788226644e-06,
"loss": 0.6788,
"num_input_tokens_seen": 3580216,
"step": 9500
},
{
"epoch": 8.517025089605735,
"grad_norm": 0.7773041725158691,
"learning_rate": 3.2793322372729085e-06,
"loss": 0.6783,
"num_input_tokens_seen": 3582200,
"step": 9505
},
{
"epoch": 8.521505376344086,
"grad_norm": 0.6034818887710571,
"learning_rate": 3.260000811378755e-06,
"loss": 0.6997,
"num_input_tokens_seen": 3584152,
"step": 9510
},
{
"epoch": 8.525985663082437,
"grad_norm": 0.5097447633743286,
"learning_rate": 3.240722557825576e-06,
"loss": 0.6898,
"num_input_tokens_seen": 3585816,
"step": 9515
},
{
"epoch": 8.530465949820789,
"grad_norm": 0.5300137400627136,
"learning_rate": 3.2214975237646937e-06,
"loss": 0.6626,
"num_input_tokens_seen": 3587896,
"step": 9520
},
{
"epoch": 8.53494623655914,
"grad_norm": 0.6867164373397827,
"learning_rate": 3.2023257562172725e-06,
"loss": 0.6889,
"num_input_tokens_seen": 3589752,
"step": 9525
},
{
"epoch": 8.539426523297491,
"grad_norm": 0.4473750591278076,
"learning_rate": 3.1832073020741983e-06,
"loss": 0.6506,
"num_input_tokens_seen": 3591672,
"step": 9530
},
{
"epoch": 8.543906810035843,
"grad_norm": 0.4385966658592224,
"learning_rate": 3.1641422080959465e-06,
"loss": 0.6698,
"num_input_tokens_seen": 3593656,
"step": 9535
},
{
"epoch": 8.548387096774194,
"grad_norm": 0.5705263018608093,
"learning_rate": 3.145130520912515e-06,
"loss": 0.6846,
"num_input_tokens_seen": 3595832,
"step": 9540
},
{
"epoch": 8.552867383512545,
"grad_norm": 0.4351136088371277,
"learning_rate": 3.1261722870232436e-06,
"loss": 0.6792,
"num_input_tokens_seen": 3597528,
"step": 9545
},
{
"epoch": 8.557347670250897,
"grad_norm": 0.863899827003479,
"learning_rate": 3.1072675527967526e-06,
"loss": 0.7347,
"num_input_tokens_seen": 3599480,
"step": 9550
},
{
"epoch": 8.561827956989248,
"grad_norm": 0.5281751155853271,
"learning_rate": 3.0884163644708084e-06,
"loss": 0.679,
"num_input_tokens_seen": 3601368,
"step": 9555
},
{
"epoch": 8.5663082437276,
"grad_norm": 0.5913388133049011,
"learning_rate": 3.069618768152202e-06,
"loss": 0.7038,
"num_input_tokens_seen": 3603288,
"step": 9560
},
{
"epoch": 8.57078853046595,
"grad_norm": 0.4851022958755493,
"learning_rate": 3.050874809816673e-06,
"loss": 0.6821,
"num_input_tokens_seen": 3605048,
"step": 9565
},
{
"epoch": 8.575268817204302,
"grad_norm": 0.3585127294063568,
"learning_rate": 3.0321845353087463e-06,
"loss": 0.697,
"num_input_tokens_seen": 3606840,
"step": 9570
},
{
"epoch": 8.579749103942653,
"grad_norm": 0.5370261669158936,
"learning_rate": 3.0135479903416586e-06,
"loss": 0.6821,
"num_input_tokens_seen": 3608728,
"step": 9575
},
{
"epoch": 8.584229390681003,
"grad_norm": 0.7784853577613831,
"learning_rate": 2.9949652204972254e-06,
"loss": 0.6823,
"num_input_tokens_seen": 3610552,
"step": 9580
},
{
"epoch": 8.588709677419354,
"grad_norm": 0.5405849814414978,
"learning_rate": 2.976436271225741e-06,
"loss": 0.6896,
"num_input_tokens_seen": 3612472,
"step": 9585
},
{
"epoch": 8.593189964157705,
"grad_norm": 0.4924999475479126,
"learning_rate": 2.95796118784587e-06,
"loss": 0.6981,
"num_input_tokens_seen": 3614360,
"step": 9590
},
{
"epoch": 8.597670250896057,
"grad_norm": 0.8940668106079102,
"learning_rate": 2.939540015544523e-06,
"loss": 0.7345,
"num_input_tokens_seen": 3616216,
"step": 9595
},
{
"epoch": 8.602150537634408,
"grad_norm": 0.5306518077850342,
"learning_rate": 2.9211727993767507e-06,
"loss": 0.669,
"num_input_tokens_seen": 3617880,
"step": 9600
},
{
"epoch": 8.60663082437276,
"grad_norm": 0.5440559387207031,
"learning_rate": 2.902859584265649e-06,
"loss": 0.6866,
"num_input_tokens_seen": 3619736,
"step": 9605
},
{
"epoch": 8.61111111111111,
"grad_norm": 0.5547166466712952,
"learning_rate": 2.88460041500222e-06,
"loss": 0.6938,
"num_input_tokens_seen": 3621560,
"step": 9610
},
{
"epoch": 8.615591397849462,
"grad_norm": 0.27369892597198486,
"learning_rate": 2.866395336245284e-06,
"loss": 0.6766,
"num_input_tokens_seen": 3623224,
"step": 9615
},
{
"epoch": 8.620071684587813,
"grad_norm": 0.48885655403137207,
"learning_rate": 2.8482443925213765e-06,
"loss": 0.6696,
"num_input_tokens_seen": 3625208,
"step": 9620
},
{
"epoch": 8.624551971326165,
"grad_norm": 0.5681203603744507,
"learning_rate": 2.8301476282246164e-06,
"loss": 0.6538,
"num_input_tokens_seen": 3627192,
"step": 9625
},
{
"epoch": 8.629032258064516,
"grad_norm": 0.6536678075790405,
"learning_rate": 2.8121050876166096e-06,
"loss": 0.6999,
"num_input_tokens_seen": 3629112,
"step": 9630
},
{
"epoch": 8.633512544802867,
"grad_norm": 0.5405173897743225,
"learning_rate": 2.794116814826342e-06,
"loss": 0.686,
"num_input_tokens_seen": 3630808,
"step": 9635
},
{
"epoch": 8.637992831541219,
"grad_norm": 0.4284382462501526,
"learning_rate": 2.776182853850065e-06,
"loss": 0.6639,
"num_input_tokens_seen": 3632664,
"step": 9640
},
{
"epoch": 8.64247311827957,
"grad_norm": 0.4499342441558838,
"learning_rate": 2.758303248551211e-06,
"loss": 0.6603,
"num_input_tokens_seen": 3634456,
"step": 9645
},
{
"epoch": 8.646953405017921,
"grad_norm": 0.4879976809024811,
"learning_rate": 2.740478042660244e-06,
"loss": 0.6835,
"num_input_tokens_seen": 3636216,
"step": 9650
},
{
"epoch": 8.651433691756273,
"grad_norm": 0.33876270055770874,
"learning_rate": 2.7227072797745833e-06,
"loss": 0.6855,
"num_input_tokens_seen": 3638200,
"step": 9655
},
{
"epoch": 8.655913978494624,
"grad_norm": 0.7893423438072205,
"learning_rate": 2.7049910033585093e-06,
"loss": 0.7201,
"num_input_tokens_seen": 3640088,
"step": 9660
},
{
"epoch": 8.660394265232975,
"grad_norm": 0.4228385090827942,
"learning_rate": 2.6873292567429986e-06,
"loss": 0.6837,
"num_input_tokens_seen": 3641944,
"step": 9665
},
{
"epoch": 8.664874551971327,
"grad_norm": 0.5193427801132202,
"learning_rate": 2.6697220831256974e-06,
"loss": 0.6952,
"num_input_tokens_seen": 3643896,
"step": 9670
},
{
"epoch": 8.669354838709678,
"grad_norm": 0.5370306372642517,
"learning_rate": 2.6521695255707495e-06,
"loss": 0.7183,
"num_input_tokens_seen": 3645784,
"step": 9675
},
{
"epoch": 8.67383512544803,
"grad_norm": 0.6196435689926147,
"learning_rate": 2.6346716270087253e-06,
"loss": 0.7019,
"num_input_tokens_seen": 3647800,
"step": 9680
},
{
"epoch": 8.67831541218638,
"grad_norm": 0.8368743658065796,
"learning_rate": 2.617228430236521e-06,
"loss": 0.6967,
"num_input_tokens_seen": 3649624,
"step": 9685
},
{
"epoch": 8.682795698924732,
"grad_norm": 0.6965826153755188,
"learning_rate": 2.5998399779172123e-06,
"loss": 0.7065,
"num_input_tokens_seen": 3651416,
"step": 9690
},
{
"epoch": 8.687275985663083,
"grad_norm": 0.6506573557853699,
"learning_rate": 2.5825063125800074e-06,
"loss": 0.7179,
"num_input_tokens_seen": 3653464,
"step": 9695
},
{
"epoch": 8.691756272401435,
"grad_norm": 0.4431875944137573,
"learning_rate": 2.565227476620105e-06,
"loss": 0.6612,
"num_input_tokens_seen": 3655320,
"step": 9700
},
{
"epoch": 8.696236559139784,
"grad_norm": 0.6212185621261597,
"learning_rate": 2.5480035122985885e-06,
"loss": 0.6481,
"num_input_tokens_seen": 3657400,
"step": 9705
},
{
"epoch": 8.700716845878135,
"grad_norm": 0.5748448967933655,
"learning_rate": 2.530834461742357e-06,
"loss": 0.6894,
"num_input_tokens_seen": 3659256,
"step": 9710
},
{
"epoch": 8.705197132616487,
"grad_norm": 0.4338792860507965,
"learning_rate": 2.513720366943986e-06,
"loss": 0.702,
"num_input_tokens_seen": 3661080,
"step": 9715
},
{
"epoch": 8.709677419354838,
"grad_norm": 0.43087977170944214,
"learning_rate": 2.4966612697616382e-06,
"loss": 0.6779,
"num_input_tokens_seen": 3663128,
"step": 9720
},
{
"epoch": 8.71415770609319,
"grad_norm": 0.7099825739860535,
"learning_rate": 2.4796572119189647e-06,
"loss": 0.7211,
"num_input_tokens_seen": 3665144,
"step": 9725
},
{
"epoch": 8.71863799283154,
"grad_norm": 0.4541792869567871,
"learning_rate": 2.462708235004996e-06,
"loss": 0.7046,
"num_input_tokens_seen": 3666968,
"step": 9730
},
{
"epoch": 8.723118279569892,
"grad_norm": 0.5138086676597595,
"learning_rate": 2.445814380474057e-06,
"loss": 0.6604,
"num_input_tokens_seen": 3668952,
"step": 9735
},
{
"epoch": 8.727598566308243,
"grad_norm": 0.4486521780490875,
"learning_rate": 2.4289756896456434e-06,
"loss": 0.677,
"num_input_tokens_seen": 3670744,
"step": 9740
},
{
"epoch": 8.732078853046595,
"grad_norm": 0.6109570264816284,
"learning_rate": 2.412192203704311e-06,
"loss": 0.6889,
"num_input_tokens_seen": 3672600,
"step": 9745
},
{
"epoch": 8.736559139784946,
"grad_norm": 0.5936306715011597,
"learning_rate": 2.395463963699629e-06,
"loss": 0.6976,
"num_input_tokens_seen": 3674360,
"step": 9750
},
{
"epoch": 8.741039426523297,
"grad_norm": 0.6088002324104309,
"learning_rate": 2.3787910105460247e-06,
"loss": 0.7359,
"num_input_tokens_seen": 3676152,
"step": 9755
},
{
"epoch": 8.745519713261649,
"grad_norm": 0.49755457043647766,
"learning_rate": 2.362173385022701e-06,
"loss": 0.6827,
"num_input_tokens_seen": 3678104,
"step": 9760
},
{
"epoch": 8.75,
"grad_norm": 0.5336976647377014,
"learning_rate": 2.3456111277735506e-06,
"loss": 0.7125,
"num_input_tokens_seen": 3679864,
"step": 9765
},
{
"epoch": 8.754480286738351,
"grad_norm": 0.42951998114585876,
"learning_rate": 2.3291042793070374e-06,
"loss": 0.6635,
"num_input_tokens_seen": 3681720,
"step": 9770
},
{
"epoch": 8.758960573476703,
"grad_norm": 0.46314573287963867,
"learning_rate": 2.3126528799961024e-06,
"loss": 0.6886,
"num_input_tokens_seen": 3683832,
"step": 9775
},
{
"epoch": 8.763440860215054,
"grad_norm": 0.5346803069114685,
"learning_rate": 2.2962569700780726e-06,
"loss": 0.646,
"num_input_tokens_seen": 3685752,
"step": 9780
},
{
"epoch": 8.767921146953405,
"grad_norm": 0.4398757219314575,
"learning_rate": 2.279916589654549e-06,
"loss": 0.6684,
"num_input_tokens_seen": 3687704,
"step": 9785
},
{
"epoch": 8.772401433691757,
"grad_norm": 0.5858141183853149,
"learning_rate": 2.263631778691333e-06,
"loss": 0.6911,
"num_input_tokens_seen": 3689624,
"step": 9790
},
{
"epoch": 8.776881720430108,
"grad_norm": 0.7604189515113831,
"learning_rate": 2.2474025770182982e-06,
"loss": 0.6823,
"num_input_tokens_seen": 3691544,
"step": 9795
},
{
"epoch": 8.78136200716846,
"grad_norm": 0.46800726652145386,
"learning_rate": 2.2312290243293147e-06,
"loss": 0.6948,
"num_input_tokens_seen": 3693368,
"step": 9800
},
{
"epoch": 8.78584229390681,
"grad_norm": 0.40996742248535156,
"learning_rate": 2.21511116018214e-06,
"loss": 0.6847,
"num_input_tokens_seen": 3695224,
"step": 9805
},
{
"epoch": 8.790322580645162,
"grad_norm": 0.5102288126945496,
"learning_rate": 2.199049023998323e-06,
"loss": 0.7095,
"num_input_tokens_seen": 3697048,
"step": 9810
},
{
"epoch": 8.794802867383513,
"grad_norm": 0.4394807517528534,
"learning_rate": 2.1830426550631276e-06,
"loss": 0.6739,
"num_input_tokens_seen": 3699032,
"step": 9815
},
{
"epoch": 8.799283154121865,
"grad_norm": 0.6569847464561462,
"learning_rate": 2.1670920925254053e-06,
"loss": 0.664,
"num_input_tokens_seen": 3700888,
"step": 9820
},
{
"epoch": 8.803763440860216,
"grad_norm": 0.4514296054840088,
"learning_rate": 2.1511973753975208e-06,
"loss": 0.6695,
"num_input_tokens_seen": 3702680,
"step": 9825
},
{
"epoch": 8.808243727598565,
"grad_norm": 0.7222883105278015,
"learning_rate": 2.1353585425552463e-06,
"loss": 0.6985,
"num_input_tokens_seen": 3704536,
"step": 9830
},
{
"epoch": 8.812724014336917,
"grad_norm": 0.5998217463493347,
"learning_rate": 2.1195756327376722e-06,
"loss": 0.6851,
"num_input_tokens_seen": 3706360,
"step": 9835
},
{
"epoch": 8.817204301075268,
"grad_norm": 0.5000945329666138,
"learning_rate": 2.1038486845471215e-06,
"loss": 0.6977,
"num_input_tokens_seen": 3708088,
"step": 9840
},
{
"epoch": 8.82168458781362,
"grad_norm": 0.48113876581192017,
"learning_rate": 2.0881777364490265e-06,
"loss": 0.6994,
"num_input_tokens_seen": 3710040,
"step": 9845
},
{
"epoch": 8.82616487455197,
"grad_norm": 0.5569098591804504,
"learning_rate": 2.0725628267718595e-06,
"loss": 0.6841,
"num_input_tokens_seen": 3711928,
"step": 9850
},
{
"epoch": 8.830645161290322,
"grad_norm": 0.46733352541923523,
"learning_rate": 2.0570039937070463e-06,
"loss": 0.6954,
"num_input_tokens_seen": 3713720,
"step": 9855
},
{
"epoch": 8.835125448028673,
"grad_norm": 0.5141034126281738,
"learning_rate": 2.04150127530883e-06,
"loss": 0.7,
"num_input_tokens_seen": 3715416,
"step": 9860
},
{
"epoch": 8.839605734767025,
"grad_norm": 0.7939246892929077,
"learning_rate": 2.026054709494235e-06,
"loss": 0.6841,
"num_input_tokens_seen": 3717208,
"step": 9865
},
{
"epoch": 8.844086021505376,
"grad_norm": 0.6357170939445496,
"learning_rate": 2.0106643340429332e-06,
"loss": 0.7052,
"num_input_tokens_seen": 3718936,
"step": 9870
},
{
"epoch": 8.848566308243727,
"grad_norm": 0.6246017813682556,
"learning_rate": 1.995330186597158e-06,
"loss": 0.668,
"num_input_tokens_seen": 3720920,
"step": 9875
},
{
"epoch": 8.853046594982079,
"grad_norm": 0.5736755728721619,
"learning_rate": 1.980052304661642e-06,
"loss": 0.6858,
"num_input_tokens_seen": 3722776,
"step": 9880
},
{
"epoch": 8.85752688172043,
"grad_norm": 0.6197718977928162,
"learning_rate": 1.9648307256034697e-06,
"loss": 0.6752,
"num_input_tokens_seen": 3724792,
"step": 9885
},
{
"epoch": 8.862007168458781,
"grad_norm": 0.718443751335144,
"learning_rate": 1.9496654866520414e-06,
"loss": 0.6788,
"num_input_tokens_seen": 3726712,
"step": 9890
},
{
"epoch": 8.866487455197133,
"grad_norm": 0.6312748789787292,
"learning_rate": 1.9345566248989534e-06,
"loss": 0.7045,
"num_input_tokens_seen": 3728696,
"step": 9895
},
{
"epoch": 8.870967741935484,
"grad_norm": 0.4447576403617859,
"learning_rate": 1.9195041772979093e-06,
"loss": 0.6994,
"num_input_tokens_seen": 3730488,
"step": 9900
},
{
"epoch": 8.875448028673835,
"grad_norm": 0.7500755190849304,
"learning_rate": 1.9045081806646436e-06,
"loss": 0.7111,
"num_input_tokens_seen": 3732440,
"step": 9905
},
{
"epoch": 8.879928315412187,
"grad_norm": 0.5350488424301147,
"learning_rate": 1.8895686716768113e-06,
"loss": 0.7168,
"num_input_tokens_seen": 3734488,
"step": 9910
},
{
"epoch": 8.884408602150538,
"grad_norm": 0.4130159020423889,
"learning_rate": 1.8746856868739004e-06,
"loss": 0.6988,
"num_input_tokens_seen": 3736472,
"step": 9915
},
{
"epoch": 8.88888888888889,
"grad_norm": 0.9204648733139038,
"learning_rate": 1.8598592626571737e-06,
"loss": 0.7053,
"num_input_tokens_seen": 3738264,
"step": 9920
},
{
"epoch": 8.89336917562724,
"grad_norm": 0.542391836643219,
"learning_rate": 1.8450894352895375e-06,
"loss": 0.7031,
"num_input_tokens_seen": 3740056,
"step": 9925
},
{
"epoch": 8.897849462365592,
"grad_norm": 0.7514336109161377,
"learning_rate": 1.8303762408954761e-06,
"loss": 0.6645,
"num_input_tokens_seen": 3742008,
"step": 9930
},
{
"epoch": 8.902329749103943,
"grad_norm": 0.6825776696205139,
"learning_rate": 1.81571971546097e-06,
"loss": 0.6812,
"num_input_tokens_seen": 3743864,
"step": 9935
},
{
"epoch": 8.906810035842295,
"grad_norm": 0.5593582987785339,
"learning_rate": 1.8011198948333751e-06,
"loss": 0.6949,
"num_input_tokens_seen": 3745752,
"step": 9940
},
{
"epoch": 8.911290322580646,
"grad_norm": 0.4969135522842407,
"learning_rate": 1.7865768147213802e-06,
"loss": 0.7029,
"num_input_tokens_seen": 3747576,
"step": 9945
},
{
"epoch": 8.915770609318997,
"grad_norm": 0.507438063621521,
"learning_rate": 1.7720905106948821e-06,
"loss": 0.7065,
"num_input_tokens_seen": 3749464,
"step": 9950
},
{
"epoch": 8.920250896057347,
"grad_norm": 0.6009644865989685,
"learning_rate": 1.7576610181849113e-06,
"loss": 0.6846,
"num_input_tokens_seen": 3751352,
"step": 9955
},
{
"epoch": 8.924731182795698,
"grad_norm": 0.5117262005805969,
"learning_rate": 1.7432883724835646e-06,
"loss": 0.6989,
"num_input_tokens_seen": 3753208,
"step": 9960
},
{
"epoch": 8.92921146953405,
"grad_norm": 0.45503732562065125,
"learning_rate": 1.7289726087438813e-06,
"loss": 0.6954,
"num_input_tokens_seen": 3755000,
"step": 9965
},
{
"epoch": 8.9336917562724,
"grad_norm": 0.5579915642738342,
"learning_rate": 1.7147137619797888e-06,
"loss": 0.6957,
"num_input_tokens_seen": 3756856,
"step": 9970
},
{
"epoch": 8.938172043010752,
"grad_norm": 0.3492272198200226,
"learning_rate": 1.7005118670659987e-06,
"loss": 0.7001,
"num_input_tokens_seen": 3758616,
"step": 9975
},
{
"epoch": 8.942652329749103,
"grad_norm": 0.47283822298049927,
"learning_rate": 1.6863669587379282e-06,
"loss": 0.7192,
"num_input_tokens_seen": 3760344,
"step": 9980
},
{
"epoch": 8.947132616487455,
"grad_norm": 0.5391170382499695,
"learning_rate": 1.6722790715916231e-06,
"loss": 0.6784,
"num_input_tokens_seen": 3762232,
"step": 9985
},
{
"epoch": 8.951612903225806,
"grad_norm": 0.6539673805236816,
"learning_rate": 1.658248240083657e-06,
"loss": 0.6817,
"num_input_tokens_seen": 3764088,
"step": 9990
},
{
"epoch": 8.956093189964157,
"grad_norm": 0.6863439083099365,
"learning_rate": 1.6442744985310593e-06,
"loss": 0.7096,
"num_input_tokens_seen": 3765656,
"step": 9995
},
{
"epoch": 8.960573476702509,
"grad_norm": 0.5178156495094299,
"learning_rate": 1.6303578811112246e-06,
"loss": 0.6845,
"num_input_tokens_seen": 3767352,
"step": 10000
},
{
"epoch": 8.96505376344086,
"grad_norm": 0.5608931183815002,
"learning_rate": 1.6164984218618285e-06,
"loss": 0.7052,
"num_input_tokens_seen": 3769240,
"step": 10005
},
{
"epoch": 8.969534050179211,
"grad_norm": 0.5038066506385803,
"learning_rate": 1.6026961546807605e-06,
"loss": 0.6746,
"num_input_tokens_seen": 3771032,
"step": 10010
},
{
"epoch": 8.974014336917563,
"grad_norm": 0.7060883045196533,
"learning_rate": 1.5889511133260121e-06,
"loss": 0.6875,
"num_input_tokens_seen": 3772952,
"step": 10015
},
{
"epoch": 8.978494623655914,
"grad_norm": 0.5578808188438416,
"learning_rate": 1.575263331415619e-06,
"loss": 0.66,
"num_input_tokens_seen": 3774904,
"step": 10020
},
{
"epoch": 8.982974910394265,
"grad_norm": 0.5263607501983643,
"learning_rate": 1.5616328424275656e-06,
"loss": 0.6953,
"num_input_tokens_seen": 3776696,
"step": 10025
},
{
"epoch": 8.987455197132617,
"grad_norm": 0.6655629277229309,
"learning_rate": 1.5480596796997094e-06,
"loss": 0.6753,
"num_input_tokens_seen": 3778648,
"step": 10030
},
{
"epoch": 8.991935483870968,
"grad_norm": 0.5918577909469604,
"learning_rate": 1.534543876429706e-06,
"loss": 0.7152,
"num_input_tokens_seen": 3780568,
"step": 10035
},
{
"epoch": 8.99641577060932,
"grad_norm": 0.5958806276321411,
"learning_rate": 1.521085465674904e-06,
"loss": 0.7297,
"num_input_tokens_seen": 3782488,
"step": 10040
},
{
"epoch": 9.0,
"eval_loss": 0.699591875076294,
"eval_runtime": 5.6395,
"eval_samples_per_second": 87.951,
"eval_steps_per_second": 21.988,
"num_input_tokens_seen": 3783840,
"step": 10044
},
{
"epoch": 9.00089605734767,
"grad_norm": 0.6611407399177551,
"learning_rate": 1.5076844803522922e-06,
"loss": 0.6816,
"num_input_tokens_seen": 3784384,
"step": 10045
},
{
"epoch": 9.005376344086022,
"grad_norm": 0.45874500274658203,
"learning_rate": 1.494340953238399e-06,
"loss": 0.6632,
"num_input_tokens_seen": 3786240,
"step": 10050
},
{
"epoch": 9.009856630824373,
"grad_norm": 0.5364258289337158,
"learning_rate": 1.481054916969221e-06,
"loss": 0.6888,
"num_input_tokens_seen": 3788032,
"step": 10055
},
{
"epoch": 9.014336917562725,
"grad_norm": 0.5259512662887573,
"learning_rate": 1.4678264040401458e-06,
"loss": 0.6833,
"num_input_tokens_seen": 3789920,
"step": 10060
},
{
"epoch": 9.018817204301076,
"grad_norm": 0.5970426797866821,
"learning_rate": 1.4546554468058665e-06,
"loss": 0.7168,
"num_input_tokens_seen": 3791616,
"step": 10065
},
{
"epoch": 9.023297491039427,
"grad_norm": 0.6536547541618347,
"learning_rate": 1.441542077480304e-06,
"loss": 0.7067,
"num_input_tokens_seen": 3793472,
"step": 10070
},
{
"epoch": 9.027777777777779,
"grad_norm": 0.5501564145088196,
"learning_rate": 1.428486328136533e-06,
"loss": 0.6724,
"num_input_tokens_seen": 3795200,
"step": 10075
},
{
"epoch": 9.03225806451613,
"grad_norm": 0.6372694969177246,
"learning_rate": 1.4154882307066907e-06,
"loss": 0.7086,
"num_input_tokens_seen": 3797152,
"step": 10080
},
{
"epoch": 9.03673835125448,
"grad_norm": 0.4424917697906494,
"learning_rate": 1.402547816981914e-06,
"loss": 0.6851,
"num_input_tokens_seen": 3799008,
"step": 10085
},
{
"epoch": 9.04121863799283,
"grad_norm": 0.5123321413993835,
"learning_rate": 1.3896651186122573e-06,
"loss": 0.6978,
"num_input_tokens_seen": 3801024,
"step": 10090
},
{
"epoch": 9.045698924731182,
"grad_norm": 0.6249075531959534,
"learning_rate": 1.3768401671066105e-06,
"loss": 0.7141,
"num_input_tokens_seen": 3803008,
"step": 10095
},
{
"epoch": 9.050179211469533,
"grad_norm": 0.6043964624404907,
"learning_rate": 1.3640729938326213e-06,
"loss": 0.7001,
"num_input_tokens_seen": 3804992,
"step": 10100
},
{
"epoch": 9.054659498207885,
"grad_norm": 0.36214983463287354,
"learning_rate": 1.351363630016622e-06,
"loss": 0.6571,
"num_input_tokens_seen": 3806848,
"step": 10105
},
{
"epoch": 9.059139784946236,
"grad_norm": 0.594513475894928,
"learning_rate": 1.3387121067435588e-06,
"loss": 0.6684,
"num_input_tokens_seen": 3808704,
"step": 10110
},
{
"epoch": 9.063620071684587,
"grad_norm": 0.5957300662994385,
"learning_rate": 1.3261184549569066e-06,
"loss": 0.7092,
"num_input_tokens_seen": 3810528,
"step": 10115
},
{
"epoch": 9.068100358422939,
"grad_norm": 0.5582435727119446,
"learning_rate": 1.3135827054585964e-06,
"loss": 0.6886,
"num_input_tokens_seen": 3812288,
"step": 10120
},
{
"epoch": 9.07258064516129,
"grad_norm": 0.45786210894584656,
"learning_rate": 1.3011048889089355e-06,
"loss": 0.6701,
"num_input_tokens_seen": 3814048,
"step": 10125
},
{
"epoch": 9.077060931899641,
"grad_norm": 0.6108914613723755,
"learning_rate": 1.288685035826548e-06,
"loss": 0.712,
"num_input_tokens_seen": 3815840,
"step": 10130
},
{
"epoch": 9.081541218637993,
"grad_norm": 0.5366916656494141,
"learning_rate": 1.2763231765882732e-06,
"loss": 0.6844,
"num_input_tokens_seen": 3817632,
"step": 10135
},
{
"epoch": 9.086021505376344,
"grad_norm": 0.6079829931259155,
"learning_rate": 1.2640193414291262e-06,
"loss": 0.6964,
"num_input_tokens_seen": 3819456,
"step": 10140
},
{
"epoch": 9.090501792114695,
"grad_norm": 0.6302862167358398,
"learning_rate": 1.2517735604421904e-06,
"loss": 0.6833,
"num_input_tokens_seen": 3821344,
"step": 10145
},
{
"epoch": 9.094982078853047,
"grad_norm": 0.5367377400398254,
"learning_rate": 1.2395858635785602e-06,
"loss": 0.6815,
"num_input_tokens_seen": 3823296,
"step": 10150
},
{
"epoch": 9.099462365591398,
"grad_norm": 0.5147011876106262,
"learning_rate": 1.2274562806472794e-06,
"loss": 0.7238,
"num_input_tokens_seen": 3825184,
"step": 10155
},
{
"epoch": 9.10394265232975,
"grad_norm": 0.49252960085868835,
"learning_rate": 1.2153848413152341e-06,
"loss": 0.7037,
"num_input_tokens_seen": 3827296,
"step": 10160
},
{
"epoch": 9.1084229390681,
"grad_norm": 0.3270881175994873,
"learning_rate": 1.2033715751071206e-06,
"loss": 0.6846,
"num_input_tokens_seen": 3828992,
"step": 10165
},
{
"epoch": 9.112903225806452,
"grad_norm": 0.6329634785652161,
"learning_rate": 1.191416511405341e-06,
"loss": 0.6666,
"num_input_tokens_seen": 3830880,
"step": 10170
},
{
"epoch": 9.117383512544803,
"grad_norm": 0.4612480401992798,
"learning_rate": 1.1795196794499475e-06,
"loss": 0.6996,
"num_input_tokens_seen": 3832864,
"step": 10175
},
{
"epoch": 9.121863799283155,
"grad_norm": 0.678649365901947,
"learning_rate": 1.1676811083385698e-06,
"loss": 0.6893,
"num_input_tokens_seen": 3834752,
"step": 10180
},
{
"epoch": 9.126344086021506,
"grad_norm": 0.5528131127357483,
"learning_rate": 1.155900827026346e-06,
"loss": 0.7061,
"num_input_tokens_seen": 3836640,
"step": 10185
},
{
"epoch": 9.130824372759857,
"grad_norm": 0.48890024423599243,
"learning_rate": 1.1441788643258233e-06,
"loss": 0.7156,
"num_input_tokens_seen": 3838528,
"step": 10190
},
{
"epoch": 9.135304659498209,
"grad_norm": 0.571433424949646,
"learning_rate": 1.1325152489069457e-06,
"loss": 0.6852,
"num_input_tokens_seen": 3840320,
"step": 10195
},
{
"epoch": 9.13978494623656,
"grad_norm": 0.4964417517185211,
"learning_rate": 1.1209100092969244e-06,
"loss": 0.6722,
"num_input_tokens_seen": 3842496,
"step": 10200
},
{
"epoch": 9.144265232974911,
"grad_norm": 0.39581766724586487,
"learning_rate": 1.109363173880204e-06,
"loss": 0.7177,
"num_input_tokens_seen": 3844352,
"step": 10205
},
{
"epoch": 9.14874551971326,
"grad_norm": 0.5092259049415588,
"learning_rate": 1.0978747708983854e-06,
"loss": 0.6978,
"num_input_tokens_seen": 3846304,
"step": 10210
},
{
"epoch": 9.153225806451612,
"grad_norm": 0.3758569359779358,
"learning_rate": 1.0864448284501394e-06,
"loss": 0.6779,
"num_input_tokens_seen": 3848192,
"step": 10215
},
{
"epoch": 9.157706093189963,
"grad_norm": 0.48183518648147583,
"learning_rate": 1.0750733744911674e-06,
"loss": 0.722,
"num_input_tokens_seen": 3850016,
"step": 10220
},
{
"epoch": 9.162186379928315,
"grad_norm": 0.6768030524253845,
"learning_rate": 1.063760436834113e-06,
"loss": 0.6925,
"num_input_tokens_seen": 3852000,
"step": 10225
},
{
"epoch": 9.166666666666666,
"grad_norm": 0.4611859917640686,
"learning_rate": 1.0525060431484907e-06,
"loss": 0.6945,
"num_input_tokens_seen": 3853760,
"step": 10230
},
{
"epoch": 9.171146953405017,
"grad_norm": 0.593660295009613,
"learning_rate": 1.0413102209606424e-06,
"loss": 0.6924,
"num_input_tokens_seen": 3855488,
"step": 10235
},
{
"epoch": 9.175627240143369,
"grad_norm": 0.35018354654312134,
"learning_rate": 1.0301729976536417e-06,
"loss": 0.6819,
"num_input_tokens_seen": 3857536,
"step": 10240
},
{
"epoch": 9.18010752688172,
"grad_norm": 0.6653251051902771,
"learning_rate": 1.0190944004672409e-06,
"loss": 0.6665,
"num_input_tokens_seen": 3859424,
"step": 10245
},
{
"epoch": 9.184587813620071,
"grad_norm": 0.4804827570915222,
"learning_rate": 1.0080744564978068e-06,
"loss": 0.6813,
"num_input_tokens_seen": 3861248,
"step": 10250
},
{
"epoch": 9.189068100358423,
"grad_norm": 0.5318313241004944,
"learning_rate": 9.971131926982458e-07,
"loss": 0.7229,
"num_input_tokens_seen": 3863168,
"step": 10255
},
{
"epoch": 9.193548387096774,
"grad_norm": 0.8895540237426758,
"learning_rate": 9.86210635877949e-07,
"loss": 0.6953,
"num_input_tokens_seen": 3865312,
"step": 10260
},
{
"epoch": 9.198028673835125,
"grad_norm": 0.5024261474609375,
"learning_rate": 9.753668127027133e-07,
"loss": 0.6892,
"num_input_tokens_seen": 3867328,
"step": 10265
},
{
"epoch": 9.202508960573477,
"grad_norm": 0.6652594208717346,
"learning_rate": 9.645817496946903e-07,
"loss": 0.6858,
"num_input_tokens_seen": 3869056,
"step": 10270
},
{
"epoch": 9.206989247311828,
"grad_norm": 0.5735182762145996,
"learning_rate": 9.538554732323041e-07,
"loss": 0.6699,
"num_input_tokens_seen": 3870976,
"step": 10275
},
{
"epoch": 9.21146953405018,
"grad_norm": 0.5266287326812744,
"learning_rate": 9.431880095502027e-07,
"loss": 0.6749,
"num_input_tokens_seen": 3872960,
"step": 10280
},
{
"epoch": 9.21594982078853,
"grad_norm": 0.4899725914001465,
"learning_rate": 9.325793847391962e-07,
"loss": 0.6694,
"num_input_tokens_seen": 3874976,
"step": 10285
},
{
"epoch": 9.220430107526882,
"grad_norm": 0.5171582698822021,
"learning_rate": 9.220296247461707e-07,
"loss": 0.7095,
"num_input_tokens_seen": 3876800,
"step": 10290
},
{
"epoch": 9.224910394265233,
"grad_norm": 0.5212193131446838,
"learning_rate": 9.115387553740473e-07,
"loss": 0.6967,
"num_input_tokens_seen": 3878560,
"step": 10295
},
{
"epoch": 9.229390681003585,
"grad_norm": 0.7846523523330688,
"learning_rate": 9.011068022817065e-07,
"loss": 0.6745,
"num_input_tokens_seen": 3880544,
"step": 10300
},
{
"epoch": 9.233870967741936,
"grad_norm": 0.46724578738212585,
"learning_rate": 8.907337909839275e-07,
"loss": 0.6785,
"num_input_tokens_seen": 3882368,
"step": 10305
},
{
"epoch": 9.238351254480287,
"grad_norm": 0.4712303578853607,
"learning_rate": 8.804197468513436e-07,
"loss": 0.6942,
"num_input_tokens_seen": 3884288,
"step": 10310
},
{
"epoch": 9.242831541218639,
"grad_norm": 0.5561097860336304,
"learning_rate": 8.701646951103425e-07,
"loss": 0.6814,
"num_input_tokens_seen": 3886176,
"step": 10315
},
{
"epoch": 9.24731182795699,
"grad_norm": 0.6059445738792419,
"learning_rate": 8.599686608430413e-07,
"loss": 0.6716,
"num_input_tokens_seen": 3888192,
"step": 10320
},
{
"epoch": 9.251792114695341,
"grad_norm": 0.4931063652038574,
"learning_rate": 8.498316689872055e-07,
"loss": 0.6829,
"num_input_tokens_seen": 3890048,
"step": 10325
},
{
"epoch": 9.256272401433693,
"grad_norm": 0.5145807266235352,
"learning_rate": 8.397537443361913e-07,
"loss": 0.7006,
"num_input_tokens_seen": 3891968,
"step": 10330
},
{
"epoch": 9.260752688172044,
"grad_norm": 0.3241588771343231,
"learning_rate": 8.297349115388903e-07,
"loss": 0.6973,
"num_input_tokens_seen": 3893696,
"step": 10335
},
{
"epoch": 9.265232974910393,
"grad_norm": 0.4836924076080322,
"learning_rate": 8.197751950996619e-07,
"loss": 0.7073,
"num_input_tokens_seen": 3895616,
"step": 10340
},
{
"epoch": 9.269713261648745,
"grad_norm": 0.667323887348175,
"learning_rate": 8.098746193782813e-07,
"loss": 0.6645,
"num_input_tokens_seen": 3897472,
"step": 10345
},
{
"epoch": 9.274193548387096,
"grad_norm": 0.5213760733604431,
"learning_rate": 8.00033208589876e-07,
"loss": 0.6747,
"num_input_tokens_seen": 3899264,
"step": 10350
},
{
"epoch": 9.278673835125447,
"grad_norm": 0.8510650992393494,
"learning_rate": 7.902509868048552e-07,
"loss": 0.6796,
"num_input_tokens_seen": 3901376,
"step": 10355
},
{
"epoch": 9.283154121863799,
"grad_norm": 0.4620399475097656,
"learning_rate": 7.805279779488722e-07,
"loss": 0.6795,
"num_input_tokens_seen": 3903328,
"step": 10360
},
{
"epoch": 9.28763440860215,
"grad_norm": 0.5646343231201172,
"learning_rate": 7.708642058027571e-07,
"loss": 0.6833,
"num_input_tokens_seen": 3905312,
"step": 10365
},
{
"epoch": 9.292114695340501,
"grad_norm": 0.5708273649215698,
"learning_rate": 7.61259694002453e-07,
"loss": 0.6887,
"num_input_tokens_seen": 3907200,
"step": 10370
},
{
"epoch": 9.296594982078853,
"grad_norm": 0.6553865671157837,
"learning_rate": 7.51714466038958e-07,
"loss": 0.6791,
"num_input_tokens_seen": 3909248,
"step": 10375
},
{
"epoch": 9.301075268817204,
"grad_norm": 0.5654320120811462,
"learning_rate": 7.422285452582805e-07,
"loss": 0.7134,
"num_input_tokens_seen": 3911168,
"step": 10380
},
{
"epoch": 9.305555555555555,
"grad_norm": 0.43947505950927734,
"learning_rate": 7.328019548613619e-07,
"loss": 0.7043,
"num_input_tokens_seen": 3912992,
"step": 10385
},
{
"epoch": 9.310035842293907,
"grad_norm": 0.7046077847480774,
"learning_rate": 7.234347179040507e-07,
"loss": 0.6869,
"num_input_tokens_seen": 3914784,
"step": 10390
},
{
"epoch": 9.314516129032258,
"grad_norm": 0.5752326250076294,
"learning_rate": 7.141268572970094e-07,
"loss": 0.6428,
"num_input_tokens_seen": 3916896,
"step": 10395
},
{
"epoch": 9.31899641577061,
"grad_norm": 0.5495961904525757,
"learning_rate": 7.048783958056804e-07,
"loss": 0.6925,
"num_input_tokens_seen": 3918688,
"step": 10400
},
{
"epoch": 9.32347670250896,
"grad_norm": 0.4289925694465637,
"learning_rate": 6.956893560502359e-07,
"loss": 0.706,
"num_input_tokens_seen": 3920512,
"step": 10405
},
{
"epoch": 9.327956989247312,
"grad_norm": 0.6946158409118652,
"learning_rate": 6.865597605054952e-07,
"loss": 0.6793,
"num_input_tokens_seen": 3922304,
"step": 10410
},
{
"epoch": 9.332437275985663,
"grad_norm": 0.4214330017566681,
"learning_rate": 6.774896315008994e-07,
"loss": 0.6784,
"num_input_tokens_seen": 3924384,
"step": 10415
},
{
"epoch": 9.336917562724015,
"grad_norm": 0.6280259490013123,
"learning_rate": 6.68478991220442e-07,
"loss": 0.6726,
"num_input_tokens_seen": 3926368,
"step": 10420
},
{
"epoch": 9.341397849462366,
"grad_norm": 0.5349095463752747,
"learning_rate": 6.595278617026163e-07,
"loss": 0.6903,
"num_input_tokens_seen": 3928288,
"step": 10425
},
{
"epoch": 9.345878136200717,
"grad_norm": 0.5612955689430237,
"learning_rate": 6.50636264840368e-07,
"loss": 0.7131,
"num_input_tokens_seen": 3930176,
"step": 10430
},
{
"epoch": 9.350358422939069,
"grad_norm": 0.5245122909545898,
"learning_rate": 6.418042223810234e-07,
"loss": 0.6809,
"num_input_tokens_seen": 3932224,
"step": 10435
},
{
"epoch": 9.35483870967742,
"grad_norm": 0.5770167112350464,
"learning_rate": 6.33031755926261e-07,
"loss": 0.6999,
"num_input_tokens_seen": 3934112,
"step": 10440
},
{
"epoch": 9.359318996415771,
"grad_norm": 0.46723732352256775,
"learning_rate": 6.243188869320377e-07,
"loss": 0.6972,
"num_input_tokens_seen": 3936096,
"step": 10445
},
{
"epoch": 9.363799283154123,
"grad_norm": 0.505517840385437,
"learning_rate": 6.156656367085539e-07,
"loss": 0.6911,
"num_input_tokens_seen": 3937952,
"step": 10450
},
{
"epoch": 9.368279569892474,
"grad_norm": 0.7766585350036621,
"learning_rate": 6.070720264201857e-07,
"loss": 0.6759,
"num_input_tokens_seen": 3939872,
"step": 10455
},
{
"epoch": 9.372759856630825,
"grad_norm": 0.6021184325218201,
"learning_rate": 5.985380770854476e-07,
"loss": 0.6598,
"num_input_tokens_seen": 3941920,
"step": 10460
},
{
"epoch": 9.377240143369175,
"grad_norm": 0.5606325268745422,
"learning_rate": 5.900638095769185e-07,
"loss": 0.6604,
"num_input_tokens_seen": 3943712,
"step": 10465
},
{
"epoch": 9.381720430107526,
"grad_norm": 0.4621894657611847,
"learning_rate": 5.816492446212213e-07,
"loss": 0.6957,
"num_input_tokens_seen": 3945440,
"step": 10470
},
{
"epoch": 9.386200716845877,
"grad_norm": 0.681788444519043,
"learning_rate": 5.732944027989518e-07,
"loss": 0.6627,
"num_input_tokens_seen": 3947296,
"step": 10475
},
{
"epoch": 9.390681003584229,
"grad_norm": 0.7300588488578796,
"learning_rate": 5.649993045446305e-07,
"loss": 0.6905,
"num_input_tokens_seen": 3949152,
"step": 10480
},
{
"epoch": 9.39516129032258,
"grad_norm": 0.6331045031547546,
"learning_rate": 5.56763970146662e-07,
"loss": 0.6845,
"num_input_tokens_seen": 3951008,
"step": 10485
},
{
"epoch": 9.399641577060931,
"grad_norm": 0.4790831506252289,
"learning_rate": 5.485884197472646e-07,
"loss": 0.6956,
"num_input_tokens_seen": 3952960,
"step": 10490
},
{
"epoch": 9.404121863799283,
"grad_norm": 0.4457530975341797,
"learning_rate": 5.404726733424514e-07,
"loss": 0.7039,
"num_input_tokens_seen": 3954752,
"step": 10495
},
{
"epoch": 9.408602150537634,
"grad_norm": 0.48740777373313904,
"learning_rate": 5.324167507819555e-07,
"loss": 0.6837,
"num_input_tokens_seen": 3956736,
"step": 10500
},
{
"epoch": 9.413082437275985,
"grad_norm": 0.6508479714393616,
"learning_rate": 5.244206717691908e-07,
"loss": 0.6699,
"num_input_tokens_seen": 3958528,
"step": 10505
},
{
"epoch": 9.417562724014337,
"grad_norm": 0.5810533165931702,
"learning_rate": 5.164844558612131e-07,
"loss": 0.6726,
"num_input_tokens_seen": 3960672,
"step": 10510
},
{
"epoch": 9.422043010752688,
"grad_norm": 0.5137490630149841,
"learning_rate": 5.086081224686512e-07,
"loss": 0.6818,
"num_input_tokens_seen": 3962752,
"step": 10515
},
{
"epoch": 9.42652329749104,
"grad_norm": 0.49306225776672363,
"learning_rate": 5.007916908556814e-07,
"loss": 0.6892,
"num_input_tokens_seen": 3964576,
"step": 10520
},
{
"epoch": 9.43100358422939,
"grad_norm": 0.6570764183998108,
"learning_rate": 4.930351801399641e-07,
"loss": 0.6514,
"num_input_tokens_seen": 3966432,
"step": 10525
},
{
"epoch": 9.435483870967742,
"grad_norm": 0.528634250164032,
"learning_rate": 4.853386092926044e-07,
"loss": 0.681,
"num_input_tokens_seen": 3968256,
"step": 10530
},
{
"epoch": 9.439964157706093,
"grad_norm": 0.5617717504501343,
"learning_rate": 4.77701997138108e-07,
"loss": 0.6681,
"num_input_tokens_seen": 3970048,
"step": 10535
},
{
"epoch": 9.444444444444445,
"grad_norm": 0.46119117736816406,
"learning_rate": 4.701253623543289e-07,
"loss": 0.6593,
"num_input_tokens_seen": 3971840,
"step": 10540
},
{
"epoch": 9.448924731182796,
"grad_norm": 0.33854037523269653,
"learning_rate": 4.626087234724269e-07,
"loss": 0.6559,
"num_input_tokens_seen": 3973760,
"step": 10545
},
{
"epoch": 9.453405017921147,
"grad_norm": 0.6266850233078003,
"learning_rate": 4.5515209887682096e-07,
"loss": 0.6831,
"num_input_tokens_seen": 3975520,
"step": 10550
},
{
"epoch": 9.457885304659499,
"grad_norm": 0.6646607518196106,
"learning_rate": 4.477555068051476e-07,
"loss": 0.6685,
"num_input_tokens_seen": 3977376,
"step": 10555
},
{
"epoch": 9.46236559139785,
"grad_norm": 0.5167231559753418,
"learning_rate": 4.40418965348216e-07,
"loss": 0.662,
"num_input_tokens_seen": 3979232,
"step": 10560
},
{
"epoch": 9.466845878136201,
"grad_norm": 0.46271848678588867,
"learning_rate": 4.3314249244995884e-07,
"loss": 0.6683,
"num_input_tokens_seen": 3981024,
"step": 10565
},
{
"epoch": 9.471326164874553,
"grad_norm": 0.272670179605484,
"learning_rate": 4.259261059073871e-07,
"loss": 0.7004,
"num_input_tokens_seen": 3982816,
"step": 10570
},
{
"epoch": 9.475806451612904,
"grad_norm": 0.4650828242301941,
"learning_rate": 4.1876982337055725e-07,
"loss": 0.6686,
"num_input_tokens_seen": 3984736,
"step": 10575
},
{
"epoch": 9.480286738351255,
"grad_norm": 0.686195433139801,
"learning_rate": 4.1167366234251824e-07,
"loss": 0.6862,
"num_input_tokens_seen": 3986816,
"step": 10580
},
{
"epoch": 9.484767025089607,
"grad_norm": 0.594731330871582,
"learning_rate": 4.0463764017927565e-07,
"loss": 0.6631,
"num_input_tokens_seen": 3988640,
"step": 10585
},
{
"epoch": 9.489247311827956,
"grad_norm": 0.5093191266059875,
"learning_rate": 3.976617740897415e-07,
"loss": 0.7095,
"num_input_tokens_seen": 3990560,
"step": 10590
},
{
"epoch": 9.493727598566307,
"grad_norm": 0.7626033425331116,
"learning_rate": 3.907460811356956e-07,
"loss": 0.7275,
"num_input_tokens_seen": 3992416,
"step": 10595
},
{
"epoch": 9.498207885304659,
"grad_norm": 0.7179610133171082,
"learning_rate": 3.8389057823175754e-07,
"loss": 0.6966,
"num_input_tokens_seen": 3994240,
"step": 10600
},
{
"epoch": 9.5,
"eval_loss": 0.6984838843345642,
"eval_runtime": 5.6358,
"eval_samples_per_second": 88.009,
"eval_steps_per_second": 22.002,
"num_input_tokens_seen": 3994976,
"step": 10602
},
{
"epoch": 9.50268817204301,
"grad_norm": 0.5522060990333557,
"learning_rate": 3.7709528214530664e-07,
"loss": 0.7008,
"num_input_tokens_seen": 3996192,
"step": 10605
},
{
"epoch": 9.507168458781361,
"grad_norm": 0.5137274861335754,
"learning_rate": 3.7036020949648974e-07,
"loss": 0.6976,
"num_input_tokens_seen": 3998144,
"step": 10610
},
{
"epoch": 9.511648745519713,
"grad_norm": 0.5644457936286926,
"learning_rate": 3.636853767581494e-07,
"loss": 0.697,
"num_input_tokens_seen": 4000160,
"step": 10615
},
{
"epoch": 9.516129032258064,
"grad_norm": 0.6362537741661072,
"learning_rate": 3.5707080025579045e-07,
"loss": 0.7134,
"num_input_tokens_seen": 4001888,
"step": 10620
},
{
"epoch": 9.520609318996415,
"grad_norm": 0.45133867859840393,
"learning_rate": 3.5051649616754114e-07,
"loss": 0.6981,
"num_input_tokens_seen": 4003680,
"step": 10625
},
{
"epoch": 9.525089605734767,
"grad_norm": 0.7299519181251526,
"learning_rate": 3.440224805241171e-07,
"loss": 0.6643,
"num_input_tokens_seen": 4005632,
"step": 10630
},
{
"epoch": 9.529569892473118,
"grad_norm": 0.5302301645278931,
"learning_rate": 3.3758876920877147e-07,
"loss": 0.7006,
"num_input_tokens_seen": 4007296,
"step": 10635
},
{
"epoch": 9.53405017921147,
"grad_norm": 0.5000677704811096,
"learning_rate": 3.312153779572724e-07,
"loss": 0.6945,
"num_input_tokens_seen": 4009120,
"step": 10640
},
{
"epoch": 9.53853046594982,
"grad_norm": 0.5188679099082947,
"learning_rate": 3.249023223578479e-07,
"loss": 0.6937,
"num_input_tokens_seen": 4010944,
"step": 10645
},
{
"epoch": 9.543010752688172,
"grad_norm": 0.48496314883232117,
"learning_rate": 3.1864961785116054e-07,
"loss": 0.696,
"num_input_tokens_seen": 4012832,
"step": 10650
},
{
"epoch": 9.547491039426523,
"grad_norm": 0.5330994129180908,
"learning_rate": 3.124572797302661e-07,
"loss": 0.7141,
"num_input_tokens_seen": 4014688,
"step": 10655
},
{
"epoch": 9.551971326164875,
"grad_norm": 0.5689115524291992,
"learning_rate": 3.063253231405605e-07,
"loss": 0.709,
"num_input_tokens_seen": 4016512,
"step": 10660
},
{
"epoch": 9.556451612903226,
"grad_norm": 0.4524476230144501,
"learning_rate": 3.002537630797747e-07,
"loss": 0.655,
"num_input_tokens_seen": 4018368,
"step": 10665
},
{
"epoch": 9.560931899641577,
"grad_norm": 0.6236125230789185,
"learning_rate": 2.9424261439791323e-07,
"loss": 0.6903,
"num_input_tokens_seen": 4020096,
"step": 10670
},
{
"epoch": 9.565412186379929,
"grad_norm": 0.4066424071788788,
"learning_rate": 2.8829189179721547e-07,
"loss": 0.6701,
"num_input_tokens_seen": 4022048,
"step": 10675
},
{
"epoch": 9.56989247311828,
"grad_norm": 0.8376783132553101,
"learning_rate": 2.824016098321447e-07,
"loss": 0.6913,
"num_input_tokens_seen": 4023936,
"step": 10680
},
{
"epoch": 9.574372759856631,
"grad_norm": 0.7089189887046814,
"learning_rate": 2.7657178290932396e-07,
"loss": 0.6963,
"num_input_tokens_seen": 4025984,
"step": 10685
},
{
"epoch": 9.578853046594983,
"grad_norm": 0.6390495896339417,
"learning_rate": 2.7080242528751964e-07,
"loss": 0.6971,
"num_input_tokens_seen": 4027808,
"step": 10690
},
{
"epoch": 9.583333333333334,
"grad_norm": 0.5339614152908325,
"learning_rate": 2.650935510776026e-07,
"loss": 0.7024,
"num_input_tokens_seen": 4029632,
"step": 10695
},
{
"epoch": 9.587813620071685,
"grad_norm": 0.47523030638694763,
"learning_rate": 2.594451742425036e-07,
"loss": 0.6794,
"num_input_tokens_seen": 4031520,
"step": 10700
},
{
"epoch": 9.592293906810037,
"grad_norm": 0.49205970764160156,
"learning_rate": 2.538573085971968e-07,
"loss": 0.6579,
"num_input_tokens_seen": 4033568,
"step": 10705
},
{
"epoch": 9.596774193548388,
"grad_norm": 0.6642840504646301,
"learning_rate": 2.4832996780864704e-07,
"loss": 0.6687,
"num_input_tokens_seen": 4035424,
"step": 10710
},
{
"epoch": 9.601254480286737,
"grad_norm": 0.5492425560951233,
"learning_rate": 2.42863165395793e-07,
"loss": 0.6616,
"num_input_tokens_seen": 4037376,
"step": 10715
},
{
"epoch": 9.60573476702509,
"grad_norm": 0.6824979186058044,
"learning_rate": 2.3745691472950026e-07,
"loss": 0.7162,
"num_input_tokens_seen": 4039264,
"step": 10720
},
{
"epoch": 9.61021505376344,
"grad_norm": 0.6007011532783508,
"learning_rate": 2.3211122903254167e-07,
"loss": 0.6801,
"num_input_tokens_seen": 4040992,
"step": 10725
},
{
"epoch": 9.614695340501791,
"grad_norm": 0.32746192812919617,
"learning_rate": 2.2682612137955307e-07,
"loss": 0.6677,
"num_input_tokens_seen": 4042848,
"step": 10730
},
{
"epoch": 9.619175627240143,
"grad_norm": 0.4873063266277313,
"learning_rate": 2.2160160469701097e-07,
"loss": 0.6752,
"num_input_tokens_seen": 4044608,
"step": 10735
},
{
"epoch": 9.623655913978494,
"grad_norm": 0.5440759062767029,
"learning_rate": 2.1643769176319385e-07,
"loss": 0.672,
"num_input_tokens_seen": 4046528,
"step": 10740
},
{
"epoch": 9.628136200716845,
"grad_norm": 0.5048784017562866,
"learning_rate": 2.1133439520815423e-07,
"loss": 0.723,
"num_input_tokens_seen": 4048448,
"step": 10745
},
{
"epoch": 9.632616487455197,
"grad_norm": 0.48166608810424805,
"learning_rate": 2.062917275136883e-07,
"loss": 0.6671,
"num_input_tokens_seen": 4050304,
"step": 10750
},
{
"epoch": 9.637096774193548,
"grad_norm": 0.5569744110107422,
"learning_rate": 2.0130970101330527e-07,
"loss": 0.7106,
"num_input_tokens_seen": 4052224,
"step": 10755
},
{
"epoch": 9.6415770609319,
"grad_norm": 0.4456222951412201,
"learning_rate": 1.963883278921913e-07,
"loss": 0.692,
"num_input_tokens_seen": 4054208,
"step": 10760
},
{
"epoch": 9.64605734767025,
"grad_norm": 0.5037098526954651,
"learning_rate": 1.9152762018719017e-07,
"loss": 0.6757,
"num_input_tokens_seen": 4056192,
"step": 10765
},
{
"epoch": 9.650537634408602,
"grad_norm": 0.47247257828712463,
"learning_rate": 1.867275897867643e-07,
"loss": 0.681,
"num_input_tokens_seen": 4058208,
"step": 10770
},
{
"epoch": 9.655017921146953,
"grad_norm": 0.5090415477752686,
"learning_rate": 1.819882484309754e-07,
"loss": 0.6892,
"num_input_tokens_seen": 4060096,
"step": 10775
},
{
"epoch": 9.659498207885305,
"grad_norm": 0.5318375825881958,
"learning_rate": 1.773096077114428e-07,
"loss": 0.7078,
"num_input_tokens_seen": 4062016,
"step": 10780
},
{
"epoch": 9.663978494623656,
"grad_norm": 0.5755466222763062,
"learning_rate": 1.7269167907132954e-07,
"loss": 0.6475,
"num_input_tokens_seen": 4063808,
"step": 10785
},
{
"epoch": 9.668458781362007,
"grad_norm": 0.3897082209587097,
"learning_rate": 1.681344738053009e-07,
"loss": 0.7167,
"num_input_tokens_seen": 4065600,
"step": 10790
},
{
"epoch": 9.672939068100359,
"grad_norm": 0.5163991451263428,
"learning_rate": 1.636380030595075e-07,
"loss": 0.6564,
"num_input_tokens_seen": 4067488,
"step": 10795
},
{
"epoch": 9.67741935483871,
"grad_norm": 0.3706883192062378,
"learning_rate": 1.5920227783155217e-07,
"loss": 0.703,
"num_input_tokens_seen": 4069312,
"step": 10800
},
{
"epoch": 9.681899641577061,
"grad_norm": 0.554594874382019,
"learning_rate": 1.5482730897046216e-07,
"loss": 0.6392,
"num_input_tokens_seen": 4071104,
"step": 10805
},
{
"epoch": 9.686379928315413,
"grad_norm": 0.631853461265564,
"learning_rate": 1.5051310717666967e-07,
"loss": 0.689,
"num_input_tokens_seen": 4073184,
"step": 10810
},
{
"epoch": 9.690860215053764,
"grad_norm": 0.6092721819877625,
"learning_rate": 1.4625968300197857e-07,
"loss": 0.7235,
"num_input_tokens_seen": 4075072,
"step": 10815
},
{
"epoch": 9.695340501792115,
"grad_norm": 0.598450779914856,
"learning_rate": 1.4206704684953943e-07,
"loss": 0.6993,
"num_input_tokens_seen": 4077024,
"step": 10820
},
{
"epoch": 9.699820788530467,
"grad_norm": 0.8374021053314209,
"learning_rate": 1.3793520897383006e-07,
"loss": 0.6696,
"num_input_tokens_seen": 4078944,
"step": 10825
},
{
"epoch": 9.704301075268818,
"grad_norm": 0.6128981709480286,
"learning_rate": 1.3386417948061947e-07,
"loss": 0.6869,
"num_input_tokens_seen": 4080704,
"step": 10830
},
{
"epoch": 9.70878136200717,
"grad_norm": 0.6915923953056335,
"learning_rate": 1.2985396832695674e-07,
"loss": 0.6667,
"num_input_tokens_seen": 4082432,
"step": 10835
},
{
"epoch": 9.713261648745519,
"grad_norm": 0.43707898259162903,
"learning_rate": 1.259045853211349e-07,
"loss": 0.6825,
"num_input_tokens_seen": 4084320,
"step": 10840
},
{
"epoch": 9.717741935483872,
"grad_norm": 0.5835666060447693,
"learning_rate": 1.2201604012267442e-07,
"loss": 0.6845,
"num_input_tokens_seen": 4086240,
"step": 10845
},
{
"epoch": 9.722222222222221,
"grad_norm": 0.5008912682533264,
"learning_rate": 1.1818834224229525e-07,
"loss": 0.6837,
"num_input_tokens_seen": 4088096,
"step": 10850
},
{
"epoch": 9.726702508960573,
"grad_norm": 0.6731480956077576,
"learning_rate": 1.1442150104189198e-07,
"loss": 0.6958,
"num_input_tokens_seen": 4089888,
"step": 10855
},
{
"epoch": 9.731182795698924,
"grad_norm": 0.5034038424491882,
"learning_rate": 1.1071552573452271e-07,
"loss": 0.6914,
"num_input_tokens_seen": 4091744,
"step": 10860
},
{
"epoch": 9.735663082437275,
"grad_norm": 0.530077338218689,
"learning_rate": 1.0707042538437018e-07,
"loss": 0.6844,
"num_input_tokens_seen": 4093408,
"step": 10865
},
{
"epoch": 9.740143369175627,
"grad_norm": 0.32035142183303833,
"learning_rate": 1.0348620890673067e-07,
"loss": 0.6572,
"num_input_tokens_seen": 4095392,
"step": 10870
},
{
"epoch": 9.744623655913978,
"grad_norm": 0.5076518058776855,
"learning_rate": 9.9962885067989e-08,
"loss": 0.6863,
"num_input_tokens_seen": 4097280,
"step": 10875
},
{
"epoch": 9.74910394265233,
"grad_norm": 0.508080780506134,
"learning_rate": 9.650046248559363e-08,
"loss": 0.6726,
"num_input_tokens_seen": 4099360,
"step": 10880
},
{
"epoch": 9.75358422939068,
"grad_norm": 0.5304430723190308,
"learning_rate": 9.309894962804267e-08,
"loss": 0.6844,
"num_input_tokens_seen": 4101376,
"step": 10885
},
{
"epoch": 9.758064516129032,
"grad_norm": 0.5922977328300476,
"learning_rate": 8.975835481485895e-08,
"loss": 0.7022,
"num_input_tokens_seen": 4103296,
"step": 10890
},
{
"epoch": 9.762544802867383,
"grad_norm": 0.65008145570755,
"learning_rate": 8.647868621656785e-08,
"loss": 0.6829,
"num_input_tokens_seen": 4105248,
"step": 10895
},
{
"epoch": 9.767025089605735,
"grad_norm": 0.6050592064857483,
"learning_rate": 8.325995185468339e-08,
"loss": 0.6834,
"num_input_tokens_seen": 4107072,
"step": 10900
},
{
"epoch": 9.771505376344086,
"grad_norm": 0.3722684979438782,
"learning_rate": 8.010215960168044e-08,
"loss": 0.7032,
"num_input_tokens_seen": 4108768,
"step": 10905
},
{
"epoch": 9.775985663082437,
"grad_norm": 0.5007041692733765,
"learning_rate": 7.700531718098092e-08,
"loss": 0.6709,
"num_input_tokens_seen": 4110624,
"step": 10910
},
{
"epoch": 9.780465949820789,
"grad_norm": 0.5899874567985535,
"learning_rate": 7.396943216693708e-08,
"loss": 0.6837,
"num_input_tokens_seen": 4112352,
"step": 10915
},
{
"epoch": 9.78494623655914,
"grad_norm": 0.5524150133132935,
"learning_rate": 7.099451198480378e-08,
"loss": 0.6927,
"num_input_tokens_seen": 4114144,
"step": 10920
},
{
"epoch": 9.789426523297491,
"grad_norm": 0.5737338662147522,
"learning_rate": 6.808056391073569e-08,
"loss": 0.6877,
"num_input_tokens_seen": 4115872,
"step": 10925
},
{
"epoch": 9.793906810035843,
"grad_norm": 0.5014054179191589,
"learning_rate": 6.522759507175124e-08,
"loss": 0.6803,
"num_input_tokens_seen": 4117984,
"step": 10930
},
{
"epoch": 9.798387096774194,
"grad_norm": 0.5102945566177368,
"learning_rate": 6.243561244572427e-08,
"loss": 0.6793,
"num_input_tokens_seen": 4119968,
"step": 10935
},
{
"epoch": 9.802867383512545,
"grad_norm": 0.7418368458747864,
"learning_rate": 5.970462286137291e-08,
"loss": 0.705,
"num_input_tokens_seen": 4122048,
"step": 10940
},
{
"epoch": 9.807347670250897,
"grad_norm": 0.728233814239502,
"learning_rate": 5.7034632998231865e-08,
"loss": 0.6996,
"num_input_tokens_seen": 4124032,
"step": 10945
},
{
"epoch": 9.811827956989248,
"grad_norm": 0.4803222417831421,
"learning_rate": 5.4425649386644075e-08,
"loss": 0.6633,
"num_input_tokens_seen": 4125984,
"step": 10950
},
{
"epoch": 9.8163082437276,
"grad_norm": 0.4849235415458679,
"learning_rate": 5.187767840773849e-08,
"loss": 0.6833,
"num_input_tokens_seen": 4128032,
"step": 10955
},
{
"epoch": 9.82078853046595,
"grad_norm": 0.7425673604011536,
"learning_rate": 4.939072629341901e-08,
"loss": 0.6912,
"num_input_tokens_seen": 4129792,
"step": 10960
},
{
"epoch": 9.825268817204302,
"grad_norm": 0.5853078365325928,
"learning_rate": 4.696479912634499e-08,
"loss": 0.708,
"num_input_tokens_seen": 4131808,
"step": 10965
},
{
"epoch": 9.829749103942653,
"grad_norm": 0.6256317496299744,
"learning_rate": 4.459990283992577e-08,
"loss": 0.6856,
"num_input_tokens_seen": 4133696,
"step": 10970
},
{
"epoch": 9.834229390681003,
"grad_norm": 0.5935572981834412,
"learning_rate": 4.229604321829561e-08,
"loss": 0.6775,
"num_input_tokens_seen": 4135616,
"step": 10975
},
{
"epoch": 9.838709677419354,
"grad_norm": 0.6854491233825684,
"learning_rate": 4.0053225896299894e-08,
"loss": 0.6769,
"num_input_tokens_seen": 4137472,
"step": 10980
},
{
"epoch": 9.843189964157705,
"grad_norm": 0.5285246968269348,
"learning_rate": 3.787145635948952e-08,
"loss": 0.7279,
"num_input_tokens_seen": 4139328,
"step": 10985
},
{
"epoch": 9.847670250896057,
"grad_norm": 0.5252229571342468,
"learning_rate": 3.575073994410427e-08,
"loss": 0.6765,
"num_input_tokens_seen": 4141216,
"step": 10990
},
{
"epoch": 9.852150537634408,
"grad_norm": 0.31370288133621216,
"learning_rate": 3.369108183705339e-08,
"loss": 0.697,
"num_input_tokens_seen": 4142976,
"step": 10995
},
{
"epoch": 9.85663082437276,
"grad_norm": 0.42567935585975647,
"learning_rate": 3.169248707590999e-08,
"loss": 0.7026,
"num_input_tokens_seen": 4144672,
"step": 11000
},
{
"epoch": 9.86111111111111,
"grad_norm": 0.48273128271102905,
"learning_rate": 2.975496054889726e-08,
"loss": 0.706,
"num_input_tokens_seen": 4146496,
"step": 11005
},
{
"epoch": 9.865591397849462,
"grad_norm": 0.7682796716690063,
"learning_rate": 2.7878506994877263e-08,
"loss": 0.7025,
"num_input_tokens_seen": 4148320,
"step": 11010
},
{
"epoch": 9.870071684587813,
"grad_norm": 0.3882335424423218,
"learning_rate": 2.6063131003337126e-08,
"loss": 0.6872,
"num_input_tokens_seen": 4150176,
"step": 11015
},
{
"epoch": 9.874551971326165,
"grad_norm": 0.5765259265899658,
"learning_rate": 2.4308837014372366e-08,
"loss": 0.6708,
"num_input_tokens_seen": 4152000,
"step": 11020
},
{
"epoch": 9.879032258064516,
"grad_norm": 0.5910921692848206,
"learning_rate": 2.2615629318692434e-08,
"loss": 0.697,
"num_input_tokens_seen": 4153984,
"step": 11025
},
{
"epoch": 9.883512544802867,
"grad_norm": 0.7944561839103699,
"learning_rate": 2.0983512057595743e-08,
"loss": 0.6893,
"num_input_tokens_seen": 4155904,
"step": 11030
},
{
"epoch": 9.887992831541219,
"grad_norm": 0.7031881213188171,
"learning_rate": 1.941248922296135e-08,
"loss": 0.6565,
"num_input_tokens_seen": 4157760,
"step": 11035
},
{
"epoch": 9.89247311827957,
"grad_norm": 0.5116055607795715,
"learning_rate": 1.7902564657246158e-08,
"loss": 0.7064,
"num_input_tokens_seen": 4159584,
"step": 11040
},
{
"epoch": 9.896953405017921,
"grad_norm": 0.5304086208343506,
"learning_rate": 1.6453742053465504e-08,
"loss": 0.706,
"num_input_tokens_seen": 4161472,
"step": 11045
},
{
"epoch": 9.901433691756273,
"grad_norm": 0.8019806146621704,
"learning_rate": 1.506602495519316e-08,
"loss": 0.7026,
"num_input_tokens_seen": 4163328,
"step": 11050
},
{
"epoch": 9.905913978494624,
"grad_norm": 0.6591276526451111,
"learning_rate": 1.3739416756555768e-08,
"loss": 0.6832,
"num_input_tokens_seen": 4165376,
"step": 11055
},
{
"epoch": 9.910394265232975,
"grad_norm": 0.39249947667121887,
"learning_rate": 1.2473920702202325e-08,
"loss": 0.6374,
"num_input_tokens_seen": 4167168,
"step": 11060
},
{
"epoch": 9.914874551971327,
"grad_norm": 0.534702479839325,
"learning_rate": 1.126953988732915e-08,
"loss": 0.7048,
"num_input_tokens_seen": 4169056,
"step": 11065
},
{
"epoch": 9.919354838709678,
"grad_norm": 0.4636304974555969,
"learning_rate": 1.0126277257641037e-08,
"loss": 0.6804,
"num_input_tokens_seen": 4170976,
"step": 11070
},
{
"epoch": 9.92383512544803,
"grad_norm": 0.5376958250999451,
"learning_rate": 9.044135609365124e-09,
"loss": 0.7069,
"num_input_tokens_seen": 4172704,
"step": 11075
},
{
"epoch": 9.92831541218638,
"grad_norm": 0.5938841700553894,
"learning_rate": 8.023117589237017e-09,
"loss": 0.6862,
"num_input_tokens_seen": 4174688,
"step": 11080
},
{
"epoch": 9.932795698924732,
"grad_norm": 0.6230871677398682,
"learning_rate": 7.06322569449247e-09,
"loss": 0.6972,
"num_input_tokens_seen": 4176480,
"step": 11085
},
{
"epoch": 9.937275985663083,
"grad_norm": 0.5930041074752808,
"learning_rate": 6.164462272864602e-09,
"loss": 0.6864,
"num_input_tokens_seen": 4178432,
"step": 11090
},
{
"epoch": 9.941756272401435,
"grad_norm": 0.7413445115089417,
"learning_rate": 5.326829522578347e-09,
"loss": 0.7061,
"num_input_tokens_seen": 4180256,
"step": 11095
},
{
"epoch": 9.946236559139784,
"grad_norm": 0.5409526824951172,
"learning_rate": 4.5503294923338044e-09,
"loss": 0.6698,
"num_input_tokens_seen": 4182144,
"step": 11100
},
{
"epoch": 9.950716845878135,
"grad_norm": 0.4235214591026306,
"learning_rate": 3.834964081325665e-09,
"loss": 0.6779,
"num_input_tokens_seen": 4184064,
"step": 11105
},
{
"epoch": 9.955197132616487,
"grad_norm": 0.6128969192504883,
"learning_rate": 3.1807350392099033e-09,
"loss": 0.6741,
"num_input_tokens_seen": 4185888,
"step": 11110
},
{
"epoch": 9.959677419354838,
"grad_norm": 0.7141082286834717,
"learning_rate": 2.58764396612321e-09,
"loss": 0.7148,
"num_input_tokens_seen": 4187808,
"step": 11115
},
{
"epoch": 9.96415770609319,
"grad_norm": 0.5248094201087952,
"learning_rate": 2.0556923126663353e-09,
"loss": 0.7143,
"num_input_tokens_seen": 4189440,
"step": 11120
},
{
"epoch": 9.96863799283154,
"grad_norm": 0.566316545009613,
"learning_rate": 1.5848813798985396e-09,
"loss": 0.7122,
"num_input_tokens_seen": 4191328,
"step": 11125
},
{
"epoch": 9.973118279569892,
"grad_norm": 0.5793007612228394,
"learning_rate": 1.1752123193459197e-09,
"loss": 0.7101,
"num_input_tokens_seen": 4193120,
"step": 11130
},
{
"epoch": 9.977598566308243,
"grad_norm": 0.45351698994636536,
"learning_rate": 8.266861329903064e-10,
"loss": 0.6819,
"num_input_tokens_seen": 4195200,
"step": 11135
},
{
"epoch": 9.982078853046595,
"grad_norm": 0.31627628207206726,
"learning_rate": 5.393036732637136e-10,
"loss": 0.7049,
"num_input_tokens_seen": 4197024,
"step": 11140
},
{
"epoch": 9.986559139784946,
"grad_norm": 0.8334797024726868,
"learning_rate": 3.130656430594403e-10,
"loss": 0.7084,
"num_input_tokens_seen": 4198976,
"step": 11145
},
{
"epoch": 9.991039426523297,
"grad_norm": 0.5872827768325806,
"learning_rate": 1.4797259571541767e-10,
"loss": 0.717,
"num_input_tokens_seen": 4200832,
"step": 11150
},
{
"epoch": 9.995519713261649,
"grad_norm": 0.6502652764320374,
"learning_rate": 4.402493501975968e-11,
"loss": 0.6907,
"num_input_tokens_seen": 4202656,
"step": 11155
},
{
"epoch": 10.0,
"grad_norm": 1.1099315881729126,
"learning_rate": 1.2229152107634533e-12,
"loss": 0.6937,
"num_input_tokens_seen": 4204168,
"step": 11160
},
{
"epoch": 10.0,
"eval_loss": 0.699965238571167,
"eval_runtime": 5.6351,
"eval_samples_per_second": 88.02,
"eval_steps_per_second": 22.005,
"num_input_tokens_seen": 4204168,
"step": 11160
},
{
"epoch": 10.0,
"num_input_tokens_seen": 4204168,
"step": 11160,
"total_flos": 1.8931178489059738e+17,
"train_loss": 1.13652567756646,
"train_runtime": 1257.9718,
"train_samples_per_second": 35.462,
"train_steps_per_second": 8.871
}
],
"logging_steps": 5,
"max_steps": 11160,
"num_input_tokens_seen": 4204168,
"num_train_epochs": 10,
"save_steps": 558,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.8931178489059738e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}