{
"best_global_step": 2034,
"best_metric": 0.018208853900432587,
"best_model_checkpoint": "saves_stability/prefix-tuning/llama-3-8b-instruct/train_cb_1757340215/checkpoint-2034",
"epoch": 20.0,
"eval_steps": 113,
"global_step": 2260,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04424778761061947,
"grad_norm": 176.08645629882812,
"learning_rate": 8.849557522123894e-07,
"loss": 9.371,
"num_input_tokens_seen": 1520,
"step": 5
},
{
"epoch": 0.08849557522123894,
"grad_norm": 182.31874084472656,
"learning_rate": 1.991150442477876e-06,
"loss": 8.6136,
"num_input_tokens_seen": 2976,
"step": 10
},
{
"epoch": 0.13274336283185842,
"grad_norm": 191.4988555908203,
"learning_rate": 3.097345132743363e-06,
"loss": 7.307,
"num_input_tokens_seen": 4688,
"step": 15
},
{
"epoch": 0.17699115044247787,
"grad_norm": 121.74630737304688,
"learning_rate": 4.2035398230088504e-06,
"loss": 5.3146,
"num_input_tokens_seen": 5776,
"step": 20
},
{
"epoch": 0.22123893805309736,
"grad_norm": 85.33639526367188,
"learning_rate": 5.3097345132743365e-06,
"loss": 3.4853,
"num_input_tokens_seen": 6992,
"step": 25
},
{
"epoch": 0.26548672566371684,
"grad_norm": 104.62677001953125,
"learning_rate": 6.415929203539823e-06,
"loss": 2.6932,
"num_input_tokens_seen": 8320,
"step": 30
},
{
"epoch": 0.30973451327433627,
"grad_norm": 76.95153045654297,
"learning_rate": 7.52212389380531e-06,
"loss": 1.4263,
"num_input_tokens_seen": 9536,
"step": 35
},
{
"epoch": 0.35398230088495575,
"grad_norm": 91.25277709960938,
"learning_rate": 8.628318584070797e-06,
"loss": 0.6123,
"num_input_tokens_seen": 11072,
"step": 40
},
{
"epoch": 0.39823008849557523,
"grad_norm": 65.90292358398438,
"learning_rate": 9.734513274336284e-06,
"loss": 0.6019,
"num_input_tokens_seen": 12320,
"step": 45
},
{
"epoch": 0.4424778761061947,
"grad_norm": 67.72266387939453,
"learning_rate": 1.0840707964601771e-05,
"loss": 0.5444,
"num_input_tokens_seen": 13616,
"step": 50
},
{
"epoch": 0.48672566371681414,
"grad_norm": 75.33143615722656,
"learning_rate": 1.1946902654867258e-05,
"loss": 0.5862,
"num_input_tokens_seen": 15104,
"step": 55
},
{
"epoch": 0.5309734513274337,
"grad_norm": 29.605249404907227,
"learning_rate": 1.3053097345132745e-05,
"loss": 0.3168,
"num_input_tokens_seen": 16352,
"step": 60
},
{
"epoch": 0.5752212389380531,
"grad_norm": 8.359716415405273,
"learning_rate": 1.415929203539823e-05,
"loss": 0.1929,
"num_input_tokens_seen": 17888,
"step": 65
},
{
"epoch": 0.6194690265486725,
"grad_norm": 149.0347442626953,
"learning_rate": 1.5265486725663717e-05,
"loss": 0.7495,
"num_input_tokens_seen": 19008,
"step": 70
},
{
"epoch": 0.6637168141592921,
"grad_norm": 10.16899299621582,
"learning_rate": 1.6371681415929206e-05,
"loss": 0.2649,
"num_input_tokens_seen": 20480,
"step": 75
},
{
"epoch": 0.7079646017699115,
"grad_norm": 26.192371368408203,
"learning_rate": 1.747787610619469e-05,
"loss": 0.4134,
"num_input_tokens_seen": 22000,
"step": 80
},
{
"epoch": 0.7522123893805309,
"grad_norm": 6.6128435134887695,
"learning_rate": 1.858407079646018e-05,
"loss": 0.2021,
"num_input_tokens_seen": 23456,
"step": 85
},
{
"epoch": 0.7964601769911505,
"grad_norm": 31.164657592773438,
"learning_rate": 1.9690265486725665e-05,
"loss": 0.6914,
"num_input_tokens_seen": 25280,
"step": 90
},
{
"epoch": 0.8407079646017699,
"grad_norm": 27.877477645874023,
"learning_rate": 2.079646017699115e-05,
"loss": 0.17,
"num_input_tokens_seen": 26496,
"step": 95
},
{
"epoch": 0.8849557522123894,
"grad_norm": 20.50126075744629,
"learning_rate": 2.190265486725664e-05,
"loss": 0.1746,
"num_input_tokens_seen": 28016,
"step": 100
},
{
"epoch": 0.9292035398230089,
"grad_norm": 14.361896514892578,
"learning_rate": 2.3008849557522124e-05,
"loss": 0.418,
"num_input_tokens_seen": 29344,
"step": 105
},
{
"epoch": 0.9734513274336283,
"grad_norm": 25.596282958984375,
"learning_rate": 2.411504424778761e-05,
"loss": 0.1978,
"num_input_tokens_seen": 30496,
"step": 110
},
{
"epoch": 1.0,
"eval_loss": 0.46471935510635376,
"eval_runtime": 0.7074,
"eval_samples_per_second": 35.343,
"eval_steps_per_second": 18.378,
"num_input_tokens_seen": 31064,
"step": 113
},
{
"epoch": 1.0176991150442478,
"grad_norm": 35.75157928466797,
"learning_rate": 2.5221238938053098e-05,
"loss": 0.3202,
"num_input_tokens_seen": 31560,
"step": 115
},
{
"epoch": 1.0619469026548674,
"grad_norm": 4.021816730499268,
"learning_rate": 2.6327433628318586e-05,
"loss": 1.3669,
"num_input_tokens_seen": 32744,
"step": 120
},
{
"epoch": 1.1061946902654867,
"grad_norm": 14.25241756439209,
"learning_rate": 2.743362831858407e-05,
"loss": 0.3348,
"num_input_tokens_seen": 34440,
"step": 125
},
{
"epoch": 1.1504424778761062,
"grad_norm": 15.869715690612793,
"learning_rate": 2.853982300884956e-05,
"loss": 0.2621,
"num_input_tokens_seen": 35992,
"step": 130
},
{
"epoch": 1.1946902654867257,
"grad_norm": 9.137112617492676,
"learning_rate": 2.964601769911505e-05,
"loss": 0.4748,
"num_input_tokens_seen": 37480,
"step": 135
},
{
"epoch": 1.238938053097345,
"grad_norm": 15.987156867980957,
"learning_rate": 3.075221238938053e-05,
"loss": 0.1724,
"num_input_tokens_seen": 38776,
"step": 140
},
{
"epoch": 1.2831858407079646,
"grad_norm": 35.1132698059082,
"learning_rate": 3.185840707964602e-05,
"loss": 0.2813,
"num_input_tokens_seen": 40408,
"step": 145
},
{
"epoch": 1.3274336283185841,
"grad_norm": 17.494558334350586,
"learning_rate": 3.296460176991151e-05,
"loss": 0.8969,
"num_input_tokens_seen": 41944,
"step": 150
},
{
"epoch": 1.3716814159292037,
"grad_norm": 92.76953125,
"learning_rate": 3.407079646017699e-05,
"loss": 0.3488,
"num_input_tokens_seen": 43576,
"step": 155
},
{
"epoch": 1.415929203539823,
"grad_norm": 30.726852416992188,
"learning_rate": 3.517699115044248e-05,
"loss": 0.3747,
"num_input_tokens_seen": 44984,
"step": 160
},
{
"epoch": 1.4601769911504425,
"grad_norm": 10.402463912963867,
"learning_rate": 3.628318584070797e-05,
"loss": 0.2144,
"num_input_tokens_seen": 46424,
"step": 165
},
{
"epoch": 1.504424778761062,
"grad_norm": 47.82248306274414,
"learning_rate": 3.7389380530973455e-05,
"loss": 0.3073,
"num_input_tokens_seen": 47800,
"step": 170
},
{
"epoch": 1.5486725663716814,
"grad_norm": 4.886204719543457,
"learning_rate": 3.849557522123894e-05,
"loss": 0.4033,
"num_input_tokens_seen": 48776,
"step": 175
},
{
"epoch": 1.592920353982301,
"grad_norm": 42.30789566040039,
"learning_rate": 3.9601769911504426e-05,
"loss": 0.2292,
"num_input_tokens_seen": 49960,
"step": 180
},
{
"epoch": 1.6371681415929205,
"grad_norm": 0.1658252775669098,
"learning_rate": 4.0707964601769914e-05,
"loss": 0.0061,
"num_input_tokens_seen": 51624,
"step": 185
},
{
"epoch": 1.6814159292035398,
"grad_norm": 10.36694049835205,
"learning_rate": 4.1814159292035396e-05,
"loss": 0.3457,
"num_input_tokens_seen": 53048,
"step": 190
},
{
"epoch": 1.7256637168141593,
"grad_norm": 19.44915008544922,
"learning_rate": 4.2920353982300885e-05,
"loss": 0.569,
"num_input_tokens_seen": 54552,
"step": 195
},
{
"epoch": 1.7699115044247788,
"grad_norm": 13.492650985717773,
"learning_rate": 4.4026548672566373e-05,
"loss": 0.4705,
"num_input_tokens_seen": 55656,
"step": 200
},
{
"epoch": 1.8141592920353982,
"grad_norm": 2.0458106994628906,
"learning_rate": 4.5132743362831855e-05,
"loss": 0.1993,
"num_input_tokens_seen": 56984,
"step": 205
},
{
"epoch": 1.8584070796460177,
"grad_norm": 3.030862808227539,
"learning_rate": 4.6238938053097344e-05,
"loss": 0.3077,
"num_input_tokens_seen": 58200,
"step": 210
},
{
"epoch": 1.9026548672566372,
"grad_norm": 3.6195731163024902,
"learning_rate": 4.734513274336283e-05,
"loss": 0.1787,
"num_input_tokens_seen": 59544,
"step": 215
},
{
"epoch": 1.9469026548672566,
"grad_norm": 24.178558349609375,
"learning_rate": 4.845132743362832e-05,
"loss": 0.6463,
"num_input_tokens_seen": 60968,
"step": 220
},
{
"epoch": 1.991150442477876,
"grad_norm": 12.566280364990234,
"learning_rate": 4.955752212389381e-05,
"loss": 0.3015,
"num_input_tokens_seen": 62216,
"step": 225
},
{
"epoch": 2.0,
"eval_loss": 0.4196261763572693,
"eval_runtime": 0.7085,
"eval_samples_per_second": 35.285,
"eval_steps_per_second": 18.348,
"num_input_tokens_seen": 62304,
"step": 226
},
{
"epoch": 2.0353982300884956,
"grad_norm": 28.30797004699707,
"learning_rate": 4.9999731620342936e-05,
"loss": 0.3508,
"num_input_tokens_seen": 63568,
"step": 230
},
{
"epoch": 2.079646017699115,
"grad_norm": 16.764211654663086,
"learning_rate": 4.9998091543305845e-05,
"loss": 0.2591,
"num_input_tokens_seen": 64800,
"step": 235
},
{
"epoch": 2.1238938053097347,
"grad_norm": 17.578092575073242,
"learning_rate": 4.999496058673635e-05,
"loss": 0.3062,
"num_input_tokens_seen": 66144,
"step": 240
},
{
"epoch": 2.168141592920354,
"grad_norm": 0.1362372487783432,
"learning_rate": 4.999033893736386e-05,
"loss": 0.0411,
"num_input_tokens_seen": 67600,
"step": 245
},
{
"epoch": 2.2123893805309733,
"grad_norm": 14.771645545959473,
"learning_rate": 4.99842268708223e-05,
"loss": 0.4286,
"num_input_tokens_seen": 69056,
"step": 250
},
{
"epoch": 2.256637168141593,
"grad_norm": 4.802849292755127,
"learning_rate": 4.9976624751633725e-05,
"loss": 0.9021,
"num_input_tokens_seen": 70304,
"step": 255
},
{
"epoch": 2.3008849557522124,
"grad_norm": 8.362566947937012,
"learning_rate": 4.996753303318648e-05,
"loss": 0.1102,
"num_input_tokens_seen": 71872,
"step": 260
},
{
"epoch": 2.3451327433628317,
"grad_norm": 1.8170663118362427,
"learning_rate": 4.995695225770825e-05,
"loss": 0.3114,
"num_input_tokens_seen": 73520,
"step": 265
},
{
"epoch": 2.3893805309734515,
"grad_norm": 1.7204346656799316,
"learning_rate": 4.994488305623365e-05,
"loss": 0.2026,
"num_input_tokens_seen": 75184,
"step": 270
},
{
"epoch": 2.433628318584071,
"grad_norm": 5.6229963302612305,
"learning_rate": 4.993132614856666e-05,
"loss": 0.095,
"num_input_tokens_seen": 76592,
"step": 275
},
{
"epoch": 2.47787610619469,
"grad_norm": 7.6734771728515625,
"learning_rate": 4.991628234323765e-05,
"loss": 0.8076,
"num_input_tokens_seen": 77632,
"step": 280
},
{
"epoch": 2.52212389380531,
"grad_norm": 3.230637550354004,
"learning_rate": 4.9899752537455166e-05,
"loss": 0.1888,
"num_input_tokens_seen": 79104,
"step": 285
},
{
"epoch": 2.566371681415929,
"grad_norm": 0.5752978920936584,
"learning_rate": 4.9881737717052436e-05,
"loss": 0.1704,
"num_input_tokens_seen": 80432,
"step": 290
},
{
"epoch": 2.6106194690265485,
"grad_norm": 6.58157205581665,
"learning_rate": 4.9862238956428556e-05,
"loss": 0.7013,
"num_input_tokens_seen": 81744,
"step": 295
},
{
"epoch": 2.6548672566371683,
"grad_norm": 10.219573020935059,
"learning_rate": 4.984125741848441e-05,
"loss": 0.4427,
"num_input_tokens_seen": 83104,
"step": 300
},
{
"epoch": 2.6991150442477876,
"grad_norm": 17.33927345275879,
"learning_rate": 4.981879435455336e-05,
"loss": 0.3118,
"num_input_tokens_seen": 84336,
"step": 305
},
{
"epoch": 2.7433628318584073,
"grad_norm": 2.8333613872528076,
"learning_rate": 4.9794851104326554e-05,
"loss": 0.617,
"num_input_tokens_seen": 85936,
"step": 310
},
{
"epoch": 2.7876106194690267,
"grad_norm": 6.623862266540527,
"learning_rate": 4.976942909577307e-05,
"loss": 0.1483,
"num_input_tokens_seen": 87232,
"step": 315
},
{
"epoch": 2.831858407079646,
"grad_norm": 2.0592002868652344,
"learning_rate": 4.974252984505475e-05,
"loss": 0.1033,
"num_input_tokens_seen": 88336,
"step": 320
},
{
"epoch": 2.8761061946902657,
"grad_norm": 0.33601754903793335,
"learning_rate": 4.971415495643574e-05,
"loss": 0.3888,
"num_input_tokens_seen": 89584,
"step": 325
},
{
"epoch": 2.920353982300885,
"grad_norm": 7.770787239074707,
"learning_rate": 4.968430612218687e-05,
"loss": 0.0721,
"num_input_tokens_seen": 91008,
"step": 330
},
{
"epoch": 2.9646017699115044,
"grad_norm": 1.1051758527755737,
"learning_rate": 4.965298512248466e-05,
"loss": 0.49,
"num_input_tokens_seen": 92288,
"step": 335
},
{
"epoch": 3.0,
"eval_loss": 0.2181825488805771,
"eval_runtime": 0.7048,
"eval_samples_per_second": 35.47,
"eval_steps_per_second": 18.444,
"num_input_tokens_seen": 93232,
"step": 339
},
{
"epoch": 3.0088495575221237,
"grad_norm": 0.15708673000335693,
"learning_rate": 4.962019382530521e-05,
"loss": 0.1333,
"num_input_tokens_seen": 93456,
"step": 340
},
{
"epoch": 3.0530973451327434,
"grad_norm": 0.3463290333747864,
"learning_rate": 4.958593418631275e-05,
"loss": 0.0663,
"num_input_tokens_seen": 94752,
"step": 345
},
{
"epoch": 3.0973451327433628,
"grad_norm": 8.072488784790039,
"learning_rate": 4.955020824874307e-05,
"loss": 0.4001,
"num_input_tokens_seen": 96000,
"step": 350
},
{
"epoch": 3.1415929203539825,
"grad_norm": 3.377310276031494,
"learning_rate": 4.951301814328157e-05,
"loss": 0.0907,
"num_input_tokens_seen": 97568,
"step": 355
},
{
"epoch": 3.185840707964602,
"grad_norm": 16.744794845581055,
"learning_rate": 4.947436608793624e-05,
"loss": 0.1587,
"num_input_tokens_seen": 98896,
"step": 360
},
{
"epoch": 3.230088495575221,
"grad_norm": 0.08063212782144547,
"learning_rate": 4.9434254387905395e-05,
"loss": 0.2449,
"num_input_tokens_seen": 100336,
"step": 365
},
{
"epoch": 3.274336283185841,
"grad_norm": 6.403633117675781,
"learning_rate": 4.9392685435440154e-05,
"loss": 0.231,
"num_input_tokens_seen": 101888,
"step": 370
},
{
"epoch": 3.3185840707964602,
"grad_norm": 6.990411281585693,
"learning_rate": 4.93496617097018e-05,
"loss": 0.3525,
"num_input_tokens_seen": 103184,
"step": 375
},
{
"epoch": 3.3628318584070795,
"grad_norm": 1.9636709690093994,
"learning_rate": 4.930518577661388e-05,
"loss": 0.1136,
"num_input_tokens_seen": 104672,
"step": 380
},
{
"epoch": 3.4070796460176993,
"grad_norm": 0.7355748414993286,
"learning_rate": 4.925926028870923e-05,
"loss": 0.1842,
"num_input_tokens_seen": 105968,
"step": 385
},
{
"epoch": 3.4513274336283186,
"grad_norm": 1.4544111490249634,
"learning_rate": 4.921188798497173e-05,
"loss": 0.0577,
"num_input_tokens_seen": 107472,
"step": 390
},
{
"epoch": 3.495575221238938,
"grad_norm": 0.02327471598982811,
"learning_rate": 4.9163071690672973e-05,
"loss": 0.1441,
"num_input_tokens_seen": 108688,
"step": 395
},
{
"epoch": 3.5398230088495577,
"grad_norm": 5.047955513000488,
"learning_rate": 4.911281431720378e-05,
"loss": 0.3961,
"num_input_tokens_seen": 110320,
"step": 400
},
{
"epoch": 3.584070796460177,
"grad_norm": 0.618489146232605,
"learning_rate": 4.9061118861900537e-05,
"loss": 0.3237,
"num_input_tokens_seen": 111952,
"step": 405
},
{
"epoch": 3.6283185840707963,
"grad_norm": 0.40347716212272644,
"learning_rate": 4.900798840786645e-05,
"loss": 0.1157,
"num_input_tokens_seen": 113152,
"step": 410
},
{
"epoch": 3.672566371681416,
"grad_norm": 10.295228958129883,
"learning_rate": 4.8953426123787674e-05,
"loss": 0.2442,
"num_input_tokens_seen": 114592,
"step": 415
},
{
"epoch": 3.7168141592920354,
"grad_norm": 0.1771656572818756,
"learning_rate": 4.889743526374432e-05,
"loss": 0.6003,
"num_input_tokens_seen": 115936,
"step": 420
},
{
"epoch": 3.7610619469026547,
"grad_norm": 0.21636876463890076,
"learning_rate": 4.884001916701639e-05,
"loss": 0.1804,
"num_input_tokens_seen": 117232,
"step": 425
},
{
"epoch": 3.8053097345132745,
"grad_norm": 6.24064302444458,
"learning_rate": 4.878118125788462e-05,
"loss": 0.1424,
"num_input_tokens_seen": 118448,
"step": 430
},
{
"epoch": 3.849557522123894,
"grad_norm": 5.579165458679199,
"learning_rate": 4.872092504542629e-05,
"loss": 0.8064,
"num_input_tokens_seen": 119760,
"step": 435
},
{
"epoch": 3.893805309734513,
"grad_norm": 1.1224756240844727,
"learning_rate": 4.865925412330586e-05,
"loss": 0.0734,
"num_input_tokens_seen": 121488,
"step": 440
},
{
"epoch": 3.938053097345133,
"grad_norm": 4.9014458656311035,
"learning_rate": 4.859617216956074e-05,
"loss": 0.2712,
"num_input_tokens_seen": 122816,
"step": 445
},
{
"epoch": 3.982300884955752,
"grad_norm": 0.6472075581550598,
"learning_rate": 4.8531682946381874e-05,
"loss": 0.0664,
"num_input_tokens_seen": 124352,
"step": 450
},
{
"epoch": 4.0,
"eval_loss": 0.10101249814033508,
"eval_runtime": 0.7115,
"eval_samples_per_second": 35.137,
"eval_steps_per_second": 18.271,
"num_input_tokens_seen": 124680,
"step": 452
},
{
"epoch": 4.0265486725663715,
"grad_norm": 0.11688811331987381,
"learning_rate": 4.846579029988939e-05,
"loss": 0.5922,
"num_input_tokens_seen": 125432,
"step": 455
},
{
"epoch": 4.070796460176991,
"grad_norm": 0.17728334665298462,
"learning_rate": 4.8398498159903194e-05,
"loss": 0.131,
"num_input_tokens_seen": 126744,
"step": 460
},
{
"epoch": 4.115044247787611,
"grad_norm": 7.516114234924316,
"learning_rate": 4.8329810539708625e-05,
"loss": 0.0699,
"num_input_tokens_seen": 128424,
"step": 465
},
{
"epoch": 4.15929203539823,
"grad_norm": 0.06694227457046509,
"learning_rate": 4.825973153581709e-05,
"loss": 0.0666,
"num_input_tokens_seen": 129640,
"step": 470
},
{
"epoch": 4.20353982300885,
"grad_norm": 5.715642929077148,
"learning_rate": 4.818826532772174e-05,
"loss": 0.138,
"num_input_tokens_seen": 131016,
"step": 475
},
{
"epoch": 4.247787610619469,
"grad_norm": 12.158321380615234,
"learning_rate": 4.8115416177648234e-05,
"loss": 0.1991,
"num_input_tokens_seen": 132552,
"step": 480
},
{
"epoch": 4.292035398230088,
"grad_norm": 0.12277557700872421,
"learning_rate": 4.804118843030049e-05,
"loss": 0.3774,
"num_input_tokens_seen": 133624,
"step": 485
},
{
"epoch": 4.336283185840708,
"grad_norm": 0.031609226018190384,
"learning_rate": 4.796558651260165e-05,
"loss": 0.0041,
"num_input_tokens_seen": 135464,
"step": 490
},
{
"epoch": 4.380530973451328,
"grad_norm": 3.4474875926971436,
"learning_rate": 4.7888614933429955e-05,
"loss": 0.2738,
"num_input_tokens_seen": 136664,
"step": 495
},
{
"epoch": 4.424778761061947,
"grad_norm": 12.919379234313965,
"learning_rate": 4.781027828334994e-05,
"loss": 0.418,
"num_input_tokens_seen": 137752,
"step": 500
},
{
"epoch": 4.469026548672566,
"grad_norm": 0.014108425937592983,
"learning_rate": 4.773058123433857e-05,
"loss": 0.0716,
"num_input_tokens_seen": 139000,
"step": 505
},
{
"epoch": 4.513274336283186,
"grad_norm": 6.61913537979126,
"learning_rate": 4.7649528539506673e-05,
"loss": 0.0242,
"num_input_tokens_seen": 140184,
"step": 510
},
{
"epoch": 4.557522123893805,
"grad_norm": 0.04510444402694702,
"learning_rate": 4.7567125032815394e-05,
"loss": 0.0425,
"num_input_tokens_seen": 141512,
"step": 515
},
{
"epoch": 4.601769911504425,
"grad_norm": 1.0070881843566895,
"learning_rate": 4.7483375628787975e-05,
"loss": 0.1949,
"num_input_tokens_seen": 142888,
"step": 520
},
{
"epoch": 4.646017699115045,
"grad_norm": 6.574953079223633,
"learning_rate": 4.739828532221661e-05,
"loss": 0.128,
"num_input_tokens_seen": 144312,
"step": 525
},
{
"epoch": 4.6902654867256635,
"grad_norm": 14.323014259338379,
"learning_rate": 4.731185918786453e-05,
"loss": 0.2598,
"num_input_tokens_seen": 146008,
"step": 530
},
{
"epoch": 4.734513274336283,
"grad_norm": 8.124874114990234,
"learning_rate": 4.722410238016343e-05,
"loss": 0.3355,
"num_input_tokens_seen": 147448,
"step": 535
},
{
"epoch": 4.778761061946903,
"grad_norm": 0.4788016676902771,
"learning_rate": 4.7135020132905985e-05,
"loss": 0.0034,
"num_input_tokens_seen": 148952,
"step": 540
},
{
"epoch": 4.823008849557522,
"grad_norm": 0.01158350519835949,
"learning_rate": 4.7044617758933714e-05,
"loss": 0.2511,
"num_input_tokens_seen": 150472,
"step": 545
},
{
"epoch": 4.867256637168142,
"grad_norm": 37.700584411621094,
"learning_rate": 4.695290064982018e-05,
"loss": 0.5979,
"num_input_tokens_seen": 151880,
"step": 550
},
{
"epoch": 4.911504424778761,
"grad_norm": 12.200066566467285,
"learning_rate": 4.6859874275549376e-05,
"loss": 0.1413,
"num_input_tokens_seen": 153336,
"step": 555
},
{
"epoch": 4.95575221238938,
"grad_norm": 8.2332763671875,
"learning_rate": 4.676554418418953e-05,
"loss": 0.2624,
"num_input_tokens_seen": 154584,
"step": 560
},
{
"epoch": 5.0,
"grad_norm": 0.3725515902042389,
"learning_rate": 4.66699160015622e-05,
"loss": 0.2766,
"num_input_tokens_seen": 155672,
"step": 565
},
{
"epoch": 5.0,
"eval_loss": 0.10234539210796356,
"eval_runtime": 0.7055,
"eval_samples_per_second": 35.434,
"eval_steps_per_second": 18.426,
"num_input_tokens_seen": 155672,
"step": 565
},
{
"epoch": 5.04424778761062,
"grad_norm": 6.086857795715332,
"learning_rate": 4.6572995430906784e-05,
"loss": 0.161,
"num_input_tokens_seen": 156920,
"step": 570
},
{
"epoch": 5.088495575221239,
"grad_norm": 1.3057224750518799,
"learning_rate": 4.6474788252540323e-05,
"loss": 0.0179,
"num_input_tokens_seen": 158872,
"step": 575
},
{
"epoch": 5.132743362831858,
"grad_norm": 0.07508724927902222,
"learning_rate": 4.637530032351284e-05,
"loss": 0.0159,
"num_input_tokens_seen": 159960,
"step": 580
},
{
"epoch": 5.176991150442478,
"grad_norm": 0.41541609168052673,
"learning_rate": 4.627453757725796e-05,
"loss": 0.084,
"num_input_tokens_seen": 161112,
"step": 585
},
{
"epoch": 5.221238938053097,
"grad_norm": 10.99107551574707,
"learning_rate": 4.617250602323907e-05,
"loss": 0.0434,
"num_input_tokens_seen": 162472,
"step": 590
},
{
"epoch": 5.265486725663717,
"grad_norm": 1.7388826608657837,
"learning_rate": 4.6069211746590926e-05,
"loss": 0.2506,
"num_input_tokens_seen": 164216,
"step": 595
},
{
"epoch": 5.3097345132743365,
"grad_norm": 2.5360372066497803,
"learning_rate": 4.596466090775672e-05,
"loss": 0.0283,
"num_input_tokens_seen": 165832,
"step": 600
},
{
"epoch": 5.353982300884955,
"grad_norm": 0.5227421522140503,
"learning_rate": 4.585885974212068e-05,
"loss": 0.0081,
"num_input_tokens_seen": 167000,
"step": 605
},
{
"epoch": 5.398230088495575,
"grad_norm": 16.39299774169922,
"learning_rate": 4.575181455963619e-05,
"loss": 0.2277,
"num_input_tokens_seen": 168456,
"step": 610
},
{
"epoch": 5.442477876106195,
"grad_norm": 0.012677973136305809,
"learning_rate": 4.5643531744449474e-05,
"loss": 0.0005,
"num_input_tokens_seen": 169624,
"step": 615
},
{
"epoch": 5.486725663716814,
"grad_norm": 0.8441804647445679,
"learning_rate": 4.553401775451882e-05,
"loss": 0.0689,
"num_input_tokens_seen": 170888,
"step": 620
},
{
"epoch": 5.530973451327434,
"grad_norm": 0.009645035490393639,
"learning_rate": 4.542327912122949e-05,
"loss": 0.0622,
"num_input_tokens_seen": 172056,
"step": 625
},
{
"epoch": 5.575221238938053,
"grad_norm": 12.262785911560059,
"learning_rate": 4.531132244900411e-05,
"loss": 0.6379,
"num_input_tokens_seen": 173448,
"step": 630
},
{
"epoch": 5.619469026548672,
"grad_norm": 0.2407040148973465,
"learning_rate": 4.519815441490884e-05,
"loss": 0.0084,
"num_input_tokens_seen": 174872,
"step": 635
},
{
"epoch": 5.663716814159292,
"grad_norm": 1.2417314052581787,
"learning_rate": 4.508378176825516e-05,
"loss": 0.0051,
"num_input_tokens_seen": 176488,
"step": 640
},
{
"epoch": 5.707964601769912,
"grad_norm": 0.4008258283138275,
"learning_rate": 4.496821133019728e-05,
"loss": 0.004,
"num_input_tokens_seen": 177816,
"step": 645
},
{
"epoch": 5.752212389380531,
"grad_norm": 8.208776473999023,
"learning_rate": 4.485144999332541e-05,
"loss": 0.0227,
"num_input_tokens_seen": 179352,
"step": 650
},
{
"epoch": 5.79646017699115,
"grad_norm": 0.11330597847700119,
"learning_rate": 4.4733504721254625e-05,
"loss": 0.0118,
"num_input_tokens_seen": 180504,
"step": 655
},
{
"epoch": 5.84070796460177,
"grad_norm": 13.392186164855957,
"learning_rate": 4.461438254820959e-05,
"loss": 0.3968,
"num_input_tokens_seen": 181976,
"step": 660
},
{
"epoch": 5.88495575221239,
"grad_norm": 0.05890387296676636,
"learning_rate": 4.449409057860504e-05,
"loss": 0.321,
"num_input_tokens_seen": 183000,
"step": 665
},
{
"epoch": 5.929203539823009,
"grad_norm": 0.124122753739357,
"learning_rate": 4.4372635986622044e-05,
"loss": 0.0571,
"num_input_tokens_seen": 184344,
"step": 670
},
{
"epoch": 5.9734513274336285,
"grad_norm": 11.64846134185791,
"learning_rate": 4.425002601578017e-05,
"loss": 0.0905,
"num_input_tokens_seen": 185976,
"step": 675
},
{
"epoch": 6.0,
"eval_loss": 0.10962831228971481,
"eval_runtime": 0.7204,
"eval_samples_per_second": 34.702,
"eval_steps_per_second": 18.045,
"num_input_tokens_seen": 186688,
"step": 678
},
{
"epoch": 6.017699115044247,
"grad_norm": 0.014174363575875759,
"learning_rate": 4.4126267978505486e-05,
"loss": 0.0036,
"num_input_tokens_seen": 187312,
"step": 680
},
{
"epoch": 6.061946902654867,
"grad_norm": 16.07929229736328,
"learning_rate": 4.4001369255694416e-05,
"loss": 0.0637,
"num_input_tokens_seen": 188512,
"step": 685
},
{
"epoch": 6.106194690265487,
"grad_norm": 0.041025493294000626,
"learning_rate": 4.387533729627359e-05,
"loss": 0.0082,
"num_input_tokens_seen": 189616,
"step": 690
},
{
"epoch": 6.150442477876107,
"grad_norm": 0.010337191633880138,
"learning_rate": 4.374817961675553e-05,
"loss": 0.0386,
"num_input_tokens_seen": 191264,
"step": 695
},
{
"epoch": 6.1946902654867255,
"grad_norm": 0.05837007984519005,
"learning_rate": 4.3619903800790465e-05,
"loss": 0.0009,
"num_input_tokens_seen": 192576,
"step": 700
},
{
"epoch": 6.238938053097345,
"grad_norm": 0.5997312664985657,
"learning_rate": 4.3490517498713924e-05,
"loss": 0.0175,
"num_input_tokens_seen": 193920,
"step": 705
},
{
"epoch": 6.283185840707965,
"grad_norm": 0.010935655795037746,
"learning_rate": 4.336002842709057e-05,
"loss": 0.0006,
"num_input_tokens_seen": 195216,
"step": 710
},
{
"epoch": 6.327433628318584,
"grad_norm": 7.356991767883301,
"learning_rate": 4.3228444368253925e-05,
"loss": 0.0111,
"num_input_tokens_seen": 196640,
"step": 715
},
{
"epoch": 6.371681415929204,
"grad_norm": 0.005636111833155155,
"learning_rate": 4.309577316984228e-05,
"loss": 0.008,
"num_input_tokens_seen": 197696,
"step": 720
},
{
"epoch": 6.415929203539823,
"grad_norm": 0.05741603672504425,
"learning_rate": 4.2962022744330616e-05,
"loss": 0.0015,
"num_input_tokens_seen": 199040,
"step": 725
},
{
"epoch": 6.460176991150442,
"grad_norm": 0.011010805144906044,
"learning_rate": 4.282720106855876e-05,
"loss": 0.0005,
"num_input_tokens_seen": 200928,
"step": 730
},
{
"epoch": 6.504424778761062,
"grad_norm": 0.013609147630631924,
"learning_rate": 4.269131618325559e-05,
"loss": 0.0004,
"num_input_tokens_seen": 202784,
"step": 735
},
{
"epoch": 6.548672566371682,
"grad_norm": 16.62497901916504,
"learning_rate": 4.255437619255955e-05,
"loss": 0.0541,
"num_input_tokens_seen": 204080,
"step": 740
},
{
"epoch": 6.592920353982301,
"grad_norm": 4.816155433654785,
"learning_rate": 4.241638926353526e-05,
"loss": 0.0078,
"num_input_tokens_seen": 205296,
"step": 745
},
{
"epoch": 6.6371681415929205,
"grad_norm": 2.4015004634857178,
"learning_rate": 4.2277363625686475e-05,
"loss": 0.0094,
"num_input_tokens_seen": 206768,
"step": 750
},
{
"epoch": 6.68141592920354,
"grad_norm": 0.004668759182095528,
"learning_rate": 4.213730757046528e-05,
"loss": 0.0044,
"num_input_tokens_seen": 208048,
"step": 755
},
{
"epoch": 6.725663716814159,
"grad_norm": 0.017873145639896393,
"learning_rate": 4.199622945077755e-05,
"loss": 0.0683,
"num_input_tokens_seen": 209200,
"step": 760
},
{
"epoch": 6.769911504424779,
"grad_norm": 0.277229368686676,
"learning_rate": 4.185413768048483e-05,
"loss": 0.0029,
"num_input_tokens_seen": 210448,
"step": 765
},
{
"epoch": 6.814159292035399,
"grad_norm": 54.36215591430664,
"learning_rate": 4.1711040733902526e-05,
"loss": 0.8871,
"num_input_tokens_seen": 211984,
"step": 770
},
{
"epoch": 6.8584070796460175,
"grad_norm": 0.006391752976924181,
"learning_rate": 4.1566947145294474e-05,
"loss": 0.0012,
"num_input_tokens_seen": 213328,
"step": 775
},
{
"epoch": 6.902654867256637,
"grad_norm": 17.785545349121094,
"learning_rate": 4.142186550836399e-05,
"loss": 0.202,
"num_input_tokens_seen": 214400,
"step": 780
},
{
"epoch": 6.946902654867257,
"grad_norm": 0.0706261619925499,
"learning_rate": 4.127580447574131e-05,
"loss": 0.0008,
"num_input_tokens_seen": 215888,
"step": 785
},
{
"epoch": 6.991150442477876,
"grad_norm": 0.0091946329921484,
"learning_rate": 4.1128772758467604e-05,
"loss": 0.0007,
"num_input_tokens_seen": 217664,
"step": 790
},
{
"epoch": 7.0,
"eval_loss": 0.2586836516857147,
"eval_runtime": 0.7093,
"eval_samples_per_second": 35.248,
"eval_steps_per_second": 18.329,
"num_input_tokens_seen": 217736,
"step": 791
},
{
"epoch": 7.035398230088496,
"grad_norm": 0.23946355283260345,
"learning_rate": 4.098077912547536e-05,
"loss": 0.1797,
"num_input_tokens_seen": 218840,
"step": 795
},
{
"epoch": 7.079646017699115,
"grad_norm": 0.8961653709411621,
"learning_rate": 4.0831832403065526e-05,
"loss": 0.0118,
"num_input_tokens_seen": 219992,
"step": 800
},
{
"epoch": 7.123893805309734,
"grad_norm": 0.005731819197535515,
"learning_rate": 4.068194147438101e-05,
"loss": 0.0018,
"num_input_tokens_seen": 221160,
"step": 805
},
{
"epoch": 7.168141592920354,
"grad_norm": 0.0060266111977398396,
"learning_rate": 4.0531115278876934e-05,
"loss": 0.0097,
"num_input_tokens_seen": 222696,
"step": 810
},
{
"epoch": 7.212389380530974,
"grad_norm": 0.01002022996544838,
"learning_rate": 4.0379362811787504e-05,
"loss": 0.0003,
"num_input_tokens_seen": 224088,
"step": 815
},
{
"epoch": 7.256637168141593,
"grad_norm": 0.1268666535615921,
"learning_rate": 4.022669312358949e-05,
"loss": 0.1595,
"num_input_tokens_seen": 225432,
"step": 820
},
{
"epoch": 7.300884955752212,
"grad_norm": 0.004778804257512093,
"learning_rate": 4.007311531946252e-05,
"loss": 0.0011,
"num_input_tokens_seen": 226696,
"step": 825
},
{
"epoch": 7.345132743362832,
"grad_norm": 0.0037855140399187803,
"learning_rate": 3.9918638558745966e-05,
"loss": 0.0135,
"num_input_tokens_seen": 228168,
"step": 830
},
{
"epoch": 7.389380530973451,
"grad_norm": 0.01771862432360649,
"learning_rate": 3.976327205439279e-05,
"loss": 0.0004,
"num_input_tokens_seen": 229528,
"step": 835
},
{
"epoch": 7.433628318584071,
"grad_norm": 0.10408202558755875,
"learning_rate": 3.9607025072419986e-05,
"loss": 0.0012,
"num_input_tokens_seen": 231208,
"step": 840
},
{
"epoch": 7.477876106194691,
"grad_norm": 0.0022122100926935673,
"learning_rate": 3.9449906931356005e-05,
"loss": 0.0024,
"num_input_tokens_seen": 232392,
"step": 845
},
{
"epoch": 7.522123893805309,
"grad_norm": 0.0038752174004912376,
"learning_rate": 3.929192700168501e-05,
"loss": 0.0001,
"num_input_tokens_seen": 233480,
"step": 850
},
{
"epoch": 7.566371681415929,
"grad_norm": 0.001984543865546584,
"learning_rate": 3.9133094705287984e-05,
"loss": 0.0003,
"num_input_tokens_seen": 235256,
"step": 855
},
{
"epoch": 7.610619469026549,
"grad_norm": 0.00306964130140841,
"learning_rate": 3.897341951488087e-05,
"loss": 0.0003,
"num_input_tokens_seen": 236744,
"step": 860
},
{
"epoch": 7.654867256637168,
"grad_norm": 0.0018930825171992183,
"learning_rate": 3.8812910953449555e-05,
"loss": 0.1634,
"num_input_tokens_seen": 237880,
"step": 865
},
{
"epoch": 7.699115044247788,
"grad_norm": 0.6990119814872742,
"learning_rate": 3.865157859368196e-05,
"loss": 0.001,
"num_input_tokens_seen": 239544,
"step": 870
},
{
"epoch": 7.743362831858407,
"grad_norm": 0.013935576193034649,
"learning_rate": 3.848943205739711e-05,
"loss": 0.0034,
"num_input_tokens_seen": 241048,
"step": 875
},
{
"epoch": 7.787610619469026,
"grad_norm": 0.00296784657984972,
"learning_rate": 3.832648101497134e-05,
"loss": 0.0007,
"num_input_tokens_seen": 242408,
"step": 880
},
{
"epoch": 7.831858407079646,
"grad_norm": 0.018667805939912796,
"learning_rate": 3.8162735184761476e-05,
"loss": 0.0005,
"num_input_tokens_seen": 243688,
"step": 885
},
{
"epoch": 7.876106194690266,
"grad_norm": 0.0245062168687582,
"learning_rate": 3.799820433252529e-05,
"loss": 0.0002,
"num_input_tokens_seen": 245320,
"step": 890
},
{
"epoch": 7.920353982300885,
"grad_norm": 0.00318564148619771,
"learning_rate": 3.783289827083905e-05,
"loss": 0.0004,
"num_input_tokens_seen": 246584,
"step": 895
},
{
"epoch": 7.964601769911504,
"grad_norm": 0.00905859749764204,
"learning_rate": 3.766682685851234e-05,
"loss": 0.0003,
"num_input_tokens_seen": 248024,
"step": 900
},
{
"epoch": 8.0,
"eval_loss": 0.03978044167160988,
"eval_runtime": 0.7103,
"eval_samples_per_second": 35.199,
"eval_steps_per_second": 18.303,
"num_input_tokens_seen": 248784,
"step": 904
},
{
"epoch": 8.008849557522124,
"grad_norm": 0.010085551999509335,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.0002,
"num_input_tokens_seen": 249024,
"step": 905
},
{
"epoch": 8.053097345132743,
"grad_norm": 0.33434563875198364,
"learning_rate": 3.733242764481154e-05,
"loss": 0.0012,
"num_input_tokens_seen": 250672,
"step": 910
},
{
"epoch": 8.097345132743364,
"grad_norm": 0.0033214823342859745,
"learning_rate": 3.716411978691766e-05,
"loss": 0.0033,
"num_input_tokens_seen": 252368,
"step": 915
},
{
"epoch": 8.141592920353983,
"grad_norm": 0.0026391008868813515,
"learning_rate": 3.699508646415424e-05,
"loss": 0.0001,
"num_input_tokens_seen": 253920,
"step": 920
},
{
"epoch": 8.185840707964601,
"grad_norm": 0.003199717728421092,
"learning_rate": 3.6825337757623696e-05,
"loss": 0.0001,
"num_input_tokens_seen": 255248,
"step": 925
},
{
"epoch": 8.230088495575222,
"grad_norm": 0.002210139762610197,
"learning_rate": 3.665488379109377e-05,
"loss": 0.0001,
"num_input_tokens_seen": 256336,
"step": 930
},
{
"epoch": 8.274336283185841,
"grad_norm": 0.0021523365285247564,
"learning_rate": 3.648373473039368e-05,
"loss": 0.0001,
"num_input_tokens_seen": 257648,
"step": 935
},
{
"epoch": 8.31858407079646,
"grad_norm": 0.0015277840429916978,
"learning_rate": 3.631190078280791e-05,
"loss": 0.0001,
"num_input_tokens_seen": 258752,
"step": 940
},
{
"epoch": 8.36283185840708,
"grad_norm": 0.002127046464011073,
"learning_rate": 3.613939219646739e-05,
"loss": 0.0001,
"num_input_tokens_seen": 260144,
"step": 945
},
{
"epoch": 8.4070796460177,
"grad_norm": 0.003957969136536121,
"learning_rate": 3.596621925973835e-05,
"loss": 0.0001,
"num_input_tokens_seen": 261328,
"step": 950
},
{
"epoch": 8.451327433628318,
"grad_norm": 0.0043671284802258015,
"learning_rate": 3.579239230060867e-05,
"loss": 0.0001,
"num_input_tokens_seen": 262784,
"step": 955
},
{
"epoch": 8.495575221238939,
"grad_norm": 0.0023445875849574804,
"learning_rate": 3.5617921686071995e-05,
"loss": 0.001,
"num_input_tokens_seen": 264448,
"step": 960
},
{
"epoch": 8.539823008849558,
"grad_norm": 0.9188032746315002,
"learning_rate": 3.544281782150936e-05,
"loss": 0.0012,
"num_input_tokens_seen": 265920,
"step": 965
},
{
"epoch": 8.584070796460177,
"grad_norm": 0.0022221386898308992,
"learning_rate": 3.526709115006871e-05,
"loss": 0.0001,
"num_input_tokens_seen": 266992,
"step": 970
},
{
"epoch": 8.628318584070797,
"grad_norm": 0.017859293147921562,
"learning_rate": 3.5090752152041975e-05,
"loss": 0.0003,
"num_input_tokens_seen": 268688,
"step": 975
},
{
"epoch": 8.672566371681416,
"grad_norm": 0.0011308812536299229,
"learning_rate": 3.491381134424012e-05,
"loss": 0.0228,
"num_input_tokens_seen": 270000,
"step": 980
},
{
"epoch": 8.716814159292035,
"grad_norm": 0.002236367203295231,
"learning_rate": 3.4736279279365876e-05,
"loss": 0.0002,
"num_input_tokens_seen": 271056,
"step": 985
},
{
"epoch": 8.761061946902656,
"grad_norm": 0.030419211834669113,
"learning_rate": 3.455816654538438e-05,
"loss": 0.0089,
"num_input_tokens_seen": 272608,
"step": 990
},
{
"epoch": 8.805309734513274,
"grad_norm": 0.0011954925721511245,
"learning_rate": 3.437948376489172e-05,
"loss": 0.0003,
"num_input_tokens_seen": 274000,
"step": 995
},
{
"epoch": 8.849557522123893,
"grad_norm": 0.008035533130168915,
"learning_rate": 3.420024159448142e-05,
"loss": 0.0004,
"num_input_tokens_seen": 275200,
"step": 1000
},
{
"epoch": 8.893805309734514,
"grad_norm": 0.033750034868717194,
"learning_rate": 3.402045072410886e-05,
"loss": 0.0003,
"num_input_tokens_seen": 276352,
"step": 1005
},
{
"epoch": 8.938053097345133,
"grad_norm": 0.0011004299158230424,
"learning_rate": 3.3840121876453734e-05,
"loss": 0.0,
"num_input_tokens_seen": 277952,
"step": 1010
},
{
"epoch": 8.982300884955752,
"grad_norm": 0.0018102293834090233,
"learning_rate": 3.365926580628057e-05,
"loss": 0.0001,
"num_input_tokens_seen": 279328,
"step": 1015
},
{
"epoch": 9.0,
"eval_loss": 0.09988843649625778,
"eval_runtime": 0.7112,
"eval_samples_per_second": 35.152,
"eval_steps_per_second": 18.279,
"num_input_tokens_seen": 279688,
"step": 1017
},
{
"epoch": 9.026548672566372,
"grad_norm": 0.0009696983615867794,
"learning_rate": 3.3477893299797304e-05,
"loss": 0.0,
"num_input_tokens_seen": 280456,
"step": 1020
},
{
"epoch": 9.070796460176991,
"grad_norm": 0.0008515130612067878,
"learning_rate": 3.3296015174011984e-05,
"loss": 0.0002,
"num_input_tokens_seen": 281752,
"step": 1025
},
{
"epoch": 9.11504424778761,
"grad_norm": 0.0012484554899856448,
"learning_rate": 3.311364227608768e-05,
"loss": 0.0001,
"num_input_tokens_seen": 283464,
"step": 1030
},
{
"epoch": 9.15929203539823,
"grad_norm": 0.0006809367914684117,
"learning_rate": 3.293078548269553e-05,
"loss": 0.0004,
"num_input_tokens_seen": 284760,
"step": 1035
},
{
"epoch": 9.20353982300885,
"grad_norm": 0.0011875375639647245,
"learning_rate": 3.2747455699366056e-05,
"loss": 0.0001,
"num_input_tokens_seen": 286104,
"step": 1040
},
{
"epoch": 9.247787610619469,
"grad_norm": 0.0012038415297865868,
"learning_rate": 3.256366385983879e-05,
"loss": 0.0,
"num_input_tokens_seen": 287672,
"step": 1045
},
{
"epoch": 9.29203539823009,
"grad_norm": 0.006046834401786327,
"learning_rate": 3.237942092541018e-05,
"loss": 0.0004,
"num_input_tokens_seen": 288920,
"step": 1050
},
{
"epoch": 9.336283185840708,
"grad_norm": 0.0012190825073048472,
"learning_rate": 3.219473788427984e-05,
"loss": 0.0,
"num_input_tokens_seen": 289960,
"step": 1055
},
{
"epoch": 9.380530973451327,
"grad_norm": 0.0028741054702550173,
"learning_rate": 3.2009625750895224e-05,
"loss": 0.0001,
"num_input_tokens_seen": 291544,
"step": 1060
},
{
"epoch": 9.424778761061948,
"grad_norm": 0.024399342015385628,
"learning_rate": 3.182409556529476e-05,
"loss": 0.0001,
"num_input_tokens_seen": 292808,
"step": 1065
},
{
"epoch": 9.469026548672566,
"grad_norm": 0.0008577514672651887,
"learning_rate": 3.163815839244937e-05,
"loss": 0.0001,
"num_input_tokens_seen": 293912,
"step": 1070
},
{
"epoch": 9.513274336283185,
"grad_norm": 0.0012534806737676263,
"learning_rate": 3.14518253216026e-05,
"loss": 0.0001,
"num_input_tokens_seen": 295416,
"step": 1075
},
{
"epoch": 9.557522123893806,
"grad_norm": 0.0012791682966053486,
"learning_rate": 3.126510746560925e-05,
"loss": 0.0001,
"num_input_tokens_seen": 297016,
"step": 1080
},
{
"epoch": 9.601769911504425,
"grad_norm": 0.0010787008795887232,
"learning_rate": 3.107801596027261e-05,
"loss": 0.0,
"num_input_tokens_seen": 298168,
"step": 1085
},
{
"epoch": 9.646017699115044,
"grad_norm": 0.0010192421032115817,
"learning_rate": 3.0890561963680306e-05,
"loss": 0.0,
"num_input_tokens_seen": 299672,
"step": 1090
},
{
"epoch": 9.690265486725664,
"grad_norm": 0.0038640201091766357,
"learning_rate": 3.0702756655538835e-05,
"loss": 0.0,
"num_input_tokens_seen": 301080,
"step": 1095
},
{
"epoch": 9.734513274336283,
"grad_norm": 0.0006423942977562547,
"learning_rate": 3.051461123650685e-05,
"loss": 0.0001,
"num_input_tokens_seen": 302472,
"step": 1100
},
{
"epoch": 9.778761061946902,
"grad_norm": 0.01918724551796913,
"learning_rate": 3.032613692752711e-05,
"loss": 0.0001,
"num_input_tokens_seen": 303592,
"step": 1105
},
{
"epoch": 9.823008849557523,
"grad_norm": 0.0008666754583828151,
"learning_rate": 3.0137344969157284e-05,
"loss": 0.0001,
"num_input_tokens_seen": 305224,
"step": 1110
},
{
"epoch": 9.867256637168142,
"grad_norm": 0.005792871117591858,
"learning_rate": 2.9948246620899557e-05,
"loss": 0.0,
"num_input_tokens_seen": 306200,
"step": 1115
},
{
"epoch": 9.91150442477876,
"grad_norm": 0.0005233255214989185,
"learning_rate": 2.9758853160529148e-05,
"loss": 0.0,
"num_input_tokens_seen": 307880,
"step": 1120
},
{
"epoch": 9.955752212389381,
"grad_norm": 0.000557672290597111,
"learning_rate": 2.9569175883421672e-05,
"loss": 0.0,
"num_input_tokens_seen": 309240,
"step": 1125
},
{
"epoch": 10.0,
"grad_norm": 0.0006190123967826366,
"learning_rate": 2.93792261018795e-05,
"loss": 0.0,
"num_input_tokens_seen": 310504,
"step": 1130
},
{
"epoch": 10.0,
"eval_loss": 0.02271426096558571,
"eval_runtime": 0.7078,
"eval_samples_per_second": 35.321,
"eval_steps_per_second": 18.367,
"num_input_tokens_seen": 310504,
"step": 1130
},
{
"epoch": 10.044247787610619,
"grad_norm": 0.0008720250916667283,
"learning_rate": 2.9189015144457087e-05,
"loss": 0.0,
"num_input_tokens_seen": 311800,
"step": 1135
},
{
"epoch": 10.08849557522124,
"grad_norm": 0.0008014932973310351,
"learning_rate": 2.8998554355285355e-05,
"loss": 0.0001,
"num_input_tokens_seen": 313064,
"step": 1140
},
{
"epoch": 10.132743362831858,
"grad_norm": 0.0009775793878361583,
"learning_rate": 2.8807855093395126e-05,
"loss": 0.0001,
"num_input_tokens_seen": 314456,
"step": 1145
},
{
"epoch": 10.176991150442477,
"grad_norm": 0.000823335547465831,
"learning_rate": 2.8616928732039684e-05,
"loss": 0.0,
"num_input_tokens_seen": 315928,
"step": 1150
},
{
"epoch": 10.221238938053098,
"grad_norm": 0.0010703738080337644,
"learning_rate": 2.8425786658016423e-05,
"loss": 0.0,
"num_input_tokens_seen": 317528,
"step": 1155
},
{
"epoch": 10.265486725663717,
"grad_norm": 0.0008935982477851212,
"learning_rate": 2.8234440270987837e-05,
"loss": 0.0,
"num_input_tokens_seen": 318952,
"step": 1160
},
{
"epoch": 10.309734513274336,
"grad_norm": 0.0008601442677900195,
"learning_rate": 2.804290098280155e-05,
"loss": 0.0,
"num_input_tokens_seen": 320312,
"step": 1165
},
{
"epoch": 10.353982300884956,
"grad_norm": 0.001831409870646894,
"learning_rate": 2.7851180216809796e-05,
"loss": 0.0001,
"num_input_tokens_seen": 321560,
"step": 1170
},
{
"epoch": 10.398230088495575,
"grad_norm": 0.0011935007059946656,
"learning_rate": 2.765928940718806e-05,
"loss": 0.0,
"num_input_tokens_seen": 322984,
"step": 1175
},
{
"epoch": 10.442477876106194,
"grad_norm": 0.00106055848300457,
"learning_rate": 2.7467239998253214e-05,
"loss": 0.0,
"num_input_tokens_seen": 324120,
"step": 1180
},
{
"epoch": 10.486725663716815,
"grad_norm": 0.0005716433515772223,
"learning_rate": 2.7275043443780934e-05,
"loss": 0.0,
"num_input_tokens_seen": 325480,
"step": 1185
},
{
"epoch": 10.530973451327434,
"grad_norm": 0.0005892731714993715,
"learning_rate": 2.708271120632262e-05,
"loss": 0.0001,
"num_input_tokens_seen": 326680,
"step": 1190
},
{
"epoch": 10.575221238938052,
"grad_norm": 0.0006973208510316908,
"learning_rate": 2.6890254756521778e-05,
"loss": 0.0,
"num_input_tokens_seen": 327832,
"step": 1195
},
{
"epoch": 10.619469026548673,
"grad_norm": 0.006061312276870012,
"learning_rate": 2.6697685572429886e-05,
"loss": 0.0001,
"num_input_tokens_seen": 328968,
"step": 1200
},
{
"epoch": 10.663716814159292,
"grad_norm": 0.0005946594756096601,
"learning_rate": 2.65050151388219e-05,
"loss": 0.0,
"num_input_tokens_seen": 330312,
"step": 1205
},
{
"epoch": 10.70796460176991,
"grad_norm": 0.0018679506611078978,
"learning_rate": 2.6312254946511217e-05,
"loss": 0.0001,
"num_input_tokens_seen": 331704,
"step": 1210
},
{
"epoch": 10.752212389380531,
"grad_norm": 0.0007243392756208777,
"learning_rate": 2.6119416491664472e-05,
"loss": 0.0001,
"num_input_tokens_seen": 333016,
"step": 1215
},
{
"epoch": 10.79646017699115,
"grad_norm": 0.0011644094483926892,
"learning_rate": 2.5926511275115827e-05,
"loss": 0.0,
"num_input_tokens_seen": 334840,
"step": 1220
},
{
"epoch": 10.84070796460177,
"grad_norm": 0.0010138576617464423,
"learning_rate": 2.57335508016811e-05,
"loss": 0.0,
"num_input_tokens_seen": 336280,
"step": 1225
},
{
"epoch": 10.88495575221239,
"grad_norm": 0.0016166451387107372,
"learning_rate": 2.5540546579471624e-05,
"loss": 0.0,
"num_input_tokens_seen": 337720,
"step": 1230
},
{
"epoch": 10.929203539823009,
"grad_norm": 0.01318308338522911,
"learning_rate": 2.5347510119207878e-05,
"loss": 0.0001,
"num_input_tokens_seen": 339032,
"step": 1235
},
{
"epoch": 10.973451327433628,
"grad_norm": 0.0007939252536743879,
"learning_rate": 2.515445293353304e-05,
"loss": 0.0,
"num_input_tokens_seen": 340552,
"step": 1240
},
{
"epoch": 11.0,
"eval_loss": 0.021070513874292374,
"eval_runtime": 0.7097,
"eval_samples_per_second": 35.227,
"eval_steps_per_second": 18.318,
"num_input_tokens_seen": 341152,
"step": 1243
},
{
"epoch": 11.017699115044248,
"grad_norm": 0.0005229383241385221,
"learning_rate": 2.4961386536326307e-05,
"loss": 0.0,
"num_input_tokens_seen": 341728,
"step": 1245
},
{
"epoch": 11.061946902654867,
"grad_norm": 0.0010782463941723108,
"learning_rate": 2.4768322442016278e-05,
"loss": 0.0,
"num_input_tokens_seen": 343360,
"step": 1250
},
{
"epoch": 11.106194690265486,
"grad_norm": 0.0020232030656188726,
"learning_rate": 2.457527216489421e-05,
"loss": 0.0001,
"num_input_tokens_seen": 344912,
"step": 1255
},
{
"epoch": 11.150442477876107,
"grad_norm": 0.0005589700886048377,
"learning_rate": 2.438224721842728e-05,
"loss": 0.0,
"num_input_tokens_seen": 346176,
"step": 1260
},
{
"epoch": 11.194690265486726,
"grad_norm": 0.0008123366278596222,
"learning_rate": 2.4189259114571984e-05,
"loss": 0.0,
"num_input_tokens_seen": 348096,
"step": 1265
},
{
"epoch": 11.238938053097344,
"grad_norm": 0.0008835430489853024,
"learning_rate": 2.39963193630875e-05,
"loss": 0.0,
"num_input_tokens_seen": 349408,
"step": 1270
},
{
"epoch": 11.283185840707965,
"grad_norm": 0.0046989452093839645,
"learning_rate": 2.3803439470849335e-05,
"loss": 0.0001,
"num_input_tokens_seen": 350800,
"step": 1275
},
{
"epoch": 11.327433628318584,
"grad_norm": 0.0011660694144666195,
"learning_rate": 2.361063094116293e-05,
"loss": 0.0,
"num_input_tokens_seen": 352064,
"step": 1280
},
{
"epoch": 11.371681415929203,
"grad_norm": 0.0008970863418653607,
"learning_rate": 2.3417905273077756e-05,
"loss": 0.0,
"num_input_tokens_seen": 353424,
"step": 1285
},
{
"epoch": 11.415929203539823,
"grad_norm": 0.00047139974776655436,
"learning_rate": 2.32252739607014e-05,
"loss": 0.0,
"num_input_tokens_seen": 354704,
"step": 1290
},
{
"epoch": 11.460176991150442,
"grad_norm": 0.0005186158232390881,
"learning_rate": 2.3032748492514116e-05,
"loss": 0.0,
"num_input_tokens_seen": 355920,
"step": 1295
},
{
"epoch": 11.504424778761061,
"grad_norm": 0.0005980245769023895,
"learning_rate": 2.2840340350683622e-05,
"loss": 0.0,
"num_input_tokens_seen": 357168,
"step": 1300
},
{
"epoch": 11.548672566371682,
"grad_norm": 0.004304022993892431,
"learning_rate": 2.2648061010380346e-05,
"loss": 0.0,
"num_input_tokens_seen": 358448,
"step": 1305
},
{
"epoch": 11.5929203539823,
"grad_norm": 0.00046522877528332174,
"learning_rate": 2.2455921939093e-05,
"loss": 0.0,
"num_input_tokens_seen": 359856,
"step": 1310
},
{
"epoch": 11.63716814159292,
"grad_norm": 0.0010373727418482304,
"learning_rate": 2.2263934595944716e-05,
"loss": 0.0,
"num_input_tokens_seen": 361216,
"step": 1315
},
{
"epoch": 11.68141592920354,
"grad_norm": 0.0005117251421324909,
"learning_rate": 2.207211043100958e-05,
"loss": 0.0,
"num_input_tokens_seen": 362656,
"step": 1320
},
{
"epoch": 11.725663716814159,
"grad_norm": 0.0005031274631619453,
"learning_rate": 2.188046088462979e-05,
"loss": 0.0,
"num_input_tokens_seen": 363888,
"step": 1325
},
{
"epoch": 11.769911504424778,
"grad_norm": 0.013536173850297928,
"learning_rate": 2.1688997386733316e-05,
"loss": 0.0,
"num_input_tokens_seen": 365232,
"step": 1330
},
{
"epoch": 11.814159292035399,
"grad_norm": 0.00047480713692493737,
"learning_rate": 2.1497731356152286e-05,
"loss": 0.0,
"num_input_tokens_seen": 366720,
"step": 1335
},
{
"epoch": 11.858407079646017,
"grad_norm": 0.0026002004742622375,
"learning_rate": 2.1306674199941872e-05,
"loss": 0.0,
"num_input_tokens_seen": 367824,
"step": 1340
},
{
"epoch": 11.902654867256636,
"grad_norm": 0.0013884420040994883,
"learning_rate": 2.1115837312700088e-05,
"loss": 0.0,
"num_input_tokens_seen": 369184,
"step": 1345
},
{
"epoch": 11.946902654867257,
"grad_norm": 0.0004059988132212311,
"learning_rate": 2.0925232075888143e-05,
"loss": 0.0,
"num_input_tokens_seen": 370432,
"step": 1350
},
{
"epoch": 11.991150442477876,
"grad_norm": 0.0004867562965955585,
"learning_rate": 2.0734869857151666e-05,
"loss": 0.0,
"num_input_tokens_seen": 371696,
"step": 1355
},
{
"epoch": 12.0,
"eval_loss": 0.019818633794784546,
"eval_runtime": 0.7123,
"eval_samples_per_second": 35.099,
"eval_steps_per_second": 18.252,
"num_input_tokens_seen": 371768,
"step": 1356
},
{
"epoch": 12.035398230088495,
"grad_norm": 0.000515097810421139,
"learning_rate": 2.054476200964278e-05,
"loss": 0.0,
"num_input_tokens_seen": 372952,
"step": 1360
},
{
"epoch": 12.079646017699115,
"grad_norm": 0.002132029039785266,
"learning_rate": 2.035491987134294e-05,
"loss": 0.0,
"num_input_tokens_seen": 374264,
"step": 1365
},
{
"epoch": 12.123893805309734,
"grad_norm": 0.0006297901272773743,
"learning_rate": 2.0165354764386807e-05,
"loss": 0.0,
"num_input_tokens_seen": 375528,
"step": 1370
},
{
"epoch": 12.168141592920353,
"grad_norm": 0.0023943870328366756,
"learning_rate": 1.997607799438694e-05,
"loss": 0.0,
"num_input_tokens_seen": 377016,
"step": 1375
},
{
"epoch": 12.212389380530974,
"grad_norm": 0.0013272215146571398,
"learning_rate": 1.978710084975959e-05,
"loss": 0.0,
"num_input_tokens_seen": 378472,
"step": 1380
},
{
"epoch": 12.256637168141593,
"grad_norm": 0.001000846503302455,
"learning_rate": 1.9598434601051386e-05,
"loss": 0.0,
"num_input_tokens_seen": 379736,
"step": 1385
},
{
"epoch": 12.300884955752213,
"grad_norm": 0.0004203008720651269,
"learning_rate": 1.941009050026726e-05,
"loss": 0.0001,
"num_input_tokens_seen": 381160,
"step": 1390
},
{
"epoch": 12.345132743362832,
"grad_norm": 0.0007252345094457269,
"learning_rate": 1.922207978019928e-05,
"loss": 0.0,
"num_input_tokens_seen": 382232,
"step": 1395
},
{
"epoch": 12.389380530973451,
"grad_norm": 0.0018995037535205483,
"learning_rate": 1.903441365375681e-05,
"loss": 0.0,
"num_input_tokens_seen": 383384,
"step": 1400
},
{
"epoch": 12.43362831858407,
"grad_norm": 0.0005581756122410297,
"learning_rate": 1.884710331329772e-05,
"loss": 0.0,
"num_input_tokens_seen": 385000,
"step": 1405
},
{
"epoch": 12.47787610619469,
"grad_norm": 0.0008921206463128328,
"learning_rate": 1.8660159929960914e-05,
"loss": 0.0,
"num_input_tokens_seen": 386712,
"step": 1410
},
{
"epoch": 12.52212389380531,
"grad_norm": 0.004135303199291229,
"learning_rate": 1.847359465300006e-05,
"loss": 0.0,
"num_input_tokens_seen": 388360,
"step": 1415
},
{
"epoch": 12.56637168141593,
"grad_norm": 0.0005593369132839143,
"learning_rate": 1.828741860911867e-05,
"loss": 0.0,
"num_input_tokens_seen": 389672,
"step": 1420
},
{
"epoch": 12.610619469026549,
"grad_norm": 0.0005592005327343941,
"learning_rate": 1.8101642901806486e-05,
"loss": 0.0,
"num_input_tokens_seen": 391224,
"step": 1425
},
{
"epoch": 12.654867256637168,
"grad_norm": 0.0009430416394025087,
"learning_rate": 1.791627861067731e-05,
"loss": 0.0,
"num_input_tokens_seen": 392616,
"step": 1430
},
{
"epoch": 12.699115044247787,
"grad_norm": 0.005889066495001316,
"learning_rate": 1.7731336790808146e-05,
"loss": 0.0,
"num_input_tokens_seen": 393800,
"step": 1435
},
{
"epoch": 12.743362831858407,
"grad_norm": 0.000711196509655565,
"learning_rate": 1.7546828472079992e-05,
"loss": 0.0,
"num_input_tokens_seen": 395064,
"step": 1440
},
{
"epoch": 12.787610619469026,
"grad_norm": 0.00040520497714169323,
"learning_rate": 1.7362764658519877e-05,
"loss": 0.0,
"num_input_tokens_seen": 396392,
"step": 1445
},
{
"epoch": 12.831858407079647,
"grad_norm": 0.00030975393019616604,
"learning_rate": 1.7179156327644724e-05,
"loss": 0.0,
"num_input_tokens_seen": 397720,
"step": 1450
},
{
"epoch": 12.876106194690266,
"grad_norm": 0.0036038346588611603,
"learning_rate": 1.699601442980655e-05,
"loss": 0.0,
"num_input_tokens_seen": 398968,
"step": 1455
},
{
"epoch": 12.920353982300885,
"grad_norm": 0.0005937079549767077,
"learning_rate": 1.6813349887539443e-05,
"loss": 0.0,
"num_input_tokens_seen": 400696,
"step": 1460
},
{
"epoch": 12.964601769911503,
"grad_norm": 0.00041711816447786987,
"learning_rate": 1.663117359490814e-05,
"loss": 0.0,
"num_input_tokens_seen": 402168,
"step": 1465
},
{
"epoch": 13.0,
"eval_loss": 0.01962355710566044,
"eval_runtime": 0.7175,
"eval_samples_per_second": 34.844,
"eval_steps_per_second": 18.119,
"num_input_tokens_seen": 402896,
"step": 1469
},
{
"epoch": 13.008849557522124,
"grad_norm": 0.0004908296396024525,
"learning_rate": 1.6449496416858284e-05,
"loss": 0.0,
"num_input_tokens_seen": 403168,
"step": 1470
},
{
"epoch": 13.053097345132743,
"grad_norm": 0.0007329536601901054,
"learning_rate": 1.6268329188568468e-05,
"loss": 0.0,
"num_input_tokens_seen": 404752,
"step": 1475
},
{
"epoch": 13.097345132743364,
"grad_norm": 0.0009706666460260749,
"learning_rate": 1.6087682714804002e-05,
"loss": 0.0,
"num_input_tokens_seen": 406064,
"step": 1480
},
{
"epoch": 13.141592920353983,
"grad_norm": 0.0008216256974264979,
"learning_rate": 1.5907567769272568e-05,
"loss": 0.0,
"num_input_tokens_seen": 407472,
"step": 1485
},
{
"epoch": 13.185840707964601,
"grad_norm": 0.0008931403863243759,
"learning_rate": 1.5727995093981598e-05,
"loss": 0.0,
"num_input_tokens_seen": 408672,
"step": 1490
},
{
"epoch": 13.230088495575222,
"grad_norm": 0.00038248588680289686,
"learning_rate": 1.5548975398597718e-05,
"loss": 0.0,
"num_input_tokens_seen": 409968,
"step": 1495
},
{
"epoch": 13.274336283185841,
"grad_norm": 0.000629936985205859,
"learning_rate": 1.537051935980794e-05,
"loss": 0.0,
"num_input_tokens_seen": 411456,
"step": 1500
},
{
"epoch": 13.31858407079646,
"grad_norm": 0.001056081848219037,
"learning_rate": 1.5192637620682981e-05,
"loss": 0.0001,
"num_input_tokens_seen": 412864,
"step": 1505
},
{
"epoch": 13.36283185840708,
"grad_norm": 0.0005708714015781879,
"learning_rate": 1.5015340790042446e-05,
"loss": 0.0,
"num_input_tokens_seen": 414272,
"step": 1510
},
{
"epoch": 13.4070796460177,
"grad_norm": 0.00035775068681687117,
"learning_rate": 1.4838639441822183e-05,
"loss": 0.0,
"num_input_tokens_seen": 415536,
"step": 1515
},
{
"epoch": 13.451327433628318,
"grad_norm": 0.00041552510811015964,
"learning_rate": 1.46625441144436e-05,
"loss": 0.0,
"num_input_tokens_seen": 416896,
"step": 1520
},
{
"epoch": 13.495575221238939,
"grad_norm": 0.0004293840902391821,
"learning_rate": 1.4487065310185202e-05,
"loss": 0.0,
"num_input_tokens_seen": 418192,
"step": 1525
},
{
"epoch": 13.539823008849558,
"grad_norm": 0.008226204663515091,
"learning_rate": 1.4312213494556218e-05,
"loss": 0.0,
"num_input_tokens_seen": 419424,
"step": 1530
},
{
"epoch": 13.584070796460177,
"grad_norm": 0.0005483218701556325,
"learning_rate": 1.4137999095672444e-05,
"loss": 0.0,
"num_input_tokens_seen": 421056,
"step": 1535
},
{
"epoch": 13.628318584070797,
"grad_norm": 0.0009848109912127256,
"learning_rate": 1.3964432503634281e-05,
"loss": 0.0,
"num_input_tokens_seen": 422256,
"step": 1540
},
{
"epoch": 13.672566371681416,
"grad_norm": 0.0012349269818514585,
"learning_rate": 1.3791524069907141e-05,
"loss": 0.0,
"num_input_tokens_seen": 423408,
"step": 1545
},
{
"epoch": 13.716814159292035,
"grad_norm": 0.0006271946476772428,
"learning_rate": 1.361928410670403e-05,
"loss": 0.0,
"num_input_tokens_seen": 424496,
"step": 1550
},
{
"epoch": 13.761061946902656,
"grad_norm": 0.0007682631839998066,
"learning_rate": 1.3447722886370565e-05,
"loss": 0.0,
"num_input_tokens_seen": 426352,
"step": 1555
},
{
"epoch": 13.805309734513274,
"grad_norm": 0.002840487053617835,
"learning_rate": 1.3276850640772288e-05,
"loss": 0.0,
"num_input_tokens_seen": 427440,
"step": 1560
},
{
"epoch": 13.849557522123893,
"grad_norm": 0.0005562572623603046,
"learning_rate": 1.3106677560684494e-05,
"loss": 0.0,
"num_input_tokens_seen": 428784,
"step": 1565
},
{
"epoch": 13.893805309734514,
"grad_norm": 0.0003185897076036781,
"learning_rate": 1.2937213795184434e-05,
"loss": 0.0,
"num_input_tokens_seen": 430240,
"step": 1570
},
{
"epoch": 13.938053097345133,
"grad_norm": 0.0009106624638661742,
"learning_rate": 1.2768469451046029e-05,
"loss": 0.0,
"num_input_tokens_seen": 431664,
"step": 1575
},
{
"epoch": 13.982300884955752,
"grad_norm": 0.0005448419833555818,
"learning_rate": 1.2600454592137062e-05,
"loss": 0.0,
"num_input_tokens_seen": 433472,
"step": 1580
},
{
"epoch": 14.0,
"eval_loss": 0.019971443340182304,
"eval_runtime": 0.7266,
"eval_samples_per_second": 34.408,
"eval_steps_per_second": 17.892,
"num_input_tokens_seen": 433768,
"step": 1582
},
{
"epoch": 14.026548672566372,
"grad_norm": 0.007204546593129635,
"learning_rate": 1.2433179238819077e-05,
"loss": 0.0001,
"num_input_tokens_seen": 434536,
"step": 1585
},
{
"epoch": 14.070796460176991,
"grad_norm": 0.0004600539105013013,
"learning_rate": 1.2266653367349657e-05,
"loss": 0.0,
"num_input_tokens_seen": 435688,
"step": 1590
},
{
"epoch": 14.11504424778761,
"grad_norm": 0.00044398586032912135,
"learning_rate": 1.2100886909287478e-05,
"loss": 0.0,
"num_input_tokens_seen": 437224,
"step": 1595
},
{
"epoch": 14.15929203539823,
"grad_norm": 0.0004702652804553509,
"learning_rate": 1.1935889750900034e-05,
"loss": 0.0,
"num_input_tokens_seen": 438600,
"step": 1600
},
{
"epoch": 14.20353982300885,
"grad_norm": 0.00046526207006536424,
"learning_rate": 1.1771671732573976e-05,
"loss": 0.0,
"num_input_tokens_seen": 439832,
"step": 1605
},
{
"epoch": 14.247787610619469,
"grad_norm": 0.0005686290678568184,
"learning_rate": 1.1608242648228257e-05,
"loss": 0.0001,
"num_input_tokens_seen": 441048,
"step": 1610
},
{
"epoch": 14.29203539823009,
"grad_norm": 0.0004068968119099736,
"learning_rate": 1.1445612244729984e-05,
"loss": 0.0,
"num_input_tokens_seen": 442872,
"step": 1615
},
{
"epoch": 14.336283185840708,
"grad_norm": 0.001695982995443046,
"learning_rate": 1.1283790221313208e-05,
"loss": 0.0,
"num_input_tokens_seen": 444360,
"step": 1620
},
{
"epoch": 14.380530973451327,
"grad_norm": 0.0037256069481372833,
"learning_rate": 1.1122786229000356e-05,
"loss": 0.0,
"num_input_tokens_seen": 445688,
"step": 1625
},
{
"epoch": 14.424778761061948,
"grad_norm": 0.00461566960439086,
"learning_rate": 1.0962609870026724e-05,
"loss": 0.0,
"num_input_tokens_seen": 447256,
"step": 1630
},
{
"epoch": 14.469026548672566,
"grad_norm": 0.0006255768821574748,
"learning_rate": 1.0803270697267764e-05,
"loss": 0.0,
"num_input_tokens_seen": 448712,
"step": 1635
},
{
"epoch": 14.513274336283185,
"grad_norm": 0.00046136006130836904,
"learning_rate": 1.0644778213669385e-05,
"loss": 0.0,
"num_input_tokens_seen": 450072,
"step": 1640
},
{
"epoch": 14.557522123893806,
"grad_norm": 0.0002965346211567521,
"learning_rate": 1.0487141871681142e-05,
"loss": 0.0,
"num_input_tokens_seen": 451432,
"step": 1645
},
{
"epoch": 14.601769911504425,
"grad_norm": 0.000347623456036672,
"learning_rate": 1.0330371072692565e-05,
"loss": 0.0,
"num_input_tokens_seen": 452696,
"step": 1650
},
{
"epoch": 14.646017699115044,
"grad_norm": 0.0006429420900531113,
"learning_rate": 1.0174475166472417e-05,
"loss": 0.0,
"num_input_tokens_seen": 454056,
"step": 1655
},
{
"epoch": 14.690265486725664,
"grad_norm": 0.002083728089928627,
"learning_rate": 1.0019463450611103e-05,
"loss": 0.0,
"num_input_tokens_seen": 455592,
"step": 1660
},
{
"epoch": 14.734513274336283,
"grad_norm": 0.00048280772170983255,
"learning_rate": 9.865345169966114e-06,
"loss": 0.0,
"num_input_tokens_seen": 456680,
"step": 1665
},
{
"epoch": 14.778761061946902,
"grad_norm": 0.0007308169733732939,
"learning_rate": 9.71212951611074e-06,
"loss": 0.0,
"num_input_tokens_seen": 458248,
"step": 1670
},
{
"epoch": 14.823008849557523,
"grad_norm": 0.0008597771520726383,
"learning_rate": 9.559825626785837e-06,
"loss": 0.0,
"num_input_tokens_seen": 459720,
"step": 1675
},
{
"epoch": 14.867256637168142,
"grad_norm": 0.0030042710714042187,
"learning_rate": 9.40844258535487e-06,
"loss": 0.0,
"num_input_tokens_seen": 461096,
"step": 1680
},
{
"epoch": 14.91150442477876,
"grad_norm": 0.001321359071880579,
"learning_rate": 9.257989420262151e-06,
"loss": 0.0,
"num_input_tokens_seen": 462536,
"step": 1685
},
{
"epoch": 14.955752212389381,
"grad_norm": 0.0003227783308830112,
"learning_rate": 9.108475104494475e-06,
"loss": 0.0,
"num_input_tokens_seen": 463832,
"step": 1690
},
{
"epoch": 15.0,
"grad_norm": 0.0005668445373885334,
"learning_rate": 8.959908555045846e-06,
"loss": 0.0,
"num_input_tokens_seen": 464816,
"step": 1695
},
{
"epoch": 15.0,
"eval_loss": 0.018581919372081757,
"eval_runtime": 0.7209,
"eval_samples_per_second": 34.678,
"eval_steps_per_second": 18.033,
"num_input_tokens_seen": 464816,
"step": 1695
},
{
"epoch": 15.044247787610619,
"grad_norm": 0.00046278457739390433,
"learning_rate": 8.812298632385784e-06,
"loss": 0.0,
"num_input_tokens_seen": 466240,
"step": 1700
},
{
"epoch": 15.08849557522124,
"grad_norm": 0.0005744256195612252,
"learning_rate": 8.66565413993082e-06,
"loss": 0.0,
"num_input_tokens_seen": 467344,
"step": 1705
},
{
"epoch": 15.132743362831858,
"grad_norm": 0.0002654808049555868,
"learning_rate": 8.519983823519496e-06,
"loss": 0.0,
"num_input_tokens_seen": 468432,
"step": 1710
},
{
"epoch": 15.176991150442477,
"grad_norm": 0.00036680474295280874,
"learning_rate": 8.375296370890749e-06,
"loss": 0.0,
"num_input_tokens_seen": 470224,
"step": 1715
},
{
"epoch": 15.221238938053098,
"grad_norm": 0.00044989073649048805,
"learning_rate": 8.231600411165757e-06,
"loss": 0.0,
"num_input_tokens_seen": 471552,
"step": 1720
},
{
"epoch": 15.265486725663717,
"grad_norm": 0.0006426925538107753,
"learning_rate": 8.088904514333384e-06,
"loss": 0.0,
"num_input_tokens_seen": 472896,
"step": 1725
},
{
"epoch": 15.309734513274336,
"grad_norm": 0.0004558507935144007,
"learning_rate": 7.947217190738945e-06,
"loss": 0.0,
"num_input_tokens_seen": 474384,
"step": 1730
},
{
"epoch": 15.353982300884956,
"grad_norm": 0.00033109524520114064,
"learning_rate": 7.806546890576753e-06,
"loss": 0.0,
"num_input_tokens_seen": 475456,
"step": 1735
},
{
"epoch": 15.398230088495575,
"grad_norm": 0.0010439999168738723,
"learning_rate": 7.666902003386104e-06,
"loss": 0.0,
"num_input_tokens_seen": 477136,
"step": 1740
},
{
"epoch": 15.442477876106194,
"grad_norm": 0.00024651194689795375,
"learning_rate": 7.528290857550943e-06,
"loss": 0.0,
"num_input_tokens_seen": 478672,
"step": 1745
},
{
"epoch": 15.486725663716815,
"grad_norm": 0.00045066667371429503,
"learning_rate": 7.390721719803137e-06,
"loss": 0.0,
"num_input_tokens_seen": 480096,
"step": 1750
},
{
"epoch": 15.530973451327434,
"grad_norm": 0.00037901801988482475,
"learning_rate": 7.254202794729484e-06,
"loss": 0.0,
"num_input_tokens_seen": 481904,
"step": 1755
},
{
"epoch": 15.575221238938052,
"grad_norm": 0.0010054173180833459,
"learning_rate": 7.11874222428238e-06,
"loss": 0.0,
"num_input_tokens_seen": 483440,
"step": 1760
},
{
"epoch": 15.619469026548673,
"grad_norm": 0.0004434712463989854,
"learning_rate": 6.9843480872942294e-06,
"loss": 0.0,
"num_input_tokens_seen": 485056,
"step": 1765
},
{
"epoch": 15.663716814159292,
"grad_norm": 0.00036231096601113677,
"learning_rate": 6.851028398995607e-06,
"loss": 0.0,
"num_input_tokens_seen": 486608,
"step": 1770
},
{
"epoch": 15.70796460176991,
"grad_norm": 0.00084352632984519,
"learning_rate": 6.718791110537287e-06,
"loss": 0.0,
"num_input_tokens_seen": 487888,
"step": 1775
},
{
"epoch": 15.752212389380531,
"grad_norm": 0.0006180101190693676,
"learning_rate": 6.587644108515986e-06,
"loss": 0.0,
"num_input_tokens_seen": 489184,
"step": 1780
},
{
"epoch": 15.79646017699115,
"grad_norm": 0.00047136572538875043,
"learning_rate": 6.457595214504042e-06,
"loss": 0.0,
"num_input_tokens_seen": 490528,
"step": 1785
},
{
"epoch": 15.84070796460177,
"grad_norm": 0.001999986357986927,
"learning_rate": 6.328652184582884e-06,
"loss": 0.0,
"num_input_tokens_seen": 491680,
"step": 1790
},
{
"epoch": 15.88495575221239,
"grad_norm": 0.0006776810041628778,
"learning_rate": 6.200822708880563e-06,
"loss": 0.0,
"num_input_tokens_seen": 493152,
"step": 1795
},
{
"epoch": 15.929203539823009,
"grad_norm": 0.0005173166864551604,
"learning_rate": 6.074114411112997e-06,
"loss": 0.0,
"num_input_tokens_seen": 494400,
"step": 1800
},
{
"epoch": 15.973451327433628,
"grad_norm": 0.0004017841420136392,
"learning_rate": 5.948534848129378e-06,
"loss": 0.0,
"num_input_tokens_seen": 495584,
"step": 1805
},
{
"epoch": 16.0,
"eval_loss": 0.019682612270116806,
"eval_runtime": 0.7212,
"eval_samples_per_second": 34.662,
"eval_steps_per_second": 18.024,
"num_input_tokens_seen": 496216,
"step": 1808
},
{
"epoch": 16.01769911504425,
"grad_norm": 0.0007653535576537251,
"learning_rate": 5.824091509461449e-06,
"loss": 0.0,
"num_input_tokens_seen": 496680,
"step": 1810
},
{
"epoch": 16.061946902654867,
"grad_norm": 0.00038566955481655896,
"learning_rate": 5.7007918168768405e-06,
"loss": 0.0,
"num_input_tokens_seen": 497992,
"step": 1815
},
{
"epoch": 16.106194690265486,
"grad_norm": 0.0003379808913450688,
"learning_rate": 5.5786431239364365e-06,
"loss": 0.0,
"num_input_tokens_seen": 499272,
"step": 1820
},
{
"epoch": 16.150442477876105,
"grad_norm": 0.0006492345710285008,
"learning_rate": 5.457652715555781e-06,
"loss": 0.0,
"num_input_tokens_seen": 500776,
"step": 1825
},
{
"epoch": 16.194690265486727,
"grad_norm": 0.002473029075190425,
"learning_rate": 5.337827807570689e-06,
"loss": 0.0,
"num_input_tokens_seen": 501928,
"step": 1830
},
{
"epoch": 16.238938053097346,
"grad_norm": 0.0006469090585596859,
"learning_rate": 5.219175546306784e-06,
"loss": 0.0,
"num_input_tokens_seen": 502936,
"step": 1835
},
{
"epoch": 16.283185840707965,
"grad_norm": 0.00036444375291466713,
"learning_rate": 5.1017030081533914e-06,
"loss": 0.0,
"num_input_tokens_seen": 504200,
"step": 1840
},
{
"epoch": 16.327433628318584,
"grad_norm": 0.0003275285707786679,
"learning_rate": 4.985417199141443e-06,
"loss": 0.0,
"num_input_tokens_seen": 505368,
"step": 1845
},
{
"epoch": 16.371681415929203,
"grad_norm": 0.00047016076860018075,
"learning_rate": 4.870325054525673e-06,
"loss": 0.0,
"num_input_tokens_seen": 506792,
"step": 1850
},
{
"epoch": 16.41592920353982,
"grad_norm": 0.0005479489336721599,
"learning_rate": 4.7564334383709745e-06,
"loss": 0.0,
"num_input_tokens_seen": 508184,
"step": 1855
},
{
"epoch": 16.460176991150444,
"grad_norm": 0.0007782442844472826,
"learning_rate": 4.6437491431430556e-06,
"loss": 0.0,
"num_input_tokens_seen": 509704,
"step": 1860
},
{
"epoch": 16.504424778761063,
"grad_norm": 0.0004585021815728396,
"learning_rate": 4.5322788893033155e-06,
"loss": 0.0,
"num_input_tokens_seen": 511272,
"step": 1865
},
{
"epoch": 16.548672566371682,
"grad_norm": 0.0005866039427928627,
"learning_rate": 4.422029324908061e-06,
"loss": 0.0,
"num_input_tokens_seen": 512744,
"step": 1870
},
{
"epoch": 16.5929203539823,
"grad_norm": 0.00041383542702533305,
"learning_rate": 4.313007025211985e-06,
"loss": 0.0,
"num_input_tokens_seen": 514264,
"step": 1875
},
{
"epoch": 16.63716814159292,
"grad_norm": 0.0014405195834115148,
"learning_rate": 4.205218492276055e-06,
"loss": 0.0,
"num_input_tokens_seen": 515720,
"step": 1880
},
{
"epoch": 16.68141592920354,
"grad_norm": 0.0037912712432444096,
"learning_rate": 4.098670154579715e-06,
"loss": 0.0,
"num_input_tokens_seen": 517080,
"step": 1885
},
{
"epoch": 16.72566371681416,
"grad_norm": 0.001533073023892939,
"learning_rate": 3.9933683666374986e-06,
"loss": 0.0,
"num_input_tokens_seen": 518568,
"step": 1890
},
{
"epoch": 16.76991150442478,
"grad_norm": 0.007198888808488846,
"learning_rate": 3.889319408620021e-06,
"loss": 0.0,
"num_input_tokens_seen": 520008,
"step": 1895
},
{
"epoch": 16.8141592920354,
"grad_norm": 0.0009076311835087836,
"learning_rate": 3.7865294859794926e-06,
"loss": 0.0,
"num_input_tokens_seen": 521368,
"step": 1900
},
{
"epoch": 16.858407079646017,
"grad_norm": 0.0002917584788519889,
"learning_rate": 3.68500472907955e-06,
"loss": 0.0,
"num_input_tokens_seen": 522648,
"step": 1905
},
{
"epoch": 16.902654867256636,
"grad_norm": 0.0003024785837624222,
"learning_rate": 3.584751192829705e-06,
"loss": 0.0,
"num_input_tokens_seen": 524072,
"step": 1910
},
{
"epoch": 16.946902654867255,
"grad_norm": 0.0003079813322983682,
"learning_rate": 3.4857748563242006e-06,
"loss": 0.0,
"num_input_tokens_seen": 525640,
"step": 1915
},
{
"epoch": 16.991150442477878,
"grad_norm": 0.0002828809665516019,
"learning_rate": 3.388081622485431e-06,
"loss": 0.0,
"num_input_tokens_seen": 527192,
"step": 1920
},
{
"epoch": 17.0,
"eval_loss": 0.019199641421437263,
"eval_runtime": 0.7112,
"eval_samples_per_second": 35.151,
"eval_steps_per_second": 18.278,
"num_input_tokens_seen": 527360,
"step": 1921
},
{
"epoch": 17.035398230088497,
"grad_norm": 0.00035240757279098034,
"learning_rate": 3.2916773177118778e-06,
"loss": 0.0,
"num_input_tokens_seen": 528400,
"step": 1925
},
{
"epoch": 17.079646017699115,
"grad_norm": 0.0005224815104156733,
"learning_rate": 3.1965676915306384e-06,
"loss": 0.0,
"num_input_tokens_seen": 529952,
"step": 1930
},
{
"epoch": 17.123893805309734,
"grad_norm": 0.0007481279317289591,
"learning_rate": 3.102758416254545e-06,
"loss": 0.0,
"num_input_tokens_seen": 531248,
"step": 1935
},
{
"epoch": 17.168141592920353,
"grad_norm": 0.0003523877530824393,
"learning_rate": 3.010255086643818e-06,
"loss": 0.0,
"num_input_tokens_seen": 532720,
"step": 1940
},
{
"epoch": 17.212389380530972,
"grad_norm": 0.00042034246143884957,
"learning_rate": 2.919063219572438e-06,
"loss": 0.0,
"num_input_tokens_seen": 534176,
"step": 1945
},
{
"epoch": 17.256637168141594,
"grad_norm": 0.0007219575345516205,
"learning_rate": 2.829188253699111e-06,
"loss": 0.0,
"num_input_tokens_seen": 535792,
"step": 1950
},
{
"epoch": 17.300884955752213,
"grad_norm": 0.0003039201837964356,
"learning_rate": 2.7406355491429086e-06,
"loss": 0.0,
"num_input_tokens_seen": 537088,
"step": 1955
},
{
"epoch": 17.345132743362832,
"grad_norm": 0.0003757340309675783,
"learning_rate": 2.653410387163574e-06,
"loss": 0.0,
"num_input_tokens_seen": 538624,
"step": 1960
},
{
"epoch": 17.38938053097345,
"grad_norm": 0.0015457504196092486,
"learning_rate": 2.567517969846575e-06,
"loss": 0.0,
"num_input_tokens_seen": 539968,
"step": 1965
},
{
"epoch": 17.43362831858407,
"grad_norm": 0.0003834764356724918,
"learning_rate": 2.482963419792844e-06,
"loss": 0.0,
"num_input_tokens_seen": 541024,
"step": 1970
},
{
"epoch": 17.47787610619469,
"grad_norm": 0.00040257559157907963,
"learning_rate": 2.399751779813264e-06,
"loss": 0.0,
"num_input_tokens_seen": 542304,
"step": 1975
},
{
"epoch": 17.52212389380531,
"grad_norm": 0.00033592438558116555,
"learning_rate": 2.317888012627914e-06,
"loss": 0.0,
"num_input_tokens_seen": 543712,
"step": 1980
},
{
"epoch": 17.56637168141593,
"grad_norm": 0.0005625042249448597,
"learning_rate": 2.2373770005700955e-06,
"loss": 0.0,
"num_input_tokens_seen": 545024,
"step": 1985
},
{
"epoch": 17.61061946902655,
"grad_norm": 0.0005863187252543867,
"learning_rate": 2.1582235452951682e-06,
"loss": 0.0,
"num_input_tokens_seen": 546416,
"step": 1990
},
{
"epoch": 17.654867256637168,
"grad_norm": 0.0003696437634062022,
"learning_rate": 2.0804323674941563e-06,
"loss": 0.0,
"num_input_tokens_seen": 547760,
"step": 1995
},
{
"epoch": 17.699115044247787,
"grad_norm": 0.00047242920845746994,
"learning_rate": 2.0040081066122043e-06,
"loss": 0.0,
"num_input_tokens_seen": 548880,
"step": 2000
},
{
"epoch": 17.743362831858406,
"grad_norm": 0.0029493607580661774,
"learning_rate": 1.9289553205719317e-06,
"loss": 0.0,
"num_input_tokens_seen": 550192,
"step": 2005
},
{
"epoch": 17.787610619469028,
"grad_norm": 0.00043380033457651734,
"learning_rate": 1.8552784855015215e-06,
"loss": 0.0,
"num_input_tokens_seen": 551424,
"step": 2010
},
{
"epoch": 17.831858407079647,
"grad_norm": 0.0005246605142019689,
"learning_rate": 1.7829819954678361e-06,
"loss": 0.0,
"num_input_tokens_seen": 552704,
"step": 2015
},
{
"epoch": 17.876106194690266,
"grad_norm": 0.0019084580708295107,
"learning_rate": 1.7120701622143132e-06,
"loss": 0.0,
"num_input_tokens_seen": 554432,
"step": 2020
},
{
"epoch": 17.920353982300885,
"grad_norm": 0.0005461287801153958,
"learning_rate": 1.6425472149038361e-06,
"loss": 0.0,
"num_input_tokens_seen": 556032,
"step": 2025
},
{
"epoch": 17.964601769911503,
"grad_norm": 0.00038796328590251505,
"learning_rate": 1.5744172998664902e-06,
"loss": 0.0,
"num_input_tokens_seen": 557168,
"step": 2030
},
{
"epoch": 18.0,
"eval_loss": 0.018208853900432587,
"eval_runtime": 0.7295,
"eval_samples_per_second": 34.269,
"eval_steps_per_second": 17.82,
"num_input_tokens_seen": 558088,
"step": 2034
},
{
"epoch": 18.008849557522122,
"grad_norm": 0.0016791054513305426,
"learning_rate": 1.5076844803522922e-06,
"loss": 0.0,
"num_input_tokens_seen": 558408,
"step": 2035
},
{
"epoch": 18.053097345132745,
"grad_norm": 0.00732705183327198,
"learning_rate": 1.4423527362888546e-06,
"loss": 0.0001,
"num_input_tokens_seen": 559960,
"step": 2040
},
{
"epoch": 18.097345132743364,
"grad_norm": 0.000439478870248422,
"learning_rate": 1.3784259640440279e-06,
"loss": 0.0,
"num_input_tokens_seen": 561272,
"step": 2045
},
{
"epoch": 18.141592920353983,
"grad_norm": 0.0011805107351392508,
"learning_rate": 1.3159079761934923e-06,
"loss": 0.0,
"num_input_tokens_seen": 563064,
"step": 2050
},
{
"epoch": 18.1858407079646,
"grad_norm": 0.00039607463986612856,
"learning_rate": 1.2548025012934367e-06,
"loss": 0.0,
"num_input_tokens_seen": 564504,
"step": 2055
},
{
"epoch": 18.23008849557522,
"grad_norm": 0.00045017743832431734,
"learning_rate": 1.195113183658131e-06,
"loss": 0.0,
"num_input_tokens_seen": 565816,
"step": 2060
},
{
"epoch": 18.27433628318584,
"grad_norm": 0.00038782545016147196,
"learning_rate": 1.1368435831426021e-06,
"loss": 0.0,
"num_input_tokens_seen": 567048,
"step": 2065
},
{
"epoch": 18.31858407079646,
"grad_norm": 0.00046747527085244656,
"learning_rate": 1.0799971749303333e-06,
"loss": 0.0,
"num_input_tokens_seen": 568552,
"step": 2070
},
{
"epoch": 18.36283185840708,
"grad_norm": 0.00045611089444719255,
"learning_rate": 1.0245773493259946e-06,
"loss": 0.0,
"num_input_tokens_seen": 570088,
"step": 2075
},
{
"epoch": 18.4070796460177,
"grad_norm": 0.0003884553152602166,
"learning_rate": 9.705874115532532e-07,
"loss": 0.0,
"num_input_tokens_seen": 571400,
"step": 2080
},
{
"epoch": 18.451327433628318,
"grad_norm": 0.0006371684139594436,
"learning_rate": 9.180305815576301e-07,
"loss": 0.0,
"num_input_tokens_seen": 572648,
"step": 2085
},
{
"epoch": 18.495575221238937,
"grad_norm": 0.00032609282061457634,
"learning_rate": 8.669099938144992e-07,
"loss": 0.0,
"num_input_tokens_seen": 573976,
"step": 2090
},
{
"epoch": 18.539823008849556,
"grad_norm": 0.0037040063180029392,
"learning_rate": 8.172286971421167e-07,
"loss": 0.0,
"num_input_tokens_seen": 575560,
"step": 2095
},
{
"epoch": 18.58407079646018,
"grad_norm": 0.00041839800542220473,
"learning_rate": 7.689896545198111e-07,
"loss": 0.0,
"num_input_tokens_seen": 577096,
"step": 2100
},
{
"epoch": 18.628318584070797,
"grad_norm": 0.001043095369823277,
"learning_rate": 7.221957429112469e-07,
"loss": 0.0,
"num_input_tokens_seen": 578440,
"step": 2105
},
{
"epoch": 18.672566371681416,
"grad_norm": 0.000590807176195085,
"learning_rate": 6.768497530928785e-07,
"loss": 0.0,
"num_input_tokens_seen": 579592,
"step": 2110
},
{
"epoch": 18.716814159292035,
"grad_norm": 0.000375599367544055,
"learning_rate": 6.329543894874779e-07,
"loss": 0.0,
"num_input_tokens_seen": 580984,
"step": 2115
},
{
"epoch": 18.761061946902654,
"grad_norm": 0.004237520508468151,
"learning_rate": 5.905122700028576e-07,
"loss": 0.0001,
"num_input_tokens_seen": 582424,
"step": 2120
},
{
"epoch": 18.805309734513273,
"grad_norm": 0.0015916775446385145,
"learning_rate": 5.49525925875738e-07,
"loss": 0.0,
"num_input_tokens_seen": 583608,
"step": 2125
},
{
"epoch": 18.849557522123895,
"grad_norm": 0.0024985643103718758,
"learning_rate": 5.099978015207868e-07,
"loss": 0.0,
"num_input_tokens_seen": 584568,
"step": 2130
},
{
"epoch": 18.893805309734514,
"grad_norm": 0.000398988340748474,
"learning_rate": 4.719302543848225e-07,
"loss": 0.0,
"num_input_tokens_seen": 586232,
"step": 2135
},
{
"epoch": 18.938053097345133,
"grad_norm": 0.0004915536846965551,
"learning_rate": 4.3532555480624295e-07,
"loss": 0.0,
"num_input_tokens_seen": 587608,
"step": 2140
},
{
"epoch": 18.98230088495575,
"grad_norm": 0.0018118071602657437,
"learning_rate": 4.001858858795893e-07,
"loss": 0.0,
"num_input_tokens_seen": 588744,
"step": 2145
},
{
"epoch": 19.0,
"eval_loss": 0.0190139003098011,
"eval_runtime": 0.7256,
"eval_samples_per_second": 34.455,
"eval_steps_per_second": 17.916,
"num_input_tokens_seen": 589072,
"step": 2147
},
{
"epoch": 19.02654867256637,
"grad_norm": 0.003615348832681775,
"learning_rate": 3.665133433253809e-07,
"loss": 0.0,
"num_input_tokens_seen": 589776,
"step": 2150
},
{
"epoch": 19.07079646017699,
"grad_norm": 0.00036754633765667677,
"learning_rate": 3.34309935365093e-07,
"loss": 0.0,
"num_input_tokens_seen": 591200,
"step": 2155
},
{
"epoch": 19.115044247787612,
"grad_norm": 0.0005496389348991215,
"learning_rate": 3.03577582601422e-07,
"loss": 0.0,
"num_input_tokens_seen": 592464,
"step": 2160
},
{
"epoch": 19.15929203539823,
"grad_norm": 0.0004596963117364794,
"learning_rate": 2.743181179037047e-07,
"loss": 0.0,
"num_input_tokens_seen": 593936,
"step": 2165
},
{
"epoch": 19.20353982300885,
"grad_norm": 0.00025676190853118896,
"learning_rate": 2.465332862986447e-07,
"loss": 0.0,
"num_input_tokens_seen": 595232,
"step": 2170
},
{
"epoch": 19.24778761061947,
"grad_norm": 0.0004909674171358347,
"learning_rate": 2.2022474486620427e-07,
"loss": 0.0,
"num_input_tokens_seen": 596592,
"step": 2175
},
{
"epoch": 19.292035398230087,
"grad_norm": 0.00031084747752174735,
"learning_rate": 1.953940626408024e-07,
"loss": 0.0,
"num_input_tokens_seen": 598176,
"step": 2180
},
{
"epoch": 19.336283185840706,
"grad_norm": 0.001011427491903305,
"learning_rate": 1.720427205177233e-07,
"loss": 0.0001,
"num_input_tokens_seen": 599792,
"step": 2185
},
{
"epoch": 19.38053097345133,
"grad_norm": 0.0016426928341388702,
"learning_rate": 1.5017211116479802e-07,
"loss": 0.0,
"num_input_tokens_seen": 601072,
"step": 2190
},
{
"epoch": 19.424778761061948,
"grad_norm": 0.00022410904057323933,
"learning_rate": 1.297835389393598e-07,
"loss": 0.0,
"num_input_tokens_seen": 602384,
"step": 2195
},
{
"epoch": 19.469026548672566,
"grad_norm": 0.0016083059599623084,
"learning_rate": 1.1087821981042856e-07,
"loss": 0.0,
"num_input_tokens_seen": 603840,
"step": 2200
},
{
"epoch": 19.513274336283185,
"grad_norm": 0.0005845269188284874,
"learning_rate": 9.345728128621611e-08,
"loss": 0.0,
"num_input_tokens_seen": 604944,
"step": 2205
},
{
"epoch": 19.557522123893804,
"grad_norm": 0.00038981102989055216,
"learning_rate": 7.752176234685771e-08,
"loss": 0.0,
"num_input_tokens_seen": 606224,
"step": 2210
},
{
"epoch": 19.601769911504427,
"grad_norm": 0.001588311162777245,
"learning_rate": 6.307261338246718e-08,
"loss": 0.0,
"num_input_tokens_seen": 607632,
"step": 2215
},
{
"epoch": 19.646017699115045,
"grad_norm": 0.005621492862701416,
"learning_rate": 5.011069613644892e-08,
"loss": 0.0,
"num_input_tokens_seen": 608960,
"step": 2220
},
{
"epoch": 19.690265486725664,
"grad_norm": 0.0005275033763609827,
"learning_rate": 3.8636783654100174e-08,
"loss": 0.0,
"num_input_tokens_seen": 610400,
"step": 2225
},
{
"epoch": 19.734513274336283,
"grad_norm": 0.0027882629074156284,
"learning_rate": 2.865156023650617e-08,
"loss": 0.0,
"num_input_tokens_seen": 611648,
"step": 2230
},
{
"epoch": 19.778761061946902,
"grad_norm": 0.0033670312259346247,
"learning_rate": 2.0155621399742254e-08,
"loss": 0.0,
"num_input_tokens_seen": 612880,
"step": 2235
},
{
"epoch": 19.82300884955752,
"grad_norm": 0.000658839417155832,
"learning_rate": 1.31494738393384e-08,
"loss": 0.0,
"num_input_tokens_seen": 614560,
"step": 2240
},
{
"epoch": 19.86725663716814,
"grad_norm": 0.0017916634678840637,
"learning_rate": 7.633535400070057e-09,
"loss": 0.0,
"num_input_tokens_seen": 615856,
"step": 2245
},
{
"epoch": 19.911504424778762,
"grad_norm": 0.0003976623120252043,
"learning_rate": 3.6081350510447365e-09,
"loss": 0.0,
"num_input_tokens_seen": 617136,
"step": 2250
},
{
"epoch": 19.95575221238938,
"grad_norm": 0.00045023686834611,
"learning_rate": 1.0735128660649406e-09,
"loss": 0.0,
"num_input_tokens_seen": 618640,
"step": 2255
},
{
"epoch": 20.0,
"grad_norm": 0.0003346599987708032,
"learning_rate": 2.982000932294504e-11,
"loss": 0.0,
"num_input_tokens_seen": 620240,
"step": 2260
},
{
"epoch": 20.0,
"eval_loss": 0.01861300691962242,
"eval_runtime": 0.7164,
"eval_samples_per_second": 34.898,
"eval_steps_per_second": 18.147,
"num_input_tokens_seen": 620240,
"step": 2260
},
{
"epoch": 20.0,
"num_input_tokens_seen": 620240,
"step": 2260,
"total_flos": 2.792912687136768e+16,
"train_loss": 0.167221273584263,
"train_runtime": 256.8958,
"train_samples_per_second": 17.517,
"train_steps_per_second": 8.797
}
],
"logging_steps": 5,
"max_steps": 2260,
"num_input_tokens_seen": 620240,
"num_train_epochs": 20,
"save_steps": 113,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.792912687136768e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}