LLaVA-baseline-checkpoint-6000 / trainer_state.json
PhoenixGS's picture
Upload folder using huggingface_hub
099f141 verified
{
"best_global_step": 6000,
"best_metric": 0.95930004,
"best_model_checkpoint": "/user/yutianyu/Duplex_Finetune/output/4B_LLaVA_SFT/zero3_0dot6B_LLaVA_SFT_nopacking/v0-20251202-145343/checkpoint-6000",
"epoch": 1.4081488957213604,
"eval_steps": 100,
"global_step": 6000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0002347032177811158,
"grad_norm": 19.127729366517716,
"learning_rate": 4e-09,
"loss": 1.9448599815368652,
"num_input_tokens_seen": 197295,
"step": 1,
"token_acc": 0.5819854991634132
},
{
"epoch": 0.002347032177811158,
"grad_norm": 16.494659300395906,
"learning_rate": 4e-08,
"loss": 1.92243406507704,
"num_input_tokens_seen": 2020272,
"step": 10,
"token_acc": 0.5839277085360667
},
{
"epoch": 0.004694064355622316,
"grad_norm": 22.126428554674575,
"learning_rate": 8e-08,
"loss": 1.916154670715332,
"num_input_tokens_seen": 4084884,
"step": 20,
"token_acc": 0.5835957997637319
},
{
"epoch": 0.007041096533433474,
"grad_norm": 12.603610772661288,
"learning_rate": 1.2e-07,
"loss": 1.8856426239013673,
"num_input_tokens_seen": 6056667,
"step": 30,
"token_acc": 0.584776074988841
},
{
"epoch": 0.009388128711244632,
"grad_norm": 10.779646193701467,
"learning_rate": 1.6e-07,
"loss": 1.8353569030761718,
"num_input_tokens_seen": 8091435,
"step": 40,
"token_acc": 0.5897754631538845
},
{
"epoch": 0.011735160889055789,
"grad_norm": 136.86547531489668,
"learning_rate": 2e-07,
"loss": 1.721211051940918,
"num_input_tokens_seen": 10091673,
"step": 50,
"token_acc": 0.6031858358236022
},
{
"epoch": 0.014082193066866948,
"grad_norm": 5.080233785680673,
"learning_rate": 2.4e-07,
"loss": 1.6145668029785156,
"num_input_tokens_seen": 12099135,
"step": 60,
"token_acc": 0.6209089567372474
},
{
"epoch": 0.016429225244678103,
"grad_norm": 12.207422981911558,
"learning_rate": 2.8e-07,
"loss": 1.5414657592773438,
"num_input_tokens_seen": 14135250,
"step": 70,
"token_acc": 0.6310852754061408
},
{
"epoch": 0.018776257422489263,
"grad_norm": 8.118283422146021,
"learning_rate": 3.2e-07,
"loss": 1.528026008605957,
"num_input_tokens_seen": 16200873,
"step": 80,
"token_acc": 0.6382918453943185
},
{
"epoch": 0.02112328960030042,
"grad_norm": 3.47292239362735,
"learning_rate": 3.6e-07,
"loss": 1.4715272903442382,
"num_input_tokens_seen": 18177258,
"step": 90,
"token_acc": 0.6432060553309527
},
{
"epoch": 0.023470321778111577,
"grad_norm": 2.2332542577632526,
"learning_rate": 4e-07,
"loss": 1.4491453170776367,
"num_input_tokens_seen": 20161581,
"step": 100,
"token_acc": 0.6477376652924822
},
{
"epoch": 0.023470321778111577,
"eval_loss": 1.4709749221801758,
"eval_runtime": 33.2583,
"eval_samples_per_second": 30.068,
"eval_steps_per_second": 1.263,
"eval_token_acc": 0.6456822326354424,
"num_input_tokens_seen": 20161581,
"step": 100
},
{
"epoch": 0.025817353955922735,
"grad_norm": 1.9112779247166412,
"learning_rate": 4.3999999999999997e-07,
"loss": 1.4166399002075196,
"num_input_tokens_seen": 22179654,
"step": 110,
"token_acc": 0.6549487213586689
},
{
"epoch": 0.028164386133733895,
"grad_norm": 1.897077748827419,
"learning_rate": 4.8e-07,
"loss": 1.3960003852844238,
"num_input_tokens_seen": 24157263,
"step": 120,
"token_acc": 0.6563052255139141
},
{
"epoch": 0.030511418311545052,
"grad_norm": 2.063047689522777,
"learning_rate": 5.2e-07,
"loss": 1.373966884613037,
"num_input_tokens_seen": 26112051,
"step": 130,
"token_acc": 0.6609459618615088
},
{
"epoch": 0.032858450489356206,
"grad_norm": 6.453024686764437,
"learning_rate": 5.6e-07,
"loss": 1.3647557258605958,
"num_input_tokens_seen": 28133607,
"step": 140,
"token_acc": 0.6628235998176535
},
{
"epoch": 0.035205482667167366,
"grad_norm": 1.9586538878052986,
"learning_rate": 6e-07,
"loss": 1.394300651550293,
"num_input_tokens_seen": 30182052,
"step": 150,
"token_acc": 0.6566977644780848
},
{
"epoch": 0.03755251484497853,
"grad_norm": 4.663911418765899,
"learning_rate": 6.4e-07,
"loss": 1.3559602737426757,
"num_input_tokens_seen": 32177622,
"step": 160,
"token_acc": 0.6647926044470018
},
{
"epoch": 0.03989954702278968,
"grad_norm": 2.649130437820903,
"learning_rate": 6.800000000000001e-07,
"loss": 1.320611572265625,
"num_input_tokens_seen": 34150332,
"step": 170,
"token_acc": 0.671068499517214
},
{
"epoch": 0.04224657920060084,
"grad_norm": 1.988272208775732,
"learning_rate": 7.2e-07,
"loss": 1.3357341766357422,
"num_input_tokens_seen": 36129600,
"step": 180,
"token_acc": 0.6666514308426074
},
{
"epoch": 0.044593611378412,
"grad_norm": 2.089282254202976,
"learning_rate": 7.599999999999999e-07,
"loss": 1.3014695167541503,
"num_input_tokens_seen": 38143260,
"step": 190,
"token_acc": 0.6714680103247839
},
{
"epoch": 0.046940643556223155,
"grad_norm": 1.6962028687120758,
"learning_rate": 8e-07,
"loss": 1.316438865661621,
"num_input_tokens_seen": 40117473,
"step": 200,
"token_acc": 0.6719479705996623
},
{
"epoch": 0.046940643556223155,
"eval_loss": 1.3232439756393433,
"eval_runtime": 32.4409,
"eval_samples_per_second": 30.825,
"eval_steps_per_second": 1.295,
"eval_token_acc": 0.6714895778029132,
"num_input_tokens_seen": 40117473,
"step": 200
},
{
"epoch": 0.049287675734034316,
"grad_norm": 2.7375989158006453,
"learning_rate": 8.399999999999999e-07,
"loss": 1.2971059799194335,
"num_input_tokens_seen": 42051432,
"step": 210,
"token_acc": 0.6724870727708412
},
{
"epoch": 0.05163470791184547,
"grad_norm": 1.8333521024827166,
"learning_rate": 8.799999999999999e-07,
"loss": 1.251258945465088,
"num_input_tokens_seen": 44060409,
"step": 220,
"token_acc": 0.6825975678761506
},
{
"epoch": 0.05398174008965663,
"grad_norm": 1.7067135742590114,
"learning_rate": 9.2e-07,
"loss": 1.2541748046875,
"num_input_tokens_seen": 46140576,
"step": 230,
"token_acc": 0.6838314298998598
},
{
"epoch": 0.05632877226746779,
"grad_norm": 2.117007115532979,
"learning_rate": 9.6e-07,
"loss": 1.2801358222961425,
"num_input_tokens_seen": 48218463,
"step": 240,
"token_acc": 0.6776609031540706
},
{
"epoch": 0.058675804445278944,
"grad_norm": 1.9477588029675073,
"learning_rate": 1e-06,
"loss": 1.272374153137207,
"num_input_tokens_seen": 50212704,
"step": 250,
"token_acc": 0.6784600688499179
},
{
"epoch": 0.061022836623090104,
"grad_norm": 2.0653725127756495,
"learning_rate": 1.04e-06,
"loss": 1.2390222549438477,
"num_input_tokens_seen": 52210065,
"step": 260,
"token_acc": 0.6846190216694448
},
{
"epoch": 0.06336986880090126,
"grad_norm": 1.6368439315898582,
"learning_rate": 1.08e-06,
"loss": 1.2289260864257812,
"num_input_tokens_seen": 54287394,
"step": 270,
"token_acc": 0.6870453172664521
},
{
"epoch": 0.06571690097871241,
"grad_norm": 1.8897694890455825,
"learning_rate": 1.12e-06,
"loss": 1.2016170501708985,
"num_input_tokens_seen": 56286087,
"step": 280,
"token_acc": 0.6947360302493355
},
{
"epoch": 0.06806393315652358,
"grad_norm": 4.191924245995845,
"learning_rate": 1.16e-06,
"loss": 1.2314638137817382,
"num_input_tokens_seen": 58259631,
"step": 290,
"token_acc": 0.6869124082650091
},
{
"epoch": 0.07041096533433473,
"grad_norm": 2.010449522588459,
"learning_rate": 1.2e-06,
"loss": 1.2306774139404297,
"num_input_tokens_seen": 60262377,
"step": 300,
"token_acc": 0.6861436424474188
},
{
"epoch": 0.07041096533433473,
"eval_loss": 1.253986120223999,
"eval_runtime": 32.5346,
"eval_samples_per_second": 30.737,
"eval_steps_per_second": 1.291,
"eval_token_acc": 0.6843470834006602,
"num_input_tokens_seen": 60262377,
"step": 300
},
{
"epoch": 0.07275799751214589,
"grad_norm": 1.6903068952734268,
"learning_rate": 1.24e-06,
"loss": 1.2244423866271972,
"num_input_tokens_seen": 62257881,
"step": 310,
"token_acc": 0.6906986736484506
},
{
"epoch": 0.07510502968995705,
"grad_norm": 2.00802423495143,
"learning_rate": 1.28e-06,
"loss": 1.2204778671264649,
"num_input_tokens_seen": 64218216,
"step": 320,
"token_acc": 0.6881087345222366
},
{
"epoch": 0.07745206186776821,
"grad_norm": 2.136483656358153,
"learning_rate": 1.32e-06,
"loss": 1.1911478996276856,
"num_input_tokens_seen": 66217860,
"step": 330,
"token_acc": 0.6964124357320138
},
{
"epoch": 0.07979909404557936,
"grad_norm": 1.7796738858497867,
"learning_rate": 1.3600000000000001e-06,
"loss": 1.195077896118164,
"num_input_tokens_seen": 68222130,
"step": 340,
"token_acc": 0.6935684789950294
},
{
"epoch": 0.08214612622339053,
"grad_norm": 1.7117012000365959,
"learning_rate": 1.4e-06,
"loss": 1.2089216232299804,
"num_input_tokens_seen": 70209570,
"step": 350,
"token_acc": 0.6915523828674844
},
{
"epoch": 0.08449315840120168,
"grad_norm": 1.928181461412703,
"learning_rate": 1.44e-06,
"loss": 1.195500946044922,
"num_input_tokens_seen": 72228831,
"step": 360,
"token_acc": 0.6958758115748244
},
{
"epoch": 0.08684019057901284,
"grad_norm": 2.016364423213612,
"learning_rate": 1.48e-06,
"loss": 1.1975667953491211,
"num_input_tokens_seen": 74280357,
"step": 370,
"token_acc": 0.6944993196346585
},
{
"epoch": 0.089187222756824,
"grad_norm": 1.9269783210667364,
"learning_rate": 1.5199999999999998e-06,
"loss": 1.196579933166504,
"num_input_tokens_seen": 76255509,
"step": 380,
"token_acc": 0.693136319725866
},
{
"epoch": 0.09153425493463516,
"grad_norm": 2.276495563257121,
"learning_rate": 1.5599999999999999e-06,
"loss": 1.1727699279785155,
"num_input_tokens_seen": 78216720,
"step": 390,
"token_acc": 0.6980378317334839
},
{
"epoch": 0.09388128711244631,
"grad_norm": 2.665827226302004,
"learning_rate": 1.6e-06,
"loss": 1.2070913314819336,
"num_input_tokens_seen": 80187780,
"step": 400,
"token_acc": 0.6931723081009408
},
{
"epoch": 0.09388128711244631,
"eval_loss": 1.210001826286316,
"eval_runtime": 32.311,
"eval_samples_per_second": 30.949,
"eval_steps_per_second": 1.3,
"eval_token_acc": 0.693446596338958,
"num_input_tokens_seen": 80187780,
"step": 400
},
{
"epoch": 0.09622831929025746,
"grad_norm": 1.8020869110136488,
"learning_rate": 1.6399999999999998e-06,
"loss": 1.1979689598083496,
"num_input_tokens_seen": 82253211,
"step": 410,
"token_acc": 0.6943290418797176
},
{
"epoch": 0.09857535146806863,
"grad_norm": 1.613339482743251,
"learning_rate": 1.6799999999999998e-06,
"loss": 1.1750219345092774,
"num_input_tokens_seen": 84275082,
"step": 420,
"token_acc": 0.6988580180720491
},
{
"epoch": 0.10092238364587978,
"grad_norm": 2.0225577242890402,
"learning_rate": 1.7199999999999998e-06,
"loss": 1.160631275177002,
"num_input_tokens_seen": 86256174,
"step": 430,
"token_acc": 0.7049912003932076
},
{
"epoch": 0.10326941582369094,
"grad_norm": 2.0588425205195047,
"learning_rate": 1.7599999999999999e-06,
"loss": 1.155072021484375,
"num_input_tokens_seen": 88191771,
"step": 440,
"token_acc": 0.7027978727051616
},
{
"epoch": 0.1056164480015021,
"grad_norm": 1.750652589288128,
"learning_rate": 1.8e-06,
"loss": 1.1657937049865723,
"num_input_tokens_seen": 90280068,
"step": 450,
"token_acc": 0.70126095038482
},
{
"epoch": 0.10796348017931326,
"grad_norm": 1.6965579041737329,
"learning_rate": 1.84e-06,
"loss": 1.1403490066528321,
"num_input_tokens_seen": 92282058,
"step": 460,
"token_acc": 0.7044917775975158
},
{
"epoch": 0.11031051235712441,
"grad_norm": 1.8743542107195483,
"learning_rate": 1.8799999999999998e-06,
"loss": 1.1614572525024414,
"num_input_tokens_seen": 94268343,
"step": 470,
"token_acc": 0.7024686011260286
},
{
"epoch": 0.11265754453493558,
"grad_norm": 2.2378848353450693,
"learning_rate": 1.92e-06,
"loss": 1.1589451789855958,
"num_input_tokens_seen": 96220941,
"step": 480,
"token_acc": 0.7040593029694393
},
{
"epoch": 0.11500457671274673,
"grad_norm": 1.7219641168340587,
"learning_rate": 1.96e-06,
"loss": 1.139027214050293,
"num_input_tokens_seen": 98234790,
"step": 490,
"token_acc": 0.7069929196641098
},
{
"epoch": 0.11735160889055789,
"grad_norm": 1.7720161431115489,
"learning_rate": 2e-06,
"loss": 1.1347829818725585,
"num_input_tokens_seen": 100243815,
"step": 500,
"token_acc": 0.7060081282908567
},
{
"epoch": 0.11735160889055789,
"eval_loss": 1.1757478713989258,
"eval_runtime": 32.3883,
"eval_samples_per_second": 30.875,
"eval_steps_per_second": 1.297,
"eval_token_acc": 0.7003347106484153,
"num_input_tokens_seen": 100243815,
"step": 500
},
{
"epoch": 0.11969864106836904,
"grad_norm": 1.8210047186952776,
"learning_rate": 1.9999912270311373e-06,
"loss": 1.1792086601257323,
"num_input_tokens_seen": 102249078,
"step": 510,
"token_acc": 0.698590893627688
},
{
"epoch": 0.12204567324618021,
"grad_norm": 1.8609755736841171,
"learning_rate": 1.999964908278481e-06,
"loss": 1.1209921836853027,
"num_input_tokens_seen": 104220897,
"step": 520,
"token_acc": 0.7090112628579576
},
{
"epoch": 0.12439270542399136,
"grad_norm": 2.1446809333226584,
"learning_rate": 1.9999210442038163e-06,
"loss": 1.1469528198242187,
"num_input_tokens_seen": 106234191,
"step": 530,
"token_acc": 0.703947954006619
},
{
"epoch": 0.12673973760180252,
"grad_norm": 2.046893089210468,
"learning_rate": 1.9998596355767802e-06,
"loss": 1.1571426391601562,
"num_input_tokens_seen": 108272712,
"step": 540,
"token_acc": 0.7027365001081043
},
{
"epoch": 0.12908676977961367,
"grad_norm": 1.8591189017227578,
"learning_rate": 1.999780683474845e-06,
"loss": 1.1333347320556642,
"num_input_tokens_seen": 110241915,
"step": 550,
"token_acc": 0.7072802072223069
},
{
"epoch": 0.13143380195742482,
"grad_norm": 1.6591451063058131,
"learning_rate": 1.9996841892832997e-06,
"loss": 1.1434220314025878,
"num_input_tokens_seen": 112166943,
"step": 560,
"token_acc": 0.7056084295682411
},
{
"epoch": 0.133780834135236,
"grad_norm": 2.020864993257282,
"learning_rate": 1.999570154695225e-06,
"loss": 1.1571636199951172,
"num_input_tokens_seen": 114151494,
"step": 570,
"token_acc": 0.7044949720967205
},
{
"epoch": 0.13612786631304716,
"grad_norm": 2.064129107439252,
"learning_rate": 1.9994385817114644e-06,
"loss": 1.1311494827270507,
"num_input_tokens_seen": 116169552,
"step": 580,
"token_acc": 0.7063148017463998
},
{
"epoch": 0.1384748984908583,
"grad_norm": 2.0906868028581798,
"learning_rate": 1.999289472640589e-06,
"loss": 1.1150264739990234,
"num_input_tokens_seen": 118161789,
"step": 590,
"token_acc": 0.7104190105422314
},
{
"epoch": 0.14082193066866946,
"grad_norm": 1.783399118737723,
"learning_rate": 1.999122830098858e-06,
"loss": 1.14277925491333,
"num_input_tokens_seen": 120188337,
"step": 600,
"token_acc": 0.7054239286277058
},
{
"epoch": 0.14082193066866946,
"eval_loss": 1.150290846824646,
"eval_runtime": 32.9507,
"eval_samples_per_second": 30.348,
"eval_steps_per_second": 1.275,
"eval_token_acc": 0.7054176958057293,
"num_input_tokens_seen": 120188337,
"step": 600
},
{
"epoch": 0.14316896284648062,
"grad_norm": 2.087850842021689,
"learning_rate": 1.998938657010171e-06,
"loss": 1.1017154693603515,
"num_input_tokens_seen": 122187903,
"step": 610,
"token_acc": 0.7150525542709177
},
{
"epoch": 0.14551599502429177,
"grad_norm": 1.941074004275762,
"learning_rate": 1.9987369566060176e-06,
"loss": 1.0946624755859375,
"num_input_tokens_seen": 124171368,
"step": 620,
"token_acc": 0.7163183324905894
},
{
"epoch": 0.14786302720210295,
"grad_norm": 2.176865017774056,
"learning_rate": 1.9985177324254197e-06,
"loss": 1.1165874481201172,
"num_input_tokens_seen": 126183993,
"step": 630,
"token_acc": 0.7110827727359269
},
{
"epoch": 0.1502100593799141,
"grad_norm": 1.7953820815140804,
"learning_rate": 1.998280988314872e-06,
"loss": 1.1424741744995117,
"num_input_tokens_seen": 128176863,
"step": 640,
"token_acc": 0.7053827925519703
},
{
"epoch": 0.15255709155772526,
"grad_norm": 1.8929944794579523,
"learning_rate": 1.9980267284282714e-06,
"loss": 1.1028331756591796,
"num_input_tokens_seen": 130125408,
"step": 650,
"token_acc": 0.7125094339622642
},
{
"epoch": 0.15490412373553641,
"grad_norm": 1.691459310367227,
"learning_rate": 1.9977549572268466e-06,
"loss": 1.107553482055664,
"num_input_tokens_seen": 132065343,
"step": 660,
"token_acc": 0.7138998256484975
},
{
"epoch": 0.15725115591334757,
"grad_norm": 2.1233419395787556,
"learning_rate": 1.9974656794790772e-06,
"loss": 1.1101640701293944,
"num_input_tokens_seen": 134090148,
"step": 670,
"token_acc": 0.713199782361379
},
{
"epoch": 0.15959818809115872,
"grad_norm": 1.7559578625602645,
"learning_rate": 1.997158900260614e-06,
"loss": 1.1094940185546875,
"num_input_tokens_seen": 136112988,
"step": 680,
"token_acc": 0.7122396887639626
},
{
"epoch": 0.16194522026896987,
"grad_norm": 1.7829226146649233,
"learning_rate": 1.9968346249541846e-06,
"loss": 1.117540168762207,
"num_input_tokens_seen": 138058629,
"step": 690,
"token_acc": 0.7106555900807559
},
{
"epoch": 0.16429225244678106,
"grad_norm": 2.1483315176659166,
"learning_rate": 1.9964928592495045e-06,
"loss": 1.0879833221435546,
"num_input_tokens_seen": 140078598,
"step": 700,
"token_acc": 0.7166827394425921
},
{
"epoch": 0.16429225244678106,
"eval_loss": 1.1313835382461548,
"eval_runtime": 32.334,
"eval_samples_per_second": 30.927,
"eval_steps_per_second": 1.299,
"eval_token_acc": 0.7089033032478474,
"num_input_tokens_seen": 140078598,
"step": 700
},
{
"epoch": 0.1666392846245922,
"grad_norm": 1.867471314482214,
"learning_rate": 1.9961336091431724e-06,
"loss": 1.1190789222717286,
"num_input_tokens_seen": 142099659,
"step": 710,
"token_acc": 0.712375749359721
},
{
"epoch": 0.16898631680240336,
"grad_norm": 1.9022337846097856,
"learning_rate": 1.995756880938569e-06,
"loss": 1.0825121879577637,
"num_input_tokens_seen": 144092310,
"step": 720,
"token_acc": 0.7172525783126845
},
{
"epoch": 0.17133334898021452,
"grad_norm": 1.9984437173736713,
"learning_rate": 1.9953626812457438e-06,
"loss": 1.095411491394043,
"num_input_tokens_seen": 146064039,
"step": 730,
"token_acc": 0.714463713054313
},
{
"epoch": 0.17368038115802567,
"grad_norm": 2.1447202509234304,
"learning_rate": 1.9949510169813e-06,
"loss": 1.1152179718017579,
"num_input_tokens_seen": 148112049,
"step": 740,
"token_acc": 0.712060909164676
},
{
"epoch": 0.17602741333583682,
"grad_norm": 1.6936993245361356,
"learning_rate": 1.994521895368273e-06,
"loss": 1.0852348327636718,
"num_input_tokens_seen": 150133974,
"step": 750,
"token_acc": 0.7162799236018076
},
{
"epoch": 0.178374445513648,
"grad_norm": 2.352601598144833,
"learning_rate": 1.9940753239360045e-06,
"loss": 1.1107561111450195,
"num_input_tokens_seen": 152099280,
"step": 760,
"token_acc": 0.7127718906860011
},
{
"epoch": 0.18072147769145916,
"grad_norm": 1.84112693117569,
"learning_rate": 1.9936113105200084e-06,
"loss": 1.110912036895752,
"num_input_tokens_seen": 154146792,
"step": 770,
"token_acc": 0.7112778436268925
},
{
"epoch": 0.1830685098692703,
"grad_norm": 2.3592228692729367,
"learning_rate": 1.9931298632618353e-06,
"loss": 1.127957820892334,
"num_input_tokens_seen": 156087093,
"step": 780,
"token_acc": 0.7073941119432238
},
{
"epoch": 0.18541554204708147,
"grad_norm": 1.8453081635946817,
"learning_rate": 1.9926309906089288e-06,
"loss": 1.0826932907104492,
"num_input_tokens_seen": 158083548,
"step": 790,
"token_acc": 0.7176969639197369
},
{
"epoch": 0.18776257422489262,
"grad_norm": 1.6598465812647105,
"learning_rate": 1.9921147013144777e-06,
"loss": 1.097795295715332,
"num_input_tokens_seen": 160083087,
"step": 800,
"token_acc": 0.712001722391892
},
{
"epoch": 0.18776257422489262,
"eval_loss": 1.1155238151550293,
"eval_runtime": 32.4633,
"eval_samples_per_second": 30.804,
"eval_steps_per_second": 1.294,
"eval_token_acc": 0.712074975185245,
"num_input_tokens_seen": 160083087,
"step": 800
},
{
"epoch": 0.19010960640270377,
"grad_norm": 2.345143984991418,
"learning_rate": 1.9915810044372615e-06,
"loss": 1.0773065567016602,
"num_input_tokens_seen": 162043827,
"step": 810,
"token_acc": 0.7185854363462685
},
{
"epoch": 0.19245663858051493,
"grad_norm": 1.6218625881025774,
"learning_rate": 1.991029909341493e-06,
"loss": 1.1322909355163575,
"num_input_tokens_seen": 164065197,
"step": 820,
"token_acc": 0.7112101172756877
},
{
"epoch": 0.1948036707583261,
"grad_norm": 3.4128089423104204,
"learning_rate": 1.990461425696651e-06,
"loss": 1.1018625259399415,
"num_input_tokens_seen": 166095825,
"step": 830,
"token_acc": 0.7132049834650468
},
{
"epoch": 0.19715070293613726,
"grad_norm": 3.8983014033715273,
"learning_rate": 1.9898755634773155e-06,
"loss": 1.092278289794922,
"num_input_tokens_seen": 168127596,
"step": 840,
"token_acc": 0.7165934113928826
},
{
"epoch": 0.19949773511394842,
"grad_norm": 1.7080322418933676,
"learning_rate": 1.9892723329629885e-06,
"loss": 1.0770910263061524,
"num_input_tokens_seen": 170112078,
"step": 850,
"token_acc": 0.7174151496405977
},
{
"epoch": 0.20184476729175957,
"grad_norm": 1.6399516756726806,
"learning_rate": 1.988651744737914e-06,
"loss": 1.119683837890625,
"num_input_tokens_seen": 172089120,
"step": 860,
"token_acc": 0.7093057553740301
},
{
"epoch": 0.20419179946957072,
"grad_norm": 1.9211847623415963,
"learning_rate": 1.988013809690895e-06,
"loss": 1.0811002731323243,
"num_input_tokens_seen": 174102978,
"step": 870,
"token_acc": 0.7170278749197704
},
{
"epoch": 0.20653883164738188,
"grad_norm": 1.7860065176012982,
"learning_rate": 1.9873585390151003e-06,
"loss": 1.0824663162231445,
"num_input_tokens_seen": 176106354,
"step": 880,
"token_acc": 0.7187242752799151
},
{
"epoch": 0.20888586382519303,
"grad_norm": 6.212586622472415,
"learning_rate": 1.986685944207868e-06,
"loss": 1.0738523483276368,
"num_input_tokens_seen": 178098096,
"step": 890,
"token_acc": 0.7199481706694962
},
{
"epoch": 0.2112328960030042,
"grad_norm": 1.7634279436109257,
"learning_rate": 1.985996037070505e-06,
"loss": 1.0606145858764648,
"num_input_tokens_seen": 180140916,
"step": 900,
"token_acc": 0.7212711540534449
},
{
"epoch": 0.2112328960030042,
"eval_loss": 1.102053165435791,
"eval_runtime": 32.763,
"eval_samples_per_second": 30.522,
"eval_steps_per_second": 1.282,
"eval_token_acc": 0.7141986565407077,
"num_input_tokens_seen": 180140916,
"step": 900
},
{
"epoch": 0.21357992818081536,
"grad_norm": 1.927927016491826,
"learning_rate": 1.9852888297080784e-06,
"loss": 1.0789798736572265,
"num_input_tokens_seen": 182134725,
"step": 910,
"token_acc": 0.7180467099845159
},
{
"epoch": 0.21592696035862652,
"grad_norm": 1.633422631466873,
"learning_rate": 1.9845643345292055e-06,
"loss": 1.075742530822754,
"num_input_tokens_seen": 184161738,
"step": 920,
"token_acc": 0.719577260000721
},
{
"epoch": 0.21827399253643767,
"grad_norm": 1.763375240928624,
"learning_rate": 1.9838225642458328e-06,
"loss": 1.0633999824523925,
"num_input_tokens_seen": 186250896,
"step": 930,
"token_acc": 0.7216072711554525
},
{
"epoch": 0.22062102471424883,
"grad_norm": 1.578730561244102,
"learning_rate": 1.9830635318730153e-06,
"loss": 1.0807870864868163,
"num_input_tokens_seen": 188240646,
"step": 940,
"token_acc": 0.719998073905838
},
{
"epoch": 0.22296805689205998,
"grad_norm": 1.9778473417464,
"learning_rate": 1.9822872507286887e-06,
"loss": 1.0958086013793946,
"num_input_tokens_seen": 190240614,
"step": 950,
"token_acc": 0.715133457837701
},
{
"epoch": 0.22531508906987116,
"grad_norm": 1.7070736536906375,
"learning_rate": 1.9814937344334326e-06,
"loss": 1.083117961883545,
"num_input_tokens_seen": 192202005,
"step": 960,
"token_acc": 0.718299042165819
},
{
"epoch": 0.22766212124768231,
"grad_norm": 1.6694843702106625,
"learning_rate": 1.9806829969102353e-06,
"loss": 1.0489460945129394,
"num_input_tokens_seen": 194152464,
"step": 970,
"token_acc": 0.7243972802430247
},
{
"epoch": 0.23000915342549347,
"grad_norm": 1.6802225185406368,
"learning_rate": 1.9798550523842466e-06,
"loss": 1.055472183227539,
"num_input_tokens_seen": 196146252,
"step": 980,
"token_acc": 0.7222500499869107
},
{
"epoch": 0.23235618560330462,
"grad_norm": 1.586112316988885,
"learning_rate": 1.9790099153825295e-06,
"loss": 1.0688490867614746,
"num_input_tokens_seen": 198216198,
"step": 990,
"token_acc": 0.721910041723649
},
{
"epoch": 0.23470321778111577,
"grad_norm": 1.9657681362344652,
"learning_rate": 1.9781476007338054e-06,
"loss": 1.0997188568115235,
"num_input_tokens_seen": 200266242,
"step": 1000,
"token_acc": 0.7134316006040672
},
{
"epoch": 0.23470321778111577,
"eval_loss": 1.0927079916000366,
"eval_runtime": 32.3958,
"eval_samples_per_second": 30.868,
"eval_steps_per_second": 1.296,
"eval_token_acc": 0.7168070912490478,
"num_input_tokens_seen": 200266242,
"step": 1000
},
{
"epoch": 0.23705024995892693,
"grad_norm": 2.279451886611171,
"learning_rate": 1.9772681235681933e-06,
"loss": 1.0306278228759767,
"num_input_tokens_seen": 202268343,
"step": 1010,
"token_acc": 0.7296832940863017
},
{
"epoch": 0.23939728213673808,
"grad_norm": 1.7883291650438458,
"learning_rate": 1.976371499316945e-06,
"loss": 1.0757831573486327,
"num_input_tokens_seen": 204289632,
"step": 1020,
"token_acc": 0.7182072037465692
},
{
"epoch": 0.24174431431454926,
"grad_norm": 1.8342347796963645,
"learning_rate": 1.975457743712173e-06,
"loss": 1.0590785980224608,
"num_input_tokens_seen": 206327745,
"step": 1030,
"token_acc": 0.7225627285705905
},
{
"epoch": 0.24409134649236042,
"grad_norm": 1.762378045102792,
"learning_rate": 1.974526872786577e-06,
"loss": 1.0789016723632812,
"num_input_tokens_seen": 208322556,
"step": 1040,
"token_acc": 0.7185882266690018
},
{
"epoch": 0.24643837867017157,
"grad_norm": 1.7642619697840807,
"learning_rate": 1.97357890287316e-06,
"loss": 1.090459442138672,
"num_input_tokens_seen": 210345396,
"step": 1050,
"token_acc": 0.715633342030789
},
{
"epoch": 0.24878541084798272,
"grad_norm": 1.8062010829609079,
"learning_rate": 1.9726138506049433e-06,
"loss": 1.0327832221984863,
"num_input_tokens_seen": 212289177,
"step": 1060,
"token_acc": 0.728890125802145
},
{
"epoch": 0.2511324430257939,
"grad_norm": 1.6741852997103905,
"learning_rate": 1.971631732914674e-06,
"loss": 1.0438125610351563,
"num_input_tokens_seen": 214294110,
"step": 1070,
"token_acc": 0.7274771422710105
},
{
"epoch": 0.25347947520360503,
"grad_norm": 1.8889183202576878,
"learning_rate": 1.970632567034527e-06,
"loss": 1.0874737739562987,
"num_input_tokens_seen": 216250632,
"step": 1080,
"token_acc": 0.7169543090609345
},
{
"epoch": 0.2558265073814162,
"grad_norm": 1.768581214259287,
"learning_rate": 1.9696163704958057e-06,
"loss": 1.0529390335083009,
"num_input_tokens_seen": 218235084,
"step": 1090,
"token_acc": 0.7233062911737727
},
{
"epoch": 0.25817353955922734,
"grad_norm": 1.6728742294003298,
"learning_rate": 1.968583161128631e-06,
"loss": 1.0434741973876953,
"num_input_tokens_seen": 220250775,
"step": 1100,
"token_acc": 0.72555486645587
},
{
"epoch": 0.25817353955922734,
"eval_loss": 1.0830632448196411,
"eval_runtime": 32.7745,
"eval_samples_per_second": 30.512,
"eval_steps_per_second": 1.281,
"eval_token_acc": 0.7187414879619584,
"num_input_tokens_seen": 220250775,
"step": 1100
},
{
"epoch": 0.2605205717370385,
"grad_norm": 3.5179698506650827,
"learning_rate": 1.9675329570616295e-06,
"loss": 1.036564826965332,
"num_input_tokens_seen": 222248643,
"step": 1110,
"token_acc": 0.7253935790918138
},
{
"epoch": 0.26286760391484965,
"grad_norm": 1.7194590974245725,
"learning_rate": 1.9664657767216175e-06,
"loss": 1.034214401245117,
"num_input_tokens_seen": 224176074,
"step": 1120,
"token_acc": 0.731699968385116
},
{
"epoch": 0.2652146360926608,
"grad_norm": 1.6151699401355315,
"learning_rate": 1.9653816388332737e-06,
"loss": 1.0186534881591798,
"num_input_tokens_seen": 226256241,
"step": 1130,
"token_acc": 0.729031512194937
},
{
"epoch": 0.267561668270472,
"grad_norm": 1.915048663566233,
"learning_rate": 1.9642805624188146e-06,
"loss": 1.0460872650146484,
"num_input_tokens_seen": 228227991,
"step": 1140,
"token_acc": 0.7245494456551131
},
{
"epoch": 0.26990870044828313,
"grad_norm": 2.3808335250565387,
"learning_rate": 1.963162566797658e-06,
"loss": 1.0558183670043946,
"num_input_tokens_seen": 230254347,
"step": 1150,
"token_acc": 0.7232573802936575
},
{
"epoch": 0.2722557326260943,
"grad_norm": 1.7367289249419906,
"learning_rate": 1.962027671586086e-06,
"loss": 1.050713062286377,
"num_input_tokens_seen": 232285218,
"step": 1160,
"token_acc": 0.7248766799700481
},
{
"epoch": 0.27460276480390544,
"grad_norm": 1.8903258230442381,
"learning_rate": 1.9608758966968984e-06,
"loss": 1.0442859649658203,
"num_input_tokens_seen": 234350787,
"step": 1170,
"token_acc": 0.7246446168983565
},
{
"epoch": 0.2769497969817166,
"grad_norm": 2.0858660720659064,
"learning_rate": 1.959707262339067e-06,
"loss": 1.0628435134887695,
"num_input_tokens_seen": 236401623,
"step": 1180,
"token_acc": 0.7223079815551465
},
{
"epoch": 0.2792968291595278,
"grad_norm": 2.977405059549,
"learning_rate": 1.9585217890173757e-06,
"loss": 1.0738126754760742,
"num_input_tokens_seen": 238361190,
"step": 1190,
"token_acc": 0.7190871093733786
},
{
"epoch": 0.28164386133733893,
"grad_norm": 2.283563309099777,
"learning_rate": 1.957319497532067e-06,
"loss": 1.0180787086486816,
"num_input_tokens_seen": 240437730,
"step": 1200,
"token_acc": 0.7330017297652685
},
{
"epoch": 0.28164386133733893,
"eval_loss": 1.0745400190353394,
"eval_runtime": 32.4066,
"eval_samples_per_second": 30.858,
"eval_steps_per_second": 1.296,
"eval_token_acc": 0.7201588144317999,
"num_input_tokens_seen": 240437730,
"step": 1200
},
{
"epoch": 0.2839908935151501,
"grad_norm": 1.9270515119564795,
"learning_rate": 1.956100408978472e-06,
"loss": 1.0345954895019531,
"num_input_tokens_seen": 242382708,
"step": 1210,
"token_acc": 0.7277172037115998
},
{
"epoch": 0.28633792569296124,
"grad_norm": 1.5733358413499778,
"learning_rate": 1.954864544746643e-06,
"loss": 1.0476463317871094,
"num_input_tokens_seen": 244350303,
"step": 1220,
"token_acc": 0.7255343803753794
},
{
"epoch": 0.2886849578707724,
"grad_norm": 2.0867528996051345,
"learning_rate": 1.9536119265209757e-06,
"loss": 1.0576335906982421,
"num_input_tokens_seen": 246334116,
"step": 1230,
"token_acc": 0.7241534895699202
},
{
"epoch": 0.29103199004858354,
"grad_norm": 1.587056177259835,
"learning_rate": 1.952342576279833e-06,
"loss": 1.0451471328735351,
"num_input_tokens_seen": 248362662,
"step": 1240,
"token_acc": 0.7264873056477157
},
{
"epoch": 0.2933790222263947,
"grad_norm": 3.147776681472526,
"learning_rate": 1.9510565162951534e-06,
"loss": 1.0531164169311524,
"num_input_tokens_seen": 250326474,
"step": 1250,
"token_acc": 0.7241427379495411
},
{
"epoch": 0.2957260544042059,
"grad_norm": 1.677213988705626,
"learning_rate": 1.9497537691320667e-06,
"loss": 1.0469918251037598,
"num_input_tokens_seen": 252382641,
"step": 1260,
"token_acc": 0.7247498649880667
},
{
"epoch": 0.29807308658201703,
"grad_norm": 3.244921913867558,
"learning_rate": 1.9484343576484934e-06,
"loss": 1.0731307983398437,
"num_input_tokens_seen": 254380842,
"step": 1270,
"token_acc": 0.7198529707146587
},
{
"epoch": 0.3004201187598282,
"grad_norm": 2.2715118534896424,
"learning_rate": 1.9470983049947442e-06,
"loss": 1.0327179908752442,
"num_input_tokens_seen": 256367745,
"step": 1280,
"token_acc": 0.7273322442040123
},
{
"epoch": 0.30276715093763934,
"grad_norm": 2.388511262608066,
"learning_rate": 1.9457456346131168e-06,
"loss": 1.0295280456542968,
"num_input_tokens_seen": 258362418,
"step": 1290,
"token_acc": 0.7289352257814815
},
{
"epoch": 0.3051141831154505,
"grad_norm": 2.284896449465709,
"learning_rate": 1.944376370237481e-06,
"loss": 1.0356334686279296,
"num_input_tokens_seen": 260389752,
"step": 1300,
"token_acc": 0.7264502277424404
},
{
"epoch": 0.3051141831154505,
"eval_loss": 1.0682131052017212,
"eval_runtime": 32.3728,
"eval_samples_per_second": 30.89,
"eval_steps_per_second": 1.297,
"eval_token_acc": 0.7214607234366704,
"num_input_tokens_seen": 260389752,
"step": 1300
},
{
"epoch": 0.30746121529326165,
"grad_norm": 1.8462096822584517,
"learning_rate": 1.9429905358928646e-06,
"loss": 1.0431997299194335,
"num_input_tokens_seen": 262425369,
"step": 1310,
"token_acc": 0.7247579875646393
},
{
"epoch": 0.30980824747107283,
"grad_norm": 2.7288254092061286,
"learning_rate": 1.94158815589503e-06,
"loss": 1.03179931640625,
"num_input_tokens_seen": 264478839,
"step": 1320,
"token_acc": 0.7273030599423818
},
{
"epoch": 0.312155279648884,
"grad_norm": 2.0483013477422563,
"learning_rate": 1.9401692548500502e-06,
"loss": 1.0194345474243165,
"num_input_tokens_seen": 266467188,
"step": 1330,
"token_acc": 0.7318709842049548
},
{
"epoch": 0.31450231182669514,
"grad_norm": 3.607937481626218,
"learning_rate": 1.938733857653874e-06,
"loss": 1.0359786987304687,
"num_input_tokens_seen": 268553511,
"step": 1340,
"token_acc": 0.7270260288085842
},
{
"epoch": 0.3168493440045063,
"grad_norm": 2.2908695328416244,
"learning_rate": 1.9372819894918914e-06,
"loss": 1.005875015258789,
"num_input_tokens_seen": 270556128,
"step": 1350,
"token_acc": 0.733425647272143
},
{
"epoch": 0.31919637618231744,
"grad_norm": 2.2530826851795576,
"learning_rate": 1.935813675838491e-06,
"loss": 1.0363348007202149,
"num_input_tokens_seen": 272585331,
"step": 1360,
"token_acc": 0.7270068150894993
},
{
"epoch": 0.3215434083601286,
"grad_norm": 1.6599911510535466,
"learning_rate": 1.934328942456612e-06,
"loss": 0.9922657012939453,
"num_input_tokens_seen": 274625832,
"step": 1370,
"token_acc": 0.7369969482933556
},
{
"epoch": 0.32389044053793975,
"grad_norm": 1.6571812543491504,
"learning_rate": 1.9328278153972946e-06,
"loss": 1.0838043212890625,
"num_input_tokens_seen": 276646638,
"step": 1380,
"token_acc": 0.7254781164111181
},
{
"epoch": 0.32623747271575093,
"grad_norm": 1.7846961468797993,
"learning_rate": 1.9313103209992204e-06,
"loss": 1.0071705818176269,
"num_input_tokens_seen": 278652339,
"step": 1390,
"token_acc": 0.733368638373526
},
{
"epoch": 0.3285845048935621,
"grad_norm": 2.1490918049490717,
"learning_rate": 1.929776485888251e-06,
"loss": 1.0504549026489258,
"num_input_tokens_seen": 280677636,
"step": 1400,
"token_acc": 0.72332943463746
},
{
"epoch": 0.3285845048935621,
"eval_loss": 1.061837077140808,
"eval_runtime": 32.7164,
"eval_samples_per_second": 30.566,
"eval_steps_per_second": 1.284,
"eval_token_acc": 0.7231458184252441,
"num_input_tokens_seen": 280677636,
"step": 1400
},
{
"epoch": 0.33093153707137324,
"grad_norm": 1.776580604562134,
"learning_rate": 1.928226336976963e-06,
"loss": 1.0266141891479492,
"num_input_tokens_seen": 282669069,
"step": 1410,
"token_acc": 0.7291277131940492
},
{
"epoch": 0.3332785692491844,
"grad_norm": 8.438214405501748,
"learning_rate": 1.926659901464172e-06,
"loss": 1.0292797088623047,
"num_input_tokens_seen": 284659779,
"step": 1420,
"token_acc": 0.7288078819771109
},
{
"epoch": 0.33562560142699555,
"grad_norm": 2.252060217551861,
"learning_rate": 1.925077206834458e-06,
"loss": 1.0228628158569335,
"num_input_tokens_seen": 286673274,
"step": 1430,
"token_acc": 0.7280252171611444
},
{
"epoch": 0.3379726336048067,
"grad_norm": 1.4651418770258904,
"learning_rate": 1.923478280857682e-06,
"loss": 1.0042032241821288,
"num_input_tokens_seen": 288677157,
"step": 1440,
"token_acc": 0.7343410272213868
},
{
"epoch": 0.34031966578261785,
"grad_norm": 1.6827171089675037,
"learning_rate": 1.9218631515885003e-06,
"loss": 1.0294583320617676,
"num_input_tokens_seen": 290678706,
"step": 1450,
"token_acc": 0.7304443621152334
},
{
"epoch": 0.34266669796042903,
"grad_norm": 1.7341043440646111,
"learning_rate": 1.9202318473658702e-06,
"loss": 0.9965463638305664,
"num_input_tokens_seen": 292647750,
"step": 1460,
"token_acc": 0.736443122122828
},
{
"epoch": 0.3450137301382402,
"grad_norm": 1.706569258628379,
"learning_rate": 1.918584396812554e-06,
"loss": 1.0162506103515625,
"num_input_tokens_seen": 294701517,
"step": 1470,
"token_acc": 0.7316330245383567
},
{
"epoch": 0.34736076231605134,
"grad_norm": 1.6208113959472872,
"learning_rate": 1.9169208288346163e-06,
"loss": 1.0112849235534669,
"num_input_tokens_seen": 296720586,
"step": 1480,
"token_acc": 0.732423183545091
},
{
"epoch": 0.3497077944938625,
"grad_norm": 1.7865465491021926,
"learning_rate": 1.9152411726209172e-06,
"loss": 1.0156356811523437,
"num_input_tokens_seen": 298684938,
"step": 1490,
"token_acc": 0.7308413793103449
},
{
"epoch": 0.35205482667167365,
"grad_norm": 2.059441241693384,
"learning_rate": 1.9135454576426007e-06,
"loss": 1.0275184631347656,
"num_input_tokens_seen": 300684201,
"step": 1500,
"token_acc": 0.730526369912453
},
{
"epoch": 0.35205482667167365,
"eval_loss": 1.0552641153335571,
"eval_runtime": 32.4705,
"eval_samples_per_second": 30.797,
"eval_steps_per_second": 1.293,
"eval_token_acc": 0.72383370651647,
"num_input_tokens_seen": 300684201,
"step": 1500
},
{
"epoch": 0.35440185884948483,
"grad_norm": 2.3565377610515594,
"learning_rate": 1.9118337136525756e-06,
"loss": 1.0185004234313966,
"num_input_tokens_seen": 302704359,
"step": 1510,
"token_acc": 0.7304355716162425
},
{
"epoch": 0.356748891027296,
"grad_norm": 14.877826986152865,
"learning_rate": 1.9101059706849955e-06,
"loss": 1.019582176208496,
"num_input_tokens_seen": 304651629,
"step": 1520,
"token_acc": 0.731234582403383
},
{
"epoch": 0.35909592320510714,
"grad_norm": 2.879334483584151,
"learning_rate": 1.908362259054731e-06,
"loss": 1.0251285552978515,
"num_input_tokens_seen": 306641097,
"step": 1530,
"token_acc": 0.7294201685316217
},
{
"epoch": 0.3614429553829183,
"grad_norm": 1.7887355243868148,
"learning_rate": 1.9066026093568377e-06,
"loss": 1.0157214164733888,
"num_input_tokens_seen": 308660178,
"step": 1540,
"token_acc": 0.7307293262997984
},
{
"epoch": 0.36378998756072944,
"grad_norm": 1.867513936920377,
"learning_rate": 1.9048270524660196e-06,
"loss": 1.0161379814147948,
"num_input_tokens_seen": 310777926,
"step": 1550,
"token_acc": 0.7304925609175636
},
{
"epoch": 0.3661370197385406,
"grad_norm": 6.302806843132354,
"learning_rate": 1.9030356195360873e-06,
"loss": 0.9866199493408203,
"num_input_tokens_seen": 312788916,
"step": 1560,
"token_acc": 0.7381302995035983
},
{
"epoch": 0.36848405191635175,
"grad_norm": 1.8930345198459555,
"learning_rate": 1.9012283419994113e-06,
"loss": 1.0311415672302247,
"num_input_tokens_seen": 314814855,
"step": 1570,
"token_acc": 0.7291705656140012
},
{
"epoch": 0.37083108409416293,
"grad_norm": 2.3487824750816646,
"learning_rate": 1.899405251566371e-06,
"loss": 1.0350725173950195,
"num_input_tokens_seen": 316867344,
"step": 1580,
"token_acc": 0.7278371704934657
},
{
"epoch": 0.3731781162719741,
"grad_norm": 2.0782965598493917,
"learning_rate": 1.8975663802247975e-06,
"loss": 1.0283987998962403,
"num_input_tokens_seen": 318871404,
"step": 1590,
"token_acc": 0.7280485561890748
},
{
"epoch": 0.37552514844978524,
"grad_norm": 2.8179476770543546,
"learning_rate": 1.8957117602394128e-06,
"loss": 1.027695655822754,
"num_input_tokens_seen": 320871228,
"step": 1600,
"token_acc": 0.7284322929815703
},
{
"epoch": 0.37552514844978524,
"eval_loss": 1.0503556728363037,
"eval_runtime": 32.4119,
"eval_samples_per_second": 30.853,
"eval_steps_per_second": 1.296,
"eval_token_acc": 0.7255742018882297,
"num_input_tokens_seen": 320871228,
"step": 1600
},
{
"epoch": 0.3778721806275964,
"grad_norm": 2.240496844348581,
"learning_rate": 1.8938414241512637e-06,
"loss": 1.0263992309570313,
"num_input_tokens_seen": 322930128,
"step": 1610,
"token_acc": 0.731757208141934
},
{
"epoch": 0.38021921280540755,
"grad_norm": 3.896191708778685,
"learning_rate": 1.8919554047771507e-06,
"loss": 1.0006643295288087,
"num_input_tokens_seen": 324982575,
"step": 1620,
"token_acc": 0.732137966433454
},
{
"epoch": 0.38256624498321873,
"grad_norm": 1.7935819973243883,
"learning_rate": 1.8900537352090523e-06,
"loss": 0.9882081985473633,
"num_input_tokens_seen": 326990898,
"step": 1630,
"token_acc": 0.7385387731711782
},
{
"epoch": 0.38491327716102985,
"grad_norm": 3.1640907355889496,
"learning_rate": 1.8881364488135445e-06,
"loss": 1.0018336296081543,
"num_input_tokens_seen": 329033799,
"step": 1640,
"token_acc": 0.7350213182627736
},
{
"epoch": 0.38726030933884104,
"grad_norm": 5.630791095478135,
"learning_rate": 1.8862035792312146e-06,
"loss": 0.9879220962524414,
"num_input_tokens_seen": 331067478,
"step": 1650,
"token_acc": 0.736295696568692
},
{
"epoch": 0.3896073415166522,
"grad_norm": 1.5905696004173981,
"learning_rate": 1.8842551603760723e-06,
"loss": 1.004323387145996,
"num_input_tokens_seen": 333089880,
"step": 1660,
"token_acc": 0.7334599037600028
},
{
"epoch": 0.39195437369446334,
"grad_norm": 43.2007654518171,
"learning_rate": 1.8822912264349532e-06,
"loss": 1.0126733779907227,
"num_input_tokens_seen": 335093103,
"step": 1670,
"token_acc": 0.7332479964381122
},
{
"epoch": 0.3943014058722745,
"grad_norm": 1.6733459020369337,
"learning_rate": 1.8803118118669202e-06,
"loss": 1.0368854522705078,
"num_input_tokens_seen": 337115598,
"step": 1680,
"token_acc": 0.7274540217150455
},
{
"epoch": 0.39664843805008565,
"grad_norm": 1.9876180817181506,
"learning_rate": 1.8783169514026577e-06,
"loss": 1.0030999183654785,
"num_input_tokens_seen": 339154959,
"step": 1690,
"token_acc": 0.7345074320050601
},
{
"epoch": 0.39899547022789683,
"grad_norm": 1.842434463603931,
"learning_rate": 1.8763066800438634e-06,
"loss": 0.9946871757507324,
"num_input_tokens_seen": 341186700,
"step": 1700,
"token_acc": 0.7359575477937458
},
{
"epoch": 0.39899547022789683,
"eval_loss": 1.0446056127548218,
"eval_runtime": 33.305,
"eval_samples_per_second": 30.026,
"eval_steps_per_second": 1.261,
"eval_token_acc": 0.7265437085939844,
"num_input_tokens_seen": 341186700,
"step": 1700
},
{
"epoch": 0.40134250240570796,
"grad_norm": 1.9481089599377517,
"learning_rate": 1.8742810330626335e-06,
"loss": 1.0056350708007813,
"num_input_tokens_seen": 343197345,
"step": 1710,
"token_acc": 0.7343789679900354
},
{
"epoch": 0.40368953458351914,
"grad_norm": 1.8925573831015579,
"learning_rate": 1.8722400460008437e-06,
"loss": 1.0299295425415038,
"num_input_tokens_seen": 345220860,
"step": 1720,
"token_acc": 0.727836675491576
},
{
"epoch": 0.4060365667613303,
"grad_norm": 1.568094384198171,
"learning_rate": 1.8701837546695256e-06,
"loss": 1.011802864074707,
"num_input_tokens_seen": 347269032,
"step": 1730,
"token_acc": 0.731503068944188
},
{
"epoch": 0.40838359893914145,
"grad_norm": 4.690102343755759,
"learning_rate": 1.8681121951482393e-06,
"loss": 1.0340707778930665,
"num_input_tokens_seen": 349265856,
"step": 1740,
"token_acc": 0.7287572174652813
},
{
"epoch": 0.4107306311169526,
"grad_norm": 2.0732894110715776,
"learning_rate": 1.8660254037844386e-06,
"loss": 1.0054452896118165,
"num_input_tokens_seen": 351220833,
"step": 1750,
"token_acc": 0.7349583487050085
},
{
"epoch": 0.41307766329476375,
"grad_norm": 4.563573246901434,
"learning_rate": 1.863923417192835e-06,
"loss": 0.9984481811523438,
"num_input_tokens_seen": 353217660,
"step": 1760,
"token_acc": 0.7346953872236972
},
{
"epoch": 0.41542469547257493,
"grad_norm": 1.8182323815552697,
"learning_rate": 1.861806272254755e-06,
"loss": 1.0026565551757813,
"num_input_tokens_seen": 355231713,
"step": 1770,
"token_acc": 0.734238520256768
},
{
"epoch": 0.41777172765038606,
"grad_norm": 2.3723528968369867,
"learning_rate": 1.859674006117491e-06,
"loss": 0.9838489532470703,
"num_input_tokens_seen": 357318357,
"step": 1780,
"token_acc": 0.7385274102305481
},
{
"epoch": 0.42011875982819724,
"grad_norm": 2.413365084744393,
"learning_rate": 1.8575266561936522e-06,
"loss": 1.0196653366088868,
"num_input_tokens_seen": 359351646,
"step": 1790,
"token_acc": 0.730992332131187
},
{
"epoch": 0.4224657920060084,
"grad_norm": 7.914722238930336,
"learning_rate": 1.8553642601605066e-06,
"loss": 0.9948186874389648,
"num_input_tokens_seen": 361303284,
"step": 1800,
"token_acc": 0.7360711800377772
},
{
"epoch": 0.4224657920060084,
"eval_loss": 1.038891315460205,
"eval_runtime": 32.4449,
"eval_samples_per_second": 30.821,
"eval_steps_per_second": 1.295,
"eval_token_acc": 0.7277024999422913,
"num_input_tokens_seen": 361303284,
"step": 1800
},
{
"epoch": 0.42481282418381955,
"grad_norm": 1.7384957796876852,
"learning_rate": 1.8531868559593203e-06,
"loss": 1.0075714111328125,
"num_input_tokens_seen": 363290772,
"step": 1810,
"token_acc": 0.7332521267838883
},
{
"epoch": 0.42715985636163073,
"grad_norm": 1.73396216177198,
"learning_rate": 1.850994481794692e-06,
"loss": 1.018679428100586,
"num_input_tokens_seen": 365299026,
"step": 1820,
"token_acc": 0.7299744624828494
},
{
"epoch": 0.42950688853944186,
"grad_norm": 1.859054699772832,
"learning_rate": 1.8487871761338819e-06,
"loss": 0.9975422859191895,
"num_input_tokens_seen": 367342086,
"step": 1830,
"token_acc": 0.735841141099147
},
{
"epoch": 0.43185392071725304,
"grad_norm": 1.6167732458245692,
"learning_rate": 1.8465649777061376e-06,
"loss": 1.0366539001464843,
"num_input_tokens_seen": 369276633,
"step": 1840,
"token_acc": 0.7277804414793901
},
{
"epoch": 0.4342009528950642,
"grad_norm": 2.534040309718505,
"learning_rate": 1.844327925502015e-06,
"loss": 1.0096059799194337,
"num_input_tokens_seen": 371265615,
"step": 1850,
"token_acc": 0.7326266219047257
},
{
"epoch": 0.43654798507287534,
"grad_norm": 1.9228862468394357,
"learning_rate": 1.8420760587726921e-06,
"loss": 1.0271913528442382,
"num_input_tokens_seen": 373272270,
"step": 1860,
"token_acc": 0.7302226164565024
},
{
"epoch": 0.4388950172506865,
"grad_norm": 1.5025282734361622,
"learning_rate": 1.8398094170292829e-06,
"loss": 1.0059158325195312,
"num_input_tokens_seen": 375279099,
"step": 1870,
"token_acc": 0.7330154465542768
},
{
"epoch": 0.44124204942849765,
"grad_norm": 4.754818039721933,
"learning_rate": 1.8375280400421418e-06,
"loss": 0.9967041969299316,
"num_input_tokens_seen": 377223396,
"step": 1880,
"token_acc": 0.7358239778762203
},
{
"epoch": 0.44358908160630883,
"grad_norm": 1.691685468916323,
"learning_rate": 1.8352319678401674e-06,
"loss": 0.999173927307129,
"num_input_tokens_seen": 379235661,
"step": 1890,
"token_acc": 0.7347835016672305
},
{
"epoch": 0.44593611378411996,
"grad_norm": 1.7737231328640157,
"learning_rate": 1.8329212407100993e-06,
"loss": 0.9919824600219727,
"num_input_tokens_seen": 381243486,
"step": 1900,
"token_acc": 0.7371798315515523
},
{
"epoch": 0.44593611378411996,
"eval_loss": 1.0355346202850342,
"eval_runtime": 32.2582,
"eval_samples_per_second": 31.0,
"eval_steps_per_second": 1.302,
"eval_token_acc": 0.7281641698021745,
"num_input_tokens_seen": 381243486,
"step": 1900
},
{
"epoch": 0.44828314596193114,
"grad_norm": 2.5554510353139115,
"learning_rate": 1.8305958991958126e-06,
"loss": 0.9984329223632813,
"num_input_tokens_seen": 383266650,
"step": 1910,
"token_acc": 0.7348018362631924
},
{
"epoch": 0.4506301781397423,
"grad_norm": 3.4304227222936854,
"learning_rate": 1.8282559840976042e-06,
"loss": 0.9989996910095215,
"num_input_tokens_seen": 385198056,
"step": 1920,
"token_acc": 0.7340237302248127
},
{
"epoch": 0.45297721031755345,
"grad_norm": 1.8203825695395843,
"learning_rate": 1.8259015364714785e-06,
"loss": 1.005854892730713,
"num_input_tokens_seen": 387174645,
"step": 1930,
"token_acc": 0.7344124724323412
},
{
"epoch": 0.45532424249536463,
"grad_norm": 2.3790186216357387,
"learning_rate": 1.8235325976284273e-06,
"loss": 1.0130582809448243,
"num_input_tokens_seen": 389123001,
"step": 1940,
"token_acc": 0.7329481871636396
},
{
"epoch": 0.45767127467317575,
"grad_norm": 2.2702679233421366,
"learning_rate": 1.821149209133704e-06,
"loss": 1.0077364921569825,
"num_input_tokens_seen": 391185051,
"step": 1950,
"token_acc": 0.7325617754275695
},
{
"epoch": 0.46001830685098694,
"grad_norm": 1.7113606013198168,
"learning_rate": 1.8187514128060944e-06,
"loss": 1.0020957946777345,
"num_input_tokens_seen": 393232749,
"step": 1960,
"token_acc": 0.7342212411181741
},
{
"epoch": 0.46236533902879806,
"grad_norm": 2.0134995821074524,
"learning_rate": 1.816339250717184e-06,
"loss": 0.9884714126586914,
"num_input_tokens_seen": 395240403,
"step": 1970,
"token_acc": 0.7366033551966206
},
{
"epoch": 0.46471237120660924,
"grad_norm": 3.624673089989278,
"learning_rate": 1.8139127651906181e-06,
"loss": 1.0036752700805665,
"num_input_tokens_seen": 397222695,
"step": 1980,
"token_acc": 0.7327492557949239
},
{
"epoch": 0.4670594033844204,
"grad_norm": 12.741541567504669,
"learning_rate": 1.811471998801361e-06,
"loss": 1.0088150024414062,
"num_input_tokens_seen": 399265515,
"step": 1990,
"token_acc": 0.7318671375057033
},
{
"epoch": 0.46940643556223155,
"grad_norm": 1.9147316254240543,
"learning_rate": 1.8090169943749474e-06,
"loss": 1.0098794937133788,
"num_input_tokens_seen": 401254572,
"step": 2000,
"token_acc": 0.7348985741915172
},
{
"epoch": 0.46940643556223155,
"eval_loss": 1.0299264192581177,
"eval_runtime": 32.8145,
"eval_samples_per_second": 30.474,
"eval_steps_per_second": 1.28,
"eval_token_acc": 0.7290736594261443,
"num_input_tokens_seen": 401254572,
"step": 2000
},
{
"epoch": 0.47175346774004273,
"grad_norm": 1.8961444721894498,
"learning_rate": 1.8065477949867325e-06,
"loss": 1.016146469116211,
"num_input_tokens_seen": 403296912,
"step": 2010,
"token_acc": 0.7310783889798314
},
{
"epoch": 0.47410049991785386,
"grad_norm": 1.5674703012341533,
"learning_rate": 1.8040644439611345e-06,
"loss": 1.0078514099121094,
"num_input_tokens_seen": 405292185,
"step": 2020,
"token_acc": 0.7319825043230597
},
{
"epoch": 0.47644753209566504,
"grad_norm": 1.9494898023759353,
"learning_rate": 1.8015669848708766e-06,
"loss": 1.0296178817749024,
"num_input_tokens_seen": 407303625,
"step": 2030,
"token_acc": 0.7293832613834421
},
{
"epoch": 0.47879456427347616,
"grad_norm": 10.968015568038117,
"learning_rate": 1.7990554615362197e-06,
"loss": 0.9932464599609375,
"num_input_tokens_seen": 409284657,
"step": 2040,
"token_acc": 0.7361576877608628
},
{
"epoch": 0.48114159645128735,
"grad_norm": 1.5634395112041464,
"learning_rate": 1.7965299180241961e-06,
"loss": 0.9930622100830078,
"num_input_tokens_seen": 411350526,
"step": 2050,
"token_acc": 0.7371341064431953
},
{
"epoch": 0.4834886286290985,
"grad_norm": 4.940871877481185,
"learning_rate": 1.7939903986478354e-06,
"loss": 0.9968077659606933,
"num_input_tokens_seen": 413329158,
"step": 2060,
"token_acc": 0.7364979106166089
},
{
"epoch": 0.48583566080690965,
"grad_norm": 1.6357352710651227,
"learning_rate": 1.7914369479653857e-06,
"loss": 1.0207565307617188,
"num_input_tokens_seen": 415301217,
"step": 2070,
"token_acc": 0.7303749705838948
},
{
"epoch": 0.48818269298472083,
"grad_norm": 2.246788650609953,
"learning_rate": 1.788869610779534e-06,
"loss": 1.00274658203125,
"num_input_tokens_seen": 417261702,
"step": 2080,
"token_acc": 0.7341963767701447
},
{
"epoch": 0.49052972516253196,
"grad_norm": 1.56745308904305,
"learning_rate": 1.7862884321366187e-06,
"loss": 1.0060449600219727,
"num_input_tokens_seen": 419262057,
"step": 2090,
"token_acc": 0.7324562018430577
},
{
"epoch": 0.49287675734034314,
"grad_norm": 1.7117337983013203,
"learning_rate": 1.7836934573258397e-06,
"loss": 0.9900275230407715,
"num_input_tokens_seen": 421246710,
"step": 2100,
"token_acc": 0.7372878593403012
},
{
"epoch": 0.49287675734034314,
"eval_loss": 1.027020812034607,
"eval_runtime": 32.799,
"eval_samples_per_second": 30.489,
"eval_steps_per_second": 1.281,
"eval_token_acc": 0.7296553634495971,
"num_input_tokens_seen": 421246710,
"step": 2100
},
{
"epoch": 0.49522378951815427,
"grad_norm": 1.5242891687227014,
"learning_rate": 1.781084731878463e-06,
"loss": 0.9901479721069336,
"num_input_tokens_seen": 423187323,
"step": 2110,
"token_acc": 0.7374922148637526
},
{
"epoch": 0.49757082169596545,
"grad_norm": 2.148393307418336,
"learning_rate": 1.7784623015670235e-06,
"loss": 0.9794765472412109,
"num_input_tokens_seen": 425214681,
"step": 2120,
"token_acc": 0.7396016635749383
},
{
"epoch": 0.49991785387377663,
"grad_norm": 1.6777795098531292,
"learning_rate": 1.7758262124045194e-06,
"loss": 1.0104660987854004,
"num_input_tokens_seen": 427125735,
"step": 2130,
"token_acc": 0.7328506355953969
},
{
"epoch": 0.5022648860515878,
"grad_norm": 1.8399011401453165,
"learning_rate": 1.7731765106436071e-06,
"loss": 0.9876059532165528,
"num_input_tokens_seen": 429143655,
"step": 2140,
"token_acc": 0.7383790968301517
},
{
"epoch": 0.5046119182293989,
"grad_norm": 3.2054794139242047,
"learning_rate": 1.7705132427757892e-06,
"loss": 1.003396987915039,
"num_input_tokens_seen": 431161200,
"step": 2150,
"token_acc": 0.7355545283928578
},
{
"epoch": 0.5069589504072101,
"grad_norm": 1.5550880678151673,
"learning_rate": 1.7678364555305976e-06,
"loss": 0.9901845932006836,
"num_input_tokens_seen": 433164327,
"step": 2160,
"token_acc": 0.7361521188091766
},
{
"epoch": 0.5093059825850212,
"grad_norm": 2.416552637489239,
"learning_rate": 1.7651461958747741e-06,
"loss": 1.0047142028808593,
"num_input_tokens_seen": 435216456,
"step": 2170,
"token_acc": 0.733555096342685
},
{
"epoch": 0.5116530147628324,
"grad_norm": 2.387719191103811,
"learning_rate": 1.7624425110114479e-06,
"loss": 1.0148651123046875,
"num_input_tokens_seen": 437206023,
"step": 2180,
"token_acc": 0.7325390238452453
},
{
"epoch": 0.5140000469406436,
"grad_norm": 1.481562163308891,
"learning_rate": 1.7597254483793048e-06,
"loss": 0.9734397888183594,
"num_input_tokens_seen": 439163631,
"step": 2190,
"token_acc": 0.7413863843737306
},
{
"epoch": 0.5163470791184547,
"grad_norm": 7.115442308152491,
"learning_rate": 1.7569950556517563e-06,
"loss": 1.019681167602539,
"num_input_tokens_seen": 441170622,
"step": 2200,
"token_acc": 0.7295540569410798
},
{
"epoch": 0.5163470791184547,
"eval_loss": 1.0215942859649658,
"eval_runtime": 32.4901,
"eval_samples_per_second": 30.779,
"eval_steps_per_second": 1.293,
"eval_token_acc": 0.7315389764779207,
"num_input_tokens_seen": 441170622,
"step": 2200
},
{
"epoch": 0.5186941112962659,
"grad_norm": 1.6355732837542087,
"learning_rate": 1.7542513807361037e-06,
"loss": 1.0146623611450196,
"num_input_tokens_seen": 443157417,
"step": 2210,
"token_acc": 0.7331868122856259
},
{
"epoch": 0.521041143474077,
"grad_norm": 1.5373557972963237,
"learning_rate": 1.7514944717726961e-06,
"loss": 0.996919822692871,
"num_input_tokens_seen": 445115421,
"step": 2220,
"token_acc": 0.7370941300202442
},
{
"epoch": 0.5233881756518882,
"grad_norm": 3.095727021967102,
"learning_rate": 1.748724377134086e-06,
"loss": 1.008862018585205,
"num_input_tokens_seen": 447113430,
"step": 2230,
"token_acc": 0.7321047500353728
},
{
"epoch": 0.5257352078296993,
"grad_norm": 1.6318669740450855,
"learning_rate": 1.7459411454241822e-06,
"loss": 1.0091367721557618,
"num_input_tokens_seen": 449067504,
"step": 2240,
"token_acc": 0.7306417201986045
},
{
"epoch": 0.5280822400075105,
"grad_norm": 1.8958429005632293,
"learning_rate": 1.743144825477394e-06,
"loss": 0.9806262016296386,
"num_input_tokens_seen": 451028514,
"step": 2250,
"token_acc": 0.7392674057301928
},
{
"epoch": 0.5304292721853217,
"grad_norm": 1.8300311325163234,
"learning_rate": 1.740335466357778e-06,
"loss": 0.9876058578491211,
"num_input_tokens_seen": 453088446,
"step": 2260,
"token_acc": 0.7375388829110828
},
{
"epoch": 0.5327763043631328,
"grad_norm": 1.6283939332628163,
"learning_rate": 1.737513117358174e-06,
"loss": 1.0128792762756347,
"num_input_tokens_seen": 455064009,
"step": 2270,
"token_acc": 0.7309403491726847
},
{
"epoch": 0.535123336540944,
"grad_norm": 1.7443727538000593,
"learning_rate": 1.7346778279993416e-06,
"loss": 1.0167512893676758,
"num_input_tokens_seen": 457049565,
"step": 2280,
"token_acc": 0.7327466353251444
},
{
"epoch": 0.5374703687187551,
"grad_norm": 2.318872931178241,
"learning_rate": 1.731829648029091e-06,
"loss": 0.9633228302001953,
"num_input_tokens_seen": 459050343,
"step": 2290,
"token_acc": 0.7410114142684382
},
{
"epoch": 0.5398174008965663,
"grad_norm": 1.5210715736947538,
"learning_rate": 1.7289686274214115e-06,
"loss": 0.9929851531982422,
"num_input_tokens_seen": 461049750,
"step": 2300,
"token_acc": 0.7357508251313404
},
{
"epoch": 0.5398174008965663,
"eval_loss": 1.0185507535934448,
"eval_runtime": 32.6195,
"eval_samples_per_second": 30.657,
"eval_steps_per_second": 1.288,
"eval_token_acc": 0.731474342697537,
"num_input_tokens_seen": 461049750,
"step": 2300
},
{
"epoch": 0.5421644330743774,
"grad_norm": 1.5749401648234354,
"learning_rate": 1.7260948163755917e-06,
"loss": 0.9968940734863281,
"num_input_tokens_seen": 462989622,
"step": 2310,
"token_acc": 0.7375997849195517
},
{
"epoch": 0.5445114652521886,
"grad_norm": 2.5312095421318928,
"learning_rate": 1.723208265315342e-06,
"loss": 0.9779894828796387,
"num_input_tokens_seen": 465006357,
"step": 2320,
"token_acc": 0.7394803638714152
},
{
"epoch": 0.5468584974299998,
"grad_norm": 3.2822780472953803,
"learning_rate": 1.720309024887907e-06,
"loss": 1.0032640457153321,
"num_input_tokens_seen": 467017005,
"step": 2330,
"token_acc": 0.7345803640542331
},
{
"epoch": 0.5492055296078109,
"grad_norm": 1.6687009392941055,
"learning_rate": 1.7173971459631787e-06,
"loss": 1.0077280044555663,
"num_input_tokens_seen": 468979461,
"step": 2340,
"token_acc": 0.7342930917761522
},
{
"epoch": 0.5515525617856221,
"grad_norm": 11.650174621954747,
"learning_rate": 1.7144726796328032e-06,
"loss": 0.9968754768371582,
"num_input_tokens_seen": 470994735,
"step": 2350,
"token_acc": 0.734416431505073
},
{
"epoch": 0.5538995939634332,
"grad_norm": 2.599642616517287,
"learning_rate": 1.7115356772092855e-06,
"loss": 1.0374162673950196,
"num_input_tokens_seen": 472979052,
"step": 2360,
"token_acc": 0.7287551723023211
},
{
"epoch": 0.5562466261412444,
"grad_norm": 2.7538705299088453,
"learning_rate": 1.7085861902250862e-06,
"loss": 1.0119436264038086,
"num_input_tokens_seen": 475016298,
"step": 2370,
"token_acc": 0.7321991702851346
},
{
"epoch": 0.5585936583190556,
"grad_norm": 2.3397709495881682,
"learning_rate": 1.7056242704317208e-06,
"loss": 0.9402626991271973,
"num_input_tokens_seen": 477109281,
"step": 2380,
"token_acc": 0.7490173941732094
},
{
"epoch": 0.5609406904968667,
"grad_norm": 1.879207656038821,
"learning_rate": 1.7026499697988492e-06,
"loss": 0.9886844635009766,
"num_input_tokens_seen": 479146713,
"step": 2390,
"token_acc": 0.7365850879725937
},
{
"epoch": 0.5632877226746779,
"grad_norm": 1.9704873763682087,
"learning_rate": 1.6996633405133653e-06,
"loss": 0.9943101882934571,
"num_input_tokens_seen": 481102911,
"step": 2400,
"token_acc": 0.7366662244187203
},
{
"epoch": 0.5632877226746779,
"eval_loss": 1.015251636505127,
"eval_runtime": 32.7961,
"eval_samples_per_second": 30.491,
"eval_steps_per_second": 1.281,
"eval_token_acc": 0.7320929803097804,
"num_input_tokens_seen": 481102911,
"step": 2400
},
{
"epoch": 0.565634754852489,
"grad_norm": 1.6632244132905207,
"learning_rate": 1.6966644349784808e-06,
"loss": 0.9883607864379883,
"num_input_tokens_seen": 483084549,
"step": 2410,
"token_acc": 0.7358879192027988
},
{
"epoch": 0.5679817870303002,
"grad_norm": 1.5330248452956106,
"learning_rate": 1.6936533058128049e-06,
"loss": 1.0042284965515136,
"num_input_tokens_seen": 485112228,
"step": 2420,
"token_acc": 0.7344426514994169
},
{
"epoch": 0.5703288192081113,
"grad_norm": 2.5405918981273867,
"learning_rate": 1.6906300058494227e-06,
"loss": 0.9880990982055664,
"num_input_tokens_seen": 487123020,
"step": 2430,
"token_acc": 0.7372175131700104
},
{
"epoch": 0.5726758513859225,
"grad_norm": 3.9012975042201297,
"learning_rate": 1.6875945881349673e-06,
"loss": 0.9801074981689453,
"num_input_tokens_seen": 489120441,
"step": 2440,
"token_acc": 0.7381837376558823
},
{
"epoch": 0.5750228835637337,
"grad_norm": 1.6637494968221076,
"learning_rate": 1.6845471059286886e-06,
"loss": 1.0021610260009766,
"num_input_tokens_seen": 491066049,
"step": 2450,
"token_acc": 0.7346050699774175
},
{
"epoch": 0.5773699157415448,
"grad_norm": 1.652438429477013,
"learning_rate": 1.6814876127015198e-06,
"loss": 0.9841398239135742,
"num_input_tokens_seen": 493112928,
"step": 2460,
"token_acc": 0.7378321905180247
},
{
"epoch": 0.579716947919356,
"grad_norm": 3.543309593586376,
"learning_rate": 1.678416162135138e-06,
"loss": 0.979088020324707,
"num_input_tokens_seen": 495119139,
"step": 2470,
"token_acc": 0.7399358154268393
},
{
"epoch": 0.5820639800971671,
"grad_norm": 2.893410134875752,
"learning_rate": 1.6753328081210244e-06,
"loss": 0.9998300552368165,
"num_input_tokens_seen": 497115090,
"step": 2480,
"token_acc": 0.7359860001129023
},
{
"epoch": 0.5844110122749783,
"grad_norm": 1.9583144196315403,
"learning_rate": 1.6722376047595161e-06,
"loss": 0.9970391273498536,
"num_input_tokens_seen": 499168851,
"step": 2490,
"token_acc": 0.7355328073638283
},
{
"epoch": 0.5867580444527895,
"grad_norm": 5.903330257525673,
"learning_rate": 1.669130606358858e-06,
"loss": 1.0149246215820313,
"num_input_tokens_seen": 501138603,
"step": 2500,
"token_acc": 0.7320385426697377
},
{
"epoch": 0.5867580444527895,
"eval_loss": 1.0123026371002197,
"eval_runtime": 32.7432,
"eval_samples_per_second": 30.541,
"eval_steps_per_second": 1.283,
"eval_token_acc": 0.7328916691673784,
"num_input_tokens_seen": 501138603,
"step": 2500
},
{
"epoch": 0.5891050766306006,
"grad_norm": 1.919536213839018,
"learning_rate": 1.6660118674342515e-06,
"loss": 0.9900060653686523,
"num_input_tokens_seen": 503184900,
"step": 2510,
"token_acc": 0.7371078337925816
},
{
"epoch": 0.5914521088084118,
"grad_norm": 3.4511789649891966,
"learning_rate": 1.6628814427068952e-06,
"loss": 0.9589821815490722,
"num_input_tokens_seen": 505223106,
"step": 2520,
"token_acc": 0.7453759303446423
},
{
"epoch": 0.5937991409862229,
"grad_norm": 3.9395749071950554,
"learning_rate": 1.6597393871030261e-06,
"loss": 0.9944395065307617,
"num_input_tokens_seen": 507246369,
"step": 2530,
"token_acc": 0.7347724854980832
},
{
"epoch": 0.5961461731640341,
"grad_norm": 1.5397013326592903,
"learning_rate": 1.6565857557529564e-06,
"loss": 0.9756797790527344,
"num_input_tokens_seen": 509308893,
"step": 2540,
"token_acc": 0.7391703562324037
},
{
"epoch": 0.5984932053418452,
"grad_norm": 1.7526411604347196,
"learning_rate": 1.6534206039901055e-06,
"loss": 0.9834499359130859,
"num_input_tokens_seen": 511244184,
"step": 2550,
"token_acc": 0.7380458487339893
},
{
"epoch": 0.6008402375196564,
"grad_norm": 2.2921640319260024,
"learning_rate": 1.6502439873500286e-06,
"loss": 1.0054790496826171,
"num_input_tokens_seen": 513290352,
"step": 2560,
"token_acc": 0.734738491502126
},
{
"epoch": 0.6031872696974675,
"grad_norm": 1.9064014496743276,
"learning_rate": 1.6470559615694445e-06,
"loss": 0.9771562576293945,
"num_input_tokens_seen": 515276862,
"step": 2570,
"token_acc": 0.7392910978769869
},
{
"epoch": 0.6055343018752787,
"grad_norm": 2.0609613172670764,
"learning_rate": 1.6438565825852537e-06,
"loss": 0.9563516616821289,
"num_input_tokens_seen": 517288296,
"step": 2580,
"token_acc": 0.744728798321846
},
{
"epoch": 0.6078813340530899,
"grad_norm": 1.7302019611107595,
"learning_rate": 1.6406459065335614e-06,
"loss": 0.9771955490112305,
"num_input_tokens_seen": 519254622,
"step": 2590,
"token_acc": 0.740443198920546
},
{
"epoch": 0.610228366230901,
"grad_norm": 1.8178684355141148,
"learning_rate": 1.6374239897486897e-06,
"loss": 0.9703773498535156,
"num_input_tokens_seen": 521236017,
"step": 2600,
"token_acc": 0.7407382220106489
},
{
"epoch": 0.610228366230901,
"eval_loss": 1.0093790292739868,
"eval_runtime": 32.6088,
"eval_samples_per_second": 30.667,
"eval_steps_per_second": 1.288,
"eval_token_acc": 0.7333256388356686,
"num_input_tokens_seen": 521236017,
"step": 2600
},
{
"epoch": 0.6125753984087122,
"grad_norm": 1.8585384534519827,
"learning_rate": 1.6341908887621894e-06,
"loss": 0.9817310333251953,
"num_input_tokens_seen": 523212513,
"step": 2610,
"token_acc": 0.738175322879972
},
{
"epoch": 0.6149224305865233,
"grad_norm": 2.4244702161030625,
"learning_rate": 1.6309466603018495e-06,
"loss": 0.9609703063964844,
"num_input_tokens_seen": 525216327,
"step": 2620,
"token_acc": 0.7439178110371839
},
{
"epoch": 0.6172694627643345,
"grad_norm": 1.638774547412265,
"learning_rate": 1.6276913612907004e-06,
"loss": 0.9597613334655761,
"num_input_tokens_seen": 527198007,
"step": 2630,
"token_acc": 0.7433998992304688
},
{
"epoch": 0.6196164949421457,
"grad_norm": 1.8052959287052057,
"learning_rate": 1.6244250488460155e-06,
"loss": 0.9595340728759766,
"num_input_tokens_seen": 529328826,
"step": 2640,
"token_acc": 0.7424487405247924
},
{
"epoch": 0.6219635271199568,
"grad_norm": 2.9059084324443987,
"learning_rate": 1.6211477802783102e-06,
"loss": 0.9733432769775391,
"num_input_tokens_seen": 531353727,
"step": 2650,
"token_acc": 0.7391637709236651
},
{
"epoch": 0.624310559297768,
"grad_norm": 2.4613981471794117,
"learning_rate": 1.6178596130903343e-06,
"loss": 0.9548052787780762,
"num_input_tokens_seen": 533357184,
"step": 2660,
"token_acc": 0.7445567764998143
},
{
"epoch": 0.6266575914755791,
"grad_norm": 1.959126642555864,
"learning_rate": 1.6145606049760642e-06,
"loss": 0.9767616271972657,
"num_input_tokens_seen": 535321791,
"step": 2670,
"token_acc": 0.7381060525928277
},
{
"epoch": 0.6290046236533903,
"grad_norm": 1.5308332739370312,
"learning_rate": 1.6112508138196917e-06,
"loss": 0.9835859298706054,
"num_input_tokens_seen": 537364758,
"step": 2680,
"token_acc": 0.7381528449040924
},
{
"epoch": 0.6313516558312015,
"grad_norm": 1.8506281977228691,
"learning_rate": 1.6079302976946053e-06,
"loss": 0.9697771072387695,
"num_input_tokens_seen": 539423991,
"step": 2690,
"token_acc": 0.7428583040298499
},
{
"epoch": 0.6336986880090126,
"grad_norm": 2.143447146073978,
"learning_rate": 1.604599114862375e-06,
"loss": 0.9710499763488769,
"num_input_tokens_seen": 541385301,
"step": 2700,
"token_acc": 0.7437010271608948
},
{
"epoch": 0.6336986880090126,
"eval_loss": 1.006402611732483,
"eval_runtime": 32.4804,
"eval_samples_per_second": 30.788,
"eval_steps_per_second": 1.293,
"eval_token_acc": 0.7333256388356686,
"num_input_tokens_seen": 541385301,
"step": 2700
},
{
"epoch": 0.6360457201868238,
"grad_norm": 1.6759371033254091,
"learning_rate": 1.6012573237717265e-06,
"loss": 0.9557651519775391,
"num_input_tokens_seen": 543498738,
"step": 2710,
"token_acc": 0.744166114013349
},
{
"epoch": 0.6383927523646349,
"grad_norm": 1.8250942426916423,
"learning_rate": 1.5979049830575188e-06,
"loss": 0.9645903587341309,
"num_input_tokens_seen": 545489775,
"step": 2720,
"token_acc": 0.7429352817436318
},
{
"epoch": 0.6407397845424461,
"grad_norm": 1.9217599973801651,
"learning_rate": 1.5945421515397134e-06,
"loss": 0.9858356475830078,
"num_input_tokens_seen": 547577721,
"step": 2730,
"token_acc": 0.7375185153736568
},
{
"epoch": 0.6430868167202572,
"grad_norm": 1.7809745721720633,
"learning_rate": 1.591168888222342e-06,
"loss": 0.9513526916503906,
"num_input_tokens_seen": 549624339,
"step": 2740,
"token_acc": 0.745696874109412
},
{
"epoch": 0.6454338488980684,
"grad_norm": 4.185287393591199,
"learning_rate": 1.587785252292473e-06,
"loss": 1.0034643173217774,
"num_input_tokens_seen": 551637576,
"step": 2750,
"token_acc": 0.7338386568669174
},
{
"epoch": 0.6477808810758795,
"grad_norm": 1.5787917866107477,
"learning_rate": 1.584391303119172e-06,
"loss": 0.9657976150512695,
"num_input_tokens_seen": 553630620,
"step": 2760,
"token_acc": 0.7424132245973986
},
{
"epoch": 0.6501279132536907,
"grad_norm": 1.6169135735671403,
"learning_rate": 1.58098710025246e-06,
"loss": 0.976175594329834,
"num_input_tokens_seen": 555634122,
"step": 2770,
"token_acc": 0.7385256195920764
},
{
"epoch": 0.6524749454315019,
"grad_norm": 2.786090764996497,
"learning_rate": 1.5775727034222674e-06,
"loss": 1.0152118682861329,
"num_input_tokens_seen": 557567646,
"step": 2780,
"token_acc": 0.7318658065576464
},
{
"epoch": 0.654821977609313,
"grad_norm": 2.3429135221710142,
"learning_rate": 1.5741481725373898e-06,
"loss": 0.9660276412963867,
"num_input_tokens_seen": 559612812,
"step": 2790,
"token_acc": 0.7423686792009822
},
{
"epoch": 0.6571690097871242,
"grad_norm": 2.066607274894778,
"learning_rate": 1.5707135676844319e-06,
"loss": 0.9577510833740235,
"num_input_tokens_seen": 561582108,
"step": 2800,
"token_acc": 0.7451270299890406
},
{
"epoch": 0.6571690097871242,
"eval_loss": 1.0031476020812988,
"eval_runtime": 32.6219,
"eval_samples_per_second": 30.654,
"eval_steps_per_second": 1.287,
"eval_token_acc": 0.7348629994690796,
"num_input_tokens_seen": 561582108,
"step": 2800
},
{
"epoch": 0.6595160419649353,
"grad_norm": 2.6635168105713385,
"learning_rate": 1.5672689491267565e-06,
"loss": 0.9600403785705567,
"num_input_tokens_seen": 563559690,
"step": 2810,
"token_acc": 0.7428777482846697
},
{
"epoch": 0.6618630741427465,
"grad_norm": 1.7450802216902526,
"learning_rate": 1.5638143773034266e-06,
"loss": 0.9954195022583008,
"num_input_tokens_seen": 565524792,
"step": 2820,
"token_acc": 0.7348587056347071
},
{
"epoch": 0.6642101063205577,
"grad_norm": 1.8754649742088336,
"learning_rate": 1.5603499128281444e-06,
"loss": 0.969937515258789,
"num_input_tokens_seen": 567451971,
"step": 2830,
"token_acc": 0.7414208823996457
},
{
"epoch": 0.6665571384983688,
"grad_norm": 1.5835197058667514,
"learning_rate": 1.556875616488188e-06,
"loss": 0.969327163696289,
"num_input_tokens_seen": 569462406,
"step": 2840,
"token_acc": 0.7401524628156212
},
{
"epoch": 0.66890417067618,
"grad_norm": 2.6099644468289567,
"learning_rate": 1.553391549243344e-06,
"loss": 0.9504291534423828,
"num_input_tokens_seen": 571500279,
"step": 2850,
"token_acc": 0.7466074001336113
},
{
"epoch": 0.6712512028539911,
"grad_norm": 2.482803714476407,
"learning_rate": 1.54989777222484e-06,
"loss": 0.9784445762634277,
"num_input_tokens_seen": 573509781,
"step": 2860,
"token_acc": 0.7380322581926356
},
{
"epoch": 0.6735982350318023,
"grad_norm": 3.1626495400003227,
"learning_rate": 1.546394346734269e-06,
"loss": 0.9782054901123047,
"num_input_tokens_seen": 575490657,
"step": 2870,
"token_acc": 0.7396435152006547
},
{
"epoch": 0.6759452672096135,
"grad_norm": 1.8352007570418352,
"learning_rate": 1.5428813342425175e-06,
"loss": 0.9893608093261719,
"num_input_tokens_seen": 577443624,
"step": 2880,
"token_acc": 0.7371560289894273
},
{
"epoch": 0.6782922993874246,
"grad_norm": 11.589107012378998,
"learning_rate": 1.5393587963886834e-06,
"loss": 0.9795863151550293,
"num_input_tokens_seen": 579501576,
"step": 2890,
"token_acc": 0.738575752796563
},
{
"epoch": 0.6806393315652357,
"grad_norm": 2.3582930414965713,
"learning_rate": 1.5358267949789964e-06,
"loss": 0.986695671081543,
"num_input_tokens_seen": 581445867,
"step": 2900,
"token_acc": 0.7377336684807478
},
{
"epoch": 0.6806393315652357,
"eval_loss": 1.0008372068405151,
"eval_runtime": 32.2631,
"eval_samples_per_second": 30.995,
"eval_steps_per_second": 1.302,
"eval_token_acc": 0.7351584681794049,
"num_input_tokens_seen": 581445867,
"step": 2900
},
{
"epoch": 0.6829863637430469,
"grad_norm": 1.8683225729956336,
"learning_rate": 1.532285391985734e-06,
"loss": 0.9824249267578125,
"num_input_tokens_seen": 583473981,
"step": 2910,
"token_acc": 0.7386771656575185
},
{
"epoch": 0.6853333959208581,
"grad_norm": 1.9051084978341761,
"learning_rate": 1.5287346495461316e-06,
"loss": 0.9780803680419922,
"num_input_tokens_seen": 585488343,
"step": 2920,
"token_acc": 0.7386755390868261
},
{
"epoch": 0.6876804280986692,
"grad_norm": 1.7526173635768003,
"learning_rate": 1.5251746299612958e-06,
"loss": 0.9556564331054688,
"num_input_tokens_seen": 587536749,
"step": 2930,
"token_acc": 0.7437935964230544
},
{
"epoch": 0.6900274602764804,
"grad_norm": 1.908495061384156,
"learning_rate": 1.5216053956951078e-06,
"loss": 0.9559732437133789,
"num_input_tokens_seen": 589505883,
"step": 2940,
"token_acc": 0.7442760675515612
},
{
"epoch": 0.6923744924542915,
"grad_norm": 1.682313702090843,
"learning_rate": 1.5180270093731302e-06,
"loss": 0.9883411407470704,
"num_input_tokens_seen": 591496815,
"step": 2950,
"token_acc": 0.7374446310537534
},
{
"epoch": 0.6947215246321027,
"grad_norm": 1.6223526724021116,
"learning_rate": 1.5144395337815063e-06,
"loss": 0.9544116973876953,
"num_input_tokens_seen": 593483805,
"step": 2960,
"token_acc": 0.7434431431260328
},
{
"epoch": 0.6970685568099139,
"grad_norm": 3.3044227000595106,
"learning_rate": 1.5108430318658599e-06,
"loss": 0.9596687316894531,
"num_input_tokens_seen": 595472802,
"step": 2970,
"token_acc": 0.7425995483387807
},
{
"epoch": 0.699415588987725,
"grad_norm": 1.9946238986715072,
"learning_rate": 1.507237566730189e-06,
"loss": 0.9447664260864258,
"num_input_tokens_seen": 597458052,
"step": 2980,
"token_acc": 0.7471448055436924
},
{
"epoch": 0.7017626211655362,
"grad_norm": 1.9985767739510494,
"learning_rate": 1.5036232016357608e-06,
"loss": 0.9753869056701661,
"num_input_tokens_seen": 599511099,
"step": 2990,
"token_acc": 0.7407295913625692
},
{
"epoch": 0.7041096533433473,
"grad_norm": 1.7848206984050603,
"learning_rate": 1.5e-06,
"loss": 0.9929049491882325,
"num_input_tokens_seen": 601494039,
"step": 3000,
"token_acc": 0.733967886177249
},
{
"epoch": 0.7041096533433473,
"eval_loss": 0.9992188215255737,
"eval_runtime": 32.8388,
"eval_samples_per_second": 30.452,
"eval_steps_per_second": 1.279,
"eval_token_acc": 0.7353292860275616,
"num_input_tokens_seen": 601494039,
"step": 3000
},
{
"epoch": 0.7064566855211585,
"grad_norm": 1.5971312735806535,
"learning_rate": 1.4963680253953767e-06,
"loss": 0.9550104141235352,
"num_input_tokens_seen": 603479547,
"step": 3010,
"token_acc": 0.7457036074683664
},
{
"epoch": 0.7088037176989697,
"grad_norm": 1.7733615613171219,
"learning_rate": 1.4927273415482915e-06,
"loss": 0.9737858772277832,
"num_input_tokens_seen": 605442297,
"step": 3020,
"token_acc": 0.7412765006450565
},
{
"epoch": 0.7111507498767808,
"grad_norm": 6.926370874803529,
"learning_rate": 1.4890780123379563e-06,
"loss": 0.9665937423706055,
"num_input_tokens_seen": 607477695,
"step": 3030,
"token_acc": 0.7405696365107176
},
{
"epoch": 0.713497782054592,
"grad_norm": 1.6484833491764401,
"learning_rate": 1.485420101795274e-06,
"loss": 0.95927734375,
"num_input_tokens_seen": 609444318,
"step": 3040,
"token_acc": 0.7442635774417046
},
{
"epoch": 0.7158448142324031,
"grad_norm": 2.0360741208385913,
"learning_rate": 1.4817536741017151e-06,
"loss": 0.9595672607421875,
"num_input_tokens_seen": 611390574,
"step": 3050,
"token_acc": 0.743094030233154
},
{
"epoch": 0.7181918464102143,
"grad_norm": 1.6990138073052663,
"learning_rate": 1.4780787935881923e-06,
"loss": 0.9530370712280274,
"num_input_tokens_seen": 613394736,
"step": 3060,
"token_acc": 0.7442468822691946
},
{
"epoch": 0.7205388785880255,
"grad_norm": 1.7762074096620095,
"learning_rate": 1.474395524733929e-06,
"loss": 0.9581127166748047,
"num_input_tokens_seen": 615392505,
"step": 3070,
"token_acc": 0.7441699918818188
},
{
"epoch": 0.7228859107658366,
"grad_norm": 2.463738917765937,
"learning_rate": 1.4707039321653328e-06,
"loss": 0.9451935768127442,
"num_input_tokens_seen": 617397957,
"step": 3080,
"token_acc": 0.7463462899737582
},
{
"epoch": 0.7252329429436477,
"grad_norm": 1.5266790692018193,
"learning_rate": 1.4670040806548554e-06,
"loss": 0.9604751586914062,
"num_input_tokens_seen": 619431237,
"step": 3090,
"token_acc": 0.743774946972139
},
{
"epoch": 0.7275799751214589,
"grad_norm": 2.3079209032431858,
"learning_rate": 1.4632960351198617e-06,
"loss": 0.958247184753418,
"num_input_tokens_seen": 621429906,
"step": 3100,
"token_acc": 0.7430188770047043
},
{
"epoch": 0.7275799751214589,
"eval_loss": 0.9967913031578064,
"eval_runtime": 32.9622,
"eval_samples_per_second": 30.338,
"eval_steps_per_second": 1.274,
"eval_token_acc": 0.7358602063664273,
"num_input_tokens_seen": 621429906,
"step": 3100
},
{
"epoch": 0.7299270072992701,
"grad_norm": 1.9140433022488452,
"learning_rate": 1.459579860621488e-06,
"loss": 0.9593525886535644,
"num_input_tokens_seen": 623425752,
"step": 3110,
"token_acc": 0.7432277726301421
},
{
"epoch": 0.7322740394770813,
"grad_norm": 1.8212366585882274,
"learning_rate": 1.4558556223635e-06,
"loss": 0.9617977142333984,
"num_input_tokens_seen": 625420740,
"step": 3120,
"token_acc": 0.742332781810841
},
{
"epoch": 0.7346210716548924,
"grad_norm": 1.81167215973652,
"learning_rate": 1.4521233856911506e-06,
"loss": 0.958807373046875,
"num_input_tokens_seen": 627481314,
"step": 3130,
"token_acc": 0.7424123292987752
},
{
"epoch": 0.7369681038327035,
"grad_norm": 2.3831847210640373,
"learning_rate": 1.4483832160900325e-06,
"loss": 0.9585672378540039,
"num_input_tokens_seen": 629442897,
"step": 3140,
"token_acc": 0.7439413187403806
},
{
"epoch": 0.7393151360105147,
"grad_norm": 2.043229737472813,
"learning_rate": 1.4446351791849273e-06,
"loss": 0.9544695854187012,
"num_input_tokens_seen": 631432200,
"step": 3150,
"token_acc": 0.7442476653043495
},
{
"epoch": 0.7416621681883259,
"grad_norm": 4.811000207072732,
"learning_rate": 1.4408793407386585e-06,
"loss": 0.9843364715576172,
"num_input_tokens_seen": 633445356,
"step": 3160,
"token_acc": 0.7394687633144498
},
{
"epoch": 0.744009200366137,
"grad_norm": 2.4106731528164027,
"learning_rate": 1.4371157666509327e-06,
"loss": 0.9410341262817383,
"num_input_tokens_seen": 635526396,
"step": 3170,
"token_acc": 0.7483812367179011
},
{
"epoch": 0.7463562325439482,
"grad_norm": 1.7671356519717534,
"learning_rate": 1.4333445229571873e-06,
"loss": 0.9693818092346191,
"num_input_tokens_seen": 637512357,
"step": 3180,
"token_acc": 0.7406820079650566
},
{
"epoch": 0.7487032647217593,
"grad_norm": 7.649772673674551,
"learning_rate": 1.429565675827428e-06,
"loss": 0.9459026336669922,
"num_input_tokens_seen": 639512292,
"step": 3190,
"token_acc": 0.7462010482209617
},
{
"epoch": 0.7510502968995705,
"grad_norm": 1.8216992633152906,
"learning_rate": 1.4257792915650725e-06,
"loss": 0.9720870971679687,
"num_input_tokens_seen": 641562030,
"step": 3200,
"token_acc": 0.7415353056114234
},
{
"epoch": 0.7510502968995705,
"eval_loss": 0.9943264722824097,
"eval_runtime": 32.5126,
"eval_samples_per_second": 30.757,
"eval_steps_per_second": 1.292,
"eval_token_acc": 0.7364419103898802,
"num_input_tokens_seen": 641562030,
"step": 3200
},
{
"epoch": 0.7533973290773817,
"grad_norm": 2.5481168225977733,
"learning_rate": 1.421985436605783e-06,
"loss": 0.9607316970825195,
"num_input_tokens_seen": 643584060,
"step": 3210,
"token_acc": 0.7432411531496256
},
{
"epoch": 0.7557443612551928,
"grad_norm": 2.118059690668745,
"learning_rate": 1.4181841775163012e-06,
"loss": 0.9484768867492676,
"num_input_tokens_seen": 645607389,
"step": 3220,
"token_acc": 0.7466806535620841
},
{
"epoch": 0.7580913934330039,
"grad_norm": 1.536905846651224,
"learning_rate": 1.4143755809932843e-06,
"loss": 0.9712394714355469,
"num_input_tokens_seen": 647631456,
"step": 3230,
"token_acc": 0.7404885747138855
},
{
"epoch": 0.7604384256108151,
"grad_norm": 2.4582256619795935,
"learning_rate": 1.4105597138621279e-06,
"loss": 0.9821660041809082,
"num_input_tokens_seen": 649623648,
"step": 3240,
"token_acc": 0.7392644424148588
},
{
"epoch": 0.7627854577886263,
"grad_norm": 1.726231353540505,
"learning_rate": 1.4067366430758004e-06,
"loss": 0.9590049743652344,
"num_input_tokens_seen": 651641892,
"step": 3250,
"token_acc": 0.7437141846756814
},
{
"epoch": 0.7651324899664375,
"grad_norm": 1.922673635746528,
"learning_rate": 1.4029064357136626e-06,
"loss": 0.9750150680541992,
"num_input_tokens_seen": 653604150,
"step": 3260,
"token_acc": 0.7414191376968158
},
{
"epoch": 0.7674795221442486,
"grad_norm": 1.6204247475263307,
"learning_rate": 1.3990691589802952e-06,
"loss": 0.9551026344299316,
"num_input_tokens_seen": 655600902,
"step": 3270,
"token_acc": 0.7445325970386258
},
{
"epoch": 0.7698265543220597,
"grad_norm": 1.4635122389462327,
"learning_rate": 1.3952248802043165e-06,
"loss": 0.9669751167297364,
"num_input_tokens_seen": 657608466,
"step": 3280,
"token_acc": 0.7429139464814524
},
{
"epoch": 0.7721735864998709,
"grad_norm": 8.22536747363378,
"learning_rate": 1.3913736668372024e-06,
"loss": 0.9439043045043946,
"num_input_tokens_seen": 659654619,
"step": 3290,
"token_acc": 0.7456336900472631
},
{
"epoch": 0.7745206186776821,
"grad_norm": 9.543988869279518,
"learning_rate": 1.3875155864521028e-06,
"loss": 0.9564947128295899,
"num_input_tokens_seen": 661688691,
"step": 3300,
"token_acc": 0.7438499491922926
},
{
"epoch": 0.7745206186776821,
"eval_loss": 0.9920927882194519,
"eval_runtime": 32.5499,
"eval_samples_per_second": 30.722,
"eval_steps_per_second": 1.29,
"eval_token_acc": 0.7375683848479953,
"num_input_tokens_seen": 661688691,
"step": 3300
},
{
"epoch": 0.7768676508554933,
"grad_norm": 1.6682984595331,
"learning_rate": 1.3836507067426564e-06,
"loss": 0.9715993881225586,
"num_input_tokens_seen": 663716223,
"step": 3310,
"token_acc": 0.7398505776738471
},
{
"epoch": 0.7792146830333044,
"grad_norm": 1.4995458445383936,
"learning_rate": 1.379779095521801e-06,
"loss": 0.9635456085205079,
"num_input_tokens_seen": 665680179,
"step": 3320,
"token_acc": 0.7437482270495307
},
{
"epoch": 0.7815617152111155,
"grad_norm": 2.9414382856417665,
"learning_rate": 1.3759008207205866e-06,
"loss": 0.955263328552246,
"num_input_tokens_seen": 667683303,
"step": 3330,
"token_acc": 0.7446334146072263
},
{
"epoch": 0.7839087473889267,
"grad_norm": 2.4991760245357852,
"learning_rate": 1.3720159503869814e-06,
"loss": 0.9503087997436523,
"num_input_tokens_seen": 669640779,
"step": 3340,
"token_acc": 0.7461391567718691
},
{
"epoch": 0.7862557795667379,
"grad_norm": 1.6816037590987798,
"learning_rate": 1.3681245526846781e-06,
"loss": 0.9773989677429199,
"num_input_tokens_seen": 671655801,
"step": 3350,
"token_acc": 0.7381824953149948
},
{
"epoch": 0.788602811744549,
"grad_norm": 13.867129183446435,
"learning_rate": 1.3642266958918981e-06,
"loss": 0.9606409072875977,
"num_input_tokens_seen": 673618887,
"step": 3360,
"token_acc": 0.7444690515700922
},
{
"epoch": 0.7909498439223602,
"grad_norm": 1.68642281844996,
"learning_rate": 1.3603224484001947e-06,
"loss": 0.9683753967285156,
"num_input_tokens_seen": 675600486,
"step": 3370,
"token_acc": 0.7418183604302765
},
{
"epoch": 0.7932968761001713,
"grad_norm": 3.6876372609830947,
"learning_rate": 1.3564118787132506e-06,
"loss": 0.9690577507019043,
"num_input_tokens_seen": 677573577,
"step": 3380,
"token_acc": 0.7409562127336359
},
{
"epoch": 0.7956439082779825,
"grad_norm": 2.195994268899717,
"learning_rate": 1.3524950554456784e-06,
"loss": 0.9620229721069335,
"num_input_tokens_seen": 679562811,
"step": 3390,
"token_acc": 0.7450269148735509
},
{
"epoch": 0.7979909404557937,
"grad_norm": 1.66370232482538,
"learning_rate": 1.3485720473218152e-06,
"loss": 0.9747153282165527,
"num_input_tokens_seen": 681515289,
"step": 3400,
"token_acc": 0.7406435118536351
},
{
"epoch": 0.7979909404557937,
"eval_loss": 0.989876389503479,
"eval_runtime": 32.5612,
"eval_samples_per_second": 30.711,
"eval_steps_per_second": 1.29,
"eval_token_acc": 0.7377299692989543,
"num_input_tokens_seen": 681515289,
"step": 3400
},
{
"epoch": 0.8003379726336048,
"grad_norm": 7.5891295742929765,
"learning_rate": 1.344642923174517e-06,
"loss": 0.9531444549560547,
"num_input_tokens_seen": 683512767,
"step": 3410,
"token_acc": 0.743843269116981
},
{
"epoch": 0.8026850048114159,
"grad_norm": 2.227763312783814,
"learning_rate": 1.3407077519439517e-06,
"loss": 0.9736311912536622,
"num_input_tokens_seen": 685506138,
"step": 3420,
"token_acc": 0.7399243439837672
},
{
"epoch": 0.8050320369892271,
"grad_norm": 2.113082381566505,
"learning_rate": 1.3367666026763882e-06,
"loss": 0.9282070159912109,
"num_input_tokens_seen": 687553683,
"step": 3430,
"token_acc": 0.7491222650322436
},
{
"epoch": 0.8073790691670383,
"grad_norm": 2.6528881128590505,
"learning_rate": 1.3328195445229867e-06,
"loss": 0.9803478240966796,
"num_input_tokens_seen": 689471004,
"step": 3440,
"token_acc": 0.7387909473555786
},
{
"epoch": 0.8097261013448495,
"grad_norm": 1.9793103853657699,
"learning_rate": 1.3288666467385831e-06,
"loss": 0.9667415618896484,
"num_input_tokens_seen": 691496667,
"step": 3450,
"token_acc": 0.7424123423266975
},
{
"epoch": 0.8120731335226606,
"grad_norm": 1.7709247958435497,
"learning_rate": 1.3249079786804764e-06,
"loss": 0.9529176712036133,
"num_input_tokens_seen": 693546759,
"step": 3460,
"token_acc": 0.7441175099271877
},
{
"epoch": 0.8144201657004717,
"grad_norm": 1.5610954433541373,
"learning_rate": 1.3209436098072093e-06,
"loss": 0.9164794921875,
"num_input_tokens_seen": 695642895,
"step": 3470,
"token_acc": 0.7535160611124015
},
{
"epoch": 0.8167671978782829,
"grad_norm": 5.4874386973622675,
"learning_rate": 1.3169736096773518e-06,
"loss": 0.9681709289550782,
"num_input_tokens_seen": 697628748,
"step": 3480,
"token_acc": 0.7417104783717662
},
{
"epoch": 0.8191142300560941,
"grad_norm": 1.5904173197084162,
"learning_rate": 1.3129980479482781e-06,
"loss": 0.9423411369323731,
"num_input_tokens_seen": 699612816,
"step": 3490,
"token_acc": 0.7463674068222216
},
{
"epoch": 0.8214612622339053,
"grad_norm": 2.5766852327480185,
"learning_rate": 1.3090169943749473e-06,
"loss": 0.9422481536865235,
"num_input_tokens_seen": 701681886,
"step": 3500,
"token_acc": 0.746677911017143
},
{
"epoch": 0.8214612622339053,
"eval_loss": 0.9871490597724915,
"eval_runtime": 32.4224,
"eval_samples_per_second": 30.843,
"eval_steps_per_second": 1.295,
"eval_token_acc": 0.738205489254634,
"num_input_tokens_seen": 701681886,
"step": 3500
},
{
"epoch": 0.8238082944117164,
"grad_norm": 1.6839398965190602,
"learning_rate": 1.3050305188086776e-06,
"loss": 0.9780057907104492,
"num_input_tokens_seen": 703749471,
"step": 3510,
"token_acc": 0.7461169628181562
},
{
"epoch": 0.8261553265895275,
"grad_norm": 1.6472063314918655,
"learning_rate": 1.3010386911959206e-06,
"loss": 0.9228075981140137,
"num_input_tokens_seen": 705742899,
"step": 3520,
"token_acc": 0.750938660857144
},
{
"epoch": 0.8285023587673387,
"grad_norm": 2.0632172614206934,
"learning_rate": 1.2970415815770348e-06,
"loss": 0.9639385223388672,
"num_input_tokens_seen": 707763786,
"step": 3530,
"token_acc": 0.7435530770762796
},
{
"epoch": 0.8308493909451499,
"grad_norm": 1.9277876571318946,
"learning_rate": 1.2930392600850572e-06,
"loss": 0.9361279487609864,
"num_input_tokens_seen": 709803774,
"step": 3540,
"token_acc": 0.7479319140358494
},
{
"epoch": 0.833196423122961,
"grad_norm": 1.7198703719511412,
"learning_rate": 1.2890317969444716e-06,
"loss": 0.9535655975341797,
"num_input_tokens_seen": 711862587,
"step": 3550,
"token_acc": 0.7448029965128141
},
{
"epoch": 0.8355434553007721,
"grad_norm": 2.212450916967764,
"learning_rate": 1.285019262469976e-06,
"loss": 0.9320892333984375,
"num_input_tokens_seen": 713902905,
"step": 3560,
"token_acc": 0.7496303953267546
},
{
"epoch": 0.8378904874785833,
"grad_norm": 1.9712068144466057,
"learning_rate": 1.281001727065251e-06,
"loss": 0.9570484161376953,
"num_input_tokens_seen": 715896024,
"step": 3570,
"token_acc": 0.7434224760474031
},
{
"epoch": 0.8402375196563945,
"grad_norm": 10.730434108038908,
"learning_rate": 1.2769792612217224e-06,
"loss": 0.9570381164550781,
"num_input_tokens_seen": 717863472,
"step": 3580,
"token_acc": 0.7445581595776979
},
{
"epoch": 0.8425845518342057,
"grad_norm": 3.30727503447712,
"learning_rate": 1.2729519355173253e-06,
"loss": 0.9440830230712891,
"num_input_tokens_seen": 719863371,
"step": 3590,
"token_acc": 0.7474822302083397
},
{
"epoch": 0.8449315840120168,
"grad_norm": 3.713841498382935,
"learning_rate": 1.2689198206152656e-06,
"loss": 0.9532724380493164,
"num_input_tokens_seen": 721831113,
"step": 3600,
"token_acc": 0.7449260731906336
},
{
"epoch": 0.8449315840120168,
"eval_loss": 0.9854407906532288,
"eval_runtime": 32.7136,
"eval_samples_per_second": 30.568,
"eval_steps_per_second": 1.284,
"eval_token_acc": 0.738343990212599,
"num_input_tokens_seen": 721831113,
"step": 3600
},
{
"epoch": 0.8472786161898279,
"grad_norm": 2.0589116386432122,
"learning_rate": 1.2648829872627807e-06,
"loss": 0.9483745574951172,
"num_input_tokens_seen": 723825324,
"step": 3610,
"token_acc": 0.745855639432676
},
{
"epoch": 0.8496256483676391,
"grad_norm": 2.2896157925507143,
"learning_rate": 1.2608415062898969e-06,
"loss": 0.9875471115112304,
"num_input_tokens_seen": 725824848,
"step": 3620,
"token_acc": 0.736929354012106
},
{
"epoch": 0.8519726805454503,
"grad_norm": 1.8359922545608438,
"learning_rate": 1.2567954486081878e-06,
"loss": 0.9514982223510742,
"num_input_tokens_seen": 727830747,
"step": 3630,
"token_acc": 0.7452454133152131
},
{
"epoch": 0.8543197127232615,
"grad_norm": 3.153372907954943,
"learning_rate": 1.2527448852095292e-06,
"loss": 0.9558559417724609,
"num_input_tokens_seen": 729852828,
"step": 3640,
"token_acc": 0.7435630305059377
},
{
"epoch": 0.8566667449010726,
"grad_norm": 3.2189620482043386,
"learning_rate": 1.2486898871648551e-06,
"loss": 0.9721113204956054,
"num_input_tokens_seen": 731850777,
"step": 3650,
"token_acc": 0.7411079350146542
},
{
"epoch": 0.8590137770788837,
"grad_norm": 3.3099093175401872,
"learning_rate": 1.2446305256229072e-06,
"loss": 0.9803009986877441,
"num_input_tokens_seen": 733814010,
"step": 3660,
"token_acc": 0.7365633927510155
},
{
"epoch": 0.8613608092566949,
"grad_norm": 1.5270344015944395,
"learning_rate": 1.2405668718089917e-06,
"loss": 0.9435177803039551,
"num_input_tokens_seen": 735837123,
"step": 3670,
"token_acc": 0.746749139522123
},
{
"epoch": 0.8637078414345061,
"grad_norm": 5.787047190916268,
"learning_rate": 1.2364989970237248e-06,
"loss": 0.956524658203125,
"num_input_tokens_seen": 737845806,
"step": 3680,
"token_acc": 0.7443589079040083
},
{
"epoch": 0.8660548736123173,
"grad_norm": 8.359169785563331,
"learning_rate": 1.232426972641784e-06,
"loss": 0.9011870384216308,
"num_input_tokens_seen": 739830486,
"step": 3690,
"token_acc": 0.75567660422689
},
{
"epoch": 0.8684019057901284,
"grad_norm": 1.5845135247364173,
"learning_rate": 1.2283508701106558e-06,
"loss": 0.9817106246948242,
"num_input_tokens_seen": 741791226,
"step": 3700,
"token_acc": 0.7385339271890049
},
{
"epoch": 0.8684019057901284,
"eval_loss": 0.983921468257904,
"eval_runtime": 32.7463,
"eval_samples_per_second": 30.538,
"eval_steps_per_second": 1.283,
"eval_token_acc": 0.7384963412663604,
"num_input_tokens_seen": 741791226,
"step": 3700
},
{
"epoch": 0.8707489379679395,
"grad_norm": 2.3840469812175087,
"learning_rate": 1.224270760949381e-06,
"loss": 0.9575783729553222,
"num_input_tokens_seen": 743787261,
"step": 3710,
"token_acc": 0.7436981812982442
},
{
"epoch": 0.8730959701457507,
"grad_norm": 1.947777089028747,
"learning_rate": 1.2201867167473015e-06,
"loss": 0.9696456909179687,
"num_input_tokens_seen": 745796382,
"step": 3720,
"token_acc": 0.7412485623553386
},
{
"epoch": 0.8754430023235619,
"grad_norm": 1.755420766932852,
"learning_rate": 1.2160988091628022e-06,
"loss": 0.9615589141845703,
"num_input_tokens_seen": 747780156,
"step": 3730,
"token_acc": 0.7427405478352258
},
{
"epoch": 0.877790034501373,
"grad_norm": 1.5327100981263035,
"learning_rate": 1.2120071099220547e-06,
"loss": 0.9285150527954101,
"num_input_tokens_seen": 749739183,
"step": 3740,
"token_acc": 0.7498815184287402
},
{
"epoch": 0.8801370666791841,
"grad_norm": 1.797316309204294,
"learning_rate": 1.207911690817759e-06,
"loss": 0.9365687370300293,
"num_input_tokens_seen": 751694550,
"step": 3750,
"token_acc": 0.747152564554286
},
{
"epoch": 0.8824840988569953,
"grad_norm": 3.689781286827284,
"learning_rate": 1.2038126237078849e-06,
"loss": 0.953128433227539,
"num_input_tokens_seen": 753712974,
"step": 3760,
"token_acc": 0.7452915604974099
},
{
"epoch": 0.8848311310348065,
"grad_norm": 1.7805781440802038,
"learning_rate": 1.1997099805144068e-06,
"loss": 0.9508394241333008,
"num_input_tokens_seen": 755748069,
"step": 3770,
"token_acc": 0.7452503865456881
},
{
"epoch": 0.8871781632126177,
"grad_norm": 1.6166917326261805,
"learning_rate": 1.195603833222048e-06,
"loss": 0.9421730995178222,
"num_input_tokens_seen": 757732731,
"step": 3780,
"token_acc": 0.746435002974226
},
{
"epoch": 0.8895251953904288,
"grad_norm": 2.7425269690357057,
"learning_rate": 1.191494253877013e-06,
"loss": 0.9745880126953125,
"num_input_tokens_seen": 759774399,
"step": 3790,
"token_acc": 0.7450119697550278
},
{
"epoch": 0.8918722275682399,
"grad_norm": 1.6146982833566892,
"learning_rate": 1.1873813145857248e-06,
"loss": 0.9547751426696778,
"num_input_tokens_seen": 761780385,
"step": 3800,
"token_acc": 0.7437249909057839
},
{
"epoch": 0.8918722275682399,
"eval_loss": 0.9822799563407898,
"eval_runtime": 32.7794,
"eval_samples_per_second": 30.507,
"eval_steps_per_second": 1.281,
"eval_token_acc": 0.738865677154267,
"num_input_tokens_seen": 761780385,
"step": 3800
},
{
"epoch": 0.8942192597460511,
"grad_norm": 8.557612907531114,
"learning_rate": 1.1832650875135597e-06,
"loss": 0.9583858489990235,
"num_input_tokens_seen": 763769655,
"step": 3810,
"token_acc": 0.7431487370276885
},
{
"epoch": 0.8965662919238623,
"grad_norm": 1.5077356512025262,
"learning_rate": 1.1791456448835825e-06,
"loss": 0.9206510543823242,
"num_input_tokens_seen": 765823593,
"step": 3820,
"token_acc": 0.7506628223950441
},
{
"epoch": 0.8989133241016735,
"grad_norm": 1.5006830716992956,
"learning_rate": 1.175023058975276e-06,
"loss": 0.9615950584411621,
"num_input_tokens_seen": 767831079,
"step": 3830,
"token_acc": 0.7423029397870712
},
{
"epoch": 0.9012603562794846,
"grad_norm": 1.6769633570300284,
"learning_rate": 1.1708974021232767e-06,
"loss": 0.9534446716308593,
"num_input_tokens_seen": 769798548,
"step": 3840,
"token_acc": 0.7445747944292532
},
{
"epoch": 0.9036073884572957,
"grad_norm": 1.759779515088976,
"learning_rate": 1.1667687467161023e-06,
"loss": 0.9459953308105469,
"num_input_tokens_seen": 771774078,
"step": 3850,
"token_acc": 0.744865905394826
},
{
"epoch": 0.9059544206351069,
"grad_norm": 1.6599709517731647,
"learning_rate": 1.1626371651948836e-06,
"loss": 0.9330622673034668,
"num_input_tokens_seen": 773817642,
"step": 3860,
"token_acc": 0.7481679393835271
},
{
"epoch": 0.9083014528129181,
"grad_norm": 1.6573686376498213,
"learning_rate": 1.158502730052093e-06,
"loss": 0.943012809753418,
"num_input_tokens_seen": 775877070,
"step": 3870,
"token_acc": 0.7472794230837547
},
{
"epoch": 0.9106484849907293,
"grad_norm": 2.4726992986853444,
"learning_rate": 1.1543655138302713e-06,
"loss": 0.9866430282592773,
"num_input_tokens_seen": 777904599,
"step": 3880,
"token_acc": 0.7372872068022087
},
{
"epoch": 0.9129955171685403,
"grad_norm": 1.7326340330977308,
"learning_rate": 1.150225589120757e-06,
"loss": 0.9427039146423339,
"num_input_tokens_seen": 779960793,
"step": 3890,
"token_acc": 0.7463757958063197
},
{
"epoch": 0.9153425493463515,
"grad_norm": 1.634253822545075,
"learning_rate": 1.1460830285624116e-06,
"loss": 0.9683923721313477,
"num_input_tokens_seen": 782008791,
"step": 3900,
"token_acc": 0.741813429536215
},
{
"epoch": 0.9153425493463515,
"eval_loss": 0.97979736328125,
"eval_runtime": 32.457,
"eval_samples_per_second": 30.81,
"eval_steps_per_second": 1.294,
"eval_token_acc": 0.739553565245493,
"num_input_tokens_seen": 782008791,
"step": 3900
},
{
"epoch": 0.9176895815241627,
"grad_norm": 5.153362224558377,
"learning_rate": 1.1419379048403444e-06,
"loss": 0.9662550926208496,
"num_input_tokens_seen": 784016886,
"step": 3910,
"token_acc": 0.7420221405659442
},
{
"epoch": 0.9200366137019739,
"grad_norm": 1.9857737502868835,
"learning_rate": 1.137790290684638e-06,
"loss": 0.9286038398742675,
"num_input_tokens_seen": 786018876,
"step": 3920,
"token_acc": 0.7495468248085001
},
{
"epoch": 0.922383645879785,
"grad_norm": 1.842562371990634,
"learning_rate": 1.1336402588690725e-06,
"loss": 0.9483222007751465,
"num_input_tokens_seen": 788055180,
"step": 3930,
"token_acc": 0.7456087098512761
},
{
"epoch": 0.9247306780575961,
"grad_norm": 1.928971592873294,
"learning_rate": 1.1294878822098467e-06,
"loss": 0.9480892181396484,
"num_input_tokens_seen": 790110096,
"step": 3940,
"token_acc": 0.7468523363829526
},
{
"epoch": 0.9270777102354073,
"grad_norm": 1.6567939468576487,
"learning_rate": 1.1253332335643042e-06,
"loss": 0.947171974182129,
"num_input_tokens_seen": 792098733,
"step": 3950,
"token_acc": 0.7463428498622995
},
{
"epoch": 0.9294247424132185,
"grad_norm": 2.382881124913188,
"learning_rate": 1.1211763858296505e-06,
"loss": 0.9341253280639649,
"num_input_tokens_seen": 794107374,
"step": 3960,
"token_acc": 0.749001431982777
},
{
"epoch": 0.9317717745910297,
"grad_norm": 2.385202785146866,
"learning_rate": 1.1170174119416775e-06,
"loss": 0.9605335235595703,
"num_input_tokens_seen": 796145907,
"step": 3970,
"token_acc": 0.7420721101207574
},
{
"epoch": 0.9341188067688408,
"grad_norm": 1.6538910226354369,
"learning_rate": 1.1128563848734815e-06,
"loss": 0.904339599609375,
"num_input_tokens_seen": 798189987,
"step": 3980,
"token_acc": 0.7552502219081598
},
{
"epoch": 0.9364658389466519,
"grad_norm": 2.1083368115488206,
"learning_rate": 1.108693377634185e-06,
"loss": 0.9489521980285645,
"num_input_tokens_seen": 800197461,
"step": 3990,
"token_acc": 0.7454285509759317
},
{
"epoch": 0.9388128711244631,
"grad_norm": 1.9940124977981624,
"learning_rate": 1.1045284632676535e-06,
"loss": 0.9406743049621582,
"num_input_tokens_seen": 802174746,
"step": 4000,
"token_acc": 0.7459721976990789
},
{
"epoch": 0.9388128711244631,
"eval_loss": 0.9778164029121399,
"eval_runtime": 32.5943,
"eval_samples_per_second": 30.68,
"eval_steps_per_second": 1.289,
"eval_token_acc": 0.7396274324230743,
"num_input_tokens_seen": 802174746,
"step": 4000
},
{
"epoch": 0.9411599033022743,
"grad_norm": 1.869832978916969,
"learning_rate": 1.1003617148512149e-06,
"loss": 0.9346565246582031,
"num_input_tokens_seen": 804141819,
"step": 4010,
"token_acc": 0.7472374245472837
},
{
"epoch": 0.9435069354800855,
"grad_norm": 2.364187676148168,
"learning_rate": 1.0961932054943776e-06,
"loss": 0.9504963874816894,
"num_input_tokens_seen": 806092293,
"step": 4020,
"token_acc": 0.7476745370464685
},
{
"epoch": 0.9458539676578966,
"grad_norm": 1.7457815556862932,
"learning_rate": 1.0920230083375472e-06,
"loss": 0.9478288650512695,
"num_input_tokens_seen": 808096725,
"step": 4030,
"token_acc": 0.7461893605967633
},
{
"epoch": 0.9482009998357077,
"grad_norm": 1.7540758806187229,
"learning_rate": 1.0878511965507434e-06,
"loss": 0.9289562225341796,
"num_input_tokens_seen": 810119691,
"step": 4040,
"token_acc": 0.7498504598729057
},
{
"epoch": 0.9505480320135189,
"grad_norm": 5.524603084757776,
"learning_rate": 1.0836778433323157e-06,
"loss": 0.9280494689941406,
"num_input_tokens_seen": 812173641,
"step": 4050,
"token_acc": 0.7489092478671032
},
{
"epoch": 0.9528950641913301,
"grad_norm": 2.2610221290856205,
"learning_rate": 1.0795030219076598e-06,
"loss": 0.9323202133178711,
"num_input_tokens_seen": 814155057,
"step": 4060,
"token_acc": 0.7484355792832109
},
{
"epoch": 0.9552420963691413,
"grad_norm": 1.7453803466041382,
"learning_rate": 1.0753268055279328e-06,
"loss": 0.9361183166503906,
"num_input_tokens_seen": 816203571,
"step": 4070,
"token_acc": 0.7480308978092947
},
{
"epoch": 0.9575891285469523,
"grad_norm": 3.200843146499252,
"learning_rate": 1.071149267468767e-06,
"loss": 0.9665923118591309,
"num_input_tokens_seen": 818255160,
"step": 4080,
"token_acc": 0.7428710890766919
},
{
"epoch": 0.9599361607247635,
"grad_norm": 2.769528286877977,
"learning_rate": 1.066970481028985e-06,
"loss": 0.9312915802001953,
"num_input_tokens_seen": 820210017,
"step": 4090,
"token_acc": 0.7505294435331026
},
{
"epoch": 0.9622831929025747,
"grad_norm": 3.5116532009374186,
"learning_rate": 1.0627905195293135e-06,
"loss": 0.9360153198242187,
"num_input_tokens_seen": 822213030,
"step": 4100,
"token_acc": 0.7485829324512936
},
{
"epoch": 0.9622831929025747,
"eval_loss": 0.9762653112411499,
"eval_runtime": 32.7782,
"eval_samples_per_second": 30.508,
"eval_steps_per_second": 1.281,
"eval_token_acc": 0.7401121857759516,
"num_input_tokens_seen": 822213030,
"step": 4100
},
{
"epoch": 0.9646302250803859,
"grad_norm": 5.045367081523594,
"learning_rate": 1.0586094563110963e-06,
"loss": 0.9286471366882324,
"num_input_tokens_seen": 824216382,
"step": 4110,
"token_acc": 0.7514687934606761
},
{
"epoch": 0.966977257258197,
"grad_norm": 2.1231322680588756,
"learning_rate": 1.054427364735009e-06,
"loss": 0.9417591094970703,
"num_input_tokens_seen": 826177221,
"step": 4120,
"token_acc": 0.746542864029784
},
{
"epoch": 0.9693242894360081,
"grad_norm": 1.5051650791104427,
"learning_rate": 1.0502443181797696e-06,
"loss": 0.9733121871948243,
"num_input_tokens_seen": 828212934,
"step": 4130,
"token_acc": 0.7397737060065835
},
{
"epoch": 0.9716713216138193,
"grad_norm": 1.9170280031638867,
"learning_rate": 1.0460603900408523e-06,
"loss": 0.9613967895507812,
"num_input_tokens_seen": 830208120,
"step": 4140,
"token_acc": 0.7418330397530002
},
{
"epoch": 0.9740183537916305,
"grad_norm": 2.477727800782275,
"learning_rate": 1.0418756537291995e-06,
"loss": 0.920326042175293,
"num_input_tokens_seen": 832205229,
"step": 4150,
"token_acc": 0.7535178501070156
},
{
"epoch": 0.9763653859694417,
"grad_norm": 1.544900641515008,
"learning_rate": 1.0376901826699347e-06,
"loss": 0.9237567901611328,
"num_input_tokens_seen": 834138633,
"step": 4160,
"token_acc": 0.7496954091824597
},
{
"epoch": 0.9787124181472528,
"grad_norm": 1.6877147081648456,
"learning_rate": 1.0335040503010715e-06,
"loss": 0.9391614913940429,
"num_input_tokens_seen": 836153739,
"step": 4170,
"token_acc": 0.7479080675786391
},
{
"epoch": 0.9810594503250639,
"grad_norm": 2.055524057953317,
"learning_rate": 1.0293173300722284e-06,
"loss": 0.9410205841064453,
"num_input_tokens_seen": 838071294,
"step": 4180,
"token_acc": 0.747964305973199
},
{
"epoch": 0.9834064825028751,
"grad_norm": 1.9825443022012719,
"learning_rate": 1.0251300954433374e-06,
"loss": 0.9293361663818359,
"num_input_tokens_seen": 840082950,
"step": 4190,
"token_acc": 0.7505939412855415
},
{
"epoch": 0.9857535146806863,
"grad_norm": 1.6517348379687422,
"learning_rate": 1.020942419883357e-06,
"loss": 0.9549247741699218,
"num_input_tokens_seen": 842083761,
"step": 4200,
"token_acc": 0.7446830629715671
},
{
"epoch": 0.9857535146806863,
"eval_loss": 0.9754964709281921,
"eval_runtime": 32.4547,
"eval_samples_per_second": 30.812,
"eval_steps_per_second": 1.294,
"eval_token_acc": 0.7408277740587705,
"num_input_tokens_seen": 842083761,
"step": 4200
},
{
"epoch": 0.9881005468584975,
"grad_norm": 1.7669813904614138,
"learning_rate": 1.0167543768689815e-06,
"loss": 0.9350774765014649,
"num_input_tokens_seen": 844080483,
"step": 4210,
"token_acc": 0.7474908930171247
},
{
"epoch": 0.9904475790363085,
"grad_norm": 1.9977363833715536,
"learning_rate": 1.0125660398833527e-06,
"loss": 0.9390117645263671,
"num_input_tokens_seen": 846069951,
"step": 4220,
"token_acc": 0.7463500450267371
},
{
"epoch": 0.9927946112141197,
"grad_norm": 1.6725983628184662,
"learning_rate": 1.0083774824147707e-06,
"loss": 0.946631908416748,
"num_input_tokens_seen": 848098152,
"step": 4230,
"token_acc": 0.7457750693945103
},
{
"epoch": 0.9951416433919309,
"grad_norm": 1.7247846754251406,
"learning_rate": 1.004188777955404e-06,
"loss": 0.9343754768371582,
"num_input_tokens_seen": 850113609,
"step": 4240,
"token_acc": 0.7490662455788695
},
{
"epoch": 0.9974886755697421,
"grad_norm": 2.0830434897072894,
"learning_rate": 1e-06,
"loss": 0.9314743041992187,
"num_input_tokens_seen": 852105906,
"step": 4250,
"token_acc": 0.749313829578074
},
{
"epoch": 0.9998357077475533,
"grad_norm": 1.814610722365582,
"learning_rate": 9.958112220445962e-07,
"loss": 0.9592094421386719,
"num_input_tokens_seen": 854098311,
"step": 4260,
"token_acc": 0.7431068897769029
},
{
"epoch": 1.00211232896003,
"grad_norm": 1.5113637229667725,
"learning_rate": 9.916225175852293e-07,
"loss": 0.894398307800293,
"num_input_tokens_seen": 856086594,
"step": 4270,
"token_acc": 0.7580048741904789
},
{
"epoch": 1.0044593611378412,
"grad_norm": 4.446393040487181,
"learning_rate": 9.874339601166472e-07,
"loss": 0.9135477066040039,
"num_input_tokens_seen": 858108198,
"step": 4280,
"token_acc": 0.7531681304263087
},
{
"epoch": 1.0068063933156524,
"grad_norm": 1.9208454193735196,
"learning_rate": 9.832456231310188e-07,
"loss": 0.9318746566772461,
"num_input_tokens_seen": 860120775,
"step": 4290,
"token_acc": 0.747537408902533
},
{
"epoch": 1.0091534254934635,
"grad_norm": 1.5928331203409287,
"learning_rate": 9.790575801166431e-07,
"loss": 0.9145861625671386,
"num_input_tokens_seen": 862143132,
"step": 4300,
"token_acc": 0.7532685063928213
},
{
"epoch": 1.0091534254934635,
"eval_loss": 0.9742150902748108,
"eval_runtime": 32.578,
"eval_samples_per_second": 30.696,
"eval_steps_per_second": 1.289,
"eval_token_acc": 0.7412802105214561,
"num_input_tokens_seen": 862143132,
"step": 4300
},
{
"epoch": 1.0115004576712747,
"grad_norm": 2.2199758281219837,
"learning_rate": 9.748699045566625e-07,
"loss": 0.9037257194519043,
"num_input_tokens_seen": 864130884,
"step": 4310,
"token_acc": 0.7554067579469933
},
{
"epoch": 1.013847489849086,
"grad_norm": 2.5403224399288926,
"learning_rate": 9.706826699277717e-07,
"loss": 0.8928478240966797,
"num_input_tokens_seen": 866146368,
"step": 4320,
"token_acc": 0.7571011279244853
},
{
"epoch": 1.016194522026897,
"grad_norm": 1.6880663111795373,
"learning_rate": 9.664959496989284e-07,
"loss": 0.8799491882324219,
"num_input_tokens_seen": 868132068,
"step": 4330,
"token_acc": 0.7608739162744612
},
{
"epoch": 1.018541554204708,
"grad_norm": 1.9603998555475624,
"learning_rate": 9.623098173300653e-07,
"loss": 0.9061168670654297,
"num_input_tokens_seen": 870168408,
"step": 4340,
"token_acc": 0.7558231445173181
},
{
"epoch": 1.0208885863825192,
"grad_norm": 2.052768381078441,
"learning_rate": 9.581243462708005e-07,
"loss": 0.891018009185791,
"num_input_tokens_seen": 872101149,
"step": 4350,
"token_acc": 0.7599988872462524
},
{
"epoch": 1.0232356185603304,
"grad_norm": 1.514439023769519,
"learning_rate": 9.539396099591476e-07,
"loss": 0.9129314422607422,
"num_input_tokens_seen": 874087335,
"step": 4360,
"token_acc": 0.7564216192481887
},
{
"epoch": 1.0255826507381416,
"grad_norm": 1.8673183879809325,
"learning_rate": 9.497556818202304e-07,
"loss": 0.9109779357910156,
"num_input_tokens_seen": 876059952,
"step": 4370,
"token_acc": 0.7535195830085737
},
{
"epoch": 1.0279296829159528,
"grad_norm": 6.147575681746076,
"learning_rate": 9.45572635264991e-07,
"loss": 0.9013278961181641,
"num_input_tokens_seen": 878124633,
"step": 4380,
"token_acc": 0.756046360357164
},
{
"epoch": 1.030276715093764,
"grad_norm": 3.3826066958331045,
"learning_rate": 9.413905436889033e-07,
"loss": 0.8935451507568359,
"num_input_tokens_seen": 880109727,
"step": 4390,
"token_acc": 0.7567750980510352
},
{
"epoch": 1.0326237472715751,
"grad_norm": 2.791787214417096,
"learning_rate": 9.372094804706866e-07,
"loss": 0.9111810684204101,
"num_input_tokens_seen": 882111045,
"step": 4400,
"token_acc": 0.7554985194799139
},
{
"epoch": 1.0326237472715751,
"eval_loss": 0.9730333685874939,
"eval_runtime": 32.4657,
"eval_samples_per_second": 30.802,
"eval_steps_per_second": 1.294,
"eval_token_acc": 0.7414048613836246,
"num_input_tokens_seen": 882111045,
"step": 4400
},
{
"epoch": 1.0349707794493863,
"grad_norm": 1.927568219024905,
"learning_rate": 9.330295189710151e-07,
"loss": 0.9100271224975586,
"num_input_tokens_seen": 884198595,
"step": 4410,
"token_acc": 0.7540011119241447
},
{
"epoch": 1.0373178116271975,
"grad_norm": 2.5062754907489797,
"learning_rate": 9.288507325312334e-07,
"loss": 0.8903081893920899,
"num_input_tokens_seen": 886152855,
"step": 4420,
"token_acc": 0.7574611181168558
},
{
"epoch": 1.0396648438050087,
"grad_norm": 1.9923532749108916,
"learning_rate": 9.246731944720674e-07,
"loss": 0.9105890274047852,
"num_input_tokens_seen": 888141444,
"step": 4430,
"token_acc": 0.7539804724713297
},
{
"epoch": 1.0420118759828196,
"grad_norm": 1.8502910487817004,
"learning_rate": 9.204969780923403e-07,
"loss": 0.9087862968444824,
"num_input_tokens_seen": 890115771,
"step": 4440,
"token_acc": 0.7559308727674652
},
{
"epoch": 1.0443589081606308,
"grad_norm": 5.223223230980478,
"learning_rate": 9.163221566676847e-07,
"loss": 0.9071809768676757,
"num_input_tokens_seen": 892098426,
"step": 4450,
"token_acc": 0.7547434701771973
},
{
"epoch": 1.046705940338442,
"grad_norm": 1.5951294272531664,
"learning_rate": 9.121488034492568e-07,
"loss": 0.9115602493286132,
"num_input_tokens_seen": 894150594,
"step": 4460,
"token_acc": 0.7560878381891606
},
{
"epoch": 1.0490529725162532,
"grad_norm": 24.227203178087926,
"learning_rate": 9.079769916624529e-07,
"loss": 0.8929647445678711,
"num_input_tokens_seen": 896182068,
"step": 4470,
"token_acc": 0.7569376280966494
},
{
"epoch": 1.0514000046940644,
"grad_norm": 4.446148288911931,
"learning_rate": 9.038067945056227e-07,
"loss": 0.8845357894897461,
"num_input_tokens_seen": 898144740,
"step": 4480,
"token_acc": 0.7596217335121099
},
{
"epoch": 1.0537470368718755,
"grad_norm": 2.33113822520666,
"learning_rate": 8.996382851487849e-07,
"loss": 0.9204854011535645,
"num_input_tokens_seen": 900153033,
"step": 4490,
"token_acc": 0.7531009457228544
},
{
"epoch": 1.0560940690496867,
"grad_norm": 1.6705258835681585,
"learning_rate": 8.954715367323466e-07,
"loss": 0.9108184814453125,
"num_input_tokens_seen": 902159874,
"step": 4500,
"token_acc": 0.7534851198704926
},
{
"epoch": 1.0560940690496867,
"eval_loss": 0.9722611308097839,
"eval_runtime": 32.6343,
"eval_samples_per_second": 30.643,
"eval_steps_per_second": 1.287,
"eval_token_acc": 0.7414879619584035,
"num_input_tokens_seen": 902159874,
"step": 4500
},
{
"epoch": 1.058441101227498,
"grad_norm": 1.814968079519632,
"learning_rate": 8.91306622365815e-07,
"loss": 0.9042104721069336,
"num_input_tokens_seen": 904127259,
"step": 4510,
"token_acc": 0.7549473429720114
},
{
"epoch": 1.060788133405309,
"grad_norm": 1.9598731265622114,
"learning_rate": 8.871436151265182e-07,
"loss": 0.9021028518676758,
"num_input_tokens_seen": 906131709,
"step": 4520,
"token_acc": 0.7555155495065009
},
{
"epoch": 1.06313516558312,
"grad_norm": 3.5546689619235106,
"learning_rate": 8.829825880583226e-07,
"loss": 0.8736377716064453,
"num_input_tokens_seen": 908144946,
"step": 4530,
"token_acc": 0.7615734862488263
},
{
"epoch": 1.0654821977609312,
"grad_norm": 3.1846241923818295,
"learning_rate": 8.788236141703497e-07,
"loss": 0.9034311294555664,
"num_input_tokens_seen": 910148658,
"step": 4540,
"token_acc": 0.7564678744009387
},
{
"epoch": 1.0678292299387424,
"grad_norm": 2.027265382942688,
"learning_rate": 8.746667664356955e-07,
"loss": 0.9266244888305664,
"num_input_tokens_seen": 912148779,
"step": 4550,
"token_acc": 0.7503857571491999
},
{
"epoch": 1.0701762621165536,
"grad_norm": 1.7499276338815972,
"learning_rate": 8.705121177901531e-07,
"loss": 0.900362205505371,
"num_input_tokens_seen": 914157060,
"step": 4560,
"token_acc": 0.757182167972395
},
{
"epoch": 1.0725232942943648,
"grad_norm": 2.8471968306459092,
"learning_rate": 8.663597411309278e-07,
"loss": 0.8963720321655273,
"num_input_tokens_seen": 916145403,
"step": 4570,
"token_acc": 0.7560617462222132
},
{
"epoch": 1.074870326472176,
"grad_norm": 1.6540494435074347,
"learning_rate": 8.62209709315362e-07,
"loss": 0.9004743576049805,
"num_input_tokens_seen": 918115113,
"step": 4580,
"token_acc": 0.7545025247249607
},
{
"epoch": 1.0772173586499871,
"grad_norm": 2.057030263327695,
"learning_rate": 8.580620951596556e-07,
"loss": 0.9495843887329102,
"num_input_tokens_seen": 920159124,
"step": 4590,
"token_acc": 0.7448036906164115
},
{
"epoch": 1.0795643908277983,
"grad_norm": 1.7066770272878358,
"learning_rate": 8.539169714375885e-07,
"loss": 0.9105659484863281,
"num_input_tokens_seen": 922121547,
"step": 4600,
"token_acc": 0.7536738054675078
},
{
"epoch": 1.0795643908277983,
"eval_loss": 0.9716529250144958,
"eval_runtime": 32.5395,
"eval_samples_per_second": 30.732,
"eval_steps_per_second": 1.291,
"eval_token_acc": 0.7416818632995544,
"num_input_tokens_seen": 922121547,
"step": 4600
},
{
"epoch": 1.0819114230056095,
"grad_norm": 1.9597668178542205,
"learning_rate": 8.497744108792429e-07,
"loss": 0.8963167190551757,
"num_input_tokens_seen": 924093546,
"step": 4610,
"token_acc": 0.7577693693987556
},
{
"epoch": 1.0842584551834205,
"grad_norm": 1.477104530901047,
"learning_rate": 8.456344861697287e-07,
"loss": 0.9177652359008789,
"num_input_tokens_seen": 926103639,
"step": 4620,
"token_acc": 0.7516901953627176
},
{
"epoch": 1.0866054873612316,
"grad_norm": 1.8830008370086135,
"learning_rate": 8.414972699479075e-07,
"loss": 0.9002264022827149,
"num_input_tokens_seen": 928135683,
"step": 4630,
"token_acc": 0.7559382042427807
},
{
"epoch": 1.0889525195390428,
"grad_norm": 3.016423460140028,
"learning_rate": 8.373628348051163e-07,
"loss": 0.8956707000732422,
"num_input_tokens_seen": 930127536,
"step": 4640,
"token_acc": 0.7571149500895269
},
{
"epoch": 1.091299551716854,
"grad_norm": 1.57022279289949,
"learning_rate": 8.332312532838978e-07,
"loss": 0.9269239425659179,
"num_input_tokens_seen": 932125299,
"step": 4650,
"token_acc": 0.7517471473920727
},
{
"epoch": 1.0936465838946652,
"grad_norm": 3.5134027190857435,
"learning_rate": 8.291025978767234e-07,
"loss": 0.9176504135131835,
"num_input_tokens_seen": 934168311,
"step": 4660,
"token_acc": 0.7548118730939853
},
{
"epoch": 1.0959936160724764,
"grad_norm": 2.5211326313148623,
"learning_rate": 8.249769410247238e-07,
"loss": 0.9234855651855469,
"num_input_tokens_seen": 936133608,
"step": 4670,
"token_acc": 0.7515400792838399
},
{
"epoch": 1.0983406482502875,
"grad_norm": 2.572125880008109,
"learning_rate": 8.208543551164177e-07,
"loss": 0.8986695289611817,
"num_input_tokens_seen": 938147853,
"step": 4680,
"token_acc": 0.7556977694823225
},
{
"epoch": 1.1006876804280987,
"grad_norm": 2.988789824663344,
"learning_rate": 8.167349124864404e-07,
"loss": 0.9072399139404297,
"num_input_tokens_seen": 940144569,
"step": 4690,
"token_acc": 0.7530836929897347
},
{
"epoch": 1.10303471260591,
"grad_norm": 1.6468695088048304,
"learning_rate": 8.126186854142751e-07,
"loss": 0.9020254135131835,
"num_input_tokens_seen": 942165525,
"step": 4700,
"token_acc": 0.7548501978958501
},
{
"epoch": 1.10303471260591,
"eval_loss": 0.9701104164123535,
"eval_runtime": 33.0994,
"eval_samples_per_second": 30.212,
"eval_steps_per_second": 1.269,
"eval_token_acc": 0.7415941460261767,
"num_input_tokens_seen": 942165525,
"step": 4700
},
{
"epoch": 1.105381744783721,
"grad_norm": 1.6564712470148706,
"learning_rate": 8.08505746122987e-07,
"loss": 0.8915030479431152,
"num_input_tokens_seen": 944177469,
"step": 4710,
"token_acc": 0.7565814201146365
},
{
"epoch": 1.107728776961532,
"grad_norm": 2.791755191613104,
"learning_rate": 8.043961667779518e-07,
"loss": 0.9122766494750977,
"num_input_tokens_seen": 946234932,
"step": 4720,
"token_acc": 0.7535114631778791
},
{
"epoch": 1.1100758091393432,
"grad_norm": 1.6738087861309878,
"learning_rate": 8.002900194855931e-07,
"loss": 0.9000448226928711,
"num_input_tokens_seen": 948228513,
"step": 4730,
"token_acc": 0.7559363093706895
},
{
"epoch": 1.1124228413171544,
"grad_norm": 1.5535937671654965,
"learning_rate": 7.961873762921151e-07,
"loss": 0.9070523262023926,
"num_input_tokens_seen": 950332011,
"step": 4740,
"token_acc": 0.7553185494918014
},
{
"epoch": 1.1147698734949656,
"grad_norm": 2.301542689211403,
"learning_rate": 7.920883091822408e-07,
"loss": 0.90597505569458,
"num_input_tokens_seen": 952319049,
"step": 4750,
"token_acc": 0.7548275049458286
},
{
"epoch": 1.1171169056727768,
"grad_norm": 1.7473797104994677,
"learning_rate": 7.879928900779455e-07,
"loss": 0.9030384063720703,
"num_input_tokens_seen": 954299892,
"step": 4760,
"token_acc": 0.756532667257456
},
{
"epoch": 1.119463937850588,
"grad_norm": 2.558847573037429,
"learning_rate": 7.839011908371979e-07,
"loss": 0.9100503921508789,
"num_input_tokens_seen": 956318847,
"step": 4770,
"token_acc": 0.7527636165796845
},
{
"epoch": 1.1218109700283991,
"grad_norm": 1.9894868553546619,
"learning_rate": 7.798132832526985e-07,
"loss": 0.8903913497924805,
"num_input_tokens_seen": 958308174,
"step": 4780,
"token_acc": 0.7594328320061341
},
{
"epoch": 1.1241580022062103,
"grad_norm": 1.9090250979917347,
"learning_rate": 7.757292390506189e-07,
"loss": 0.9077445983886718,
"num_input_tokens_seen": 960311976,
"step": 4790,
"token_acc": 0.7563037639640341
},
{
"epoch": 1.1265050343840215,
"grad_norm": 1.5195604142033567,
"learning_rate": 7.716491298893441e-07,
"loss": 0.9030027389526367,
"num_input_tokens_seen": 962312673,
"step": 4800,
"token_acc": 0.7546611261686987
},
{
"epoch": 1.1265050343840215,
"eval_loss": 0.9690244197845459,
"eval_runtime": 32.6363,
"eval_samples_per_second": 30.641,
"eval_steps_per_second": 1.287,
"eval_token_acc": 0.7421065995706471,
"num_input_tokens_seen": 962312673,
"step": 4800
},
{
"epoch": 1.1288520665618327,
"grad_norm": 7.06114138514342,
"learning_rate": 7.675730273582159e-07,
"loss": 0.9238859176635742,
"num_input_tokens_seen": 964266690,
"step": 4810,
"token_acc": 0.7510988303005139
},
{
"epoch": 1.1311990987396436,
"grad_norm": 1.9887377640273287,
"learning_rate": 7.635010029762755e-07,
"loss": 0.893895149230957,
"num_input_tokens_seen": 966243534,
"step": 4820,
"token_acc": 0.7578514127725531
},
{
"epoch": 1.1335461309174548,
"grad_norm": 2.8072244352137545,
"learning_rate": 7.594331281910081e-07,
"loss": 0.8709514617919922,
"num_input_tokens_seen": 968205627,
"step": 4830,
"token_acc": 0.7630826790971541
},
{
"epoch": 1.135893163095266,
"grad_norm": 1.5697632247100872,
"learning_rate": 7.553694743770927e-07,
"loss": 0.8988607406616211,
"num_input_tokens_seen": 970177137,
"step": 4840,
"token_acc": 0.7561233380663482
},
{
"epoch": 1.1382401952730772,
"grad_norm": 2.446099829583827,
"learning_rate": 7.513101128351453e-07,
"loss": 0.9138158798217774,
"num_input_tokens_seen": 972139821,
"step": 4850,
"token_acc": 0.7539762326169406
},
{
"epoch": 1.1405872274508884,
"grad_norm": 2.2189495017577103,
"learning_rate": 7.472551147904707e-07,
"loss": 0.9274373054504395,
"num_input_tokens_seen": 974155848,
"step": 4860,
"token_acc": 0.750778398745103
},
{
"epoch": 1.1429342596286995,
"grad_norm": 1.4873269538334397,
"learning_rate": 7.432045513918122e-07,
"loss": 0.8865886688232422,
"num_input_tokens_seen": 976121469,
"step": 4870,
"token_acc": 0.7581827865316892
},
{
"epoch": 1.1452812918065107,
"grad_norm": 1.7178629684971727,
"learning_rate": 7.391584937101033e-07,
"loss": 0.9193226814270019,
"num_input_tokens_seen": 978125502,
"step": 4880,
"token_acc": 0.7524842758549445
},
{
"epoch": 1.147628323984322,
"grad_norm": 1.7659656442538727,
"learning_rate": 7.351170127372191e-07,
"loss": 0.8870782852172852,
"num_input_tokens_seen": 980151348,
"step": 4890,
"token_acc": 0.7591273127875505
},
{
"epoch": 1.149975356162133,
"grad_norm": 2.512190986249406,
"learning_rate": 7.310801793847343e-07,
"loss": 0.9009071350097656,
"num_input_tokens_seen": 982116819,
"step": 4900,
"token_acc": 0.7555736532655332
},
{
"epoch": 1.149975356162133,
"eval_loss": 0.967960000038147,
"eval_runtime": 33.0751,
"eval_samples_per_second": 30.234,
"eval_steps_per_second": 1.27,
"eval_token_acc": 0.7421296830636412,
"num_input_tokens_seen": 982116819,
"step": 4900
},
{
"epoch": 1.152322388339944,
"grad_norm": 1.6304789397632498,
"learning_rate": 7.270480644826749e-07,
"loss": 0.9345785140991211,
"num_input_tokens_seen": 984113586,
"step": 4910,
"token_acc": 0.7481138414862325
},
{
"epoch": 1.1546694205177552,
"grad_norm": 1.6773663128682315,
"learning_rate": 7.230207387782776e-07,
"loss": 0.9058225631713868,
"num_input_tokens_seen": 986134590,
"step": 4920,
"token_acc": 0.7572044267504292
},
{
"epoch": 1.1570164526955664,
"grad_norm": 2.467806440031501,
"learning_rate": 7.18998272934749e-07,
"loss": 0.905792236328125,
"num_input_tokens_seen": 988128006,
"step": 4930,
"token_acc": 0.7552128911554362
},
{
"epoch": 1.1593634848733776,
"grad_norm": 2.3456217936104613,
"learning_rate": 7.149807375300238e-07,
"loss": 0.8924792289733887,
"num_input_tokens_seen": 990097689,
"step": 4940,
"token_acc": 0.7572508060847032
},
{
"epoch": 1.1617105170511888,
"grad_norm": 2.059131762842591,
"learning_rate": 7.109682030555282e-07,
"loss": 0.8982337951660156,
"num_input_tokens_seen": 992129379,
"step": 4950,
"token_acc": 0.7551930966690015
},
{
"epoch": 1.164057549229,
"grad_norm": 2.9573480896222772,
"learning_rate": 7.069607399149426e-07,
"loss": 0.8968988418579101,
"num_input_tokens_seen": 994140366,
"step": 4960,
"token_acc": 0.7568408887934138
},
{
"epoch": 1.1664045814068111,
"grad_norm": 1.7230957488152536,
"learning_rate": 7.029584184229652e-07,
"loss": 0.909503173828125,
"num_input_tokens_seen": 996159930,
"step": 4970,
"token_acc": 0.7549473717210192
},
{
"epoch": 1.1687516135846223,
"grad_norm": 1.7012209659002009,
"learning_rate": 6.989613088040795e-07,
"loss": 0.8788484573364258,
"num_input_tokens_seen": 998200734,
"step": 4980,
"token_acc": 0.7586579539038453
},
{
"epoch": 1.1710986457624335,
"grad_norm": 1.592891016055058,
"learning_rate": 6.949694811913225e-07,
"loss": 0.9113107681274414,
"num_input_tokens_seen": 1000131159,
"step": 4990,
"token_acc": 0.7557149987259054
},
{
"epoch": 1.1734456779402445,
"grad_norm": 5.935924197471871,
"learning_rate": 6.909830056250526e-07,
"loss": 0.900279426574707,
"num_input_tokens_seen": 1002152949,
"step": 5000,
"token_acc": 0.7555415584180373
},
{
"epoch": 1.1734456779402445,
"eval_loss": 0.9671830534934998,
"eval_runtime": 33.371,
"eval_samples_per_second": 29.966,
"eval_steps_per_second": 1.259,
"eval_token_acc": 0.7425636527319314,
"num_input_tokens_seen": 1002152949,
"step": 5000
},
{
"epoch": 1.1757927101180556,
"grad_norm": 2.200894548140831,
"learning_rate": 6.870019520517217e-07,
"loss": 0.8960202217102051,
"num_input_tokens_seen": 1004157984,
"step": 5010,
"token_acc": 0.7569971090628078
},
{
"epoch": 1.1781397422958668,
"grad_norm": 1.566572723585561,
"learning_rate": 6.830263903226482e-07,
"loss": 0.9069774627685547,
"num_input_tokens_seen": 1006144677,
"step": 5020,
"token_acc": 0.7552951138157661
},
{
"epoch": 1.180486774473678,
"grad_norm": 2.012794050429991,
"learning_rate": 6.790563901927906e-07,
"loss": 0.903378677368164,
"num_input_tokens_seen": 1008183480,
"step": 5030,
"token_acc": 0.7542571237096386
},
{
"epoch": 1.1828338066514892,
"grad_norm": 2.6190444654182663,
"learning_rate": 6.750920213195237e-07,
"loss": 0.9192432403564453,
"num_input_tokens_seen": 1010200815,
"step": 5040,
"token_acc": 0.752674829722257
},
{
"epoch": 1.1851808388293004,
"grad_norm": 1.876294751139499,
"learning_rate": 6.711333532614167e-07,
"loss": 0.8876149177551269,
"num_input_tokens_seen": 1012244334,
"step": 5050,
"token_acc": 0.7581334816982072
},
{
"epoch": 1.1875278710071115,
"grad_norm": 3.023714292115771,
"learning_rate": 6.671804554770134e-07,
"loss": 0.9129764556884765,
"num_input_tokens_seen": 1014307173,
"step": 5060,
"token_acc": 0.7553209579424762
},
{
"epoch": 1.1898749031849227,
"grad_norm": 1.9132860678026469,
"learning_rate": 6.63233397323612e-07,
"loss": 0.9299371719360352,
"num_input_tokens_seen": 1016348544,
"step": 5070,
"token_acc": 0.7510845945047212
},
{
"epoch": 1.192221935362734,
"grad_norm": 1.7646493320200434,
"learning_rate": 6.592922480560483e-07,
"loss": 0.8976167678833008,
"num_input_tokens_seen": 1018332171,
"step": 5080,
"token_acc": 0.7562631418499035
},
{
"epoch": 1.1945689675405449,
"grad_norm": 1.6538220495495426,
"learning_rate": 6.55357076825483e-07,
"loss": 0.9083082199096679,
"num_input_tokens_seen": 1020317589,
"step": 5090,
"token_acc": 0.7535232253620915
},
{
"epoch": 1.196915999718356,
"grad_norm": 1.8373787795166967,
"learning_rate": 6.51427952678185e-07,
"loss": 0.897801399230957,
"num_input_tokens_seen": 1022291424,
"step": 5100,
"token_acc": 0.7568812436238018
},
{
"epoch": 1.196915999718356,
"eval_loss": 0.965643048286438,
"eval_runtime": 32.457,
"eval_samples_per_second": 30.81,
"eval_steps_per_second": 1.294,
"eval_token_acc": 0.7426790701969022,
"num_input_tokens_seen": 1022291424,
"step": 5100
},
{
"epoch": 1.1992630318961672,
"grad_norm": 1.6643922341831798,
"learning_rate": 6.475049445543214e-07,
"loss": 0.8832623481750488,
"num_input_tokens_seen": 1024326642,
"step": 5110,
"token_acc": 0.7609418407772097
},
{
"epoch": 1.2016100640739784,
"grad_norm": 2.8760528519429527,
"learning_rate": 6.435881212867493e-07,
"loss": 0.8896665573120117,
"num_input_tokens_seen": 1026358665,
"step": 5120,
"token_acc": 0.7582770940849544
},
{
"epoch": 1.2039570962517896,
"grad_norm": 2.002315720555266,
"learning_rate": 6.396775515998054e-07,
"loss": 0.9143696784973144,
"num_input_tokens_seen": 1028363571,
"step": 5130,
"token_acc": 0.7524985799614379
},
{
"epoch": 1.2063041284296008,
"grad_norm": 2.371576045666034,
"learning_rate": 6.357733041081017e-07,
"loss": 0.9304786682128906,
"num_input_tokens_seen": 1030342941,
"step": 5140,
"token_acc": 0.7486818472638695
},
{
"epoch": 1.208651160607412,
"grad_norm": 2.346943055260075,
"learning_rate": 6.31875447315322e-07,
"loss": 0.9241456031799317,
"num_input_tokens_seen": 1032378225,
"step": 5150,
"token_acc": 0.7510933676127989
},
{
"epoch": 1.2109981927852231,
"grad_norm": 2.231488980986392,
"learning_rate": 6.279840496130188e-07,
"loss": 0.9039559364318848,
"num_input_tokens_seen": 1034346864,
"step": 5160,
"token_acc": 0.7524411349410404
},
{
"epoch": 1.2133452249630343,
"grad_norm": 1.9646213179831136,
"learning_rate": 6.240991792794133e-07,
"loss": 0.9074276924133301,
"num_input_tokens_seen": 1036368729,
"step": 5170,
"token_acc": 0.7546195549754318
},
{
"epoch": 1.2156922571408453,
"grad_norm": 1.722457316805155,
"learning_rate": 6.202209044781989e-07,
"loss": 0.8936328887939453,
"num_input_tokens_seen": 1038356424,
"step": 5180,
"token_acc": 0.7567584358948151
},
{
"epoch": 1.2180392893186567,
"grad_norm": 3.480235780891435,
"learning_rate": 6.163492932573438e-07,
"loss": 0.8924088478088379,
"num_input_tokens_seen": 1040404614,
"step": 5190,
"token_acc": 0.759963029202667
},
{
"epoch": 1.2203863214964676,
"grad_norm": 3.7980987371120305,
"learning_rate": 6.124844135478971e-07,
"loss": 0.9037540435791016,
"num_input_tokens_seen": 1042409814,
"step": 5200,
"token_acc": 0.7544269749931005
},
{
"epoch": 1.2203863214964676,
"eval_loss": 0.9651933908462524,
"eval_runtime": 32.4721,
"eval_samples_per_second": 30.796,
"eval_steps_per_second": 1.293,
"eval_token_acc": 0.7432146072343667,
"num_input_tokens_seen": 1042409814,
"step": 5200
},
{
"epoch": 1.2227333536742788,
"grad_norm": 2.0301844609252324,
"learning_rate": 6.086263331627975e-07,
"loss": 0.8960711479187011,
"num_input_tokens_seen": 1044474747,
"step": 5210,
"token_acc": 0.7566766133085695
},
{
"epoch": 1.22508038585209,
"grad_norm": 2.006861134477907,
"learning_rate": 6.047751197956838e-07,
"loss": 0.8874652862548829,
"num_input_tokens_seen": 1046542701,
"step": 5220,
"token_acc": 0.7577207817130738
},
{
"epoch": 1.2274274180299012,
"grad_norm": 1.64084154179337,
"learning_rate": 6.009308410197047e-07,
"loss": 0.9375964164733886,
"num_input_tokens_seen": 1048531923,
"step": 5230,
"token_acc": 0.7477447658832623
},
{
"epoch": 1.2297744502077124,
"grad_norm": 2.376806108677906,
"learning_rate": 5.970935642863374e-07,
"loss": 0.9305553436279297,
"num_input_tokens_seen": 1050497172,
"step": 5240,
"token_acc": 0.7477491309741687
},
{
"epoch": 1.2321214823855235,
"grad_norm": 2.0017133938943603,
"learning_rate": 5.932633569241999e-07,
"loss": 0.9117889404296875,
"num_input_tokens_seen": 1052489067,
"step": 5250,
"token_acc": 0.7528568241041047
},
{
"epoch": 1.2344685145633347,
"grad_norm": 1.676786348660199,
"learning_rate": 5.89440286137872e-07,
"loss": 0.9003104209899903,
"num_input_tokens_seen": 1054479834,
"step": 5260,
"token_acc": 0.7555148409000024
},
{
"epoch": 1.236815546741146,
"grad_norm": 3.0440164850905087,
"learning_rate": 5.856244190067159e-07,
"loss": 0.9047473907470703,
"num_input_tokens_seen": 1056426330,
"step": 5270,
"token_acc": 0.755049574664931
},
{
"epoch": 1.239162578918957,
"grad_norm": 2.869133561984615,
"learning_rate": 5.818158224836987e-07,
"loss": 0.9154601097106934,
"num_input_tokens_seen": 1058453490,
"step": 5280,
"token_acc": 0.7520037800567009
},
{
"epoch": 1.241509611096768,
"grad_norm": 3.801710317044165,
"learning_rate": 5.780145633942173e-07,
"loss": 0.9164340972900391,
"num_input_tokens_seen": 1060486977,
"step": 5290,
"token_acc": 0.752695566601707
},
{
"epoch": 1.2438566432745792,
"grad_norm": 2.741349679065458,
"learning_rate": 5.742207084349273e-07,
"loss": 0.871244239807129,
"num_input_tokens_seen": 1062507609,
"step": 5300,
"token_acc": 0.7623417495900512
},
{
"epoch": 1.2438566432745792,
"eval_loss": 0.9639586210250854,
"eval_runtime": 32.2213,
"eval_samples_per_second": 31.035,
"eval_steps_per_second": 1.303,
"eval_token_acc": 0.7430853396735994,
"num_input_tokens_seen": 1062507609,
"step": 5300
},
{
"epoch": 1.2462036754523904,
"grad_norm": 2.0632482933314273,
"learning_rate": 5.704343241725719e-07,
"loss": 0.902606201171875,
"num_input_tokens_seen": 1064565387,
"step": 5310,
"token_acc": 0.7573666940890565
},
{
"epoch": 1.2485507076302016,
"grad_norm": 2.5206259888398805,
"learning_rate": 5.666554770428128e-07,
"loss": 0.8999618530273438,
"num_input_tokens_seen": 1066547697,
"step": 5320,
"token_acc": 0.7568080644124454
},
{
"epoch": 1.2508977398080128,
"grad_norm": 2.5949843975238664,
"learning_rate": 5.628842333490673e-07,
"loss": 0.9164423942565918,
"num_input_tokens_seen": 1068581145,
"step": 5330,
"token_acc": 0.7550268878909339
},
{
"epoch": 1.253244771985824,
"grad_norm": 4.036566885568054,
"learning_rate": 5.591206592613416e-07,
"loss": 0.905246353149414,
"num_input_tokens_seen": 1070601372,
"step": 5340,
"token_acc": 0.7552413610147676
},
{
"epoch": 1.2555918041636351,
"grad_norm": 5.474286064221549,
"learning_rate": 5.553648208150728e-07,
"loss": 0.8880559921264648,
"num_input_tokens_seen": 1072560906,
"step": 5350,
"token_acc": 0.7592060617200068
},
{
"epoch": 1.2579388363414463,
"grad_norm": 1.7453040114014564,
"learning_rate": 5.51616783909968e-07,
"loss": 0.9003293991088868,
"num_input_tokens_seen": 1074501144,
"step": 5360,
"token_acc": 0.7574462673279918
},
{
"epoch": 1.2602858685192575,
"grad_norm": 2.437893460638386,
"learning_rate": 5.478766143088491e-07,
"loss": 0.8865642547607422,
"num_input_tokens_seen": 1076535018,
"step": 5370,
"token_acc": 0.7606810169616077
},
{
"epoch": 1.2626329006970685,
"grad_norm": 1.8609894370837823,
"learning_rate": 5.441443776365002e-07,
"loss": 0.8910144805908203,
"num_input_tokens_seen": 1078579935,
"step": 5380,
"token_acc": 0.7576510815314375
},
{
"epoch": 1.2649799328748796,
"grad_norm": 2.6436944735193184,
"learning_rate": 5.404201393785122e-07,
"loss": 0.8772344589233398,
"num_input_tokens_seen": 1080564321,
"step": 5390,
"token_acc": 0.7608925444457297
},
{
"epoch": 1.2673269650526908,
"grad_norm": 2.992758801643484,
"learning_rate": 5.367039648801385e-07,
"loss": 0.9159189224243164,
"num_input_tokens_seen": 1082533953,
"step": 5400,
"token_acc": 0.7533061633594679
},
{
"epoch": 1.2673269650526908,
"eval_loss": 0.9630009531974792,
"eval_runtime": 32.5066,
"eval_samples_per_second": 30.763,
"eval_steps_per_second": 1.292,
"eval_token_acc": 0.7429468387156345,
"num_input_tokens_seen": 1082533953,
"step": 5400
},
{
"epoch": 1.269673997230502,
"grad_norm": 2.692503141737527,
"learning_rate": 5.329959193451448e-07,
"loss": 0.8941567420959473,
"num_input_tokens_seen": 1084574751,
"step": 5410,
"token_acc": 0.7571006112607204
},
{
"epoch": 1.2720210294083132,
"grad_norm": 1.5669484406824739,
"learning_rate": 5.292960678346674e-07,
"loss": 0.8758008003234863,
"num_input_tokens_seen": 1086604491,
"step": 5420,
"token_acc": 0.7609121373438931
},
{
"epoch": 1.2743680615861244,
"grad_norm": 2.7196923900884538,
"learning_rate": 5.256044752660709e-07,
"loss": 0.8903736114501953,
"num_input_tokens_seen": 1088619414,
"step": 5430,
"token_acc": 0.7592087326109695
},
{
"epoch": 1.2767150937639355,
"grad_norm": 3.3252231876281044,
"learning_rate": 5.219212064118078e-07,
"loss": 0.8977795600891113,
"num_input_tokens_seen": 1090588407,
"step": 5440,
"token_acc": 0.7549231473500579
},
{
"epoch": 1.2790621259417467,
"grad_norm": 2.7540341423324115,
"learning_rate": 5.182463258982846e-07,
"loss": 0.9006612777709961,
"num_input_tokens_seen": 1092638625,
"step": 5450,
"token_acc": 0.7552658524098589
},
{
"epoch": 1.281409158119558,
"grad_norm": 3.5072503422513153,
"learning_rate": 5.14579898204726e-07,
"loss": 0.907337760925293,
"num_input_tokens_seen": 1094630577,
"step": 5460,
"token_acc": 0.7542059011906609
},
{
"epoch": 1.2837561902973689,
"grad_norm": 5.240311266290964,
"learning_rate": 5.109219876620441e-07,
"loss": 0.8758956909179687,
"num_input_tokens_seen": 1096660965,
"step": 5470,
"token_acc": 0.7625046517718082
},
{
"epoch": 1.28610322247518,
"grad_norm": 3.910915359433382,
"learning_rate": 5.072726584517085e-07,
"loss": 0.8722602844238281,
"num_input_tokens_seen": 1098640854,
"step": 5480,
"token_acc": 0.7603257317050821
},
{
"epoch": 1.2884502546529912,
"grad_norm": 1.6770275314193752,
"learning_rate": 5.036319746046231e-07,
"loss": 0.8983705520629883,
"num_input_tokens_seen": 1100637150,
"step": 5490,
"token_acc": 0.7550046700338503
},
{
"epoch": 1.2907972868308024,
"grad_norm": 2.5881524894297745,
"learning_rate": 5.000000000000002e-07,
"loss": 0.894923210144043,
"num_input_tokens_seen": 1102652097,
"step": 5500,
"token_acc": 0.7564139373070671
},
{
"epoch": 1.2907972868308024,
"eval_loss": 0.9622647762298584,
"eval_runtime": 32.3358,
"eval_samples_per_second": 30.925,
"eval_steps_per_second": 1.299,
"eval_token_acc": 0.7433207913021398,
"num_input_tokens_seen": 1102652097,
"step": 5500
},
{
"epoch": 1.2931443190086136,
"grad_norm": 2.4458432348948125,
"learning_rate": 4.963767983642391e-07,
"loss": 0.9156219482421875,
"num_input_tokens_seen": 1104675948,
"step": 5510,
"token_acc": 0.7537255650881494
},
{
"epoch": 1.2954913511864248,
"grad_norm": 1.6909965435703207,
"learning_rate": 4.927624332698109e-07,
"loss": 0.8871401786804199,
"num_input_tokens_seen": 1106680473,
"step": 5520,
"token_acc": 0.7581608722152928
},
{
"epoch": 1.297838383364236,
"grad_norm": 2.752441639954046,
"learning_rate": 4.891569681341402e-07,
"loss": 0.8774595260620117,
"num_input_tokens_seen": 1108675587,
"step": 5530,
"token_acc": 0.7608597953994441
},
{
"epoch": 1.3001854155420471,
"grad_norm": 4.312103259940411,
"learning_rate": 4.855604662184934e-07,
"loss": 0.94571533203125,
"num_input_tokens_seen": 1110676452,
"step": 5540,
"token_acc": 0.7557507607034466
},
{
"epoch": 1.3025324477198583,
"grad_norm": 15.655690028463368,
"learning_rate": 4.819729906268699e-07,
"loss": 0.906065559387207,
"num_input_tokens_seen": 1112710338,
"step": 5550,
"token_acc": 0.7553128935752625
},
{
"epoch": 1.3048794798976693,
"grad_norm": 4.05493729865088,
"learning_rate": 4.783946043048922e-07,
"loss": 0.8648593902587891,
"num_input_tokens_seen": 1114786149,
"step": 5560,
"token_acc": 0.763232807351506
},
{
"epoch": 1.3072265120754807,
"grad_norm": 18.132742646186717,
"learning_rate": 4.748253700387042e-07,
"loss": 0.9057920455932618,
"num_input_tokens_seen": 1116792414,
"step": 5570,
"token_acc": 0.7558468058389578
},
{
"epoch": 1.3095735442532916,
"grad_norm": 4.473293823942013,
"learning_rate": 4.712653504538683e-07,
"loss": 0.9168581008911133,
"num_input_tokens_seen": 1118755668,
"step": 5580,
"token_acc": 0.7533578569509507
},
{
"epoch": 1.3119205764311028,
"grad_norm": 1.8718331058830788,
"learning_rate": 4.677146080142663e-07,
"loss": 0.8930509567260743,
"num_input_tokens_seen": 1120786350,
"step": 5590,
"token_acc": 0.7578146339884224
},
{
"epoch": 1.314267608608914,
"grad_norm": 3.8006137217544853,
"learning_rate": 4.641732050210031e-07,
"loss": 0.8965305328369141,
"num_input_tokens_seen": 1122830253,
"step": 5600,
"token_acc": 0.757193734996655
},
{
"epoch": 1.314267608608914,
"eval_loss": 0.9616973996162415,
"eval_runtime": 32.7101,
"eval_samples_per_second": 30.572,
"eval_steps_per_second": 1.284,
"eval_token_acc": 0.7441794972415225,
"num_input_tokens_seen": 1122830253,
"step": 5600
},
{
"epoch": 1.3166146407867252,
"grad_norm": 2.1310327327750103,
"learning_rate": 4.6064120361131654e-07,
"loss": 0.8685415267944336,
"num_input_tokens_seen": 1124770431,
"step": 5610,
"token_acc": 0.7614362220849722
},
{
"epoch": 1.3189616729645364,
"grad_norm": 5.594205953222117,
"learning_rate": 4.571186657574827e-07,
"loss": 0.8749109268188476,
"num_input_tokens_seen": 1126803909,
"step": 5620,
"token_acc": 0.7609511594419571
},
{
"epoch": 1.3213087051423476,
"grad_norm": 1.4907604209575505,
"learning_rate": 4.5360565326573097e-07,
"loss": 0.8923271179199219,
"num_input_tokens_seen": 1128846693,
"step": 5630,
"token_acc": 0.7566236892264636
},
{
"epoch": 1.3236557373201587,
"grad_norm": 1.6487071255049761,
"learning_rate": 4.5010222777516016e-07,
"loss": 0.8908859252929687,
"num_input_tokens_seen": 1130851539,
"step": 5640,
"token_acc": 0.7570941516923059
},
{
"epoch": 1.3260027694979697,
"grad_norm": 3.588061851379213,
"learning_rate": 4.46608450756656e-07,
"loss": 0.8966587066650391,
"num_input_tokens_seen": 1132815081,
"step": 5650,
"token_acc": 0.7556865728413845
},
{
"epoch": 1.328349801675781,
"grad_norm": 1.8146830930493871,
"learning_rate": 4.431243835118124e-07,
"loss": 0.8989040374755859,
"num_input_tokens_seen": 1134802443,
"step": 5660,
"token_acc": 0.7558611844953211
},
{
"epoch": 1.330696833853592,
"grad_norm": 1.549860770891768,
"learning_rate": 4.3965008717185546e-07,
"loss": 0.9029041290283203,
"num_input_tokens_seen": 1136825982,
"step": 5670,
"token_acc": 0.7547953414140695
},
{
"epoch": 1.3330438660314032,
"grad_norm": 8.564808279417944,
"learning_rate": 4.361856226965732e-07,
"loss": 0.9094319343566895,
"num_input_tokens_seen": 1138844418,
"step": 5680,
"token_acc": 0.7534089471178856
},
{
"epoch": 1.3353908982092144,
"grad_norm": 2.4122342846590117,
"learning_rate": 4.327310508732437e-07,
"loss": 0.9330079078674316,
"num_input_tokens_seen": 1140865437,
"step": 5690,
"token_acc": 0.7480073371962428
},
{
"epoch": 1.3377379303870256,
"grad_norm": 2.222842201988777,
"learning_rate": 4.292864323155684e-07,
"loss": 0.9154201507568359,
"num_input_tokens_seen": 1142840739,
"step": 5700,
"token_acc": 0.7531476710355994
},
{
"epoch": 1.3377379303870256,
"eval_loss": 0.9612286686897278,
"eval_runtime": 32.3029,
"eval_samples_per_second": 30.957,
"eval_steps_per_second": 1.3,
"eval_token_acc": 0.7439994459961682,
"num_input_tokens_seen": 1142840739,
"step": 5700
},
{
"epoch": 1.3400849625648368,
"grad_norm": 1.810955978874136,
"learning_rate": 4.258518274626103e-07,
"loss": 0.8730932235717773,
"num_input_tokens_seen": 1144886610,
"step": 5710,
"token_acc": 0.763370671624448
},
{
"epoch": 1.342431994742648,
"grad_norm": 4.997305168302919,
"learning_rate": 4.224272965777326e-07,
"loss": 0.8956947326660156,
"num_input_tokens_seen": 1146863130,
"step": 5720,
"token_acc": 0.756512774681123
},
{
"epoch": 1.3447790269204591,
"grad_norm": 1.7714657210450584,
"learning_rate": 4.1901289974754017e-07,
"loss": 0.9034318923950195,
"num_input_tokens_seen": 1148825958,
"step": 5730,
"token_acc": 0.7528903974023187
},
{
"epoch": 1.34712605909827,
"grad_norm": 1.7970727143535203,
"learning_rate": 4.15608696880828e-07,
"loss": 0.9018034934997559,
"num_input_tokens_seen": 1150869660,
"step": 5740,
"token_acc": 0.7552370910083663
},
{
"epoch": 1.3494730912760815,
"grad_norm": 2.3962942580845765,
"learning_rate": 4.1221474770752696e-07,
"loss": 0.8888204574584961,
"num_input_tokens_seen": 1152904527,
"step": 5750,
"token_acc": 0.7579487303127656
},
{
"epoch": 1.3518201234538925,
"grad_norm": 4.2459307299089355,
"learning_rate": 4.0883111177765793e-07,
"loss": 0.882927131652832,
"num_input_tokens_seen": 1154856621,
"step": 5760,
"token_acc": 0.760532270444878
},
{
"epoch": 1.3541671556317036,
"grad_norm": 8.805122520612176,
"learning_rate": 4.05457848460287e-07,
"loss": 0.8931197166442871,
"num_input_tokens_seen": 1156841811,
"step": 5770,
"token_acc": 0.7581117296199616
},
{
"epoch": 1.3565141878095148,
"grad_norm": 1.8029745033128655,
"learning_rate": 4.020950169424815e-07,
"loss": 0.8755680084228515,
"num_input_tokens_seen": 1158825375,
"step": 5780,
"token_acc": 0.7617876391236407
},
{
"epoch": 1.358861219987326,
"grad_norm": 2.3214932218170348,
"learning_rate": 3.9874267622827326e-07,
"loss": 0.8934176445007325,
"num_input_tokens_seen": 1160840175,
"step": 5790,
"token_acc": 0.7589212683515132
},
{
"epoch": 1.3612082521651372,
"grad_norm": 2.3985162877965873,
"learning_rate": 3.9540088513762516e-07,
"loss": 0.8847217559814453,
"num_input_tokens_seen": 1162829856,
"step": 5800,
"token_acc": 0.7612809344881545
},
{
"epoch": 1.3612082521651372,
"eval_loss": 0.9602800607681274,
"eval_runtime": 32.5062,
"eval_samples_per_second": 30.763,
"eval_steps_per_second": 1.292,
"eval_token_acc": 0.7438470949424066,
"num_input_tokens_seen": 1162829856,
"step": 5800
},
{
"epoch": 1.3635552843429484,
"grad_norm": 2.0010713169478738,
"learning_rate": 3.9206970230539484e-07,
"loss": 0.8922606468200683,
"num_input_tokens_seen": 1164855291,
"step": 5810,
"token_acc": 0.7569838860463012
},
{
"epoch": 1.3659023165207596,
"grad_norm": 3.272109870466011,
"learning_rate": 3.887491861803085e-07,
"loss": 0.9000480651855469,
"num_input_tokens_seen": 1166861097,
"step": 5820,
"token_acc": 0.7566891172207229
},
{
"epoch": 1.3682493486985707,
"grad_norm": 1.544077325900252,
"learning_rate": 3.8543939502393553e-07,
"loss": 0.8689347267150879,
"num_input_tokens_seen": 1168887147,
"step": 5830,
"token_acc": 0.7627186945780682
},
{
"epoch": 1.370596380876382,
"grad_norm": 2.103440792541161,
"learning_rate": 3.8214038690966577e-07,
"loss": 0.8851211547851563,
"num_input_tokens_seen": 1170981615,
"step": 5840,
"token_acc": 0.7597759262487763
},
{
"epoch": 1.3729434130541929,
"grad_norm": 1.7677876308306728,
"learning_rate": 3.788522197216897e-07,
"loss": 0.9024602890014648,
"num_input_tokens_seen": 1172878617,
"step": 5850,
"token_acc": 0.7557560328803166
},
{
"epoch": 1.375290445232004,
"grad_norm": 1.9402726241839798,
"learning_rate": 3.7557495115398443e-07,
"loss": 0.9134780883789062,
"num_input_tokens_seen": 1174893015,
"step": 5860,
"token_acc": 0.753564070544764
},
{
"epoch": 1.3776374774098152,
"grad_norm": 1.72330146825218,
"learning_rate": 3.7230863870929963e-07,
"loss": 0.8972689628601074,
"num_input_tokens_seen": 1176936135,
"step": 5870,
"token_acc": 0.7560207487897523
},
{
"epoch": 1.3799845095876264,
"grad_norm": 1.8072698773269937,
"learning_rate": 3.690533396981503e-07,
"loss": 0.8984692573547364,
"num_input_tokens_seen": 1178895693,
"step": 5880,
"token_acc": 0.756615972827414
},
{
"epoch": 1.3823315417654376,
"grad_norm": 2.0171801198061714,
"learning_rate": 3.6580911123781056e-07,
"loss": 0.8955293655395508,
"num_input_tokens_seen": 1180888149,
"step": 5890,
"token_acc": 0.75720176277118
},
{
"epoch": 1.3846785739432488,
"grad_norm": 1.5526432676933917,
"learning_rate": 3.625760102513102e-07,
"loss": 0.8883472442626953,
"num_input_tokens_seen": 1182949920,
"step": 5900,
"token_acc": 0.7599701073124605
},
{
"epoch": 1.3846785739432488,
"eval_loss": 0.9595866799354553,
"eval_runtime": 32.1786,
"eval_samples_per_second": 31.077,
"eval_steps_per_second": 1.305,
"eval_token_acc": 0.7442902980078946,
"num_input_tokens_seen": 1182949920,
"step": 5900
},
{
"epoch": 1.38702560612106,
"grad_norm": 7.97384438350482,
"learning_rate": 3.593540934664383e-07,
"loss": 0.889987564086914,
"num_input_tokens_seen": 1184970120,
"step": 5910,
"token_acc": 0.758863473503418
},
{
"epoch": 1.3893726382988711,
"grad_norm": 1.537068043139113,
"learning_rate": 3.561434174147463e-07,
"loss": 0.911767578125,
"num_input_tokens_seen": 1186953870,
"step": 5920,
"token_acc": 0.7535046522800585
},
{
"epoch": 1.3917196704766823,
"grad_norm": 2.792916091074644,
"learning_rate": 3.5294403843055597e-07,
"loss": 0.8944547653198243,
"num_input_tokens_seen": 1188957102,
"step": 5930,
"token_acc": 0.7568861383047926
},
{
"epoch": 1.3940667026544933,
"grad_norm": 2.5956068773284557,
"learning_rate": 3.497560126499709e-07,
"loss": 0.8902932167053222,
"num_input_tokens_seen": 1190999568,
"step": 5940,
"token_acc": 0.7563681534101937
},
{
"epoch": 1.3964137348323047,
"grad_norm": 1.606752628432743,
"learning_rate": 3.465793960098945e-07,
"loss": 0.8962507247924805,
"num_input_tokens_seen": 1193049609,
"step": 5950,
"token_acc": 0.7568774963666619
},
{
"epoch": 1.3987607670101156,
"grad_norm": 3.201548177908894,
"learning_rate": 3.434142442470437e-07,
"loss": 0.8878293037414551,
"num_input_tokens_seen": 1195126131,
"step": 5960,
"token_acc": 0.7593972961018481
},
{
"epoch": 1.4011077991879268,
"grad_norm": 2.0402971482769034,
"learning_rate": 3.4026061289697396e-07,
"loss": 0.8985117912292481,
"num_input_tokens_seen": 1197179763,
"step": 5970,
"token_acc": 0.7568663489501413
},
{
"epoch": 1.403454831365738,
"grad_norm": 2.0743463496848085,
"learning_rate": 3.371185572931048e-07,
"loss": 0.9137758255004883,
"num_input_tokens_seen": 1199156916,
"step": 5980,
"token_acc": 0.7521096549123137
},
{
"epoch": 1.4058018635435492,
"grad_norm": 1.908781885304316,
"learning_rate": 3.3398813256574843e-07,
"loss": 0.8940442085266114,
"num_input_tokens_seen": 1201161525,
"step": 5990,
"token_acc": 0.7591090088367569
},
{
"epoch": 1.4081488957213604,
"grad_norm": 1.8574032864901908,
"learning_rate": 3.308693936411421e-07,
"loss": 0.8737678527832031,
"num_input_tokens_seen": 1203195084,
"step": 6000,
"token_acc": 0.7614718846052603
},
{
"epoch": 1.4081488957213604,
"eval_loss": 0.9593000411987305,
"eval_runtime": 32.4448,
"eval_samples_per_second": 30.822,
"eval_steps_per_second": 1.295,
"eval_token_acc": 0.7439671291059763,
"num_input_tokens_seen": 1203195084,
"step": 6000
}
],
"logging_steps": 10,
"max_steps": 8000,
"num_input_tokens_seen": 1203195084,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3956677446926336e+16,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}